#ifndef included_vector_avx512_h
#define included_vector_avx512_h

#include <vppinfra/clib.h>
#include <x86intrin.h>

#define foreach_avx512_vec512i \
  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
#define foreach_avx512_vec512u \
  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
#define foreach_avx512_vec512f \
  _(f,32,8,ps) _(f,64,4,pd)

/* splat, load_aligned, store_aligned, load_unaligned, store_unaligned,
   is_all_zero, is_equal, is_all_equal, is_zero_mask, interleave_lo,
   interleave_hi */
#define _(t, s, c, i) \
static_always_inline t##s##x##c \
t##s##x##c##_splat (t##s x) \
{ return (t##s##x##c) _mm512_set1_##i (x); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_aligned (void *p) \
{ return (t##s##x##c) _mm512_load_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_aligned (t##s##x##c v, void *p) \
{ _mm512_store_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_load_unaligned (void *p) \
{ return (t##s##x##c) _mm512_loadu_si512 (p); } \
\
static_always_inline void \
t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
{ _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
\
static_always_inline int \
t##s##x##c##_is_all_zero (t##s##x##c v) \
{ return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
\
static_always_inline int \
t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
{ return t##s##x##c##_is_all_zero (a ^ b); } \
\
static_always_inline int \
t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
{ return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
\
static_always_inline u##c \
t##s##x##c##_is_zero_mask (t##s##x##c v) \
{ return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
\
static_always_inline t##s##x##c \
t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
{ return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); }

foreach_avx512_vec512i foreach_avx512_vec512u
#undef _

static_always_inline u32
u16x32_msb_mask (u16x32 v)
{
  return (u32) _mm512_movepi16_mask ((__m512i) v);
}
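
/* Narrowing packs: t_pack (lo, hi) merges two full-width vectors into one,
   saturating each element (packs_* signed, packus_* unsigned). Note that
   the pack instructions interleave per 128-bit lane, so the result is not
   in sequential element order across the whole vector. */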
#define _(f, t, fn) \
  always_inline t t##_pack (f lo, f hi) \
  { \
    return (t) fn ((__m512i) lo, (__m512i) hi); \
  }

_ (i16x32, i8x64, _mm512_packs_epi16)
_ (i16x32, u8x64, _mm512_packus_epi16)
_ (i32x16, i16x32, _mm512_packs_epi32)
_ (i32x16, u16x32, _mm512_packus_epi32)
#undef _
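
/* Per-element byte swaps (endianness conversion). vpshufb shuffles within
   each 128-bit lane, which suffices here because the swap pattern repeats
   every 16 bytes. */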
static_always_inline u32x16
u32x16_byte_swap (u32x16 v)
{
  u8x64 swap = {
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
    3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
  };
  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}
static_always_inline u16x32
u16x32_byte_swap (u16x32 v)
{
  u8x64 swap = {
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
    1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
  };
  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
}
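
/* Extract the lower/upper 256-bit half of a 512-bit vector. */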
#define _(f, t) \
  static_always_inline t f##_extract_lo (f v) \
  { \
    return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 0); \
  } \
  static_always_inline t f##_extract_hi (f v) \
  { \
    return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 1); \
  }

_ (u64x8, u64x4)
_ (u32x16, u32x8)
_ (u16x32, u16x16)
_ (u8x64, u8x32)
#undef _

static_always_inline u32
u32x16_min_scalar (u32x16 v)
{
  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
				      u32x16_extract_hi (v)));
}
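
/* Insert a 256-bit vector as the lower/upper half of a 512-bit one. */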
static_always_inline u32x16
u32x16_insert_lo (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
}
static_always_inline u32x16
u32x16_insert_hi (u32x16 r, u32x8 v)
{
  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
}
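
/* Two-source permute (vpermt2q): result element i is picked from the
   16-element pool { a[0..7], b[0..7] }, indexed 0-15 by the low bits of
   mask[i]. */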
static_always_inline u64x8
u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
{
  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
					    (__m512i) b);
}
#define u32x16_ternary_logic(a, b, c, d) \
  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)

#define u8x64_insert_u8x16(a, b, n) \
  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)

#define u8x64_extract_u8x16(a, n) \
  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)

#define u8x64_word_shift_left(a,n)  (u8x64) _mm512_bslli_epi128((__m512i) a, n)
#define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)

static_always_inline u8x64
u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
{
  /* ternary logic truth table 0x96 computes a ^ b ^ c */
  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
					    (__m512i) c, 0x96);
}
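
/* Reverse the byte order inside each of the four 16-byte lanes. */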
static_always_inline u8x64
u8x64_reflect_u8x16 (u8x64 x)
{
  static const u8x64 mask = {
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
    15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
  };
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
}
static_always_inline u8x64
u8x64_shuffle (u8x64 v, u8x64 m)
{
  return (u8x64) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) m);
}
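
/* vpalignr concatenates each 128-bit lane pair of a:b and shifts right by
   imm bytes. u32x16_sum_elts uses it to add elements 2 apart and then 1
   apart, leaving each lane's total in that lane's element 0; the four lane
   totals are then folded into a scalar. */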
#define u8x64_align_right(a, b, imm) \
  (u8x64) _mm512_alignr_epi8 ((__m512i) a, (__m512i) b, imm)

static_always_inline u32
u32x16_sum_elts (u32x16 sum16)
{
  u32x8 sum8;
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 8);
  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 4);
  sum8 = u32x16_extract_hi (sum16) + u32x16_extract_lo (sum16);
  return sum8[0] + sum8[4];
}
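
/* Masked loads/stores: only element positions selected by mask are read or
   written, so a partial vector at the end of a buffer can be handled
   without touching memory beyond it. An illustrative sketch, assuming
   pow2_mask() from <vppinfra/clib.h>:

     u32 tail[16];
     u32x16 v = u32x16_mask_load_zero (tail, pow2_mask (n)); // n < 16 elts
     u32x16_mask_store (v, tail, pow2_mask (n));
*/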
#define _(t, m, p, i, e) \
  static_always_inline t t##_mask_load (t a, void *p, m mask) \
  { \
    return (t) p##_mask_loadu_##e ((i) a, mask, p); \
  } \
  static_always_inline t t##_mask_load_zero (void *p, m mask) \
  { \
    return (t) p##_maskz_loadu_##e (mask, p); \
  } \
  static_always_inline void t##_mask_store (t a, void *p, m mask) \
  { \
    p##_mask_storeu_##e (p, mask, (i) a); \
  }

_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u8x16, u16, _mm, __m128i, epi8)
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u32x4, u8, _mm, __m128i, epi32)
_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u64x2, u8, _mm, __m128i, epi64)
#undef _
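
/* Advertise masked load/store support to width-generic vector code. */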
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_MASK_LOAD_STORE
#endif
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_MASK_LOAD_STORE
#endif
#ifdef CLIB_HAVE_VEC128
#define CLIB_HAVE_VEC128_MASK_LOAD_STORE
#endif

static_always_inline u8x64
u8x64_splat_u8x16 (u8x16 a)
{
  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
}

static_always_inline u32x16
u32x16_splat_u32x4 (u32x4 a)
{
  return (u32x16) _mm512_broadcast_i32x4 ((__m128i) a);
}
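
/* Element-wise select: returns b where the corresponding mask bit is set,
   a where it is clear. */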
static_always_inline u32x16
u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
{
  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
}
static_always_inline u8x64
u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
{
  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
}
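
/* Per-element equality compare returning one mask bit per element. */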
#define _(t, m, e, p, it) \
  static_always_inline m t##_is_equal_mask (t a, t b) \
  { \
    return p##_cmpeq_##e##_mask ((it) a, (it) b); \
  }
_ (u8x16, u16, epu8, _mm, __m128i)
_ (u16x8, u8, epu16, _mm, __m128i)
_ (u32x4, u8, epu32, _mm, __m128i)
_ (u64x2, u8, epu64, _mm, __m128i)

_ (u8x32, u32, epu8, _mm256, __m256i)
_ (u16x16, u16, epu16, _mm256, __m256i)
_ (u32x8, u8, epu32, _mm256, __m256i)
_ (u64x4, u8, epu64, _mm256, __m256i)

_ (u8x64, u64, epu8, _mm512, __m512i)
_ (u16x32, u32, epu16, _mm512, __m512i)
_ (u32x16, u16, epu32, _mm512, __m512i)
_ (u64x8, u8, epu64, _mm512, __m512i)
#undef _
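
/* Element width conversions; the cvtusepi32_epi16-based ones narrow with
   unsigned saturation. */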
#define _(f, t, fn, it) \
  static_always_inline t t##_from_##f (f x) { return (t) fn ((it) x); }
_ (u16x16, u32x16, _mm512_cvtepi16_epi32, __m256i)
_ (u32x16, u16x16, _mm512_cvtusepi32_epi16, __m512i)
_ (u32x8, u16x8, _mm256_cvtusepi32_epi16, __m256i)
_ (u32x8, u64x8, _mm512_cvtepu32_epi64, __m256i)
#undef _
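
/* compress packs the mask-selected elements to the front of the vector,
   expand is its inverse, and compress_store writes the selected elements
   densely to p. The 8- and 16-bit variants require AVX512_VBMI2. An
   illustrative sketch (hypothetical v/dst, not part of this header):

     u16 keep = ~u32x16_is_equal_mask (v, u32x16_splat (0));
     u32x16_compress_store (v, keep, dst); // store only non-zero elements
*/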
#define _(vt, mt, p, it, epi) \
  static_always_inline vt vt##_compress (vt a, mt mask) \
  { \
    return (vt) p##_maskz_compress_##epi (mask, (it) a); \
  } \
  static_always_inline vt vt##_expand (vt a, mt mask) \
  { \
    return (vt) p##_maskz_expand_##epi (mask, (it) a); \
  } \
  static_always_inline void vt##_compress_store (vt v, mt mask, void *p) \
  { \
    p##_mask_compressstoreu_##epi (p, mask, (it) v); \
  }

_ (u64x8, u8, _mm512, __m512i, epi64)
_ (u32x16, u16, _mm512, __m512i, epi32)
_ (u64x4, u8, _mm256, __m256i, epi64)
_ (u32x8, u8, _mm256, __m256i, epi32)
_ (u64x2, u8, _mm, __m128i, epi64)
_ (u32x4, u8, _mm, __m128i, epi32)
#ifdef __AVX512VBMI2__
_ (u16x32, u32, _mm512, __m512i, epi16)
_ (u8x64, u64, _mm512, __m512i, epi8)
_ (u16x16, u16, _mm256, __m256i, epi16)
_ (u8x32, u32, _mm256, __m256i, epi8)
_ (u16x8, u8, _mm, __m128i, epi16)
_ (u8x16, u16, _mm, __m128i, epi8)
#endif
#undef _
#ifdef CLIB_HAVE_VEC256
#define CLIB_HAVE_VEC256_COMPRESS
#endif
#ifdef CLIB_HAVE_VEC512
#define CLIB_HAVE_VEC512_COMPRESS
#endif

/* Without VBMI2, 16-bit compress is emulated by widening to 32 bits,
   compressing, and narrowing back. */
#ifndef __AVX512VBMI2__
static_always_inline u16x16
u16x16_compress (u16x16 v, u16 mask)
{
  return u16x16_from_u32x16 (u32x16_compress (u32x16_from_u16x16 (v), mask));
}
static_always_inline u16x8
u16x8_compress (u16x8 v, u8 mask)
{
  return u16x8_from_u32x8 (u32x8_compress (u32x8_from_u16x8 (v), mask));
}
#endif
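
/* In-register transpose of a 16x16 u32 matrix: 32-bit unpacks build 2x2
   blocks, 64-bit unpacks build 4x4 blocks, then two vpermt2q rounds
   (pm1/pm2 move 128-bit blocks, pm3/pm4 move 256-bit blocks) place the
   rows. */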
static_always_inline void
u32x16_transpose (u32x16 m[16])
{
  __m512i r[16], a, b, c, d, x, y;
  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13 };
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15 };
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11 };
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15 };
  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);

  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);

  a = _mm512_unpacklo_epi64 (r[0], r[1]);
  b = _mm512_unpacklo_epi64 (r[2], r[3]);
  c = _mm512_unpacklo_epi64 (r[4], r[5]);
  d = _mm512_unpacklo_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpacklo_epi64 (r[8], r[9]);
  b = _mm512_unpacklo_epi64 (r[10], r[11]);
  c = _mm512_unpacklo_epi64 (r[12], r[13]);
  d = _mm512_unpacklo_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[0], r[1]);
  b = _mm512_unpackhi_epi64 (r[2], r[3]);
  c = _mm512_unpackhi_epi64 (r[4], r[5]);
  d = _mm512_unpackhi_epi64 (r[6], r[7]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);

  a = _mm512_unpackhi_epi64 (r[8], r[9]);
  b = _mm512_unpackhi_epi64 (r[10], r[11]);
  c = _mm512_unpackhi_epi64 (r[12], r[13]);
  d = _mm512_unpackhi_epi64 (r[14], r[15]);
  x = _mm512_permutex2var_epi64 (a, pm1, b);
  y = _mm512_permutex2var_epi64 (c, pm1, d);
  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (a, pm2, b);
  y = _mm512_permutex2var_epi64 (c, pm2, d);
  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
}
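
/* In-register transpose of an 8x8 u64 matrix: one 64-bit unpack level
   followed by the same two-round permute scheme. */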
static_always_inline void
u64x8_transpose (u64x8 m[8])
{
  __m512i r[8], x, y;

  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13 };
  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15 };
  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11 };
  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15 };
  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);
  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);

  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
}

#endif /* included_vector_avx512_h */