FD.io VPP v21.06
Vector Packet Processing
vector_avx512.h
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef included_vector_avx512_h
17 #define included_vector_avx512_h
18 
19 #include <vppinfra/clib.h>
20 #include <x86intrin.h>
21 
22 /* *INDENT-OFF* */
23 #define foreach_avx512_vec512i \
24  _(i,8,64,epi8) _(i,16,32,epi16) _(i,32,16,epi32) _(i,64,8,epi64)
25 #define foreach_avx512_vec512u \
26  _(u,8,64,epi8) _(u,16,32,epi16) _(u,32,16,epi32) _(u,64,8,epi64)
27 #define foreach_avx512_vec512f \
28  _(f,32,8,ps) _(f,64,4,pd)
29 
30 /* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
31  is_all_equal, is_zero_mask */
32 #define _(t, s, c, i) \
33 static_always_inline t##s##x##c \
34 t##s##x##c##_splat (t##s x) \
35 { return (t##s##x##c) _mm512_set1_##i (x); } \
36 \
37 static_always_inline t##s##x##c \
38 t##s##x##c##_load_aligned (void *p) \
39 { return (t##s##x##c) _mm512_load_si512 (p); } \
40 \
41 static_always_inline void \
42 t##s##x##c##_store_aligned (t##s##x##c v, void *p) \
43 { _mm512_store_si512 ((__m512i *) p, (__m512i) v); } \
44 \
45 static_always_inline t##s##x##c \
46 t##s##x##c##_load_unaligned (void *p) \
47 { return (t##s##x##c) _mm512_loadu_si512 (p); } \
48 \
49 static_always_inline void \
50 t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
51 { _mm512_storeu_si512 ((__m512i *) p, (__m512i) v); } \
52 \
53 static_always_inline int \
54 t##s##x##c##_is_all_zero (t##s##x##c v) \
55 { return (_mm512_test_epi64_mask ((__m512i) v, (__m512i) v) == 0); } \
56 \
57 static_always_inline int \
58 t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
59 { return t##s##x##c##_is_all_zero (a ^ b); } \
60 \
61 static_always_inline int \
62 t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
63 { return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); } \
64 \
65 static_always_inline u##c \
66 t##s##x##c##_is_zero_mask (t##s##x##c v) \
67 { return _mm512_test_##i##_mask ((__m512i) v, (__m512i) v); } \
68 \
69 static_always_inline t##s##x##c \
70 t##s##x##c##_interleave_lo (t##s##x##c a, t##s##x##c b) \
71 { return (t##s##x##c) _mm512_unpacklo_##i ((__m512i) a, (__m512i) b); } \
72 \
73 static_always_inline t##s##x##c \
74 t##s##x##c##_interleave_hi (t##s##x##c a, t##s##x##c b) \
75 { return (t##s##x##c) _mm512_unpackhi_##i ((__m512i) a, (__m512i) b); } \
76 
77 
78 foreach_avx512_vec512i foreach_avx512_vec512u
79 #undef _
80 /* *INDENT-ON* */
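/* The expansion above instantiates splat, aligned/unaligned load/store,
   is_all_zero, is_equal, is_all_equal, is_zero_mask and interleave_lo/hi
   for every 512-bit integer type, e.g. u32x16_splat, u8x64_is_all_zero. */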
81 
82 static_always_inline u32
83 u16x32_msb_mask (u16x32 v)
84 {
85  return (u32) _mm512_movepi16_mask ((__m512i) v);
86 }
87 
88 /* 512-bit packs */
89 #define _(f, t, fn) \
90  always_inline t t##_pack (f lo, f hi) \
91  { \
92  return (t) fn ((__m512i) lo, (__m512i) hi); \
93  }
94 
95 _ (i16x32, i8x64, _mm512_packs_epi16)
96 _ (i16x32, u8x64, _mm512_packus_epi16)
97 _ (i32x16, i16x32, _mm512_packs_epi32)
98 _ (i32x16, u16x32, _mm512_packus_epi32)
99 #undef _
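/* Note: like their SSE counterparts, the AVX-512 pack intrinsics above
   operate independently on each 128-bit lane, so results are interleaved
   per lane rather than across the full 512-bit vector. */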
100 
101 static_always_inline u32x16
102 u32x16_byte_swap (u32x16 v)
103 {
104  u8x64 swap = {
105  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
106  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
107  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12,
108  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
109  };
110  return (u32x16) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
111 }
112 
113 static_always_inline u16x32
114 u16x32_byte_swap (u16x32 v)
115 {
116  u8x64 swap = {
117  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
118  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
119  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
120  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14
121  };
122  return (u16x32) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) swap);
123 }
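/* The two byte-swap helpers above reverse the byte order inside each 32-bit
   or 16-bit element; _mm512_shuffle_epi8 shuffles per 128-bit lane, so the
   same 16-byte pattern is repeated four times. */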
124 
125 #define _(f, t) \
126  static_always_inline t f##_extract_lo (f v) \
127  { \
128  return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 0); \
129  } \
130  static_always_inline t f##_extract_hi (f v) \
131  { \
132  return (t) _mm512_extracti64x4_epi64 ((__m512i) v, 1); \
133  }
134 
135 _ (u64x8, u64x4)
136 _ (u32x16, u32x8)
137 _ (u16x32, u16x16)
138 _ (u8x64, u8x32)
139 #undef _
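/* extract_lo / extract_hi return the lower and upper 256-bit halves of a
   512-bit vector as the corresponding AVX2 type. */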
140 
141 static_always_inline u32
142 u32x16_min_scalar (u32x16 v)
143 {
144  return u32x8_min_scalar (u32x8_min (u32x16_extract_lo (v),
145  u32x16_extract_hi (v)));
146 }
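/* Horizontal minimum: fold the two 256-bit halves together, then finish the
   reduction with the AVX2 helpers (u32x8_min, u32x8_min_scalar). */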
147 
148 static_always_inline u32x16
149 u32x16_insert_lo (u32x16 r, u32x8 v)
150 {
151  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 0);
152 }
153 
154 static_always_inline u32x16
155 u32x16_insert_hi (u32x16 r, u32x8 v)
156 {
157  return (u32x16) _mm512_inserti64x4 ((__m512i) r, (__m256i) v, 1);
158 }
159 
160 static_always_inline u64x8
161 u64x8_permute (u64x8 a, u64x8 b, u64x8 mask)
162 {
163  return (u64x8) _mm512_permutex2var_epi64 ((__m512i) a, (__m512i) mask,
164  (__m512i) b);
165 }
166 
167 
168 #define u32x16_ternary_logic(a, b, c, d) \
169  (u32x16) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b, (__m512i) c, d)
170 
171 #define u8x64_insert_u8x16(a, b, n) \
172  (u8x64) _mm512_inserti64x2 ((__m512i) (a), (__m128i) (b), n)
173 
174 #define u8x64_extract_u8x16(a, n) \
175  (u8x16) _mm512_extracti64x2_epi64 ((__m512i) (a), n)
176 
177 #define u8x64_word_shift_left(a,n) (u8x64) _mm512_bslli_epi128((__m512i) a, n)
178 #define u8x64_word_shift_right(a,n) (u8x64) _mm512_bsrli_epi128((__m512i) a, n)
179 
180 static_always_inline u8x64
181 u8x64_xor3 (u8x64 a, u8x64 b, u8x64 c)
182 {
183  return (u8x64) _mm512_ternarylogic_epi32 ((__m512i) a, (__m512i) b,
184  (__m512i) c, 0x96);
185 }
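/* The ternary-logic immediate 0x96 is the truth table of a ^ b ^ c, giving a
   three-way XOR in a single instruction. */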
186 
187 static_always_inline u8x64
188 u8x64_reflect_u8x16 (u8x64 x)
189 {
190  static const u8x64 mask = {
191  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
192  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
193  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
194  15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
195  };
196  return (u8x64) _mm512_shuffle_epi8 ((__m512i) x, (__m512i) mask);
197 }
198 
199 static_always_inline u8x64
200 u8x64_shuffle (u8x64 v, u8x64 m)
201 {
202  return (u8x64) _mm512_shuffle_epi8 ((__m512i) v, (__m512i) m);
203 }
204 
205 #define u8x64_align_right(a, b, imm) \
206  (u8x64) _mm512_alignr_epi8 ((__m512i) a, (__m512i) b, imm)
207 
208 static_always_inline u32
209 u32x16_sum_elts (u32x16 sum16)
210 {
211  u32x8 sum8;
212  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 8);
213  sum16 += (u32x16) u8x64_align_right (sum16, sum16, 4);
214  sum8 = u32x16_extract_hi (sum16) + u32x16_extract_lo (sum16);
215  return sum8[0] + sum8[4];
216 }
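/* u32x16_sum_elts: the per-lane alignr rotations above accumulate each
   128-bit lane's total into its lowest element; adding the two 256-bit
   halves and then elements 0 and 4 yields the sum of all 16 elements. */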
217 
218 #define _(t, m, p, i, e) \
219  static_always_inline t t##_mask_load (t a, void *p, m mask) \
220  { \
221  return (t) p##_mask_loadu_##e ((i) a, mask, p); \
222  } \
223  static_always_inline t t##_mask_load_zero (void *p, m mask) \
224  { \
225  return (t) p##_maskz_loadu_##e (mask, p); \
226  } \
227  static_always_inline void t##_mask_store (t a, void *p, m mask) \
228  { \
229  p##_mask_storeu_##e (p, mask, (i) a); \
230  }
231 
232 _ (u8x64, u64, _mm512, __m512i, epi8)
233 _ (u8x32, u32, _mm256, __m256i, epi8)
234 _ (u8x16, u16, _mm, __m128i, epi8)
235 _ (u16x32, u32, _mm512, __m512i, epi16)
236 _ (u16x16, u16, _mm256, __m256i, epi16)
237 _ (u16x8, u8, _mm, __m128i, epi16)
238 _ (u32x16, u16, _mm512, __m512i, epi32)
239 _ (u32x8, u8, _mm256, __m256i, epi32)
240 _ (u32x4, u8, _mm, __m128i, epi32)
241 _ (u64x8, u8, _mm512, __m512i, epi64)
242 _ (u64x4, u8, _mm256, __m256i, epi64)
243 _ (u64x2, u8, _mm, __m128i, epi64)
244 #undef _
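/* The generated mask_load / mask_load_zero / mask_store helpers use AVX-512
   mask registers, so partial vectors (e.g. buffer tails) can be read or
   written without touching memory outside the mask. */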
245 
246 #ifdef CLIB_HAVE_VEC512
247 #define CLIB_HAVE_VEC512_MASK_LOAD_STORE
248 #endif
249 #ifdef CLIB_HAVE_VEC256
250 #define CLIB_HAVE_VEC256_MASK_LOAD_STORE
251 #endif
252 #ifdef CLIB_HAVE_VEC128
253 #define CLIB_HAVE_VEC128_MASK_LOAD_STORE
254 #endif
255 
256 static_always_inline u8x64
257 u8x64_splat_u8x16 (u8x16 a)
258 {
259  return (u8x64) _mm512_broadcast_i64x2 ((__m128i) a);
260 }
261 
262 static_always_inline u32x16
263 u32x16_splat_u32x4 (u32x4 a)
264 {
265  return (u32x16) _mm512_broadcast_i64x2 ((__m128i) a);
266 }
267 
268 static_always_inline u32x16
269 u32x16_mask_blend (u32x16 a, u32x16 b, u16 mask)
270 {
271  return (u32x16) _mm512_mask_blend_epi32 (mask, (__m512i) a, (__m512i) b);
272 }
273 
274 static_always_inline u8x64
275 u8x64_mask_blend (u8x64 a, u8x64 b, u64 mask)
276 {
277  return (u8x64) _mm512_mask_blend_epi8 (mask, (__m512i) a, (__m512i) b);
278 }
279 
280 #define _(t, m, e, p, it) \
281  static_always_inline m t##_is_equal_mask (t a, t b) \
282  { \
283  return p##_cmpeq_##e##_mask ((it) a, (it) b); \
284  }
285 _ (u8x16, u16, epu8, _mm, __m128i)
286 _ (u16x8, u8, epu16, _mm, __m128i)
287 _ (u32x4, u8, epu32, _mm, __m128i)
288 _ (u64x2, u8, epu64, _mm, __m128i)
289 
290 _ (u8x32, u32, epu8, _mm256, __m256i)
291 _ (u16x16, u16, epu16, _mm256, __m256i)
292 _ (u32x8, u8, epu32, _mm256, __m256i)
293 _ (u64x4, u8, epu64, _mm256, __m256i)
294 
295 _ (u8x64, u64, epu8, _mm512, __m512i)
296 _ (u16x32, u32, epu16, _mm512, __m512i)
297 _ (u32x16, u16, epu32, _mm512, __m512i)
298 _ (u64x8, u8, epu64, _mm512, __m512i)
299 #undef _
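/* is_equal_mask compares two vectors element-wise and returns a compact
   bitmask with bit i set when elements i are equal. */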
300 
301 #define _(f, t, fn, it) \
302  static_always_inline t t##_from_##f (f x) { return (t) fn ((it) x); }
303 _ (u16x16, u32x16, _mm512_cvtepi16_epi32, __m256i)
304 _ (u32x16, u16x16, _mm512_cvtusepi32_epi16, __m512i)
305 _ (u32x8, u16x8, _mm256_cvtusepi32_epi16, __m256i)
306 _ (u32x8, u64x8, _mm512_cvtepu32_epi64, __m256i)
307 #undef _
308 
309 #define _(vt, mt, p, it, epi) \
310  static_always_inline vt vt##_compress (vt a, mt mask) \
311  { \
312  return (vt) p##_maskz_compress_##epi (mask, (it) a); \
313  } \
314  static_always_inline vt vt##_expand (vt a, mt mask) \
315  { \
316  return (vt) p##_maskz_expand_##epi (mask, (it) a); \
317  } \
318  static_always_inline void vt##_compress_store (vt v, mt mask, void *p) \
319  { \
320  p##_mask_compressstoreu_##epi (p, mask, (it) v); \
321  }
322 
323 _ (u64x8, u8, _mm512, __m512i, epi64)
324 _ (u32x16, u16, _mm512, __m512i, epi32)
325 _ (u64x4, u8, _mm256, __m256i, epi64)
326 _ (u32x8, u8, _mm256, __m256i, epi32)
327 _ (u64x2, u8, _mm, __m128i, epi64)
328 _ (u32x4, u8, _mm, __m128i, epi32)
329 #ifdef __AVX512VBMI2__
330 _ (u16x32, u32, _mm512, __m512i, epi16)
331 _ (u8x64, u64, _mm512, __m512i, epi8)
332 _ (u16x16, u16, _mm256, __m256i, epi16)
333 _ (u8x32, u32, _mm256, __m256i, epi8)
334 _ (u16x8, u8, _mm, __m128i, epi16)
335 _ (u8x16, u16, _mm, __m128i, epi8)
336 #endif
337 #undef _
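/* compress packs the elements selected by the mask into the low positions
   (zeroing the rest), expand does the inverse, and compress_store writes the
   selected elements contiguously to memory; the 8- and 16-bit variants
   require AVX512_VBMI2. */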
338 
339 #ifdef CLIB_HAVE_VEC256
340 #define CLIB_HAVE_VEC256_COMPRESS
341 #endif
342 #ifdef CLIB_HAVE_VEC512
343 #define CLIB_HAVE_VEC512_COMPRESS
344 #endif
345 
346 #ifndef __AVX512VBMI2__
347 static_always_inline u16x16
348 u16x16_compress (u16x16 v, u16 mask)
349 {
350  return u16x16_from_u32x16 (u32x16_compress (u32x16_from_u16x16 (v), mask));
351 }
352 
353 static_always_inline u16x8
354 u16x8_compress (u16x8 v, u8 mask)
355 {
356  return u16x8_from_u32x8 (u32x8_compress (u32x8_from_u16x8 (v), mask));
357 }
358 #endif
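/* Without AVX512_VBMI2, 16-bit compress is emulated by widening to 32-bit
   elements, compressing, and narrowing back. */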
359 
360 static_always_inline void
361 u32x16_transpose (u32x16 m[16])
362 {
363  __m512i r[16], a, b, c, d, x, y;
364 
365  /* *INDENT-OFF* */
366  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
367  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
368  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
369  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
370  /* *INDENT-ON* */
371 
372  r[0] = _mm512_unpacklo_epi32 ((__m512i) m[0], (__m512i) m[1]);
373  r[1] = _mm512_unpacklo_epi32 ((__m512i) m[2], (__m512i) m[3]);
374  r[2] = _mm512_unpacklo_epi32 ((__m512i) m[4], (__m512i) m[5]);
375  r[3] = _mm512_unpacklo_epi32 ((__m512i) m[6], (__m512i) m[7]);
376  r[4] = _mm512_unpacklo_epi32 ((__m512i) m[8], (__m512i) m[9]);
377  r[5] = _mm512_unpacklo_epi32 ((__m512i) m[10], (__m512i) m[11]);
378  r[6] = _mm512_unpacklo_epi32 ((__m512i) m[12], (__m512i) m[13]);
379  r[7] = _mm512_unpacklo_epi32 ((__m512i) m[14], (__m512i) m[15]);
380 
381  r[8] = _mm512_unpackhi_epi32 ((__m512i) m[0], (__m512i) m[1]);
382  r[9] = _mm512_unpackhi_epi32 ((__m512i) m[2], (__m512i) m[3]);
383  r[10] = _mm512_unpackhi_epi32 ((__m512i) m[4], (__m512i) m[5]);
384  r[11] = _mm512_unpackhi_epi32 ((__m512i) m[6], (__m512i) m[7]);
385  r[12] = _mm512_unpackhi_epi32 ((__m512i) m[8], (__m512i) m[9]);
386  r[13] = _mm512_unpackhi_epi32 ((__m512i) m[10], (__m512i) m[11]);
387  r[14] = _mm512_unpackhi_epi32 ((__m512i) m[12], (__m512i) m[13]);
388  r[15] = _mm512_unpackhi_epi32 ((__m512i) m[14], (__m512i) m[15]);
389 
390  a = _mm512_unpacklo_epi64 (r[0], r[1]);
391  b = _mm512_unpacklo_epi64 (r[2], r[3]);
392  c = _mm512_unpacklo_epi64 (r[4], r[5]);
393  d = _mm512_unpacklo_epi64 (r[6], r[7]);
394  x = _mm512_permutex2var_epi64 (a, pm1, b);
395  y = _mm512_permutex2var_epi64 (c, pm1, d);
396  m[0] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
397  m[8] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
398  x = _mm512_permutex2var_epi64 (a, pm2, b);
399  y = _mm512_permutex2var_epi64 (c, pm2, d);
400  m[4] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
401  m[12] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
402 
403  a = _mm512_unpacklo_epi64 (r[8], r[9]);
404  b = _mm512_unpacklo_epi64 (r[10], r[11]);
405  c = _mm512_unpacklo_epi64 (r[12], r[13]);
406  d = _mm512_unpacklo_epi64 (r[14], r[15]);
407  x = _mm512_permutex2var_epi64 (a, pm1, b);
408  y = _mm512_permutex2var_epi64 (c, pm1, d);
409  m[2] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
410  m[10] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
411  x = _mm512_permutex2var_epi64 (a, pm2, b);
412  y = _mm512_permutex2var_epi64 (c, pm2, d);
413  m[6] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
414  m[14] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
415 
416  a = _mm512_unpackhi_epi64 (r[0], r[1]);
417  b = _mm512_unpackhi_epi64 (r[2], r[3]);
418  c = _mm512_unpackhi_epi64 (r[4], r[5]);
419  d = _mm512_unpackhi_epi64 (r[6], r[7]);
420  x = _mm512_permutex2var_epi64 (a, pm1, b);
421  y = _mm512_permutex2var_epi64 (c, pm1, d);
422  m[1] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
423  m[9] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
424  x = _mm512_permutex2var_epi64 (a, pm2, b);
425  y = _mm512_permutex2var_epi64 (c, pm2, d);
426  m[5] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
427  m[13] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
428 
429  a = _mm512_unpackhi_epi64 (r[8], r[9]);
430  b = _mm512_unpackhi_epi64 (r[10], r[11]);
431  c = _mm512_unpackhi_epi64 (r[12], r[13]);
432  d = _mm512_unpackhi_epi64 (r[14], r[15]);
433  x = _mm512_permutex2var_epi64 (a, pm1, b);
434  y = _mm512_permutex2var_epi64 (c, pm1, d);
435  m[3] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
436  m[11] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
437  x = _mm512_permutex2var_epi64 (a, pm2, b);
438  y = _mm512_permutex2var_epi64 (c, pm2, d);
439  m[7] = (u32x16) _mm512_permutex2var_epi64 (x, pm3, y);
440  m[15] = (u32x16) _mm512_permutex2var_epi64 (x, pm4, y);
441 }
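/* u32x16_transpose transposes a 16x16 matrix of 32-bit elements in place,
   using 32- and 64-bit unpacks followed by two-source 64-bit permutes. */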
442 
443 
444 
445 static_always_inline void
446 u64x8_transpose (u64x8 m[8])
447 {
448  __m512i r[8], x, y;
449 
450  /* *INDENT-OFF* */
451  __m512i pm1 = (__m512i) (u64x8) { 0, 1, 8, 9, 4, 5, 12, 13};
452  __m512i pm2 = (__m512i) (u64x8) { 2, 3, 10, 11, 6, 7, 14, 15};
453  __m512i pm3 = (__m512i) (u64x8) { 0, 1, 2, 3, 8, 9, 10, 11};
454  __m512i pm4 = (__m512i) (u64x8) { 4, 5, 6, 7, 12, 13, 14, 15};
455  /* *INDENT-ON* */
456 
457  r[0] = _mm512_unpacklo_epi64 ((__m512i) m[0], (__m512i) m[1]);
458  r[1] = _mm512_unpacklo_epi64 ((__m512i) m[2], (__m512i) m[3]);
459  r[2] = _mm512_unpacklo_epi64 ((__m512i) m[4], (__m512i) m[5]);
460  r[3] = _mm512_unpacklo_epi64 ((__m512i) m[6], (__m512i) m[7]);
461  r[4] = _mm512_unpackhi_epi64 ((__m512i) m[0], (__m512i) m[1]);
462  r[5] = _mm512_unpackhi_epi64 ((__m512i) m[2], (__m512i) m[3]);
463  r[6] = _mm512_unpackhi_epi64 ((__m512i) m[4], (__m512i) m[5]);
464  r[7] = _mm512_unpackhi_epi64 ((__m512i) m[6], (__m512i) m[7]);
465 
466  x = _mm512_permutex2var_epi64 (r[0], pm1, r[1]);
467  y = _mm512_permutex2var_epi64 (r[2], pm1, r[3]);
468  m[0] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
469  m[4] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
470  x = _mm512_permutex2var_epi64 (r[0], pm2, r[1]);
471  y = _mm512_permutex2var_epi64 (r[2], pm2, r[3]);
472  m[2] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
473  m[6] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
474 
475  x = _mm512_permutex2var_epi64 (r[4], pm1, r[5]);
476  y = _mm512_permutex2var_epi64 (r[6], pm1, r[7]);
477  m[1] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
478  m[5] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
479  x = _mm512_permutex2var_epi64 (r[4], pm2, r[5]);
480  y = _mm512_permutex2var_epi64 (r[6], pm2, r[7]);
481  m[3] = (u64x8) _mm512_permutex2var_epi64 (x, pm3, y);
482  m[7] = (u64x8) _mm512_permutex2var_epi64 (x, pm4, y);
483 }
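/* u64x8_transpose: same approach for an 8x8 matrix of 64-bit elements. */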
484 
485 #endif /* included_vector_avx512_h */
486 /*
487  * fd.io coding-style-patch-verification: ON
488  *
489  * Local Variables:
490  * eval: (c-set-style "gnu")
491  * End:
492  */