FD.io VPP  v19.08.2-294-g37e99c22d
Vector Packet Processing
vector_sse42.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  Copyright (c) 2005 Eliot Dresselhaus
17 
18  Permission is hereby granted, free of charge, to any person obtaining
19  a copy of this software and associated documentation files (the
20  "Software"), to deal in the Software without restriction, including
21  without limitation the rights to use, copy, modify, merge, publish,
22  distribute, sublicense, and/or sell copies of the Software, and to
23  permit persons to whom the Software is furnished to do so, subject to
24  the following conditions:
25 
26  The above copyright notice and this permission notice shall be
27  included in all copies or substantial portions of the Software.
28 
29  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
30  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
31  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
33  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
34  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
35  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36 */
37 
38 #ifndef included_vector_sse2_h
39 #define included_vector_sse2_h
40 
41 #include <vppinfra/error_bootstrap.h> /* for ASSERT */
42 #include <x86intrin.h>
43 
44 /* *INDENT-OFF* */
/* X-macro lists of the 128-bit vector types covered by this header.
   Each entry has the form _(type prefix, element bits, lane count,
   intrinsic suffix); the user defines _() before expanding a list. */
45 #define foreach_sse42_vec128i \
46  _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64x)
47 #define foreach_sse42_vec128u \
48  _(u,8,16,epi8) _(u,16,8,epi16) _(u,32,4,epi32) _(u,64,2,epi64x)
49 #define foreach_sse42_vec128f \
50  _(f,32,4,ps) _(f,64,2,pd)
51 
52 /* splat, load_unaligned, store_unaligned, is_all_zero, is_equal,
53  is_all_equal */
/* Generator for the basic per-type helpers.  _(t, s, c, i) takes the type
   prefix t, element width s in bits, lane count c and the _mm_set1_*
   suffix i, and defines for the vector type <t><s>x<c>:
     _splat            broadcast a scalar to every lane
     _load_unaligned   read a vector from p with _mm_loadu_si128
     _store_unaligned  write a vector to p with _mm_storeu_si128
     _is_all_zero      non-zero when every bit of x is clear
     _is_equal         non-zero when a and b are bitwise identical
     _is_all_equal     non-zero when every lane of v equals the scalar x
   The load helper takes a const pointer because it only reads, and the
   expansion ends without a trailing ';' so that it does not leave an
   empty declaration at file scope. */
54 #define _(t, s, c, i) \
55 static_always_inline t##s##x##c \
56 t##s##x##c##_splat (t##s x) \
57 { return (t##s##x##c) _mm_set1_##i (x); } \
58 \
59 static_always_inline t##s##x##c \
60 t##s##x##c##_load_unaligned (const void *p) \
61 { return (t##s##x##c) _mm_loadu_si128 ((const __m128i *) p); } \
62 \
63 static_always_inline void \
64 t##s##x##c##_store_unaligned (t##s##x##c v, void *p) \
65 { _mm_storeu_si128 ((__m128i *) p, (__m128i) v); } \
66 \
67 static_always_inline int \
68 t##s##x##c##_is_all_zero (t##s##x##c x) \
69 { return _mm_testz_si128 ((__m128i) x, (__m128i) x); } \
70 \
71 static_always_inline int \
72 t##s##x##c##_is_equal (t##s##x##c a, t##s##x##c b) \
73 { return t##s##x##c##_is_all_zero (a ^ b); } \
74 \
75 static_always_inline int \
76 t##s##x##c##_is_all_equal (t##s##x##c v, t##s x) \
77 { return t##s##x##c##_is_equal (v, t##s##x##c##_splat (x)); }
78 
80 #undef _
81 
82 /* min, max */
/* Generator for lane-wise minimum / maximum.  _(t, s, c, i) defines
   <t><s>x<c>_min and <t><s>x<c>_max by pasting the suffix i into
   _mm_min_<i> / _mm_max_<i>.  It is instantiated below for all signed
   (epi*) and unsigned (epu*) 8/16/32/64-bit lane types.
   NOTE(review): the 64-bit rows expand to _mm_min/max_epi64 and
   _mm_min/max_epu64, which look like AVX-512 rather than SSE4.2
   intrinsics -- confirm these are only called where that ISA is on. */
83 #define _(t, s, c, i) \
84 static_always_inline t##s##x##c \
85 t##s##x##c##_min (t##s##x##c a, t##s##x##c b) \
86 { return (t##s##x##c) _mm_min_##i ((__m128i) a, (__m128i) b); } \
87 \
88 static_always_inline t##s##x##c \
89 t##s##x##c##_max (t##s##x##c a, t##s##x##c b) \
90 { return (t##s##x##c) _mm_max_##i ((__m128i) a, (__m128i) b); } \
91 
92 _(i,8,16,epi8) _(i,16,8,epi16) _(i,32,4,epi32) _(i,64,2,epi64)
93 _(u,8,16,epu8) _(u,16,8,epu16) _(u,32,4,epu32) _(u,64,2,epu64)
94 #undef _
95 /* *INDENT-ON* */
96 
97 #define CLIB_VEC128_SPLAT_DEFINED
98 #define CLIB_HAVE_VEC128_UNALIGNED_LOAD_STORE
99 
100 /* 128 bit interleaves. */
/* Interleave the bytes of the upper halves of a and b
   (wraps _mm_unpackhi_epi8). */
101 always_inline u8x16
102 u8x16_interleave_hi (u8x16 a, u8x16 b)
103 {
104  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
105 }
106 
/* Interleave the bytes of the lower halves of a and b
   (wraps _mm_unpacklo_epi8). */
107 always_inline u8x16
108 u8x16_interleave_lo (u8x16 a, u8x16 b)
109 {
110  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
111 }
112 
/* Interleave the 16-bit lanes of the upper halves of a and b
   (wraps _mm_unpackhi_epi16). */
113 always_inline u16x8
114 u16x8_interleave_hi (u16x8 a, u16x8 b)
115 {
116  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
117 }
118 
/* Interleave the 16-bit lanes of the lower halves of a and b
   (wraps _mm_unpacklo_epi16). */
119 always_inline u16x8
120 u16x8_interleave_lo (u16x8 a, u16x8 b)
121 {
122  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
123 }
124 
127 {
128  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
129 }
130 
133 {
134  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
135 }
136 
139 {
140  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
141 }
142 
145 {
146  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
147 }
148 
149 /* 64 bit interleaves. */
/* Interleave the bytes of the upper halves of two 64-bit vectors
   (wraps the MMX _m_punpckhbw intrinsic). */
150 always_inline u8x8
151 u8x8_interleave_hi (u8x8 a, u8x8 b)
152 {
153  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
154 }
155 
/* Interleave the bytes of the lower halves of two 64-bit vectors
   (wraps the MMX _m_punpcklbw intrinsic). */
156 always_inline u8x8
157 u8x8_interleave_lo (u8x8 a, u8x8 b)
158 {
159  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
160 }
161 
/* Interleave the 16-bit lanes of the upper halves of two 64-bit vectors
   (wraps the MMX _m_punpckhwd intrinsic). */
162 always_inline u16x4
163 u16x4_interleave_hi (u16x4 a, u16x4 b)
164 {
165  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
166 }
167 
/* Interleave the 16-bit lanes of the lower halves of two 64-bit vectors
   (wraps the MMX _m_punpcklwd intrinsic). */
168 always_inline u16x4
169 u16x4_interleave_lo (u16x4 a, u16x4 b)
170 {
171  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
172 }
173 
/* Combine the upper 32-bit lanes of a and b into one 64-bit vector
   (wraps the MMX _m_punpckhdq intrinsic). */
174 always_inline u32x2
175 u32x2_interleave_hi (u32x2 a, u32x2 b)
176 {
177  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
178 }
179 
/* Combine the lower 32-bit lanes of a and b into one 64-bit vector
   (wraps the MMX _m_punpckldq intrinsic). */
180 always_inline u32x2
181 u32x2_interleave_lo (u32x2 a, u32x2 b)
182 {
183  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
184 }
185 
186 /* 128 bit packs. */
/* Narrow two u16x8 vectors into a single u8x16 with _mm_packus_epi16;
   lo and hi are passed as the first and second source operands.
   NOTE(review): packus reads its sources as signed 16-bit values, so
   lanes >= 0x8000 would presumably clamp to 0 instead of 0xff --
   confirm callers never rely on the upper range. */
187 always_inline u8x16
188 u16x8_pack (u16x8 lo, u16x8 hi)
189 {
190  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
191 }
192 
193 always_inline i8x16
195 {
196  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
197 }
198 
199 always_inline u16x8
201 {
202  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
203 }
204 
205 /* 64 bit packs. */
/* Narrow two u16x4 vectors into a single u8x8 with the MMX _m_packuswb
   intrinsic (unsigned-saturating byte pack of signed 16-bit sources). */
206 always_inline u8x8
207 u16x4_pack (u16x4 lo, u16x4 hi)
208 {
209  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
210 }
211 
/* Narrow two i16x4 vectors into a single i8x8 with the MMX _m_packsswb
   intrinsic (signed-saturating byte pack). */
212 always_inline i8x8
213 i16x4_pack (i16x4 lo, i16x4 hi)
214 {
215  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
216 }
217 
/* Narrow two u32x2 vectors into a single u16x4 with the MMX _m_packssdw
   intrinsic.
   NOTE(review): _m_packssdw is the same signed-saturating pack used by
   i32x2_pack, so unsigned inputs above 0x7fff are presumably not
   preserved -- confirm this is what callers expect. */
218 always_inline u16x4
219 u32x2_pack (u32x2 lo, u32x2 hi)
220 {
221  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
222 }
223 
/* Narrow two i32x2 vectors into a single i16x4 with the MMX _m_packssdw
   intrinsic (signed-saturating word pack). */
224 always_inline i16x4
225 i32x2_pack (i32x2 lo, i32x2 hi)
226 {
227  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
228 }
229 
230 #ifndef __ICC
233 {
234  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
235 }
236 
239 {
240  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
241 }
242 
243 always_inline void
245 {
246  _mm_storel_pi ((__m64 *) a, (__m128) x);
247 }
248 
249 always_inline void
251 {
252  _mm_storeh_pi ((__m64 *) a, (__m128) x);
253 }
254 #endif
255 
/* Saturating add / subtract for 8- and 16-bit lanes.
   _signed_binop (n, m, f, g) defines u<n>x<m>_<f> and i<n>x<m>_<f>,
   where n is the lane width in bits, m the lane count, f the function
   name suffix and g the intrinsic stem (adds or subs).  The unsigned
   variant expands to _mm_<g>_epu<n> and the signed variant to
   _mm_<g>_epi<n>, so each type saturates at the limits of its own
   range; previously both variants used the unsigned intrinsic. */
256 #define _signed_binop(n,m,f,g) \
257  /* Unsigned */ \
258  always_inline u##n##x##m \
259  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
260  { return (u##n##x##m) _mm_##g##_epu##n ((__m128i) x, (__m128i) y); } \
261  \
262  /* Signed */ \
263  always_inline i##n##x##m \
264  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
265  { return (i##n##x##m) _mm_##g##_epi##n ((__m128i) x, (__m128i) y); }
266 /* Addition/subtraction with saturation. */
267 _signed_binop (8, 16, add_saturate, adds)
268 _signed_binop (16, 8, add_saturate, adds)
269 _signed_binop (8, 16, sub_saturate, subs)
270 _signed_binop (16, 8, sub_saturate, subs)
271 /* Multiplication. */
273 {
274  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
275 }
276 
/* Lane-wise 16-bit multiply that keeps the low 16 bits of each product
   (wraps _mm_mullo_epi16). */
277 always_inline u16x8
278 u16x8_mul_lo (u16x8 x, u16x8 y)
279 {
280  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
281 }
282 
285 {
286  return (i16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
287 }
288 
/* Lane-wise unsigned 16-bit multiply that keeps the high 16 bits of each
   product (wraps _mm_mulhi_epu16). */
289 always_inline u16x8
290 u16x8_mul_hi (u16x8 x, u16x8 y)
291 {
292  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
293 }
294 
295 /* 128 bit shifts. */
296 
/* Generator for the 128-bit lane shifts.  _(p, a, b, c, f) takes the
   type prefix p, lane width a, lane count b, direction name c and the
   intrinsic stem f, and defines for <p><a>x<b>:
     _ishift_<c> (x, i)  shift by the scalar count i  (_mm_<f>i_epi<a>)
     _shift_<c>  (x, y)  shift by the count operand y (_mm_<f>_epi<a>)
   Unsigned right shifts use srl, signed right shifts use sra; the list
   below instantiates no right shift for i64x2. */
297 #define _(p,a,b,c,f) \
298  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
299  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
300  \
301  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
302  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }
303 
304 _(u, 16, 8, left, sll)
305 _(u, 32, 4, left, sll)
306 _(u, 64, 2, left, sll)
307 _(u, 16, 8, right, srl)
308 _(u, 32, 4, right, srl)
309 _(u, 64, 2, right, srl)
310 _(i, 16, 8, left, sll)
311 _(i, 32, 4, left, sll)
312 _(i, 64, 2, left, sll) _(i, 16, 8, right, sra) _(i, 32, 4, right, sra)
313 #undef _
314 /* 64 bit shifts. */
/* Logical left shift of the 16-bit lanes of x; the shift count is taken
   from i (wraps the MMX _m_psllw intrinsic). */
315 always_inline u16x4
316 u16x4_shift_left (u16x4 x, u16x4 i)
317 {
318  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
319 }
320 
/* Logical left shift of the 32-bit lanes of x; the shift count is taken
   from i (wraps the MMX _m_pslld intrinsic). */
321 always_inline u32x2
322 u32x2_shift_left (u32x2 x, u32x2 i)
323 {
324  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
325 }
326 
/* Logical right shift of the 16-bit lanes of x; the shift count is taken
   from i (wraps the MMX _m_psrlw intrinsic). */
327 always_inline u16x4
328 u16x4_shift_right (u16x4 x, u16x4 i)
329 {
330  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
331 }
332 
/* Logical right shift of the 32-bit lanes of x; the shift count is taken
   from i (wraps the MMX _m_psrld intrinsic). */
333 always_inline u32x2
334 u32x2_shift_right (u32x2 x, u32x2 i)
335 {
336  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
337 }
338 
/* Left shift of the 16-bit lanes of x; the shift count is taken from i
   (wraps the MMX _m_psllw intrinsic). */
339 always_inline i16x4
340 i16x4_shift_left (i16x4 x, i16x4 i)
341 {
342  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
343 }
344 
/* Left shift of the 32-bit lanes of x; the shift count is taken from i
   (wraps the MMX _m_pslld intrinsic). */
345 always_inline i32x2
346 i32x2_shift_left (i32x2 x, i32x2 i)
347 {
348  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
349 }
350 
/* Arithmetic (sign-preserving) right shift of the 16-bit lanes of x; the
   shift count is taken from i (wraps the MMX _m_psraw intrinsic). */
351 always_inline i16x4
352 i16x4_shift_right (i16x4 x, i16x4 i)
353 {
354  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
355 }
356 
/* Arithmetic (sign-preserving) right shift of the 32-bit lanes of x; the
   shift count is taken from i (wraps the MMX _m_psrad intrinsic). */
357 always_inline i32x2
358 i32x2_shift_right (i32x2 x, i32x2 i)
359 {
360  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
361 }
362 
/* Whole-register shifts by n elements ("words") of the named lane type.
   Everything is built on the byte shifts _mm_slli_si128 /
   _mm_srli_si128: wider lane types scale n by the element size.  Each
   macro fully parenthesizes its arguments and its expansion, and yields
   a value of the vector type named in the macro (the signed _left
   variants previously produced the unsigned type). */
363 #define u8x16_word_shift_left(a,n) ((u8x16) _mm_slli_si128((__m128i) (a), (n)))
364 #define u8x16_word_shift_right(a,n) ((u8x16) _mm_srli_si128((__m128i) (a), (n)))
365 
366 #define i8x16_word_shift_left(a,n) \
367  ((i8x16) u8x16_word_shift_left((u8x16) (a), (n)))
368 #define i8x16_word_shift_right(a,n) \
369  ((i8x16) u8x16_word_shift_right((u8x16) (a), (n)))
370 
371 #define u16x8_word_shift_left(a,n) \
372  ((u16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
373 #define i16x8_word_shift_left(a,n) \
374  ((i16x8) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u16)))
375 #define u16x8_word_shift_right(a,n) \
376  ((u16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
377 #define i16x8_word_shift_right(a,n) \
378  ((i16x8) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u16)))
379 
380 #define u32x4_word_shift_left(a,n) \
381  ((u32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
382 #define i32x4_word_shift_left(a,n) \
383  ((i32x4) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u32)))
384 #define u32x4_word_shift_right(a,n) \
385  ((u32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
386 #define i32x4_word_shift_right(a,n) \
387  ((i32x4) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u32)))
388 
389 #define u64x2_word_shift_left(a,n) \
390  ((u64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
391 #define i64x2_word_shift_left(a,n) \
392  ((i64x2) u8x16_word_shift_left((u8x16) (a), (n) * sizeof (u64)))
393 #define u64x2_word_shift_right(a,n) \
394  ((u64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
395 #define i64x2_word_shift_right(a,n) \
396  ((i64x2) u8x16_word_shift_right((u8x16) (a), (n) * sizeof (u64)))
397 
398 /* SSE2 has no rotate instructions: use shifts to simulate them. */
/* Generator for emulated bit rotates.  _(t, n, lr1, lr2) takes the
   element type t, lane count n, the rotate direction lr1 and the
   opposite direction lr2, and defines for <t>x<n>:
     _irotate_<lr1> (w, i)  rotate by the scalar count i, formed as
                            ishift_<lr1> (w, i) | ishift_<lr2> (w, BITS (t) - i);
                            i is ASSERT-checked to lie in [0, BITS (t)].
     _rotate_<lr1> (w, i)   same construction with a vector count, using
                            j - i where j is BITS (t) splatted to every lane.
   NOTE(review): the underlying _mm_sll / _mm_srl style intrinsics are
   understood to take one count from the low 64 bits of the count
   operand, while j - i is computed per lane -- verify how callers are
   expected to build i for the vector form. */
399 #define _(t,n,lr1,lr2) \
400  always_inline t##x##n \
401  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
402  { \
403  ASSERT (i >= 0 && i <= BITS (t)); \
404  return (t##x##n##_ishift_##lr1 (w, i) \
405  | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
406  } \
407  \
408  always_inline t##x##n \
409  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
410  { \
411  t##x##n j = t##x##n##_splat (BITS (t)); \
412  return (t##x##n##_shift_##lr1 (w, i) \
413  | t##x##n##_shift_##lr2 (w, j - i)); \
414  }
415 
416 _(u16, 8, left, right);
417 _(u16, 8, right, left);
418 _(u32, 4, left, right);
419 _(u32, 4, right, left);
420 _(u64, 2, left, right);
421 _(u64, 2, right, left);
422 
423 #undef _
424 
/* Generator for element-granular ("word") rotates; compiled only when
   __clang__ is not defined.  _(t, n, lr1, lr2) defines for <t>x<n>:
     _word_rotate2_<lr1> (w0, w1, i)  word-shift w0 by i elements in
                                      direction lr1 and OR in w1
                                      word-shifted by m - i elements in
                                      direction lr2, where m is the lane
                                      count; i is ASSERT-checked to lie
                                      in [0, m).
     _word_rotate_<lr1> (w0, i)       the same with w1 == w0, i.e. a
                                      rotate of a single vector.
   NOTE(review): the word-shift macros expand to byte-shift intrinsics
   that normally want a compile-time count; presumably this relies on i
   being constant after inlining, which would explain the clang guard --
   confirm. */
425 #ifndef __clang__
426 #define _(t,n,lr1,lr2) \
427  always_inline t##x##n \
428  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
429  { \
430  int m = sizeof (t##x##n) / sizeof (t); \
431  ASSERT (i >= 0 && i < m); \
432  return (t##x##n##_word_shift_##lr1 (w0, i) \
433  | t##x##n##_word_shift_##lr2 (w1, m - i)); \
434  } \
435  \
436  always_inline t##x##n \
437  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
438  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }
439 
440 _(u8, 16, left, right);
441 _(u8, 16, right, left);
442 _(u16, 8, left, right);
443 _(u16, 8, right, left);
444 _(u32, 4, left, right);
445 _(u32, 4, right, left);
446 _(u64, 2, left, right);
447 _(u64, 2, right, left);
448 
449 #undef _
450 #endif
451 
/* Shuffle the 32-bit lanes of A according to MASK.
   Statement-expression macro: A is evaluated once into a temporary, a
   pshufd instruction is emitted through inline asm with MASK as an
   immediate operand (the "i" constraint, so MASK must be a compile-time
   constant), and the shuffled u32x4 is the value of the expression. */
452 #define u32x4_select(A,MASK) \
453 ({ \
454  u32x4 _x, _y; \
455  _x = (A); \
456  asm volatile ("pshufd %[mask], %[x], %[y]" \
457  : /* outputs */ [y] "=x" (_y) \
458  : /* inputs */ [x] "x" (_x), [mask] "i" (MASK)); \
459  _y; \
460 })
461 
/* Broadcast lane i of x to all four lanes.  Builds a u32x4_select mask
   with i replicated in each of the four 2-bit selector fields; i is
   expanded four times and ends up in an immediate operand, so it must
   be a side-effect-free constant expression. */
462 #define u32x4_splat_word(x,i) \
463  u32x4_select ((x), (((i) << (2*0)) \
464  | ((i) << (2*1)) \
465  | ((i) << (2*2)) \
466  | ((i) << (2*3))))
467 
468 /* Extract low order 32 bit word. */
471 {
472  u32 result;
473  asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=r" (result)
474  : /* inputs */ [x] "x" (x));
475  return result;
476 }
477 
480 {
481  u32x4 result;
482  asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=x" (result)
483  : /* inputs */ [x] "r" (x));
484  return result;
485 }
486 
489 {
490  return (i32x4) u32x4_set0 ((u32) x);
491 }
492 
495 {
496  return (i32) u32x4_get0 ((u32x4) x);
497 }
498 
499 /* Converts all ones/zeros compare mask to bitmap. */
502 {
503  return _mm_movemask_epi8 ((__m128i) x);
504 }
505 
507 
510 {
511  u32 m = u8x16_compare_byte_mask ((u8x16) x);
512  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
513  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
514 }
515 
518 {
519  u8x16 zero = { 0 };
520  return u8x16_compare_byte_mask (x == zero);
521 }
522 
525 {
526  u16x8 zero = { 0 };
527  return u8x16_compare_byte_mask ((u8x16) (x == zero));
528 }
529 
532 {
533  u32x4 zero = { 0 };
534  return u8x16_compare_byte_mask ((u8x16) (x == zero));
535 }
536 
539 {
540  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
541  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
542  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
543  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
544  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
545 }
546 
549 {
550  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
551  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
552  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
553  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
554  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
555 }
556 
559 {
560  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
561  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
562  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
563  return _mm_extract_epi16 ((__m128i) x, 0);
564 }
565 
568 {
569  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
570  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
571  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
572  return _mm_extract_epi16 ((__m128i) x, 0);
573 }
574 
/* Wraps _mm_alignr_epi8 with operands (a, b, imm) and yields the result
   as a u8x16.  Arguments and the whole expansion are parenthesized so the
   macro is safe with compound argument expressions and inside larger
   expressions. */
575 #define u8x16_align_right(a, b, imm) \
576  ((u8x16) _mm_alignr_epi8 ((__m128i) (a), (__m128i) (b), (imm)))
577 
580 {
581  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
582  v = u32x4_min (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
583  return v[0];
584 }
585 
588 {
589  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
590  v = u32x4_max (v, (u32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
591  return v[0];
592 }
593 
596 {
597  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
598  v = i32x4_min (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
599  return v[0];
600 }
601 
604 {
605  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 8));
606  v = i32x4_max (v, (i32x4) u8x16_align_right ((u8x16) v, (u8x16) v, 4));
607  return v[0];
608 }
609 
611 u8x16_msb_mask (u8x16 v)
612 {
613  return _mm_movemask_epi8 ((__m128i) v);
614 }
615 
616 #define CLIB_HAVE_VEC128_MSB_MASK
617 
618 #undef _signed_binop
619 
622 {
623  u8x16 swap = {
624  3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
625  };
626  return (u32x4) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
627 }
628 
631 {
632  u8x16 swap = {
633  1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
634  };
635  return (u16x8) _mm_shuffle_epi8 ((__m128i) v, (__m128i) swap);
636 }
637 
640 {
641  return (u32x4) _mm_hadd_epi32 ((__m128i) v1, (__m128i) v2);
642 }
643 
645 u8x16_shuffle (u8x16 v, u8x16 m)
646 {
647  return (u8x16) _mm_shuffle_epi8 ((__m128i) v, (__m128i) m);
648 }
649 
651 u32x4_shuffle (u32x4 v, const int a, const int b, const int c, const int d)
652 {
653 #if defined(__clang__) || !__OPTIMIZE__
654  u32x4 r = { v[a], v[b], v[c], v[d] };
655  return r;
656 #else
657  return (u32x4) _mm_shuffle_epi32 ((__m128i) v,
658  a | b << 2 | c << 4 | d << 6);
659 #endif
660 }
661 
662 /* _extend_to_ */
663 /* *INDENT-OFF* */
/* Generator for widening conversions.  _(f, t, i) defines
   <f>_extend_to_<t> (x), which converts the source vector type f to the
   wider-lane type t by calling _mm_cvt<i>.  Unsigned sources use the
   epu* (zero-extending) conversions and signed sources the epi*
   (sign-extending) ones. */
664 #define _(f,t,i) \
665 static_always_inline t \
666 f##_extend_to_##t (f x) \
667 { return (t) _mm_cvt##i ((__m128i) x); }
668 
669 _(u8x16, u16x8, epu8_epi16)
670 _(u8x16, u32x4, epu8_epi32)
671 _(u8x16, u64x2, epu8_epi64)
672 _(u16x8, u32x4, epu16_epi32)
673 _(u16x8, u64x2, epu16_epi64)
674 _(u32x4, u64x2, epu32_epi64)
675 
676 _(i8x16, i16x8, epi8_epi16)
677 _(i8x16, i32x4, epi8_epi32)
678 _(i8x16, i64x2, epi8_epi64)
679 _(i16x8, i32x4, epi16_epi32)
680 _(i16x8, i64x2, epi16_epi64)
681 _(i32x4, i64x2, epi32_epi64)
682 #undef _
683 /* *INDENT-ON* */
684 
686 u64x2_gather (void *p0, void *p1)
687 {
688  u64x2 r = { *(u64 *) p0, *(u64 *) p1 };
689  return r;
690 }
691 
693 u32x4_gather (void *p0, void *p1, void *p2, void *p3)
694 {
695  u32x4 r = { *(u32 *) p0, *(u32 *) p1, *(u32 *) p2, *(u32 *) p3 };
696  return r;
697 }
698 
699 
701 u64x2_scatter (u64x2 r, void *p0, void *p1)
702 {
703  *(u64 *) p0 = r[0];
704  *(u64 *) p1 = r[1];
705 }
706 
708 u32x4_scatter (u32x4 r, void *p0, void *p1, void *p2, void *p3)
709 {
710  *(u32 *) p0 = r[0];
711  *(u32 *) p1 = r[1];
712  *(u32 *) p2 = r[2];
713  *(u32 *) p3 = r[3];
714 }
715 
717 u64x2_scatter_one (u64x2 r, int index, void *p)
718 {
719  *(u64 *) p = r[index];
720 }
721 
723 u32x4_scatter_one (u32x4 r, int index, void *p)
724 {
725  *(u32 *) p = r[index];
726 }
727 
729 u8x16_is_greater (u8x16 v1, u8x16 v2)
730 {
731  return (u8x16) _mm_cmpgt_epi8 ((__m128i) v1, (__m128i) v2);
732 }
733 
735 u8x16_blend (u8x16 v1, u8x16 v2, u8x16 mask)
736 {
737  return (u8x16) _mm_blendv_epi8 ((__m128i) v1, (__m128i) v2, (__m128i) mask);
738 }
739 
740 
741 #endif /* included_vector_sse2_h */
742 
743 /*
744  * fd.io coding-style-patch-verification: ON
745  *
746  * Local Variables:
747  * eval: (c-set-style "gnu")
748  * End:
749  */
static u32x2 u32x2_interleave_hi(u32x2 a, u32x2 b)
Definition: vector_sse42.h:175
#define u8x16_word_shift_right(a, n)
Definition: vector_sse42.h:364
static u64x2 u64x2_interleave_hi(u64x2 a, u64x2 b)
Definition: vector_sse42.h:138
vmrglw vmrglh hi
static u32x4 u32x4_interleave_lo(u32x4 a, u32x4 b)
Definition: vector_sse42.h:132
static u16x8 u16x8_mul_lo(u16x8 x, u16x8 y)
Definition: vector_sse42.h:278
#define u8x16_align_right(a, b, imm)
Definition: vector_sse42.h:575
static_always_inline u32 u32x4_min_scalar(u32x4 v)
Definition: vector_sse42.h:579
sll right
Definition: vector_sse42.h:307
static i16 i16x8_max_scalar(i16x8 x)
Definition: vector_sse42.h:558
a
Definition: bitmap.h:538
static u64x2 u64x2_interleave_lo(u64x2 a, u64x2 b)
Definition: vector_sse42.h:144
static_always_inline void u64x2_scatter_one(u64x2 r, int index, void *p)
Definition: vector_sse42.h:717
static_always_inline u32x4 u32x4_byte_swap(u32x4 v)
Definition: vector_sse42.h:621
static u8x8 u16x4_pack(u16x4 lo, u16x4 hi)
Definition: vector_sse42.h:207
unsigned long u64
Definition: types.h:89
static u8x16 u16x8_pack(u16x8 lo, u16x8 hi)
Definition: vector_sse42.h:188
static u16x4 u16x4_interleave_hi(u16x4 a, u16x4 b)
Definition: vector_sse42.h:163
static i8x16 i16x8_pack(i16x8 lo, i16x8 hi)
Definition: vector_sse42.h:194
#define foreach_sse42_vec128i
Definition: vector_sse42.h:45
static_always_inline u32x4 u32x4_gather(void *p0, void *p1, void *p2, void *p3)
Definition: vector_sse42.h:693
static u8x8 u8x8_interleave_hi(u8x8 a, u8x8 b)
Definition: vector_sse42.h:151
static i16x4 i16x4_shift_right(i16x4 x, i16x4 i)
Definition: vector_sse42.h:352
static i16x8 i16x8_mul_hi(i16x8 x, i16x8 y)
Definition: vector_sse42.h:284
static u16x8 u16x8_interleave_hi(u16x8 a, u16x8 b)
Definition: vector_sse42.h:114
static_always_inline u8x16 u8x16_blend(u8x16 v1, u8x16 v2, u8x16 mask)
Definition: vector_sse42.h:735
adds_epu sub_saturate
Definition: vector_sse42.h:270
static void u64x2_write_hi(u64x2 x, u64 *a)
Definition: vector_sse42.h:250
static u16x4 u16x4_interleave_lo(u16x4 a, u16x4 b)
Definition: vector_sse42.h:169
unsigned char u8
Definition: types.h:56
static u32 u16x8_zero_byte_mask(u16x8 x)
Definition: vector_sse42.h:524
static u16x8 u16x8_interleave_lo(u16x8 a, u16x8 b)
Definition: vector_sse42.h:120
static u32x2 u32x2_interleave_lo(u32x2 a, u32x2 b)
Definition: vector_sse42.h:181
#define static_always_inline
Definition: clib.h:100
static u16x8 u32x4_pack(u32x4 lo, u32x4 hi)
Definition: vector_sse42.h:200
static_always_inline void u32x4_scatter(u32x4 r, void *p0, void *p1, void *p2, void *p3)
Definition: vector_sse42.h:708
#define always_inline
Definition: clib.h:99
static_always_inline u16 u8x16_msb_mask(u8x16 v)
Definition: vector_sse42.h:611
static i32x2 i32x2_shift_left(i32x2 x, i32x2 i)
Definition: vector_sse42.h:346
static u32 u32x4_get0(u32x4 x)
Definition: vector_sse42.h:470
static_always_inline u32 u32x4_max_scalar(u32x4 v)
Definition: vector_sse42.h:587
unsigned int u32
Definition: types.h:88
epu8_epi32 epu16_epi32 epu32_epi64 i32x4
Definition: vector_sse42.h:677
static i32x4 i32x4_set0(i32 x)
Definition: vector_sse42.h:488
adds_epu static subs_epu i16x8 i16x8_mul_lo(i16x8 x, i16x8 y)
Definition: vector_sse42.h:272
epu8_epi32 epu16_epi32 u64x2
Definition: vector_sse42.h:674
static u8x16 u8x16_interleave_hi(u8x16 a, u8x16 b)
Definition: vector_sse42.h:102
epu8_epi32 epu16_epi32 epu32_epi64 epi8_epi32 epi16_epi32 epi32_epi64 static_always_inline u64x2 u64x2_gather(void *p0, void *p1)
Definition: vector_sse42.h:686
static i16 i16x8_min_scalar(i16x8 x)
Definition: vector_sse42.h:567
static_always_inline u16x8 u16x8_byte_swap(u16x8 v)
Definition: vector_sse42.h:630
lo
static u8 u8x16_min_scalar(u8x16 x)
Definition: vector_sse42.h:548
static const __m128i zero
Definition: aes_gcm.c:39
unsigned short u16
Definition: types.h:57
static_always_inline u32x4 u32x4_shuffle(u32x4 v, const int a, const int b, const int c, const int d)
Definition: vector_sse42.h:651
static_always_inline void u64x2_scatter(u64x2 r, void *p0, void *p1)
Definition: vector_sse42.h:701
static_always_inline u8x16 u8x16_shuffle(u8x16 v, u8x16 m)
Definition: vector_sse42.h:645
static u32x2 u32x2_shift_left(u32x2 x, u32x2 i)
Definition: vector_sse42.h:322
#define i16x8_word_shift_right(a, n)
Definition: vector_sse42.h:377
static u32 u8x16_zero_byte_mask(u8x16 x)
Definition: vector_sse42.h:517
add_saturate
Definition: vector_sse42.h:268
svmdb_client_t * c
sll srl srl sll sra u16x4 i
Definition: vector_sse42.h:317
static i32 i32x4_get0(i32x4 x)
Definition: vector_sse42.h:494
static u32x4 u32x4_interleave_hi(u32x4 a, u32x4 b)
Definition: vector_sse42.h:126
epu8_epi32 epu16_epi32 epu32_epi64 epi8_epi32 epi16_epi32 i64x2
Definition: vector_sse42.h:681
signed int i32
Definition: types.h:77
static u32 u8x16_compare_byte_mask(u8x16 x)
Definition: vector_sse42.h:501
u8 u32x4_compare_word_mask_table[256]
static_always_inline u32 i32x4_max_scalar(i32x4 v)
Definition: vector_sse42.h:603
static i16x4 i16x4_shift_left(i16x4 x, i16x4 i)
Definition: vector_sse42.h:340
static i8x8 i16x4_pack(i16x4 lo, i16x4 hi)
Definition: vector_sse42.h:213
static u32 u8x16_max_scalar(u8x16 x)
Definition: vector_sse42.h:538
static u32x2 u32x2_shift_right(u32x2 x, u32x2 i)
Definition: vector_sse42.h:334
vmrglw i16x8
left
Definition: vector_sse42.h:305
static i16x4 i32x2_pack(i32x2 lo, i32x2 hi)
Definition: vector_sse42.h:225
static u16x4 u32x2_pack(u32x2 lo, u32x2 hi)
Definition: vector_sse42.h:219
#define foreach_sse42_vec128u
Definition: vector_sse42.h:47
static_always_inline u8x16 u8x16_is_greater(u8x16 v1, u8x16 v2)
Definition: vector_sse42.h:729
static_always_inline void u32x4_scatter_one(u32x4 r, int index, void *p)
Definition: vector_sse42.h:723
static u16x4 u16x4_shift_right(u16x4 x, u16x4 i)
Definition: vector_sse42.h:328
static_always_inline u32 i32x4_min_scalar(i32x4 v)
Definition: vector_sse42.h:595
static u8x8 u8x8_interleave_lo(u8x8 a, u8x8 b)
Definition: vector_sse42.h:157
static u32 u32x4_compare_word_mask(u32x4 x)
Definition: vector_sse42.h:509
static u8x16 u8x16_interleave_lo(u8x16 a, u8x16 b)
Definition: vector_sse42.h:108
static u64x2 u64x2_read_hi(u64x2 x, u64 *a)
Definition: vector_sse42.h:238
static u16x8 u16x8_mul_hi(u16x8 x, u16x8 y)
Definition: vector_sse42.h:290
static i32x2 i32x2_shift_right(i32x2 x, i32x2 i)
Definition: vector_sse42.h:358
static void u64x2_write_lo(u64x2 x, u64 *a)
Definition: vector_sse42.h:244
unsigned long long u32x4
Definition: ixge.c:28
static u32x4 u32x4_set0(u32 x)
Definition: vector_sse42.h:479
static u64x2 u64x2_read_lo(u64x2 x, u64 *a)
Definition: vector_sse42.h:232
static u32 u32x4_zero_byte_mask(u32x4 x)
Definition: vector_sse42.h:531
static_always_inline u32x4 u32x4_hadd(u32x4 v1, u32x4 v2)
Definition: vector_sse42.h:639
signed short i16
Definition: types.h:46