FD.io VPP  v17.04-9-g99c0734
Vector Packet Processing
vector_sse2.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Cisco and/or its affiliates.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at:
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 /*
16  Copyright (c) 2005 Eliot Dresselhaus
17 
18  Permission is hereby granted, free of charge, to any person obtaining
19  a copy of this software and associated documentation files (the
20  "Software"), to deal in the Software without restriction, including
21  without limitation the rights to use, copy, modify, merge, publish,
22  distribute, sublicense, and/or sell copies of the Software, and to
23  permit persons to whom the Software is furnished to do so, subject to
24  the following conditions:
25 
26  The above copyright notice and this permission notice shall be
27  included in all copies or substantial portions of the Software.
28 
29  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
30  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
31  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
33  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
34  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
35  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36 */
37 
38 #ifndef included_vector_sse2_h
39 #define included_vector_sse2_h
40 
41 #include <vppinfra/error_bootstrap.h> /* for ASSERT */
42 #include <x86intrin.h>
43 
44 /* 128 bit interleaves. */
45 always_inline u8x16
46 u8x16_interleave_hi (u8x16 a, u8x16 b)
47 {
48  return (u8x16) _mm_unpackhi_epi8 ((__m128i) a, (__m128i) b);
49 }
50 
51 always_inline u8x16
52 u8x16_interleave_lo (u8x16 a, u8x16 b)
53 {
54  return (u8x16) _mm_unpacklo_epi8 ((__m128i) a, (__m128i) b);
55 }
56 
57 always_inline u16x8
58 u16x8_interleave_hi (u16x8 a, u16x8 b)
59 {
60  return (u16x8) _mm_unpackhi_epi16 ((__m128i) a, (__m128i) b);
61 }
62 
63 always_inline u16x8
64 u16x8_interleave_lo (u16x8 a, u16x8 b)
65 {
66  return (u16x8) _mm_unpacklo_epi16 ((__m128i) a, (__m128i) b);
67 }
68 
71 {
72  return (u32x4) _mm_unpackhi_epi32 ((__m128i) a, (__m128i) b);
73 }
74 
77 {
78  return (u32x4) _mm_unpacklo_epi32 ((__m128i) a, (__m128i) b);
79 }
80 
81 always_inline u64x2
82 u64x2_interleave_hi (u64x2 a, u64x2 b)
83 {
84  return (u64x2) _mm_unpackhi_epi64 ((__m128i) a, (__m128i) b);
85 }
86 
87 always_inline u64x2
88 u64x2_interleave_lo (u64x2 a, u64x2 b)
89 {
90  return (u64x2) _mm_unpacklo_epi64 ((__m128i) a, (__m128i) b);
91 }
92 
93 /* 64 bit interleaves. */
94 always_inline u8x8
95 u8x8_interleave_hi (u8x8 a, u8x8 b)
96 {
97  return (u8x8) _m_punpckhbw ((__m64) a, (__m64) b);
98 }
99 
100 always_inline u8x8
101 u8x8_interleave_lo (u8x8 a, u8x8 b)
102 {
103  return (u8x8) _m_punpcklbw ((__m64) a, (__m64) b);
104 }
105 
106 always_inline u16x4
107 u16x4_interleave_hi (u16x4 a, u16x4 b)
108 {
109  return (u16x4) _m_punpckhwd ((__m64) a, (__m64) b);
110 }
111 
112 always_inline u16x4
113 u16x4_interleave_lo (u16x4 a, u16x4 b)
114 {
115  return (u16x4) _m_punpcklwd ((__m64) a, (__m64) b);
116 }
117 
118 always_inline u32x2
119 u32x2_interleave_hi (u32x2 a, u32x2 b)
120 {
121  return (u32x2) _m_punpckhdq ((__m64) a, (__m64) b);
122 }
123 
124 always_inline u32x2
125 u32x2_interleave_lo (u32x2 a, u32x2 b)
126 {
127  return (u32x2) _m_punpckldq ((__m64) a, (__m64) b);
128 }
129 
130 /* 128 bit packs. */
131 always_inline u8x16
132 u16x8_pack (u16x8 lo, u16x8 hi)
133 {
134  return (u8x16) _mm_packus_epi16 ((__m128i) lo, (__m128i) hi);
135 }
136 
137 always_inline i8x16
139 {
140  return (i8x16) _mm_packs_epi16 ((__m128i) lo, (__m128i) hi);
141 }
142 
143 always_inline u16x8
145 {
146  return (u16x8) _mm_packs_epi32 ((__m128i) lo, (__m128i) hi);
147 }
148 
149 /* 64 bit packs. */
150 always_inline u8x8
151 u16x4_pack (u16x4 lo, u16x4 hi)
152 {
153  return (u8x8) _m_packuswb ((__m64) lo, (__m64) hi);
154 }
155 
156 always_inline i8x8
157 i16x4_pack (i16x4 lo, i16x4 hi)
158 {
159  return (i8x8) _m_packsswb ((__m64) lo, (__m64) hi);
160 }
161 
162 always_inline u16x4
163 u32x2_pack (u32x2 lo, u32x2 hi)
164 {
165  return (u16x4) _m_packssdw ((__m64) lo, (__m64) hi);
166 }
167 
168 always_inline i16x4
169 i32x2_pack (i32x2 lo, i32x2 hi)
170 {
171  return (i16x4) _m_packssdw ((__m64) lo, (__m64) hi);
172 }
173 
174 /* Splats: replicate scalar value into vector. */
175 always_inline u64x2
177 {
178  u64x2 x = { a };
179  x = u64x2_interleave_lo (x, x);
180  return x;
181 }
182 
185 {
186  u32x4 x = { a };
187  x = u32x4_interleave_lo (x, x);
188  x = (u32x4) u64x2_interleave_lo ((u64x2) x, (u64x2) x);
189  return x;
190 }
191 
192 always_inline u16x8
194 {
195  u32 t = (u32) a | ((u32) a << 16);
196  return (u16x8) u32x4_splat (t);
197 }
198 
199 always_inline u8x16
201 {
202  u32 t = (u32) a | ((u32) a << 8);
203  t |= t << 16;
204  return (u8x16) u16x8_splat (t);
205 }
206 
207 always_inline u32x2
209 {
210  u32x2 x = { a };
211  x = u32x2_interleave_lo (x, x);
212  return x;
213 }
214 
215 always_inline u16x4
217 {
218  u32 t = (u32) a | ((u32) a << 16);
219  return (u16x4) u32x2_splat (t);
220 }
221 
222 always_inline u8x8
224 {
225  u32 t = (u32) a | ((u32) a << 8);
226  t |= t << 16;
227  return (u8x8) u32x2_splat (t);
228 }
229 
230 #define i64x2_splat u64x2_splat
231 #define i32x4_splat u32x4_splat
232 #define i16x8_splat u16x8_splat
233 #define i8x16_splat u8x16_splat
234 #define i32x2_splat u32x2_splat
235 #define i16x4_splat u16x4_splat
236 #define i8x8_splat u8x8_splat
237 
238 #ifndef __ICC
239 always_inline u64x2
240 u64x2_read_lo (u64x2 x, u64 * a)
241 {
242  return (u64x2) _mm_loadl_pi ((__m128) x, (__m64 *) a);
243 }
244 
245 always_inline u64x2
246 u64x2_read_hi (u64x2 x, u64 * a)
247 {
248  return (u64x2) _mm_loadh_pi ((__m128) x, (__m64 *) a);
249 }
250 
251 always_inline void
252 u64x2_write_lo (u64x2 x, u64 * a)
253 {
254  _mm_storel_pi ((__m64 *) a, (__m128) x);
255 }
256 
257 always_inline void
258 u64x2_write_hi (u64x2 x, u64 * a)
259 {
260  _mm_storeh_pi ((__m64 *) a, (__m128) x);
261 }
262 #endif
263 
264 /* Unaligned loads/stores. */
265 
266 #define _(t) \
267  always_inline void t##_store_unaligned (t x, t * a) \
268  { _mm_storeu_si128 ((__m128i *) a, (__m128i) x); } \
269  always_inline t t##_load_unaligned (t * a) \
270  { return (t) _mm_loadu_si128 ((__m128i *) a); }
271 
272 _(u8x16) _(u16x8) _(u32x4) _(u64x2) _(i8x16) _(i16x8) _(i32x4) _(i64x2)
273 #undef _
274 #define _signed_binop(n,m,f,g) \
275  /* Unsigned */ \
276  always_inline u##n##x##m \
277  u##n##x##m##_##f (u##n##x##m x, u##n##x##m y) \
278  { return (u##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); } \
279  \
280  /* Signed */ \
281  always_inline i##n##x##m \
282  i##n##x##m##_##f (i##n##x##m x, i##n##x##m y) \
283  { return (i##n##x##m) _mm_##g##n ((__m128i) x, (__m128i) y); }
284 /* Addition/subtraction. */
285  _signed_binop (8, 16, add, add_epi)
286 _signed_binop (16, 8, add, add_epi)
287 _signed_binop (32, 4, add, add_epi)
288 _signed_binop (64, 2, add, add_epi)
289 _signed_binop (8, 16, sub, sub_epi)
290 _signed_binop (16, 8, sub, sub_epi)
291 _signed_binop (32, 4, sub, sub_epi) _signed_binop (64, 2, sub, sub_epi)
292 /* Addition/subtraction with saturation. */
293  _signed_binop (8, 16, add_saturate, adds_epu)
294 _signed_binop (16, 8, add_saturate, adds_epu)
295 _signed_binop (8, 16, sub_saturate, subs_epu)
296 _signed_binop (16, 8, sub_saturate, subs_epu)
297 /* Multiplication. */
298  always_inline i16x8 i16x8_mul_lo (i16x8 x, i16x8 y)
299 {
300  return (i16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
301 }
302 
303 always_inline u16x8
304 u16x8_mul_lo (u16x8 x, u16x8 y)
305 {
306  return (u16x8) _mm_mullo_epi16 ((__m128i) x, (__m128i) y);
307 }
308 
311 {
312  return (i16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
313 }
314 
315 always_inline u16x8
316 u16x8_mul_hi (u16x8 x, u16x8 y)
317 {
318  return (u16x8) _mm_mulhi_epu16 ((__m128i) x, (__m128i) y);
319 }
320 
321 /* 128 bit shifts. */
322 
323 #define _(p,a,b,c,f) \
324  always_inline p##a##x##b p##a##x##b##_ishift_##c (p##a##x##b x, int i) \
325  { return (p##a##x##b) _mm_##f##i_epi##a ((__m128i) x, i); } \
326  \
327  always_inline p##a##x##b p##a##x##b##_shift_##c (p##a##x##b x, p##a##x##b y) \
328  { return (p##a##x##b) _mm_##f##_epi##a ((__m128i) x, (__m128i) y); }
329 
330 _(u, 16, 8, left, sll)
331 _(u, 32, 4, left, sll)
332 _(u, 64, 2, left, sll)
333 _(u, 16, 8, right, srl)
334 _(u, 32, 4, right, srl)
335 _(u, 64, 2, right, srl)
336 _(i, 16, 8, left, sll)
337 _(i, 32, 4, left, sll)
338 _(i, 64, 2, left, sll) _(i, 16, 8, right, sra) _(i, 32, 4, right, sra)
339 #undef _
340 /* 64 bit shifts. */
341  always_inline u16x4
342 u16x4_shift_left (u16x4 x, u16x4 i)
343 {
344  return (u16x4) _m_psllw ((__m64) x, (__m64) i);
345 };
346 
347 always_inline u32x2
348 u32x2_shift_left (u32x2 x, u32x2 i)
349 {
350  return (u32x2) _m_pslld ((__m64) x, (__m64) i);
351 };
352 
353 always_inline u16x4
354 u16x4_shift_right (u16x4 x, u16x4 i)
355 {
356  return (u16x4) _m_psrlw ((__m64) x, (__m64) i);
357 };
358 
359 always_inline u32x2
360 u32x2_shift_right (u32x2 x, u32x2 i)
361 {
362  return (u32x2) _m_psrld ((__m64) x, (__m64) i);
363 };
364 
365 always_inline i16x4
366 i16x4_shift_left (i16x4 x, i16x4 i)
367 {
368  return (i16x4) _m_psllw ((__m64) x, (__m64) i);
369 };
370 
371 always_inline i32x2
372 i32x2_shift_left (i32x2 x, i32x2 i)
373 {
374  return (i32x2) _m_pslld ((__m64) x, (__m64) i);
375 };
376 
377 always_inline i16x4
378 i16x4_shift_right (i16x4 x, i16x4 i)
379 {
380  return (i16x4) _m_psraw ((__m64) x, (__m64) i);
381 };
382 
383 always_inline i32x2
384 i32x2_shift_right (i32x2 x, i32x2 i)
385 {
386  return (i32x2) _m_psrad ((__m64) x, (__m64) i);
387 };
388 
/* Whole-register shifts by n elements, built on the byte-granular
   _mm_slli_si128 / _mm_srli_si128 (which take the byte count as an
   immediate).  Wider element types scale n by the element size.
   Every expansion is fully parenthesized and yields the same vector type
   as the macro's prefix (the signed *_word_shift_left variants yield the
   signed type). */
#define u8x16_word_shift_left(a,n) \
  ((u8x16) _mm_slli_si128 ((__m128i) (a), (n)))
#define u8x16_word_shift_right(a,n) \
  ((u8x16) _mm_srli_si128 ((__m128i) (a), (n)))

#define i8x16_word_shift_left(a,n) \
  ((i8x16) u8x16_word_shift_left ((u8x16) (a), (n)))
#define i8x16_word_shift_right(a,n) \
  ((i8x16) u8x16_word_shift_right ((u8x16) (a), (n)))

#define u16x8_word_shift_left(a,n) \
  ((u16x8) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_left(a,n) \
  ((i16x8) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u16)))
#define u16x8_word_shift_right(a,n) \
  ((u16x8) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u16)))
#define i16x8_word_shift_right(a,n) \
  ((i16x8) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u16)))

#define u32x4_word_shift_left(a,n) \
  ((u32x4) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_left(a,n) \
  ((i32x4) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u32)))
#define u32x4_word_shift_right(a,n) \
  ((u32x4) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u32)))
#define i32x4_word_shift_right(a,n) \
  ((i32x4) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u32)))

#define u64x2_word_shift_left(a,n) \
  ((u64x2) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_left(a,n) \
  ((i64x2) u8x16_word_shift_left ((u8x16) (a), (n) * sizeof (u64)))
#define u64x2_word_shift_right(a,n) \
  ((u64x2) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u64)))
#define i64x2_word_shift_right(a,n) \
  ((i64x2) u8x16_word_shift_right ((u8x16) (a), (n) * sizeof (u64)))
423 
424 /* SSE2 has no rotate instructions: use shifts to simulate them. */
425 #define _(t,n,lr1,lr2) \
426  always_inline t##x##n \
427  t##x##n##_irotate_##lr1 (t##x##n w, int i) \
428  { \
429  ASSERT (i >= 0 && i <= BITS (t)); \
430  return (t##x##n##_ishift_##lr1 (w, i) \
431  | t##x##n##_ishift_##lr2 (w, BITS (t) - i)); \
432  } \
433  \
434  always_inline t##x##n \
435  t##x##n##_rotate_##lr1 (t##x##n w, t##x##n i) \
436  { \
437  t##x##n j = t##x##n##_splat (BITS (t)); \
438  return (t##x##n##_shift_##lr1 (w, i) \
439  | t##x##n##_shift_##lr2 (w, j - i)); \
440  }
441 
442 _(u16, 8, left, right);
443 _(u16, 8, right, left);
444 _(u32, 4, left, right);
445 _(u32, 4, right, left);
446 _(u64, 2, left, right);
447 _(u64, 2, right, left);
448 
449 #undef _
450 
451 #ifndef __clang__
452 #define _(t,n,lr1,lr2) \
453  always_inline t##x##n \
454  t##x##n##_word_rotate2_##lr1 (t##x##n w0, t##x##n w1, int i) \
455  { \
456  int m = sizeof (t##x##n) / sizeof (t); \
457  ASSERT (i >= 0 && i < m); \
458  return (t##x##n##_word_shift_##lr1 (w0, i) \
459  | t##x##n##_word_shift_##lr2 (w1, m - i)); \
460  } \
461  \
462  always_inline t##x##n \
463  t##x##n##_word_rotate_##lr1 (t##x##n w0, int i) \
464  { return t##x##n##_word_rotate2_##lr1 (w0, w0, i); }
465 
466 _(u8, 16, left, right);
467 _(u8, 16, right, left);
468 _(u16, 8, left, right);
469 _(u16, 8, right, left);
470 _(u32, 4, left, right);
471 _(u32, 4, right, left);
472 _(u64, 2, left, right);
473 _(u64, 2, right, left);
474 
475 #undef _
476 #endif
477 
478 /* Compare operations. */
479 always_inline u8x16
480 u8x16_is_equal (u8x16 x, u8x16 y)
481 {
482  return (u8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
483 }
484 
485 always_inline i8x16
486 i8x16_is_equal (i8x16 x, i8x16 y)
487 {
488  return (i8x16) _mm_cmpeq_epi8 ((__m128i) x, (__m128i) y);
489 }
490 
491 always_inline u16x8
492 u16x8_is_equal (u16x8 x, u16x8 y)
493 {
494  return (u16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
495 }
496 
499 {
500  return (i16x8) _mm_cmpeq_epi16 ((__m128i) x, (__m128i) y);
501 }
502 
505 {
506  return (u32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
507 }
508 
511 {
512  return (i32x4) _mm_cmpeq_epi32 ((__m128i) x, (__m128i) y);
513 }
514 
515 always_inline u8x16
516 i8x16_is_greater (i8x16 x, i8x16 y)
517 {
518  return (u8x16) _mm_cmpgt_epi8 ((__m128i) x, (__m128i) y);
519 }
520 
521 always_inline u16x8
523 {
524  return (u16x8) _mm_cmpgt_epi16 ((__m128i) x, (__m128i) y);
525 }
526 
529 {
530  return (u32x4) _mm_cmpgt_epi32 ((__m128i) x, (__m128i) y);
531 }
532 
533 always_inline u8x16
534 u8x16_is_zero (u8x16 x)
535 {
536  u8x16 zero = { 0 };
537  return u8x16_is_equal (x, zero);
538 }
539 
540 always_inline u16x8
541 u16x8_is_zero (u16x8 x)
542 {
543  u16x8 zero = { 0 };
544  return u16x8_is_equal (x, zero);
545 }
546 
549 {
550  u32x4 zero = { 0 };
551  return u32x4_is_equal (x, zero);
552 }
553 
/* Shuffle the 32-bit lanes of A according to the immediate MASK (the
   pshufd instruction).  MASK must be a compile-time constant.
   Implemented with the _mm_shuffle_epi32 intrinsic rather than
   "asm volatile" so the compiler stays free to schedule, fold and
   eliminate the shuffle. */
#define u32x4_select(A,MASK) \
  ((u32x4) _mm_shuffle_epi32 ((__m128i) (u32x4) (A), (MASK)))
563 
/* Replicate 32-bit lane i of x: the lane index i is placed in each of the
   four 2-bit selector fields of the u32x4_select mask. */
#define u32x4_splat_word(x,i) \
  u32x4_select ((x), (((i) << 0) | ((i) << 2) | ((i) << 4) | ((i) << 6)))
569 
570 /* Extract low order 32 bit word. */
573 {
574  u32 result;
575  asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=r" (result)
576  : /* inputs */ [x] "x" (x));
577  return result;
578 }
579 
582 {
583  u32x4 result;
584  asm volatile ("movd %[x], %[result]": /* outputs */ [result] "=x" (result)
585  : /* inputs */ [x] "r" (x));
586  return result;
587 }
588 
591 {
592  return (i32x4) u32x4_set0 ((u32) x);
593 }
594 
597 {
598  return (i32) u32x4_get0 ((u32x4) x);
599 }
600 
601 /* Converts all ones/zeros compare mask to bitmap. */
604 {
605  return _mm_movemask_epi8 ((__m128i) x);
606 }
607 
609 
612 {
613  u32 m = u8x16_compare_byte_mask ((u8x16) x);
614  return (u32x4_compare_word_mask_table[(m >> 0) & 0xff]
615  | (u32x4_compare_word_mask_table[(m >> 8) & 0xff] << 2));
616 }
617 
620 {
621  u8x16 zero = { 0 };
622  return u8x16_compare_byte_mask (u8x16_is_equal (x, zero));
623 }
624 
627 {
628  u16x8 zero = { 0 };
629  return u8x16_compare_byte_mask ((u8x16) u16x8_is_equal (x, zero));
630 }
631 
634 {
635  u32x4 zero = { 0 };
636  return u8x16_compare_byte_mask ((u8x16) u32x4_is_equal (x, zero));
637 }
638 
639 always_inline u8x16
640 u8x16_max (u8x16 x, u8x16 y)
641 {
642  return (u8x16) _mm_max_epu8 ((__m128i) x, (__m128i) y);
643 }
644 
647 {
648  x = u8x16_max (x, u8x16_word_shift_right (x, 8));
649  x = u8x16_max (x, u8x16_word_shift_right (x, 4));
650  x = u8x16_max (x, u8x16_word_shift_right (x, 2));
651  x = u8x16_max (x, u8x16_word_shift_right (x, 1));
652  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
653 }
654 
655 always_inline u8x16
656 u8x16_min (u8x16 x, u8x16 y)
657 {
658  return (u8x16) _mm_min_epu8 ((__m128i) x, (__m128i) y);
659 }
660 
663 {
664  x = u8x16_min (x, u8x16_word_shift_right (x, 8));
665  x = u8x16_min (x, u8x16_word_shift_right (x, 4));
666  x = u8x16_min (x, u8x16_word_shift_right (x, 2));
667  x = u8x16_min (x, u8x16_word_shift_right (x, 1));
668  return _mm_extract_epi16 ((__m128i) x, 0) & 0xff;
669 }
670 
673 {
674  return (i16x8) _mm_max_epi16 ((__m128i) x, (__m128i) y);
675 }
676 
679 {
680  x = i16x8_max (x, i16x8_word_shift_right (x, 4));
681  x = i16x8_max (x, i16x8_word_shift_right (x, 2));
682  x = i16x8_max (x, i16x8_word_shift_right (x, 1));
683  return _mm_extract_epi16 ((__m128i) x, 0);
684 }
685 
688 {
689  return (i16x8) _mm_min_epi16 ((__m128i) x, (__m128i) y);
690 }
691 
694 {
695  x = i16x8_min (x, i16x8_word_shift_right (x, 4));
696  x = i16x8_min (x, i16x8_word_shift_right (x, 2));
697  x = i16x8_min (x, i16x8_word_shift_right (x, 1));
698  return _mm_extract_epi16 ((__m128i) x, 0);
699 }
700 
701 #undef _signed_binop
702 
703 #endif /* included_vector_sse2_h */
704 
705 /*
706  * fd.io coding-style-patch-verification: ON
707  *
708  * Local Variables:
709  * eval: (c-set-style "gnu")
710  * End:
711  */
static i16x8 i16x8_max(i16x8 x, i16x8 y)
Definition: vector_sse2.h:672
static u32x4 u32x4_is_equal(u32x4 x, u32x4 y)
Definition: vector_sse2.h:504
vmrglw vmrglh hi
#define i16x8_word_shift_right(a, n)
Definition: vector_sse2.h:403
sll srl srl sll sra u16x4 i
Definition: vector_sse2.h:343
static u8x16 u8x16_interleave_lo(u8x16 a, u8x16 b)
Definition: vector_sse2.h:52
#define u8x16_word_shift_right(a, n)
Definition: vector_sse2.h:390
a
Definition: bitmap.h:516
static i8x16 i16x8_pack(i16x8 lo, i16x8 hi)
Definition: vector_sse2.h:138
static i32x4 i32x4_set0(i32 x)
Definition: vector_sse2.h:590
static u64x2 u64x2_interleave_hi(u64x2 a, u64x2 b)
Definition: vector_sse2.h:82
static u32 u8x16_max_scalar(u8x16 x)
Definition: vector_sse2.h:646
static u16x8 u16x8_is_zero(u16x8 x)
Definition: vector_sse2.h:541
sll right
Definition: vector_sse2.h:333
add_epi add_epi sub
Definition: vector_sse2.h:289
static u16x4 u32x2_pack(u32x2 lo, u32x2 hi)
Definition: vector_sse2.h:163
static u64x2 u64x2_splat(u64 a)
Definition: vector_sse2.h:176
static u8x16 u8x16_max(u8x16 x, u8x16 y)
Definition: vector_sse2.h:640
static u8x16 i8x16_is_greater(i8x16 x, i8x16 y)
Definition: vector_sse2.h:516
static u32 u8x16_compare_byte_mask(u8x16 x)
Definition: vector_sse2.h:603
static u32x2 u32x2_shift_right(u32x2 x, u32x2 i)
Definition: vector_sse2.h:360
add_epi add_epi sub_epi sub_epi adds_epu subs_epu i16x8 y
Definition: vector_sse2.h:299
u8 u32x4_compare_word_mask_table[256]
add_epi add_epi sub_epi sub_epi add_saturate
Definition: vector_sse2.h:293
static u32 u32x4_zero_byte_mask(u32x4 x)
Definition: vector_sse2.h:633
static i16 i16x8_max_scalar(i16x8 x)
Definition: vector_sse2.h:678
static i16 i16x8_min_scalar(i16x8 x)
Definition: vector_sse2.h:693
static i16x8 i16x8_is_equal(i16x8 x, i16x8 y)
Definition: vector_sse2.h:498
static u32x2 u32x2_splat(u32 a)
Definition: vector_sse2.h:208
static i16x4 i16x4_shift_left(i16x4 x, i16x4 i)
Definition: vector_sse2.h:366
static u32x4 u32x4_splat(u32 a)
Definition: vector_sse2.h:184
static i16x4 i32x2_pack(i32x2 lo, i32x2 hi)
Definition: vector_sse2.h:169
static u32x2 u32x2_interleave_hi(u32x2 a, u32x2 b)
Definition: vector_sse2.h:119
i32x4
static u32 u32x4_compare_word_mask(u32x4 x)
Definition: vector_sse2.h:611
#define always_inline
Definition: clib.h:84
static u16x4 u16x4_interleave_lo(u16x4 a, u16x4 b)
Definition: vector_sse2.h:113
unsigned long long u32x4
Definition: ixge.c:28
int i32
Definition: types.h:81
static u8 u8x16_min_scalar(u8x16 x)
Definition: vector_sse2.h:662
static u32x4 i32x4_is_greater(i32x4 x, i32x4 y)
Definition: vector_sse2.h:528
unsigned long u64
Definition: types.h:89
static i16x8 i16x8_min(i16x8 x, i16x8 y)
Definition: vector_sse2.h:687
left
Definition: vector_sse2.h:331
static u16x8 u32x4_pack(u32x4 lo, u32x4 hi)
Definition: vector_sse2.h:144
static u16x4 u16x4_interleave_hi(u16x4 a, u16x4 b)
Definition: vector_sse2.h:107
static u16x8 u16x8_is_equal(u16x8 x, u16x8 y)
Definition: vector_sse2.h:492
add
Definition: vector_sse2.h:285
static u32x4 u32x4_set0(u32 x)
Definition: vector_sse2.h:581
static u8x8 u8x8_interleave_hi(u8x8 a, u8x8 b)
Definition: vector_sse2.h:95
static u8x16 u8x16_min(u8x16 x, u8x16 y)
Definition: vector_sse2.h:656
static u32x4 u32x4_interleave_hi(u32x4 a, u32x4 b)
Definition: vector_sse2.h:70
static u8x16 u8x16_interleave_hi(u8x16 a, u8x16 b)
Definition: vector_sse2.h:46
static i16x4 i16x4_shift_right(i16x4 x, i16x4 i)
Definition: vector_sse2.h:378
static u16x8 u16x8_mul_hi(u16x8 x, u16x8 y)
Definition: vector_sse2.h:316
static u32x4 u32x4_is_zero(u32x4 x)
Definition: vector_sse2.h:548
static u64x2 u64x2_interleave_lo(u64x2 a, u64x2 b)
Definition: vector_sse2.h:88
static u16x8 u16x8_interleave_lo(u16x8 a, u16x8 b)
Definition: vector_sse2.h:64
static i32x2 i32x2_shift_left(i32x2 x, i32x2 i)
Definition: vector_sse2.h:372
static u64x2 u64x2_read_lo(u64x2 x, u64 *a)
Definition: vector_sse2.h:240
static void u64x2_write_lo(u64x2 x, u64 *a)
Definition: vector_sse2.h:252
static i8x8 i16x4_pack(i16x4 lo, i16x4 hi)
Definition: vector_sse2.h:157
static u8x8 u16x4_pack(u16x4 lo, u16x4 hi)
Definition: vector_sse2.h:151
static u64x2 u64x2_read_hi(u64x2 x, u64 *a)
Definition: vector_sse2.h:246
static u16x8 u16x8_splat(u16 a)
Definition: vector_sse2.h:193
static u8x16 u8x16_is_equal(u8x16 x, u8x16 y)
Definition: vector_sse2.h:480
add_epi add_epi sub_epi sub_epi adds_epu sub_saturate
Definition: vector_sse2.h:295
unsigned int u32
Definition: types.h:88
static u32x4 u32x4_interleave_lo(u32x4 a, u32x4 b)
Definition: vector_sse2.h:76
static u8x16 u16x8_pack(u16x8 lo, u16x8 hi)
Definition: vector_sse2.h:132
vmrglw i16x8
static i32x2 i32x2_shift_right(i32x2 x, i32x2 i)
Definition: vector_sse2.h:384
static u32 u16x8_zero_byte_mask(u16x8 x)
Definition: vector_sse2.h:626
static u8x16 u8x16_splat(u8 a)
Definition: vector_sse2.h:200
static u32x2 u32x2_shift_left(u32x2 x, u32x2 i)
Definition: vector_sse2.h:348
static void u64x2_write_hi(u64x2 x, u64 *a)
Definition: vector_sse2.h:258
unsigned short u16
Definition: types.h:57
static u16x8 u16x8_interleave_hi(u16x8 a, u16x8 b)
Definition: vector_sse2.h:58
static i32 i32x4_get0(i32x4 x)
Definition: vector_sse2.h:596
static u8x8 u8x8_interleave_lo(u8x8 a, u8x8 b)
Definition: vector_sse2.h:101
static u32 u8x16_zero_byte_mask(u8x16 x)
Definition: vector_sse2.h:619
unsigned char u8
Definition: types.h:56
static u32 u32x4_get0(u32x4 x)
Definition: vector_sse2.h:572
static i32x4 i32x4_is_equal(i32x4 x, i32x4 y)
Definition: vector_sse2.h:510
static u16x8 i16x8_is_greater(i16x8 x, i16x8 y)
Definition: vector_sse2.h:522
short i16
Definition: types.h:46
static u32x2 u32x2_interleave_lo(u32x2 a, u32x2 b)
Definition: vector_sse2.h:125
static u8x16 u8x16_is_zero(u8x16 x)
Definition: vector_sse2.h:534
static u16x4 u16x4_shift_right(u16x4 x, u16x4 i)
Definition: vector_sse2.h:354
static i8x16 i8x16_is_equal(i8x16 x, i8x16 y)
Definition: vector_sse2.h:486
static i16x8 i16x8_mul_hi(i16x8 x, i16x8 y)
Definition: vector_sse2.h:310
static u8x8 u8x8_splat(u8 a)
Definition: vector_sse2.h:223
static u16x8 u16x8_mul_lo(u16x8 x, u16x8 y)
Definition: vector_sse2.h:304
static u16x4 u16x4_splat(u16 a)
Definition: vector_sse2.h:216