/*************************************************************************
 *
 * Copyright 2016 Realm Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **************************************************************************/

#ifndef REALM_NMMINTRIN_H
#define REALM_NMMINTRIN_H

/*
    We must support runtime detection of the CPU's SSE support when distributing Realm as a closed-source library.

    This is a problem on gcc and llvm: to use SSE intrinsics we need to pass -msse on the command line (to get the
    __builtin_ accessors used by the intrinsic functions). However, the -msse flag also allows gcc to emit SSE
    instructions in its own code generation/optimization, which is unwanted because the binary would then crash
    on non-SSE CPUs.

    Since gcc has no flag that enables intrinsics but prohibits SSE in code generation, we define our own
    intrinsics, to be assembled by the backend assembler, and omit passing -msse to gcc.
*/
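
/* A minimal sketch (not part of the original header) of how a caller could gate use of
   these hand-rolled intrinsics at runtime. The helper name realm_sse42_supported_at_runtime
   is hypothetical; __builtin_cpu_supports is a gcc/clang builtin (gcc >= 4.8, x86 only). */
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
static inline int realm_sse42_supported_at_runtime()
{
    // Queries the running CPU (via cpuid under the hood); nonzero means SSE4.2 is available.
    return __builtin_cpu_supports("sse4.2");
}
#endif
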
#ifdef REALM_COMPILER_SSE
#include <emmintrin.h> // SSE2 (using __m128i)

#ifdef REALM_COMPILER_AVX
typedef float __m256 __attribute__((__vector_size__(32), __may_alias__));
typedef double __m256d __attribute__((__vector_size__(32), __may_alias__));

const int _CMP_EQ_OQ = 0x00;  // Equal (ordered, non-signaling)
const int _CMP_NEQ_OQ = 0x0c; // Not-equal (ordered, non-signaling)
const int _CMP_LT_OQ = 0x11;  // Less-than (ordered, non-signaling)
const int _CMP_LE_OQ = 0x12;  // Less-than-or-equal (ordered, non-signaling)
const int _CMP_GE_OQ = 0x1d;  // Greater-than-or-equal (ordered, non-signaling)
const int _CMP_GT_OQ = 0x1e;  // Greater-than (ordered, non-signaling)

// Compares the 8 packed floats in *y1 and *y2 with predicate `op` and returns an
// 8-bit element mask (vcmpps + vmovmskps).
template <int op>
static int movemask_cmp_ps(__m256* y1, __m256* y2)
{
    int ret;
    __asm__("vmovaps %0, %%ymm0" : : "m"(*y1) : "%xmm0");
    __asm__("vmovaps %0, %%ymm1" : : "m"(*y2) : "%xmm1");
    __asm__("vcmpps %0, %%ymm0, %%ymm1, %%ymm0" : : "I"(op) : "%xmm0");
    __asm__("vmovmskps %%ymm0, %0" : "=r"(ret) : :);
    return ret;
}

// Same as above for 4 packed doubles: returns a 4-bit element mask (vcmppd + vmovmskpd).
template <int op>
static inline int movemask_cmp_pd(__m256d* y1, __m256d* y2)
{
    int ret;
    __asm__("vmovapd %0, %%ymm0" : : "m"(*y1) : "%xmm0");
    __asm__("vmovapd %0, %%ymm1" : : "m"(*y2) : "%xmm1");
    __asm__("vcmppd %0, %%ymm0, %%ymm1, %%ymm0" : : "I"(op) : "%xmm0");
    __asm__("vmovmskpd %%ymm0, %0" : "=r"(ret) : :);
    return ret;
}

// Runtime-dispatch wrapper: maps a runtime `op` to the compile-time template argument.
static inline int movemask_cmp_ps(__m256* y1, __m256* y2, int op)
{
    // todo, use constexpr;
    if (op == _CMP_EQ_OQ)
        return movemask_cmp_ps<_CMP_EQ_OQ>(y1, y2);
    else if (op == _CMP_NEQ_OQ)
        return movemask_cmp_ps<_CMP_NEQ_OQ>(y1, y2);
    else if (op == _CMP_LT_OQ)
        return movemask_cmp_ps<_CMP_LT_OQ>(y1, y2);
    else if (op == _CMP_LE_OQ)
        return movemask_cmp_ps<_CMP_LE_OQ>(y1, y2);
    else if (op == _CMP_GE_OQ)
        return movemask_cmp_ps<_CMP_GE_OQ>(y1, y2);
    else if (op == _CMP_GT_OQ)
        return movemask_cmp_ps<_CMP_GT_OQ>(y1, y2);

    // Unsupported predicate
    return 0;
}

// Runtime-dispatch wrapper for packed doubles.
static inline int movemask_cmp_pd(__m256d* y1, __m256d* y2, int op)
{
    // todo, use constexpr;
    if (op == _CMP_EQ_OQ)
        return movemask_cmp_pd<_CMP_EQ_OQ>(y1, y2);
    else if (op == _CMP_NEQ_OQ)
        return movemask_cmp_pd<_CMP_NEQ_OQ>(y1, y2);
    else if (op == _CMP_LT_OQ)
        return movemask_cmp_pd<_CMP_LT_OQ>(y1, y2);
    else if (op == _CMP_LE_OQ)
        return movemask_cmp_pd<_CMP_LE_OQ>(y1, y2);
    else if (op == _CMP_GE_OQ)
        return movemask_cmp_pd<_CMP_GE_OQ>(y1, y2);
    else if (op == _CMP_GT_OQ)
        return movemask_cmp_pd<_CMP_GT_OQ>(y1, y2);

    // Unsupported predicate
    return 0;
}

#endif // REALM_COMPILER_AVX

// Instructions introduced by SSE 4.1 and 4.2
static inline __m128i _mm_cmpgt_epi64(__m128i xmm1, __m128i xmm2)
{
    // Signed 64-bit greater-than compare (SSE4.2)
    __asm__("pcmpgtq %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i _mm_cmpeq_epi64(__m128i xmm1, __m128i xmm2)
{
    // 64-bit equality compare (SSE4.1)
    __asm__("pcmpeqq %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_min_epi8(__m128i xmm1, __m128i xmm2)
{
    __asm__("pminsb %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_max_epi8(__m128i xmm1, __m128i xmm2)
{
    __asm__("pmaxsb %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_max_epi32(__m128i xmm1, __m128i xmm2)
{
    __asm__("pmaxsd %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_min_epi32(__m128i xmm1, __m128i xmm2)
{
    __asm__("pminsd %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_cvtepi8_epi16(__m128i xmm2)
{
    __m128i xmm1;
    __asm__("pmovsxbw %1, %0" : "=x" (xmm1) : "xm" (xmm2) : "xmm1");
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_cvtepi16_epi32(__m128i xmm2)
{
    __m128i xmm1;
    __asm__("pmovsxwd %1, %0" : "=x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_cvtepi32_epi64(__m128i xmm2)
{
    __m128i xmm1;
    __asm__("pmovsxdq %1, %0" : "=x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

#endif // REALM_COMPILER_SSE
#endif // REALM_NMMINTRIN_H