/*************************************************************************
 *
 * Copyright 2016 Realm Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 **************************************************************************/

#ifndef REALM_NMMINTRIN_H
#define REALM_NMMINTRIN_H

/*
    We must support runtime detection of the CPU's SSE support when distributing Realm as a closed-source library.

    This is a problem on gcc and llvm: to use SSE intrinsics we need to pass -msse on the command line (to get the
    __builtin_ accessors used by the intrinsic functions). However, the -msse flag also allows gcc to emit SSE
    instructions in its own code generation/optimization, which is unwanted because the binary would then crash
    on non-SSE CPUs.

    Since gcc has no flag that enables intrinsics but prohibits SSE in code generation, we define our own
    intrinsics, to be assembled by the backend assembler, and omit passing -msse to gcc.
*/
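
/* A minimal sketch (not part of the original header) of how a caller could gate use of
   these hand-rolled intrinsics at runtime. The helper name realm_sse42_supported_at_runtime
   is hypothetical; __builtin_cpu_supports is a gcc/clang builtin (gcc >= 4.8, x86 only). */
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
static inline int realm_sse42_supported_at_runtime()
{
    // Queries the running CPU (via cpuid under the hood); nonzero means SSE4.2 is available.
    return __builtin_cpu_supports("sse4.2");
}
#endif
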
#ifdef REALM_COMPILER_SSE
#include <emmintrin.h> // SSE2 (using __m128i)

#ifdef REALM_COMPILER_AVX
typedef float __m256 __attribute__((__vector_size__(32), __may_alias__));
typedef double __m256d __attribute__((__vector_size__(32), __may_alias__));

const int _CMP_EQ_OQ = 0x00;  // Equal (ordered, non-signaling)
const int _CMP_NEQ_OQ = 0x0c; // Not-equal (ordered, non-signaling)
const int _CMP_LT_OQ = 0x11;  // Less-than (ordered, non-signaling)
const int _CMP_LE_OQ = 0x12;  // Less-than-or-equal (ordered, non-signaling)
const int _CMP_GE_OQ = 0x1d;  // Greater-than-or-equal (ordered, non-signaling)
const int _CMP_GT_OQ = 0x1e;  // Greater-than (ordered, non-signaling)

// Compares the 8 packed floats in *y1 and *y2 with predicate `op` and returns an
// 8-bit element mask (vcmpps + vmovmskps).
template <int op>
static int movemask_cmp_ps(__m256* y1, __m256* y2)
{
    int ret;
    __asm__("vmovaps %0, %%ymm0" : : "m"(*y1) : "%xmm0");
    __asm__("vmovaps %0, %%ymm1" : : "m"(*y2) : "%xmm1");
    __asm__("vcmpps %0, %%ymm0, %%ymm1, %%ymm0" : : "I"(op) : "%xmm0");
    __asm__("vmovmskps %%ymm0, %0" : "=r"(ret) : :);
    return ret;
}

// Same as above for 4 packed doubles: returns a 4-bit element mask (vcmppd + vmovmskpd).
template <int op>
static inline int movemask_cmp_pd(__m256d* y1, __m256d* y2)
{
    int ret;
    __asm__("vmovapd %0, %%ymm0" : : "m"(*y1) : "%xmm0");
    __asm__("vmovapd %0, %%ymm1" : : "m"(*y2) : "%xmm1");
    __asm__("vcmppd %0, %%ymm0, %%ymm1, %%ymm0" : : "I"(op) : "%xmm0");
    __asm__("vmovmskpd %%ymm0, %0" : "=r"(ret) : :);
    return ret;
}

// Runtime-dispatch wrapper: maps a runtime `op` to the compile-time template argument.
static inline int movemask_cmp_ps(__m256* y1, __m256* y2, int op)
{
    // todo, use constexpr;
    if (op == _CMP_EQ_OQ)
        return movemask_cmp_ps<_CMP_EQ_OQ>(y1, y2);
    else if (op == _CMP_NEQ_OQ)
        return movemask_cmp_ps<_CMP_NEQ_OQ>(y1, y2);
    else if (op == _CMP_LT_OQ)
        return movemask_cmp_ps<_CMP_LT_OQ>(y1, y2);
    else if (op == _CMP_LE_OQ)
        return movemask_cmp_ps<_CMP_LE_OQ>(y1, y2);
    else if (op == _CMP_GE_OQ)
        return movemask_cmp_ps<_CMP_GE_OQ>(y1, y2);
    else if (op == _CMP_GT_OQ)
        return movemask_cmp_ps<_CMP_GT_OQ>(y1, y2);

    // Unsupported predicate
    return 0;
}

// Runtime-dispatch wrapper for packed doubles.
static inline int movemask_cmp_pd(__m256d* y1, __m256d* y2, int op)
{
    // todo, use constexpr;
    if (op == _CMP_EQ_OQ)
        return movemask_cmp_pd<_CMP_EQ_OQ>(y1, y2);
    else if (op == _CMP_NEQ_OQ)
        return movemask_cmp_pd<_CMP_NEQ_OQ>(y1, y2);
    else if (op == _CMP_LT_OQ)
        return movemask_cmp_pd<_CMP_LT_OQ>(y1, y2);
    else if (op == _CMP_LE_OQ)
        return movemask_cmp_pd<_CMP_LE_OQ>(y1, y2);
    else if (op == _CMP_GE_OQ)
        return movemask_cmp_pd<_CMP_GE_OQ>(y1, y2);
    else if (op == _CMP_GT_OQ)
        return movemask_cmp_pd<_CMP_GT_OQ>(y1, y2);

    // Unsupported predicate
    return 0;
}

#endif // REALM_COMPILER_AVX

// Instructions introduced by SSE 4.1 and 4.2
static inline __m128i _mm_cmpgt_epi64(__m128i xmm1, __m128i xmm2)
{
    // Signed 64-bit greater-than compare (SSE4.2)
    __asm__("pcmpgtq %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i _mm_cmpeq_epi64(__m128i xmm1, __m128i xmm2)
{
    // 64-bit equality compare (SSE4.1)
    __asm__("pcmpeqq %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_min_epi8(__m128i xmm1, __m128i xmm2)
{
    __asm__("pminsb %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_max_epi8(__m128i xmm1, __m128i xmm2)
{
    __asm__("pmaxsb %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_max_epi32(__m128i xmm1, __m128i xmm2)
{
    __asm__("pmaxsd %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_min_epi32(__m128i xmm1, __m128i xmm2)
{
    __asm__("pminsd %1, %0" : "+x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_cvtepi8_epi16(__m128i xmm2)
{
    __m128i xmm1;
    __asm__("pmovsxbw %1, %0" : "=x" (xmm1) : "xm" (xmm2) : "xmm1");
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_cvtepi16_epi32(__m128i xmm2)
{
    __m128i xmm1;
    __asm__("pmovsxwd %1, %0" : "=x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

static inline __m128i __attribute__((always_inline)) _mm_cvtepi32_epi64(__m128i xmm2)
{
    __m128i xmm1;
    __asm__("pmovsxdq %1, %0" : "=x" (xmm1) : "xm" (xmm2));
    return xmm1;
}

#endif // REALM_COMPILER_SSE
#endif // REALM_NMMINTRIN_H