pcmpestrm在ARM上的替换方法
函数功能:检查 str2 的前 len2 个字节元素(m128i_u8[index])是否出现在 str1 的前 len1 个字节中;如果出现,则将 result 中对应的第 index 位置 1。
pcmpestrm对应的Intrinsic函数详细说明,请参考Intrinsics Guide。
- x86上的代码段:
/// Thin wrapper that issues the SSE4.2 `pcmpestrm` instruction through inline assembly.
///
/// Operand binding, as given by the asm constraints below:
///   - str1 is passed in an XMM register ("x").
///   - str2 is passed in an XMM register or memory ("xm").
///   - len1 is passed in the "a" register and len2 in the "d" register.
///   - MODE is a compile-time immediate ("i") selecting the comparison mode.
///
/// @tparam MODE immediate control byte handed to `pcmpestrm`.
/// @return the value left in the result operand, which is tied to xmm0 in both branches
///         (explicit register variable under clang, "Yz" constraint otherwise).
template<int MODE>
static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
#ifdef __clang__
  /// Use asm reg rather than Yz output constraint to workaround LLVM bug 13199 -
  /// clang doesn't support Y-prefixed asm constraints.
  register volatile __m128i result asm ("xmm0");
  __asm__ __volatile__ ("pcmpestrm %5, %2, %1": "=x"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
#else
  // "Yz" constrains the output to xmm0.
  __m128i result;
  __asm__ __volatile__ ("pcmpestrm %5, %2, %1": "=Yz"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
#endif
  return result;
}
- 在鲲鹏上替换后:
#include <arm_neon.h>

// 16-byte overlay of a NEON vector and its individual bytes.
// SSE4_cmpestrm below no longer needs it (it stores the lanes with vst1q_u8),
// but the type is kept so that any other code referring to __oword still builds.
typedef union __attribute__((aligned(16))) __oword {
  int32x4_t m128i;
  uint8_t m128i_u8[16];
} __oword;

// Converts an explicit-length operand into the number of bytes to examine.
// pcmpestrm uses the absolute value of each length register, saturated to 16
// for byte operands; mirroring that also keeps every array index below in bounds.
static inline int SSE4_cmpestrm_effective_len(int len) {
  if (len < 0) {
    // Compare before negating so that INT_MIN cannot overflow.
    return (len < -16) ? 16 : -len;
  }
  return (len > 16) ? 16 : len;
}

// Software replacement for the x86 pcmpestrm instruction on ARM (NEON).
//
// Only the mode used by Impala is implemented:
//   STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS
// MODE is accepted for source compatibility with the x86 version but is not
// inspected; callers passing any other mode still get "equal any" semantics.
//
// Parameters:
//   str1, len1 - the byte set to search in, and how many of its bytes are valid.
//   str2, len2 - the bytes to test, and how many of them are valid.
// Lengths follow pcmpestrm: the absolute value is used and saturated to 16.
//
// Returns a 16-bit mask (not a vector, unlike the x86 version): bit i is set
// when byte i of str2 (i < len2) equals any of the first len1 bytes of str1.
template<int MODE>
static inline uint16_t SSE4_cmpestrm(int32x4_t str1, int len1, int32x4_t str2, int len2) {
  uint8_t set_bytes[16];
  uint8_t probe_bytes[16];
  // Store the lanes to plain arrays instead of reading an inactive union member.
  vst1q_u8(set_bytes, vreinterpretq_u8_s32(str1));
  vst1q_u8(probe_bytes, vreinterpretq_u8_s32(str2));

  const int set_len = SSE4_cmpestrm_effective_len(len1);
  const int probe_len = SSE4_cmpestrm_effective_len(len2);

  uint16_t result = 0;
  for (int i = 0; i < probe_len; ++i) {
    for (int j = 0; j < set_len; ++j) {
      if (set_bytes[j] == probe_bytes[i]) {
        result = static_cast<uint16_t>(result | (1u << i));
        break;  // One match is enough to set bit i.
      }
    }
  }
  return result;
}
父主题: 源码修改类案例