intrinsics实现示例
如果需要在代码中调用NEON Intrinsics函数,需要加入头文件"arm_neon.h"。以数组加法为例。
普通C++实现(未使用NEON):
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <vector>

/**
 * Scalar element-wise addition: out[i] = input1[i] + input2[i] for i in [0, count).
 *
 * @param out     destination array, at least `count` elements
 * @param input1  first operand array, at least `count` elements
 * @param input2  second operand array, at least `count` elements
 * @param count   number of elements to process; nothing is written when count <= 0
 */
void add(int* out, int* input1, int* input2, int count)
{
    for (int i = 0; i < count; ++i) {
        out[i] = input1[i] + input2[i];
    }
}

/**
 * Benchmark driver: fills two arrays with pseudo-random values, runs add()
 * `count` times over the whole arrays and prints the elapsed CPU time.
 */
int main()
{
    const int count = 10000 * 4;

    // std::vector instead of `int a[count]`: variable-length arrays are not standard C++.
    std::vector<int> a(count);
    std::vector<int> b(count);
    std::vector<int> c(count);

    // rand() / 2 keeps a[i] + b[i] within the range of int; adding two raw
    // rand() values may overflow, which is undefined behaviour for signed int.
    for (int& value : a) {
        value = std::rand() / 2;
    }
    for (int& value : b) {
        value = std::rand() / 2;
    }

    const std::clock_t start = std::clock();
    // Repeat the whole-array addition `count` times so the run is long enough to time.
    for (int run = 0; run < count; ++run) {
        add(c.data(), a.data(), b.data(), count);
    }
    const std::clock_t finish = std::clock();

    // Read the result back so the stores into c are not dead code to the optimizer.
    volatile int sink = c[count - 1];
    (void)sink;

    const double duration = static_cast<double>(finish - start) / CLOCKS_PER_SEC;
    std::printf("%f seconds\n", duration);
    return 0;
}
输出结果如下:
1.910000 seconds
NEON intrinsics实现:
using namespace std; void add_neon(int* out, int* input1, int* input2, int count) { int32x4_t input1_neon, input2_neon, out_neon; for(int i = 0; i < count; i += 4) { input1_neon = vld1q_s32(input1); input1 += 4; input2_neon = vld1q_s32(input2); input2 += 4; out_neon = vaddq_s32(input1_neon, input2_neon); vst1q_s32(out, out_neon); out += 4; } } int main() { int count; count = 10000 * 4; int a[count]; int b[count]; int c[count]; clock_t start, finish; double duration; for(int i = 0; i < count; i += 1) { a[i] = rand(); } for(int i = 0; i < count; i += 1) { b[i] = rand(); } start = clock(); for(int i = 0; i < count; i += 1) { add_neon(c, a, b, count); } finish = clock(); duration = (double)(finish - start) / CLOCKS_PER_SEC; printf( "%f seconds for neon\n", duration); return 0; }
输出结果:
0.360000 seconds for neon
可以看出,使用NEON intrinsics实现后,耗时从1.91秒降至0.36秒(约为原来的1/5.3),性能有了明显的提升。
父主题: NEON intrinsics编程