内嵌汇编实现示例
C语言实现版本:
void AddFloatC(float* dst, float* src1, float* src2, int count) { for (int i = 0; i < count; i++) { dst[i] = src1[i] + src2[i]; } } int main() { float dst[ARRAY_NUMS] = {0.0}; float src1[ARRAY_NUMS]; float src2[ARRAY_NUMS]; struct timeval start; struct timeval end; double dt; InitArray(dst, src1, src2); gettimeofday(&start,NULL); AddFloatNeonAsm(dst, src1, src2, ARRAY_NUMS); gettimeofday(&end,NULL); dt=(end.tv_sec-start.tv_sec) * 1000+(end.tv_usec-start.tv_usec) / 1000.0; cout<<"Time used of Normal NEON ASM code: "<<dt<<"ms"<<"\tcheck dst[52] value: "<<dst[52]<<endl; return 0; }
内嵌汇编实现版本:
void AddFloatNeonAsm(float* dst, float* src1, float* src2, int count) { __asm__ volatile( "1: \n" "ld1 {v0.4s}, [%[src1]], #16 \n" "ld1 {v1.4s}, [%[src2]], #16 \n" "fadd v0.4s, v0.4s, v1.4s \n" "subs %[count], %[count], #4 \n" "st1 {v0.4s}, [%[dst]], #16 \n" "bgt 1b \n" : [dst] "+r" (dst) : [src1] "r" (src1), [src2] "r" (src2), [count] "r" (count) : "memory", "v0", "v1" ); } int main() { float dst[ARRAY_NUMS] = {0.0}; float src1[ARRAY_NUMS]; float src2[ARRAY_NUMS]; struct timeval start; struct timeval end; double dt; InitArray(dst, src1, src2); gettimeofday(&start,NULL); AddFloatC(dst, src1, src2, ARRAY_NUMS); gettimeofday(&end,NULL); dt=(end.tv_sec-start.tv_sec) * 1000 + (end.tv_usec-start.tv_usec) / 1000.0; cout<<"Time used of Normal C code: "<<dt<<"ms"<<"\t\tcheck dst[52] value: "<<dst[52]<<endl; return 0; }
其中的bgt跳转指令会读取NZCV系统寄存器中相应的条件标志位,来判断是否满足跳转条件。关键在于subs指令不同于普通的sub指令:它在做减法的同时,会更新相应的条件标志位(condition flags)。
“bgt 1b”中1后面的b表示backward(向后查找):这里的1是一个局部数字标号(local label),“1b”指向当前指令之前最近一次定义的标号1(相应地,“1f”指向之后最近一次定义的标号1)。加上b后缀也可以避免bgt把后面的1当成立即数处理,即告知汇编器前面的1表示label,而不是立即数。
C语言执行时间为3.062ms,而NEON内嵌汇编的版本执行时间为0.331ms。
父主题: NEON汇编编程