Roofline分析采集模式指定为region时,支持对应用中已经插桩的region块分别进行采集,实现function/loop级别的定量数据分析。该能力需要用户手动对待分析的源码进行插桩,并重新编译。
数据是按线程收集,因此需注意以下规则:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 |
#ifndef ROOFLINE_EVENTS_H
#define ROOFLINE_EVENTS_H

/*
 * Region instrumentation macros for Roofline analysis.
 *
 * When ROOFLINE_EVENTS is defined at compile time (e.g. -DROOFLINE_EVENTS),
 * each macro expands to a call to the matching *_perf_roofline_events()
 * function declared below. When it is not defined, every macro expands to
 * nothing, so instrumented sources still build without the library.
 */
#ifdef ROOFLINE_EVENTS
#define ROOFLINE_EVENTS_INIT init_perf_roofline_events()
#define ROOFLINE_EVENTS_START_REGION(region_label) start_perf_roofline_events(region_label)
#define ROOFLINE_EVENTS_STOP_REGION(region_label) stop_perf_roofline_events(region_label)
#define ROOFLINE_EVENTS_FINALIZE finalize_perf_roofline_events()
#else
#define ROOFLINE_EVENTS_INIT
#define ROOFLINE_EVENTS_START_REGION(region_label)
#define ROOFLINE_EVENTS_STOP_REGION(region_label)
#define ROOFLINE_EVENTS_FINALIZE
#endif

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Read system counters (initialization).
 * Should be called in serial code before start_perf_roofline_events.
 */
extern void init_perf_roofline_events(void) __attribute__((visibility("default")));

/*
 * Start roofline events for the current thread and the provided region.
 * Should be called in parallel code.
 */
extern void start_perf_roofline_events(const char* region) __attribute__((visibility("default")));

/*
 * Stop roofline events for the current thread and the provided region.
 * Should be called in parallel code.
 */
extern void stop_perf_roofline_events(const char* region) __attribute__((visibility("default")));

/*
 * Summarize data for all regions.
 * Should be called in serial code after stop_perf_roofline_events has been
 * called for all regions/threads.
 */
extern void finalize_perf_roofline_events(void) __attribute__((visibility("default")));

#ifdef __cplusplus
}
#endif

#endif /* ROOFLINE_EVENTS_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <time.h>
#include <omp.h>
// Instrumentation header (uncomment to enable region instrumentation)
// #include "roofline_events.h"

// Element type of the matrices: double when DOUBLE_TYPE is defined, float otherwise.
#ifdef DOUBLE_TYPE
typedef double real_t;
#else
typedef float real_t;
#endif

/*
 * Returns a pseudo-random value built from rand(): an integer in
 * [-mod/2, mod/2) divided by 16, so every result is a multiple of 1/16.
 * mod is 1024 for double builds and 256 for float builds.
 */
static real_t rand_real()
{
#ifdef DOUBLE_TYPE
    const int mod = 1024;
    const double divider = 16.0;
#else
    const int mod = 256;
    const float divider = 16.0f;
#endif
    return ((rand() % mod) - mod / 2) / divider;
}

/*
 * Demo: multiplies two n x n matrices with OpenMP and prints timings.
 *
 * argv[1] (optional) is the matrix dimension n; the default is 1024.
 * Returns 0 on success and 1 on an invalid dimension or allocation failure.
 */
int main(int argc, char* argv[])
{
    size_t n = 1024;
    size_t i, j, k;
    real_t *A = NULL, *B = NULL, *B_transposed = NULL, *C = NULL;
    double start_time, end_time;
    int status = 0;

    // Get the dimension of the matrices from the command line argument
    if (argc >= 2) {
        char* parse_end = NULL;
        const long requested = strtol(argv[1], &parse_end, 10);
        // Reject non-numeric text, trailing garbage and non-positive sizes;
        // a negative value would otherwise wrap to a huge size_t.
        if (parse_end == argv[1] || *parse_end != '\0' || requested <= 0) {
            fprintf(stderr, "Invalid matrix dimension: %s\n", argv[1]);
            return 1;
        }
        n = (size_t)requested;
    }

    // Guard against overflow in n * n * sizeof(real_t)
    if (n > SIZE_MAX / sizeof(real_t) / n) {
        fprintf(stderr, "Matrix dimension too large: %zu\n", n);
        return 1;
    }

    // Serial code: initialize instrumentation events
    // ROOFLINE_EVENTS_INIT;

    start_time = omp_get_wtime();

    // Allocate and initialize matrices A and B
    A = (real_t*)malloc(n * n * sizeof(real_t));
    B = (real_t*)malloc(n * n * sizeof(real_t));
    B_transposed = (real_t*)malloc(n * n * sizeof(real_t));
    C = (real_t*)malloc(n * n * sizeof(real_t));
    if (A == NULL || B == NULL || B_transposed == NULL || C == NULL) {
        fprintf(stderr, "Failed to allocate matrices of dimension %zu\n", n);
        status = 1;
        goto cleanup;
    }

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            A[i * n + j] = rand_real();
            B[i * n + j] = rand_real();
            B_transposed[j * n + i] = B[i * n + j];
            C[i * n + j] = 0.0;
        }
    }

    end_time = omp_get_wtime();

    // Print the timings
    printf("Initialization time: %f seconds\n", end_time - start_time);

    // Perform matrix multiplication
    start_time = omp_get_wtime();

    #pragma omp parallel
    {
        // Parallel code: start instrumentation events for region matrix_multiply_c
        // ROOFLINE_EVENTS_START_REGION("matrix_multiply_c");

        // j and k are declared outside the parallel region, so they must be
        // made private explicitly for each thread.
        #pragma omp for private(i, j, k)
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {
                for (k = 0; k < n; k++) {
                    C[i * n + j] += A[i * n + k] * B_transposed[j * n + k];
                }
            }
        }

        // Parallel code: stop instrumentation events for region matrix_multiply_c
        // ROOFLINE_EVENTS_STOP_REGION("matrix_multiply_c");
    }

    end_time = omp_get_wtime();

    // Print the timings
    printf("Calculation time: %f seconds\n", end_time - start_time);

    // Print the result if n is less than or equal to 16
    if (n <= 16) {
        printf("The product of A and B_transposed is:\n");
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {
                printf("%f ", C[i * n + j]);
            }
            printf("\n");
        }
    } else {
        printf("The dimension of the matrices is too large to print.\n");
    }

cleanup:
    // Deallocate matrices (free(NULL) is a no-op, so partial allocation is safe)
    free(A);
    free(B);
    free(B_transposed);
    free(C);

    // Serial code: finalize instrumentation events
    // ROOFLINE_EVENTS_FINALIZE;

    return status;
}
上述demo已经添加插桩代码(注释状态),包含如下5行:
插桩代码处于注释状态(即未进行插桩)时,编译命令如下:
1
|
gcc matrix_multiply.c -o matrix_multiply_c -fopenmp |
插桩代码去除注释(即已进行插桩)后,需增加插桩说明中的编译选项,编译命令如下:
1
|
gcc matrix_multiply.c -o matrix_multiply_c -fopenmp -DROOFLINE_EVENTS -I /usr/local/devkit/tuner/include -L/usr/local/devkit/tuner/lib -lrfevents |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
! Demo: multiplies two n x n matrices with OpenMP and prints timings.
! The optional first command-line argument is the matrix dimension n
! (default 1024).
program matrix_multiply
    ! Instrumentation module (uncomment to enable region instrumentation)
    ! use roofline_events
    implicit none

    integer :: n = 1024
    real, dimension(:,:), allocatable :: A, B, C, B_transposed
    integer :: i, j, k
    integer :: start_time, end_time, clock_rate
    integer :: io_status
    real :: init_time, calc_time
    character(len=20) :: arg

    ! Get the dimension of the matrices from the command line argument
    if (command_argument_count() > 0) then
        call get_command_argument(1, arg)
        ! iostat keeps a non-numeric argument from aborting with a runtime error
        read(arg, *, iostat=io_status) n
        if (io_status /= 0 .or. n <= 0) then
            print *, 'Invalid matrix dimension: ', trim(arg)
            stop 1
        end if
    end if

    ! Serial code: initialize instrumentation events
    ! call init_perf_roofline_events()

    ! Start timing
    call system_clock(start_time, clock_rate)

    ! Allocate and initialize matrices A and B
    allocate(A(n,n), B(n,n), C(n,n), B_transposed(n,n))
    call random_number(A)
    call random_number(B)

    ! Transpose matrix B
    B_transposed = transpose(B)

    ! End timing
    call system_clock(end_time)
    init_time = real(end_time - start_time) / real(clock_rate)
    print *, 'Initialization time: ', init_time, ' seconds'

    ! Start timing
    call system_clock(start_time, clock_rate)

    ! Perform matrix multiplication
    C = 0.0
    !$OMP PARALLEL PRIVATE(i, j, k) SHARED(A, B_transposed, C, n)
    ! Parallel code: start instrumentation events for region matrix_multiply_f
    ! call start_perf_roofline_events("matrix_multiply_f")
    !$OMP DO
    do i = 1, n
        do k = 1, n
            do j = 1, n
                C(i, k) = C(i, k) + A(i, j) * B_transposed(k, j)
            end do
        end do
    end do
    !$OMP END DO
    ! Parallel code: stop instrumentation events for region matrix_multiply_f
    ! call stop_perf_roofline_events("matrix_multiply_f")
    !$OMP END PARALLEL

    ! End timing
    call system_clock(end_time)
    calc_time = real(end_time - start_time) / real(clock_rate)
    print *, 'Calculation time: ', calc_time, ' seconds'

    ! Print the result if n is less than or equal to 16
    if (n <= 16) then
        print *, 'The product of A and B_transposed is:'
        print *, C
    else
        print *, 'The dimension of the matrices is too large to print.'
    end if

    ! Deallocate matrices
    deallocate(A, B, C, B_transposed)

    ! Serial code: finalize instrumentation events
    ! call finalize_perf_roofline_events()
end program matrix_multiply
上述demo已添加插桩代码(注释状态),包含如下5行:
插桩代码处于注释状态(即未进行插桩)时,编译命令如下:
1
|
gfortran matrix_multiply.f90 -o matrix_multiply_f -fopenmp |
插桩代码去除注释(即已进行插桩)后,增加插桩说明中的编译选项即可,编译命令如下:
1
|
gfortran matrix_multiply.f90 -o matrix_multiply_f -fopenmp -I /usr/local/devkit/tuner/include -L/usr/local/devkit/tuner/lib -lrfevents |
该示例中的region名为matrix_multiply_c或matrix_multiply_f,实际插桩过程中可以按需插入多个不同名称的region。