Roofline插桩指导
插桩说明
Roofline分析采集模式指定为region时,支持对应用中已经插桩的region块进行分别采集,实现function/loop级别的定量数据分析,该能力需要用户手动对待分析的源码进行插桩,并重新编译。
region插桩与分析:
- 在源代码中插入Roofline Events API。
- Roofline Events API定义在/usr/bin/devkit/tuner/include/roofline_events.h(.mod)。
- roofline_events.h用于C/C++程序;roofline_events.mod用于Fortran程序。
- 使用新的编译标志重新编译应用程序:
- C/C++:-DROOFLINE_EVENTS -I /usr/bin/devkit/tuner/include -L/usr/bin/devkit/tuner/lib -lrfevents
- Fortran:-I /usr/bin/devkit/tuner/include -L/usr/bin/devkit/tuner/lib -lrfevents
- 需保证运行时动态库搜索路径包含/usr/bin/devkit/tuner/lib,例如在LD_LIBRARY_PATH中增加/usr/bin/devkit/tuner/lib路径。
- Roofline分析任务采集模式选择region模式,对插桩后编译生成的应用进行采集。
Roofline Events API介绍
数据是按线程收集,因此需注意以下规则:
- initialize/finalize接口需要在串行代码(例如主线程)中调用。
- 如果需要分析所有线程数据,start/stop API需要放置在并行代码中。
- 支持多个region,但不支持region之间嵌套或交错,即同一个region的start/stop API需要成对出现,且一个region的start/stop之间不能包含其他region的start/stop。
- region名称用于匹配线程之间的region数据。
- 以ROOFLINE_EVENTS开头的接口为宏定义,可以通过编译选项-DROOFLINE_EVENTS(即是否定义ROOFLINE_EVENTS宏)进行开启和关闭,仅适用于C/C++。
- 以perf_roofline_events结尾的接口适用于C/C++/Fortran,不支持编译选项开关。
// Convenience macros for C/C++.
// When ROOFLINE_EVENTS is defined (e.g. via -DROOFLINE_EVENTS) each macro
// expands to the corresponding *_perf_roofline_events() call; otherwise
// each macro expands to nothing, so instrumented code compiles unchanged.
#ifdef ROOFLINE_EVENTS
#define ROOFLINE_EVENTS_INIT init_perf_roofline_events()
#define ROOFLINE_EVENTS_START_REGION(region_label) start_perf_roofline_events(region_label)
#define ROOFLINE_EVENTS_STOP_REGION(region_label) stop_perf_roofline_events(region_label)
#define ROOFLINE_EVENTS_FINALIZE finalize_perf_roofline_events()
#else
#define ROOFLINE_EVENTS_INIT
#define ROOFLINE_EVENTS_START_REGION(region_label)
#define ROOFLINE_EVENTS_STOP_REGION(region_label)
#define ROOFLINE_EVENTS_FINALIZE
#endif

// The functions below use C linkage so they can be called from C++ as well.
#ifdef __cplusplus
extern "C" {
#endif

// read system counters -> init
// should be called in serial code before start_perf_roofline_events
extern void init_perf_roofline_events(void) __attribute__((visibility("default")));

// start roofline events for current thread and provided region
// should be called in parallel code
extern void start_perf_roofline_events(const char* region) __attribute__((visibility("default")));

// stop roofline events for current thread and provided region
// should be called in parallel code
extern void stop_perf_roofline_events(const char* region) __attribute__((visibility("default")));

// summarize data for all regions
// should be called in serial code after stop_perf_roofline_events for all regions/threads
extern void finalize_perf_roofline_events(void) __attribute__((visibility("default")));

#ifdef __cplusplus
}
#endif
插桩示例
- C源码Demo
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>
// Instrumentation header (uncomment to enable instrumentation)
// #include "roofline_events.h"

// Element type of the matrices: double when DOUBLE_TYPE is defined, float otherwise.
#ifdef DOUBLE_TYPE
typedef double real_t;
#else
typedef float real_t;
#endif

// Return a pseudo-random value computed as (rand() % mod - mod / 2) / divider,
// i.e. a multiple of 1/16 centred around zero.
static real_t rand_real()
{
#ifdef DOUBLE_TYPE
    const int mod = 1024;
    const double divider = 16.0;
#else
    const int mod = 256;
    const float divider = 16.0f;
#endif
    return ((rand() % mod) - mod / 2) / divider;
}

// Demo: time the initialization and the OpenMP-parallel multiplication of two
// n x n matrices. The optional first command line argument sets n (default 1024).
// Returns 0 on success, 1 on an invalid dimension or allocation failure.
int main(int argc, char* argv[])
{
    size_t n = 1024;
    size_t i, j, k;
    real_t *A, *B, *B_transposed, *C;
    double start_time, end_time;

    // Get the dimension of the matrices from the command line argument
    if (argc >= 2) {
        int requested = atoi(argv[1]);
        // A negative value would wrap to a huge size_t, so reject it explicitly
        if (requested < 0) {
            fprintf(stderr, "Matrix dimension must not be negative: %s\n", argv[1]);
            return 1;
        }
        n = (size_t)requested;
    }

    // Reject dimensions for which n * n * sizeof(real_t) would overflow size_t
    if (n != 0 && n > ((size_t)-1) / sizeof(real_t) / n) {
        fprintf(stderr, "Matrix dimension %zu is too large.\n", n);
        return 1;
    }

    // Initialize the instrumentation events in the serial code section
    // ROOFLINE_EVENTS_INIT;

    start_time = omp_get_wtime();

    // Allocate and initialize matrices A and B
    A = (real_t*)malloc(n * n * sizeof(real_t));
    B = (real_t*)malloc(n * n * sizeof(real_t));
    B_transposed = (real_t*)malloc(n * n * sizeof(real_t));
    C = (real_t*)malloc(n * n * sizeof(real_t));

    // malloc(0) may legitimately return NULL, so only treat NULL as failure when n > 0
    if (n != 0 && (A == NULL || B == NULL || B_transposed == NULL || C == NULL)) {
        fprintf(stderr, "Failed to allocate memory for %zu x %zu matrices.\n", n, n);
        free(A);
        free(B);
        free(B_transposed);
        free(C);
        return 1;
    }

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            A[i * n + j] = rand_real();
            B[i * n + j] = rand_real();
            B_transposed[j * n + i] = B[i * n + j];
            C[i * n + j] = 0.0;
        }
    }

    end_time = omp_get_wtime();

    // Print the timings
    printf("Initialization time: %f seconds\n", end_time - start_time);

    // Perform matrix multiplication
    start_time = omp_get_wtime();

    #pragma omp parallel
    {
        // Start the instrumentation region matrix_multiply_c in the parallel code section
        // ROOFLINE_EVENTS_START_REGION("matrix_multiply_c");

        #pragma omp for private(i, j, k)
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {
                for (k = 0; k < n; k++) {
                    C[i * n + j] += A[i * n + k] * B_transposed[j * n + k];
                }
            }
        }

        // Stop the instrumentation region matrix_multiply_c in the parallel code section
        // ROOFLINE_EVENTS_STOP_REGION("matrix_multiply_c");
    }

    end_time = omp_get_wtime();

    // Print the timings
    printf("Calculation time: %f seconds\n", end_time - start_time);

    // Print the result if n is less than or equal to 16
    if (n <= 16) {
        printf("The product of A and B_transposed is:\n");
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {
                printf("%f ", C[i * n + j]);
            }
            printf("\n");
        }
    } else {
        printf("The dimension of the matrices is too large to print.\n");
    }

    // Deallocate matrices
    free(A);
    free(B);
    free(B_transposed);
    free(C);

    // Finalize the instrumentation events in the serial code section
    // ROOFLINE_EVENTS_FINALIZE;

    return 0;
}
上述demo已经添加插桩代码(注释状态),包含如下5行:
- #include "roofline_events.h"
- ROOFLINE_EVENTS_INIT;
- ROOFLINE_EVENTS_START_REGION("matrix_multiply_c");
- ROOFLINE_EVENTS_STOP_REGION("matrix_multiply_c");
- ROOFLINE_EVENTS_FINALIZE;
插桩代码处于注释状态(即未进行插桩)时,编译命令如下:
gcc matrix_multiply.c -o matrix_multiply_c -fopenmp
插桩代码去除注释(即已进行插桩)后,需增加插桩说明中的编译选项,编译命令如下:
gcc matrix_multiply.c -o matrix_multiply_c -fopenmp -DROOFLINE_EVENTS -I /usr/bin/devkit/tuner/include -L/usr/bin/devkit/tuner/lib -lrfevents
- Fortran源码Demo
program matrix_multiply
    ! Instrumentation module (uncomment to enable instrumentation)
    ! use roofline_events
    implicit none

    ! Matrix dimension; may be overridden by the first command line argument
    integer :: n = 1024
    integer :: i, j, k
    ! Raw system_clock readings and the clock tick rate used to convert them to seconds
    integer :: tick_begin, tick_end, ticks_per_second
    real :: init_time, calc_time
    real, dimension(:,:), allocatable :: A, B, C, B_transposed
    character(len=20) :: arg

    ! Read the matrix dimension from the command line when one is supplied
    if (iargc() > 0) then
        call getarg(1, arg)
        read(arg, *) n
    end if

    ! Initialize the instrumentation events in the serial code section
    ! call init_perf_roofline_events()

    ! Time the allocation and initialization phase
    call system_clock(tick_begin, ticks_per_second)

    ! Allocate all matrices and fill A and B with random numbers
    allocate(A(n,n), B(n,n), C(n,n), B_transposed(n,n))
    call random_number(A)
    call random_number(B)

    ! Keep a transposed copy of B for the multiplication loop
    B_transposed = transpose(B)

    call system_clock(tick_end)
    init_time = real(tick_end - tick_begin) / real(ticks_per_second)
    print *, 'Initialization time: ', init_time, ' seconds'

    ! Time the multiplication phase
    call system_clock(tick_begin, ticks_per_second)

    ! Clear the result matrix, then accumulate the product in parallel
    C = 0.0
    !$OMP PARALLEL PRIVATE(i, j, k) SHARED(A, B_transposed, C, n)
    ! Start the instrumentation region matrix_multiply_f in the parallel code section
    ! call start_perf_roofline_events("matrix_multiply_f")
    !$OMP DO
    do i = 1, n
        do k = 1, n
            do j = 1, n
                C(i, k) = C(i, k) + A(i, j) * B_transposed(k, j)
            end do
        end do
    end do
    !$OMP END DO
    ! Stop the instrumentation region matrix_multiply_f in the parallel code section
    ! call stop_perf_roofline_events("matrix_multiply_f")
    !$OMP END PARALLEL

    call system_clock(tick_end)
    calc_time = real(tick_end - tick_begin) / real(ticks_per_second)
    print *, 'Calculation time: ', calc_time, ' seconds'

    ! Only small results (n of at most 16) are printed
    if (n > 16) then
        print *, 'The dimension of the matrices is too large to print.'
    else
        print *, 'The product of A and B_transposed is:'
        print *, C
    end if

    ! Release the matrices
    deallocate(A, B, C, B_transposed)

    ! Finalize the instrumentation events in the serial code section
    ! call finalize_perf_roofline_events()
end program matrix_multiply
上述demo已经添加插桩代码(注释状态),包含如下5行:
- use roofline_events
- call init_perf_roofline_events()
- call start_perf_roofline_events("matrix_multiply_f")
- call stop_perf_roofline_events("matrix_multiply_f")
- call finalize_perf_roofline_events()
插桩代码处于注释状态(即未进行插桩)时,编译命令如下:
gfortran matrix_multiply.f90 -o matrix_multiply_f -fopenmp
插桩代码去除注释(即已进行插桩)后,需增加插桩说明中的编译选项,编译命令如下:
gfortran matrix_multiply.f90 -o matrix_multiply_f -fopenmp -I /usr/bin/devkit/tuner/include -L/usr/bin/devkit/tuner/lib -lrfevents
- Roofline分析任务采集模式选择region。
该示例中的region名为matrix_multiply_c或matrix_multiply_f,实际插桩过程中可以按需插入多个不同名称的region。
图1 Roofline分析任务采集模式选择region
父主题: 常用操作