中文
注册

Roofline插桩指导

插桩说明

Roofline分析采集模式指定为region时,支持对应用中已经插桩的region块进行分别采集,实现function/loop级别的定量数据分析,该能力需要用户手动对待分析的源码进行插桩,并重新编译。

region插桩与分析:
  1. 在源代码中插入Roofline Events API。
    • Roofline Events API定义在/usr/bin/devkit/tuner/include/roofline_events.h(.mod)。
    • roofline_events.h用于C/C++程序;roofline_events.mod用于Fortran程序。
  2. 使用新的编译标志重新编译应用程序:
    • C/C++:-DROOFLINE_EVENTS -I /usr/bin/devkit/tuner/include -L/usr/bin/devkit/tuner/lib -lrfevents
    • Fortran:-I /usr/bin/devkit/tuner/include -L/usr/bin/devkit/tuner/lib -lrfevents
  3. 需保证运行时动态库寻址路径包含/usr/bin/devkit/tuner/lib,例如在LD_LIBRARY_PATH中增加/usr/bin/devkit/tuner/lib路径。
  4. Roofline分析任务采集模式选择region模式,对插桩后编译生成的应用进行采集。

Roofline Events API介绍

数据是按线程收集,因此需注意以下规则:

  • initialize/finalize API需要在串行代码中调用(例如主线程)。
  • 如果需要分析所有线程数据,start/stop API需要放置在并行代码中。
  • 支持多个region,但不支持region之间嵌套,即同一个region的start/stop API需要成对且region之间不交错。
  • region名称用于匹配线程之间的region数据。
  • 以ROOFLINE_EVENTS开头的接口可以通过ROOFLINE_EVENTS编译选项进行开启和关闭,宏定义能力适用于C/C++。
  • 以perf_roofline_events结尾的接口适用于C/C++/Fortran,不支持编译选项开关。
// Convenience macros wrapping the roofline events API (usable from C/C++).
// When ROOFLINE_EVENTS is defined at compile time (e.g. -DROOFLINE_EVENTS),
// each macro expands to the matching *_perf_roofline_events() call.
#ifdef ROOFLINE_EVENTS
#define ROOFLINE_EVENTS_INIT init_perf_roofline_events()
#define ROOFLINE_EVENTS_START_REGION(region_label) start_perf_roofline_events(region_label)
#define ROOFLINE_EVENTS_STOP_REGION(region_label) stop_perf_roofline_events(region_label)
#define ROOFLINE_EVENTS_FINALIZE finalize_perf_roofline_events()
#else
// ROOFLINE_EVENTS is not defined: every macro expands to nothing, so the
// instrumentation statements in user code are compiled out.
#define ROOFLINE_EVENTS_INIT
#define ROOFLINE_EVENTS_START_REGION(region_label)
#define ROOFLINE_EVENTS_STOP_REGION(region_label)
#define ROOFLINE_EVENTS_FINALIZE
#endif

// The declarations below get C linkage when this header is included from C++.
#ifdef __cplusplus
extern "C" {
#endif
// Initialize roofline event collection (reads system counters).
// Should be called in serial code before start_perf_roofline_events.
extern void init_perf_roofline_events(void) __attribute__((visibility("default")));
// Start roofline events for the current thread and the region named `region`.
// Should be called in parallel code.
extern void start_perf_roofline_events(const char* region) __attribute__((visibility("default")));
// Stop roofline events for the current thread and the region named `region`.
// Should be called in parallel code.
extern void stop_perf_roofline_events(const char* region) __attribute__((visibility("default")));
// Summarize the collected data for all regions.
// Should be called in serial code after stop_perf_roofline_events has been
// called for all regions/threads.
extern void finalize_perf_roofline_events(void) __attribute__((visibility("default")));
#ifdef __cplusplus
}
#endif

插桩示例

  • C源码Demo

    文件名为matrix_multiply.c。

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    #include <time.h>
    #include <omp.h>
    
    // 使用插桩头文件
    // #include "roofline_events.h"
    
    // Element type of the matrices: double when DOUBLE_TYPE is defined,
    // float otherwise.
    #ifdef DOUBLE_TYPE
    typedef double real_t;
    #else
    typedef float real_t;
    #endif

    // Return a pseudo-random matrix element.
    //
    // rand() is reduced modulo `mod`, centred around zero and divided by 16,
    // so the result is a multiple of 1/16 in the range [-mod/32, mod/32).
    // Declared with (void) so the function has a proper prototype.
    static real_t rand_real(void)
    {
    #ifdef DOUBLE_TYPE
        const int mod = 1024;
        const double divider = 16.0;
    #else
        const int mod = 256;
        const float divider = 16.0f;
    #endif
        return ((rand() % mod) - mod / 2) / divider;
    }
    
    // Demo entry point: multiplies two n x n matrices with OpenMP and prints the
    // initialization and calculation times. B is stored transposed so that the
    // innermost loop reads both operands with unit stride.
    //
    // argv[1], when present, is the matrix dimension n (default 1024).
    // Returns 0 on success, EXIT_FAILURE when the dimension is invalid or the
    // matrices cannot be allocated.
    int main(int argc, char* argv[]) {
        size_t n = 1024;
        size_t count;
        size_t i, j, k;
        real_t *A = NULL, *B = NULL, *B_transposed = NULL, *C = NULL;
        double start_time, end_time;
        int status = EXIT_FAILURE;

        // Get the dimension of the matrices from the command line argument.
        // strtol (unlike atoi) allows non-numeric, negative and zero input to
        // be rejected instead of silently becoming a bogus size.
        if (argc >= 2) {
            char *end = NULL;
            long requested = strtol(argv[1], &end, 10);
            if (end == argv[1] || *end != '\0' || requested <= 0) {
                fprintf(stderr, "Invalid matrix dimension: %s\n", argv[1]);
                return EXIT_FAILURE;
            }
            n = (size_t)requested;
        }

        // Reject dimensions for which n * n * sizeof(real_t) overflows size_t.
        if (n > ((size_t)-1 / sizeof(real_t)) / n) {
            fprintf(stderr, "Matrix dimension too large: %zu\n", n);
            return EXIT_FAILURE;
        }
        count = n * n;

        // Serial code: initialize the instrumentation events
        // ROOFLINE_EVENTS_INIT;
        start_time = omp_get_wtime();
        // Allocate and initialize matrices A and B
        A = malloc(count * sizeof *A);
        B = malloc(count * sizeof *B);
        B_transposed = malloc(count * sizeof *B_transposed);
        C = malloc(count * sizeof *C);
        if (A == NULL || B == NULL || B_transposed == NULL || C == NULL) {
            fprintf(stderr, "Failed to allocate %zu x %zu matrices\n", n, n);
            goto cleanup;
        }
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {
                A[i * n + j] = rand_real();
                B[i * n + j] = rand_real();
                B_transposed[j * n + i] = B[i * n + j];
                C[i * n + j] = 0.0;
            }
        }
        end_time = omp_get_wtime();
        // Print the timings
        printf("Initialization time: %f seconds\n", end_time - start_time);

        // Perform matrix multiplication
        start_time = omp_get_wtime();
        #pragma omp parallel
        {
            // Parallel code: start the instrumented region matrix_multiply_c
            // ROOFLINE_EVENTS_START_REGION("matrix_multiply_c");
            #pragma omp for private(i, j, k)
            for (i = 0; i < n; i++) {
                for (j = 0; j < n; j++) {
                    for (k = 0; k < n; k++) {
                        C[i * n + j] += A[i * n + k] * B_transposed[j * n + k];
                    }
                }
            }
            // Parallel code: stop the instrumented region matrix_multiply_c
            // ROOFLINE_EVENTS_STOP_REGION("matrix_multiply_c");
        }
        end_time = omp_get_wtime();
        // Print the timings
        printf("Calculation time: %f seconds\n", end_time - start_time);

        // Print the result if n is less than or equal to 16
        if (n <= 16) {
            printf("The product of A and B_transposed is:\n");
            for (i = 0; i < n; i++) {
                for (j = 0; j < n; j++) {
                    printf("%f ", C[i * n + j]);
                }
                printf("\n");
            }
        } else {
            printf("The dimension of the matrices is too large to print.\n");
        }
        status = 0;

    cleanup:
        // Deallocate matrices (free(NULL) is a no-op, so this is safe on the
        // allocation-failure path as well)
        free(A);
        free(B);
        free(B_transposed);
        free(C);
        // Serial code: finalize the instrumentation events
        // ROOFLINE_EVENTS_FINALIZE;
        return status;
    }

    上述demo已经添加插桩代码(注释状态),包含如下5行:

    1. #include "roofline_events.h"
    2. ROOFLINE_EVENTS_INIT;
    3. ROOFLINE_EVENTS_START_REGION("matrix_multiply_c");
    4. ROOFLINE_EVENTS_STOP_REGION("matrix_multiply_c");
    5. ROOFLINE_EVENTS_FINALIZE;

    插桩代码处于注释状态时(即未进行插桩),编译命令如下:

    gcc matrix_multiply.c -o matrix_multiply_c -fopenmp

    插桩代码去除注释后(即已进行插桩),需增加插桩说明中的编译选项(其中的include/lib路径请以工具的实际安装路径为准,本例为/usr/local/devkit/tuner),编译命令如下:

    gcc matrix_multiply.c -o matrix_multiply_c -fopenmp -DROOFLINE_EVENTS -I /usr/local/devkit/tuner/include -L/usr/local/devkit/tuner/lib -lrfevents
  • Fortran源码Demo

    文件名为matrix_multiply.f90。

    ! Demo: multiply two n x n matrices with OpenMP and report the
    ! initialization and calculation times. The optional first command-line
    ! argument sets the dimension n (default 1024).
    program matrix_multiply
        ! Use the instrumentation module
        ! use roofline_events
        implicit none
        integer :: n = 1024
        real, dimension(:,:), allocatable :: A, B, C, B_transposed
        integer :: i, j, k
        integer :: start_time, end_time, clock_rate
        real :: init_time, calc_time
        character(len=20) :: arg
        integer :: ios
        ! Get the dimension of the matrices from the command line argument.
        ! The standard Fortran 2003 intrinsics are used instead of the GNU
        ! extensions iargc/getarg, and the value is validated so that bad
        ! input produces a clear message instead of a runtime read error.
        if (command_argument_count() > 0) then
            call get_command_argument(1, arg)
            read(arg, *, iostat=ios) n
            if (ios /= 0 .or. n < 1) then
                print *, 'Invalid matrix dimension: ', trim(arg)
                stop 1
            end if
        end if

        ! Serial code: initialize the instrumentation events
        ! call init_perf_roofline_events()
        ! Start timing
        call system_clock(start_time, clock_rate)
        ! Allocate and initialize matrices A and B
        allocate(A(n,n), B(n,n), C(n,n), B_transposed(n,n))
        call random_number(A)
        call random_number(B)
        ! Transpose matrix B
        B_transposed = transpose(B)
        ! End timing
        call system_clock(end_time)
        init_time = real(end_time - start_time) / real(clock_rate)
        print *, 'Initialization time: ', init_time, ' seconds'
        ! Start timing
        call system_clock(start_time, clock_rate)
        ! Perform matrix multiplication
        C = 0.0
        !$OMP PARALLEL PRIVATE(i, j, k) SHARED(A, B_transposed, C, n)
        ! Parallel code: start the instrumented region matrix_multiply_f
        ! call start_perf_roofline_events("matrix_multiply_f")
        !$OMP DO
        do i = 1, n
            do k = 1, n
                do j = 1, n
                    C(i, k) = C(i, k) + A(i, j) * B_transposed(k, j)
                end do
            end do
        end do
        !$OMP END DO
        ! Parallel code: stop the instrumented region matrix_multiply_f
        ! call stop_perf_roofline_events("matrix_multiply_f")
        !$OMP END PARALLEL
        ! End timing
        call system_clock(end_time)
        calc_time = real(end_time - start_time) / real(clock_rate)
        print *, 'Calculation time: ', calc_time, ' seconds'
        ! Print the result if n is less than or equal to 16
        if (n <= 16) then
            print *, 'The product of A and B_transposed is:'
            print *, C
        else
            print *, 'The dimension of the matrices is too large to print.'
        end if
        ! Deallocate matrices
        deallocate(A, B, C, B_transposed)
        ! Serial code: finalize the instrumentation events
        ! call finalize_perf_roofline_events()
    end program matrix_multiply

    上述demo已添加插桩代码(注释状态),包含如下5行:

    1. use roofline_events
    2. call init_perf_roofline_events()
    3. call start_perf_roofline_events("matrix_multiply_f")
    4. call stop_perf_roofline_events("matrix_multiply_f")
    5. call finalize_perf_roofline_events()

    插桩代码处于注释状态时(即未进行插桩),编译命令如下:

    gfortran matrix_multiply.f90 -o matrix_multiply_f -fopenmp

    插桩代码去除注释后(即已进行插桩),增加插桩说明中的编译选项即可(其中的include/lib路径请以工具的实际安装路径为准,本例为/usr/local/devkit/tuner),编译命令如下:

    gfortran matrix_multiply.f90 -o matrix_multiply_f -fopenmp -I /usr/local/devkit/tuner/include -L/usr/local/devkit/tuner/lib -lrfevents
  • Roofline分析任务采集模式选择region。

    该示例中的region名为matrix_multiply_c或matrix_multiply_f,实际插桩过程中可以按需插入多个不同名称的region。

    图1 Roofline分析任务采集模式选择region
搜索结果
找到“0”个结果

当前产品无相关内容

未找到相关内容,请尝试其他搜索词