Roofline插桩指导

插桩说明

Roofline分析采集模式指定为region时，支持对应用中已经插桩的region块进行分别采集，实现function/loop级别的定量数据分析，该能力需要用户手动对待分析的源码进行插桩，并重新编译。

region插桩与分析：

在源代码中插入Roofline Events API。
- Roofline Events API定义在/usr/bin/devkit/tuner/include/roofline_events.h(.mod)。
- roofline_events.h用于C/C++程序；roofline_events.mod用于Fortran程序。
使用新的编译标志重新编译应用程序:
- C/C++：-DROOFLINE_EVENTS -I /usr/bin/devkit/tuner/include -L/usr/bin/devkit/tuner/lib -lrfevents
- Fortran：-I /usr/bin/devkit/tuner/include -L/usr/bin/devkit/tuner/lib -lrfevents
需保证运行时动态库寻址路径包含/usr/bin/devkit/tuner/lib，比如在LD_LIBRARY_PATH中增加/usr/bin/devkit/tuner/lib路径。
Roofline分析任务采集模式选择region模式，对插桩后编译生成的应用进行采集。

Roofline Events API介绍

数据是按线程收集，因此需注意以下规则：

需在串行代码（例如主线程）中调用initialize/finalize接口。
如果需要分析所有线程数据，start/stop API需要放置在并行代码中。
支持多个region，但不支持region之间嵌套，即同一个region的start/stop API需要成对且region之间不交错。
region名称用于匹配线程之间的region数据。
以ROOFLINE_EVENTS开头的接口可以通过ROOFLINE_EVENTS编译选项进行开启和关闭，宏定义能力适用于C/C++。
以perf_roofline_events结尾的接口适用于C/C++/Fortran，不支持编译选项开关。

      
       
         
         
           #ifdef ROOFLINE_EVENTS
// Instrumentation enabled (ROOFLINE_EVENTS is defined, e.g. via -DROOFLINE_EVENTS):
// each macro expands to a call to the matching *_perf_roofline_events function
// declared below.
#define ROOFLINE_EVENTS_INIT init_perf_roofline_events()
#define ROOFLINE_EVENTS_START_REGION(region_label) start_perf_roofline_events(region_label)
#define ROOFLINE_EVENTS_STOP_REGION(region_label) stop_perf_roofline_events(region_label)
#define ROOFLINE_EVENTS_FINALIZE finalize_perf_roofline_events()
#else
// Instrumentation disabled (ROOFLINE_EVENTS is not defined): every macro expands
// to nothing, so the macro invocations in instrumented sources become empty
// statements.
#define ROOFLINE_EVENTS_INIT
#define ROOFLINE_EVENTS_START_REGION(region_label)
#define ROOFLINE_EVENTS_STOP_REGION(region_label)
#define ROOFLINE_EVENTS_FINALIZE
#endif

// The functions below are declared with C linkage so that C++ translation units
// can call them as well. They are always declared, independent of whether
// ROOFLINE_EVENTS is defined.
#ifdef __cplusplus
extern "C" {
#endif
// read system counters -> init
// should be called in serial code before start_perf_roofline_events
extern void init_perf_roofline_events(void) __attribute__((visibility("default")));
// start roofline events for current thread and provided region
// should be called in parallel code
// region: label identifying the region; pass the same label to the matching
// stop_perf_roofline_events call
extern void start_perf_roofline_events(const char* region) __attribute__((visibility("default")));
// stop roofline events for current thread and provided region
// should be called in parallel code
// region: label that was passed to the matching start_perf_roofline_events call
extern void stop_perf_roofline_events(const char* region) __attribute__((visibility("default")));
// summarize data for all regions
// should be called in serial code after stop_perf_roofline_events for all regions/threads
extern void finalize_perf_roofline_events(void) __attribute__((visibility("default")));
#ifdef __cplusplus
}
#endif

          

        

      
     

插桩示例

C源码demo

文件名为matrix_multiply.c。

        
         
           
           
             #include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>

// 使用插桩头文件
// #include "roofline_events.h"

/* Element type of the matrices: double when DOUBLE_TYPE is defined, float otherwise. */
#ifdef DOUBLE_TYPE
typedef double real_t;
#else
typedef float real_t;
#endif

/*
 * Returns a pseudo-random matrix element derived from rand().
 *
 * The raw value is reduced modulo `span`, shifted so the range is centred on
 * zero, and then divided by `scale`. With DOUBLE_TYPE the result lies in
 * [-32, 32); otherwise it lies in [-8, 8). Results are multiples of 1/16.
 */
static real_t rand_real()
{
#ifdef DOUBLE_TYPE
    const int span = 1024;
    const double scale = 16.0;
#else
    const int span = 256;
    const float scale = 16.0f;
#endif
    /* Integer in [-span / 2, span / 2 - 1]. */
    const int centred = rand() % span - span / 2;

    return centred / scale;
}

/*
 * Demo entry point: multiplies two n x n matrices with OpenMP and prints the
 * initialization and calculation timings.
 *
 * argv[1] (optional) is the matrix dimension n; it defaults to 1024. The value
 * must be a positive integer small enough that n * n * sizeof(real_t) fits in
 * size_t.
 *
 * Returns 0 on success and 1 on an invalid dimension or allocation failure.
 */
int main(int argc, char* argv[]) {
    size_t n = 1024;
    size_t i, j, k;
    real_t *A = NULL, *B = NULL, *B_transposed = NULL, *C = NULL;
    double start_time, end_time;
    int status = 0;

    // Get the dimension of the matrices from the command line argument.
    // strtol is used instead of atoi so that non-numeric, zero or negative
    // input is rejected instead of being converted into a bogus size_t.
    if (argc >= 2) {
        char *end = NULL;
        long requested = strtol(argv[1], &end, 10);
        if (end == argv[1] || requested <= 0) {
            fprintf(stderr, "Invalid matrix dimension: %s\n", argv[1]);
            return 1;
        }
        n = (size_t)requested;
    }
    // Reject dimensions for which n * n * sizeof(real_t) would overflow size_t.
    if (n > ((size_t)-1) / sizeof(real_t) / n) {
        fprintf(stderr, "Matrix dimension too large: %zu\n", n);
        return 1;
    }

    // Initialize the instrumentation events in serial code
    // ROOFLINE_EVENTS_INIT;
    start_time = omp_get_wtime();
    // Allocate and initialize matrices A and B
    A = (real_t*)malloc(n * n * sizeof(real_t));
    B = (real_t*)malloc(n * n * sizeof(real_t));
    B_transposed = (real_t*)malloc(n * n * sizeof(real_t));
    C = (real_t*)malloc(n * n * sizeof(real_t));
    if (A == NULL || B == NULL || B_transposed == NULL || C == NULL) {
        fprintf(stderr, "Failed to allocate matrices of dimension %zu\n", n);
        status = 1;
        // Share the normal exit path so the buffers are released and the
        // finalize call (when enabled) still runs.
        goto cleanup;
    }
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            A[i * n + j] = rand_real();
            B[i * n + j] = rand_real();
            B_transposed[j * n + i] = B[i * n + j];
            C[i * n + j] = 0.0;
        }
    }
    end_time = omp_get_wtime();
    // Print the timings
    printf("Initialization time: %f seconds\n", end_time - start_time);

    // Perform matrix multiplication
    start_time = omp_get_wtime();
    #pragma omp parallel
    {
        // Start the instrumentation event matrix_multiply_c in parallel code
        // ROOFLINE_EVENTS_START_REGION("matrix_multiply_c");
        #pragma omp for private(i, j, k)
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {
                for (k = 0; k < n; k++) {
                    // B_transposed[j][k] == B[k][j], so this accumulates (A * B)[i][j]
                    // while reading both operands row-wise.
                    C[i * n + j] += A[i * n + k] * B_transposed[j * n + k];
                }
            }
        }
        // Stop the instrumentation event matrix_multiply_c in parallel code
        // ROOFLINE_EVENTS_STOP_REGION("matrix_multiply_c");
    }
    end_time = omp_get_wtime();
    // Print the timings
    printf("Calculation time: %f seconds\n", end_time - start_time);

    // Print the result if n is less than or equal to 16
    if (n <= 16) {
        printf("The product of A and B is:\n");
        for (i = 0; i < n; i++) {
            for (j = 0; j < n; j++) {
                printf("%f ", C[i * n + j]);
            }
            printf("\n");
        }
    } else {
        printf("The dimension of the matrices is too large to print.\n");
    }

cleanup:
    // Deallocate matrices (free(NULL) is a no-op, so this is safe after a
    // partial allocation failure)
    free(A);
    free(B);
    free(B_transposed);
    free(C);
    // Finalize the instrumentation events in serial code
    // ROOFLINE_EVENTS_FINALIZE;
    return status;
}

            

          

        
       

上述demo已经添加插桩代码（注释状态），包含如下5行：

#include "roofline_events.h"
ROOFLINE_EVENTS_INIT;
ROOFLINE_EVENTS_START_REGION("matrix_multiply_c");
ROOFLINE_EVENTS_STOP_REGION("matrix_multiply_c");
ROOFLINE_EVENTS_FINALIZE;

插桩代码注释状态（即未进行插桩），编译命令如下：

        
             gcc matrix_multiply.c -o matrix_multiply_c -fopenmp

插桩代码去除注释状态（即已进行插桩），需增加插桩说明中的编译选项，编译命令如下：

        
             gcc matrix_multiply.c -o matrix_multiply_c -fopenmp -DROOFLINE_EVENTS -I /usr/bin/devkit/tuner/include -L/usr/bin/devkit/tuner/lib -lrfevents

Fortran源码demo

文件名为matrix_multiply.f90。

        
         
           
           
             program matrix_multiply
    ! Demo: multiplies two n x n matrices with OpenMP and prints the
    ! initialization and calculation timings.
    !
    ! The optional first command line argument is the matrix dimension n
    ! (default 1024); it must be a positive integer.

    ! Use the instrumentation module
    ! use roofline_events
    implicit none
    integer :: n = 1024
    real, dimension(:,:), allocatable :: A, B, C, B_transposed
    integer :: i, j, k
    integer :: start_time, end_time, clock_rate
    integer :: io_status
    real :: init_time, calc_time
    character(len=20) :: arg
    ! Get the dimension of the matrices from the command line argument.
    ! The standard intrinsics command_argument_count/get_command_argument are
    ! used instead of the non-standard iargc/getarg extensions.
    if (command_argument_count() > 0) then
        call get_command_argument(1, arg)
        read(arg, *, iostat=io_status) n
        ! Reject input that is not an integer or is not positive
        if (io_status /= 0 .or. n <= 0) then
            print *, 'Invalid matrix dimension: ', trim(arg)
            stop 1
        end if
    end if

    ! Initialize the instrumentation events in serial code
    ! call init_perf_roofline_events()
    ! Start timing
    call system_clock(start_time, clock_rate)
    ! Allocate and initialize matrices A and B
    allocate(A(n,n), B(n,n), C(n,n), B_transposed(n,n))
    call random_number(A)
    call random_number(B)
    ! Transpose matrix B
    B_transposed = transpose(B)
    ! End timing
    call system_clock(end_time)
    init_time = real(end_time - start_time) / real(clock_rate)
    print *, 'Initialization time: ', init_time, ' seconds'
    ! Start timing
    call system_clock(start_time, clock_rate)
    ! Perform matrix multiplication
    C = 0.0
    !$OMP PARALLEL PRIVATE(i, j, k) SHARED(A, B_transposed, C, n)
    ! Start the instrumentation event matrix_multiply_f in parallel code
    ! call start_perf_roofline_events("matrix_multiply_f")
    !$OMP DO
    do i = 1, n
        do k = 1, n
            do j = 1, n
                ! B_transposed(k, j) == B(j, k), so this accumulates (A * B)(i, k)
                C(i, k) = C(i, k) + A(i, j) * B_transposed(k, j)
            end do
        end do
    end do
    !$OMP END DO
    ! Stop the instrumentation event matrix_multiply_f in parallel code
    ! call stop_perf_roofline_events("matrix_multiply_f")
    !$OMP END PARALLEL
    ! End timing
    call system_clock(end_time)
    calc_time = real(end_time - start_time) / real(clock_rate)
    print *, 'Calculation time: ', calc_time, ' seconds'
    ! Print the result if n is less than or equal to 16
    if (n <= 16) then
        print *, 'The product of A and B is:'
        print *, C
    else
        print *, 'The dimension of the matrices is too large to print.'
    end if
    ! Deallocate matrices
    deallocate(A, B, C, B_transposed)
    ! Finalize the instrumentation events in serial code
    ! call finalize_perf_roofline_events()
end program matrix_multiply

            

          

        
       

上述demo已添加插桩代码（注释状态），包含如下5行：

use roofline_events
call init_perf_roofline_events()
call start_perf_roofline_events("matrix_multiply_f")
call stop_perf_roofline_events("matrix_multiply_f")
call finalize_perf_roofline_events()

插桩代码注释状态（未进行插桩情况下），编译命令如下：

        
             gfortran matrix_multiply.f90 -o matrix_multiply_f -fopenmp

插桩代码去除注释状态（进行插桩情况下），增加插桩说明中的编译选项即可，编译命令如下：

        
             gfortran matrix_multiply.f90 -o matrix_multiply_f -fopenmp -I /usr/bin/devkit/tuner/include -L/usr/bin/devkit/tuner/lib -lrfevents

Roofline分析任务采集模式选择region。
该示例中的region名为matrix_multiply_c或matrix_multiply_f，实际插桩过程中可以按需插入多个不同名称的region。

图1 Roofline分析任务采集模式选择region

父主题： 常用操作