I have the following fragment of code for which I want to count the number of executed instructions.
#include <cstdio>
#include <cuda.h>
/*
 * Toy kernel with two identical compute regions, used to ask whether a
 * profiler metric can be collected per region rather than per kernel.
 *
 * The kernel takes no arguments and does not use thread or block indices,
 * so every launched thread performs the same arithmetic and prints the
 * same kind of line.
 */
__global__ void kernel() {
    double value = 10;

    // region 1 start
    for (int i = 0; i < 100000; i++) {
        value *= 1.000001;
    }
    // region 1 end

    // region 2 start
    for (int i = 0; i < 100000; i++) {
        value *= 1.000001;
    }
    // region 2 end

    // Printing the result keeps `value` live, so the compiler cannot
    // discard the whole kernel body as dead code.
    printf("value: %lg\n", value);
}
/*
 * Launches `kernel` on a grid of 32 blocks of 32 threads and waits for it
 * to finish.
 *
 * Returns 0 on success, or 1 if the launch or the execution reported a
 * CUDA error (the error string is written to stderr).
 */
int main() {
    kernel<<<32, 32>>>();

    // A kernel launch returns nothing; launch-time errors have to be
    // fetched explicitly.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    // Block until the kernel has completed; errors raised while it was
    // running are reported by this call.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "kernel execution failed: %s\n", cudaGetErrorString(status));
        return 1;
    }

    return 0;
}
I am measuring this with Nsight Compute, using the following command:
nvcc sample.cu -arch=sm_75 && /opt/cuda/nsight_compute/ncu --metrics smsp__inst_executed ./a.out
As part of the output, I see the metric collected for the entire kernel:
kernel(), 2021-Jan-24 12:10:11, Context 1, Stream 7
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
smsp__inst_executed.avg inst 41871.01
smsp__inst_executed.max inst 209356
smsp__inst_executed.min inst 0
smsp__inst_executed.sum inst 6699361
---------------------------------------------------------------------- --------------- ------------------------------
Is it possible to collect this metric for region 1 and 2 individually as commented in the code instead of the whole kernel?
question from:
https://stackoverflow.com/questions/65869878/is-it-possible-to-collect-performance-metrics-in-cuda-per-region-inside-the-kern 与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…