
Teacher Xiaopeng, why is the speedup of this GPU code so low? #8

Open
balleb6545anickk opened this issue Nov 23, 2023 · 4 comments

Comments

balleb6545anickk commented Nov 23, 2023

Test environment:
Laptop with an R7-5800H and an RTX 3060, Windows 11, latest MSVC in Release mode.
Test results:
GPU time: 0.0018809
CPU time: 0.0048002
ratio: 2.55208
With other CUDA programs I usually get a speedup of around 10x. Why is the speedup of this code so low?
(Also, switching to float makes it fast. Why? And if I have to use double, how should I change the code?)

#include <omp.h>
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define TYPE double
#define imgW 2448
#define imgH 2048
#define N (imgW * imgH)

__global__ void GPU_Cal(TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1,
                                      TYPE *para2) {
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    if (pos >= width * height)
        return;

    TYPE data = input[pos];
    TYPE row = pos / width;
    TYPE col = pos % width;
    TYPE x = (col - para2[0]) * para2[2];
    TYPE y = (row - para2[1]) * para2[3];

    const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
    const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

    output[pos] = a / b; 
}

void CPU_Cal(const TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1, TYPE *para2) {
#pragma omp parallel for
    for (int row = 0; row < height; ++row) {
        TYPE *_output = output + row * width;
        const TYPE *_input = input + row * width;
        for (int col = 0; col < width; ++col) {
            const TYPE data = *_input;
            const TYPE x = (col - para2[0]) * para2[2];
            const TYPE y = (row - para2[1]) * para2[3];

            const TYPE a =
                para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
            const TYPE b =
                para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

            *_output = a / b; 
            ++_output;
            ++_input;
        }
    }
}

int main() {
    // Prepare data
    std::vector<TYPE> input(N, 2);
    std::vector<TYPE> output(N, 0);
    std::vector<TYPE> para0(30, 1.5);
    std::vector<TYPE> para1(30, 1.5);
    std::vector<TYPE> para3{1246, 1037, 2448, 2048};
    // Fill input/output with some data
    for (int i = 0; i < N; ++i) {
        input[i] = (double)i / N;
        output[i] = (double)i / N + 2;
    }
    for (int i = 0; i < 30; ++i) {
        para0[i] = (double)i / 30;
        para1[i] = (double)i / 30 + 4.0;
    }

    TYPE *d_input;
    TYPE *d_output;
    TYPE *d_para0;
    TYPE *d_para1;
    TYPE *d_para2;
    cudaMalloc((void **)&d_input, N * sizeof(TYPE));
    cudaMalloc((void **)&d_output, N * sizeof(TYPE));
    cudaMalloc((void **)&d_para0, 30 * sizeof(TYPE));
    cudaMalloc((void **)&d_para1, 30 * sizeof(TYPE));
    cudaMalloc((void **)&d_para2, 4 * sizeof(TYPE));
    cudaMemcpy(d_input, input.data(), N * sizeof(TYPE), cudaMemcpyHostToDevice);
    cudaMemcpy(d_output, output.data(), N * sizeof(TYPE), cudaMemcpyHostToDevice);
    cudaMemcpy(d_para0, para0.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice);
    cudaMemcpy(d_para1, para1.data(), 30 * sizeof(TYPE), cudaMemcpyHostToDevice);
    cudaMemcpy(d_para2, para3.data(), 4 * sizeof(TYPE), cudaMemcpyHostToDevice);

    // GPU compute time (take the minimum over runs)
    int thread_num = 256;
    int block_num = (imgW * imgH + thread_num - 1) / thread_num;
    double gpu_time = 10000000;
    cudaDeviceSynchronize();
    for (size_t i = 0; i < 50; i++) {
        auto t0 = std::chrono::steady_clock::now();
        GPU_Cal<<<block_num, thread_num>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);
        cudaDeviceSynchronize();
        double time =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        gpu_time = std::min(gpu_time, time);
    }
    std::cout << "GPU time: " << gpu_time << std::endl;

    // CPU compute time (take the minimum over runs)
    TYPE *h_output;
    h_output = (TYPE *)malloc(N * sizeof(TYPE));
    cudaMemcpy(h_output, d_output, N * sizeof(TYPE), cudaMemcpyDeviceToHost);
    double cpu_time = 10000000;
    for (size_t i = 0; i < 50; i++) {
        auto t0 = std::chrono::steady_clock::now();
        CPU_Cal(input.data(), output.data(), imgW, imgH, para0.data(), para1.data(), para3.data());
        double time =
            std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - t0).count();
        cpu_time = std::min(cpu_time, time);
    }
    std::cout << "CPU time: " << cpu_time << std::endl;
    std::cout << "ratio: " << cpu_time / gpu_time << std::endl;

    // Check that CPU and GPU results agree
    for (int i = 0; i < N; i++) {
        if (h_output[i] != h_output[i] && output[i] != output[i]) {
            continue;
        }
        if (fabs(h_output[i] - output[i]) > 1e-2) {
            printf("Error! i: %d, cpu: %f, gpu:%f.\n", i, output[i], h_output[i]);
            abort();
        }
    }

    cudaFree(d_input);
    cudaFree(d_output);
    cudaFree(d_para0);
    cudaFree(d_para1);
    cudaFree(d_para2);
    free(h_output);
    return 0;
}
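
As a side note on measurement: instead of std::chrono plus cudaDeviceSynchronize(), the kernel window could also be timed with CUDA events. A minimal sketch, as a drop-in replacement for the GPU timing loop above, assuming the same kernel and launch configuration:

    // Time only the kernel on the GPU's own timeline using CUDA events.
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float gpu_ms = 1e9f;
    for (int i = 0; i < 50; i++) {
        cudaEventRecord(start);
        GPU_Cal<<<block_num, thread_num>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);
        cudaEventRecord(stop);
        cudaEventSynchronize(stop);
        float ms = 0;
        cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
        gpu_ms = std::min(gpu_ms, ms);
    }
    cudaEventDestroy(start);
    cudaEventDestroy(stop);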
@balleb6545anickk (Author)

By the way, Teacher Xiaopeng, when are you going to cover CUDA Nsight?

@balleb6545anickk (Author)

Or take a look at this slightly simplified version of the problem first: https://stackoverflow.com/questions/77562389/why-is-this-cuda-code-for-summing-arrays-so-slow

@HJzhang-sjtu

The README says that Teacher Xiaopeng does not answer questions about CUDA optimization, so let me take a shot at it. You are launching far too many blocks: the GPU only has a few dozen SMs, yet you launch more than 10,000 blocks, and just scheduling all of those blocks onto the SMs already costs a lot. Have each block process more data, for example each block handles 256*256 input elements and each thread within a block handles 256 elements, as in the sketch below.
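
A minimal sketch of that idea as a grid-stride loop, assuming the same kernel parameters as above; the kernel name GPU_Cal_strided, the 256-elements-per-thread figure, and the launch numbers are illustrative choices, not part of the original code:

__global__ void GPU_Cal_strided(TYPE *input, TYPE *output, int width, int height,
                                TYPE *para0, TYPE *para1, TYPE *para2) {
    int total = width * height;
    // Grid-stride loop: each thread walks the image with a stride of
    // gridDim.x * blockDim.x, so a small, fixed number of blocks covers all pixels.
    for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < total;
         pos += gridDim.x * blockDim.x) {
        TYPE data = input[pos];
        TYPE row = pos / width;
        TYPE col = pos % width;
        TYPE x = (col - para2[0]) * para2[2];
        TYPE y = (row - para2[1]) * para2[3];

        const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
        const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

        output[pos] = a / b;
    }
}

// Launch with a bounded number of blocks so each thread handles many pixels,
// e.g. roughly 256 elements per thread (illustrative numbers):
// int thread_num = 256;
// int block_num  = (imgW * imgH + thread_num * 256 - 1) / (thread_num * 256);
// GPU_Cal_strided<<<block_num, thread_num>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);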

@archibate (Contributor) commented Dec 11, 2023

1. GPUs (consumer cards in particular) simply have poor double-precision support; that is normal. Typical graphics workloads use float data, and the AI crowd would rather use half. (This is also why people doing scientific computing are often reluctant to use GPU clusters: research simulations need double precision.)
2. Your main bottleneck here is the double-precision floating-point division: changing a / b to a + b raises the speedup from 2.23 to 2.99.
3. The integer division pos / width is also expensive. You are trying to run everything with a one-dimensional blockDim and gridDim and then reconstruct row and col with division and modulo, which is the wrong approach. Use blockDim.x and blockDim.y (a 2D launch) to avoid the inefficient division.

Issues 1 and 2 may be inherent to your algorithm, and changing them would make your results wrong, so I will leave them alone. Issue 3 I can fix for you; here is the modified kernel.

__global__ void GPU_Cal(TYPE *input, TYPE *output, int width, int height, TYPE *para0, TYPE *para1,
                        TYPE *para2) {
    // 2D grid-stride loops: row and col come directly from the thread indices,
    // so no per-element integer division or modulo is needed.
    for (int row = threadIdx.y + blockIdx.y * blockDim.y; row < height; row += gridDim.y * blockDim.y) {
        for (int col = threadIdx.x + blockIdx.x * blockDim.x; col < width; col += gridDim.x * blockDim.x) {
            int i = row * width + col;
            TYPE data = input[i];
            TYPE x = (col - para2[0]) * para2[2];
            TYPE y = (row - para2[1]) * para2[3];

            const TYPE a = para0[0] + para0[2] * x + data * (para0[1] + para0[3] * x) + para0[4] * y + data * para0[5] * y;
            const TYPE b = para1[0] + para1[2] * x + data * (para1[1] + para1[3] * x) + para1[4] * y + data * para1[5] * y;

            output[i] = a / b;
        }
    }
}
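
This kernel needs a 2D launch. A minimal sketch of the launch configuration, assuming the same device buffers as in the original main(); the 16x16 block shape is an illustrative choice:

    // One thread per pixel, arranged as a 2D grid of 16x16 blocks.
    dim3 block(16, 16);
    dim3 grid((imgW + block.x - 1) / block.x, (imgH + block.y - 1) / block.y);
    GPU_Cal<<<grid, block>>>(d_input, d_output, imgW, imgH, d_para0, d_para1, d_para2);
    cudaDeviceSynchronize();

Because the kernel loops with a grid stride in both dimensions, the grid can also be capped at a smaller size (fewer blocks, more work per thread) without missing any pixels.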
