어제 했던 소스에서 대충 openmp 적용해봄
2014/01/16 - [Programming/openCL / CUDA] - cuda 1차원 데이터, 2차원 처리 예제
CUDA Runtime API - Host - Extra C++ Options 에서
/openmp 를 추가하면 된다.
[링크 :
https://devtalk.nvidia.com/.../vs2010-cuda4rc2-howto-compile-openmp-host-code...]
물론 #include <omp.h>를 추가하지 않으면
컴파일 시에는 문제가 없으나 실행시에 이런 문제가 발생한다.
[링크 :
http://blog.naver.com/changfull7/70110120004]
openmp 적용
cpu Time : 0.048000
gpu Time : 0.001000
openmp 미적용
cpu Time : 0.131000
gpu Time : 0.001000
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <omp.h>
#include "cuda_runtime.h"
// Number of blocks along each axis of the 2D launch grid (used as dim3 block(BLOCK_WID, BLOCK_WID)).
#define BLOCK_WID 128
// Number of threads along each axis of a 2D thread block (used as dim3 thread(THREAD_WID, THREAD_WID)).
#define THREAD_WID 32
// Total element count: one array element per launched thread (threads per block * blocks per grid).
#define ARRAY_SIZE (THREAD_WID * THREAD_WID * BLOCK_WID * BLOCK_WID)
/*
 * Element-wise addition: c[i] = a[i] + b[i], one element per thread.
 *
 * Each thread derives its flat element index from its position in a 2D grid of
 * 2D blocks, treating the whole launch as one row-major matrix whose width is
 * gridDim.x * blockDim.x.
 *
 * There is no bounds check, so the launch configuration must produce exactly as
 * many threads as a, b and c have elements.
 */
__global__ void kernel_test(int *a, int *b, int *c)
{
    // Global x/y coordinates of this thread across the whole grid.
    const unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Number of threads in one full row of the grid.
    const unsigned int rowPitch = gridDim.x * blockDim.x;

    // Row-major flattening of (row, col).
    const int idx = rowPitch * row + col;

    c[idx] = a[idx] + b[idx];
}
// Aborts with a diagnostic when a CUDA runtime call does not return cudaSuccess.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/*
 * Adds two random integer arrays of ARRAY_SIZE elements on the CPU (optionally
 * parallelised with OpenMP) and on the GPU, prints the time taken by each, and
 * verifies that both produce the same result.
 *
 * Returns EXIT_SUCCESS when the results agree, EXIT_FAILURE on a host
 * allocation failure or on the first mismatching element. Any failing CUDA
 * call terminates the process through CUDA_CHECK.
 */
int main()
{
    const size_t bytes = (size_t)ARRAY_SIZE * sizeof(int);
    clock_t start_time, end_time;
    int *a, *b, *c, *res;
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;
    int idx = 0;
    int mismatch = 0;
    dim3 block(BLOCK_WID, BLOCK_WID);     // grid dimensions (blocks per grid)
    dim3 thread(THREAD_WID, THREAD_WID);  // block dimensions (threads per block)

    a   = (int *)malloc(bytes);
    b   = (int *)malloc(bytes);
    c   = (int *)malloc(bytes);
    res = (int *)malloc(bytes);
    if (a == NULL || b == NULL || c == NULL || res == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(a);
        free(b);
        free(c);
        free(res);
        return EXIT_FAILURE;
    }

    // Initialize the inputs. This loop is intentionally serial: rand() keeps
    // hidden state and is not safe to call from several OpenMP threads at once.
    srand((unsigned int)time(NULL));
    for (idx = 0; idx < ARRAY_SIZE; idx++)
    {
        a[idx] = rand() & 0xFFFF;
        b[idx] = rand() & 0xFFFF;
        c[idx] = 0;
    }

    // CPU reference result.
    start_time = clock();
#pragma omp parallel for
    for (idx = 0; idx < ARRAY_SIZE; idx++)
    {
        res[idx] = a[idx] + b[idx];
    }
    end_time = clock();
    printf("cpu Time : %f\n", ((double)(end_time - start_time)) / CLOCKS_PER_SEC);

    CUDA_CHECK(cudaMalloc(&dev_a, bytes));
    CUDA_CHECK(cudaMalloc(&dev_b, bytes));
    CUDA_CHECK(cudaMalloc(&dev_c, bytes));
    CUDA_CHECK(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice));

    start_time = clock();
    kernel_test<<<block, thread>>>(dev_a, dev_b, dev_c);
    // Surface launch-configuration errors (e.g. unsupported block size).
    CUDA_CHECK(cudaGetLastError());
    // The launch returns before the kernel has finished; wait for completion
    // so the measured interval covers the kernel execution, not just the launch.
    CUDA_CHECK(cudaDeviceSynchronize());
    end_time = clock();
    printf("gpu Time : %f\n", ((double)(end_time - start_time)) / CLOCKS_PER_SEC);

    CUDA_CHECK(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost));

    // Compare GPU output with the CPU reference; report only the first mismatch.
    for (idx = 0; idx < ARRAY_SIZE; idx++)
    {
        if (res[idx] != c[idx])
        {
            printf("%5d a:%5d b:%5d c:%5d != r:%5d\n",
                   idx, a[idx], b[idx], c[idx], res[idx]);
            mismatch = 1;
            break;
        }
    }

    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));
    free(a);
    free(b);
    free(c);
    free(res);

    return mismatch ? EXIT_FAILURE : EXIT_SUCCESS;
}