warp 크기는 32 라는데,
원래는 32개씩 묶여서 원하는 수량대로만 돌라고
결론 : warp은 최소단위로 묶이는 쓰레드의 갯수이긴 하지만, 실제로 그 이하로도 묶인다.
그 이하로도 묶어서 사용이 가능한지 테스트를 해보았다.
VectorAdd를 변형해서 int 형으로 계산하고, 변수는 총 64개의 int형 배열로 선언
그리고 커널코드는 단순하게 두개의 배열을 더해서 세번째로 던져주게 해놓았다.
단, 커널은 1 block / 5 thread로 설정했다.
// Device code
__global__ void VecAdd(const int* A, const int* B, int* C, int N)
{
int i = threadIdx.x;
C[i] = A[i] + B[i];
}
VecAdd<<<1,5>>>(d_A, d_B, d_C, N);
|
결과는 아래와 같이 5개만 계산이 되어 나온다.
Vector addition
[000] h_A[0] + h_B[0] = 0, h_C[0]
[001] h_A[1] + h_B[1] = 2, h_C[2]
[002] h_A[2] + h_B[2] = 4, h_C[4]
[003] h_A[3] + h_B[3] = 6, h_C[6]
[004] h_A[4] + h_B[4] = 8, h_C[8]
[005] h_A[5] + h_B[5] = 10, h_C[133]- not ok
[006] h_A[6] + h_B[6] = 12, h_C[134]- not ok
[007] h_A[7] + h_B[7] = 14, h_C[135]- not ok
[008] h_A[8] + h_B[8] = 16, h_C[136]- not ok
[009] h_A[9] + h_B[9] = 18, h_C[137]- not ok
[010] h_A[10] + h_B[10] = 20, h_C[138]- not ok
[011] h_A[11] + h_B[11] = 22, h_C[139]- not ok
[012] h_A[12] + h_B[12] = 24, h_C[140]- not ok
[013] h_A[13] + h_B[13] = 26, h_C[141]- not ok
[014] h_A[14] + h_B[14] = 28, h_C[142]- not ok
[015] h_A[15] + h_B[15] = 30, h_C[143]- not ok
[016] h_A[16] + h_B[16] = 32, h_C[144]- not ok
[017] h_A[17] + h_B[17] = 34, h_C[145]- not ok
[018] h_A[18] + h_B[18] = 36, h_C[146]- not ok
[019] h_A[19] + h_B[19] = 38, h_C[147]- not ok
[020] h_A[20] + h_B[20] = 40, h_C[148]- not ok
[021] h_A[21] + h_B[21] = 42, h_C[149]- not ok
[022] h_A[22] + h_B[22] = 44, h_C[150]- not ok
[023] h_A[23] + h_B[23] = 46, h_C[151]- not ok
[024] h_A[24] + h_B[24] = 48, h_C[152]- not ok
[025] h_A[25] + h_B[25] = 50, h_C[153]- not ok
[026] h_A[26] + h_B[26] = 52, h_C[154]- not ok
[027] h_A[27] + h_B[27] = 54, h_C[155]- not ok
[028] h_A[28] + h_B[28] = 56, h_C[156]- not ok
[029] h_A[29] + h_B[29] = 58, h_C[157]- not ok
[030] h_A[30] + h_B[30] = 60, h_C[158]- not ok
[031] h_A[31] + h_B[31] = 62, h_C[159]- not ok
[032] h_A[32] + h_B[32] = 64, h_C[160]- not ok
[033] h_A[33] + h_B[33] = 66, h_C[161]- not ok
[034] h_A[34] + h_B[34] = 68, h_C[162]- not ok
[035] h_A[35] + h_B[35] = 70, h_C[163]- not ok
[036] h_A[36] + h_B[36] = 72, h_C[164]- not ok
[037] h_A[37] + h_B[37] = 74, h_C[165]- not ok
[038] h_A[38] + h_B[38] = 76, h_C[166]- not ok
[039] h_A[39] + h_B[39] = 78, h_C[167]- not ok
[040] h_A[40] + h_B[40] = 80, h_C[168]- not ok
[041] h_A[41] + h_B[41] = 82, h_C[169]- not ok
[042] h_A[42] + h_B[42] = 84, h_C[170]- not ok
[043] h_A[43] + h_B[43] = 86, h_C[171]- not ok
[044] h_A[44] + h_B[44] = 88, h_C[172]- not ok
[045] h_A[45] + h_B[45] = 90, h_C[173]- not ok
[046] h_A[46] + h_B[46] = 92, h_C[174]- not ok
[047] h_A[47] + h_B[47] = 94, h_C[175]- not ok
[048] h_A[48] + h_B[48] = 96, h_C[176]- not ok
[049] h_A[49] + h_B[49] = 98, h_C[177]- not ok
[050] h_A[50] + h_B[50] = 100, h_C[178]- not ok
[051] h_A[51] + h_B[51] = 102, h_C[179]- not ok
[052] h_A[52] + h_B[52] = 104, h_C[180]- not ok
[053] h_A[53] + h_B[53] = 106, h_C[181]- not ok
[054] h_A[54] + h_B[54] = 108, h_C[182]- not ok
[055] h_A[55] + h_B[55] = 110, h_C[183]- not ok
[056] h_A[56] + h_B[56] = 112, h_C[184]- not ok
[057] h_A[57] + h_B[57] = 114, h_C[185]- not ok
[058] h_A[58] + h_B[58] = 116, h_C[186]- not ok
[059] h_A[59] + h_B[59] = 118, h_C[187]- not ok
[060] h_A[60] + h_B[60] = 120, h_C[188]- not ok
[061] h_A[61] + h_B[61] = 122, h_C[189]- not ok
[062] h_A[62] + h_B[62] = 124, h_C[190]- not ok
[063] h_A[63] + h_B[63] = 126, h_C[191]- not ok
PASSED
|
2개씩 3개 블럭을 사용해도 제대로 나온다.
__global__ void VecAdd(const int* A, const int* B, int* C, int N)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
C[i] = A[i] + B[i];
}
VecAdd<<<3,2>>>(d_A, d_B, d_C, N);
|
Vector addition
[000] h_A[0] + h_B[0] = 0, h_C[0]
[001] h_A[1] + h_B[1] = 2, h_C[2]
[002] h_A[2] + h_B[2] = 4, h_C[4]
[003] h_A[3] + h_B[3] = 6, h_C[6]
[004] h_A[4] + h_B[4] = 8, h_C[8]
[005] h_A[5] + h_B[5] = 10, h_C[10]
[006] h_A[6] + h_B[6] = 12, h_C[134]- not ok
[007] h_A[7] + h_B[7] = 14, h_C[135]- not ok
[008] h_A[8] + h_B[8] = 16, h_C[136]- not ok
[009] h_A[9] + h_B[9] = 18, h_C[137]- not ok
[010] h_A[10] + h_B[10] = 20, h_C[138]- not ok
[011] h_A[11] + h_B[11] = 22, h_C[139]- not ok
[012] h_A[12] + h_B[12] = 24, h_C[140]- not ok
[013] h_A[13] + h_B[13] = 26, h_C[141]- not ok
[014] h_A[14] + h_B[14] = 28, h_C[142]- not ok
[015] h_A[15] + h_B[15] = 30, h_C[143]- not ok
[016] h_A[16] + h_B[16] = 32, h_C[144]- not ok
[017] h_A[17] + h_B[17] = 34, h_C[145]- not ok
[018] h_A[18] + h_B[18] = 36, h_C[146]- not ok
[019] h_A[19] + h_B[19] = 38, h_C[147]- not ok
[020] h_A[20] + h_B[20] = 40, h_C[148]- not ok
[021] h_A[21] + h_B[21] = 42, h_C[149]- not ok
[022] h_A[22] + h_B[22] = 44, h_C[150]- not ok
[023] h_A[23] + h_B[23] = 46, h_C[151]- not ok
[024] h_A[24] + h_B[24] = 48, h_C[152]- not ok
[025] h_A[25] + h_B[25] = 50, h_C[153]- not ok
[026] h_A[26] + h_B[26] = 52, h_C[154]- not ok
[027] h_A[27] + h_B[27] = 54, h_C[155]- not ok
[028] h_A[28] + h_B[28] = 56, h_C[156]- not ok
[029] h_A[29] + h_B[29] = 58, h_C[157]- not ok
[030] h_A[30] + h_B[30] = 60, h_C[158]- not ok
[031] h_A[31] + h_B[31] = 62, h_C[159]- not ok
[032] h_A[32] + h_B[32] = 64, h_C[160]- not ok
[033] h_A[33] + h_B[33] = 66, h_C[161]- not ok
[034] h_A[34] + h_B[34] = 68, h_C[162]- not ok
[035] h_A[35] + h_B[35] = 70, h_C[163]- not ok
[036] h_A[36] + h_B[36] = 72, h_C[164]- not ok
[037] h_A[37] + h_B[37] = 74, h_C[165]- not ok
[038] h_A[38] + h_B[38] = 76, h_C[166]- not ok
[039] h_A[39] + h_B[39] = 78, h_C[167]- not ok
[040] h_A[40] + h_B[40] = 80, h_C[168]- not ok
[041] h_A[41] + h_B[41] = 82, h_C[169]- not ok
[042] h_A[42] + h_B[42] = 84, h_C[170]- not ok
[043] h_A[43] + h_B[43] = 86, h_C[171]- not ok
[044] h_A[44] + h_B[44] = 88, h_C[172]- not ok
[045] h_A[45] + h_B[45] = 90, h_C[173]- not ok
[046] h_A[46] + h_B[46] = 92, h_C[174]- not ok
[047] h_A[47] + h_B[47] = 94, h_C[175]- not ok
[048] h_A[48] + h_B[48] = 96, h_C[176]- not ok
[049] h_A[49] + h_B[49] = 98, h_C[177]- not ok
[050] h_A[50] + h_B[50] = 100, h_C[178]- not ok
[051] h_A[51] + h_B[51] = 102, h_C[179]- not ok
[052] h_A[52] + h_B[52] = 104, h_C[180]- not ok
[053] h_A[53] + h_B[53] = 106, h_C[181]- not ok
[054] h_A[54] + h_B[54] = 108, h_C[182]- not ok
[055] h_A[55] + h_B[55] = 110, h_C[183]- not ok
[056] h_A[56] + h_B[56] = 112, h_C[184]- not ok
[057] h_A[57] + h_B[57] = 114, h_C[185]- not ok
[058] h_A[58] + h_B[58] = 116, h_C[186]- not ok
[059] h_A[59] + h_B[59] = 118, h_C[187]- not ok
[060] h_A[60] + h_B[60] = 120, h_C[188]- not ok
[061] h_A[61] + h_B[61] = 122, h_C[189]- not ok
[062] h_A[62] + h_B[62] = 124, h_C[190]- not ok
[063] h_A[63] + h_B[63] = 126, h_C[191]- not ok
PASSED
|
원래는 32개씩 묶여서 원하는 수량대로만 돌라고
커널 코드에 if(i<N) 식으로 제한이 되어있는줄 알았는데, 없어도 의도한 대로는 돈다.
물론 돌아야 할 데이터의 갯수가 grid.x * gird.y * thread.x * thread.y 의 갯수만큼
떨어지지 않는다면 제한을 두어야겠지만 말이다.
결론 : warp은 최소단위로 묶이는 쓰레드의 갯수이긴 하지만, 실제로 그 이하로도 묶인다.
블럭단위라고 해서 달라지는건 없는듯?
'Programming > openCL & CUDA' 카테고리의 다른 글
CUDA에서 grid 와 thread 의 갯수에 대한 짧은생각 (2) | 2011.01.18 |
---|---|
CUDA 3.1과 3.2의 devicequery 결과 차이점 (0) | 2011.01.18 |
CUDA 메모리별 범위(scope) (0) | 2011.01.17 |
CUDA 내장변수 - built in variable (0) | 2011.01.16 |
CUDA Toolkit 3.2 - SDK와 Toolkit을 둘다 설치하세요! (2) | 2011.01.16 |