// Step 1 Interleaved Addressing with Divergent Branching
__global__ void reduce0(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for (unsigned int s = 1; s < blockDim.x; s *= 2)
{
if ((tid % (2*s)) == 0)
sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 2 Interleaved Addressing with Bank Collision
// if 문의 % 연산자 제거
__global__ void reduce1(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for (unsigned int s = 1; s < blockDim.x; s *= 2)
{
int index = 2 * s * tid;
if (index < blockDim.x)
sdata[index] += sdata[index + s];
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 3 Solve Bank Collision
// Data 전송 Bank를 인접하게 위치하도록 조정
__global__ void reduce3(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for (unsigned int s = blockDim.x/2; s > 0; s >>= 1)
{
if (tid < s)
sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 4 첫번째 요소 로딩과 동시에 합 계산
// Shared Memory에 최소 값을 초기화 할때 2개의 값을 넣으면서, Kernel 실행 수를 절반으로 줄임
// N개의 Data 생성시 N / 2 개의 Thread 생성
__global__ void reduce4(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
__syncthreads();
for (unsigned int s = blockDim.x/2; s > 0; s >>= 1)
{
if (tid < s)
sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 5 Last Warp Unrolling
// 마지막 Warp 부분의 rolling 제거
__device__ void warpReduce(volatile int* sdata, int tid)
{
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}
__global__ void reduce5(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
__syncthreads();
for (unsigned int s = blockDim.x/2; s > 32; s >>= 1)
{
if (tid < s)
sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid < 32) warpReduce(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 6 All Code Unrolling
// 모든 rolling 제거 단 block size는 512 이하로 한다는 전제
template <unsigned int blockSize>
__device__ void warpReduce2(volatile int* sdata, int tid)
{
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
template <unsigned int blockSize>
__global__ void reduce6(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
__syncthreads();
if (blockSize >= 512) {
if (tid < 256) sdata[tid] += sdata[tid + 256];
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128) sdata[tid] += sdata[tid + 128];
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64) sdata[tid] += sdata[tid + 64];
__syncthreads();
}
if (tid < 32) warpReduce2<blockSize>(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 7 Algorithm Cascading
// 호출하는 Thread 수를 N / log N 으로 생성하도록 하여 Thread 할당하는 시간을 줄임
template <unsigned int blockSize>
__global__ void reduce7(int *g_idata, int *g_odata, unsigned int n)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = 0;
while (i < n)
{
sdata[tid] += g_idata[i] + g_idata[i + blockSize];
i += blockDim.x;
}
__syncthreads();
if (blockSize >= 512) {
if (tid < 256) sdata[tid] += sdata[tid + 256];
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128) sdata[tid] += sdata[tid + 128];
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64) sdata[tid] += sdata[tid + 64];
__syncthreads();
}
if (tid < 32) warpReduce2<blockSize>(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__global__ void reduce0(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for (unsigned int s = 1; s < blockDim.x; s *= 2)
{
if ((tid % (2*s)) == 0)
sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 2 Interleaved Addressing with Bank Collision
// if 문의 % 연산자 제거
__global__ void reduce1(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for (unsigned int s = 1; s < blockDim.x; s *= 2)
{
int index = 2 * s * tid;
if (index < blockDim.x)
sdata[index] += sdata[index + s];
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 3 Solve Bank Collision
// Data 전송 Bank를 인접하게 위치하도록 조정
__global__ void reduce3(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i];
__syncthreads();
for (unsigned int s = blockDim.x/2; s > 0; s >>= 1)
{
if (tid < s)
sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 4 첫번째 요소 로딩과 동시에 합 계산
// Shared Memory에 최소 값을 초기화 할때 2개의 값을 넣으면서, Kernel 실행 수를 절반으로 줄임
// N개의 Data 생성시 N / 2 개의 Thread 생성
__global__ void reduce4(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
__syncthreads();
for (unsigned int s = blockDim.x/2; s > 0; s >>= 1)
{
if (tid < s)
sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 5 Last Warp Unrolling
// 마지막 Warp 부분의 rolling 제거
__device__ void warpReduce(volatile int* sdata, int tid)
{
sdata[tid] += sdata[tid + 32];
sdata[tid] += sdata[tid + 16];
sdata[tid] += sdata[tid + 8];
sdata[tid] += sdata[tid + 4];
sdata[tid] += sdata[tid + 2];
sdata[tid] += sdata[tid + 1];
}
__global__ void reduce5(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
__syncthreads();
for (unsigned int s = blockDim.x/2; s > 32; s >>= 1)
{
if (tid < s)
sdata[tid] += sdata[tid + s];
__syncthreads();
}
if (tid < 32) warpReduce(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 6 All Code Unrolling
// 모든 rolling 제거 단 block size는 512 이하로 한다는 전제
template <unsigned int blockSize>
__device__ void warpReduce2(volatile int* sdata, int tid)
{
if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
if (blockSize >= 8) sdata[tid] += sdata[tid + 4];
if (blockSize >= 4) sdata[tid] += sdata[tid + 2];
if (blockSize >= 2) sdata[tid] += sdata[tid + 1];
}
template <unsigned int blockSize>
__global__ void reduce6(int *g_idata, int *g_odata)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = g_idata[i] + g_idata[i+blockDim.x];
__syncthreads();
if (blockSize >= 512) {
if (tid < 256) sdata[tid] += sdata[tid + 256];
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128) sdata[tid] += sdata[tid + 128];
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64) sdata[tid] += sdata[tid + 64];
__syncthreads();
}
if (tid < 32) warpReduce2<blockSize>(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
// Step 7 Algorithm Cascading
// 호출하는 Thread 수를 N / log N 으로 생성하도록 하여 Thread 할당하는 시간을 줄임
template <unsigned int blockSize>
__global__ void reduce7(int *g_idata, int *g_odata, unsigned int n)
{
extern __shared__ int sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = 0;
while (i < n)
{
sdata[tid] += g_idata[i] + g_idata[i + blockSize];
i += blockDim.x;
}
__syncthreads();
if (blockSize >= 512) {
if (tid < 256) sdata[tid] += sdata[tid + 256];
__syncthreads();
}
if (blockSize >= 256) {
if (tid < 128) sdata[tid] += sdata[tid + 128];
__syncthreads();
}
if (blockSize >= 128) {
if (tid < 64) sdata[tid] += sdata[tid + 64];
__syncthreads();
}
if (tid < 32) warpReduce2<blockSize>(sdata, tid);
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
댓글 없음:
댓글 쓰기