1. Host 함수에서 Device용 curandState Pointer를 생성하여 할당하기
2. Kernel 함수로 curandState 를 초기화 : 병렬화 할 크기만큼 Seed 생성 (1번만 하면 됨)
3. Kernel 함수로 curandState를 이용하여 random 생성
[-] Collapse
#include <cuda.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <cmath>
#include <iostream>
using namespace std;
/**
 * Initializes one cuRAND generator state per thread.
 *
 * Expects a 1D grid of 1D blocks. Every launched thread writes state[tid],
 * where tid is its flat global index; there is no bounds guard, so the
 * launch must not create more threads than `state` has elements.
 *
 * @param state  device array of generator states, one slot per thread
 * @param seed   seed value passed to curand_init by every thread
 */
__global__ void setup_kernel(curandState *state, int seed)
{
    // Flat global thread index for a 1D launch.
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Same seed for all threads; the thread index is used as the sequence
    // number and the offset is 0.
    curand_init(seed, tid, 0, state + tid);
}
/**
 * Draws one normally distributed float per thread.
 *
 * Expects a 1D grid of 1D blocks. Thread tid reads and updates state[tid]
 * and writes its sample to result[tid]; there is no bounds guard, so both
 * arrays must have at least as many elements as there are launched threads.
 * The states must already have been initialized (see curand_init).
 *
 * @param state   device array of generator states, advanced by this call
 * @param result  device array receiving one sample per thread
 */
__global__ void generate_kernel(curandState *state, float* result)
{
    // Flat global thread index for a 1D launch.
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Work on a thread-local copy of the generator state, then store the
    // advanced state back so the next launch continues the sequence.
    curandState local = state[tid];
    const float sample = curand_normal(&local);
    state[tid] = local;

    result[tid] = sample;
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Demo driver: seeds one generator state per thread once, then repeatedly
 * draws 1,024,000 normal samples on the device, copies them to the host and
 * prints their mean, standard deviation and the element at index 100.
 *
 * After each batch the program waits for a key press on stdin; end-of-file
 * ends the loop so that the buffers are actually released.
 *
 * @return 0 on normal exit, 1 if any CUDA call or kernel launch failed
 */
int main()
{
    // Total number of samples; one thread and one curandState per sample.
    const int count = 1024 * 1000;

    // The kernels have no bounds guard, so blocks * threadsPerBlock must be
    // exactly `count`. 4000 * 256 == 1,024,000 keeps the same thread indices
    // (and therefore the same random sequences) as a 1024 x 1000 launch while
    // using a block size that is a multiple of the warp size.
    const int threadsPerBlock = 256;
    const int blocks = count / threadsPerBlock;

    curandState* devStates = NULL;
    float* dev_result = NULL;

    bool ok = cudaOk(cudaMalloc((void**)&devStates, count * sizeof(curandState)),
                     "cudaMalloc(devStates)");
    ok = ok && cudaOk(cudaMalloc((void**)&dev_result, count * sizeof(float)),
                      "cudaMalloc(dev_result)");

    if (ok)
    {
        // Seed the generator states once; they are reused by every batch.
        int a = 0;
        setup_kernel<<<blocks, threadsPerBlock>>>(devStates, a);
        ok = cudaOk(cudaGetLastError(), "setup_kernel launch");
    }

    // Host buffer is allocated once and reused instead of once per batch.
    float* host_result = new float[count];

    while (ok)
    {
        generate_kernel<<<blocks, threadsPerBlock>>>(devStates, dev_result);
        ok = cudaOk(cudaGetLastError(), "generate_kernel launch");
        // The blocking copy also waits for the kernel and surfaces any
        // error raised while it was running.
        ok = ok && cudaOk(cudaMemcpy(host_result, dev_result, count * sizeof(float),
                                     cudaMemcpyDeviceToHost),
                          "cudaMemcpy(dev_result -> host_result)");
        if (!ok)
        {
            break;
        }

        // Accumulate in double: a float running sum over ~1e6 values loses
        // precision and makes the variance formula below unreliable.
        double sum = 0.0;
        double sumSq = 0.0;
        for (int i = 0; i < count; i++)
        {
            const double x = host_result[i];
            sum += x;
            sumSq += x * x;
        }

        const double avg = sum / count;
        const double variance = (sumSq - (sum * sum / count)) / (count - 1);
        // Take the square root so the printed value is a standard deviation,
        // not a variance; clamp tiny negative rounding results to zero.
        const double stDev = std::sqrt(variance > 0.0 ? variance : 0.0);

        std::cout << "Avg : " << avg << " stDev : " << stDev << " 100번째 숫자 : " << host_result[100] << std::endl;

        // Wait for a key press; stop on end-of-file so cleanup is reachable.
        if (getchar() == EOF)
        {
            break;
        }
    }

    delete [] host_result;
    cudaFree(devStates);
    cudaFree(dev_result);
    return ok ? 0 : 1;
}
#include <curand_kernel.h>
#include <stdio.h>
#include <iostream>
using namespace std;
/**
 * Seeds the per-thread cuRAND states.
 *
 * Intended for a 1D launch: each thread computes its flat global index and
 * initializes the state slot at that index. No bounds check is performed,
 * so `state` must hold at least gridDim.x * blockDim.x elements.
 *
 * @param state  device array of generator states to initialize
 * @param seed   seed shared by all threads
 */
__global__ void setup_kernel(curandState *state, int seed)
{
    const int globalIdx = blockDim.x * blockIdx.x + threadIdx.x;
    curandState *slot = &state[globalIdx];

    // Arguments: seed, sequence number (the thread index), offset 0, state.
    curand_init(seed, globalIdx, 0, slot);
}
/**
 * Produces one normally distributed sample per thread.
 *
 * Intended for a 1D launch: each thread advances the generator state at its
 * flat global index and stores the sample at the same index of `result`.
 * No bounds check is performed, so both arrays must hold at least
 * gridDim.x * blockDim.x elements, and the states must be initialized.
 *
 * @param state   device array of generator states, updated in place
 * @param result  device array receiving the samples
 */
__global__ void generate_kernel(curandState *state, float* result)
{
    const int globalIdx = blockDim.x * blockIdx.x + threadIdx.x;

    // This thread's generator; curand_normal advances it in place.
    curandState *rng = state + globalIdx;
    float *out = result + globalIdx;

    *out = curand_normal(rng);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/**
 * Demo driver: seeds one generator state per thread once, then repeatedly
 * draws 1,024,000 normal samples on the device, copies them to the host and
 * prints their mean, standard deviation and the element at index 100.
 *
 * After each batch the program waits for a key press on stdin; end-of-file
 * ends the loop so that the buffers are actually released.
 *
 * @return 0 on normal exit, 1 if any CUDA call or kernel launch failed
 */
int main()
{
    // Total number of samples; one thread and one curandState per sample.
    const int count = 1024 * 1000;

    // The kernels have no bounds guard, so blocks * threadsPerBlock must be
    // exactly `count`. 4000 * 256 == 1,024,000 keeps the same thread indices
    // (and therefore the same random sequences) as a 1024 x 1000 launch while
    // using a block size that is a multiple of the warp size.
    const int threadsPerBlock = 256;
    const int blocks = count / threadsPerBlock;

    curandState* devStates = NULL;
    float* dev_result = NULL;

    bool ok = cudaOk(cudaMalloc((void**)&devStates, count * sizeof(curandState)),
                     "cudaMalloc(devStates)");
    ok = ok && cudaOk(cudaMalloc((void**)&dev_result, count * sizeof(float)),
                      "cudaMalloc(dev_result)");

    if (ok)
    {
        // Seed the generator states once; they are reused by every batch.
        int a = 0;
        setup_kernel<<<blocks, threadsPerBlock>>>(devStates, a);
        ok = cudaOk(cudaGetLastError(), "setup_kernel launch");
    }

    // Host buffer is allocated once and reused instead of once per batch.
    float* host_result = new float[count];

    while (ok)
    {
        generate_kernel<<<blocks, threadsPerBlock>>>(devStates, dev_result);
        ok = cudaOk(cudaGetLastError(), "generate_kernel launch");
        // The blocking copy also waits for the kernel and surfaces any
        // error raised while it was running.
        ok = ok && cudaOk(cudaMemcpy(host_result, dev_result, count * sizeof(float),
                                     cudaMemcpyDeviceToHost),
                          "cudaMemcpy(dev_result -> host_result)");
        if (!ok)
        {
            break;
        }

        // Accumulate in double: a float running sum over ~1e6 values loses
        // precision and makes the variance formula below unreliable.
        double sum = 0.0;
        double sumSq = 0.0;
        for (int i = 0; i < count; i++)
        {
            const double x = host_result[i];
            sum += x;
            sumSq += x * x;
        }

        const double avg = sum / count;
        const double variance = (sumSq - (sum * sum / count)) / (count - 1);
        // Take the square root so the printed value is a standard deviation,
        // not a variance; clamp tiny negative rounding results to zero.
        const double stDev = std::sqrt(variance > 0.0 ? variance : 0.0);

        std::cout << "Avg : " << avg << " stDev : " << stDev << " 100번째 숫자 : " << host_result[100] << std::endl;

        // Wait for a key press; stop on end-of-file so cleanup is reachable.
        if (getchar() == EOF)
        {
            break;
        }
    }

    delete [] host_result;
    cudaFree(devStates);
    cudaFree(dev_result);
    return ok ? 0 : 1;
}
- 대표적인 CUDA용 Random 함수 몇가지
curand_uniform(&state) : 0.0f 초과 1.0f 이하의 균등분포 실수 (0.0f 제외, 1.0f 포함)
curand_normal(&state) : 평균 0.0f, 표준편차 1.0f 인 정규분포를 따르는 실수 (값의 범위가 0.0f ~ 1.0f 로 제한되지 않음)
curand_log_normal(&state, mean, stddev) : Log-normally 분산 실수
curand_poisson(&state, lambda) : 포아송분포의 unsigned 정수값
댓글 없음:
댓글 쓰기