CUDA 학습을 위해 NVIDIA 그래픽 카드를 장착한 후 해당 GPU의 상세 스펙을 알고 싶었는데, NVIDIA 사이트에서는 찾을 수가 없었다.
(내가 영어가 약해서 잘 찾지 못하는 건가??? ㅠㅠ)
다행히 아래의 간단한 예제 프로그램으로 해당 정보를 확인할 수 있다.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstring>
#include <iostream>
#include <stdio.h>

using namespace std;
int main ( )
{
cudaDeviceProp prop ;
int count ;
cudaGetDeviceCount ( & count ) ;
printf ( "cudaGetDeviceCount : %d\n" , count ) ;
for ( int i = 0 ; i < count ; i + + )
{
cudaGetDeviceProperties ( & prop , i ) ;
printf ( "--- General Information for device %d ---\n" , i ) ;
printf ( "Name : %s\n" , prop . name ) ;
printf ( "Compute capability : %d.%d\n" , prop . major , prop . minor ) ;
printf ( "Clock rate: %d\n" , prop . clockRate ) ;
printf ( "Device copy overlap : " ) ;
if ( prop . deviceOverlap ) printf ( "Enabled\n" ) ; else printf ( "Disabled\n" ) ;
printf ( "Kernel execition timeout : " ) ;
if ( prop . kernelExecTimeoutEnabled ) printf ( "Enabled\n" ) ; else printf ( "Disabled\n" ) ;
printf ( "--- Memory Information ---\n" ) ;
cout < < "Global Memory : " < < prop . totalGlobalMem < < endl ;
cout < < "Constant Memory : " < < prop . totalConstMem < < endl ;
cout < < "Max mem pitch : " < < prop . memPitch < < endl ;
cout < < "Texture Alignment : " < < prop . textureAlignment < < endl ;
cout < < "--- MP Information ---" < < endl ;
cout < < "Multiprocessor count : " < < prop . multiProcessorCount < < endl ;
cout < < "Share mem per mp : " < < prop . sharedMemPerBlock < < endl ;
cout < < "Registers per mp : " < < prop . regsPerBlock < < endl ;
cout < < "Threads in warp : " < < prop . warpSize < < endl ;
cout < < "Max Threads per block : " < < prop . maxThreadsPerBlock < < endl ;
cout < < "Max thread dimensions : (" < < prop . maxThreadsDim [ 0 ] < < "," < < prop . maxThreadsDim [ 1 ] < < "," < < prop . maxThreadsDim [ 2 ] < < ")" < < endl ;
cout < < "Max grid dimensions : (" < < prop . maxGridSize [ 0 ] < < "," < < prop . maxGridSize [ 1 ] < < "," < < prop . maxGridSize [ 2 ] < < ")" < < endl ;
}
cudaDeviceProp prop1 ;
int dev ;
cudaGetDevice ( & dev ) ;
cout < < "ID of current CUDA device : " < < dev < < endl ;
memset ( & prop , 0 , sizeof ( cudaDeviceProp ) ) ;
prop . major = 1 ;
prop . minor = 3 ;
cudaChooseDevice ( & dev , & prop ) ;
cout < < "ID of CUDA device closest to revision 1.3 : " < < dev < < endl ;
cudaSetDevice ( dev ) ;
getchar ( ) ;
return 0 ;
}
#include "device_launch_parameters.h"
#include < s t d i o . h >
#include < i o s t r e a m >
using namespace std ;
int main ( )
{
cudaDeviceProp prop ;
int count ;
cudaGetDeviceCount ( & count ) ;
printf ( "cudaGetDeviceCount : %d\n" , count ) ;
for ( int i = 0 ; i < count ; i + + )
{
cudaGetDeviceProperties ( & prop , i ) ;
printf ( "--- General Information for device %d ---\n" , i ) ;
printf ( "Name : %s\n" , prop . name ) ;
printf ( "Compute capability : %d.%d\n" , prop . major , prop . minor ) ;
printf ( "Clock rate: %d\n" , prop . clockRate ) ;
printf ( "Device copy overlap : " ) ;
if ( prop . deviceOverlap ) printf ( "Enabled\n" ) ; else printf ( "Disabled\n" ) ;
printf ( "Kernel execition timeout : " ) ;
if ( prop . kernelExecTimeoutEnabled ) printf ( "Enabled\n" ) ; else printf ( "Disabled\n" ) ;
printf ( "--- Memory Information ---\n" ) ;
cout < < "Global Memory : " < < prop . totalGlobalMem < < endl ;
cout < < "Constant Memory : " < < prop . totalConstMem < < endl ;
cout < < "Max mem pitch : " < < prop . memPitch < < endl ;
cout < < "Texture Alignment : " < < prop . textureAlignment < < endl ;
cout < < "--- MP Information ---" < < endl ;
cout < < "Multiprocessor count : " < < prop . multiProcessorCount < < endl ;
cout < < "Share mem per mp : " < < prop . sharedMemPerBlock < < endl ;
cout < < "Registers per mp : " < < prop . regsPerBlock < < endl ;
cout < < "Threads in warp : " < < prop . warpSize < < endl ;
cout < < "Max Threads per block : " < < prop . maxThreadsPerBlock < < endl ;
cout < < "Max thread dimensions : (" < < prop . maxThreadsDim [ 0 ] < < "," < < prop . maxThreadsDim [ 1 ] < < "," < < prop . maxThreadsDim [ 2 ] < < ")" < < endl ;
cout < < "Max grid dimensions : (" < < prop . maxGridSize [ 0 ] < < "," < < prop . maxGridSize [ 1 ] < < "," < < prop . maxGridSize [ 2 ] < < ")" < < endl ;
}
cudaDeviceProp prop1 ;
int dev ;
cudaGetDevice ( & dev ) ;
cout < < "ID of current CUDA device : " < < dev < < endl ;
memset ( & prop , 0 , sizeof ( cudaDeviceProp ) ) ;
prop . major = 1 ;
prop . minor = 3 ;
cudaChooseDevice ( & dev , & prop ) ;
cout < < "ID of CUDA device closest to revision 1.3 : " < < dev < < endl ;
cudaSetDevice ( dev ) ;
getchar ( ) ;
return 0 ;
}
위의 코드로 .cu 파일을 만들어 빌드한 뒤 실행하면 화면에 CUDA 관련 상세 스펙이 출력된다.
cudaDeviceProp의 각 멤버(Property)의 의미는 아래와 같다.
/**
 * CUDA device properties
 *
 * Reference listing of the structure that cudaGetDeviceProperties() fills in
 * for one device. The trailing comment on each line (or on the line directly
 * below it) describes the member it follows.
 *
 * NOTE(review): presumably copied from the runtime headers of the CUDA 5.5
 * toolkit mentioned in the surrounding text; other toolkit versions may
 * declare a different member list. Confirm against the installed header.
 * This listing is for reading only: the real definition comes from
 * cuda_runtime.h.
 */
struct __device_builtin__ cudaDeviceProp
{
char name [ 256 ] ; /**< ASCII string identifying device */
size_t totalGlobalMem ; /**< Global memory available on device in bytes */
size_t sharedMemPerBlock ; /**< Shared memory available per block in bytes */
int regsPerBlock ; /**< 32-bit registers available per block */
int warpSize ; /**< Warp size in threads */
size_t memPitch ; /**< Maximum pitch in bytes allowed by memory copies */
int maxThreadsPerBlock ; /**< Maximum number of threads per block */
int maxThreadsDim [ 3 ] ; /**< Maximum size of each dimension of a block */
int maxGridSize [ 3 ] ; /**< Maximum size of each dimension of a grid */
int clockRate ; /**< Clock frequency in kilohertz */
size_t totalConstMem ; /**< Constant memory available on device in bytes */
int major ; /**< Major compute capability */
int minor ; /**< Minor compute capability */
size_t textureAlignment ; /**< Alignment requirement for textures */
size_t texturePitchAlignment ; /**< Pitch alignment requirement for texture references bound to pitched memory */
int deviceOverlap ;
/**< Device can concurrently copy memory and execute a kernel. Deprecated. Use asyncEngineCount instead. */
int multiProcessorCount ; /**< Number of multiprocessors on device */
int kernelExecTimeoutEnabled ; /**< Specifies whether there is a run time limit on kernels */
int integrated ; /**< Device is integrated as opposed to discrete */
int canMapHostMemory ; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
int computeMode ; /**< Compute mode (See ::cudaComputeMode) */
int maxTexture1D ; /**< Maximum 1D texture size */
int maxTexture1DMipmap ; /**< Maximum 1D mipmapped texture size */
int maxTexture1DLinear ; /**< Maximum size for 1D textures bound to linear memory */
int maxTexture2D [ 2 ] ; /**< Maximum 2D texture dimensions */
int maxTexture2DMipmap [ 2 ] ; /**< Maximum 2D mipmapped texture dimensions */
int maxTexture2DLinear [ 3 ] ; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
int maxTexture2DGather [ 2 ] ; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
int maxTexture3D [ 3 ] ; /**< Maximum 3D texture dimensions */
int maxTexture3DAlt [ 3 ] ; /**< Maximum alternate 3D texture dimensions */
int maxTextureCubemap ; /**< Maximum Cubemap texture dimensions */
int maxTexture1DLayered [ 2 ] ; /**< Maximum 1D layered texture dimensions */
int maxTexture2DLayered [ 3 ] ; /**< Maximum 2D layered texture dimensions */
int maxTextureCubemapLayered [ 2 ] ; /**< Maximum Cubemap layered texture dimensions */
int maxSurface1D ; /**< Maximum 1D surface size */
int maxSurface2D [ 2 ] ; /**< Maximum 2D surface dimensions */
int maxSurface3D [ 3 ] ; /**< Maximum 3D surface dimensions */
int maxSurface1DLayered [ 2 ] ; /**< Maximum 1D layered surface dimensions */
int maxSurface2DLayered [ 3 ] ; /**< Maximum 2D layered surface dimensions */
int maxSurfaceCubemap ; /**< Maximum Cubemap surface dimensions */
int maxSurfaceCubemapLayered [ 2 ] ; /**< Maximum Cubemap layered surface dimensions */
size_t surfaceAlignment ; /**< Alignment requirements for surfaces */
int concurrentKernels ; /**< Device can possibly execute multiple kernels concurrently */
int ECCEnabled ; /**< Device has ECC support enabled */
int pciBusID ; /**< PCI bus ID of the device */
int pciDeviceID ; /**< PCI device ID of the device */
int pciDomainID ; /**< PCI domain ID of the device */
int tccDriver ; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
int asyncEngineCount ; /**< Number of asynchronous engines */
int unifiedAddressing ; /**< Device shares a unified address space with the host */
int memoryClockRate ; /**< Peak memory clock frequency in kilohertz */
int memoryBusWidth ; /**< Global memory bus width in bits */
int l2CacheSize ; /**< Size of L2 cache in bytes */
int maxThreadsPerMultiProcessor ; /**< Maximum resident threads per multiprocessor */
int streamPrioritiesSupported ; /**< Device supports stream priorities */
} ;
/**
 * CUDA device properties
 */
/* Reference listing of the structure that cudaGetDeviceProperties() fills in;
 * the comment after each member describes that member. For reading only: the
 * real definition comes from cuda_runtime.h. */
struct __device_builtin__ cudaDeviceProp
{
char name [ 256 ] ; /**< ASCII string identifying device */
size_t totalGlobalMem ; /**< Global memory available on device in bytes */
size_t sharedMemPerBlock ; /**< Shared memory available per block in bytes */
int regsPerBlock ; /**< 32-bit registers available per block */
int warpSize ; /**< Warp size in threads */
size_t memPitch ; /**< Maximum pitch in bytes allowed by memory copies */
int maxThreadsPerBlock ; /**< Maximum number of threads per block */
int maxThreadsDim [ 3 ] ; /**< Maximum size of each dimension of a block */
int maxGridSize [ 3 ] ; /**< Maximum size of each dimension of a grid */
int clockRate ; /**< Clock frequency in kilohertz */
size_t totalConstMem ; /**< Constant memory available on device in bytes */
int major ; /**< Major compute capability */
int minor ; /**< Minor compute capability */
size_t textureAlignment ; /**< Alignment requirement for textures */
size_t texturePitchAlignment ; /**< Pitch alignment requirement for texture references bound to pitched memory */
int deviceOverlap ;
/**< Device can concurrently copy memory and execute a kernel. Deprecated. Use asyncEngineCount instead. */
int multiProcessorCount ; /**< Number of multiprocessors on device */
int kernelExecTimeoutEnabled ; /**< Specifies whether there is a run time limit on kernels */
int integrated ; /**< Device is integrated as opposed to discrete */
int canMapHostMemory ; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
int computeMode ; /**< Compute mode (See ::cudaComputeMode) */
int maxTexture1D ; /**< Maximum 1D texture size */
int maxTexture1DMipmap ; /**< Maximum 1D mipmapped texture size */
int maxTexture1DLinear ; /**< Maximum size for 1D textures bound to linear memory */
int maxTexture2D [ 2 ] ; /**< Maximum 2D texture dimensions */
int maxTexture2DMipmap [ 2 ] ; /**< Maximum 2D mipmapped texture dimensions */
int maxTexture2DLinear [ 3 ] ; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
int maxTexture2DGather [ 2 ] ; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
int maxTexture3D [ 3 ] ; /**< Maximum 3D texture dimensions */
int maxTexture3DAlt [ 3 ] ; /**< Maximum alternate 3D texture dimensions */
int maxTextureCubemap ; /**< Maximum Cubemap texture dimensions */
int maxTexture1DLayered [ 2 ] ; /**< Maximum 1D layered texture dimensions */
int maxTexture2DLayered [ 3 ] ; /**< Maximum 2D layered texture dimensions */
int maxTextureCubemapLayered [ 2 ] ; /**< Maximum Cubemap layered texture dimensions */
int maxSurface1D ; /**< Maximum 1D surface size */
int maxSurface2D [ 2 ] ; /**< Maximum 2D surface dimensions */
int maxSurface3D [ 3 ] ; /**< Maximum 3D surface dimensions */
int maxSurface1DLayered [ 2 ] ; /**< Maximum 1D layered surface dimensions */
int maxSurface2DLayered [ 3 ] ; /**< Maximum 2D layered surface dimensions */
int maxSurfaceCubemap ; /**< Maximum Cubemap surface dimensions */
int maxSurfaceCubemapLayered [ 2 ] ; /**< Maximum Cubemap layered surface dimensions */
size_t surfaceAlignment ; /**< Alignment requirements for surfaces */
int concurrentKernels ; /**< Device can possibly execute multiple kernels concurrently */
int ECCEnabled ; /**< Device has ECC support enabled */
int pciBusID ; /**< PCI bus ID of the device */
int pciDeviceID ; /**< PCI device ID of the device */
int pciDomainID ; /**< PCI domain ID of the device */
int tccDriver ; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
int asyncEngineCount ; /**< Number of asynchronous engines */
int unifiedAddressing ; /**< Device shares a unified address space with the host */
int memoryClockRate ; /**< Peak memory clock frequency in kilohertz */
int memoryBusWidth ; /**< Global memory bus width in bits */
int l2CacheSize ; /**< Size of L2 cache in bytes */
int maxThreadsPerMultiProcessor ; /**< Maximum resident threads per multiprocessor */
int streamPrioritiesSupported ; /**< Device supports stream priorities */
} ;
위의 방법 말고도 CUDA 설치 시 함께 제공되는 샘플 중 deviceQuery.exe 파일을 실행해도 같은 정보를 볼 수 있다. deviceQuery의 원본 소스도 같이 제공된다.
필자의 경우 deviceQuery.exe의 위치 : C:\ProgramData\NVIDIA Corporation\CUDA Samples\v5.5\Bin\win64\Release
예제 소스의 위치 : C:\ProgramData\NVIDIA Corporation\CUDA Samples\v5.5\1_Utilities\deviceQuery
GTX 770 기준으로 CUDA 상세 스펙은 아래와 같다.
devicequery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GTX 770"
CUDA Driver Version / Runtime Version 6.0 / 5.5
CUDA Capability Major/Minor version number: 3.0
Total amount of global memory: 2048 MBytes (2147483648 bytes)
( 8) Multiprocessors, (192) CUDA Cores/MP: 1536 CUDA Cores
GPU Clock rate: 1202 MHz (1.20 GHz)
Memory Clock rate: 3505 Mhz
Memory Bus Width: 256-bit
L2 Cache Size: 524288 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Device supports Unified Addressing (UVA): No
Device PCI Bus ID / PCI location ID: 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 6.0, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce GTX 770
Result = PASS
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "GeForce GTX 770"
CUDA Driver Version / Runtime Version 6.0 / 5.5
CUDA Capability Major/Minor version number: 3.0
Total amount of global memory: 2048 MBytes (2147483648 bytes)
( 8) Multiprocessors, (192) CUDA Cores/MP: 1536 CUDA Cores
GPU Clock rate: 1202 MHz (1.20 GHz)
Memory Clock rate: 3505 Mhz
Memory Bus Width: 256-bit
L2 Cache Size: 524288 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Device supports Unified Addressing (UVA): No
Device PCI Bus ID / PCI location ID: 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 6.0, CUDA Runtime Version = 5.5, NumDevs = 1, Device0 = GeForce GTX 770
Result = PASS
GT 540M의 Spec.
댓글 없음:
댓글 쓰기