本文整理汇总了C++中cudaMalloc函数的典型用法代码示例。如果您正苦于以下问题：C++ cudaMalloc函数的具体用法？C++ cudaMalloc怎么用？C++ cudaMalloc使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了cudaMalloc函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的C++代码示例。
示例1: cumem
cumem(int newsz){
sz = newsz;
cudaMalloc(&data, sz);
status = inuse;
next = NULL;
};
开发者ID:darlliu,项目名称:Yeast-Cuda-Gillespie-Simulator,代码行数:6,代码来源:cudahelper.hpp
示例2: sscanf_s
/*
 * Creates a GPU image from a raw pixel file.
 *
 * The file name must start with "<width>x<height>x<bpp>x<dimension>_"; the
 * four integers are stored in the width, height, bpp and dimension members.
 * A device buffer holding width * height * dimension samples (one byte per
 * sample for bpp == 8, two bytes for bpp == 16) is allocated in dev_pxl and
 * filled with 0xFF bytes. Whatever can be read from the file is then copied
 * over the start of that buffer; bytes that could not be read keep the 0xFF
 * fill.
 *
 * On failure a message is written to stderr and dev_pxl is left null.
 */
ImGpu::ImGpu(const char* filename)
{
    FILE *fp = 0;
    int t1 = 0, t2 = 0, t3 = 0, t4 = 0;
    cudaError_t cudaStatus;

    dev_pxl = 0;

    const int parsed = sscanf_s(filename, "%dx%dx%dx%d_", &t1, &t2, &t3, &t4);
    width = t1;
    height = t2;
    bpp = t3;
    dimension = t4;
    if (4 != parsed || t1 <= 0 || t2 <= 0 || t4 <= 0) {
        fprintf(stderr, "invalid image geometry in file name!");
        return;
    }

    /* Size of one sample in bytes; only 8 and 16 bits per pixel are handled. */
    size_t bytesPerSample = 0;
    if (8 == t3) {
        bytesPerSample = sizeof(char);
    } else if (16 == t3) {
        bytesPerSample = sizeof(unsigned short);
    } else {
        fprintf(stderr, "unsupported bits per pixel!");
        return;
    }

    /* Do the arithmetic in size_t so large images do not overflow int. */
    const size_t bytes = (size_t)t1 * (size_t)t2 * (size_t)t4 * bytesPerSample;

    /* Allocate memory for the pixels on the Gpu */
    cudaStatus = cudaMalloc((void**)&dev_pxl, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        dev_pxl = 0;
        return;
    }
    cudaStatus = cudaMemset(dev_pxl, 255, bytes);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemset failed!");
        cudaFree(dev_pxl);
        dev_pxl = 0;
        return;
    }

    /* Host staging buffer, sized in bytes and released with delete[]. */
    unsigned char *pxl = new unsigned char[bytes];
    size_t got = 0;

    /*
     * Open the file to read the pixels
     */
    fopen_s(&fp, filename, "rb"); /* open for reading */
    if (0 != fp) {
        /* Read the full image, i.e. two bytes per sample for 16 bpp. */
        got = std::fread(pxl, sizeof(unsigned char), bytes, fp);
        fclose(fp); /* close the file */
    }

    /* Upload only what was actually read so no uninitialised host memory
       reaches the device. */
    if (got > 0) {
        cudaStatus = cudaMemcpy(dev_pxl, pxl, got, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaMemcpy failed!");
            cudaFree(dev_pxl);
            dev_pxl = 0;
        }
    }

    delete[] pxl;
}
开发者ID:mattvend,项目名称:Gpu,代码行数:61,代码来源:ImGpu.cpp
示例3: mexFunction
/* Main */
void mexFunction( int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[]) {
if (nrhs != 7) {
mexErrMsgTxt("sgemm requires 7 input arguments");
} else if (nlhs != 1) {
mexErrMsgTxt("sgemm requires 1 output argument");
}
if ( !mxIsSingle(prhs[4]) ||
!mxIsSingle(prhs[5]) ||
!mxIsSingle(prhs[6])) {
mexErrMsgTxt("Input arrays must be single precision.");
}
int ta = (int) mxGetScalar(prhs[0]);
int tb = (int) mxGetScalar(prhs[1]);
float alpha = (float) mxGetScalar(prhs[2]);
float beta = (float) mxGetScalar(prhs[3]);
float *h_A = (float*) mxGetData(prhs[4]);
float *h_B = (float*) mxGetData(prhs[5]);
float *h_C = (float*) mxGetData(prhs[6]);
int M = mxGetM(prhs[4]); /* gets number of rows of A */
int K = mxGetN(prhs[4]); /* gets number of columns of A */
int L = mxGetM(prhs[5]); /* gets number of rows of B */
int N = mxGetN(prhs[5]); /* gets number of columns of B */
cublasOperation_t transa, transb;
int MM, KK, NN;
if (ta == 0) {
transa = CUBLAS_OP_N;
MM=M;
KK=K;
} else {
transa = CUBLAS_OP_T;
MM=K;
KK=M;
}
if (tb == 0) {
transb = CUBLAS_OP_N;
NN=N;
} else {
transb = CUBLAS_OP_T;
NN=L;
}
/* printf("transa=%c\n",transa);
printf("transb=%c\n",transb);
printf("alpha=%f\n",alpha);
printf("beta=%f\n",beta); */
/* Left hand side matrix set up */
mwSize dims0[2];
dims0[0]=MM;
dims0[1]=NN;
plhs[0] = mxCreateNumericArray(2,dims0,mxSINGLE_CLASS,mxREAL);
float *h_C_out = (float*) mxGetData(plhs[0]);
cublasStatus_t status;
cublasHandle_t handle;
status = cublasCreate(&handle);
if (status != CUBLAS_STATUS_SUCCESS) {
mexErrMsgTxt("!!!! CUBLAS initialization error\n");
}
float* d_A = 0;
float* d_B = 0;
float* d_C = 0;
/* Allocate device memory for the matrices */
if (cudaMalloc((void**)&d_A, M * K * sizeof(d_A[0])) != cudaSuccess) {
mexErrMsgTxt("!!!! device memory allocation error (allocate A)\n");
}
if (cudaMalloc((void**)&d_B, L * N * sizeof(d_B[0])) != cudaSuccess) {
mexErrMsgTxt("!!!! device memory allocation error (allocate B)\n");
}
if (cudaMalloc((void**)&d_C, MM * NN * sizeof(d_C[0])) != cudaSuccess) {
mexErrMsgTxt("!!!! device memory allocation error (allocate C)\n");
}
/* Initialize the device matrices with the host matrices */
status = cublasSetVector(M * K, sizeof(h_A[0]), h_A, 1, d_A, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
mexErrMsgTxt("!!!! device access error (write A)\n");
}
status = cublasSetVector(L * N, sizeof(h_B[0]), h_B, 1, d_B, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
mexErrMsgTxt("!!!! device access error (write B)\n");
}
status = cublasSetVector(MM * NN, sizeof(h_C[0]), h_C, 1, d_C, 1);
if (status != CUBLAS_STATUS_SUCCESS) {
mexErrMsgTxt("!!!! device access error (write C)\n");
}
/* Performs operation using cublas */
//.........这里部分代码省略.........
开发者ID:research2010,项目名称:sparse_linear_model,代码行数:101,代码来源:cudaSample.cpp
示例4: main
int main( int argc, char **argv )
{
uchar *h_Data;
uint *h_HistogramCPU, *h_HistogramGPU;
uchar *d_Data;
uint *d_Histogram;
uint hTimer;
int PassFailFlag = 1;
uint byteCount = 64 * 1048576;
uint uiSizeMult = 1;
cudaDeviceProp deviceProp;
deviceProp.major = 0;
deviceProp.minor = 0;
int dev;
shrQAStart(argc, argv);
// set logfile name and start logs
shrSetLogFileName ("histogram.txt");
//Use command-line specified CUDA device, otherwise use device with highest Gflops/s
if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
dev = cutilDeviceInit(argc, argv);
if (dev < 0) {
printf("No CUDA Capable Devices found, exiting...\n");
shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
}
} else {
cudaSetDevice( dev = cutGetMaxGflopsDeviceId() );
cutilSafeCall( cudaChooseDevice(&dev, &deviceProp) );
}
cutilSafeCall( cudaGetDeviceProperties(&deviceProp, dev) );
printf("CUDA device [%s] has %d Multi-Processors, Compute %d.%d\n",
deviceProp.name, deviceProp.multiProcessorCount, deviceProp.major, deviceProp.minor);
int version = deviceProp.major * 0x10 + deviceProp.minor;
if(version < 0x11)
{
printf("There is no device supporting a minimum of CUDA compute capability 1.1 for this SDK sample\n");
cutilDeviceReset();
shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
}
cutilCheckError(cutCreateTimer(&hTimer));
// Optional Command-line multiplier to increase size of array to histogram
if (shrGetCmdLineArgumentu(argc, (const char**)argv, "sizemult", &uiSizeMult))
{
uiSizeMult = CLAMP(uiSizeMult, 1, 10);
byteCount *= uiSizeMult;
}
shrLog("Initializing data...\n");
shrLog("...allocating CPU memory.\n");
h_Data = (uchar *)malloc(byteCount);
h_HistogramCPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
h_HistogramGPU = (uint *)malloc(HISTOGRAM256_BIN_COUNT * sizeof(uint));
shrLog("...generating input data\n");
srand(2009);
for(uint i = 0; i < byteCount; i++)
h_Data[i] = rand() % 256;
shrLog("...allocating GPU memory and copying input data\n\n");
cutilSafeCall( cudaMalloc((void **)&d_Data, byteCount ) );
cutilSafeCall( cudaMalloc((void **)&d_Histogram, HISTOGRAM256_BIN_COUNT * sizeof(uint) ) );
cutilSafeCall( cudaMemcpy(d_Data, h_Data, byteCount, cudaMemcpyHostToDevice) );
//-----
// 64 bin histogram
//------
{
shrLog("Starting up 64-bin histogram...\n\n");
initHistogram64();
shrLog("Running 64-bin GPU histogram for %u bytes (%u runs)...\n\n", byteCount, numRuns);
for(int iter = -1; iter < numRuns; iter++){
//iter == -1 -- warmup iteration
if(iter == 0){
cutilSafeCall( cutilDeviceSynchronize() );
cutilCheckError( cutResetTimer(hTimer) );
cutilCheckError( cutStartTimer(hTimer) );
}
histogram64(d_Histogram, d_Data, byteCount);
}
cutilSafeCall( cutilDeviceSynchronize() );
cutilCheckError( cutStopTimer(hTimer));
double dAvgSecs = 1.0e-3 * (double)cutGetTimerValue(hTimer) / (double)numRuns;
shrLog("histogram64() time (average) : %.5f sec, %.4f MB/sec\n\n", dAvgSecs, ((double)byteCount * 1.0e-6) / dAvgSecs);
shrLogEx(LOGBOTH | MASTER, 0, "histogram64, Throughput = %.4f MB/s, Time = %.5f s, Size = %u Bytes, NumDevsUsed = %u, Workgroup = %u\n",
(1.0e-6 * (double)byteCount / dAvgSecs), dAvgSecs, byteCount, 1, HISTOGRAM64_THREADBLOCK_SIZE);
shrLog("\nValidating GPU results...\n");
shrLog(" ...reading back GPU results\n");
cutilSafeCall( cudaMemcpy(h_HistogramGPU, d_Histogram, HISTOGRAM64_BIN_COUNT * sizeof(uint), cudaMemcpyDeviceToHost) );
//.........这里部分代码省略.........
开发者ID:shawndb,项目名称:demoTRISH,代码行数:101,代码来源:main.cpp
示例5: allocate
// Replaces whatever this object currently holds with a fresh device
// allocation sized for exactly one value_type.
void allocate() {
// destroy() runs first; presumably it releases the previous allocation so
// repeated calls do not leak -- confirm against destroy()'s definition.
this->destroy();
// Request sizeof(value_type) bytes of device memory into _dptr; the status
// returned by cudaMalloc is handed to check_error.
check_error( cudaMalloc((void**)&_dptr, sizeof(value_type)) );
}
开发者ID:MilesCranmer,项目名称:bifrost,代码行数:4,代码来源:value.hpp
示例6: cudaMalloc
void sparse_matrix_t::alloc_device()
{
cudaMalloc((void**)&devJc, (numCols+1) * sizeof(int));
cudaMalloc((void**)&devIr, numNonZeroElems * sizeof(int));
cudaMalloc((void**)&devRVals, numNonZeroElems * sizeof(float));
}
开发者ID:hksonngan,项目名称:Impatient-MRI,代码行数:6,代码来源:utils.cpp
示例7: runTestMax
bool
runTestMax( int argc, char** argv, ReduceType datatype)
{
int size = 1<<24; // number of elements to reduce
int maxThreads = 256; // number of threads per block
int whichKernel = 6;
int maxBlocks = 64;
bool cpuFinalReduction = false;
int cpuFinalThreshold = 1;
cutGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
cutGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
cutGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
cutGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
shrLog("METHOD: MAX\n");
shrLog("%d elements\n", size);
shrLog("%d threads (max)\n", maxThreads);
cpuFinalReduction = (cutCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == CUTTrue);
cutGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);
bool runShmoo = (cutCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == CUTTrue);
if (runShmoo)
{
shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
}
else
{
// create random input data on CPU
unsigned int bytes = size * sizeof(T);
T *h_idata = (T *) malloc(bytes);
for(int i=0; i<size; i++)
{
// Keep the numbers small so we don't get truncation error in the sum
if (datatype == REDUCE_INT)
h_idata[i] = (T)(rand() & 0xFF);
else
h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
}
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
if (numBlocks == 1) cpuFinalThreshold = 1;
// allocate mem for the result on host side
T* h_odata = (T*) malloc(numBlocks*sizeof(T));
shrLog("%d blocks\n\n", numBlocks);
// allocate device memory and data
T* d_idata = NULL;
T* d_odata = NULL;
cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, numBlocks*sizeof(T)) );
// copy data directly to device memory
cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice) );
// warm-up
maxreduce<T>(size, numThreads, numBlocks, whichKernel, d_idata, d_odata);
int testIterations = 100;
unsigned int timer = 0;
cutilCheckError( cutCreateTimer( &timer));
T gpu_result = 0;
gpu_result = benchmarkReduceMax<T>(size, numThreads, numBlocks, maxThreads, maxBlocks,
whichKernel, testIterations, cpuFinalReduction,
cpuFinalThreshold, timer,
h_odata, d_idata, d_odata);
double reduceTime = cutGetAverageTimerValue(timer) * 1e-3;
shrLogEx(LOGBOTH | MASTER, 0, "Reduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);
// compute reference solution
T cpu_result = maxreduceCPU<T>(h_idata, size);
double threshold = 1e-12;
double diff = 0;
if (datatype == REDUCE_INT)
{
shrLog("\nGPU result = %d\n", gpu_result);
shrLog("CPU result = %d\n\n", cpu_result);
}
else
{
shrLog("\nGPU result = %f\n", gpu_result);
shrLog("CPU result = %f\n\n", cpu_result);
//.........这里部分代码省略.........
开发者ID:szabodabo,项目名称:CUDA-MPI-Reductions,代码行数:101,代码来源:reduction.cpp
示例8: LOG
void iB_FFTShift::FFTShift_2D_Float(int size_X, int size_Y, Sheet* xlSheet, int nLoop)
{
LOG();
INFO("2D FFT Shift Float - CPU " + ITS(size_X) + "x" + ITS(size_Y));
/**********************************************************
* Float Case
**********************************************************/
if (xlSheet)
{
for (int iLoop = 0; iLoop < nLoop; iLoop++)
{
// Headers
xlSheet->writeStr(1, ((iLoop * 4) + 0), "I-CPU");
xlSheet->writeStr(1, ((iLoop * 4) + 1), "O-CPU");
xlSheet->writeStr(1, ((iLoop * 4) + 2), "I-GPU");
xlSheet->writeStr(1, ((iLoop * 4) + 3), "O-GPU");
// Allocation: 2D, Flat, Device
arr_2D_float = MEM_ALLOC_2D_FLOAT(size_X, size_Y);
arr_2D_flat_float = MEM_ALLOC_1D_FLOAT(size_X * size_Y);
int devMem = size_X * size_Y * sizeof(float);
cudaMalloc((void**)(&dev_arr_2D_flat_float), devMem);
// Filling arrays: 2D, Flat
Array::fillArray_2D_float(arr_2D_float, size_X, size_Y, 1);
Array::fillArray_2D_flat_float(arr_2D_flat_float, size_X, size_Y, 1);
// Printing input
ctr = 0;
for (int i = 0; i < size_X; i++)
for (int j = 0; j < size_Y; j++)
xlSheet->writeNum((ctr++) + 2, iLoop * 4, arr_2D_float[i][j]);
// FFT shift operation - CPU
arr_2D_float = FFT::FFT_Shift_2D_float(arr_2D_float, size_X, size_Y);
// Printing CPU output
ctr = 0;
for (int i = 0; i < size_X; i++)
for (int j = 0; j < size_Y; j++)
xlSheet->writeNum((ctr++) + 2, ((iLoop * 4 ) + 1), arr_2D_float[i][j]);
// Printing GPU input
ctr = 0;
for (int i = 0; i < size_X; i++)
for (int j = 0; j < size_Y; j++)
{
xlSheet->writeNum(ctr + 2, ((iLoop * 4 ) + 2), arr_2D_flat_float[ctr]);
ctr++;
}
// Uploading array
cuUtils::upload_2D_float(arr_2D_flat_float, dev_arr_2D_flat_float, size_X, size_Y);
// CUDA Gridding
dim3 cuBlock(512, 512, 1);
dim3 cuGrid(size_X / cuBlock.x, size_Y/ cuBlock.y, 1);
// FFT shift
cuFFTShift_2D( cuBlock, cuGrid, dev_arr_2D_flat_float, dev_arr_2D_flat_float, size_X);
// Downloading array
cuUtils::download_2D_float(arr_2D_flat_float, dev_arr_2D_flat_float, size_X, size_Y);
// Printing output
ctr = 0;
for (int i = 0; i < size_X; i++)
for (int j = 0; j < size_Y; j++)
{
xlSheet->writeNum((ctr) + 2, ((iLoop * 4 ) + 3), arr_2D_flat_float[ctr]);
ctr++;
}
// Dellocating memory
FREE_MEM_2D_FLOAT(arr_2D_float, size_X, size_Y);
}
}
else
{
INFO("No valid xlSheet was created, EXITTING ...");
EXIT(0);
}
}
开发者ID:marwan-abdellah,项目名称:Dummy,代码行数:87,代码来源:iB_FFTShift.cpp
示例9: cudaMallocWrapper
/*
 * Allocator callback that forwards straight to cudaMalloc.
 *
 * ctx and stream belong to the callback signature but are not used here.
 * Returns the status reported by cudaMalloc; the allocation is stored
 * through devPtr.
 */
static cudaError_t cudaMallocWrapper(void* ctx, void** devPtr, size_t size, cudaStream_t stream)
{
  cudaError_t result;

  (void)ctx;
  (void)stream;

  result = cudaMalloc(devPtr, size);
  return result;
}
开发者ID:ibcn-cloudlet,项目名称:cutorch,代码行数:4,代码来源:THCGeneral.c
示例10: gpujpeg_encoder_create
/** Documented at declaration */
struct gpujpeg_encoder*
gpujpeg_encoder_create(struct gpujpeg_parameters* param, struct gpujpeg_image_parameters* param_image)
{
assert(param_image->comp_count == 1 || param_image->comp_count == 3);
assert(param_image->comp_count <= GPUJPEG_MAX_COMPONENT_COUNT);
assert(param->quality >= 0 && param->quality <= 100);
assert(param->restart_interval >= 0);
assert(param->interleaved == 0 || param->interleaved == 1);
struct gpujpeg_encoder* encoder = (struct gpujpeg_encoder*) malloc(sizeof(struct gpujpeg_encoder));
if ( encoder == NULL )
return NULL;
// Get coder
struct gpujpeg_coder* coder = &encoder->coder;
// Set parameters
memset(encoder, 0, sizeof(struct gpujpeg_encoder));
coder->param_image = *param_image;
coder->param = *param;
int result = 1;
// Create writer
encoder->writer = gpujpeg_writer_create(encoder);
if ( encoder->writer == NULL )
result = 0;
// Initialize coder
if ( gpujpeg_coder_init(coder) != 0 )
result = 0;
// Init preprocessor
if ( gpujpeg_preprocessor_encoder_init(&encoder->coder) != 0 ) {
fprintf(stderr, "Failed to init preprocessor!");
result = 0;
}
// Allocate quantization tables in device memory
for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
if ( cudaSuccess != cudaMalloc((void**)&encoder->table_quantization[comp_type].d_table, 64 * sizeof(uint16_t)) )
result = 0;
if ( cudaSuccess != cudaMalloc((void**)&encoder->table_quantization[comp_type].d_table_forward, 64 * sizeof(float)) )
result = 0;
}
gpujpeg_cuda_check_error("Encoder table allocation", return NULL);
// Init quantization tables for encoder
for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
if ( gpujpeg_table_quantization_encoder_init(&encoder->table_quantization[comp_type], (enum gpujpeg_component_type)comp_type, coder->param.quality) != 0 )
result = 0;
}
gpujpeg_cuda_check_error("Quantization init", return NULL);
// Init huffman tables for encoder
for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) {
for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) {
if ( gpujpeg_table_huffman_encoder_init(&encoder->table_huffman[comp_type][huff_type], (enum gpujpeg_component_type)comp_type, (enum gpujpeg_huffman_type)huff_type) != 0 )
result = 0;
}
}
gpujpeg_cuda_check_error("Encoder table init", return NULL);
// Init huffman encoder
if ( gpujpeg_huffman_gpu_encoder_init(encoder) != 0 )
result = 0;
if ( result == 0 ) {
gpujpeg_encoder_destroy(encoder);
return NULL;
}
// Timers
GPUJPEG_CUSTOM_TIMER_CREATE(encoder->def);
GPUJPEG_CUSTOM_TIMER_CREATE(encoder->in_gpu);
return encoder;
}
开发者ID:VideoInsight,项目名称:TranscodeModules,代码行数:79,代码来源:gpujpeg_encoder.cpp
示例11: GPU_ENTRY
/*
 * Allocates and initialises the working memory of the rk4 solver.
 *
 * GPU target: derives the kernel launch geometry from the shared-memory need
 * per thread, stores it in props->gpu, allocates the k1..k4 and temp vectors
 * (statesize * num_models elements each) on the device and returns a DEVICE
 * pointer to an rk4_mem describing them.
 *
 * Other targets: allocates the same structure with malloc and returns a host
 * pointer, or NULL if the rk4_mem itself cannot be allocated.
 */
rk4_mem *SOLVER(rk4, init, TARGET, SIMENGINE_STORAGE, solver_props *props) {
#if defined TARGET_GPU
  GPU_ENTRY(init, SIMENGINE_STORAGE);
  // Temporary CPU copies of GPU datastructures
  rk4_mem tmem;
  // GPU datastructures
  rk4_mem *dmem;
  // Computes GPU kernel geometry
  size_t shmem_per_thread, total_shmem = 1<<14;
  int warp_size = 1<<5;
  uint threads_per_block;
  uint num_gpu_threads;
  uint num_gpu_blocks;
  // shared space for model states and solver overhead
  shmem_per_thread = sizeof(CDATAFORMAT) * props->statesize * 6; // 6 = magic for rk4
  // shared space for a vector of time
  shmem_per_thread += sizeof(CDATAFORMAT);
  // shared space for a vector of `running' flags
  shmem_per_thread += sizeof(int);
  threads_per_block = total_shmem / shmem_per_thread;
  if (threads_per_block >= (uint)warp_size) {
    // Round down to a whole number of warps.
    threads_per_block = warp_size * (threads_per_block / warp_size);
  } else if (threads_per_block == 0) {
    // Not even one thread fits into total_shmem. Keep the value non-zero so
    // the block-count division below is defined; the launch itself may
    // still be rejected for requesting too much shared memory.
    threads_per_block = 1;
  }
  // Otherwise fewer than one warp fits: keep the unrounded count. Rounding
  // down would give 0 and a division by zero below.
  num_gpu_threads = threads_per_block < props->num_models ? threads_per_block : props->num_models;
  num_gpu_blocks = (props->num_models + threads_per_block - 1) / threads_per_block;
  props->gpu.blockx = num_gpu_threads;
  props->gpu.blocky = 1;
  props->gpu.blockz = 1;
  props->gpu.gridx = num_gpu_blocks;
  props->gpu.gridy = 1;
  props->gpu.gridz = 1;
  props->gpu.shmem_per_block = shmem_per_thread * num_gpu_threads;
  // Allocate GPU space for mem and pointer fields of mem (other than props)
  cutilSafeCall(cudaMalloc((void**)&dmem, sizeof(rk4_mem)));
  tmem.props = GPU_ENTRY(init_props, SIMENGINE_STORAGE, props);
  cutilSafeCall(cudaMalloc((void**)&tmem.k1, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  cutilSafeCall(cudaMalloc((void**)&tmem.k2, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  cutilSafeCall(cudaMalloc((void**)&tmem.k3, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  cutilSafeCall(cudaMalloc((void**)&tmem.k4, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  cutilSafeCall(cudaMalloc((void**)&tmem.temp, props->statesize*props->num_models*sizeof(CDATAFORMAT)));
  // Copy mem structure to GPU
  cutilSafeCall(cudaMemcpy(dmem, &tmem, sizeof(rk4_mem), cudaMemcpyHostToDevice));
  return dmem;
#else // Used for CPU and OPENMP targets
  rk4_mem *mem = (rk4_mem*)malloc(sizeof(rk4_mem));
  // Without the structure itself nothing below can be stored.
  if (!mem) return NULL;
  mem->props = props;
  mem->k1 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->k2 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->k3 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->k4 = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  mem->temp = (CDATAFORMAT*)malloc(props->statesize*props->num_models*sizeof(CDATAFORMAT));
  return mem;
#endif
}
开发者ID:joshuaecook,项目名称:simengine,代码行数:69,代码来源:rk4.c
示例12: CudaThrowsCall
float WFIRFilterCuda::cudaFilter( WLEMData::ScalarT* const output, const WLEMData::ScalarT* const input,
const WLEMData::ScalarT* const previous, size_t channels, size_t samples, const WLEMData::ScalarT* const coeffs,
size_t coeffSize )
{
CuScalarT *dev_in = NULL;
size_t pitchIn;
CuScalarT *dev_prev = NULL;
size_t pitchPrev;
CuScalarT *dev_out = NULL;
size_t pitchOut;
CuScalarT *dev_co = NULL;
try
{
CudaThrowsCall( cudaMallocPitch( ( void** )&dev_in, &pitchIn, samples * sizeof( CuScalarT ), channels ) );
CudaThrowsCall(
cudaMemcpy2D( dev_in, pitchIn, input, samples * sizeof( CuScalarT ), samples * sizeof( CuScalarT ),
channels, cudaMemcpyHostToDevice ) );
CudaThrowsCall( cudaMallocPitch( ( void** )&dev_prev, &pitchPrev, coeffSize * sizeof( CuScalarT ), channels ) );
CudaThrowsCall(
cudaMemcpy2D( dev_prev, pitchPrev, previous, coeffSize * sizeof( CuScalarT ),
coeffSize * sizeof( CuScalarT ), channels, cudaMemcpyHostToDevice ) );
CudaThrowsCall( cudaMallocPitch( ( void** )&dev_out, &pitchOut, samples * sizeof( CuScalarT ), channels ) );
CudaThrowsCall( cudaMalloc( ( void** )&dev_co, coeffSize * sizeof( CuScalarT ) ) );
CudaThrowsCall( cudaMemcpy( dev_co, coeffs, coeffSize * sizeof( CuScalarT ), cudaMemcpyHostToDevice ) );
}
catch( const WException& e )
{
wlog::error( CLASS ) << e.what();
if( dev_in )
{
CudaSafeCall( cudaFree( ( void* )dev_in ) );
}
if( dev_prev )
{
CudaSafeCall( cudaFree( ( void* )dev_prev ) );
}
if( dev_out )
{
CudaSafeCall( cudaFree( ( void* )dev_out ) );
}
if( dev_co )
{
CudaSafeCall( cudaFree( ( void* )dev_co ) );
}
throw WLBadAllocException( "Could not allocate CUDA memory!" );
}
size_t threadsPerBlock = 32;
size_t blocksPerGrid = ( samples + threadsPerBlock - 1 ) / threadsPerBlock;
size_t sharedMem = coeffSize * sizeof( CuScalarT );
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start, 0 );
cuFirFilter( blocksPerGrid, threadsPerBlock, sharedMem, dev_out, dev_in, dev_prev, channels, samples, dev_co, coeffSize,
pitchOut, pitchIn, pitchPrev );
cudaError_t kernelError = cudaGetLastError();
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
float elapsedTime;
cudaEventElapsedTime( &elapsedTime, start, stop );
cudaEventDestroy( start );
cudaEventDestroy( stop );
try
{
if( kernelError != cudaSuccess )
{
const std::string err( cudaGetErrorString( kernelError ) );
throw WException( "CUDA kernel failed: " + err );
}
CudaThrowsCall(
cudaMemcpy2D( output, samples * sizeof( CuScalarT ), dev_out, pitchOut, samples * sizeof( CuScalarT ),
channels, cudaMemcpyDeviceToHost ) );
}
catch( const WException& e )
{
wlog::error( CLASS ) << e.what();
elapsedTime = -1.0;
}
CudaSafeCall( cudaFree( ( void* )dev_in ) );
CudaSafeCall( cudaFree( ( void* )dev_prev ) );
CudaSafeCall( cudaFree( ( void* )dev_out ) );
CudaSafeCall( cudaFree( ( void* )dev_co ) );
if( elapsedTime > -1.0 )
{
return elapsedTime;
//.........这里部分代码省略.........
开发者ID:labp,项目名称:na-online_ow-toolbox,代码行数:101,代码来源:WFIRFilterCuda.cpp
示例13: checkCudaErrors
// Constructs a buffer pair holding `length` elements of Dtype: one buffer
// in host memory and one of the same byte size in device memory.
BaseData<Dtype>::BaseData(const int length)
{
// Remember the element count; both allocations below are sized from it.
length_ = length;
// Host buffer: pinned (page-locked) memory obtained through cudaHostAlloc
// with the default flags.
checkCudaErrors(cudaHostAlloc(&cpu_data_, sizeof(Dtype)*length_, cudaHostAllocDefault));
// Device buffer of sizeof(Dtype) * length_ bytes.
checkCudaErrors(cudaMalloc(&gpu_data_, sizeof(Dtype)*length_));
}
开发者ID:Haybla,项目名称:Latte,代码行数:6,代码来源:BaseData.cpp
示例14: ckm
void ckm( struct svm_problem *prob, struct svm_problem *pecm, float *gamma )
{
cublasStatus_t status;
double g_val = *gamma;
long int nfa;
int len_tv;
int ntv;
int i_v;
int i_el;
int i_r, i_c;
int trvei;
double *tv_sq;
double *v_f_g;
float *tr_ar;
float *tva, *vtm, *DP;
float *g_tva = 0, *g_vtm = 0, *g_DotProd = 0;
cudaError_t cudaStat;
cublasHandle_t handle;
status = cublasCreate(&handle);
len_tv = prob-> x[0].dim;
ntv = prob-> l;
nfa = len_tv * ntv;
tva = (float*) malloc ( len_tv * ntv* sizeof(float) );
vtm = (float*) malloc ( len_tv * sizeof(float) );
DP = (float*) malloc ( ntv * sizeof(float) );
tr_ar = (float*) malloc ( len_tv * ntv* sizeof(float) );
tv_sq = (double*) malloc ( ntv * sizeof(double) );
v_f_g = (double*) malloc ( ntv * sizeof(double) );
for ( i_r = 0; i_r < ntv ; i_r++ )
{
for ( i_c = 0; i_c < len_tv; i_c++ )
tva[i_r * len_tv + i_c] = (float)prob-> x[i_r].values[i_c];
}
cudaStat = cudaMalloc((void**)&g_tva, len_tv * ntv * sizeof(float));
if (cudaStat != cudaSuccess) {
free( tva );
free( vtm );
free( DP );
free( v_f_g );
free( tv_sq );
cudaFree( g_tva );
cublasDestroy( handle );
fprintf (stderr, "!!!! Device memory allocation error (A)\n");
getchar();
return;
}
cudaStat = cudaMalloc((void**)&g_vtm, len_tv * sizeof(float));
cudaStat = cudaMalloc((void**)&g_DotProd, ntv * sizeof(float));
for( i_r = 0; i_r < ntv; i_r++ )
for( i_c = 0; i_c < len_tv; i_c++ )
tr_ar[i_c * ntv + i_r] = tva[i_r * len_tv + i_c];
// Copy cpu vector to gpu vector
status = cublasSetVector( len_tv * ntv, sizeof(float), tr_ar, 1, g_tva, 1 );
free( tr_ar );
for( i_v = 0; i_v < ntv; i_v++ )
{
tv_sq[ i_v ] = 0;
for( i_el = 0; i_el < len_tv; i_el++ )
tv_sq[i_v] += pow( tva[i_v*len_tv + i_el], (float)2.0 );
}
for ( trvei = 0; trvei < ntv; trvei++ )
{
status = cublasSetVector( len_tv, sizeof(float), &tva[trvei * len_tv], 1, g_vtm, 1 );
status = cublasSgemv( handle, CUBLAS_OP_N, ntv, len_tv, &alpha, g_tva, ntv , g_vtm, 1, &beta, g_DotProd, 1 );
status = cublasGetVector( ntv, sizeof(float), g_DotProd, 1, DP, 1 );
for ( i_c = 0; i_c < ntv; i_c++ )
v_f_g[i_c] = exp( -g_val * (tv_sq[trvei] + tv_sq[i_c]-((double)2.0)* (double)DP[i_c] ));
//.........这里部分代码省略.........
开发者ID:Kufieta,项目名称:CUDA,代码行数:101,代码来源:kernel_matrix_calculation.c
示例15: runAutoTest
// Automated, non-interactive test run.
//
// Selects a CUDA device, loads the default image, runs the Sobel filter in
// the mode given by the "mode" command-line argument (0 = image, 1 = texture,
// 2 = shared), reads the result back, saves it as a PGM file and compares it
// with the reference file given by the "file" argument.
//
// This function does not return: it terminates the process with
// EXIT_SUCCESS when the comparison passes and EXIT_FAILURE otherwise, also
// when the mode is invalid, no reference file is given or host memory
// cannot be allocated.
void runAutoTest(int argc, char *argv[])
{
    printf("[%s] (automated testing w/ readback)\n", sSDKsample);
    // Called for its effect of selecting the device; the id is not needed.
    findCudaDevice(argc, (const char **)argv);
    // Ensure that SM 2.0 or higher device is available before running
    checkDeviceMeetComputeSpec(argc, argv);
    loadDefaultImage(argv[0]);

    Pixel *d_result;
    checkCudaErrors(cudaMalloc((void **)&d_result, imWidth*imHeight*sizeof(Pixel)));

    char *ref_file = NULL;
    char dump_file[256];
    int mode = 0;
    mode = getCmdLineArgumentInt(argc, (const char **)argv, "mode");
    getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);

    // Without a reference file there is nothing to compare against; stop
    // before a null pointer is handed to the path lookup.
    if (ref_file == NULL)
    {
        printf("No reference file specified\n");
        checkCudaErrors(cudaFree(d_result));
        exit(EXIT_FAILURE);
    }

    switch (mode)
    {
        case 0:
            g_SobelDisplayMode = SOBELDISPLAY_IMAGE;
            sprintf(dump_file, "lena_orig.pgm");
            break;
        case 1:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELTEX;
            sprintf(dump_file, "lena_tex.pgm");
            break;
        case 2:
            g_SobelDisplayMode = SOBELDISPLAY_SOBELSHARED;
            sprintf(dump_file, "lena_shared.pgm");
            break;
        default:
            printf("Invalid Filter Mode File\n");
            exit(EXIT_FAILURE);
    }

    printf("AutoTest: %s <%s>\n", sSDKsample, filterMode[g_SobelDisplayMode]);
    sobelFilter(d_result, imWidth, imHeight, g_SobelDisplayMode, imageScale, blockOp, pointOp);
    checkCudaErrors(cudaDeviceSynchronize());

    unsigned char *h_result = (unsigned char *)malloc(imWidth*imHeight*sizeof(Pixel));
    if (h_result == NULL)
    {
        printf("Failed to allocate host memory for the result\n");
        checkCudaErrors(cudaFree(d_result));
        exit(EXIT_FAILURE);
    }
    checkCudaErrors(cudaMemcpy(h_result, d_result, imWidth*imHeight*sizeof(Pixel), cudaMemcpyDeviceToHost));
    sdkSavePGM(dump_file, h_result, imWidth, imHeight);

    // A reference file that cannot be located counts as a failed comparison
    // instead of being passed on as a null path.
    const char *ref_path = sdkFindFilePath(ref_file, argv[0]);
    if (ref_path == NULL ||
        !sdkComparePGM(dump_file, ref_path, MAX_EPSILON_ERROR, 0.15f, false))
    {
        g_TotalErrors++;
    }

    checkCudaErrors(cudaFree(d_result));
    free(h_result);

    if (g_TotalErrors != 0)
    {
        printf("Test failed!\n");
        exit(EXIT_FAILURE);
    }
    printf("Test passed!\n");
    exit(EXIT_SUCCESS);
}
开发者ID:Aahung,项目名称:CudaSample,代码行数:68,代码来源:FunctionPointers.cpp
示例16: main
int main(int argc, char* argv[]) {
int disp_size = 64;
const int bits = 8;
if (argc >= 2) {
disp_size = atoi(argv[1]);
}
// init zed cam
auto cap = new sl::zed::Camera(sl::zed::ZEDResolution_mode::VGA);
sl::zed::ERRCODE err = cap->init(sl::zed::MODE::PERFORMANCE, 0, true);
if (err != sl::zed::ERRCODE::SUCCESS) {
std::cout << sl::zed::errcode2str(err) << std::endl;
exit(EXIT_FAILURE);
}
int width = cap->getImageSize().width;
int height = cap->getImageSize().height;
sgm::StereoSGM ssgm(width, height, disp_size, 8, 16, sgm::EXECUTE_INOUT_CUDA2CUDA);
SGMDemo demo(width, height);
if (demo.init()) {
printf("fail to init SGM Demo\n");
std::exit(EXIT_FAILURE);
}
Renderer renderer(width, height);
uint16_t* d_output_buffer = NULL;
uint8_t* d_input_left = NULL;
uint8_t* d_input_right = NULL;
cudaMalloc((void**)&d_input_left, width * height);
cudaMalloc((void**)&d_input_right, width * height);
const NppiSize roi = { width, height };
cv::Mat h_input_left(height, width, CV_8UC1);
while (!demo.should_close()) {
cap->grab(sl::zed::SENSING_MODE::FULL, false, false);
sl::zed::Mat left_zm = cap->retrieveImage_gpu(sl::zed::SIDE::LEFT);
sl::zed::Mat right_zm = cap->retrieveImage_gpu(sl::zed::SIDE::RIGHT);
nppiRGBToGray_8u_AC4C1R(left_zm.data, width * 4, d_input_left, width, roi);
nppiRGBToGray_8u_AC4C1R(right_zm.data, width * 4, d_input_right, width, roi);
ssgm.execute(d_input_left, d_input_right, (void**)&d_output_buffer);
switch (demo.get_flag()) {
case 0:
cudaMemcpy(h_input_left.data, d_input_left, width * height, cudaMemcpyDeviceToHost);
renderer.render_input((uint8_t*)h_input_left.data);
break;
case 1:
renderer.render_disparity(d_output_buffer, disp_size);
break;
case 2:
renderer.render_disparity_color(d_output_buffer, disp_size);
break;
}
demo.swap_buffer();
}
cudaFree(d_input_left);
cudaFree(d_input_right);
delete cap;
}
开发者ID:PLUSToolkit,项目名称:OvrvisionPro,代码行数:72,代码来源:zed_demo.cpp
示例17: CUDA_SAFE_CALL
// Reserves device storage for the matrix: `elems` cufftComplex values are
// allocated into `device`, with the cudaMalloc status passed to
// CUDA_SAFE_CALL.
void matrix_t::alloc_device()
{
    CUDA_SAFE_CALL(cudaMalloc((void**)&device, elems * sizeof(cufftComplex)));
}
开发者ID:hksonngan,项目名称:Impatient-MRI,代码行数:6,代码来源:utils.cpp
示例18: prediction
// Process-model step evaluated on the GPU.
//
// Uploads the current particles and the matrix F, generates process noise
// and control input directly on the device with cuRAND, runs the kernel
// behind callFfunKernel and downloads the predicted particles.
// The host-side computation this replaces is
//     prediction = F * current + pNoiseSample + u
//
// current: particle matrix, one column per sample.
// Returns a matrix with the same dimensions as *current.
//
// NOTE(review): the noise generation below is hard-wired to 9 rows, so it
// presumably expects a 9-dimensional state -- confirm. None of the CUDA or
// cuRAND return codes are checked.
fmat ModelWPAMGPU::ffun(fmat *current)
{
    fmat prediction(current->n_rows,current->n_cols);
    // The host samples are drawn as before; only their element counts are
    // used below to size the device buffers.
    fmat pNoiseSample = pNoise.sample(current->n_cols);
    fmat u = U.sample(current->n_cols);

    const int stateDimension = current->n_rows;
    const int numberOfSamples = current->n_cols;

    float* lastState_dev;
    float* F_dev;
    float* U_dev;
    float* pNoise_dev;
    float* newState_dev;

    // Device buffers
    cudaMalloc( &lastState_dev, (size_t) current->n_elem * sizeof(float)) ;
    cudaMalloc( &F_dev, (size_t) F.n_elem * sizeof(float)) ;
    cudaMalloc( &U_dev, (size_t) u.n_elem * sizeof(float)) ;
    cudaMalloc( &pNoise_dev, (size_t) pNoiseSample.n_elem * sizeof(float)) ;
    cudaMalloc( &newState_dev, (size_t) prediction.n_elem * sizeof(float)) ;

    // Upload the particles and the transition matrix
    cudaMemcpy(lastState_dev,current->memptr(),(size_t) current->n_elem * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(F_dev,F.memptr(),(size_t) F.n_elem * sizeof(float), cudaMemcpyHostToDevice);

    // Process noise: zero mean, one standard deviation per block of
    // numberOfSamples values.
    const float pNoiseStdDev[9] = {
        50.0e-6f,  50.0e-6f,  50.0e-6f,
        10.0e-6f,  10.0e-6f,  10.0e-6f,
        100.0e-6f, 100.0e-6f, 100.0e-6f
    };
    for (int row = 0; row < 9; ++row)
    {
        curandGenerateNormal(gen, pNoise_dev + row*numberOfSamples, numberOfSamples, 0.0f, pNoiseStdDev[row]);
    }

    // Control input: mean and standard deviation come from U's batch entries
    U.batch.at(0);
    for (unsigned int i=0; i< 9 ;++i)
    {
        curandGenerateNormal(gen, U_dev+ i*numberOfSamples, numberOfSamples, U.batch.at(i)->a, U.batch.at(i)->b);
    }

    // Run the prediction on the device
    callFfunKernel(lastState_dev, F_dev, U_dev, pNoise_dev, stateDimension ,numberOfSamples,newState_dev);

    // Fetch the estimate
    cudaMemcpy(prediction.memptr(),newState_dev,current->n_elem * sizeof(float), cudaMemcpyDeviceToHost);

    // Release the device buffers
    cudaFree(lastState_dev);
    cudaFree(newState_dev);
    cudaFree(F_dev);
    cudaFree(U_dev);
    cudaFree(pNoise_dev);

    return prediction;
}
开发者ID:chingoduc,项目名称:parallel-bayesian-toolbox,代码行数:69,代码来源:model_wpam_gpu.cpp
示例19: shmoo
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
fprintf(stderr, "Shmoo wasn't implemented in this modified kernel!\n");
exit(1);
// create random input data on CPU
unsigned int bytes = maxN * sizeof(T);
T *h_idata = (T*) malloc(bytes);
for(int i = 0; i < maxN; i++) {
// Keep the numbers small so we don't get truncation error in the sum
if (datatype == REDUCE_INT)
h_idata[i] = (T)(rand() & 0xFF);
else
h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
}
int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);
// allocate mem for the result on host side
T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));
// allocate device memory and data
T* d_idata = NULL;
T* d_odata = NULL;
cutilSafeCallNoSync( cudaMalloc((void**) &d_idata, bytes) );
cutilSafeCallNoSync( cudaMalloc((void**) &d_odata, maxNumBlocks*sizeof(T)) );
// copy data directly to device memory
cutilSafeCallNoSync( cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice) );
cutilSafeCallNoSync( cudaMemcpy(d_odata, h_idata, maxNumBlocks*sizeof(T), cudaMemcpyHostToDevice) );
// warm-up
f
|
请发表评论