This article collects typical usage examples of the C++ cudaFree function. If you have been wondering what cudaFree does in C++, how to call it, or what real-world usage looks like, the curated code examples here may help.
The 20 cudaFree code examples shown below are sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better C++ code examples.
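Before the project examples, here is a minimal self-contained sketch (not taken from any of the projects below; the buffer size is an illustrative assumption) showing the usual cudaMalloc/cudaFree pairing with error checking:
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

int main()
{
    const size_t n = 1024;        /* illustrative element count */
    float *d_buf = NULL;
    /* Allocate device memory and check the returned status */
    cudaError_t err = cudaMalloc((void **)&d_buf, n * sizeof(float));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    /* ... launch kernels / copy data through d_buf ... */
    /* Release the allocation; cudaFree also returns a cudaError_t */
    err = cudaFree(d_buf);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaFree failed: %s\n", cudaGetErrorString(err));
        return EXIT_FAILURE;
    }
    d_buf = NULL;                 /* avoid accidental double free */
    return EXIT_SUCCESS;
}
Calling cudaFree on a null pointer is a no-op, which is why several of the destructors below can call it without a guard.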
Example 1: quantus_cuda_cleanup
void quantus_cuda_cleanup(quantus_comm<T> *comm)
{
cudaFree((T *) comm->matrix);
}
Developer: thomasluu, Project: quantus, Lines: 4, Source: quantus_cuda.cpp
Example 2: CUDA_CHECK
GPUParams<Dtype>::~GPUParams() {
#ifndef CPU_ONLY
CUDA_CHECK(cudaFree(data_));
CUDA_CHECK(cudaFree(diff_));
#endif
}
Developer: bbshocking, Project: caffe, Lines: 6, Source: parallel.cpp
Example 3: free
static void free(void *data) {
if (data) {
// std::cout << "free " << data << std::endl;
throw_(cudaFree(data));
}
}
Developer: asadchev, Project: asadchev, Lines: 6, Source: allocator.hpp
Example 4: cudaFree
TxVectorOptimizationDataCU::~TxVectorOptimizationDataCU() {
if (devicePtr) {
cudaFree(devicePtr);
}
}
Developer: NobodyInAmerica, Project: libTxHPCG, Lines: 5, Source: TxVectorOptimizationDataCU.cpp
Example 5: cudaFree
void CloudConstructor::freeGPUPoints() {
cudaFree(d_resultPoints);
d_resultPoints = NULL;
}
Developer: damonseeley, Project: electroland_repos, Lines: 4, Source: CloudConstructor.cpp
Example 6: main
int
main()
{
int i;
struct timeval start, stop;
FILE *fd;
char *key;
cudaSetDevice(0);
/* Allocate memory */
if ((key = (char *)malloc(40 * sizeof(char))) == NULL) {
printf("Malloc failed!\n");
exit(EXIT_FAILURE);
}
cudaMallocHost((void **) &batchKeys,
((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
cudaMallocHost((void **) &nKeys, BATCH_SIZE * sizeof(size_t));
cudaMallocHost((void **) &batchIndex, (BATCH_SIZE + 1) * sizeof(int));
cudaMallocHost((void **) &hashedKeys, BATCH_SIZE * sizeof(uint32_t));
cudaMalloc((void **) &d_keys,
((BATCH_SIZE + 1) * MAX_LEN_ALIGNED) * sizeof(char));
cudaMalloc((void **) &d_len, BATCH_SIZE * sizeof(size_t));
cudaMalloc((void **) &d_index, (BATCH_SIZE + 1) * sizeof(int));
cudaMalloc((void **) &d_res, BATCH_SIZE * sizeof(uint32_t));
/* Create 'BATCH_SIZE' number of random keys
* and add them to batch table
*/
batchNo = 0;
batchIndex[0] = 0;
for(i = 0; i < BATCH_SIZE; i++) {
gen_random(key, 30);
add_to_batch(key, 30);
}
/* Start Time (execution + memory) */
#ifdef EXEC_MEM
gettimeofday(&start, NULL);
#endif // EXEC_MEM
/* MemCpy Host -> Device */
cudaMemcpy(d_keys, batchKeys, (batchIndex[BATCH_SIZE-1] +
strlen(&batchKeys[batchIndex[BATCH_SIZE - 1]])) * sizeof(char),
cudaMemcpyHostToDevice);
cudaMemcpy(d_len, nKeys, BATCH_SIZE * sizeof(size_t),
cudaMemcpyHostToDevice);
cudaMemcpy(d_index, batchIndex, BATCH_SIZE * sizeof(int),
cudaMemcpyHostToDevice);
/* Start Time (execution only)*/
#ifndef EXEC_MEM
gettimeofday(&start, NULL);
#endif // EXEC_MEM
/* Call the kernel */
CUDAhash(d_keys, d_index, d_len, d_res);
/* Stop Time (execution only) */
#ifndef EXEC_MEM
cudaDeviceSynchronize();
gettimeofday(&stop, NULL);
#endif // EXEC_MEM
/* MemCpy Device -> Host */
cudaMemcpy(hashedKeys, d_res, BATCH_SIZE * sizeof(uint32_t),
cudaMemcpyDeviceToHost);
/* Stop Time (execution + memory) */
#ifdef EXEC_MEM
gettimeofday(&stop, NULL);
#endif // EXEC_MEM
#ifdef DEBUG
for(i = 0; i < BATCH_SIZE; i++) {
printf("%s\n", &batchKeys[batchIndex[i]]);
printf("%u\n", hashedKeys[i]);
}
#endif // DEBUG
/* Print Time */
fd = fopen("log.txt", "a+");
fprintf(fd, "%lu", ((stop.tv_sec * USECS) + stop.tv_usec ) -
((start.tv_sec * USECS) + start.tv_usec));
fprintf(fd, "\t%1.f\n", ((double)BATCH_SIZE /
((double)(((stop.tv_sec * USECS) + stop.tv_usec ) -
((start.tv_sec * USECS) + start.tv_usec)) / 1000000 )) / 1000);
fclose(fd);
#ifdef DEBUG
printf("Time: %lu \n", ((stop.tv_sec * USECS) + stop.tv_usec ) -
((start.tv_sec * USECS) + start.tv_usec));
#endif // DEBUG
/* Free memory (note: buffers allocated with cudaMallocHost should strictly be released with cudaFreeHost) */
cudaFree(batchKeys);
cudaFree(nKeys);
//......... part of the code omitted here .........
Developer: deyannis, Project: HY527, Lines: 101, Source: hash.c
Example 7: CUDA_SAFE_CALL
void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
{
try {
CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
} catch(...) {}
}
Developer: gmackey, Project: kokkos, Lines: 6, Source: Kokkos_CudaSpace.cpp
Example 8: calculateOnGPU
//......... part of the code omitted here .........
//move constants variables to constant cuda memory
setConstants(partSeqSize, partsNumber, overlapLength, seqLibLength,
queryLength, gapOpen, gapExtension, maxScore, partQuerySize,
U2::SmithWatermanAlgorithm::UP, U2::SmithWatermanAlgorithm::LEFT, U2::SmithWatermanAlgorithm::DIAG,
U2::SmithWatermanAlgorithm::STOP);
size_t sh_mem_size = sizeof(ScoreType) * (dimGrid.x + 1) * 3;
u2log.details(QString("SHARED MEM SIZE USED: %1 B").arg(sh_mem_size));
// start main loop
for (int i = 0; i < queryDevider; i++) {
calculateMatrix_wrap( dimBlock.x, dimGrid.x, g_seqLib,
g_queryProfile, g_HdataUp, g_HdataRec, g_HdataMax,
g_FdataUp, g_directionsUp, g_directionsRec,
g_directionsMax, i * partQuerySize, g_directionsMatrix, g_backtraceBegins);
cudaError hasErrors = cudaThreadSynchronize();
if (hasErrors != 0) {
u2log.trace(QString("CUDA error occurred, errorId: ") + QString::number(hasErrors));
}
//revert arrays
g_HdataTmp = g_HdataRec;
g_HdataRec = g_HdataUp;
g_HdataUp = g_HdataTmp;
g_HdataTmp = g_directionsRec;
g_directionsRec = g_directionsUp;
g_directionsUp = g_HdataTmp;
}
//Copy vectors on host and find actual results
cudaMemcpy(tempRow, g_HdataMax, sizeQQ, cudaMemcpyDeviceToHost);
cudaMemcpy(directionRow, g_directionsMax, sizeQQ, cudaMemcpyDeviceToHost);
if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
cudaMemcpy(globalMatrix, g_directionsMatrix, directionMatrixSize, cudaMemcpyDeviceToHost);
cudaMemcpy(backtraceBegins, g_backtraceBegins, backtraceBeginsSize, cudaMemcpyDeviceToHost);
}
QList<resType> pas;
resType res;
for (int j = 0; j < (sizeRow); j++) {
if (tempRow[j] >= maxScore) {
res.refSubseq.startPos = directionRow[j];
res.refSubseq.length = j - res.refSubseq.startPos + 1 - (j) / (partSeqSize + 1) * overlapLength - (j) / (partSeqSize + 1);
res.score = tempRow[j];
if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
qint32 pairAlignOffset = 0;
qint32 row = backtraceBegins[2 * j];
qint32 column = backtraceBegins[2 * j + 1];
while(U2::SmithWatermanAlgorithm::STOP != globalMatrix[seqLibLength * row + column]) {
if(U2::SmithWatermanAlgorithm::DIAG == globalMatrix[seqLibLength * row + column]) {
res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::DIAG;
row--;
column--;
} else if(U2::SmithWatermanAlgorithm::LEFT == globalMatrix[seqLibLength * row + column]) {
res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::UP;
column--;
} else if(U2::SmithWatermanAlgorithm::UP == globalMatrix[seqLibLength * row + column]) {
res.pairAlign[pairAlignOffset++] = U2::SmithWatermanAlgorithm::LEFT;
row--;
}
if(0 >= row || 0 >= column) {
break;
}
}
res.patternSubseq.startPos = row;
res.patternSubseq.length = backtraceBegins[2 * j] - row + 1;
}
pas.append(res);
}
}
//deallocation memory
cudaFree(g_seqLib);
cudaFree(g_queryProfile);
cudaFree(g_HdataMax);
cudaFree(g_HdataUp);
cudaFree(g_HdataRec);
cudaFree(g_FdataUp);
cudaFree(g_directionsUp);
cudaFree(g_directionsMax);
cudaFree(g_directionsRec);
if(U2::SmithWatermanSettings::MULTIPLE_ALIGNMENT == resultView) {
cudaFree(g_directionsMatrix);
cudaFree(g_backtraceBegins);
}
delete[] tempRow;
delete[] directionRow;
delete[] zerroArr;
delete[] globalMatrix;
delete[] backtraceBegins;
return pas;
}
Developer: ugeneunipro, Project: ugene, Lines: 101, Source: sw_cuda_cpp.cpp
Example 9:
~curandStateManager()
{
//if(_state != NULL) memFree((char*)_state);
if(_state != NULL) CUDA_CHECK(cudaFree(_state));
}
Developer: hxiaox, Project: arrayfire, Lines: 5, Source: random.hpp
Example 10: sci_gpuLU
//......... part of the code omitted here .........
default : throw "First option argument must be 0 or 1 or 2.";
}
switch((int)option[1])
{
case 0 : // Don't keep the data input on Device.
{
if(inputType_A == sci_matrix)
{
status = cublasFree(d_A);
if (status != CUBLAS_STATUS_SUCCESS) throw status;
d_A = NULL;
}
break;
}
case 1 : // Keep data of the first argument on Device and return the Device pointer.
{
if(inputType_A == sci_matrix)
{
gpuMat_CUDA* dptr;
gpuMat_CUDA tmp={getCudaContext()->genMatrix<double>(getCudaQueue(),rows_A*cols_A),rows_A,cols_A};
dptr=new gpuMat_CUDA(tmp);
dptr->useCuda = true;
dptr->ptr->set_ptr((double*)d_A);
if(bComplex_A)
dptr->complex=TRUE;
else
dptr->complex=FALSE;
sciErr = createPointer(pvApiCtx,Rhs+posOutput, (void*)dptr);
if(sciErr.iErr) throw sciErr;
LhsVar(posOutput)=Rhs+posOutput;
}
else
throw "The first input argument is already a GPU variable.";
posOutput++;
break;
}
default : throw "Second option argument must be 0 or 1.";
}
// Shutdown
status = cublasShutdown();
if (status != CUBLAS_STATUS_SUCCESS) throw status;
}
#endif
#ifdef WITH_OPENCL
if (!useCuda())
{
throw "not implemented with OpenCL.";
}
#endif
if(Rhs == 1)
{
free(option);
option = NULL;
}
if(posOutput < Lhs+1)
throw "Too many output arguments.";
if(posOutput > Lhs+1)
throw "Too few output arguments.";
PutLhsVar();
return 0;
}
catch(const char* str)
{
Scierror(999,"%s\n",str);
}
catch(SciErr E)
{
printError(&E, 0);
}
#ifdef WITH_CUDA
catch(cudaError_t cudaE)
{
GpuError::treat_error<CUDAmode>((CUDAmode::Status)cudaE);
}
catch(cublasStatus CublasE)
{
GpuError::treat_error<CUDAmode>((CUDAmode::Status)CublasE,1);
}
if (useCuda())
{
if(inputType_A == 1 && d_A != NULL) cudaFree(d_A);
}
#endif
#ifdef WITH_OPENCL
if (!useCuda())
{
Scierror(999,"not implemented with OpenCL.\n");
}
#endif
if(Rhs == 1 && option != NULL) free(option);
return EXIT_FAILURE;
}
Developer: dawuweijun, Project: scigpgpu, Lines: 101, Source: sci_gpuLU.cpp
Example 11: main
//......... part of the code omitted here .........
{
fprintf(stderr, "!!!! device access error (write C)\n");
return EXIT_FAILURE;
}
/* Performs operation using plain C code */
simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
h_C_ref = h_C;
/* Performs operation using cublas */
status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A, N, d_B, N, &beta, d_C, N);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! kernel execution error.\n");
return EXIT_FAILURE;
}
/* Allocate host memory for reading back the result from device memory */
h_C = (float *)malloc(n2 * sizeof(h_C[0]));
if (h_C == 0)
{
fprintf(stderr, "!!!! host memory allocation error (C)\n");
return EXIT_FAILURE;
}
/* Read the result back */
status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! device access error (read C)\n");
return EXIT_FAILURE;
}
/* Check result against reference */
error_norm = 0;
ref_norm = 0;
for (i = 0; i < n2; ++i)
{
diff = h_C_ref[i] - h_C[i];
error_norm += diff * diff;
ref_norm += h_C_ref[i] * h_C_ref[i];
}
error_norm = (float)sqrt((double)error_norm);
ref_norm = (float)sqrt((double)ref_norm);
if (fabs(ref_norm) < 1e-7)
{
fprintf(stderr, "!!!! reference norm is 0\n");
return EXIT_FAILURE;
}
/* Memory clean up */
free(h_A);
free(h_B);
free(h_C);
free(h_C_ref);
if (cudaFree(d_A) != cudaSuccess)
{
fprintf(stderr, "!!!! memory free error (A)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_B) != cudaSuccess)
{
fprintf(stderr, "!!!! memory free error (B)\n");
return EXIT_FAILURE;
}
if (cudaFree(d_C) != cudaSuccess)
{
fprintf(stderr, "!!!! memory free error (C)\n");
return EXIT_FAILURE;
}
/* Shutdown */
status = cublasDestroy(handle);
if (status != CUBLAS_STATUS_SUCCESS)
{
fprintf(stderr, "!!!! shutdown error (A)\n");
return EXIT_FAILURE;
}
if (error_norm / ref_norm < 1e-6f)
{
printf("simpleCUBLAS test passed.\n");
exit(EXIT_SUCCESS);
}
else
{
printf("simpleCUBLAS test failed.\n");
exit(EXIT_FAILURE);
}
}
Developer: intersense, Project: ox-cuda, Lines: 101, Source: simpleCUBLAS.cpp
Example 12: main
//......... part of the code omitted here .........
int sem_status = sem_wait(sem1);
if (sem_status == -1)
{
fprintf(stderr, "Cannot wait on semaphore #1 by process %d, errno = %d\n",
pid, errno);
return errno;
}
sem_status = sem_post(sem2);
if (sem_status == -1)
{
fprintf(stderr, "Cannot post on semaphore #2 by process %d, errno = %d\n",
pid, errno);
return errno;
}
}
// At this point two processes are synchronized.
config.step++;
// Reassign porcesses' input data segments to show some
// possible manipulation on shared memory.
// Here we perform cyclic shift of data pointers.
config.idevice++;
config.idevice %= ndevices + 1;
config.inout_cpu = inout + config.idevice * np;
}
// Release device buffers.
if (worker)
{
cuda_status = cudaFree(config.in_dev);
if (cuda_status != cudaSuccess)
{
fprintf(stderr, "Cannot release input buffer by process %d, status = %d\n",
pid, cuda_status);
return cuda_status;
}
cuda_status = cudaFree(config.out_dev);
if (cuda_status != cudaSuccess)
{
fprintf(stderr, "Cannot release output buffer by process %d, status = %d\n",
pid, cuda_status);
return cuda_status;
}
}
else
{
free(config.in_dev);
free(config.out_dev);
}
printf("Device %d deinitialized by process %d\n", config.idevice, pid);
// On master process perform results check:
// compare each GPU result to CPU result.
if (master)
{
float* control = inout + np * ndevices;
for (int idevice = 0; idevice < ndevices; idevice++)
{
// Find the maximum abs difference.
int maxi = 0, maxj = 0;
float maxdiff = fabs(control[0] - (inout + idevice * np)[0]);
Developer: 7633, Project: msu-cuda-course, Lines: 67, Source: shmem_mmap_cuda.c
Example 13:
OsdCudaTable::~OsdCudaTable() {
if (_devicePtr) cudaFree(_devicePtr);
}
Developer: Len3d, Project: OpenSubdiv, Lines: 4, Source: cudaComputeContext.cpp
Example 14: main
int main(int argc, char *argv[])
{
// needed to work correctly with piped benchmarkrunner
setlinebuf(stdout);
setlinebuf(stdin);
int n_indices = 1;
int n_dimensions = 1;
char inBuf[200]; // ridiculously large input buffer.
bool isFirst = true;
do {
// Allocate memory for the arrays
int *h_indices = 0;
double *h_outputGPU = 0;
try
{
h_indices = new int [n_indices * n_dimensions];
h_outputGPU = new double [n_indices * n_dimensions];
}
catch (std::exception e)
{
std::cerr << "Caught exception: " << e.what() << std::endl;
std::cerr << "Unable to allocate CPU memory (try running with fewer vectors/dimensions)" << std::endl;
return -1;
}
int *d_indices;
double *d_output;
try
{
cudaError_t cudaResult;
cudaResult = cudaMalloc((void **)&d_indices, n_dimensions * n_indices * sizeof(int));
if (cudaResult != cudaSuccess)
{
throw std::runtime_error(cudaGetErrorString(cudaResult));
}
}
catch (std::runtime_error e)
{
std::cerr << "Caught exception: " << e.what() << std::endl;
std::cerr << "Unable to allocate GPU memory (try running with fewer vectors/dimensions)" << std::endl;
return -1;
}
// Initialize the indices (done on the host)
for(int i = 0; i < n_indices; i++) {
h_indices[i] = i;
}
// Copy the indices to the device
cudaMemcpy(d_indices, h_indices, n_dimensions * n_indices * sizeof(int), cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
// Execute the QRNG on the device
int n_vec;
sobol_nikola_unsimplified(n_indices, d_indices, n_indices, &d_output, &n_vec);
cudaDeviceSynchronize();
cudaMemcpy(h_outputGPU, d_output, n_indices * n_dimensions * sizeof(double), cudaMemcpyDeviceToHost);
// Cleanup and terminate
delete [] h_indices;
cudaFree(d_indices);
cudaFree(d_output);
if(!isFirst) {
printf("RESULT ");
for(int i = 0; i < std::min(n_indices,10); i++)
printf("%f ", h_outputGPU[i]);
printf("\n");
}
else {
printf("OK\n");
isFirst = false;
}
delete [] h_outputGPU;
fgets(inBuf, 200, stdin);
if (sscanf(inBuf, "%u", &n_indices) == 0)
{
// if input is not a number, it has to be "EXIT"
if (strncmp("EXIT",inBuf,4)==0)
{
printf("OK\n");
break;
}
else
{
printf("ERROR. Bad input: %s\n", inBuf);
//......... part of the code omitted here .........
Developer: HIPERFIT, Project: vectorprogramming, Lines: 101, Source: sobol.cpp
Example 15: gpuErrchk
PhysicsProcessor::~PhysicsProcessor(void)
{
gpuErrchk(cudaFree(d_V));
}
Developer: Aloalo, Project: RTRT, Lines: 4, Source: PhysicsProcessor.cpp
Example 16: exp2
void ControlCubeCache::_reSizeCache()
{
_nLevels = _nextnLevels;
_levelCube = _nextLevelCube;
_offset = _nextOffset;
_nextnLevels = 0;
_nextLevelCube = 0;
_dimCube = exp2(_nLevels - _levelCube) + 2 * CUBE_INC;
_sizeElement = pow(_dimCube, 3);
int dimV = exp2(_nLevels);
_minValue = coordinateToIndex(vmml::vector<3,int>(0,0,0), _levelCube, _nLevels);
_maxValue = coordinateToIndex(vmml::vector<3,int>(dimV-1,dimV-1,dimV-1), _levelCube, _nLevels);
int dc = exp2(_nLevels - _levelCube);
vmml::vector<3,int> mn = _cpuCache->getMinCoord();
vmml::vector<3,int> mx = _cpuCache->getMaxCoord();
_maxC = mx - mn;
if ((mx.x() - mn.x()) % dc != 0)
_maxC[0] += dc;
if ((mx.y() - mn.y()) % dc != 0)
_maxC[1] += dc;
if ((mx.z() - mn.z()) % dc != 0)
_maxC[2] += dc;
if (cudaSuccess != cudaSetDevice(_device))
{
std::cerr<<"Control Cube Cache, error setting device: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
throw;
}
if (_memory != 0)
if (cudaSuccess != cudaFree((void*)_memory))
{
std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
throw;
}
size_t total = 0;
size_t free = 0;
if (cudaSuccess != cudaMemGetInfo(&free, &total))
{
std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
throw;
}
float memorySize = (0.80f*free); // Get 80% of free memory
_maxNumCubes = memorySize/ (_sizeElement*sizeof(float));
if (_maxNumCubes == 0)
{
std::cerr<<"Control Cube Cache: Memory available is not enough "<<memorySize/1024/1024<<" MB"<<std::endl;
throw;
}
if (cudaSuccess != cudaMalloc((void**)&_memory, _maxNumCubes*_sizeElement*sizeof(float)))
{
std::cerr<<"Control Cube Cache, error resizing cache: "<<cudaGetErrorString(cudaGetLastError())<<std::endl;
throw;
}
_freeSlots = _maxNumCubes;
ControlElementCache::_reSizeCache();
}
Developer: carlosduelo, Project: eqMivtRefactor, Lines: 66, Source: controlCubeCache.cpp
Example 17: mpla_redistribute_vector_for_dgesv
void mpla_redistribute_vector_for_dgesv(struct mpla_vector* b_redist, struct mpla_vector* b, struct mpla_matrix* A, struct mpla_instance* instance)
{
// attention: this code does no correctness check for the input data
// b_redist->vec_row_count = b->vec_row_count;
//
// // allocating memory for process-wise vector information
// vector->proc_row_count = new int*[instance->proc_rows];
// vector->proc_row_offset = new int*[instance->proc_rows];
// for (int i=0; i<instance->proc_rows; i++)
// {
// b_redist->proc_row_count[i] = new int[instance->proc_cols];
// b_redist->proc_row_offset[i] = new int[instance->proc_cols];
// }
//
// // set sizes of
// for (int i=0; i<instance->proc_rows; i++)
// {
// for (int j=0; j<instance->proc_cols; j++)
// {
// b_redist->proc_row_count[i][j] = A->proc_col_count[i][j];
// b_redist->proc_row_offset[i][j] = A->proc_col_offset[i][j];
// }
// }
//
// // retrieving local data for current process
// b_redist->cur_proc_row_count = A->cur_proc_col_count;
// b_redist->cur_proc_row_offset = A->cur_proc_col_offset;
//
// // allocating temporary vector storage
// cudaMalloc((void*)&(b_redist->data), sizeof(double)*b_redist->cur_proc_row_count);
// WARNING: The following code is not efficient for a strong parallelization !!!!!
// create sub-communicator for each process column
int remain_dims[2];
remain_dims[0]=1;
remain_dims[1]=0;
MPI_Comm column_comm;
MPI_Cart_sub(instance->comm, remain_dims, &column_comm);
int column_rank;
MPI_Comm_rank(column_comm, &column_rank);
// columnwise creation of the full vector
double* full_vector;
int* recvcounts = new int[instance->proc_rows];
int* displs = new int[instance->proc_rows];
for (int i=0; i<instance->proc_rows; i++)
{
recvcounts[i] = b->proc_row_count[i][instance->cur_proc_col];
displs[i] = b->proc_row_offset[i][instance->cur_proc_col];
}
cudaMalloc((void**)&full_vector, sizeof(double)*b->vec_row_count);
cudaThreadSynchronize();
checkCUDAError("cudaMalloc");
MPI_Allgatherv(b->data, b->cur_proc_row_count, MPI_DOUBLE, full_vector, recvcounts, displs, MPI_DOUBLE, column_comm);
// extract column-wise local part of full vector
cudaMemcpy(b_redist->data, &(full_vector[b_redist->cur_proc_row_offset]), sizeof(double)*b_redist->cur_proc_row_count, cudaMemcpyDeviceToDevice);
// memory cleanup
cudaFree(full_vector);
MPI_Comm_free(&column_comm);
}
Developer: zaspel, Project: MPLA, Lines: 68, Source: mpla.cpp
Example 18: cudaFree
RealKernel::~RealKernel()
{ delete[] data;
#ifdef GPU_ENABLED
cudaFree(dataGpu);
#endif
}
Developer: yalcinozhabes, Project: pythonJDFTx, Lines: 6, Source: ScalarField.cpp
Example 19: run_2D_GLOBAL_MEMORY
void run_2D_GLOBAL_MEMORY()
{
int arrayWidth = 4;
int arrayHeight = 4;
bool SEQ = true;
/* Host allocation */
float* inArr_1_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
float* inArr_2_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
float* outArr_H = (float*) malloc(arrayWidth * arrayHeight * sizeof(float));
/* Fill arrays */
int index = 0;
if (SEQ)
{
int ctr = 0;
for(int j = 0; j < (arrayHeight); j++)
{
for(int i = 0; i < (arrayWidth); i++)
{
index = ((j * arrayWidth) + i);
inArr_1_H[index] = (float) ctr++;
inArr_2_H[index] = (float) ctr++;
outArr_H[index] = (float) 0;
}
}
}
else
{
for(int j = 0; j < (arrayHeight); j++)
{
for(int i = 0; i < (arrayWidth); i++)
{
index = ((j * arrayWidth) + i);
inArr_1_H[index] = (float)rand()/(float)RAND_MAX;
inArr_2_H[index] = (float)rand()/(float)RAND_MAX;
outArr_H[index] = 0;
}
}
}
/* Print host arrays */
printf("inArr_1_H \n");
print_2D_Array(inArr_1_H, arrayWidth, arrayHeight);
printf("inArr_2_H \n");
print_2D_Array(inArr_2_H, arrayWidth, arrayHeight);
/* Device allocation + <__pitch> */
float *inArr_1_D, *inArr_2_D, *outArr_D;
size_t __pitch;
cudaMallocPitch((void**)&inArr_1_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
cudaMallocPitch((void**)&inArr_2_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
cudaMallocPitch((void**)&outArr_D, &__pitch, arrayHeight * sizeof(float), arrayWidth);
/* Print __pitch */
printf("__pitch %zu \n", (__pitch/sizeof(float)));
/* Uploading data */
cudaMemcpy2D(inArr_1_D, __pitch, inArr_1_H, arrayHeight * sizeof(float), arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);
cudaMemcpy2D(inArr_2_D, __pitch, inArr_2_H, arrayHeight * sizeof(float), arrayHeight * sizeof(float), arrayWidth, cudaMemcpyHostToDevice);
/* Gridding */
dim3 __numBlocks(1,1,1);
dim3 __numThreadsPerBlock(BLOCK_SIZE, BLOCK_SIZE, 1);
__numBlocks.x = ((arrayWidth / BLOCK_SIZE) + (((arrayWidth) % BLOCK_SIZE) == 0 ? 0:1));
__numBlocks.y = ((arrayHeight / BLOCK_SIZE) + (((arrayHeight) % BLOCK_SIZE) == 0 ? 0:1));
/* Kernel invocation */
add_2D_Array(inArr_1_D, inArr_2_D, outArr_D, arrayWidth, arrayHeight, __pitch, __numBlocks, __numThreadsPerBlock);
/* Synchronization */
cudaThreadSynchronize();
/* Download result */
cudaMemcpy2D(outArr_H, arrayHeight * sizeof(float), outArr_D, __pitch, arrayHeight * sizeof(float), arrayWidth, cudaMemcpyDeviceToHost);
/* Free device arrays */
cudaFree(inArr_1_D);
cudaFree(inArr_2_D);
cudaFree(outArr_D);
/* Display results */
printf("outArr \n");
print_2D_Array(outArr_H, arrayWidth, arrayHeight);
}
Developer: wow2006, Project: cuYURI, Lines: 88, Source: globalMem_2D.cpp
Example 20: set_size
void gpu_data::
set_size(
size_t new_size
)
{
if (new_size == 0)
{
if (device_in_use)
{
// Wait for any possible CUDA kernels that might be using our memory block to
// complete before we free the memory.
synchronize_stream(0);
device_in_use = false;
}
wait_for_transfer_to_finish();
data_size = 0;
host_current = true;
device_current = true;
device_in_use = false;
data_host.reset();
data_device.reset();
}
else if (new_size != data_size)
{
if (device_in_use)
{
// Wait for any possible CUDA kernels that might be using our memory block to
// complete before we free the memory.
synchronize_stream(0);
device_in_use = false;
}
wait_for_transfer_to_finish();
data_size = new_size;
host_current = true;
device_current = true;
device_in_use = false;
try
{
CHECK_CUDA(cudaGetDevice(&the_device_id));
// free memory blocks before we allocate new ones.
data_host.reset();
data_device.reset();
void* data;
CHECK_CUDA(cudaMallocHost(&data, new_size*sizeof(float)));
// Note that we don't throw exceptions since the free calls are invariably
// called in destructors. They also shouldn't fail anyway unless someone
// is resetting the GPU card in the middle of their program.
data_host.reset((float*)data, [](float* ptr){
auto err = cudaFreeHost(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFreeHost() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
CHECK_CUDA(cudaMalloc(&data, new_size*sizeof(float)));
data_device.reset((float*)data, [](float* ptr){
auto err = cudaFree(ptr);
if(err!=cudaSuccess)
std::cerr << "cudaFree() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
if (!cuda_stream)
{
cudaStream_t cstream;
CHECK_CUDA(cudaStreamCreateWithFlags(&cstream, cudaStreamNonBlocking));
cuda_stream.reset(cstream, [](void* ptr){
auto err = cudaStreamDestroy((cudaStream_t)ptr);
if(err!=cudaSuccess)
std::cerr << "cudaStreamDestroy() failed. Reason: " << cudaGetErrorString(err) << std::endl;
});
}
}
catch(...)
{
set_size(0);
throw;
}
}
}
Developer: davisking, Project: dlib, Lines: 82, Source: gpu_data.cpp
Note: The cudaFree examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors. Please refer to each project's license before distributing or reusing the code, and do not republish without permission.