本文整理汇总了C++中cudaStreamSynchronize函数的典型用法代码示例。如果您正苦于以下问题:C++ cudaStreamSynchronize函数的具体用法?C++ cudaStreamSynchronize怎么用?C++ cudaStreamSynchronize使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了cudaStreamSynchronize函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。
示例1: collect_final_result_dsyrk_syr2k
void collect_final_result_dsyrk_syr2k(int *tasks_rs, int *tasks_rs_size, int switcher, cudaStream_t *stream, double** C_dev, int block_dim, int stream_num, int x,int y, int z, int nrowc, int ncolc, int ldc, double *C, enum CBLAS_UPLO Uplo) {
switcher = 1 - switcher;
int temp = 0;
for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher] ; temp++) {
int prior_task = tasks_rs[temp+stream_num*(1-switcher)];
int i_pre, k_pre;
blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
int current_stream = temp;
int nrowc_dev_pre, ncolc_dev_pre;
margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
int nrow_offset_c_pre = i_pre*block_dim;
int ncol_offset_c_pre = k_pre*block_dim;
double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+(1-switcher)*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]);
cudaStreamSynchronize(stream[current_stream]);
}
for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
//assume 1-switcher
int prior_task = tasks_rs[temp+stream_num*(switcher)];
int i_pre, k_pre;
blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
int current_stream = temp;
int nrowc_dev_pre, ncolc_dev_pre;
margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
int nrow_offset_c_pre = i_pre*block_dim;
int ncol_offset_c_pre = k_pre*block_dim;
double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+switcher*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]);
cudaStreamSynchronize(stream[current_stream]);
}
}
开发者ID:529038378,项目名称:BLASX,代码行数:31,代码来源:blasx_mem_control.c
示例2: collect_final_result_zgemm
void collect_final_result_zgemm(int *tasks_rs, int *tasks_rs_size, int switcher, cudaStream_t *stream, cuDoubleComplex** C_dev, int block_dim, int stream_num, int x, int y, int z, int nrowc, int ncolc, int ldc, cuDoubleComplex *C) {
switcher = 1 - switcher;
int temp = 0;
for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher] ; temp++) {
int prior_task = tasks_rs[temp+stream_num*(1-switcher)];
int i_pre = prior_task/(y+1);
int k_pre = prior_task%(y+1);
int current_stream = temp;
int nrowc_dev_pre, ncolc_dev_pre;
margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
int nrow_offset_c_pre = i_pre*block_dim;
int ncol_offset_c_pre = k_pre*block_dim;
cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex), C_dev[current_stream+(1-switcher)*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
}
for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
int prior_task = tasks_rs[temp+stream_num*(switcher)];
int i_pre = prior_task/(y+1);
int k_pre = prior_task%(y+1);
int current_stream = temp;
int nrowc_dev_pre, ncolc_dev_pre;
margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
int nrow_offset_c_pre = i_pre*block_dim;
int ncol_offset_c_pre = k_pre*block_dim;
cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex), C_dev[current_stream+switcher*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
}
}
开发者ID:529038378,项目名称:BLASX,代码行数:30,代码来源:blasx_mem_control.c
示例3: MyStreamSynchronize
cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
{
cudaError_t result = cudaSuccess;
if (abort_flag)
return result;
if (situation >= 0)
{
static std::map<int, tsumarray> tsum;
double a = 0.95, b = 0.05;
if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence
double tsync = 0.0;
double tsleep = 0.95 * tsum[situation].value[thr_id];
if (cudaStreamQuery(stream) == cudaErrorNotReady)
{
usleep((useconds_t)(1e6*tsleep));
struct timeval tv_start, tv_end;
gettimeofday(&tv_start, NULL);
result = cudaStreamSynchronize(stream);
gettimeofday(&tv_end, NULL);
tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
}
if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync);
}
else
result = cudaStreamSynchronize(stream);
return result;
}
开发者ID:jcvernaleo,项目名称:ccminer,代码行数:29,代码来源:cuda.cpp
示例4: collect_final_result_dtrsm_mode_1
void collect_final_result_dtrsm_mode_1(int *tasks_rs, int *tasks_rs_size, int switcher, int switcher_rs, cudaStream_t *stream, double** buffer_dev, int block_dim, int stream_num, int x, int y, int z, int nrowb, int ncolb, int ldb, double *B, int* switch_tracker) {
int temp = 0;
for (temp = tasks_rs_size[switcher_rs]; temp < tasks_rs_size[1-switcher_rs] ; temp++) {
// printf("retrieving B[%d, %d] @stream=%d switcher:%d\n", z, tasks_rs[temp+STREAMNUM*(1-switcher_rs)], temp, switcher);
int row = z;
int col = tasks_rs[temp+stream_num*(1-switcher_rs)];
int current_stream = temp;
int nrow_offset = row*block_dim;
int ncol_offset = col*block_dim;
int nrow_dev, ncol_dev;
margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
double *starting_point = &B[nrow_offset+ncol_offset*ldb];
cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double), buffer_dev[current_stream+switch_tracker[temp]*stream_num], block_dim, starting_point, ldb, stream[current_stream]);
cudaStreamSynchronize(stream[current_stream]);
}
for (temp = 0; temp < tasks_rs_size[switcher_rs]; temp++) {
//assume 1-switcher
//printf("retrieving B[%d, %d] @stream=%d\n", z, tasks_rs[temp+STREAMNUM*switcher_rs], temp);
int row = z;
int col = tasks_rs[temp+stream_num*switcher_rs];
int current_stream = temp;
int nrow_offset = row*block_dim;
int ncol_offset = col*block_dim;
int nrow_dev, ncol_dev;
margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
double *starting_point = &B[nrow_offset+ncol_offset*ldb];
cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double), buffer_dev[current_stream+switch_tracker[temp]*stream_num], block_dim, starting_point, ldb, stream[current_stream]);
cudaStreamSynchronize(stream[current_stream]);
}
}
开发者ID:529038378,项目名称:BLASX,代码行数:30,代码来源:blasx_mem_control.c
示例5: get_gpu
JNIEXPORT jdouble JNICALL Java_org_apache_spark_mllib_classification_LogisticRegressionNative_predictPoint
(JNIEnv *env, jobject obj, jdoubleArray data, jdoubleArray weights, jdouble intercept) {
// the kernel is written to take multiple data sets and produce a set of results, but we're going
// to run it as multiple parallel kernels, each producing a single result instead
double *d_dataBuffer, *d_weightsBuffer, *d_score;
int dataCount, dataLen, whichGPU;
jdouble h_score, *h_dataBuffer, *h_weightsBuffer;
cudaStream_t stream;
// select a GPU for *this* specific dataset
whichGPU = get_gpu();
checkCudaErrors(cudaSetDevice(whichGPU));
checkCudaErrors(cudaStreamCreate(&stream));
// get a pointer to the raw input data, pinning them in memory
dataCount = env->GetArrayLength(data);
dataLen = dataCount*sizeof(double);
assert(dataCount == env->GetArrayLength(weights));
h_dataBuffer = (jdouble*) env->GetPrimitiveArrayCritical(data, 0);
h_weightsBuffer = (jdouble*) env->GetPrimitiveArrayCritical(weights, 0);
// copy input data to the GPU memory
// TODO: It may be better to access host memory directly, skipping the copy. Investigate.
checkCudaErrors(mallocBest((void**)&d_dataBuffer, dataLen));
checkCudaErrors(mallocBest((void**)&d_weightsBuffer, dataLen));
checkCudaErrors(cudaMemcpyAsync(d_dataBuffer, h_dataBuffer, dataLen, cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(d_weightsBuffer, h_weightsBuffer, dataLen, cudaMemcpyHostToDevice, stream));
// synchronize before unpinning, and also because there is a device-device transfer in predictKernelDevice
checkCudaErrors(cudaStreamSynchronize(stream));
// un-pin the host arrays, as we're done with them
env->ReleasePrimitiveArrayCritical(data, h_dataBuffer, 0);
env->ReleasePrimitiveArrayCritical(weights, h_weightsBuffer, 0);
// allocate storage for the result
checkCudaErrors(mallocBest((void**)&d_score, sizeof(double)));
// run the kernel, to produce a result
predictKernelDevice(d_dataBuffer, d_weightsBuffer, intercept, d_score, 1, dataCount, stream);
checkCudaErrors(cudaStreamSynchronize(stream));
// copy result back to host
checkCudaErrors(cudaMemcpyAsync(&h_score, d_score, sizeof(double), cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream));
// Free the GPU buffers
checkCudaErrors(freeBest(d_dataBuffer));
checkCudaErrors(freeBest(d_weightsBuffer));
checkCudaErrors(freeBest(d_score));
checkCudaErrors(cudaStreamDestroy(stream));
return h_score;
}
开发者ID:IBMSparkGPU,项目名称:CUDA-MLlib,代码行数:56,代码来源:LogisticRegressionNative.cpp
示例6: dw_common_codelet_update_u11
static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args)
{
float *sub11;
sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);
unsigned long z;
switch (s)
{
case 0:
for (z = 0; z < nx; z++)
{
float pivot;
pivot = sub11[z+z*ld];
STARPU_ASSERT(pivot != 0.0f);
STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
&sub11[z+(z+1)*ld], ld,
&sub11[(z+1)+z*ld], 1,
&sub11[(z+1) + (z+1)*ld],ld);
}
break;
#ifdef STARPU_USE_CUDA
case 1:
for (z = 0; z < nx; z++)
{
float pivot;
cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
cudaStreamSynchronize(starpu_cuda_get_local_stream());
STARPU_ASSERT(pivot != 0.0f);
cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
cublasSger(nx - z - 1, nx - z - 1, -1.0f,
&sub11[z+(z+1)*ld], ld,
&sub11[(z+1)+z*ld], 1,
&sub11[(z+1) + (z+1)*ld],ld);
}
cudaStreamSynchronize(starpu_cuda_get_local_stream());
break;
#endif
default:
STARPU_ABORT();
break;
}
}
开发者ID:excess-project,项目名称:starpu-ex-1.2.0rc5,代码行数:55,代码来源:dw_factolu_kernels.c
示例7: CAFFE1_CUDA_CHECK
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
cudaStream_t stream;
cudaStream_t stream2;
if (Caffe::mode() == Caffe::GPU) {
CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
if (untransformed_top_)
CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
}
#endif
try {
while (!must_stop()) {
Batch<Dtype>* batch = prefetch_free_.pop();
Batch<Dtype>* batch_untransformed = NULL;
if (untransformed_top_)
{
batch_untransformed = prefetch_free_untransformed_.pop();
load_batch_and_untransformed_batch(batch,batch_untransformed);
}
else
load_batch(batch);
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
batch->data_.data().get()->async_gpu_push(stream);
CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream));
if (untransformed_top_)
{
batch_untransformed->data_.data().get()->async_gpu_push(stream2);
CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream2));
}
}
#endif
prefetch_full_.push(batch);
if (untransformed_top_)
prefetch_full_untransformed_.push(batch_untransformed);
}
} catch (boost::thread_interrupted&) {
// Interrupted exception is expected on shutdown
}
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream));
if (untransformed_top_)
CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream2));
}
#endif
}
开发者ID:beniz,项目名称:caffe,代码行数:49,代码来源:base_data_layer.cpp
示例8: strmm_cuda
static void strmm_cuda(void *descr[], void *args) {
float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);
unsigned w = STARPU_MATRIX_GET_NY(descr[0]);
unsigned h = STARPU_MATRIX_GET_NX(descr[1]);
unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);
struct strmm_arg * arg = (struct strmm_arg *)args;
cublasSideMode_t side = arg->side ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
cublasFillMode_t uplo = arg->uplo ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
cublasDiagType_t diag = arg->unit ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
cublasOperation_t trans = CUBLAS_OP_T;
const float factor = 1.0f;
cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());
cublasStrmm(cublas_handle, side, uplo, trans, diag, w, h, &factor, a, lda, b, ldb, c, ldc);
cudaStreamSynchronize(starpu_cuda_get_local_stream());
free(arg);
}
开发者ID:hsyl20,项目名称:HaskellPU,代码行数:28,代码来源:FloatMatrixStrmm_c.c
示例9: TransferAllHalos
/**
* @brief This performs the exchanging of all necessary halos between 2 neighboring MPI processes
*
* @param[in] cartComm The carthesian MPI communicator
* @param[in] domSize The 2D size of the local domain
* @param[in] topIndex The 2D index of the calling MPI process in the topology
* @param[in] neighbors The list of ranks which are direct neighbors to the caller
* @param[in] copyStream The stream used to overlap top & bottom halo exchange with side halo copy to host memory
* @param[in, out] devBlocks The 2 device blocks that are updated during the Jacobi run
* @param[in, out] devSideEdges The 2 side edges (parallel to the Y direction) that hold the packed halo values before sending them
* @param[in, out] devHaloLines The 2 halo lines (parallel to the Y direction) that hold the packed halo values after receiving them
* @param[in, out] hostSendLines The 2 host send buffers that are used during the halo exchange by the normal CUDA & MPI version
* @param[in, out] hostRecvLines The 2 host receive buffers that are used during the halo exchange by the normal CUDA & MPI version
* @return The time spent during the MPI transfers
*/
double TransferAllHalos(MPI_Comm cartComm, const int2 * domSize, const int2 * topIndex, const int * neighbors, cudaStream_t copyStream,
real * devBlocks[2], real * devSideEdges[2], real * devHaloLines[2], real * hostSendLines[2], real * hostRecvLines[2])
{
real * devSendLines[2] = {devBlocks[0] + domSize->x + 3, devBlocks[0] + domSize->y * (domSize->x + 2) + 1};
real * devRecvLines[2] = {devBlocks[0] + 1, devBlocks[0] + (domSize->y + 1) * (domSize->x + 2) + 1};
int yNeighbors[2] = {neighbors[DIR_TOP], neighbors[DIR_BOTTOM]};
int xNeighbors[2] = {neighbors[DIR_LEFT], neighbors[DIR_RIGHT]};
int2 order = make_int2(topIndex->x % 2, topIndex->y % 2);
double transferTime;
// Populate the block's side edges
CopyDevSideEdgesFromBlock(devBlocks[0], devSideEdges, domSize, neighbors, copyStream);
// Exchange data with the top and bottom neighbors
transferTime = MPI_Wtime();
ExchangeHalos(cartComm, devSendLines[ order.y ], hostSendLines[0], hostRecvLines[0], devRecvLines[ order.y ], yNeighbors[ order.y ], domSize->x);
ExchangeHalos(cartComm, devSendLines[1 - order.y], hostSendLines[0], hostRecvLines[0], devRecvLines[1 - order.y], yNeighbors[1 - order.y], domSize->x);
SafeCudaCall(cudaStreamSynchronize(copyStream));
// Exchange data with the left and right neighbors
ExchangeHalos(cartComm, devSideEdges[ order.x ], hostSendLines[1], hostRecvLines[1], devHaloLines[ order.x ], xNeighbors[ order.x ], domSize->y);
ExchangeHalos(cartComm, devSideEdges[1 - order.x], hostSendLines[1], hostRecvLines[1], devHaloLines[1 - order.x], xNeighbors[1 - order.x], domSize->y);
transferTime = MPI_Wtime() - transferTime;
// Copy the received halos to the device block
CopyDevHalosToBlock(devBlocks[0], devHaloLines[0], devHaloLines[1], domSize, neighbors);
return transferTime;
}
开发者ID:Haider-BA,项目名称:Matlab2CPP,代码行数:44,代码来源:Host.c
示例10: forward_process_synchronized
void forward_process_synchronized(struct ixmapfwd_thread *thread,
unsigned int port_index, struct ixmap_packet *packet,
unsigned int num_packets, struct ixmap_packet_cuda *result, uint8_t *read_buf)
{
int fd, i;
cudaStreamSynchronize(thread->stream);
for(i = 0; i < num_packets; i++){
if(result[i].outif >= 0){
ixmap_tx_assign(thread->plane, result[i].outif,
thread->buf, &packet[i]);
}else if(result[i].outif == -1){
goto packet_drop;
}else{
goto packet_inject;
}
continue;
packet_inject:
memcpy(read_buf, packet[i].slot_buf, packet[i].slot_size);
fd = thread->tun_plane->ports[port_index].fd;
write(fd, read_buf, packet[i].slot_size);
packet_drop:
ixmap_slot_release(thread->buf, packet[i].slot_index);
}
return;
}
开发者ID:edenden,项目名称:ixmap-cuda,代码行数:28,代码来源:forward.c
示例11: CUDA_CHECK
void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
cudaStream_t stream;//创建CUDA stream,非阻塞类型
if (Caffe::mode() == Caffe::GPU) {
CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
}
#endif
try {
while (!must_stop()) { //循环载入批量数据
Batch<Dtype>* batch = prefetch_free_.pop();//拿到一个空闲batch
load_batch(batch);//载入批量数据
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
batch->data_.data().get()->async_gpu_push(stream);
if (this->output_labels_) {
batch->label_.data().get()->async_gpu_push(stream);
}
CUDA_CHECK(cudaStreamSynchronize(stream));//同步到GPU
}
#endif
prefetch_full_.push(batch);//加入到带负载的Batch队列中
}
} catch (boost::thread_interrupted&) {//捕获异常,退出while循环
// Interrupted exception is expected on shutdown
}
#ifndef CPU_ONLY
if (Caffe::mode() == Caffe::GPU) {
CUDA_CHECK(cudaStreamDestroy(stream));//销毁CUDA stream
}
#endif
}
开发者ID:huanyii,项目名称:caffe_read,代码行数:32,代码来源:base_data_layer.cpp
示例12: LAMA_ASSERT_DEBUG
void CUDABLAS1::scal( IndexType n, const double alpha, double* x_d, const IndexType incx, SyncToken* syncToken )
{
LAMA_CHECK_CUDA_ACCESS
cudaStream_t stream = NULL;
if ( syncToken )
{
CUDAStreamSyncToken* cudaStreamSyncToken = dynamic_cast<CUDAStreamSyncToken*>( syncToken );
LAMA_ASSERT_DEBUG( cudaStreamSyncToken, "no cuda stream sync token provided" )
stream = cudaStreamSyncToken->getCUDAStream();
}
cublasSetKernelStream( stream );
LAMA_CHECK_CUBLAS_ERROR
cublasDscal( n, alpha, x_d, incx );
// No error check here possible as kernel is started asynchronously
if ( !syncToken )
{
cudaStreamSynchronize( 0 );
LAMA_CHECK_CUDA_ERROR
}
开发者ID:fast-project,项目名称:lama,代码行数:25,代码来源:CUDABLAS1.cpp
示例13: advance_generations
/*
* Advance the simulation by <n> generations by mapping the OpenGL pixel buffer
* objects for writing from CUDA, executing the kernel <n> times, and unmapping
* the pixel buffer object.
*/
void advance_generations(unsigned long n)
{
uint8_t* device_bufs[2];
size_t size;
DEBUG2("Mapping CUDA resources and retrieving device buffer pointers\n");
cudaGraphicsMapResources(2, cuda_graphics_resources, (cudaStream_t)0);
cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[0], &size,
cuda_graphics_resources[0]);
cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[1], &size,
cuda_graphics_resources[1]);
check_cuda_error();
while (n--) {
DEBUG2("Launching kernel (grid.width = %u, grid.height = %u)\n",
grid.width, grid.height);
launch_kernel(device_bufs[grid.which_buf], device_bufs[!grid.which_buf],
grid.width, grid.height);
grid.which_buf ^= 1;
}
DEBUG2("Unmapping CUDA resources\n");
cudaGraphicsUnmapResources(2, cuda_graphics_resources, (cudaStream_t)0);
cudaStreamSynchronize(0);
}
开发者ID:anojavan,项目名称:csinparallel,代码行数:37,代码来源:cuda.c
示例14: CHECK
void NCCL<Dtype>::run(int layer) {
CHECK(solver_->param().layer_wise_reduce());
vector<shared_ptr<Blob<Dtype> > >& blobs =
solver_->net()->layers()[layer]->blobs();
#ifdef DEBUG
// Assert blobs are contiguous to reduce in one step (e.g. bias often small)
for (int i = 1; i < blobs.size(); ++i) {
CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(),
blobs[i + 0]->gpu_diff());
}
#endif
if (blobs.size() > 0) {
// Make sure default stream is done computing gradients. Could be
// replaced by cudaEventRecord+cudaStreamWaitEvent to avoid
// blocking the default stream, but it's actually slower.
CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
// Reduce asynchronously
int size = 0;
for (int i = 0; i < blobs.size(); ++i) {
size += blobs[i]->count();
}
if (barrier_) { // NULL in multi process case
barrier_->wait();
}
NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(),
blobs[0]->mutable_gpu_diff(),
size,
nccl::dataType<Dtype>::type,
ncclSum, comm_, stream_));
caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(),
blobs[0]->mutable_gpu_diff(), stream_);
}
}
开发者ID:20337112,项目名称:caffe,代码行数:34,代码来源:parallel.cpp
示例15: while
void KMeansController<Dtype>::PostProcess() {
/** Use the cluster center calculated to get cluster labels**/
int* label_data = this->int_outputs_[0]->mutable_cpu_data();
// first restart the data provider.
this->data_providers_[0]->ForceRestart();
// assign cluster labels
bool assignment_finished = false;
while (!assignment_finished) {
size_t batch_num = 0;
size_t index = this->data_providers_[0]->current_index();
// getting current index must precede GetData()
// otherwise the current index in the data provide would be advanced
this->mats_[0] = this->data_providers_[0]->GetData(batch_num);
if ( this->data_providers_[0]->current_index() == 0) {
assignment_finished = true;
}
//execute only the maximization functions for all samples
this->function_input_vecs_[0][0] = this->mats_[0].get();
this->funcs_[0]->Execute(this->function_input_vecs_[0], this->function_output_vecs_[0], GKMeans::stream(0));
CUDA_CHECK(cudaStreamSynchronize(GKMeans::stream(0)));
// copy out cluster labels
int* out_data = (int*)this->mats_[2]->cpu_data();
for (size_t i = 0; i < batch_num; i++) {
label_data[i + index] = out_data[i];
}
}
// copy out center data
memcpy(this->numeric_outputs_[0]->mutable_cpu_data(), this->mats_[1]->cpu_data(), this->mats_[1]->count() * sizeof(Dtype));
}
开发者ID:CUHK-MMLAB,项目名称:gkmeans,代码行数:35,代码来源:controllers.cpp
示例16: cudaStreamSynchronize
void octree::resetCompact()
{
// reset counts to 1 so next compact proceeds...
cudaStreamSynchronize(execStream->s()); // must make sure any outstanding compact is finished
this->devMemCountsx[0] = 1;
this->devMemCountsx.h2d(1, false, copyStream->s());
}
开发者ID:BerndDoser,项目名称:Bonsai,代码行数:7,代码来源:load_kernels.cpp
示例17: CUDA_CHECK
void P2PSync<Dtype>::on_gradients_ready(Timer* timer, ostringstream* timing) {
#ifndef CPU_ONLY
#ifdef DEBUG
int device;
CUDA_CHECK(cudaGetDevice(&device));
CHECK(device == solver_->param().device_id());
#endif
// Sum children gradients as they appear in the queue
for (int i = 0; i < children_.size(); ++i) {
timer->Start();
P2PSync<Dtype> *child = queue_.pop();
Dtype* src = child->parent_grads_;
Dtype* dst = diff_;
#ifdef DEBUG
bool ok = false;
for (int j = 0; j < children_.size(); ++j) {
if (child == children_[j]) {
ok = true;
}
}
CHECK(ok);
cudaPointerAttributes attributes;
CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
CHECK(attributes.device == device);
CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
CHECK(attributes.device == device);
#endif
caffe_gpu_add(size_, src, dst, dst);
*timing << " add_grad: " << timer->MilliSeconds();
}
// Send gradients to parent
if (parent_) {
timer->Start();
Dtype* src = diff_;
Dtype* dst = parent_grads_;
#ifdef DEBUG
cudaPointerAttributes attributes;
CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
CHECK(attributes.device == device);
CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
CHECK(attributes.device == parent_->solver_->param().device_id());
#endif
CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype), //
cudaMemcpyDeviceToDevice, cudaStreamDefault));
CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
parent_->queue_.push(this);
*timing << " send_grad: " << timer->MilliSeconds();
} else {
// Loss functions divide gradients by the batch size, so to compensate
// for split batch, the root solver divides by number of solvers.
caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
}
#endif
}
开发者ID:bbshocking,项目名称:caffe,代码行数:60,代码来源:parallel.cpp
示例18: TEST
TEST(MemcpyAsync, CheckReturnValues) {
/**
* The API documentation states that
* cudaErrorInvalidDevicePointer is a valid return value for
* cudaMemcpyAsync
*
* TODO; This needs a test.
*/
cudaError_t ret;
cudaStream_t stream;
ret = cudaStreamCreate(&stream);
ASSERT_EQ(cudaSuccess, ret);
/**
* Test woefully out of range directions.
*/
int a = 0;
ret = cudaMemcpyAsync(&a, &a, sizeof(a), (cudaMemcpyKind) -1, stream);
EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);
ret = cudaMemcpyAsync(NULL, NULL, sizeof(a), (cudaMemcpyKind) -1, stream);
EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);
ret = cudaStreamSynchronize(stream);
EXPECT_EQ(cudaSuccess, ret);
ret = cudaStreamDestroy(stream);
EXPECT_EQ(cudaSuccess, ret);
}
开发者ID:ckennelly,项目名称:panoptes,代码行数:30,代码来源:vtest_memcpyasync.cpp
示例19: TEST_P
TEST_P(MemcpyAsync, H2DTransfers) {
const size_t param = GetParam();
const size_t alloc = 1 << param;
cudaError_t ret;
void *d1, *h1;
ret = cudaMalloc(&d1, alloc);
ASSERT_EQ(cudaSuccess, ret);
ret = cudaHostAlloc(&h1, alloc, cudaHostAllocMapped);
ASSERT_EQ(cudaSuccess, ret);
cudaStream_t stream;
ret = cudaStreamCreate(&stream);
ASSERT_EQ(cudaSuccess, ret);
ret = cudaMemcpyAsync(d1, h1, alloc, cudaMemcpyHostToDevice, stream);
ASSERT_EQ(cudaSuccess, ret);
ret = cudaStreamSynchronize(stream);
ASSERT_EQ(cudaSuccess, ret);
ret = cudaFree(d1);
ASSERT_EQ(cudaSuccess, ret);
ret = cudaFreeHost(h1);
ASSERT_EQ(cudaSuccess, ret);
ret = cudaStreamDestroy(stream);
ASSERT_EQ(cudaSuccess, ret);
}
开发者ID:ckennelly,项目名称:panoptes,代码行数:31,代码来源:vtest_memcpyasync.cpp
示例20: CUDA_CHECK
shared_ptr<Mat<Dtype>> DataProviderBase<Dtype>::GetData(size_t &num){
//first join the async task
num = num_future_.get();
//also sync the cuda strea
CUDA_CHECK(cudaStreamSynchronize(data_stream_));
// get the mat with data filled
int ready_idx = data_q_.front();
shared_ptr<Mat<Dtype>> ready_mat = data_slot_vec_[ready_idx];
// send the id of the mat to the end of the queue
data_q_.pop_front(); data_q_.push_back(ready_idx);
// get the mat to be filled (id at the front of the queue)
Mat<Dtype>* working_mat = data_slot_vec_[data_q_.front()].get();
// kickout async task (force async launch)
num_future_ = std::async(std::launch::async, &DataProviderBase::AsyncFunc, this, working_mat);
// increment the current data cursor.
current_index_ += num;
if (current_index_ == round_size_){
current_index_ = 0;
}
return ready_mat;
}
开发者ID:caomw,项目名称:gkmeans,代码行数:28,代码来源:base_data_provider.cpp
注:本文中的cudaStreamSynchronize函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论