C++ cudaStreamSynchronize函数代码示例

OGeek|极客世界-中国程序员成长平台 › 门户 › 编程› C++›C++教程

原作者: [db:作者] 来自: [db:来源] 收藏邀请

本文整理汇总了C++中cudaStreamSynchronize函数的典型用法代码示例。如果您正苦于以下问题：C++ cudaStreamSynchronize函数的具体用法？C++ cudaStreamSynchronize怎么用？C++ cudaStreamSynchronize使用的例子？那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。

在下文中一共展示了cudaStreamSynchronize函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的C++代码示例。

示例1: collect_final_result_dsyrk_syr2k

void collect_final_result_dsyrk_syr2k(int *tasks_rs, int *tasks_rs_size, int switcher, cudaStream_t *stream, double** C_dev, int block_dim, int stream_num, int x,int y, int z, int nrowc, int ncolc, int ldc, double *C, enum CBLAS_UPLO Uplo) {
    switcher = 1 - switcher;
    int temp = 0;
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher] ; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(1-switcher)];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+(1-switcher)*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
        //assume 1-switcher
        int prior_task = tasks_rs[temp+stream_num*(switcher)];
        int i_pre, k_pre;
        blasx_get_index(prior_task, 0, x, &i_pre, &k_pre, Uplo, x);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        double *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(double), C_dev[current_stream+switcher*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}

开发者ID:529038378，项目名称:BLASX，代码行数:31，代码来源:blasx_mem_control.c

示例2: collect_final_result_zgemm

void collect_final_result_zgemm(int *tasks_rs, int *tasks_rs_size, int switcher, cudaStream_t *stream, cuDoubleComplex** C_dev, int block_dim, int stream_num, int x, int y, int z, int nrowc, int ncolc, int ldc, cuDoubleComplex *C) {
    switcher = 1 - switcher;
    int temp = 0;
    for (temp = tasks_rs_size[switcher]; temp < tasks_rs_size[1-switcher] ; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(1-switcher)];
        int i_pre = prior_task/(y+1);
        int k_pre = prior_task%(y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex), C_dev[current_stream+(1-switcher)*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
        assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
    }
    for (temp = 0; temp < tasks_rs_size[switcher]; temp++) {
        int prior_task = tasks_rs[temp+stream_num*(switcher)];
        int i_pre = prior_task/(y+1);
        int k_pre = prior_task%(y+1);
        int current_stream = temp;
        int nrowc_dev_pre, ncolc_dev_pre;
        margin_adjustment(nrowc, ncolc, block_dim, i_pre, k_pre, &nrowc_dev_pre, &ncolc_dev_pre);
        int nrow_offset_c_pre = i_pre*block_dim;
        int ncol_offset_c_pre = k_pre*block_dim;
        cuDoubleComplex *starting_point_C_pre = &C[nrow_offset_c_pre+ncol_offset_c_pre*ldc];
        assert( cublasGetMatrixAsync(nrowc_dev_pre, ncolc_dev_pre, sizeof(cuDoubleComplex), C_dev[current_stream+switcher*stream_num], block_dim, starting_point_C_pre, ldc,stream[current_stream]) == CUBLAS_STATUS_SUCCESS );
        assert(cudaStreamSynchronize(stream[current_stream]) == cudaSuccess);
    }
}

开发者ID:529038378，项目名称:BLASX，代码行数:30，代码来源:blasx_mem_control.c

示例3: MyStreamSynchronize

cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id)
{
	cudaError_t result = cudaSuccess;
	if (abort_flag)
		return result;
	if (situation >= 0)
	{
		static std::map<int, tsumarray> tsum;

		double a = 0.95, b = 0.05;
		if (tsum.find(situation) == tsum.end()) { a = 0.5; b = 0.5; } // faster initial convergence

		double tsync = 0.0;
		double tsleep = 0.95 * tsum[situation].value[thr_id];
		if (cudaStreamQuery(stream) == cudaErrorNotReady)
		{
			usleep((useconds_t)(1e6*tsleep));
			struct timeval tv_start, tv_end;
			gettimeofday(&tv_start, NULL);
			result = cudaStreamSynchronize(stream);
			gettimeofday(&tv_end, NULL);
			tsync = 1e-6 * (tv_end.tv_usec-tv_start.tv_usec) + (tv_end.tv_sec-tv_start.tv_sec);
		}
		if (tsync >= 0) tsum[situation].value[thr_id] = a * tsum[situation].value[thr_id] + b * (tsleep+tsync);
	}
	else
		result = cudaStreamSynchronize(stream);
	return result;
}

开发者ID:jcvernaleo，项目名称:ccminer，代码行数:29，代码来源:cuda.cpp

示例4: collect_final_result_dtrsm_mode_1

void collect_final_result_dtrsm_mode_1(int *tasks_rs, int *tasks_rs_size, int switcher, int switcher_rs, cudaStream_t *stream, double** buffer_dev, int block_dim, int stream_num, int x, int y, int z, int nrowb, int ncolb, int ldb, double *B, int* switch_tracker) {
    int temp = 0;
    for (temp = tasks_rs_size[switcher_rs]; temp < tasks_rs_size[1-switcher_rs] ; temp++) {
        //            printf("retrieving B[%d, %d] @stream=%d switcher:%d\n", z, tasks_rs[temp+STREAMNUM*(1-switcher_rs)], temp, switcher);
        int row = z;
        int col = tasks_rs[temp+stream_num*(1-switcher_rs)];
        int current_stream = temp;
        int nrow_offset = row*block_dim;
        int ncol_offset = col*block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset+ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double), buffer_dev[current_stream+switch_tracker[temp]*stream_num], block_dim, starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
    for (temp = 0; temp < tasks_rs_size[switcher_rs]; temp++) {
        //assume 1-switcher
        //printf("retrieving B[%d, %d] @stream=%d\n", z, tasks_rs[temp+STREAMNUM*switcher_rs], temp);
        int row = z;
        int col = tasks_rs[temp+stream_num*switcher_rs];
        int current_stream = temp;
        int nrow_offset = row*block_dim;
        int ncol_offset = col*block_dim;
        int nrow_dev, ncol_dev;
        margin_adjustment(nrowb, ncolb, block_dim, row, col, &nrow_dev, &ncol_dev);
        double *starting_point = &B[nrow_offset+ncol_offset*ldb];
        cublasGetMatrixAsync(nrow_dev, ncol_dev, sizeof(double), buffer_dev[current_stream+switch_tracker[temp]*stream_num], block_dim, starting_point, ldb, stream[current_stream]);
        cudaStreamSynchronize(stream[current_stream]);
    }
}

开发者ID:529038378，项目名称:BLASX，代码行数:30，代码来源:blasx_mem_control.c

示例5: get_gpu

JNIEXPORT jdouble JNICALL Java_org_apache_spark_mllib_classification_LogisticRegressionNative_predictPoint
    (JNIEnv *env, jobject obj, jdoubleArray data, jdoubleArray weights, jdouble intercept) {

	// the kernel is written to take multiple data sets and produce a set of results, but we're going
	// to run it as multiple parallel kernels, each producing a single result instead
	double *d_dataBuffer, *d_weightsBuffer, *d_score;
	int dataCount, dataLen, whichGPU;
	jdouble h_score, *h_dataBuffer, *h_weightsBuffer;
	cudaStream_t stream;

	// select a GPU for *this* specific dataset
	whichGPU = get_gpu();
	checkCudaErrors(cudaSetDevice(whichGPU));
	checkCudaErrors(cudaStreamCreate(&stream));

	// get a pointer to the raw input data, pinning them in memory
	dataCount = env->GetArrayLength(data);
	dataLen = dataCount*sizeof(double);
	assert(dataCount == env->GetArrayLength(weights));
	h_dataBuffer = (jdouble*) env->GetPrimitiveArrayCritical(data, 0);
	h_weightsBuffer = (jdouble*) env->GetPrimitiveArrayCritical(weights, 0);

	// copy input data to the GPU memory
	// TODO: It may be better to access host memory directly, skipping the copy.  Investigate.
	checkCudaErrors(mallocBest((void**)&d_dataBuffer, dataLen));
	checkCudaErrors(mallocBest((void**)&d_weightsBuffer, dataLen));
	checkCudaErrors(cudaMemcpyAsync(d_dataBuffer, h_dataBuffer, dataLen, cudaMemcpyHostToDevice, stream));
	checkCudaErrors(cudaMemcpyAsync(d_weightsBuffer, h_weightsBuffer, dataLen, cudaMemcpyHostToDevice, stream));
	// synchronize before unpinning, and also because there is a device-device transfer in predictKernelDevice
	checkCudaErrors(cudaStreamSynchronize(stream));
	// un-pin the host arrays, as we're done with them
	env->ReleasePrimitiveArrayCritical(data, h_dataBuffer, 0);
	env->ReleasePrimitiveArrayCritical(weights, h_weightsBuffer, 0);

	// allocate storage for the result
	checkCudaErrors(mallocBest((void**)&d_score, sizeof(double)));

	// run the kernel, to produce a result
	predictKernelDevice(d_dataBuffer, d_weightsBuffer, intercept, d_score, 1, dataCount, stream);

	checkCudaErrors(cudaStreamSynchronize(stream));

	// copy result back to host
	checkCudaErrors(cudaMemcpyAsync(&h_score, d_score, sizeof(double), cudaMemcpyDeviceToHost, stream));

	checkCudaErrors(cudaStreamSynchronize(stream));

	// Free the GPU buffers
	checkCudaErrors(freeBest(d_dataBuffer));
	checkCudaErrors(freeBest(d_weightsBuffer));
	checkCudaErrors(freeBest(d_score));

	checkCudaErrors(cudaStreamDestroy(stream));

	return h_score;
}

开发者ID:IBMSparkGPU，项目名称:CUDA-MLlib，代码行数:56，代码来源:LogisticRegressionNative.cpp

示例6: dw_common_codelet_update_u11

static inline void dw_common_codelet_update_u11(void *descr[], int s, STARPU_ATTRIBUTE_UNUSED void *_args) 
{
	float *sub11;

	sub11 = (float *)STARPU_MATRIX_GET_PTR(descr[0]); 

	unsigned long nx = STARPU_MATRIX_GET_NX(descr[0]);
	unsigned long ld = STARPU_MATRIX_GET_LD(descr[0]);

	unsigned long z;

	switch (s)
	{
		case 0:
			for (z = 0; z < nx; z++)
			{
				float pivot;
				pivot = sub11[z+z*ld];
				STARPU_ASSERT(pivot != 0.0f);
		
				STARPU_SSCAL(nx - z - 1, (1.0f/pivot), &sub11[z+(z+1)*ld], ld);
		
				STARPU_SGER(nx - z - 1, nx - z - 1, -1.0f,
						&sub11[z+(z+1)*ld], ld,
						&sub11[(z+1)+z*ld], 1,
						&sub11[(z+1) + (z+1)*ld],ld);
			}
			break;
#ifdef STARPU_USE_CUDA
		case 1:
			for (z = 0; z < nx; z++)
			{
				float pivot;
				cudaMemcpyAsync(&pivot, &sub11[z+z*ld], sizeof(float), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
				cudaStreamSynchronize(starpu_cuda_get_local_stream());

				STARPU_ASSERT(pivot != 0.0f);
				
				cublasSscal(nx - z - 1, 1.0f/pivot, &sub11[z+(z+1)*ld], ld);
				
				cublasSger(nx - z - 1, nx - z - 1, -1.0f,
								&sub11[z+(z+1)*ld], ld,
								&sub11[(z+1)+z*ld], 1,
								&sub11[(z+1) + (z+1)*ld],ld);
			}

			cudaStreamSynchronize(starpu_cuda_get_local_stream());

			break;
#endif
		default:
			STARPU_ABORT();
			break;
	}
}

开发者ID:excess-project，项目名称:starpu-ex-1.2.0rc5，代码行数:55，代码来源:dw_factolu_kernels.c

示例7: CAFFE1_CUDA_CHECK

void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;
  cudaStream_t stream2;
  if (Caffe::mode() == Caffe::GPU) {
    CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    if (untransformed_top_)
      CAFFE1_CUDA_CHECK(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) {
      Batch<Dtype>* batch = prefetch_free_.pop();
      Batch<Dtype>* batch_untransformed = NULL;
      if (untransformed_top_)
        {
          batch_untransformed = prefetch_free_untransformed_.pop();
          load_batch_and_untransformed_batch(batch,batch_untransformed);
        }
      else
        load_batch(batch);

#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream));
        if (untransformed_top_)
          {
            batch_untransformed->data_.data().get()->async_gpu_push(stream2);
            CAFFE1_CUDA_CHECK(cudaStreamSynchronize(stream2));
          }
      }
#endif
      prefetch_full_.push(batch);
      if (untransformed_top_)
        prefetch_full_untransformed_.push(batch_untransformed);
    }
  } catch (boost::thread_interrupted&) {
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream));
    if (untransformed_top_)
      CAFFE1_CUDA_CHECK(cudaStreamDestroy(stream2));
  }
#endif
}

开发者ID:beniz，项目名称:caffe，代码行数:49，代码来源:base_data_layer.cpp

示例8: strmm_cuda

static void strmm_cuda(void *descr[], void *args) {
  float *a = (float *)STARPU_MATRIX_GET_PTR(descr[0]);
  float *b = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
  float *c = (float *)STARPU_MATRIX_GET_PTR(descr[2]);

  unsigned w = STARPU_MATRIX_GET_NY(descr[0]);
  unsigned h = STARPU_MATRIX_GET_NX(descr[1]);

  unsigned lda = STARPU_MATRIX_GET_LD(descr[0]);
  unsigned ldb = STARPU_MATRIX_GET_LD(descr[1]);
  unsigned ldc = STARPU_MATRIX_GET_LD(descr[2]);
  
  struct strmm_arg * arg = (struct strmm_arg *)args;

  cublasSideMode_t side = arg->side ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT;
  cublasFillMode_t uplo = arg->uplo ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
  cublasDiagType_t diag = arg->unit ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT;
  cublasOperation_t trans = CUBLAS_OP_T;

  const float factor = 1.0f;
  
  cublasSetStream(cublas_handle, starpu_cuda_get_local_stream());

  cublasStrmm(cublas_handle, side, uplo, trans, diag, w, h, &factor, a, lda, b, ldb, c, ldc);
  cudaStreamSynchronize(starpu_cuda_get_local_stream());

  free(arg);
}

开发者ID:hsyl20，项目名称:HaskellPU，代码行数:28，代码来源:FloatMatrixStrmm_c.c

示例9: TransferAllHalos

/**
 * @brief This performs the exchanging of all necessary halos between 2 neighboring MPI processes
 *
 * @param[in]		cartComm	The carthesian MPI communicator
 * @param[in]		domSize		The 2D size of the local domain
 * @param[in]		topIndex	The 2D index of the calling MPI process in the topology
 * @param[in]		neighbors	The list of ranks which are direct neighbors to the caller
 * @param[in]		copyStream	The stream used to overlap top & bottom halo exchange with side halo copy to host memory
 * @param[in, out]	devBlocks	The 2 device blocks that are updated during the Jacobi run
 * @param[in, out]	devSideEdges	The 2 side edges (parallel to the Y direction) that hold the packed halo values before sending them
 * @param[in, out]	devHaloLines	The 2 halo lines (parallel to the Y direction) that hold the packed halo values after receiving them
 * @param[in, out] 	hostSendLines	The 2 host send buffers that are used during the halo exchange by the normal CUDA & MPI version
 * @param[in, out]	hostRecvLines	The 2 host receive buffers that are used during the halo exchange by the normal CUDA & MPI version
 * @return				The time spent during the MPI transfers
 */
double TransferAllHalos(MPI_Comm cartComm, const int2 * domSize, const int2 * topIndex, const int * neighbors, cudaStream_t copyStream,
	real * devBlocks[2], real * devSideEdges[2], real * devHaloLines[2], real * hostSendLines[2], real * hostRecvLines[2])
{
	real * devSendLines[2] = {devBlocks[0] + domSize->x + 3, devBlocks[0] + domSize->y * (domSize->x + 2) + 1};
	real * devRecvLines[2] = {devBlocks[0] + 1, devBlocks[0] + (domSize->y + 1) * (domSize->x + 2) + 1};
	int yNeighbors[2] = {neighbors[DIR_TOP], neighbors[DIR_BOTTOM]};
	int xNeighbors[2] = {neighbors[DIR_LEFT], neighbors[DIR_RIGHT]};
	int2 order = make_int2(topIndex->x % 2, topIndex->y % 2);
	double transferTime;

	// Populate the block's side edges
	CopyDevSideEdgesFromBlock(devBlocks[0], devSideEdges, domSize, neighbors, copyStream);

	// Exchange data with the top and bottom neighbors
	transferTime = MPI_Wtime();
	ExchangeHalos(cartComm, devSendLines[  order.y  ], hostSendLines[0], hostRecvLines[0], devRecvLines[  order.y  ], yNeighbors[  order.y  ], domSize->x);
	ExchangeHalos(cartComm, devSendLines[1 - order.y], hostSendLines[0], hostRecvLines[0], devRecvLines[1 - order.y], yNeighbors[1 - order.y], domSize->x);
	SafeCudaCall(cudaStreamSynchronize(copyStream));
	
	// Exchange data with the left and right neighbors
	ExchangeHalos(cartComm, devSideEdges[  order.x  ], hostSendLines[1], hostRecvLines[1], devHaloLines[  order.x  ], xNeighbors[  order.x  ], domSize->y);
	ExchangeHalos(cartComm, devSideEdges[1 - order.x], hostSendLines[1], hostRecvLines[1], devHaloLines[1 - order.x], xNeighbors[1 - order.x], domSize->y); 
	transferTime = MPI_Wtime() - transferTime;

	// Copy the received halos to the device block
	CopyDevHalosToBlock(devBlocks[0], devHaloLines[0], devHaloLines[1], domSize, neighbors);

	return transferTime;
}

开发者ID:Haider-BA，项目名称:Matlab2CPP，代码行数:44，代码来源:Host.c

示例10: forward_process_synchronized

void forward_process_synchronized(struct ixmapfwd_thread *thread,
	unsigned int port_index, struct ixmap_packet *packet,
	unsigned int num_packets, struct ixmap_packet_cuda *result, uint8_t *read_buf)
{
	int fd, i;

	cudaStreamSynchronize(thread->stream);

	for(i = 0; i < num_packets; i++){
		if(result[i].outif >= 0){
			ixmap_tx_assign(thread->plane, result[i].outif,
				thread->buf, &packet[i]);
		}else if(result[i].outif == -1){
			goto packet_drop;
		}else{
			goto packet_inject;
		}

		continue;
packet_inject:
		memcpy(read_buf, packet[i].slot_buf, packet[i].slot_size);
		fd = thread->tun_plane->ports[port_index].fd;
		write(fd, read_buf, packet[i].slot_size);
packet_drop:
		ixmap_slot_release(thread->buf, packet[i].slot_index);
	}
	return;
}

开发者ID:edenden，项目名称:ixmap-cuda，代码行数:28，代码来源:forward.c

示例11: CUDA_CHECK

void BasePrefetchingDataLayer<Dtype>::InternalThreadEntry() {
#ifndef CPU_ONLY
  cudaStream_t stream;//创建CUDA stream，非阻塞类型
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  }
#endif

  try {
    while (!must_stop()) { //循环载入批量数据
      Batch<Dtype>* batch = prefetch_free_.pop();//拿到一个空闲batch
      load_batch(batch);//载入批量数据
#ifndef CPU_ONLY
      if (Caffe::mode() == Caffe::GPU) {
        batch->data_.data().get()->async_gpu_push(stream);
        if (this->output_labels_) {
          batch->label_.data().get()->async_gpu_push(stream);
        }
        CUDA_CHECK(cudaStreamSynchronize(stream));//同步到GPU
      }
#endif
      prefetch_full_.push(batch);//加入到带负载的Batch队列中
    }
  } catch (boost::thread_interrupted&) {//捕获异常，退出while循环
    // Interrupted exception is expected on shutdown
  }
#ifndef CPU_ONLY
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaStreamDestroy(stream));//销毁CUDA stream
  }
#endif
}

开发者ID:huanyii，项目名称:caffe_read，代码行数:32，代码来源:base_data_layer.cpp

示例12: LAMA_ASSERT_DEBUG

void CUDABLAS1::scal( IndexType n, const double alpha, double* x_d, const IndexType incx, SyncToken* syncToken )
{
    LAMA_CHECK_CUDA_ACCESS

    cudaStream_t stream = NULL;

    if ( syncToken )
    {
        CUDAStreamSyncToken* cudaStreamSyncToken = dynamic_cast<CUDAStreamSyncToken*>( syncToken );
        LAMA_ASSERT_DEBUG( cudaStreamSyncToken, "no cuda stream sync token provided" )
        stream = cudaStreamSyncToken->getCUDAStream();
    }

    cublasSetKernelStream( stream );
    LAMA_CHECK_CUBLAS_ERROR

    cublasDscal( n, alpha, x_d, incx );

    // No error check here possible as kernel is started asynchronously

    if ( !syncToken )
    {
        cudaStreamSynchronize( 0 );
        LAMA_CHECK_CUDA_ERROR
    }

开发者ID:fast-project，项目名称:lama，代码行数:25，代码来源:CUDABLAS1.cpp

示例13: advance_generations

/* 
 * Advance the simulation by <n> generations by mapping the OpenGL pixel buffer
 * objects for writing from CUDA, executing the kernel <n> times, and unmapping
 * the pixel buffer object.
 */
void advance_generations(unsigned long n)
{
	uint8_t* device_bufs[2];
	size_t size;

	DEBUG2("Mapping CUDA resources and retrieving device buffer pointers\n");
	cudaGraphicsMapResources(2, cuda_graphics_resources, (cudaStream_t)0);

	cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[0], &size, 
								cuda_graphics_resources[0]);

	cudaGraphicsResourceGetMappedPointer((void**)&device_bufs[1], &size, 
								cuda_graphics_resources[1]);

	check_cuda_error();

	while (n--) {

		DEBUG2("Launching kernel (grid.width = %u, grid.height = %u)\n",
				grid.width, grid.height);

		launch_kernel(device_bufs[grid.which_buf], device_bufs[!grid.which_buf], 
									grid.width, grid.height);

		grid.which_buf ^= 1;
	}

	DEBUG2("Unmapping CUDA resources\n");

	cudaGraphicsUnmapResources(2, cuda_graphics_resources, (cudaStream_t)0);
	cudaStreamSynchronize(0);
}

开发者ID:anojavan，项目名称:csinparallel，代码行数:37，代码来源:cuda.c

示例14: CHECK

void NCCL<Dtype>::run(int layer) {
  CHECK(solver_->param().layer_wise_reduce());
  vector<shared_ptr<Blob<Dtype> > >& blobs =
    solver_->net()->layers()[layer]->blobs();
#ifdef DEBUG
  // Assert blobs are contiguous to reduce in one step (e.g. bias often small)
  for (int i = 1; i < blobs.size(); ++i) {
    CHECK_EQ(blobs[i - 1]->gpu_diff() + blobs[i - 1]->count(),
             blobs[i + 0]->gpu_diff());
  }
#endif
  if (blobs.size() > 0) {
    // Make sure default stream is done computing gradients. Could be
    // replaced by cudaEventRecord+cudaStreamWaitEvent to avoid
    // blocking the default stream, but it's actually slower.
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));

    // Reduce asynchronously
    int size = 0;
    for (int i = 0; i < blobs.size(); ++i) {
      size += blobs[i]->count();
    }
    if (barrier_) {  // NULL in multi process case
      barrier_->wait();
    }
    NCCL_CHECK(ncclAllReduce(blobs[0]->mutable_gpu_diff(),
                             blobs[0]->mutable_gpu_diff(),
                             size,
                             nccl::dataType<Dtype>::type,
                             ncclSum, comm_, stream_));
    caffe_gpu_scal(size, (Dtype) 1.0 / Caffe::solver_count(),
                   blobs[0]->mutable_gpu_diff(), stream_);
  }
}

开发者ID:20337112，项目名称:caffe，代码行数:34，代码来源:parallel.cpp

示例15: while

void KMeansController<Dtype>::PostProcess() {
    /** Use the cluster center calculated to get cluster labels**/

    int* label_data = this->int_outputs_[0]->mutable_cpu_data();

    // first restart the data provider.
    this->data_providers_[0]->ForceRestart();

    // assign cluster labels
    bool assignment_finished = false;
    while (!assignment_finished) {
        size_t batch_num = 0;
        size_t index = this->data_providers_[0]->current_index();

        // getting current index must precede GetData()
        // otherwise the current index in the data provide would be advanced
        this->mats_[0] = this->data_providers_[0]->GetData(batch_num);
        if ( this->data_providers_[0]->current_index()  == 0) {
            assignment_finished = true;
        }
        //execute only the maximization functions for all samples
        this->function_input_vecs_[0][0] = this->mats_[0].get();
        this->funcs_[0]->Execute(this->function_input_vecs_[0], this->function_output_vecs_[0], GKMeans::stream(0));
        CUDA_CHECK(cudaStreamSynchronize(GKMeans::stream(0)));

        // copy out cluster labels
        int* out_data = (int*)this->mats_[2]->cpu_data();
        for (size_t i = 0; i < batch_num; i++) {
            label_data[i + index] = out_data[i];
        }
    }

    // copy out center data
    memcpy(this->numeric_outputs_[0]->mutable_cpu_data(), this->mats_[1]->cpu_data(), this->mats_[1]->count() * sizeof(Dtype));
}

开发者ID:CUHK-MMLAB，项目名称:gkmeans，代码行数:35，代码来源:controllers.cpp

示例16: cudaStreamSynchronize

void octree::resetCompact()
{
  // reset counts to 1 so next compact proceeds...
  cudaStreamSynchronize(execStream->s()); // must make sure any outstanding compact is finished  
  this->devMemCountsx[0] = 1;
  this->devMemCountsx.h2d(1, false, copyStream->s());
}

开发者ID:BerndDoser，项目名称:Bonsai，代码行数:7，代码来源:load_kernels.cpp

示例17: CUDA_CHECK

void P2PSync<Dtype>::on_gradients_ready(Timer* timer, ostringstream* timing) {
#ifndef CPU_ONLY
#ifdef DEBUG
  int device;
  CUDA_CHECK(cudaGetDevice(&device));
  CHECK(device == solver_->param().device_id());
#endif

  // Sum children gradients as they appear in the queue
  for (int i = 0; i < children_.size(); ++i) {
    timer->Start();
    P2PSync<Dtype> *child = queue_.pop();
    Dtype* src = child->parent_grads_;
    Dtype* dst = diff_;

#ifdef DEBUG
    bool ok = false;
    for (int j = 0; j < children_.size(); ++j) {
      if (child == children_[j]) {
        ok = true;
      }
    }
    CHECK(ok);
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == device);
#endif

    caffe_gpu_add(size_, src, dst, dst);
    *timing << " add_grad: " << timer->MilliSeconds();
  }

  // Send gradients to parent
  if (parent_) {
    timer->Start();
    Dtype* src = diff_;
    Dtype* dst = parent_grads_;

#ifdef DEBUG
    cudaPointerAttributes attributes;
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, src));
    CHECK(attributes.device == device);
    CUDA_CHECK(cudaPointerGetAttributes(&attributes, dst));
    CHECK(attributes.device == parent_->solver_->param().device_id());
#endif

    CUDA_CHECK(cudaMemcpyAsync(dst, src, size_ * sizeof(Dtype),  //
        cudaMemcpyDeviceToDevice, cudaStreamDefault));
    CUDA_CHECK(cudaStreamSynchronize(cudaStreamDefault));
    parent_->queue_.push(this);
    *timing << " send_grad: " << timer->MilliSeconds();
  } else {
    // Loss functions divide gradients by the batch size, so to compensate
    // for split batch, the root solver divides by number of solvers.
    caffe_gpu_scal(size_, Dtype(1.0 / Caffe::solver_count()), diff_);
  }
#endif
}

开发者ID:bbshocking，项目名称:caffe，代码行数:60，代码来源:parallel.cpp

示例18: TEST

TEST(MemcpyAsync, CheckReturnValues) {
    /**
     * The API documentation states that
     * cudaErrorInvalidDevicePointer is a valid return value for
     * cudaMemcpyAsync
     *
     * TODO;  This needs a test.
     */
    cudaError_t ret;
    cudaStream_t stream;

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    /**
     * Test woefully out of range directions.
     */
    int a = 0;
    ret = cudaMemcpyAsync(&a,   &a,   sizeof(a), (cudaMemcpyKind) -1, stream);
    EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);

    ret = cudaMemcpyAsync(NULL, NULL, sizeof(a), (cudaMemcpyKind) -1, stream);
    EXPECT_EQ(cudaErrorInvalidMemcpyDirection, ret);

    ret = cudaStreamSynchronize(stream);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}

开发者ID:ckennelly，项目名称:panoptes，代码行数:30，代码来源:vtest_memcpyasync.cpp

示例19: TEST_P

TEST_P(MemcpyAsync, H2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *h1;
    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaHostAlloc(&h1, alloc, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d1, h1, alloc, cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(h1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}

开发者ID:ckennelly，项目名称:panoptes，代码行数:31，代码来源:vtest_memcpyasync.cpp

示例20: CUDA_CHECK

  shared_ptr<Mat<Dtype>> DataProviderBase<Dtype>::GetData(size_t &num){

    //first join the async task
    num = num_future_.get();

    //also sync the cuda strea
    CUDA_CHECK(cudaStreamSynchronize(data_stream_));

    // get the mat with data filled
    int ready_idx = data_q_.front();
    shared_ptr<Mat<Dtype>> ready_mat = data_slot_vec_[ready_idx];

    // send the id of the mat to the end of the queue
    data_q_.pop_front(); data_q_.push_back(ready_idx);

    // get the mat to be filled (id at the front of the queue)
    Mat<Dtype>* working_mat = data_slot_vec_[data_q_.front()].get();
    // kickout async task (force async launch)
    num_future_ = std::async(std::launch::async, &DataProviderBase::AsyncFunc, this, working_mat);

    // increment the current data cursor.
    current_index_ += num;
    if (current_index_ == round_size_){
      current_index_ = 0;
    }

    return ready_mat;
  }

开发者ID:caomw，项目名称:gkmeans，代码行数:28，代码来源:base_data_provider.cpp

注：本文中的cudaStreamSynchronize函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。

鲜花

握手

雷人

路过

鸡蛋

该文章已有0人参与评论

请发表评论

全部评论

专题导读

More+

10-27 六六分期app的软件客服如何联系？(六六分期

11-06 可心卡盟:win10系统火狐flash插件崩溃怎么

11-06 亲亲特价:怎么删除回收站图标

11-06 济南大学虚拟社区:鲁大师节能降温的具体办

11-06 xlueops.exe:无线网络安装向导

11-06 女斗合众国:win7系统cf与主机连接不稳定怎

11-06 0xc000022-[cf烟雾头]cf怎么调烟雾头

11-06 qizideyouhuo:应用程序无法正常启动0xc0000

11-06 ipz-185:win7系统vcf文件怎么打开

11-06 傻哥蹦迪:win10系统s4怎么打开usb调试

11-06 八神浩树gtaste:回收站清空了怎么恢复

11-06 妖尾之黑色守护:win10系统电脑没有1440x900

11-06 校园至尊魔王小说:win7系统浏览网页时字体

11-06 女斗合众国:win10系统访问共享文件夹提示请

11-06 tokyo hot n0654:恢复win7系统默认字体一招

11-06 雨酷仙境:设置win7系统转移临时文件夹腾出

11-06 阿穆纳伊之杖:win7系统开始菜单在右边还原

11-06 tunespotting:win10系统火狐flash插件总是

11-06 甘尔葛分析师：计谋网站seo关键词暴涨有什

11-06 蔡贵霖: 计谋网站seo关键词暴涨有什么秘密

11-06 博益网首页:ao3网页版进入不了解决方法

11-06 漏斗子专栏: 网站数据分析小白易懂精华篇

11-06 见证双虹怎么做:win7系统开启telnet命令的

11-06 颾狐蝶蜋:系统资源不足无法完成请求的服务

11-06 国光中学校歌:提交网站到alexa查询详细步骤

11-06 西安有情天:静态网页和动态网页的区别

11-06 红木雅尚斋:外部链接构造对网站的好处

11-06 前官礼遇：防止域名劫持–增强域安全性的10

11-06 密传二转答案: 中文分词算法有哪些

11-06 金泉家园邮编:百度快照劫持的表现及应对方

C++ cudaThreadSynchronize函数代码示例发布时间：2022-05-30

C++ cudaStreamDestroy函数代码示例发布时间：2022-05-30

剪的笔顺,诠释剪的笔画,认识剪的部首

1 六六分期app的软件客服如何联系？(六六分期

六六分期app的软件客服如何联系？不知道吗？加qq群【895510560】即可！标题：六六分期

阅读：18094|2023-10-27

2 可心卡盟:win10系统火狐flash插件崩溃怎么

今天小编告诉大家如何处理win10系统火狐flash插件总是崩溃的问题，可能很多用户都不知

阅读：9618|2022-11-06

3 亲亲特价:怎么删除回收站图标

今天小编告诉大家如何对win10系统删除桌面回收站图标进行设置，可能很多用户都不知道

阅读：8149|2022-11-06

4 济南大学虚拟社区:鲁大师节能降温的具体办

今天小编告诉大家如何对win10系统电脑设置节能降温的设置方法，想必大家都遇到过需要

阅读：8530|2022-11-06

5 xlueops.exe:无线网络安装向导

我们在使用xp系统的过程中,经常需要对xp系统无线网络安装向导设置进行设置，可能很多

阅读：8432|2022-11-06

6 女斗合众国:win7系统cf与主机连接不稳定怎

今天小编告诉大家如何处理win7系统玩cf老是与主机连接不稳定的问题，可能很多用户都不

阅读：9347|2022-11-06

7 0xc000022-[cf烟雾头]cf怎么调烟雾头

电脑对日常生活的重要性小编就不多说了，可是一旦碰到win7系统设置cf烟雾头的问题，很

阅读：8397|2022-11-06

8 qizideyouhuo:应用程序无法正常启动0xc0000

我们在日常使用电脑的时候，有的小伙伴们可能在打开应用的时候会遇见提示应用程序无法

阅读：7833|2022-11-06

9 ipz-185:win7系统vcf文件怎么打开

今天小编告诉大家如何对win7系统打开vcf文件进行设置，可能很多用户都不知道怎么对win

阅读：8387|2022-11-06

10 傻哥蹦迪:win10系统s4怎么打开usb调试

今天小编告诉大家如何对win10系统s4开启USB调试模式进行设置，可能很多用户都不知道怎

阅读：7380|2022-11-06

客服电话

电子邮件

C++ cudaStreamSynchronize函数代码示例

示例1: collect_final_result_dsyrk_syr2k

示例2: collect_final_result_zgemm

示例3: MyStreamSynchronize

示例4: collect_final_result_dtrsm_mode_1

示例5: get_gpu

示例6: dw_common_codelet_update_u11

示例7: CAFFE1_CUDA_CHECK

示例8: strmm_cuda

示例9: TransferAllHalos

示例10: forward_process_synchronized

示例11: CUDA_CHECK

示例12: LAMA_ASSERT_DEBUG

示例13: advance_generations

示例14: CHECK

示例15: while

示例16: cudaStreamSynchronize

示例17: CUDA_CHECK

示例18: TEST

示例19: TEST_P

示例20: CUDA_CHECK

请发表评论

全部评论

上一篇：

下一篇：

PacktPublishing/Python-Machine-Learning-

BentoBoxWorld/AOneBlock: A OneBlock Mine

鲁东大学一米网:Win7系统USB驱动器RAM的操

manur/MATLAB-git: A thin MATLAB wrapper

Matlab中点乘和乘的区别

剪的笔顺,诠释剪的笔画,认识剪的部首

六六分期app的软件客服如何联系？(六六分期

florent37/ViewAnimator: A fluent Android

florent37/Shrine-MaterialDesign2: implem

CVE-2020-36276

SimpleSoftwareIO/simple-sms: Send and re

关于我们

产品与服务

解决方案

139-2527-9053