This article collects typical usage examples of the C++ function cudaThreadSynchronize. If you have been wondering what cudaThreadSynchronize does, how to call it, or what real-world usage looks like, the hand-picked examples below should help.
Twenty code examples of cudaThreadSynchronize are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better C++ code examples.
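Before the examples, a brief orientation: cudaThreadSynchronize() blocks the calling host thread until all previously issued device work has completed, and returns any asynchronous error that work produced. It has been deprecated since CUDA 4.0 in favor of the identically behaving cudaDeviceSynchronize(), which is why it mostly appears in older code such as the snippets below. The following minimal sketch (our own illustration, not taken from any of the projects below) shows the canonical pattern: launch a kernel asynchronously, then synchronize before reading results.

#include <cstdio>
#include <cuda_runtime.h>

// Scale every element of the array in place.
__global__ void scaleKernel(float *data, float factor, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        data[i] *= factor;
}

int main()
{
    const int n = 1024;
    float h_data[n];
    for (int i = 0; i < n; ++i)
        h_data[i] = 1.0f;

    float *d_data = NULL;
    cudaMalloc((void**)&d_data, n * sizeof(float));
    cudaMemcpy(d_data, h_data, n * sizeof(float), cudaMemcpyHostToDevice);

    // Kernel launches are asynchronous: control returns to the host immediately.
    scaleKernel<<<(n + 255) / 256, 256>>>(d_data, 2.0f, n);

    // Block until the device has finished and pick up any asynchronous error.
    // Deprecated since CUDA 4.0; on current toolkits use cudaDeviceSynchronize().
    cudaError_t err = cudaThreadSynchronize();
    if (err != cudaSuccess)
        printf("kernel failed: %s\n", cudaGetErrorString(err));

    cudaMemcpy(h_data, d_data, n * sizeof(float), cudaMemcpyDeviceToHost);
    printf("h_data[0] = %f\n", h_data[0]); // expected: 2.000000
    cudaFree(d_data);
    return 0;
}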
Example 1: runCalcU
Fflcun2::ParticlesPosition Fflcun2::run(int nSteps) {
    for (int j = 0; j < nSteps; j++) {
        runCalcU(blocks, threads, devMatrixes);
        cudaThreadSynchronize();

        if (cPar->ft == ConstParams::ROTATING) {
            chPar->hExtX = cPar->hExtXInit * sin(2 * M_PI * cPar->rff * chPar->time);
            chPar->hExtY = cPar->hExtYInit * cos(2 * M_PI * cPar->rff * chPar->time);
        }
        fillGloabalChangable(chPar);
        cudaThreadSynchronize();

        runOneStep(blocks, threads, devMatrixes);
        cudaThreadSynchronize();

        runApplyDeltas(blocks, threads, devMatrixes);
        cudaThreadSynchronize();

        chPar->time += chPar->dTimeCurrent;
    }
    cudaMemcpy(partPos.x, devMatrixes.x, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
    cudaMemcpy(partPos.y, devMatrixes.y, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
    cudaMemcpy(partPos.z, devMatrixes.z, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
    cudaMemcpy(partPos.theta, devMatrixes.theta, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);
    cudaMemcpy(partPos.phy, devMatrixes.phy, sizeof(float) * cPar->nPart, cudaMemcpyDeviceToHost);

    return partPos;
}
Author: psci2195, Project: fflcu, Lines: 34, Source: fflcun2.cpp
Example 2: benchmark
void
benchmark(int iterations)
{
    // allocate memory for result
    unsigned int *d_result;
    unsigned int size = width * height * sizeof(unsigned int);
    cutilSafeCall(cudaMalloc((void**)&d_result, size));

    // warm-up
    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    cutilSafeCall(cudaThreadSynchronize());
    cutilCheckError(cutStartTimer(timer));

    // execute the kernel
    for (int i = 0; i < iterations; i++) {
        gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
    }
    cutilSafeCall(cudaThreadSynchronize());
    cutilCheckError(cutStopTimer(timer));

    // check if kernel execution generated an error
    cutilCheckMsg("Kernel execution failed");

    printf("Processing time: %f (ms)\n", cutGetTimerValue(timer));
    printf("%.2f Mpixels/sec\n", (width*height*iterations / (cutGetTimerValue(timer) / 1000.0f)) / 1e6);

    cutilSafeCall(cudaFree(d_result));
}
Author: AnkurAnandapu, Project: ocelot-fork, Lines: 30, Source: recursiveGaussian-host.cpp
Example 3: device_X
void RadialBasisFunction::Train(HostMatrix<float> &Input, HostMatrix<float> &Target){
    //std::cout << "Training" << std::endl;
    //c_width = (float*) malloc(sizeof(float)*network_size);
    //memset(c_width,0,sizeof(float)*network_size);

    DeviceMatrix<float> device_X(Input);

    //std::cout << "KMeans" << std::endl;
    clock_t initialTime = clock();
    KMeans KM;
    KM.SetSeed(seed);
    dCenters = KM.Execute(device_X, network_size);
    cudaThreadSynchronize();
    times[0] = (clock() - initialTime);

    //std::cout << "Adjust Widths" << std::endl;
    /* Adjust width using mean of distance to neighbours */
    initialTime = clock();
    AdjustWidths(number_neighbours);
    cudaThreadSynchronize();
    times[1] = (clock() - initialTime);

    /* Training weights and scaling factor */
    HostMatrix<float> TargetArr(Target.Rows(), NumClasses);
    memset(TargetArr.Pointer(), 0, sizeof(float)*TargetArr.Elements());
    for (int i = 0; i < Target.Rows(); i++) {
        TargetArr(i, ((int)Target(i,0) - 1)) = 1;
    }
    DeviceMatrix<float> d_Target(TargetArr);

    //std::cout << "Calculating Weights" << std::endl;
    initialTime = clock();
    DeviceMatrix<float> device_activ_matrix(device_X.Rows(), dCenters.Rows(), ColumnMajor);
    KernelActivationMatrix(device_activ_matrix.Pointer(), device_X.Pointer(), dCenters.Pointer(),
                           device_X.Columns(), dCenters.Columns(), device_activ_matrix.Columns(),
                           device_activ_matrix.Rows(), scaling_factor, device_c_width.Pointer());
    DeviceMatrix<float> d_Aplus = UTILS::pseudoinverse(device_activ_matrix);
    dWeights = DeviceMatrix<float>(d_Aplus.Rows(), d_Target.Columns());
    d_Aplus.Multiply(d_Aplus, d_Target, dWeights);

    /* Return Weights and Centers */
    cudaThreadSynchronize();
    times[2] = (clock() - initialTime);

    //cudaMemcpy(c_width,device_c_width.Pointer(),sizeof(float)*device_c_width.Length(),cudaMemcpyDeviceToHost);
    //this->Weights = HostMatrix<float>(dWeights);
    //this->Centers = HostMatrix<float>(dCenters);
}
Author: kallaballa, Project: Neurocid, Lines: 60, Source: RadialBasisFunction.cpp
Example 4: cudaGraphicsMapResources
//-----------------------------------------------------------------------------
void QGLImageGpuWidget::fillPbo(iu::ImageGpu_8u_C4* output)
{
    // map GL <-> CUDA resource
    uchar4 *d_dst = NULL;
    size_t start;
    cudaGraphicsMapResources(1, &cuda_pbo_resource_, 0);
    cudaGraphicsResourceGetMappedPointer((void**)&d_dst, &start, cuda_pbo_resource_);

    // get image data
    iuprivate::cuCopyImageToPbo(image_, num_channels_, bit_depth_, d_dst, min_, max_);
    cudaThreadSynchronize();

    // get overlays
    iuprivate::OverlayList::iterator it;
    for (it = overlay_list_.begin(); it != overlay_list_.end(); it++)
        if ((*it)->isActive())
            cuCopyOverlayToPbo((*it), d_dst, image_->size());
    cudaThreadSynchronize();

    if (output != NULL)
    {
        // copy final pbo to output
        iu::ImageGpu_8u_C4 temp(d_dst, image_->width(), image_->height(),
                                image_->width()*sizeof(uchar4), true);
        iu::copy(&temp, output);
    }

    // unmap GL <-> CUDA resource
    cudaGraphicsUnmapResources(1, &cuda_pbo_resource_, 0);
}
Author: ankurhanda, Project: imageutilities, Lines: 31, Source: qglimagegpuwidget.cpp
Example 5: cudaThreadSynchronize
OsdCudaGLVertexBuffer::~OsdCudaGLVertexBuffer() {
    cudaThreadSynchronize();
    unmap();
    cudaGraphicsUnregisterResource(_cudaResource);
    cudaThreadSynchronize();
    glDeleteBuffers(1, &_vbo);
}
Author: chrislowe5, Project: OpenSubdiv-dev, Lines: 8, Source: cudaGLVertexBuffer.cpp
Example 6: main
int main(int argc, char** argv)
{
    float fTotalTime = 0;
    //int TARGET_WIDTH=atoi(argv[2]);
    //int TARGET_HEIGHT=atoi(argv[3]);
    //bool visualize_results=atoi(argv[4]);
    //unsigned int kernel_size=atoi(argv[2]);
    int gpuNr = atoi(argv[2]);
    checkCudaErrors(cudaSetDevice(gpuNr));

    IplImage* gray_image = cvLoadImage(argv[1], CV_LOAD_IMAGE_GRAYSCALE);
    unsigned char * d_input_image;
    unsigned char * d_output_image;
    int widthImage = gray_image->width;
    int heightImage = gray_image->height;

    IplImage *output_image = cvCreateImage(cvSize(widthImage, heightImage), IPL_DEPTH_8U, 1);
    for (int i = 0; i < heightImage; i++)
        for (int j = 0; j < widthImage; j++)
            output_image->imageData[i*widthImage+j] = 255;

    unsigned int * d_histogram;
    int total_threads = 256;
    cudaMalloc(&d_histogram, sizeof(unsigned int)*256*total_threads);
    checkCudaErrors(cudaMalloc(&d_input_image, widthImage*heightImage*sizeof(unsigned char)));
    checkCudaErrors(cudaMalloc(&d_output_image, widthImage*heightImage*sizeof(unsigned char)));
    checkCudaErrors(cudaMemcpy(d_input_image, gray_image->imageData, widthImage*heightImage*sizeof(unsigned char), cudaMemcpyHostToDevice));

    unsigned int windows_array[4] = {15, 17, 25, 31};
    int total_implementations = 4;
    double elapsed_time;
    for (int i = 1; i <= total_implementations; i++)
    {
        for (int j = 0; j < 4; j++)
        {
            timer my_timer;
            MedianFilterUcharCUDA(d_input_image, d_output_image, d_histogram, widthImage, heightImage, windows_array[j], 16, 16, i);
            cudaThreadSynchronize();
            elapsed_time = my_timer.elapsed();
            printf("elapsed_time for implementation %d for window size %u was %f \n", i, windows_array[j], elapsed_time);
        }
    }

    timer array_timer;
    arrayFireRows(d_input_image, d_output_image, widthImage, heightImage, 3, 16, 16);
    cudaThreadSynchronize();
    elapsed_time = array_timer.elapsed();
    printf("elapsed_time for array fire was %f \n", elapsed_time);

    checkCudaErrors(cudaMemcpy(output_image->imageData, d_output_image, widthImage*heightImage*sizeof(unsigned char), cudaMemcpyDeviceToHost));
    //_medianfilter((unsigned char *)gray_image->imageData, (unsigned char *)output_image->imageData, widthImage, heightImage);
    cvSaveImage("output.jpg", output_image);
    return 0;
}
Author: aglenis, Project: gpu_medfilter, Lines: 58, Source: main_large_windows.cpp
Example 7: checkCudaErrors
void
CudaInterface::fillParamMem(ParamMem_t& pmem, int byteVal) {
    checkCudaErrors(cudaSetDevice(mDevID));
    checkCudaErrors(cudaGetDevice(&mDevID));
    std::cout << " setting " << pmem.totalSize * sizeof(float) << " bytes to " << pmem.base << "\n";
    if (pmem.device) {
        checkCudaErrors(cudaThreadSynchronize());
        checkCudaErrors(cudaMemset(pmem.base, byteVal, pmem.totalSize * sizeof(float)));
        checkCudaErrors(cudaThreadSynchronize());
    } else
        memset(pmem.base, byteVal, pmem.totalSize * sizeof(float));
}
Author: ethancaballero, Project: Multiplicative-Recursive-Neural-Net, Lines: 12, Source: host-device_interface.cpp
Example 8: time_ongpu
void time_ongpu(int TA, int TB, int m, int k, int n)
{
    int iter = 10;
    float *a = random_matrix(m,k);
    float *b = random_matrix(k,n);

    int lda = (!TA)?k:m;
    int ldb = (!TB)?n:k;

    float *c = random_matrix(m,n);

    float *a_cl = cuda_make_array(a, m*k);
    float *b_cl = cuda_make_array(b, k*n);
    float *c_cl = cuda_make_array(c, m*n);

    int i;
    clock_t start = clock(), end;
    for (i = 0; i < iter; ++i) {
        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
        cudaThreadSynchronize();
    }
    double flop = ((double)m)*n*(2.*k + 2.)*iter;
    double gflop = flop/pow(10., 9);
    end = clock();
    double seconds = sec(end-start);
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n", m,k,k,n, TA, TB, seconds, gflop/seconds);

    cuda_free(a_cl);
    cuda_free(b_cl);
    cuda_free(c_cl);
    free(a);
    free(b);
    free(c);
}
Author: hyperchris, Project: Yolo, Lines: 33, Source: gemm.c
Example 9: cudaGetDeviceProperties
void Fflcun2::setConfig(std::string fname) { // WARNING - test it
    this->freeMemory();
    cPar = new ConstParams;
    chPar = new ChangableParams;
    configFileName = fname;
    this->setSettings();

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    // TODO - throw exception if no device found
    blocks = ceil((float)cPar->nPart / SHARED_ARRAY);
    blocks += blocks % deviceProp.multiProcessorCount;
    threads = cPar->nPart / blocks;
    cPar->nPart = threads * blocks;
    std::cout << "After correction number of particles = " << cPar->nPart << std::endl;

    srand(time(NULL));
    this->allocMemory();
    this->initDevMatrixes();
    fillGloabalChangable(chPar);
    fillGloabalConstant(cPar);
    runSetupKernel(blocks, threads, devMatrixes);
    cudaThreadSynchronize();
}
Author: psci2195, Project: fflcu, Lines: 28, Source: fflcun2.cpp
Example 10: cufftDestroy
void gpuNUFFT::GpuNUFFTOperator::freeDeviceMemory(int n_coils)
{
    if (!gpuMemAllocated)
        return;

    // Destroy the cuFFT plan.
    cufftDestroy(fft_plan);
    if (DEBUG && (cudaThreadSynchronize() != cudaSuccess))
        printf("error at thread synchronization 9: %s\n", cudaGetErrorString(cudaGetLastError()));
    freeLookupTable();

    freeTotalDeviceMemory(data_indices_d, data_sorted_d, crds_d, gdata_d, sectors_d, sector_centers_d, NULL); // NULL marks the end of the list
    if (n_coils > 1 && deapo_d != NULL)
        cudaFree(deapo_d);
    if (this->applySensData())
        cudaFree(sens_d);
    if (this->applyDensComp())
        cudaFree(density_comp_d);

    showMemoryInfo();
    gpuMemAllocated = false;
}
Author: davidssmith, Project: TRON, Lines: 25, Source: gpuNUFFT_operator.cpp
Example 11: mpla_generic_dgemv
void mpla_generic_dgemv(struct mpla_vector* b, struct mpla_generic_matrix* A, struct mpla_vector* x, void (*mpla_dgemv_core)(struct mpla_vector*, struct mpla_generic_matrix*, struct mpla_vector*, struct mpla_instance*), struct mpla_instance* instance)
{
    // allocate redistributed vector
    struct mpla_vector x_redist;
    mpla_init_vector_for_block_rows(&x_redist, instance, x->vec_row_count);

    // redistribute input vector from row-block to column-block parallel distribution
    mpla_redistribute_vector_for_generic_dgesv(&x_redist, x, A, instance);

    // generic computation core: matrix-vector product
    mpla_dgemv_core(b, A, &x_redist, instance);

    // create sub-communicator for each process row
    int remain_dims[2];
    remain_dims[0] = 0;
    remain_dims[1] = 1;
    MPI_Comm row_comm;
    MPI_Cart_sub(instance->comm, remain_dims, &row_comm);

    // summation of block row results
    double* sum;
    cudaMalloc((void**)&sum, sizeof(double)*b->cur_proc_row_count);
    cudaThreadSynchronize();
    checkCUDAError("cudaMalloc");
    MPI_Allreduce(b->data, sum, b->cur_proc_row_count, MPI_DOUBLE, MPI_SUM, row_comm);
    cudaMemcpy(b->data, sum, sizeof(double)*b->cur_proc_row_count, cudaMemcpyDeviceToDevice);

    // cleanup
    cudaFree(sum);
    mpla_free_vector(&x_redist, instance);
    MPI_Comm_free(&row_comm);
}
Author: zaspel, Project: MPLA, Lines: 33, Source: mpla.cpp
Example 12: cudaLaunch
cudaError_t cudaLaunch(const void *entry)
{
    static cudaError_t (*nv_cudaLaunch)(const void *) = NULL;
    cudaError_t ret;
    struct timeval t;

    if (!nv_cudaLaunch) {
        nv_cudaLaunch = dlsym(RTLD_NEXT, "cudaLaunch");
        if (!nv_cudaLaunch) {
            fprintf(stderr, "failed to find symbol cudaLaunch: %s\n", dlerror());
            return cudaErrorSharedObjectSymbolNotFound;
        }
    }

    gettimeofday(&t, NULL);
    printf("[gvm] %lf intercepting cudaLaunch\n", t.tv_sec + t.tv_usec / 1000000.0);
    ret = nv_cudaLaunch(entry);
    cudaThreadSynchronize();
    gettimeofday(&t, NULL);
    printf("[gvm] %lf intercepted cudaLaunch\n", t.tv_sec + t.tv_usec / 1000000.0);
    return ret;
}
Author: arnaudperin, Project: gpudb, Lines: 27, Source: intercept.c
Example 13: synchronize
/**
 * Synchronize with device.
 *
 * @param sync True to synchronize, false if not.
 */
inline void synchronize(const bool sync = true) {
#ifdef ENABLE_CUDA
    if (sync) {
        CUDA_CHECKED_CALL(cudaThreadSynchronize());
    }
#endif
}
Author: JohannesBuchner, Project: LibBi, Lines: 12, Source: cuda.hpp
Example 14: mvReductArraysToHost
void
mvReductArraysToHost(int reduct_bytes)
{
    cutilSafeCall(cudaMemcpy(OP_reduct_h, OP_reduct_d, reduct_bytes,
                             cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaThreadSynchronize());
}
Author: doru1004, Project: OP2-Common, Lines: 7, Source: op_cuda_rt_support.c
Example 15: mvConstArraysToDevice
void
mvConstArraysToDevice(int consts_bytes)
{
    cutilSafeCall(cudaMemcpy(OP_consts_d, OP_consts_h, consts_bytes,
                             cudaMemcpyHostToDevice));
    cutilSafeCall(cudaThreadSynchronize());
}
Author: doru1004, Project: OP2-Common, Lines: 7, Source: op_cuda_rt_support.c
Example 16: cudaMemcpy
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind)
{
    static cudaError_t (*nv_cudaMemcpy)(void *, const void *, size_t, enum cudaMemcpyKind) = NULL;
    cudaError_t ret;
    struct timeval t;

    if (!nv_cudaMemcpy) {
        nv_cudaMemcpy = dlsym(RTLD_NEXT, "cudaMemcpy");
        if (!nv_cudaMemcpy) {
            fprintf(stderr, "failed to find symbol cudaMemcpy: %s\n", dlerror());
            return cudaErrorSharedObjectSymbolNotFound;
        }
    }

    gettimeofday(&t, NULL);
    printf("[gvm] %lf intercepting cudaMemcpy\n", t.tv_sec + t.tv_usec / 1000000.0);
    ret = nv_cudaMemcpy(dst, src, count, kind);
    cudaThreadSynchronize();
    gettimeofday(&t, NULL);
    printf("[gvm] %lf intercepted cudaMemcpy( %lx %lx %ld %d ) = %d\n", t.tv_sec + t.tv_usec / 1000000.0, (unsigned long)dst, (unsigned long)src, count, kind, (int)ret);
    return ret;
}
Author: arnaudperin, Project: gpudb, Lines: 26, Source: intercept.c
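A note on Examples 12 and 16, both from the same intercept.c: they rely on dynamic-linker interposition. dlsym(RTLD_NEXT, "cudaLaunch") resolves the next definition of the symbol in the library search order, so the wrapper can log a timestamp, forward the call to the real CUDA runtime, and log again; the cudaThreadSynchronize() after the forwarded call makes the second timestamp reflect completion of the device work rather than just the asynchronous launch (at the cost of serializing the application). Such a shim would typically be compiled into a shared object and injected via LD_PRELOAD, along these lines (file and binary names here are illustrative, not taken from the source project): gcc -shared -fPIC intercept.c -o libintercept.so -ldl, then LD_PRELOAD=./libintercept.so ./my_cuda_app.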
Example 17: time_invocation_cuda
template<typename Function, typename Arg1, typename Arg2, typename Arg3>
double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3)
{
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    for (std::size_t i = 0; i < num_trials; ++i)
    {
        f(arg1, arg2, arg3);
    }
    cudaEventRecord(stop);
    cudaThreadSynchronize();

    float msecs = 0;
    cudaEventElapsedTime(&msecs, start, stop);

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // return mean msecs
    return msecs / num_trials;
}
Author: egaburov, Project: bulk, Lines: 25, Source: time_invocation_cuda.hpp
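A note on Example 17: the cudaThreadSynchronize() after cudaEventRecord(stop) guarantees that the stop event has completed before cudaEventElapsedTime reads it; cudaEventSynchronize(stop) would be the more narrowly targeted alternative. The function returns the mean time per invocation over num_trials runs, which amortizes launch overhead across the loop.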
Example 18: savegpuann
void savegpuann(const gpuann& nn, fann *ann, unsigned int instanceIndex)
{
    unsigned int neuronCount = ann->total_neurons;
    unsigned int weightsCount = ((ann->last_layer - 1)->last_neuron - 1)->last_con;

    chekedcudaMemcpyAsync(nn.h_tmp_sumArray,    nn.d_sumArray     + neuronCount  * instanceIndex, neuronCount  * sizeof(fann_type), cudaMemcpyDeviceToHost);
    chekedcudaMemcpyAsync(nn.h_tmp_valuesArray, nn.d_valuesArray  + neuronCount  * instanceIndex, neuronCount  * sizeof(fann_type), cudaMemcpyDeviceToHost);
    chekedcudaMemcpyAsync(ann->weights,         nn.d_weightsArray + weightsCount * instanceIndex, weightsCount * sizeof(fann_type), cudaMemcpyDeviceToHost);
    cudaThreadSynchronize();

    struct fann_neuron *neuronsArray = ann->first_layer->first_neuron;
    struct fann_layer *last_layer = ann->last_layer;
    struct fann_layer *layer_it = ann->first_layer;

    for (; layer_it != last_layer; layer_it++)
    {
        struct fann_neuron * last_neuron = layer_it->last_neuron;
        struct fann_neuron * neuron_it = layer_it->first_neuron;
        for (; neuron_it != last_neuron; neuron_it++)
        {
            unsigned int currentNeuronShift = neuron_it - neuronsArray;
            neuron_it->value = nn.h_tmp_valuesArray[currentNeuronShift];
            neuron_it->sum = nn.h_tmp_sumArray[currentNeuronShift];
        }
    }
}
Author: verybigbadboy, Project: gpuann, Lines: 26, Source: gpuannDataCreator.cpp
Example 19: keplereq_wrapper_c
// keplereq_wrapper_c:
// C wrapper function that solves Kepler's equation num times.
// inputs:
//   ph_ma:      pointer to first element of an array of doubles containing mean anomaly in radians
//   ph_ecc:     pointer to first element of an array of doubles containing eccentricity
//   num:        integer size of input arrays
//   ph_eccanom: pointer to first element of an array of doubles for eccentric anomaly in radians
// outputs:
//   ph_eccanom: values overwritten with eccentric anomaly
// assumptions:
//   input mean anomalies between 0 and 2pi
//   input eccentricities between 0 and 1
//   all three arrays have at least num elements
//
void keplereq_wrapper_c(double *ph_ma, double *ph_ecc, int num, double *ph_eccanom)
{
    int gpuid = init_cuda();

    // put vectors in thrust format from raw pointers
    thrust::host_vector<double> h_ecc(ph_ecc, ph_ecc + num);
    thrust::host_vector<double> h_ma(ph_ma, ph_ma + num);

    cutCreateTimer(&memoryTime); cutCreateTimer(&kernelTime);
    cutResetTimer(memoryTime);   cutResetTimer(kernelTime);

    if (gpuid >= 0)
    {
        cutStartTimer(memoryTime);
        // transfer input params to GPU
        thrust::device_vector<double> d_ecc = h_ecc;
        thrust::device_vector<double> d_ma = h_ma;
        // allocate mem on GPU
        thrust::device_vector<double> d_eccanom(num);
        cudaThreadSynchronize();
        cutStopTimer(memoryTime);

        // distribute the computation to the GPU
        cutStartTimer(kernelTime);
        thrust::for_each(
            thrust::make_zip_iterator(thrust::make_tuple(d_ma.begin(), d_ecc.begin(), d_eccanom.begin())),
            thrust::make_zip_iterator(thrust::make_tuple(d_ma.end(),   d_ecc.end(),   d_eccanom.end())),
            keplereq_functor());
        cudaThreadSynchronize();
        cutStopTimer(kernelTime);

        // transfer results back to host
        cutStartTimer(memoryTime);
        thrust::copy(d_eccanom.begin(), d_eccanom.end(), ph_eccanom);
        cudaThreadSynchronize();
        cutStopTimer(memoryTime);
    }
    else
    {
        // distribute the computation to the CPU
        cutStartTimer(kernelTime);
        thrust::for_each(
            thrust::make_zip_iterator(thrust::make_tuple(h_ma.begin(), h_ecc.begin(), ph_eccanom)),
            thrust::make_zip_iterator(thrust::make_tuple(h_ma.end(),   h_ecc.end(),   ph_eccanom + num)),
            keplereq_functor());
        cutStopTimer(kernelTime);
    }
}
Author: jkrick, Project: idlbin, Lines: 61, Source: keplereq.c
Example 20: op_fetch_data
void
op_fetch_data(op_dat dat)
{
    cutilSafeCall(cudaMemcpy(dat->data, dat->data_d,
                             dat->size * dat->set->size,
                             cudaMemcpyDeviceToHost));
    cutilSafeCall(cudaThreadSynchronize());
}
Author: doru1004, Project: OP2-Common, Lines: 8, Source: op_cuda_rt_support.c
Note: The cudaThreadSynchronize examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their respective authors; copyright in the source code remains with the original authors, and distribution and use are subject to each project's license. Do not republish without permission.