本文整理汇总了C++中cudaMemcpy函数的典型用法代码示例。如果您正苦于以下问题:C++ cudaMemcpy函数的具体用法?C++ cudaMemcpy怎么用?C++ cudaMemcpy使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了cudaMemcpy函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。
示例1: step3_gpu
void step3_gpu(int *n) {
int nprocs, procid;
MPI_Comm_rank(MPI_COMM_WORLD, &procid);
MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
/* Create Cartesian Communicator */
int c_dims[2]={0};
MPI_Comm c_comm;
accfft_create_comm(MPI_COMM_WORLD,c_dims,&c_comm);
Complexf *data, *data_cpu;
Complexf *data_hat;
double f_time=0*MPI_Wtime(),i_time=0, setup_time=0;
int alloc_max=0;
int isize[3],osize[3],istart[3],ostart[3];
/* Get the local pencil size and the allocation size */
alloc_max=accfft_local_size_dft_c2c_gpuf(n,isize,istart,osize,ostart,c_comm);
#ifdef INPLACE
data_cpu=(Complexf*)malloc(alloc_max);
cudaMalloc((void**) &data, alloc_max);
#else
data_cpu=(Complexf*)malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
cudaMalloc((void**) &data,isize[0]*isize[1]*isize[2]*2*sizeof(float));
cudaMalloc((void**) &data_hat, alloc_max);
#endif
//accfft_init(nthreads);
setup_time=-MPI_Wtime();
/* Create FFT plan */
#ifdef INPLACE
accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data,c_comm,ACCFFT_MEASURE);
#else
accfft_plan_gpuf * plan=accfft_plan_dft_3d_c2c_gpuf(n,data,data_hat,c_comm,ACCFFT_MEASURE);
#endif
setup_time+=MPI_Wtime();
/* Warmup Runs */
#ifdef INPLACE
accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
#else
accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
#endif
/* Initialize data */
initialize(data_cpu,n,c_comm);
#ifdef INPLACE
cudaMemcpy(data, data_cpu,alloc_max, cudaMemcpyHostToDevice);
#else
cudaMemcpy(data, data_cpu,isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyHostToDevice);
#endif
MPI_Barrier(c_comm);
/* Perform forward FFT */
f_time-=MPI_Wtime();
#ifdef INPLACE
accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data);
#else
accfft_execute_c2c_gpuf(plan,ACCFFT_FORWARD,data,data_hat);
#endif
f_time+=MPI_Wtime();
MPI_Barrier(c_comm);
#ifndef INPLACE
Complexf *data2_cpu, *data2;
cudaMalloc((void**) &data2, isize[0]*isize[1]*isize[2]*2*sizeof(float));
data2_cpu=(Complexf*) malloc(isize[0]*isize[1]*isize[2]*2*sizeof(float));
#endif
/* Perform backward FFT */
i_time-=MPI_Wtime();
#ifdef INPLACE
accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data,data);
#else
accfft_execute_c2c_gpuf(plan,ACCFFT_BACKWARD,data_hat,data2);
#endif
i_time+=MPI_Wtime();
/* copy back results on CPU and check error*/
#ifdef INPLACE
cudaMemcpy(data_cpu, data, alloc_max, cudaMemcpyDeviceToHost);
check_err(data_cpu,n,c_comm);
#else
cudaMemcpy(data2_cpu, data2, isize[0]*isize[1]*isize[2]*2*sizeof(float), cudaMemcpyDeviceToHost);
check_err(data2_cpu,n,c_comm);
#endif
/* Compute some timings statistics */
double g_f_time, g_i_time, g_setup_time;
MPI_Reduce(&f_time,&g_f_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
MPI_Reduce(&i_time,&g_i_time,1, MPI_DOUBLE, MPI_MAX,0, MPI_COMM_WORLD);
//.........这里部分代码省略.........
开发者ID:jeffhammond, 项目名称:accfft, 代码行数:101, 代码来源:step3_gpuf.cpp
示例2: copy_device_to_host
void copy_device_to_host(const size_t size, double *h_input,double *h_output,double *d_input,double *d_output){
CHECK_CUDA(cudaMemcpy(h_output, d_output, size, cudaMemcpyDeviceToHost));
CHECK_CUDA(cudaMemcpy(h_input, d_input, size, cudaMemcpyDeviceToHost));
}
开发者ID:bearrito, 项目名称:learncudathehardway, 代码行数:5, 代码来源:reduction_helpers.c
示例3: wine_cudaMemcpy
/*
 * WINAPI entry point that forwards to cudaMemcpy.
 *
 * Emits a trace line via WINE_TRACE, then passes dst, src, count and kind
 * through unchanged and returns the status cudaMemcpy reported.
 */
cudaError_t WINAPI wine_cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind)
{
    cudaError_t status;

    WINE_TRACE("\n");
    status = cudaMemcpy(dst, src, count, kind);
    return status;
}
开发者ID:Shelnutt2, 项目名称:cuda-wine-wrapper, 代码行数:4, 代码来源:cudart.c
示例4: init_arrays
void init_arrays(Arrays *arr, FLOAT_TYPE** cu_F,
Command_line_opts *opts, Detector_settings *sett)
{
// Allocates and initializes to zero the data, detector ephemeris
// and the F-statistic arrays
// arr->xDat = (double *) calloc (sett->N, sizeof (double));
CudaSafeCall( cudaMallocHost((void**)&arr->xDat, sizeof(double)*sett->N));
CudaSafeCall ( cudaMalloc((void**)&arr->cu_xDat, sizeof(double)*sett->N));
// arr->DetSSB = (double *) calloc (3*sett->N, sizeof (double));
CudaSafeCall( cudaMallocHost((void**)&arr->DetSSB, sizeof(double)*3*sett->N) );
CudaSafeCall ( cudaMalloc((void**)&arr->cu_DetSSB, sizeof(double)*3*sett->N));
CudaSafeCall ( cudaMalloc((void**)cu_F, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft));
CudaSafeCall ( cudaMemset(*cu_F, 0, sizeof(FLOAT_TYPE)*sett->fftpad*sett->nfft));
char filename[CHAR_BUFFER_SIZE];
FILE *data;
// Input time-domain data handling
sprintf (filename, "%s/%03d/xdatc_%03d_%03d%s.bin", opts->dtaprefix, opts->ident, \
opts->ident, opts->band, opts->label);
if ((data = fopen (filename, "r")) != NULL) {
fread ((void *)(arr->xDat), sizeof (double), sett->N, data); // !!! wczytanie danych
fclose (data);
} else {
perror (filename);
printf("Problem with %s... Exiting...\n", filename);
exit(1);
}
//copy to device
CudaSafeCall ( cudaMemcpy(arr->cu_xDat, arr->xDat, sizeof(double)*sett->N, cudaMemcpyHostToDevice));
int Nzeros=0;
int i;
// Checking for null values in the data
for(i=0; i < sett->N; i++)
if(!arr->xDat[i]) Nzeros++;
// factor N/(N - Nzeros) to account for null values in the data
sett->crf0 = (double)sett->N/(sett->N-Nzeros);
//if white noise...
if (opts->white_flag)
sett->sig2 = sett->N*var (arr->xDat, sett->N);
else
sett->sig2 = -1.;
double epsm, phir;
/*
############ Efemerydy ################
*/
// Ephemeris file handling
sprintf (filename, "%s/%03d/DetSSB.bin", opts->dtaprefix, opts->ident);
if ((data = fopen (filename, "r")) != NULL) {
// Detector position w.r.t solar system baricenter
// for every datapoint
fread ((void *)(arr->DetSSB), sizeof (double), 3*sett->N, data);
// Deterministic phase defining the position of the Earth
// in its diurnal motion at t=0
fread ((void *)(&phir), sizeof (double), 1, data);
// Earth's axis inclination to the ecliptic at t=0
fread ((void *)(&epsm), sizeof (double), 1, data);
fclose (data);
} else {
perror (filename);
printf("Problem with %s... Exiting...\n", filename);
exit(1);
}
//copy DetSSB to device
CudaSafeCall ( cudaMemcpy(arr->cu_DetSSB, arr->DetSSB, sizeof(double)*sett->N*3, cudaMemcpyHostToDevice));
/*
############ Sincos ################
*/
sett->sphir = sin (phir);
sett->cphir = cos (phir);
sett->sepsm = sin (epsm);
sett->cepsm = cos (epsm);
//misc. arrays
//arr->aa = (double*) malloc(sizeof(double)*sett->N);
//arr->bb = (double*) malloc(sizeof(double)*sett->N);
CudaSafeCall( cudaMallocHost((void**)&arr->aa, sizeof(double)*sett->N) );
CudaSafeCall( cudaMallocHost((void**)&arr->bb, sizeof(double)*sett->N) );
CudaSafeCall ( cudaMalloc((void**)&arr->cu_aa, sizeof(double)*sett->nfft));
CudaSafeCall ( cudaMalloc((void**)&arr->cu_bb, sizeof(double)*sett->nfft));
CudaSafeCall ( cudaMalloc((void**)&arr->cu_shft, sizeof(double)*sett->N));
CudaSafeCall ( cudaMalloc((void**)&arr->cu_shftf, sizeof(double)*sett->N));
CudaSafeCall ( cudaMalloc((void**)&arr->cu_tshift, sizeof(double)*sett->N));
//.........这里部分代码省略.........
开发者ID:mbejger, 项目名称:polgraw-allsky, 代码行数:101, 代码来源:init.c
示例5: main
// Host code
// Entry point of the filter sample. In order, the visible code:
//   1. parses the command line (ParseArguments),
//   2. fills a 25-entry float coefficient table,
//   3. reads the input image with readPPM,
//   4. runs and times a CPU filter, then writes a BMP with BitMapWrite,
//   5. allocates device buffers, copies the image and the coefficient table
//      to the device, calls FilterWrapper, copies the result back,
//   6. prints timings, writes the output image and calls Cleanup().
//
// NOTE(review): this snippet does not compile as shown. See the NOTE(review)
// markers below. Names such as dib, bmp, d_In, d_Out, sobel_matrix, time_CPU,
// time_GPU, time_mem, Filter, FilterMode, PPMInFileL and BMPOutFile are not
// declared in this function; presumably they are file-scope -- TODO confirm.
int main(int argc, char** argv)
{
ParseArguments(argc, argv);
// 25 coefficients, written five per group below; presumably a 5x5 kernel
// stored row by row -- TODO confirm against the code that consumes it.
float s_SobelMatrix[25];
s_SobelMatrix[0] = 1;
s_SobelMatrix[1] = 2;
s_SobelMatrix[2]= 0;
s_SobelMatrix[3] = -2;
s_SobelMatrix[4] = -1;
s_SobelMatrix[5] = 4;
s_SobelMatrix[6] = 8;
s_SobelMatrix[7] = 0;
s_SobelMatrix[8] = -8;
s_SobelMatrix[9] = -4;
s_SobelMatrix[10] = 6;
s_SobelMatrix[11] = 12;
s_SobelMatrix[12] = 0;
s_SobelMatrix[13] = -12;
s_SobelMatrix[14] = -6;
s_SobelMatrix[15] = 4;
s_SobelMatrix[16] = 8;
s_SobelMatrix[17] = 0;
s_SobelMatrix[18] = -8;
s_SobelMatrix[19] =-4;
s_SobelMatrix[20] =1;
s_SobelMatrix[21] =2;
s_SobelMatrix[22] =0;
s_SobelMatrix[23] =-2;
s_SobelMatrix[24] =-1;
unsigned char *palete = NULL;
// NOTE(review): `data` stays NULL in the visible code, yet it is later passed
// to CPU_Boost, cudaMemcpy and FilterWrapper.
unsigned char *data = NULL, *out = NULL;
PPMImage *input_image=NULL, *output_image=NULL;
output_image = (PPMImage *)malloc(sizeof(PPMImage));
input_image = readPPM(PPMInFileL);
printf("Running %s filter\n", Filter);
// NOTE(review): malloc() is called without a size argument.
out = (unsigned char *)malloc();
printf("Computing the CPU output\n");
printf("Image details: %d by %d = %d , imagesize = %d\n", input_image->x, input_image->y, input_image->x * input_image->y, input_image->x * input_image->y);
// CPU reference run, timed with the time_CPU timer.
cutilCheckError(cutStartTimer(time_CPU));
if(FilterMode == SOBEL_FILTER){
printf("Running Sobel\n");
// NOTE(review): `intput_image` looks like a typo for `input_image`.
CPU_Sobel(intput_image->data, output_image, input_image->x, input_image->y);
}
else if(FilterMode == HIGH_BOOST_FILTER){
printf("Running boost\n");
CPU_Boost(data, out, dib.width, dib.height);
}
cutilCheckError(cutStopTimer(time_CPU));
// Write the CPU result under a file name chosen by the filter mode.
if(FilterMode == SOBEL_FILTER || FilterMode == SOBEL_FILTER5)
BitMapWrite("CPU_sobel.bmp", &bmp, &dib, out, palete);
else if(FilterMode == AVERAGE_FILTER)
BitMapWrite("CPU_average.bmp", &bmp, &dib, out, palete);
else if(FilterMode == HIGH_BOOST_FILTER)
BitMapWrite("CPU_boost.bmp", &bmp, &dib, out, palete);
printf("Done with CPU output\n");
printf("CPU execution time %f \n", cutGetTimerValue(time_CPU));
printf("Allocating %d bytes for image \n", dib.image_size);
// Device buffers for the input and output images.
cutilSafeCall( cudaMalloc( (void **)&d_In, dib.image_size*sizeof(unsigned char)) );
cutilSafeCall( cudaMalloc( (void **)&d_Out, dib.image_size*sizeof(unsigned char)) );
// creating space for filter matrix
cutilSafeCall( cudaMalloc( (void **)&sobel_matrix, 25*sizeof(float)) );
// Host-to-device transfers, timed with the time_mem timer.
// NOTE(review): unlike the cudaMalloc calls above, the cudaMemcpy return
// values in this function are not checked.
cutilCheckError(cutStartTimer(time_mem));
cudaMemcpy(d_In, data, dib.image_size*sizeof(unsigned char), cudaMemcpyHostToDevice);
cudaMemcpy(sobel_matrix, s_SobelMatrix, 25*sizeof(float), cudaMemcpyHostToDevice);
cutilCheckError(cutStopTimer(time_mem));
// NOTE(review): time_GPU is never started or stopped in this function;
// presumably FilterWrapper does that -- TODO confirm.
FilterWrapper(data, dib.width, dib.height);
// Copy image back to host
cutilCheckError(cutStartTimer(time_mem));
cudaMemcpy(out, d_Out, dib.image_size*sizeof(unsigned char), cudaMemcpyDeviceToHost);
cutilCheckError(cutStopTimer(time_mem));
printf("GPU execution time %f Memtime %f \n", cutGetTimerValue(time_GPU), cutGetTimerValue(time_mem));
printf("Total GPU = %f \n", (cutGetTimerValue(time_GPU) + cutGetTimerValue(time_mem)));
// Write output image
BitMapWrite(BMPOutFile, &bmp, &dib, out, palete);
Cleanup();
}
开发者ID:rtvj, 项目名称:Misc, 代码行数:97, 代码来源:filter_cpu.c
示例6: toHost
// Copies n_ elements of type T from vals_ into the caller-supplied buffer
// `base` with a device-to-host cudaMemcpy; the call is wrapped in cudaCheck.
void toHost(T* base) const {
  const size_t numBytes = n_ * sizeof(T);
  cudaCheck(cudaMemcpy(base, vals_, numBytes, cudaMemcpyDeviceToHost));
}
开发者ID:2php, 项目名称:fbcunn, 代码行数:3, 代码来源:HalfPrecTest.cpp
示例7: preProcess
//Prepares everything the blur needs. Nothing is returned: a missing file or
//a non-continuous image terminates the process with exit(1), and every CUDA
//call is wrapped in checkCudaErrors.
//Through the out-parameters the caller receives:
//  - host pointers to the RGBA input image and the RGBA output image
//  - device pointers to the RGBA input and output buffers
//  - device pointers to the three single-channel blurred buffers
//  - a new[]-allocated, normalized filter and its width
void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA,
                uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA,
                unsigned char **d_redBlurred,
                unsigned char **d_greenBlurred,
                unsigned char **d_blueBlurred,
                float **h_filter, int *filterWidth,
                const std::string &filename) {
  //touch the runtime first so a broken context shows up right away
  checkCudaErrors(cudaFree(0));

  cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
  if (image.empty()) {
    std::cerr << "Couldn't open file: " << filename << std::endl;
    exit(1);
  }

  //RGBA copy of the input plus an output image of the same dimensions
  cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);
  imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);

  //the raw pointers handed out below are only usable if both images are
  //stored continuously
  const bool bothContinuous =
      imageInputRGBA.isContinuous() && imageOutputRGBA.isContinuous();
  if (!bothContinuous) {
    std::cerr << "Images aren't continuous!! Exiting." << std::endl;
    exit(1);
  }

  *h_inputImageRGBA = (uchar4 *)imageInputRGBA.ptr<unsigned char>(0);
  *h_outputImageRGBA = (uchar4 *)imageOutputRGBA.ptr<unsigned char>(0);

  const size_t numPixels = numRows() * numCols();

  //device buffers for input and output; the output starts out zeroed
  checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4) * numPixels));
  checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4) * numPixels));
  checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4)));

  //upload the input image
  checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4) * numPixels, cudaMemcpyHostToDevice));

  //keep copies of the device pointers in the file-level variables
  d_inputImageRGBA__ = *d_inputImageRGBA;
  d_outputImageRGBA__ = *d_outputImageRGBA;

  //filter parameters
  const int blurKernelWidth = 9;
  const float blurKernelSigma = 2.;
  const int halfWidth = blurKernelWidth / 2;
  const int filterLength = blurKernelWidth * blurKernelWidth;

  *filterWidth = blurKernelWidth;

  *h_filter = new float[filterLength];
  h_filter__ = *h_filter;
  float *weights = *h_filter;

  //fill with exp(-(dc^2 + dr^2) / (2 sigma^2)), row by row, summing as we go
  float filterSum = 0.f;
  for (int row = 0; row < blurKernelWidth; ++row) {
    const int dr = row - halfWidth;
    for (int col = 0; col < blurKernelWidth; ++col) {
      const int dc = col - halfWidth;
      const float filterValue = expf( -(float)(dc * dc + dr * dr) / (2.f * blurKernelSigma * blurKernelSigma));
      weights[row * blurKernelWidth + col] = filterValue;
      filterSum += filterValue;
    }
  }

  //scale every weight so the filter sums to one
  const float normalizationFactor = 1.f / filterSum;
  for (int i = 0; i < filterLength; ++i) {
    weights[i] *= normalizationFactor;
  }

  //one zero-initialized device buffer per blurred colour channel
  checkCudaErrors(cudaMalloc(d_redBlurred, sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMalloc(d_greenBlurred, sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMalloc(d_blueBlurred, sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMemset(*d_redBlurred, 0, sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char) * numPixels));
  checkCudaErrors(cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char) * numPixels));
}
开发者ID:berant89, 项目名称:CUDA-Projects, 代码行数:85, 代码来源:HW2.cpp
示例8: main
int main(int argc, char **argv)
{
// Start logs
printf("%s Starting...\n\n", argv[0]);
unsigned int tableCPU[QRNG_DIMENSIONS][QRNG_RESOLUTION];
float *h_OutputGPU, *d_Output;
int dim, pos;
double delta, ref, sumDelta, sumRef, L1norm, gpuTime;
StopWatchInterface *hTimer = NULL;
if (sizeof(INT64) != 8)
{
printf("sizeof(INT64) != 8\n");
return 0;
}
cudaDeviceProp deviceProp;
int dev = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
if (((deviceProp.major << 4) + deviceProp.minor) < 0x20)
{
fprintf(stderr, "quasirandomGenerator requires Compute Capability of SM 2.0 or higher to run.\n");
cudaDeviceReset();
exit(EXIT_WAIVED);
}
sdkCreateTimer(&hTimer);
printf("Allocating GPU memory...\n");
checkCudaErrors(cudaMalloc((void **)&d_Output, QRNG_DIMENSIONS * N * sizeof(float)));
printf("Allocating CPU memory...\n");
h_OutputGPU = (float *)malloc(QRNG_DIMENSIONS * N * sizeof(float));
printf("Initializing QRNG tables...\n\n");
initQuasirandomGenerator(tableCPU);
initTableGPU(tableCPU);
printf("Testing QRNG...\n\n");
checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));
int numIterations = 20;
for (int i = -1; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
}
quasirandomGeneratorGPU(d_Output, 0, N);
}
checkCudaErrors(cudaDeviceSynchronize());
sdkStopTimer(&hTimer);
gpuTime = sdkGetTimerValue(&hTimer)/(double)numIterations*1e-3;
printf("quasirandomGenerator, Throughput = %.4f GNumbers/s, Time = %.5f s, Size = %u Numbers, NumDevsUsed = %u, Workgroup = %u\n",
(double)QRNG_DIMENSIONS * (double)N * 1.0E-9 / gpuTime, gpuTime, QRNG_DIMENSIONS*N, 1, 128*QRNG_DIMENSIONS);
printf("\nReading GPU results...\n");
checkCudaErrors(cudaMemcpy(h_OutputGPU, d_Output, QRNG_DIMENSIONS * N * sizeof(float), cudaMemcpyDeviceToHost));
printf("Comparing to the CPU results...\n\n");
sumDelta = 0;
sumRef = 0;
for (dim = 0; dim < QRNG_DIMENSIONS; dim++)
for (pos = 0; pos < N; pos++)
{
ref = getQuasirandomValue63(pos, dim);
delta = (double)h_OutputGPU[dim * N + pos] - ref;
sumDelta += fabs(delta);
sumRef += fabs(ref);
}
printf("L1 norm: %E\n", sumDelta / sumRef);
printf("\nTesting inverseCNDgpu()...\n\n");
checkCudaErrors(cudaMemset(d_Output, 0, QRNG_DIMENSIONS * N * sizeof(float)));
for (int i = -1; i < numIterations; i++)
{
if (i == 0)
{
checkCudaErrors(cudaDeviceSynchronize());
sdkResetTimer(&hTimer);
sdkStartTimer(&hTimer);
}
inverseCNDgpu(d_Output, NULL, QRNG_DIMENSIONS * N);
}
checkCudaErrors(cudaDeviceSynchronize());
//.........这里部分代码省略.........
开发者ID:ziyuhe, 项目名称:cuda_project, 代码行数:101, 代码来源:quasirandomGenerator.cpp
示例9: main
int main(int argc, char **argv) {
uchar4 *h_inputImageRGBA, *d_inputImageRGBA;
uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;
float *h_filter;
int filterWidth;
std::string input_file;
std::string output_file;
std::string reference_file;
double perPixelError = 0.0;
double globalError = 0.0;
bool useEpsCheck = false;
std::string blur_impl = "hw";
switch (argc) {
case 2:
input_file = std::string(argv[1]);
output_file = "HW2_output.png";
reference_file = "HW2_reference.png";
break;
case 3:
input_file = std::string(argv[1]);
output_file = std::string(argv[2]);
reference_file = "HW2_reference.png";
break;
case 4:
input_file = std::string(argv[1]);
output_file = std::string(argv[2]);
reference_file = std::string(argv[3]);
break;
case 5:
input_file = std::string(argv[1]);
output_file = std::string(argv[2]);
reference_file = std::string(argv[3]);
blur_impl = std::string(argv[4]);
break;
default:
std::cerr << "Usage: ./HW2 input_file [output_filename] "
"[reference_filename] [blur_impl]]"
<< std::endl;
exit(1);
}
// load the image and give us our input and output pointers
preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA,
&d_outputImageRGBA, &d_redBlurred, &d_greenBlurred, &d_blueBlurred,
&h_filter, &filterWidth, input_file);
allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);
GpuTimer timer;
timer.Start();
// call the students' code
if (blur_impl == "hw") {
your_gaussian_blur(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA,
numRows(), numCols(), d_redBlurred, d_greenBlurred,
d_blueBlurred, filterWidth);
} else if (blur_impl == "shared") {
gaussian_blur_shared(h_inputImageRGBA, d_inputImageRGBA, d_outputImageRGBA,
numRows(), numCols(), d_redBlurred, d_greenBlurred,
d_blueBlurred, filterWidth);
}
timer.Stop();
cudaDeviceSynchronize();
checkCudaErrors(cudaGetLastError());
int err = printf("Your code ran in: %f msecs.\n", timer.Elapsed());
if (err < 0) {
// Couldn't print! Probably the student closed stdout - bad news
std::cerr << "Couldn't print timing information! STDOUT Closed!"
<< std::endl;
exit(1);
}
// check results and output the blurred image
size_t numPixels = numRows() * numCols();
// copy the output back to the host
checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA__,
sizeof(uchar4) * numPixels,
cudaMemcpyDeviceToHost));
std::cerr << "postProcess output...\n";
postProcess(output_file, h_outputImageRGBA);
timer.Start();
referenceCalculation(h_inputImageRGBA, h_outputImageRGBA, numRows(),
numCols(), h_filter, filterWidth);
timer.Stop();
std::cerr << "referenceCalculation elapsed: " << timer.Elapsed() << " ms\n";
std::cerr << "postProcess reference...\n";
postProcess(reference_file, h_outputImageRGBA);
// Cheater easy way with OpenCV
// generateReferenceImage(input_file, reference_file, filterWidth);
compareImages(reference_file, output_file, useEpsCheck, perPixelError,
globalError);
//.........这里部分代码省略.........
开发者ID:WangHanbin, 项目名称:cs344, 代码行数:101, 代码来源:main.cpp
示例10: upload
// Calls create(sizeBytes_arg) and then copies sizeBytes_ bytes from
// host_ptr_arg into data_ with a host-to-device cudaMemcpy, followed by
// cudaDeviceSynchronize(). Both CUDA calls are wrapped in cudaSafeCall.
// NOTE(review): the copy length is the member sizeBytes_, not the argument;
// presumably create() sets it to sizeBytes_arg -- confirm.
void pcl::gpu::DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg)
{
    create(sizeBytes_arg);

    const void *source = host_ptr_arg;
    cudaSafeCall( cudaMemcpy(data_, source, sizeBytes_, cudaMemcpyHostToDevice) );

    cudaSafeCall( cudaDeviceSynchronize() );
}
开发者ID:VictorLamoine, 项目名称:pcl, 代码行数:6, 代码来源:device_memory.cpp
示例11: download
// Copies sizeBytes_ bytes from data_ into host_ptr_arg with a device-to-host
// cudaMemcpy, then calls cudaDeviceSynchronize(). Both calls are wrapped in
// cudaSafeCall.
void pcl::gpu::DeviceMemory::download(void *host_ptr_arg) const
{
    void *destination = host_ptr_arg;
    cudaSafeCall( cudaMemcpy(destination, data_, sizeBytes_, cudaMemcpyDeviceToHost) );

    cudaSafeCall( cudaDeviceSynchronize() );
}
开发者ID:VictorLamoine, 项目名称:pcl, 代码行数:5, 代码来源:device_memory.cpp
示例12: DeepCopy
// Constructing this DeepCopy specialization performs the copy immediately:
// dst, src and n are forwarded to a single cudaMemcpy with
// cudaMemcpyDefault, and the call is wrapped in CUDA_SAFE_CALL.
DeepCopy<CudaSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
{
  CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) );
}
开发者ID:gurkih, 项目名称:lammps, 代码行数:2, 代码来源:Kokkos_CudaSpace.cpp
示例13: main2
int main2(int sockfd)
{
cufftHandle plan;
cufftComplex *devPtr;
cufftReal indata[NX*BATCH];
cufftComplex data[NX*BATCH];
int i,timer,j,k;
char fname[15];
FILE *f;
#define BUFSIZE (21*4096*sizeof(int))
int buffer[BUFSIZE];
int p,nread;
f = fopen("21-4096","rb");
nread=fread(buffer,BUFSIZE,1,f);
printf("nread=%i\n",nread);
fclose(f);
i=0;
for (j=0;j<BATCH;j++) {
for (k=0;k<NX;k++) {
data[j*NX+k].x = buffer[j*NX+k];
data[j*NX+k].y = 0;
}
}
//f=fopen("y.txt","r");
/* source data creation */
//int sockfd = myconnect();
//printf("connected\n");
/* WORKING!!!!!!!!
i=0;
for (j=0;j<BATCH;j++) {
sprintf(fname,"%i.txt",j);
printf("%s\n",fname);
f = fopen(fname,"r");
for (k=0;k<NX;k++) {
fscanf(f,"%i\n",&p);
data[j*NX+k].x = p;
data[j*NX+k].y = 0;
}
fclose(f);
*/
/*
for(i= 0 ; i < NX*BATCH ; i++){
//fscanf(f,"%i\n",&p);
//data[i].x= p;
data[i].x= 1.0f;
//printf("%f\n",data[i].x);
data[i].y = 0.0f;
}
//fclose(f)
*/
//}
/* creates 1D FFT plan */
cufftPlan1d(&plan, NX, CUFFT_C2C, BATCH);
/*
cutCreateTimer(&timer);
cutResetTimer(timer);
cutStartTimer(timer);
*/
/* GPU memory allocation */
cudaMalloc((void**)&devPtr, sizeof(cufftComplex)*NX*BATCH);
/* transfer to GPU memory */
cudaMemcpy(devPtr, data, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyHostToDevice);
/* executes FFT processes */
cufftExecC2C(plan, devPtr, devPtr, CUFFT_FORWARD);
/* executes FFT processes (inverse transformation) */
//cufftExecC2C(plan, devPtr, devPtr, CUFFT_INVERSE);
/* transfer results from GPU memory */
cudaMemcpy(data, devPtr, sizeof(cufftComplex)*NX*BATCH, cudaMemcpyDeviceToHost);
/* deletes CUFFT plan */
cufftDestroy(plan);
/* frees GPU memory */
cudaFree(devPtr);
/*
cudaThreadSynchronize();
cutStopTimer(timer);
printf("%f\n",cutGetTimerValue(timer)/(float)1000);
cutDeleteTimer(timer);
//.........这里部分代码省略.........
开发者ID:chatbot, 项目名称:mybci, 代码行数:101, 代码来源:source2socket.c
示例14: main
//.........这里部分代码省略.........
manageCudaError();
r_lists = (results_list *) malloc(MAX_BUS_GPU * sizeof(results_list));
for (int i=0; i<MAX_BUS_GPU; i++) {
new_results_list(&r_lists[i], RESULTS);
}
k = (uint32_t*)malloc(RESULTS * sizeof(uint32_t));
l = (uint32_t*)malloc(RESULTS * sizeof(uint32_t));
toc();
int TAM_BUS_GPU=0, NUM_BLOQUES_GPU=0;
NUM_REP = atoi(argv[5]);
tic("Leer de disco");
while(nextFASTAToken(queries_file, h_Worig + TAM_BUS_GPU * MAXLINE, h_We + TAM_BUS_GPU * MAXLINE, h_nWe + TAM_BUS_GPU)) {
TAM_BUS_GPU++;
if (TAM_BUS_GPU == MAX_BUS_GPU) break;
}
toc();
NUM_BLOQUES_GPU = (TAM_BUS_GPU / TAM_BLOQUE_GPU);
cudaThreadSynchronize();
tic("CPU -> GPU");
cudaMemcpy(d_We, h_We, TAM_BUS_GPU * MAXLINE * sizeof(uint8_t), cudaMemcpyHostToDevice);
manageCudaError();
cudaMemcpy(d_nWe, h_nWe, TAM_BUS_GPU * sizeof(uint64_t), cudaMemcpyHostToDevice);
manageCudaError();
cudaThreadSynchronize();
toc();
cudaThreadSynchronize();
tic("GPU Kernel");
BWExactSearchBackwardVectorGPUWrapper(NUM_BLOQUES_GPU, TAM_BLOQUE_GPU, d_We, d_nWe, MAXLINE, d_k, d_l, 0, d_O.siz-2, &d_C, &d_C1, &d_O);
BWExactSearchForwardVectorGPUWrapper(NUM_BLOQUES_GPU, TAM_BLOQUE_GPU, d_We, d_nWe, MAXLINE, d_ki, d_li, 0, d_Oi.siz-2, &d_C, &d_C1, &d_Oi);
cudaThreadSynchronize();
toc();
cudaThreadSynchronize();
tic("GPU -> CPU");
cudaMemcpy(h_k, d_k, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
manageCudaError();
cudaMemcpy(h_l, d_l, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
manageCudaError();
cudaMemcpy(h_ki, d_ki, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
manageCudaError();
cudaMemcpy(h_li, d_li, sizeof(uint32_t) * TAM_BUS_GPU * MAXLINE, cudaMemcpyDeviceToHost);
manageCudaError();
cudaThreadSynchronize();
toc();
tic("CPU Vector");
for (int i=0; i<TAM_BUS_GPU; i++) {
BWExactSearchVectorBackward(h_We + MAXLINE*i, 0, h_nWe[i]-1, 0, d_O.siz-2, h_k2 + MAXLINE*i, h_l2 + MAXLINE*i, &backward);
BWExactSearchVectorForward(h_We + MAXLINE*i, 0, h_nWe[i]-1, 0, d_Oi.siz-2, h_ki2 + MAXLINE*i, h_li2 + MAXLINE*i, &forward);
}
开发者ID:josator, 项目名称:gnu-bwt-aligner, 代码行数:67, 代码来源:optimize_speedup_vector.c
示例15: D_MEMCPY_D2H
//-------------------------------------------------------
//copy a buffer from device memory to host memory
//
//param : des
//param : src
//param : size
//-------------------------------------------------------
void D_MEMCPY_D2H(void *des, void *src, size_t size)
{
CUDA_SAFE_CALL(cudaMemcpy(des, src, size, cudaMemcpyDeviceToHost));
}
开发者ID:xiaobaidemu, 项目名称:GPU_Apriori, 代码行数:11, 代码来源:BenCUDAMem.cpp
示例16: main
// Host driver for the Black-Scholes sample.
//
// Allocates five host arrays of OPT_N floats (call result, put result, stock
// price, option strike, option years), initializes the three inputs with
// initOptions, launches the BlackScholes kernel through HEMI_KERNEL_LAUNCH
// and prints the first call/put result plus timing figures.
//
// When HEMI_CUDA_COMPILER is defined, the host arrays come from
// cudaMallocHost, each has a cudaMalloc'ed device counterpart, inputs are
// copied to the device before the launch and results are copied back after
// it. Otherwise the host arrays come from malloc and the d_* pointers simply
// alias them.
//
// Every buffer allocated here is released before returning, including the
// two device result buffers.
int main(int argc, char **argv)
{
    int OPT_N  = 4000000;
    int OPT_SZ = OPT_N * sizeof(float);

    printf("Initializing data...\n");

    float *callResult, *putResult, *stockPrice, *optionStrike, *optionYears;
    float *d_callResult, *d_putResult;
    float *d_stockPrice, *d_optionStrike, *d_optionYears;

#ifdef HEMI_CUDA_COMPILER
    checkCuda( cudaMallocHost((void**)&callResult,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&putResult,      OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&stockPrice,     OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionStrike,   OPT_SZ) );
    checkCuda( cudaMallocHost((void**)&optionYears,    OPT_SZ) );

    checkCuda( cudaMalloc    ((void**)&d_callResult,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_putResult,    OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_stockPrice,   OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionStrike, OPT_SZ) );
    checkCuda( cudaMalloc    ((void**)&d_optionYears,  OPT_SZ) );
#else
    callResult   = (float*)malloc(OPT_SZ);
    putResult    = (float*)malloc(OPT_SZ);
    stockPrice   = (float*)malloc(OPT_SZ);
    optionStrike = (float*)malloc(OPT_SZ);
    optionYears  = (float*)malloc(OPT_SZ);
#endif

    initOptions(OPT_N, stockPrice, optionStrike, optionYears);

    int blockDim = 128; // blockDim, gridDim ignored by host code
    int gridDim  = std::min<int>(1024, (OPT_N + blockDim - 1) / blockDim);

    printf("Running %s Version...\n", HEMI_LOC_STRING);

    StartTimer();

#ifdef HEMI_CUDA_COMPILER
    // The timed region includes the input transfers.
    checkCuda( cudaMemcpy(d_stockPrice,   stockPrice,   OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionStrike, optionStrike, OPT_SZ, cudaMemcpyHostToDevice) );
    checkCuda( cudaMemcpy(d_optionYears,  optionYears,  OPT_SZ, cudaMemcpyHostToDevice) );
#else
    // Host build: the "device" pointers alias the host arrays.
    d_callResult   = callResult;
    d_putResult    = putResult;
    d_stockPrice   = stockPrice;
    d_optionStrike = optionStrike;
    d_optionYears  = optionYears;
#endif

    HEMI_KERNEL_LAUNCH(BlackScholes, gridDim, blockDim, 0, 0,
                       d_callResult, d_putResult, d_stockPrice, d_optionStrike,
                       d_optionYears, RISKFREE, VOLATILITY, OPT_N);

#ifdef HEMI_CUDA_COMPILER
    checkCuda( cudaMemcpy(callResult, d_callResult, OPT_SZ, cudaMemcpyDeviceToHost) );
    checkCuda( cudaMemcpy(putResult,  d_putResult,  OPT_SZ, cudaMemcpyDeviceToHost) );
#endif

    printf("Option 0 call: %f\n", callResult[0]);
    printf("Option 0 put: %f\n", putResult[0]);

    double ms = GetTimer();

    //Both call and put is calculated
    printf("Options count : %i \n", 2 * OPT_N);
    printf("\tBlackScholes() time : %f msec\n", ms);
    printf("\t%f GB/s, %f GOptions/s\n",
           ((double)(5 * OPT_N * sizeof(float)) * 1E-9) / (ms * 1E-3),
           ((double)(2 * OPT_N) * 1E-9) / (ms * 1E-3));

#ifdef HEMI_CUDA_COMPILER
    // Release all five device buffers. The two result buffers were
    // previously never freed.
    checkCuda( cudaFree(d_callResult) );
    checkCuda( cudaFree(d_putResult) );
    checkCuda( cudaFree(d_stockPrice) );
    checkCuda( cudaFree(d_optionStrike) );
    checkCuda( cudaFree(d_optionYears) );

    checkCuda( cudaFreeHost(callResult) );
    checkCuda( cudaFreeHost(putResult) );
    checkCuda( cudaFreeHost(stockPrice) );
    checkCuda( cudaFreeHost(optionStrike) );
    checkCuda( cudaFreeHost(optionYears) );
#else
    free(callResult);
    free(putResult);
    free(stockPrice);
    free(optionStrike);
    free(optionYears);
#endif // HEMI_CUDA_COMPILER

    return 0;
}
开发者ID:Oblynx, 项目名称:hemi, 代码行数:89, 代码来源:blackscholes.cpp
示例17: CUDA
// Stores n in n_, allocates n_ * sizeof(T) bytes for vals_ with cudaMalloc
// and fills the allocation from `base` with a host-to-device cudaMemcpy.
// Both CUDA calls are wrapped in cudaCheck.
CUDA(const T* base, size_t n) :
    n_(n) {
  const size_t numBytes = n_ * sizeof(T);
  cudaCheck(cudaMalloc(&vals_, numBytes));
  cudaCheck(cudaMemcpy(vals_, base, numBytes, cudaMemcpyHostToDevice));
}
开发者ID:2php, 项目名称:fbcunn, 代码行数:5, 代码来源:HalfPrecTest.cpp
示例18: LayerSetUp
void MultiStageMeanfieldLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
init_cpu = false;
init_gpu = false;
const caffe::MultiStageMeanfieldParameter meanfield_param = this->layer_param_.multi_stage_meanfield_param();
num_iterations_ = meanfield_param.num_iterations();
CHECK_GT(num_iterations_, 1) << "Number of iterations must be greater than 1.";
theta_alpha_ = meanfield_param.theta_alpha();
theta_beta_ = meanfield_param.theta_beta();
theta_gamma_ = meanfield_param.theta_gamma();
count_ = bottom[0]->count();
num_ = bottom[0]->num();
channels_ = bottom[0]->channels();
height_ = bottom[0]->height();
width_ = bottom[0]->width();
num_pixels_ = height_ * width_;
LOG(INFO) << "This implementation has not been tested batch size > 1.";
top[0]->Reshape(num_, channels_, height_, width_);
// Initialize the parameters that will updated by backpropagation.
if (this->blobs_.size() > 0) {
LOG(INFO) << "Multimeanfield layer skipping parameter initialization.";
} else {
this->blobs_.resize(3);// blobs_[0] - spatial kernel weights, blobs_[1] - bilateral kernel weights, blobs_[2] - compatability matrix
// Allocate space for kernel weights.
this->blobs_[0].reset(new Blob<Dtype>(1, 1, channels_, channels_));
this->blobs_[1].reset(new Blob<Dtype>(1, 1, channels_, channels_));
caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[0]->mutable_cpu_data());
caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[1]->mutable_cpu_data());
// Initialize the kernels weights. The two files spatial.par and bilateral.par should be available.
FILE * pFile;
pFile = fopen("spatial.par", "r");
CHECK(pFile) << "The file 'spatial.par' is not found. Please create it with initial spatial kernel weights.";
for (int i = 0; i < channels_; i++) {
fscanf(pFile, "%lf", &this->blobs_[0]->mutable_cpu_data()[i * channels_ + i]);
}
fclose(pFile);
pFile = fopen("bilateral.par", "r");
CHECK(pFile) << "The file 'bilateral.par' is not found. Please create it with initial bilateral kernel weights.";
for (int i = 0; i < channels_; i++) {
fscanf(pFile, "%lf", &this->blobs_[1]->mutable_cpu_data()[i * channels_ + i]);
}
fclose(pFile);
// Initialize the compatibility matrix.
this->blobs_[2].reset(new Blob<Dtype>(1, 1, channels_, channels_));
caffe_set(channels_ * channels_, Dtype(0.), this->blobs_[2]->mutable_cpu_data());
// Initialize it to have the Potts model.
for (int c = 0; c < channels_; ++c) {
(this->blobs_[2]->mutable_cpu_data())[c * channels_ + c] = Dtype(-1.);
}
}
float spatial_kernel[2 * num_pixels_];
float *spatial_kernel_gpu_;
compute_spatial_kernel(spatial_kernel);
spatial_lattice_.reset(new ModifiedPermutohedral());
spatial_norm_.Reshape(1, 1, height_, width_);
Dtype* norm_data_gpu ;
Dtype* norm_data;
// Initialize the spatial lattice. This does not need to be computed for every image because we use a fixed size.
switch (Caffe::mode()) {
case Caffe::CPU:
norm_data = spatial_norm_.mutable_cpu_data();
spatial_lattice_->init(spatial_kernel, 2, width_, height_);
// Calculate spatial filter normalization factors.
norm_feed_= new Dtype[num_pixels_];
caffe_set(num_pixels_, Dtype(1.0), norm_feed_);
// pass norm_feed and norm_data to gpu
spatial_lattice_->compute(norm_data, norm_feed_, 1);
bilateral_kernel_buffer_ = new float[5 * num_pixels_];
init_cpu = true;
break;
#ifndef CPU_ONLY
case Caffe::GPU:
CUDA_CHECK(cudaMalloc((void**)&spatial_kernel_gpu_, 2*num_pixels_ * sizeof(float))) ;
CUDA_CHECK(cudaMemcpy(spatial_kernel_gpu_, spatial_kernel, 2*num_pixels_ * sizeof(float), cudaMemcpyHostToDevice)) ;
spatial_lattice_->init(spatial_kernel_gpu_, 2, width_, height_);
CUDA_CHECK(cudaMalloc((void**)&norm_feed_, num_pixels_ * sizeof(Dtype))) ;
caffe_gpu_set(num_pixels_, Dtype(1.0), norm_feed_);
norm_data_gpu = spatial_norm_.mutable_gpu_data();
spatial_lattice_->compute(norm_data_gpu, norm_feed_, 1);
norm_data = spatial_norm_.mutable_cpu_data();
CUDA_CHECK(cudaMalloc((void**)&bilateral_kernel_buffer_, 5 * num_pixels_ * sizeof(float))) ;
CUDA_CHECK(cudaFree(spatial_kernel_gpu_));
init_gpu = true;
break;
//.........这里部分代码省略.........
开发者ID:AmirooR, 项目名称:caffe_video_segmentation, 代码行数:101, 代码来源:multi_stage_meanfield.cpp
示例19: FindBestConfiguration
void CUDARunner::FindBestConfiguration()
{
unsigned long lowb=16;
unsigned long highb=128;
unsigned long lowt=16;
unsigned long hight=256;
unsigned long bestb=16;
unsigned long bestt=16;
int64 besttime=std::numeric_limits<int64>::max();
if(m_requestedgrid>0 && m_requestedgrid<=65536)
{
lowb=m_requestedgrid;
highb=m_requestedgrid;
}
if(m_requestedthreads>0 && m_requestedthreads<=65536)
{
lowt=m_requestedthreads;
hight=m_requestedthreads;
}
for(int numb=lowb; numb<=highb; numb*=2)
{
for(int numt=lowt; numt<=hight; numt*=2)
{
AllocateResources(numb,numt);
// clear out any existing error
cudaError_t err=cudaGetLastError();
err=cudaSuccess;
int64 st=GetTimeMillis();
for(int it=0; it<128*256*2 && err==0; it+=(numb*numt))
{
cutilSafeCall(cudaMemcpy(m_devin,m_in,sizeof(cuda_in),cudaMemcpyHostToDevice));
cuda_process_helper(m_devin,m_devout,64,6,numb,numt);
cutilSafeCall(cudaMemcpy(m_out,m_devout,numb*numt*sizeof(cuda_out),cudaMemcpyDeviceToHost));
err=cudaGetLastError();
if(err!=cudaSuccess)
{
printf("CUDA error %d\n",err);
}
}
int64 et=GetTimeMillis();
printf("Finding best configuration step end (%d,%d) %"PRI64d"ms prev best=%"PRI64d"ms\n",numb,numt,et-st,besttime);
if((et-st)<besttime && err==cudaSuccess)
{
bestb=numb;
bestt=numt;
besttime=et-st;
}
}
}
m_numb=bestb;
m_numt=bestt;
AllocateResources(m_numb,m_numt);
}
开发者ID:chancn, 项目名称:bitcoin-pool, 代码行数:67, 代码来源:bitcoinminercuda.cpp
六六分期app的软件客服如何联系?不知道吗?加qq群【895510560】即可!标题:六六分期
阅读:18094| 2023-10-27
今天小编告诉大家如何处理win10系统火狐flash插件总是崩溃的问题,可能很多用户都不知
阅读:9618| 2022-11-06
今天小编告诉大家如何对win10系统删除桌面回收站图标进行设置,可能很多用户都不知道
阅读:8149| 2022-11-06
今天小编告诉大家如何对win10系统电脑设置节能降温的设置方法,想必大家都遇到过需要
阅读:8530| 2022-11-06
我们在使用xp系统的过程中,经常需要对xp系统无线网络安装向导设置进行设置,可能很多
阅读:8432| 2022-11-06
今天小编告诉大家如何处理win7系统玩cf老是与主机连接不稳定的问题,可能很多用户都不
阅读:9347| 2022-11-06
电脑对日常生活的重要性小编就不多说了,可是一旦碰到win7系统设置cf烟雾头的问题,很
阅读:8397| 2022-11-06
我们在日常使用电脑的时候,有的小伙伴们可能在打开应用的时候会遇见提示应用程序无法
阅读:7833| 2022-11-06
今天小编告诉大家如何对win7系统打开vcf文件进行设置,可能很多用户都不知道怎么对win
阅读:8387| 2022-11-06
今天小编告诉大家如何对win10系统s4开启USB调试模式进行设置,可能很多用户都不知道怎
阅读:7380| 2022-11-06
请发表评论