本文整理汇总了C++中clEnqueueNDRangeKernel函数的典型用法代码示例。如果您正苦于以下问题:C++ clEnqueueNDRangeKernel函数的具体用法?C++ clEnqueueNDRangeKernel怎么用?C++ clEnqueueNDRangeKernel使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了clEnqueueNDRangeKernel函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。
示例1: clSetKernelArg
void AdvancedMaxPoolingLayer::BackPropagate() {
#ifdef BUILD_OPENCL_MAX
input_->delta.MoveToGPU(true);
output_->delta.MoveToGPU();
maximum_mask_.MoveToGPU();
cl_uint error = 0;
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 0, sizeof (cl_mem), &input_->delta.cl_data_ptr_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 1, sizeof (cl_mem), &maximum_mask_.cl_data_ptr_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 2, sizeof (cl_mem), &output_->delta.cl_data_ptr_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 3, sizeof (unsigned int), &input_width_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 4, sizeof (unsigned int), &input_height_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 5, sizeof (unsigned int), &maps_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 6, sizeof (unsigned int), &output_width_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 7, sizeof (unsigned int), &output_height_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 8, sizeof (unsigned int), ®ion_width_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 9, sizeof (unsigned int), ®ion_height_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 10, sizeof (unsigned int), &stride_width_);
error |= clSetKernelArg (CLHelper::k_amaximumBackward, 11, sizeof (unsigned int), &stride_height_);
if (error != CL_SUCCESS) {
FATAL ("Error setting kernel args: " << (signed int) error);
}
size_t global_work_size[] = { input_width_, input_height_, maps_* input_->data.samples() };
error = clEnqueueNDRangeKernel (CLHelper::queue, CLHelper::k_amaximumBackward, 3, NULL,
global_work_size, NULL, 0, NULL, NULL);
if (error != CL_SUCCESS) {
FATAL ("Error enqueueing kernel: " << (signed int) error);
}
#ifdef BRUTAL_FINISH
error = clFinish (CLHelper::queue);
if (error != CL_SUCCESS) {
FATAL ("Error finishing command queue: " << (signed int) error);
}
#endif
#else
#define MP_HELPER_MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
#pragma omp parallel for default(shared)
for(std::size_t sample = 0; sample < input_->data.samples(); sample++) {
for (unsigned int map = 0; map < maps_; map++) {
for (unsigned int ix = 0; ix < input_width_; ix++) {
for(unsigned int iy = 0; iy < input_width_; iy++) {
const unsigned int mask_index = ix + input_width_ * iy;
const unsigned int oxstart = (ix < region_width_) ?
0 : (ix - region_width_) / stride_width_+ 1;
const unsigned int oxend = MP_HELPER_MIN(ix / stride_width_ + 1, output_width_);
const unsigned int oystart = (iy < region_height_) ?
0 : (iy - region_height_) / stride_height_ + 1;
const unsigned int oyend = MP_HELPER_MIN(iy / stride_height_ + 1, output_height_);
datum sum = 0.0;
for (unsigned int oy = oystart; oy < oyend; oy++) {
for (unsigned int ox = oxstart; ox < oxend; ox++) {
if(*maximum_mask_.data_ptr_const(ox, oy, map, sample) == mask_index)
sum += *output_->delta.data_ptr_const(ox, oy, map, sample);
}
}
*(input_->delta.data_ptr(ix, iy, map, sample)) = sum;
}
}
}
}
#endif
}
开发者ID:cnndabbler,项目名称:cn24,代码行数:70,代码来源:AdvancedMaxPoolingLayer.cpp
示例2: task
int task(cl_context context, cl_device_id device, cl_command_queue queue, void* data_)
{
const TaskData* data = (const TaskData*) data_;
cl_int err;
if (data->points % data->points_per_work_item)
check_error(CLQMC_INVALID_VALUE, "points must be a multiple of points_per_work_item");
if (data->replications % data->replications_per_work_item)
check_error(CLQMC_INVALID_VALUE, "replications must be a multiple of replications_per_work_item");
// Lattice buffer
size_t pointset_size;
// gen_vec is given in common.c
clqmcLatticeRule* pointset = clqmcLatticeRuleCreate(data->points, DIMENSION, gen_vec, &pointset_size, &err);
check_error(err, NULL);
cl_mem pointset_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
pointset_size, pointset, &err);
check_error(err, "cannot create point set buffer");
// Shifts buffer
clqmc_fptype* shifts = (clqmc_fptype*) malloc(data->replications * DIMENSION * sizeof(clqmc_fptype));
// populate random shifts using a random stream
clrngMrg31k3pStream* stream = clrngMrg31k3pCreateStreams(NULL, 1, NULL, &err);
check_error(err, NULL);
for (cl_uint i = 0; i < data->replications; i++)
for (cl_uint j = 0; j < DIMENSION; j++)
shifts[i * DIMENSION + j] = clrngMrg31k3pRandomU01(stream);
err = clrngMrg31k3pDestroyStreams(stream);
check_error(err, NULL);
cl_mem shifts_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
data->replications * DIMENSION * sizeof(clqmc_fptype), shifts, &err);
check_error(err, "cannot create shifts buffer");
// Output buffer
size_t points_block_count = data->points / data->points_per_work_item;
cl_mem output_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY,
data->replications * points_block_count * sizeof(clqmc_fptype), NULL, &err);
check_error(err, "cannot create output buffer");
// OpenCL kernel
cl_program program = build_program_from_file(context, device,
"client/DocsTutorial/example4_kernel.cl",
NULL);
check_error(err, NULL);
cl_kernel kernel = clCreateKernel(program, "simulateWithRQMC", &err);
check_error(err, "cannot create kernel");
int iarg = 0;
err = clSetKernelArg(kernel, iarg++, sizeof(pointset_buf), &pointset_buf);
err |= clSetKernelArg(kernel, iarg++, sizeof(shifts_buf), &shifts_buf);
err |= clSetKernelArg(kernel, iarg++, sizeof(data->points_per_work_item), &data->points_per_work_item);
err |= clSetKernelArg(kernel, iarg++, sizeof(data->replications), &data->replications);
err |= clSetKernelArg(kernel, iarg++, sizeof(output_buf), &output_buf);
check_error(err, "cannot set kernel arguments");
// Execution
cl_event ev;
size_t global_size = (data->replications / data->replications_per_work_item) * points_block_count;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &ev);
check_error(err, "cannot enqueue kernel");
err = clWaitForEvents(1, &ev);
check_error(err, "error waiting for events");
clqmc_fptype* output = (clqmc_fptype*) malloc(data->replications * points_block_count * sizeof(clqmc_fptype));
err = clEnqueueReadBuffer(queue, output_buf, CL_TRUE, 0,
data->replications * points_block_count * sizeof(clqmc_fptype), output, 0, NULL, NULL);
check_error(err, "cannot read output buffer");
printf("\nAdvanced randomized quasi-Monte Carlo integration:\n\n");
err = clqmcLatticeRuleWriteInfo(pointset, stdout);
check_error(err, NULL);
printf("\n");
rqmcReport(data->replications, data->points, points_block_count, output);
// Clean up
clReleaseEvent(ev);
clReleaseMemObject(output_buf);
clReleaseMemObject(pointset_buf);
clReleaseKernel(kernel);
clReleaseProgram(program);
//.........这里部分代码省略.........
开发者ID:umontreal-simul,项目名称:clQMC,代码行数:101,代码来源:example4.c
示例3: main
//.........这里部分代码省略.........
{
printf("error: call to 'clEnqueueWriteBuffer' failed\n");
exit(1);
}
/* Create host dst buffer */
cl_int16 *dst_host_buffer;
dst_host_buffer = malloc(num_elem * sizeof(cl_int16));
memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_int16));
/* Create device dst buffer */
cl_mem dst_device_buffer;
dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_int16), NULL, &ret);
if (ret != CL_SUCCESS)
{
printf("error: could not create dst buffer\n");
exit(1);
}
/* Set kernel arguments */
ret = CL_SUCCESS;
ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer);
ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer);
if (ret != CL_SUCCESS)
{
printf("error: call to 'clSetKernelArg' failed\n");
exit(1);
}
/* Launch the kernel */
size_t global_work_size = num_elem;
size_t local_work_size = num_elem;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
printf("error: call to 'clEnqueueNDRangeKernel' failed\n");
exit(1);
}
/* Wait for it to finish */
clFinish(command_queue);
/* Read results from GPU */
ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_int16), dst_host_buffer, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
printf("error: call to 'clEnqueueReadBuffer' failed\n");
exit(1);
}
/* Dump dst buffer to file */
char dump_file[100];
sprintf((char *)&dump_file, "%s.result", argv[0]);
write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_int16));
printf("Result dumped to %s\n", dump_file);
/* Free host dst buffer */
free(dst_host_buffer);
/* Free device dst buffer */
ret = clReleaseMemObject(dst_device_buffer);
if (ret != CL_SUCCESS)
{
printf("error: call to 'clReleaseMemObject' failed\n");
exit(1);
}
开发者ID:xianggong,项目名称:m2c-llvm-devtools-host,代码行数:67,代码来源:relational_greater_than_or_equal_to_ulong16ulong16_src.c
示例4: opencl_scanhash
/*
 * Runs one scan pass on the OpenCL device that belongs to `thr`.
 *
 * Optionally re-tunes the device intensity, sets the kernel parameters for
 * `work`, enqueues the kernel followed by a non-blocking read of the result
 * buffer, advances the work nonce and waits for the queue to drain. When the
 * FOUND slot of the result buffer is non-zero the device buffer is reset and
 * the results are handed to postcalc_hash_async().
 *
 * Returns the hash count computed by set_threads_hashes(), or -1 when an
 * OpenCL call fails.
 */
static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
				int64_t __maybe_unused max_nonce)
{
	struct opencl_thread_data *td = thr->cgpu_data;
	struct cgpu_info *dev = thr->cgpu;
	_clState *cl = clStates[thr->id];
	const int dynamic_us = opt_dynamic_interval * 1000;

	size_t global_size[1];
	size_t local_size[1] = { cl->wsize };
	size_t nonce_offset[1];
	const size_t *offset_arg = NULL;
	int64_t hashes;
	cl_int status;

	/* Windows' timer resolution is only 15ms so oversample 5x */
	if (dev->dynamic && (++dev->intervals * dynamic_us) > 70000) {
		struct timeval now;
		double elapsed_us;

		gettimeofday(&now, NULL);
		elapsed_us = us_tdiff(&now, &dev->tv_gpustart) / dev->intervals;

		/* Too slow: back off. Comfortably fast: push harder. */
		if (elapsed_us > dynamic_us) {
			if (dev->intensity > MIN_INTENSITY)
				--dev->intensity;
		} else if (elapsed_us < dynamic_us / 2) {
			if (dev->intensity < MAX_INTENSITY)
				++dev->intensity;
		}

		memcpy(&(dev->tv_gpustart), &now, sizeof(struct timeval));
		dev->intervals = 0;
	}

	set_threads_hashes(cl->vwidth, &hashes, global_size, local_size[0], &dev->intensity);
	if (hashes > dev->max_hashes)
		dev->max_hashes = hashes;

	status = td->queue_kernel_parameters(cl, &work->blk, global_size[0]);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
		return -1;
	}

	/* With goffset enabled the current nonce is passed as the global work
	 * offset; otherwise no offset (NULL) is given. */
	if (cl->goffset) {
		nonce_offset[0] = work->blk.nonce;
		offset_arg = nonce_offset;
	}

	status = clEnqueueNDRangeKernel(cl->commandQueue, cl->kernel, 1, offset_arg,
					global_size, local_size, 0, NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
		return -1;
	}

	status = clEnqueueReadBuffer(cl->commandQueue, cl->outputBuffer, CL_FALSE, 0,
				     BUFFERSIZE, td->res, 0, NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status);
		return -1;
	}

	/* The amount of work scanned can fluctuate when intensity changes
	 * and since we do this one cycle behind, we increment the work more
	 * than enough to prevent repeating work */
	work->blk.nonce += dev->max_hashes;

	/* Completes the non-blocking (CL_FALSE) read enqueued above. */
	clFinish(cl->commandQueue);

	/* FOUND entry is used as a counter to say how many nonces exist */
	if (!td->res[FOUND])
		return hashes;

	/* Clear the device-side buffer again */
	status = clEnqueueWriteBuffer(cl->commandQueue, cl->outputBuffer, CL_FALSE, 0,
				      BUFFERSIZE, blank_res, 0, NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
		return -1;
	}

	applog(LOG_DEBUG, "GPU %d found something?", dev->device_id);
	postcalc_hash_async(thr, work, td->res);
	memset(td->res, 0, BUFFERSIZE);

	/* Completes the non-blocking (CL_FALSE) write enqueued above. */
	clFinish(cl->commandQueue);

	return hashes;
}
开发者ID:poorleno,项目名称:cgminer,代码行数:90,代码来源:driver-opencl.c
示例5: kernel_gpu_opencl_wrapper_2
//.........这里部分代码省略.........
clSetKernelArg( kernel,
4,
sizeof(cl_mem),
(void *) &offsetD);
clSetKernelArg( kernel,
5,
sizeof(cl_mem),
(void *) &lastKnodeD);
clSetKernelArg( kernel,
6,
sizeof(cl_mem),
(void *) &offset_2D);
clSetKernelArg( kernel,
7,
sizeof(cl_mem),
(void *) &startD);
clSetKernelArg( kernel,
8,
sizeof(cl_mem),
(void *) &endD);
clSetKernelArg( kernel,
9,
sizeof(cl_mem),
(void *) &ansDStart);
clSetKernelArg( kernel,
10,
sizeof(cl_mem),
(void *) &ansDLength);
//====================================================================================================100
// Kernel
//====================================================================================================100
error = clEnqueueNDRangeKernel( command_queue,
kernel,
1,
NULL,
global_work_size,
local_work_size,
0,
NULL,
NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
error = clFinish(command_queue);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
time4 = get_time();
//====================================================================================================100
// END
//====================================================================================================100
//======================================================================================================================================================150
// GPU MEMORY COPY (CONTD.)
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansDStart
开发者ID:briansp2020,项目名称:HIP-Examples,代码行数:67,代码来源:kernel_gpu_opencl_wrapper_2.c
示例6: clCreateFromGLTexture3D
void OpenCLExecuter::ocl_filter_shared(void)
{
cl_int err; // debugging variables
size_t szParmDataBytes; // Byte size of context information
cl_mem src_buffer; // OpenCL device source buffer
cl_mem dst_buffer; // OpenCL device source buffer
cl_sampler sampler; // OpenCL sampler
cl_kernel ckKernel; // OpenCL kernel
int iNumElements = volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process
// set Local work size dimensions
// size_t local_threads[3] ={256,256,64};
// set Global work size dimensions
// size_t global_threads[3] ={roundup((int) volobj->texwidth/local_threads[0], 0)*local_threads[0], roundup((int) volobj->texheight/local_threads[1], 0)*local_threads[1], roundup((int) volobj->texdepth/local_threads[2], 0)*local_threads[2]};
// set Global work size dimensions
size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth};
// allocate the source buffer memory object
src_buffer = clCreateFromGLTexture3D (ocl_wrapper->context, CL_MEM_READ_WRITE, GL_TEXTURE_3D, 0, volobj->TEXTURE3D_RED, &err);
printf("OPENCL: clCreateFromGLTexture3D: %s\n", ocl_wrapper->get_error(err));
// allocate the destination buffer memory object
dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(unsigned char) * iNumElements, NULL, &err);
printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
// create a sampler object
sampler = clCreateSampler(ocl_wrapper->context, CL_FALSE, CL_ADDRESS_CLAMP, CL_FILTER_NEAREST, &err);
printf("OPENCL: clCreateSampler: %s\n", ocl_wrapper->get_error(err));
// Create the kernel
ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
// Set the Argument values
err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&dst_buffer);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
err = clSetKernelArg (ckKernel, 2, sizeof(sampler), (void*)&sampler);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
size_t local;
err = clGetKernelWorkGroupInfo(ckKernel, ocl_wrapper->devices[ocl_wrapper->deviceUsed], CL_KERNEL_LOCAL_MEM_SIZE , sizeof(local), &local, NULL);
printf("OPENCL: clGetKernelWorkGroupInfo (kernel memory): %s\n", ocl_wrapper->get_error(err));
printf("OPENCL: Kernel local memory use: %d Bytes\n", (int)local);
// grab input data from OpenGL, compute, copy the results back to OpenGL
// Runs asynchronous to host, up until blocking clFinish at the end
glFinish();
glFlush();
// grab the OpenGL texture object for read/writing from OpenCL
err = clEnqueueAcquireGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,NULL,NULL);
printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err));
// Execute a kernel
err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 3, NULL, global_threads, NULL, 0, NULL, NULL);
printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));
/*
// Blocking read of results from GPU to Host
int size = volobj->texwidth*volobj->texheight*volobj->texdepth;
unsigned char* result = new unsigned char[size];
err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(unsigned char) * iNumElements, result, 0, NULL, NULL);
printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
for(int i=0; i<size; i++) volobj->texture3d[3*i+0] = result[i];
delete[] result;
*/
// copy OpenCL buffer to OpenGl texture
size_t corigin[3] = {0,0,0};
size_t cdimensions[3] = {(unsigned int)volobj->texwidth, (unsigned int)volobj->texheight, (unsigned int)volobj->texdepth};
err = clEnqueueCopyBufferToImage(ocl_wrapper->commandQue , dst_buffer, src_buffer, 0, corigin, cdimensions, 0, NULL, NULL);
printf("OPENCL: clEnqueueCopyBufferToImage: %s\n", ocl_wrapper->get_error(err));
//make sure we block until we are done.
//err = clFinish(ocl_wrapper->commandQue);
//printf("OPENCL: clFinish: %s\n", ocl_wrapper->get_error(err));
//release opengl objects now
err = clEnqueueReleaseGLObjects(ocl_wrapper->commandQue, 1, &src_buffer, 0,0,0);
printf("OPENCL: clEnqueueAcquireGLObjects: %s\n", ocl_wrapper->get_error(err));
// Cleanup allocated objects
printf("OPENCL: Releasing kernel memory\n");
if(ckKernel)clReleaseKernel(ckKernel);
//need to release any other OpenCL memory objects here
if(src_buffer)clReleaseMemObject(src_buffer);
if(dst_buffer)clReleaseMemObject(dst_buffer);
}
开发者ID:ut666,项目名称:VolViewer,代码行数:94,代码来源:OpenCLExecuter.cpp
示例7: roundup
void OpenCLExecuter::ocl_parrallelReduction(void)
{
cl_int err; // debugging variables
size_t szParmDataBytes; // Byte size of context information
cl_mem src_buffer; // OpenCL device source buffer
cl_mem tmp_buffer; // OpenCL device source buffer
cl_mem dst_buffer; // OpenCL device source buffer
size_t szGlobalWorkSize; // 1D var for Total # of work items
size_t szLocalWorkSize; // 1D var for # of work items in the work group
size_t numWorkGroups;
cl_kernel ckKernel; // OpenCL kernel
int iNumElements = 65536; //65536 // Length of float arrays to process
// set Local work size dimensions
szLocalWorkSize = 512;
// set Global work size dimensions
szGlobalWorkSize = roundup((int) iNumElements/szLocalWorkSize, 0)*szLocalWorkSize;
//szGlobalWorkSize = iNumElements;
numWorkGroups = (float)szGlobalWorkSize/(float)szLocalWorkSize;
printf("OPENCL: number of elements: %d\n", (int)iNumElements);
printf("OPENCL: local worksize: %d\n", (int)szLocalWorkSize);
printf("OPENCL: global worksize: %d\n", (int)szGlobalWorkSize);
printf("OPENCL: work groups: %d\n", (int)(numWorkGroups));
//temp array
int* data = new int[iNumElements];
for(int i=0; i<iNumElements; i++)
data[i] = randomFloat(1.0, (float)iNumElements);
data[iNumElements/2] = -100.0;
//for(int i=0; i<iNumElements; i++)
// printf("data: %d\n", data[i]);
size_t global_threads[1] ={iNumElements};
// allocate the source buffer memory object
src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(int) * iNumElements, NULL, &err);
printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
// allocate the temp buffer memory object
tmp_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(int) * iNumElements, NULL, &err);
printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
// allocate the destination buffer memory object
dst_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_WRITE_ONLY, sizeof(int) * iNumElements, NULL, &err);
printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
// Create the kernel
ckKernel = clCreateKernel (cpProgram, "min_reduce", &err);
printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
// Set the Argument values
err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
err = clSetKernelArg (ckKernel, 1, sizeof(int)*szLocalWorkSize, NULL);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&iNumElements);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
err = clSetKernelArg (ckKernel, 3, sizeof(cl_mem), (void*)&dst_buffer);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
// Copy input data to GPU, compute, copy results back
// Runs asynchronous to host, up until blocking read at end
int numb_iterations = sqrt((float)numWorkGroups);
numb_iterations=0;
bool cont = true;
Timer timer;
timer.startTimer();
//for(int i=0; i<numb_iterations; i++)
while(cont)
{
// Write data from host to GPU
err = clEnqueueWriteBuffer (ocl_wrapper->commandQue, src_buffer, CL_FALSE, 0, sizeof(int) * iNumElements, data, 0, NULL, NULL);
printf("OPENCL: clEnqueueWriteBuffer: %s\n", ocl_wrapper->get_error(err));
// Launch kernel
err = clEnqueueNDRangeKernel (ocl_wrapper->commandQue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
printf("OPENCL: clEnqueueNDRangeKernel: %s\n", ocl_wrapper->get_error(err));
// Blocking read of results from GPU to Host
err = clEnqueueReadBuffer (ocl_wrapper->commandQue, dst_buffer, CL_TRUE, 0, sizeof(int) * iNumElements, data, 0, NULL, NULL);
printf("OPENCL: clEnqueueReadBuffer: %s\n", ocl_wrapper->get_error(err));
numb_iterations++;
if(data[1]==0) cont = false;
//printf("min: %d\n", data[0]);
for(int i=0; i<numWorkGroups; i++)
printf("min: %d\n", data[i]);
}
timer.endTimer("GPU find min");
timer.startTimer();
int min=iNumElements;
for(int i=0; i<iNumElements; i++)
//.........这里部分代码省略.........
开发者ID:ut666,项目名称:VolViewer,代码行数:101,代码来源:OpenCLExecuter.cpp
示例8: main
int main()
{
// Initiating opencl
cl_device_id device_id;
cl_int err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
std::cout<<"Error in device."<<std::endl;
return EXIT_FAILURE;
}
cl_context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (!context)
{
std::cout<<"Error in context."<<std::endl;
return EXIT_FAILURE;
}
cl_command_queue commands = clCreateCommandQueue(context, device_id, 0, &err);
if (!commands)
{
std::cout<<"Error in command queue."<<std::endl;
return EXIT_FAILURE;
}
std::ifstream in("transpMatrix.cl");
std::string contents((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>());
const char* kernelSource = contents.c_str();
cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
if (!program)
{
std::cout<<"Error in program."<<std::endl;
return EXIT_FAILURE;
}
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
std::cout<<"Error in compiling the opencl program."<<std::endl;
clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
std::cout<<buffer<<std::endl;
return EXIT_FAILURE;
}
cl_kernel kernel = clCreateKernel(program, "simplecl", &err);
if (!kernel || err != CL_SUCCESS)
{
std::cout<<"Error in kernel "<<err<<std::endl;
return EXIT_FAILURE;
}
// Data to compute
float* data = new float[count*count];
for(int i = 0; i < count; ++i)
{
for(int j = 0; j < count; ++j)
{
data[i*count+j] = rand()%10;
std::cout<<data[i*count+j]<<" ";
}
std::cout<<std::endl;
}
std::cout<<std::endl;
// Creating communication buffers
cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count*count, NULL, NULL);
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count*count, NULL, NULL);
if (!input || !output)
{
std::cout<<"Error in allocation."<<std::endl;
return EXIT_FAILURE;
}
// Copy data to input buffer
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count*count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
std::cout<<"Error in copy."<<std::endl;
return EXIT_FAILURE;
}
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(int), &count);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
if (err != CL_SUCCESS)
{
std::cout<<"Error in argument."<<std::endl;
return EXIT_FAILURE;
}
size_t local[] = {1,1};
size_t global[] = {10,10};
// err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
// if (err != CL_SUCCESS)
// {
// std::cout<<"Error in getting loal."<<std::endl;
// return EXIT_FAILURE;
// }
err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, global, local, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
std::cout<<"Error in pushing to queue "<<err<<std::endl;
//.........这里部分代码省略.........
开发者ID:AnisB,项目名称:Misc,代码行数:101,代码来源:transpMatrix.cpp
示例9: main
/* Prints a diagnostic and terminates the program when an OpenCL call did not
 * return CL_SUCCESS. */
static void vector_add_check(cl_int status, const char *what)
{
    if (status != CL_SUCCESS) {
        fprintf(stderr, "%s failed (OpenCL error %d).\n", what, (int)status);
        exit(1);
    }
}

/*
 * Adds two integer vectors of LIST_SIZE elements on an OpenCL device using
 * the "vector_add" kernel loaded from vector_add_kernel.cl, then prints every
 * element-wise sum.
 *
 * Returns 0 on success; exits with status 1 if the kernel file cannot be
 * opened, a host allocation fails or any OpenCL call reports an error.
 */
int main(void) {
    /* create the two input vectors */
    int i;
    const int LIST_SIZE = 1024;
    int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
    int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
    int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
    if (!A || !B || !C) {
        fprintf(stderr, "Failed to allocate host vectors.\n");
        exit(1);
    }
    for(i = 0; i < LIST_SIZE; i++) {
        A[i] = i;
        B[i] = LIST_SIZE - i;
    }

    /* load the kernel source into source_str */
    FILE *fp;
    char *source_str;
    size_t source_size;
    fp = fopen("vector_add_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    if (!source_str) {
        fprintf(stderr, "Failed to allocate kernel source buffer.\n");
        fclose(fp);
        exit(1);
    }
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );

    /* query the platform and device */
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    vector_add_check(ret, "clGetPlatformIDs");
    ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
            &device_id, &ret_num_devices);
    vector_add_check(ret, "clGetDeviceIDs");

    /* create an OpenCL context */
    cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
    vector_add_check(ret, "clCreateContext");

    /* create the command queue */
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    vector_add_check(ret, "clCreateCommandQueue");

    /* create one device memory buffer per vector */
    cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    vector_add_check(ret, "clCreateBuffer(A)");
    cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    vector_add_check(ret, "clCreateBuffer(B)");
    cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
            LIST_SIZE * sizeof(int), NULL, &ret);
    vector_add_check(ret, "clCreateBuffer(C)");

    /* copy vectors A and B into their device buffers (blocking writes) */
    ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
    vector_add_check(ret, "clEnqueueWriteBuffer(A)");
    ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
    vector_add_check(ret, "clEnqueueWriteBuffer(B)");

    /* create a program from the kernel source */
    cl_program program = clCreateProgramWithSource(context, 1,
            (const char **)&source_str, (const size_t *)&source_size, &ret);
    vector_add_check(ret, "clCreateProgramWithSource");

    /* build the program */
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    vector_add_check(ret, "clBuildProgram");

    /* create the kernel */
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
    vector_add_check(ret, "clCreateKernel");

    /* set the kernel arguments */
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
    vector_add_check(ret, "clSetKernelArg(0)");
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
    vector_add_check(ret, "clSetKernelArg(1)");
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
    vector_add_check(ret, "clSetKernelArg(2)");

    /* run the kernel over the whole list */
    size_t global_item_size = LIST_SIZE;
    size_t local_item_size = 64; /* split the work items into groups of 64 */
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
            &global_item_size, &local_item_size, 0, NULL, NULL);
    vector_add_check(ret, "clEnqueueNDRangeKernel");

    /* copy device buffer C back into host vector C (blocking read) */
    ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
            LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
    vector_add_check(ret, "clEnqueueReadBuffer");

    /* print the result */
    for(i = 0; i < LIST_SIZE; i++)
        printf("%d + %d = %d\n", A[i], B[i], C[i]);

    /* release every OpenCL object and host allocation; previously the
     * kernel, program, buffers, queue, context and source_str were leaked */
    clFinish(command_queue);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseMemObject(a_mem_obj);
    clReleaseMemObject(b_mem_obj);
    clReleaseMemObject(c_mem_obj);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    free(source_str);
    free(A);
    free(B);
    free(C);
    return 0;
}
开发者ID:pioh123,项目名称:parallel,代码行数:90,代码来源:vector.c
示例10: main
//.........这里部分代码省略.........
chk(status, "clCreatebuffer");
// perform computing on GPU
// copy data from host to device
status = clEnqueueWriteBuffer(cmdQueue, u_d, CL_FALSE, 0, size, u_h, 0, NULL, NULL);
chk(status,"ClEnqueueWriteBuffer");
status = clEnqueueWriteBuffer(cmdQueue, f_d, CL_FALSE, 0, size1, f_h, 0, NULL, NULL);
chk(status, "clEnqueueWriteBuffer");
// create program with source code
cl_program program = clCreateProgramWithSource(context,1,(const char**)&programSource, NULL, &status);
chk(status, "clCreateProgramWithSource");
// Compile program for the device
status = clBuildProgram(program, numDevices, devices, NULL, NULL,NULL);
// chk(status, "ClBuildProgram");
if(status != CL_SUCCESS){
printf("clBuildProgram failed (%d) \n", status);
size_t log_size;
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char *log = (char *) malloc(log_size);
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
printf("%s\n", log);
exit(-1);
}
printf("successfully built program \n");
// Create lattice-boltzman kernel
cl_kernel kernel, kernel1;
kernel = clCreateKernel(program, "lbiteration", &status);
kernel1 = clCreateKernel(program, "Denrho", &status);
chk(status, "clCreateKernel");
printf("successfully create kernel \n");
// Associate the input and output buffers with the kernel
status = clSetKernelArg(kernel,0, sizeof(cl_mem), &f_d);
status |= clSetKernelArg(kernel1,0, sizeof(cl_mem), &u_d);
status |= clSetKernelArg(kernel1,1, sizeof(cl_mem), &f_d);
status |= clSetKernelArg(kernel, 1, sizeof(int), &ArraySizeX);
status |= clSetKernelArg(kernel1,2, sizeof(int), &ArraySizeX);
status |= clSetKernelArg(kernel, 2, sizeof(int), &ArraySizeY);
status |= clSetKernelArg(kernel1,3, sizeof(int),&ArraySizeY);
chk(status, "clSerKernelArg");
// set the work dimensions
size_t localworksize[2] = {BLOCK_SIZE_X,BLOCK_SIZE_Y};
int nBLOCKSX = (ArraySizeX-2)/(BLOCK_SIZE_X -2);
int nBLOCKSY = (ArraySizeY-2)/(BLOCK_SIZE_Y -2);
size_t globalworksize[2] = {nBLOCKSX*BLOCK_SIZE_X,nBLOCKSY*BLOCK_SIZE_Y};
// loop the kernel
for( nsteps = 0; nsteps < 100; nsteps++){
status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalworksize,localworksize,0,NULL,&event);
clWaitForEvents(1 , &event);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
sizeof(time_end), &time_end, NULL);
total_time += time_end - time_start;
}
printf("Good so far \n");
status = clEnqueueNDRangeKernel(cmdQueue, kernel1, 2, NULL, globalworksize,localworksize,0,NULL,&event);
chk(status, "clEnqueueNDR");
clWaitForEvents(1 , &event);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
sizeof(time_end), &time_end, NULL);
total_time += time_end - time_start;
printf("running time is %0.3f \n",(total_time/1000000000.0));
// retrieve data from device
status = clEnqueueReadBuffer(cmdQueue, u_d, CL_TRUE, 0, size, u_h, 0, NULL, NULL);
chk(status, "clEnqueueReadBuffer");
// Output results
fp = fopen("SolutionCL.txt", "wt");
for(i= 0;i<ArraySizeX;i++){
for(j=0;j<ArraySizeY;j++)
fprintf(fp, " %f", u_h[i*ArraySizeY+j]);
fprintf(fp, "\n");
}
fclose(fp);
//cleanup
clReleaseKernel(kernel);
clReleaseKernel(kernel1);
clReleaseProgram(program);
clReleaseCommandQueue(cmdQueue);
clReleaseMemObject(u_d);
clReleaseMemObject(f_d);
clReleaseContext(context);
free(u_h);
free(f_h);
free(platforms);
free(devices);
return 0;
}
开发者ID:hietwll,项目名称:parallel-lattice-Boltzmann,代码行数:101,代码来源:Lattice_BoltzmannOpenCL.c
示例11: main
//.........这里部分代码省略.........
d_C = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE,
mem_size_A, NULL, &errcode);
d_A = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_A, h_A, &errcode);
d_B = clCreateBuffer(clGPUContext,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
mem_size_B, h_B, &errcode);
FILE* fp = fopen("hw2.cl", "r");
fseek (fp , 0 , SEEK_END);
const size_t lSize = ftell(fp);
rewind(fp);
unsigned char* buffer;
buffer = (unsigned char*) malloc (lSize);
fread(buffer, 1, lSize, fp);
fclose(fp);
cl_int status;
clProgram = clCreateProgramWithBinary(clGPUContext,
1, (const cl_device_id *)clDevices,
&lSize, (const unsigned char**)&buffer,
&status, &errcode);
errcode = clBuildProgram(clProgram, 0, NULL, NULL,
NULL, NULL);
errcode = clBuildProgram(clProgram, 0,
NULL, NULL, NULL, NULL);
clKernel = clCreateKernel(clProgram,
"MM", &errcode);
size_t globalWorkSize[2];
int wA = WA;
int wC = WC;
errcode = clSetKernelArg(clKernel, 0,
sizeof(cl_mem), (void *)&d_C);
errcode |= clSetKernelArg(clKernel, 1,
sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2,
sizeof(cl_mem), (void *)&d_B);
errcode |= clSetKernelArg(clKernel, 3,
sizeof(int), (void *)&wA);
errcode |= clSetKernelArg(clKernel, 4,
sizeof(int), (void *)&wC);
globalWorkSize[0] = 16;
globalWorkSize[1] = 16;
cl_ulong time_start, time_end, total_time = 0;
errcode = clEnqueueNDRangeKernel(clCommandQue,
clKernel, 2, NULL, globalWorkSize,
NULL, 0, NULL, &mm);
printf("Average time = %lu\n");
clFinish(clCommandQue);
clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START,
sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END,
sizeof(time_end), &time_end, NULL);
total_time += time_end - time_start;
printf("Average time = %lu\n", total_time);
errcode = clEnqueueReadBuffer(clCommandQue,
d_C, CL_TRUE, 0, mem_size_C,
h_C, 0, NULL, NULL);
free(h_A);
free(h_B);
free(h_C);
clReleaseMemObject(d_A);
clReleaseMemObject(d_C);
clReleaseMemObject(d_B);
free(clDevices);
clReleaseContext(clGPUContext);
clReleaseKernel(clKernel);
clReleaseProgram(clProgram);
clReleaseCommandQueue(clCommandQue);
}
开发者ID:hemantjp,项目名称:HW2,代码行数:101,代码来源:hw2.c
示例12: main
// Entry point of the EP benchmark host program (excerpt: the function body
// continues beyond the lines shown here).
//
// Requires at least one command-line argument; when none is given a usage
// message naming "<kernel directory>" is printed to stderr and the process
// exits with -1. The visible part prints the banner, calls setup_opencl(),
// starts timer 0, derives `an`, sets the kernel arguments and enqueues the
// kernel once over `np` work-items in groups of GROUP_SIZE.
int main(int argc, char *argv[])
{
double Mops, t1, t2;
double tsx, tsy, tm, an, tt, gc;
double sx_verify_value, sy_verify_value, sx_err, sy_err;
int i, nit;
int k_offset, j;
logical verified;
char size[16];
FILE *fp;
// argc == 1 means no argument was supplied after the program name.
if (argc == 1) {
fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]);
exit(-1);
}
// Timers are enabled exactly when a file named "timer.flag" can be opened
// for reading in the current directory; its contents are not read.
if ((fp = fopen("timer.flag", "r")) == NULL) {
timers_enabled = false;
} else {
timers_enabled = true;
fclose(fp);
}
//--------------------------------------------------------------------
// Because the size of the problem is too large to store in a 32-bit
// integer for some classes, we put it into a string (for printing).
// Have to strip off the decimal point put in there by the floating
// point print statement (internal file)
//--------------------------------------------------------------------
// 2^(M+1) is formatted right-aligned in a 15-character field with no
// fractional digits; size[16] holds that field plus the terminating NUL
// as long as the value needs at most 15 digits.
sprintf(size, "%15.0lf", pow(2.0, M+1));
// Terminate the string after index 14, or after index 13 when index 14
// holds a '.'.
j = 14;
if (size[j] == '.') j--;
size[j+1] = '\0';
printf("\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - EP Benchmark\n");
printf("\n Number of random numbers generated: %15s\n", size);
verified = false;
//--------------------------------------------------------------------
// Compute the number of "batches" of random number pairs generated
// per processor. Adjust if the number of processors does not evenly
// divide the total number
//--------------------------------------------------------------------
// np is used below as the one-dimensional global work size.
np = NN;
setup_opencl(argc, argv);
timer_clear(0);
timer_start(0);
//--------------------------------------------------------------------
// Compute AN = A ^ (2 * NK) (mod 2^46).
//--------------------------------------------------------------------
// randlc is called MK + 1 times with t1 passed both by address and by
// value; presumably it updates t1 in place -- see randlc's definition.
// Its return value (t2) is not read in the visible part of this function.
t1 = A;
for (i = 0; i < MK + 1; i++) {
t2 = randlc(&t1, t1);
}
an = t1;
tt = S;
//--------------------------------------------------------------------
// Each instance of this loop may be performed independently. We compute
// the k offsets separately to take into account the fact that some nodes
// have more numbers to generate than others
//--------------------------------------------------------------------
k_offset = -1;
DTIMER_START(T_KERNEL_EMBAR);
// Launch the kernel
// Arguments 0-2 are set with a byte size and a NULL value, which is how
// clSetKernelArg requests a __local buffer of that size -- presumably the
// kernel declares these three parameters __local; confirm in the kernel
// source. Arguments 3-5 are the buffers pgq/pgsx/pgsy, 6 is k_offset and
// 7 is an.
int q_size = GROUP_SIZE * NQ * sizeof(cl_double);
int sx_size = GROUP_SIZE * sizeof(cl_double);
int sy_size = GROUP_SIZE * sizeof(cl_double);
err_code = clSetKernelArg(kernel, 0, q_size, NULL);
err_code |= clSetKernelArg(kernel, 1, sx_size, NULL);
err_code |= clSetKernelArg(kernel, 2, sy_size, NULL);
err_code |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&pgq);
err_code |= clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&pgsx);
err_code |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&pgsy);
err_code |= clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&k_offset);
err_code |= clSetKernelArg(kernel, 7, sizeof(cl_double), (void*)&an);
// The OR-ed result is checked once for all eight calls.
clu_CheckError(err_code, "clSetKernelArg()");
// 1-D launch: np work-items in total, GROUP_SIZE per work-group. No event
// is requested and there is no wait list.
size_t localWorkSize[] = { GROUP_SIZE };
size_t globalWorkSize[] = { np };
err_code = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL,
globalWorkSize,
localWorkSize,
0, NULL, NULL);
clu_CheckError(err_code, "clEnqueueNDRangeKernel()");
// CHECK_FINISH and the DTIMER_* macros are defined elsewhere; presumably
// CHECK_FINISH waits for the queue so the timer covers kernel execution
// -- TODO confirm.
CHECK_FINISH();
DTIMER_STOP(T_KERNEL_EMBAR);
//.........这里部分代码省略.........
开发者ID:NatTuck,项目名称:cakemark,代码行数:101,代码来源:ep.c
示例13: clCreateBuffer
void OpenCLExecuter::ocl_filterBoundingBox(int channel, int window_size)
{
cl_int err; // debugging variables
size_t szParmDataBytes; // Byte size of context information
cl_mem src_buffer; // OpenCL device source buffer
cl_mem bbmin_buffer; // OpenCL device source buffer
cl_mem bbmax_buffer; // OpenCL device source buffer
size_t szGlobalWorkSize; // 1D var for Total # of work items
size_t szLocalWorkSize; // 1D var for # of work items in the work group
cl_kernel ckKernel; // OpenCL kernel
cl_int4 minbb;
cl_int4 maxbb;
minbb.s[0] = minbb.s[1] = minbb.s[2] = 8192;
maxbb.s[0] = maxbb.s[1] = maxbb.s[2] = -8192;
int iNumElements = 3*volobj->texwidth*volobj->texheight*volobj->texdepth; // Length of float arrays to process
size_t global_threads[3] ={volobj->texwidth, volobj->texheight, volobj->texdepth};
// allocate the source buffer memory object
src_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_ONLY, sizeof(unsigned char) * iNumElements, NULL, &err);
printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
// allocate the destination buffer memory object
bbmin_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(cl_int4), NULL, &err);
printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
bbmax_buffer = clCreateBuffer (ocl_wrapper->context, CL_MEM_READ_WRITE, sizeof(cl_int4), NULL, &err);
printf("OPENCL: clCreateBuffer: %s\n", ocl_wrapper->get_error(err));
// Create the kernel
ckKernel = clCreateKernel (cpProgram, "myFunc", &err);
printf("OPENCL: clCreateKernel: %s\n", ocl_wrapper->get_error(err));
// Set the Argument values
err = clSetKernelArg (ckKernel, 0, sizeof(cl_mem), (void*)&src_buffer);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
err = clSetKernelArg (ckKernel, 1, sizeof(cl_mem), (void*)&bbmin_buffer);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
err = clSetKernelArg (ckKernel, 2, sizeof(cl_mem), (void*)&bbmax_buffer);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
err = clSetKernelArg (ckKernel, 2, sizeof(int), (void*)&volobj->texwidth);
printf("OPENCL: clSetKernelArg: %s\n", ocl_wrapper->get_error(err));
err = clSetKernelArg (ckKernel, 3, sizeof(
|
请发表评论