本文整理汇总了 C++ 中 clEnqueueWriteBuffer 函数的典型用法代码示例。如果您正苦于以下问题:C++ clEnqueueWriteBuffer 函数的具体用法?C++ clEnqueueWriteBuffer 怎么用?C++ clEnqueueWriteBuffer 使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了 clEnqueueWriteBuffer 函数的 20 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的 C++ 代码示例。
示例1: main
int main(int argc, char **argv) {
if(argc < 2) {
usage();
return -1;
}
//init the filter array
float filter[49] =
{-1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, 49, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1};
//operate the params of cmd
const char* inputFileName;
const char* outputFileName;
inputFileName = (argv[1]);
outputFileName = (argv[2]);
//the image height and width
int imageHeight, imageWidth;
int filterWidth = 7;
//read the bmp image to the memory
float* inputImage = readBmpImage(inputFileName, &imageWidth, &imageHeight);
//to check the read is succ
printf("the width of the image is %d, the height of the image is %d\n", imageWidth, imageHeight);
//calculate the datasize
int dataSize = imageHeight * imageWidth * sizeof(float);
int filterSize = sizeof(float) * filterWidth * filterWidth;
//output image
float *outputImage = NULL;
outputImage = (float*)malloc(dataSize);
//set up the OpenCL environment
cl_int status;
//Discovery platform
cl_platform_id platforms[2];
cl_platform_id platform;
status = clGetPlatformIDs(2, platforms, NULL);
check(status, "clGetPlatformIDs");
platform = platforms[PLATFORM_TO_USE];
//Discovery device
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
check(status, "clGetDeviceIDs");
//create context
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platform), 0};
cl_context context;
context = clCreateContext(props, 1, &device, NULL, NULL, &status);
check(status, "clCreateContext");
//create command queue
cl_command_queue queue;
queue = clCreateCommandQueue(context, device, 0, &status);
check(status, "clCreateCommandQueue");
//create the input and output buffers
cl_mem d_input, d_output, d_filter;
d_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL,
&status);
check(status, "clCreateBuffer");
d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, filterSize, NULL,
&status);
check(status, "clCreateBuffer");
// Copy the input image to the device
d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL,
&status);
check(status, "clCreateBuffer");
status = clEnqueueWriteBuffer(queue, d_input, CL_TRUE, 0, dataSize,
inputImage, 0, NULL, NULL);
check(status, "clEnqueueWriteBuffer");
status = clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, filterSize,
filter, 0, NULL, NULL);
check(status, "clEnqueueWriteBuffer");
const char* source = readSource(kernelPath);
//create a program object with source and build it
cl_program program;
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
check(status, "clCreateProgramWithSource");
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
size_t log_size;
char *program_log;
if(status < 0) {
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
program_log = (char*)malloc(log_size + 1);
program_log[log_size] = '\0';
//.........这里部分代码省略.........
开发者ID:BryantChang,项目名称:HSA-Bench,代码行数:101,代码来源:sharpen.c
示例2: main
//.........这里部分代码省略.........
err = DIVIDEND_CL_WRAP(clBuildProgram)(prog, 0, NULL, clOptions, NULL, NULL);
/*{ // show warnings/errors
static char log[65536]; memset(log, 0, sizeof(log));
cl_device_id device_id = 0;
err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
}*/
if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
cl_kernel kernel1;
cl_kernel kernel2;
kernel1 = clCreateKernel(prog, kernel_nw1, &err);
kernel2 = clCreateKernel(prog, kernel_nw2, &err);
if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
clReleaseProgram(prog);
// creat buffers
cl_mem input_itemsets_d;
cl_mem output_itemsets_d;
cl_mem reference_d;
input_itemsets_d = clCreateBuffer(context, CL_MEM_READ_WRITE, max_cols * max_rows * sizeof(int), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_item_set (size:%d) => %d\n", max_cols * max_rows, err); return -1;}
reference_d = clCreateBuffer(context, CL_MEM_READ_WRITE, max_cols * max_rows * sizeof(int), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer reference (size:%d) => %d\n", max_cols * max_rows, err); return -1;}
output_itemsets_d = clCreateBuffer(context, CL_MEM_READ_WRITE, max_cols * max_rows * sizeof(int), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_item_set (size:%d) => %d\n", max_cols * max_rows, err); return -1;}
//write buffers
err = clEnqueueWriteBuffer(cmd_queue, input_itemsets_d, 1, 0, max_cols * max_rows * sizeof(int), input_itemsets, 0, 0, 0);
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer bufIn1 (size:%d) => %d\n", max_cols * max_rows, err); return -1; }
err = clEnqueueWriteBuffer(cmd_queue, reference_d, 1, 0, max_cols * max_rows * sizeof(int), reference, 0, 0, 0);
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer bufIn2 (size:%d) => %d\n", max_cols * max_rows, err); return -1; }
int worksize = max_cols - 1;
printf("worksize = %d\n", worksize);
//these two parameters are for extension use, don't worry about it.
int offset_r = 0, offset_c = 0;
int block_width = worksize/BLOCK_SIZE ;
clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &reference_d);
clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &input_itemsets_d);
clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &output_itemsets_d);
clSetKernelArg(kernel1, 3, sizeof(cl_int) * (BLOCK_SIZE + 1) *(BLOCK_SIZE+1), (void*)NULL );
clSetKernelArg(kernel1, 4, sizeof(cl_int) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL );
clSetKernelArg(kernel1, 5, sizeof(cl_int), (void*) &max_cols);
clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &penalty);
clSetKernelArg(kernel1, 8, sizeof(cl_int), (void*) &block_width);
clSetKernelArg(kernel1, 9, sizeof(cl_int), (void*) &worksize);
clSetKernelArg(kernel1, 10, sizeof(cl_int), (void*) &offset_r);
clSetKernelArg(kernel1, 11, sizeof(cl_int), (void*) &offset_c);
clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &reference_d);
clSetKernelArg(kernel2, 1, sizeof(void *), (void*) &input_itemsets_d);
clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &output_itemsets_d);
clSetKernelArg(kernel2, 3, sizeof(cl_int) * (BLOCK_SIZE + 1) *(BLOCK_SIZE+1), (void*)NULL );
clSetKernelArg(kernel2, 4, sizeof(cl_int) * BLOCK_SIZE *BLOCK_SIZE, (void*)NULL );
clSetKernelArg(kernel2, 5, sizeof(cl_int), (void*) &max_cols);
clSetKernelArg(kernel2, 6, sizeof(cl_int), (void*) &penalty);
clSetKernelArg(kernel2, 8, sizeof(cl_int), (void*) &block_width);
clSetKernelArg(kernel2, 9, sizeof(cl_int), (void*) &worksize);
clSetKernelArg(kernel2, 10, sizeof(cl_int), (void*) &offset_r);
开发者ID:zwang4,项目名称:dividend,代码行数:67,代码来源:nw.c
示例3: main
//.........这里部分代码省略.........
free(log);
exit(1);
}
printf("program built\n");
printf("\n");
/* Create a Kernel Object */
cl_kernel kernel;
kernel = clCreateKernel(program, "abs_ulong4", &ret);
if (ret != CL_SUCCESS)
{
printf("error: call to 'clCreateKernel' failed\n");
exit(1);
}
/* Create and allocate host buffers */
size_t num_elem = 10;
/* Create and init host side src buffer 0 */
cl_ulong4 *src_0_host_buffer;
src_0_host_buffer = malloc(num_elem * sizeof(cl_ulong4));
for (int i = 0; i < num_elem; i++)
src_0_host_buffer[i] = (cl_ulong4){{2, 2, 2, 2}};
/* Create and init device side src buffer 0 */
cl_mem src_0_device_buffer;
src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong4), NULL, &ret);
if (ret != CL_SUCCESS)
{
printf("error: could not create source buffer\n");
exit(1);
}
ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong4), src_0_host_buffer, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
printf("error: call to 'clEnqueueWriteBuffer' failed\n");
exit(1);
}
/* Create host dst buffer */
cl_ulong4 *dst_host_buffer;
dst_host_buffer = malloc(num_elem * sizeof(cl_ulong4));
memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_ulong4));
/* Create device dst buffer */
cl_mem dst_device_buffer;
dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_ulong4), NULL, &ret);
if (ret != CL_SUCCESS)
{
printf("error: could not create dst buffer\n");
exit(1);
}
/* Set kernel arguments */
ret = CL_SUCCESS;
ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer);
ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer);
if (ret != CL_SUCCESS)
{
printf("error: call to 'clSetKernelArg' failed\n");
exit(1);
}
/* Launch the kernel */
size_t global_work_size = num_elem;
开发者ID:xianggong,项目名称:m2c-llvm-devtools-host,代码行数:67,代码来源:abs_ulong4_src.c
示例4: opencl_thread_init
/*
 * Per-thread OpenCL set-up.
 *
 * Allocates the thread's private opencl_thread_data, selects the
 * kernel-argument queuing callback that matches the chosen kernel,
 * allocates the host-side result buffer and uploads blank_res into the
 * device output buffer with a blocking write.
 *
 * On success thr->cgpu_data owns the new opencl_thread_data and the GPU is
 * marked LIFE_WELL.  On any failure everything allocated here is released,
 * thr->cgpu_data is left NULL (never dangling) and false is returned.
 */
static bool opencl_thread_init(struct thr_info *thr)
{
	const int thr_id = thr->id;
	struct cgpu_info *gpu = thr->cgpu;
	struct opencl_thread_data *thrdata;
	_clState *clState = clStates[thr_id];
	cl_int status;
	/* Either scrypt variant selects the scrypt-sized result buffer. */
	const int buffersize = (opt_scrypt || opt_neoscrypt) ? SCRYPT_BUFFERSIZE : BUFFERSIZE;

	thrdata = calloc(1, sizeof(*thrdata));
	thr->cgpu_data = thrdata;
	if (!thrdata) {
		applog(LOG_ERR, "Failed to calloc in opencl_thread_init");
		return false;
	}

	switch (clState->chosen_kernel) {
	case KL_POCLBM:
		thrdata->queue_kernel_parameters = &queue_poclbm_kernel;
		break;
	case KL_PHATK:
		thrdata->queue_kernel_parameters = &queue_phatk_kernel;
		break;
	case KL_DIAKGCN:
		thrdata->queue_kernel_parameters = &queue_diakgcn_kernel;
		break;
#ifdef USE_SCRYPT
	case KL_SCRYPT:
		thrdata->queue_kernel_parameters = &queue_scrypt_kernel;
		break;
#endif
#ifdef USE_NEOSCRYPT
	case KL_NEOSCRYPT:
		thrdata->queue_kernel_parameters = &queue_neoscrypt_kernel;
		break;
#endif
#ifdef USE_KECCAK
	case KL_KECCAK:
		thrdata->queue_kernel_parameters = &queue_keccak_kernel;
		break;
#endif
	/* Unknown kernels are handled the same way as KL_DIABLO. */
	default:
	case KL_DIABLO:
		thrdata->queue_kernel_parameters = &queue_diablo_kernel;
		break;
	}

	thrdata->res = calloc(buffersize, 1);
	if (!thrdata->res) {
		/* Clear the back-pointer first so nobody can see freed memory. */
		thr->cgpu_data = NULL;
		free(thrdata);
		applog(LOG_ERR, "Failed to calloc in opencl_thread_init");
		return false;
	}

	/* Blocking write: blank_res may be reused as soon as this returns. */
	status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0,
				      buffersize, blank_res, 0, NULL, NULL);
	if (unlikely(status != CL_SUCCESS)) {
		applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
		/* Do not leak the per-thread state on a failed upload. */
		thr->cgpu_data = NULL;
		free(thrdata->res);
		free(thrdata);
		return false;
	}

	gpu->status = LIFE_WELL;
	gpu->device_last_well = time(NULL);

	return true;
}
开发者ID:cqtenq,项目名称:cgminer,代码行数:71,代码来源:driver-opencl.c
示例5: main
int main(int argc, char *argv[])
{
std::string vvadd_kernel_str;
/* Provide names of the OpenCL kernels
* and cl file that they're kept in */
std::string vvadd_name_str =
std::string("vvadd");
std::string vvadd_kernel_file =
std::string("vvadd.cl");
cl_vars_t cv;
cl_kernel vvadd;
/* Read OpenCL file into STL string */
readFile(vvadd_kernel_file,
vvadd_kernel_str);
/* Initialize the OpenCL runtime
* Source in clhelp.cpp */
initialize_ocl(cv);
/* Compile all OpenCL kernels */
compile_ocl_program(vvadd, cv, vvadd_kernel_str.c_str(),
vvadd_name_str.c_str());
/* Arrays on the host (CPU) */
float *h_A, *h_B, *h_Y;
/* Arrays on the device (GPU) */
cl_mem g_A, g_B, g_Y;
/* Allocate arrays on the host
* and fill with random data */
int n = (1<<20);
h_A = new float[n];
h_B = new float[n];
h_Y = new float[n];
bzero(h_Y, sizeof(float)*n);
for(int i = 0; i < n; i++)
{
h_A[i] = (float)drand48();
h_B[i] = (float)drand48();
}
/* CS194: Allocate memory for arrays on
* the GPU */
cl_int err = CL_SUCCESS;
/* CS194: Here's something to get you started */
// creates memory on the device to hold the A and B source arrays, plus the results array Y.
g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
CHK_ERR(err);
g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
CHK_ERR(err);
g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err);
CHK_ERR(err);
/* CS194: Copy data from host CPU to GPU */
// copies the host array A and B to the device.
err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n,
h_A, 0, NULL, NULL);
CHK_ERR(err);
err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n,
h_B, 0, NULL, NULL);
CHK_ERR(err);
/* CS194: Define the global and local workgroup sizes */
size_t global_work_size[1] = {n};
size_t local_work_size[1] = {128};
/* CS194: Set Kernel Arguments */
err = clSetKernelArg(vvadd, 0, sizeof(cl_mem), &g_Y);
CHK_ERR(err);
err = clSetKernelArg(vvadd, 1, sizeof(cl_mem), &g_A);
CHK_ERR(err);
err = clSetKernelArg(vvadd, 2, sizeof(cl_mem), &g_B);
CHK_ERR(err);
err = clSetKernelArg(vvadd, 3, sizeof(int), &n);
CHK_ERR(err);
/* CS194: Call kernel on the GPU */
err = clEnqueueNDRangeKernel(cv.commands,
vvadd,
1,//work_dim,
NULL, //global_work_offset
global_work_size, //global_work_size
local_work_size, //local_work_size
0, //num_events_in_wait_list
NULL, //event_wait_list
NULL //
);
CHK_ERR(err);
/* Read result of GPU on host CPU */
// copies the result array Y from the device back to the host Y.
err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n,
h_Y, 0, NULL, NULL);
CHK_ERR(err);
//.........这里部分代码省略.........
开发者ID:kishkaru,项目名称:OpenCL-projects,代码行数:101,代码来源:vvadd.cpp
示例6: main
int main(int argc, char* argv[])
{
int ciErrNum = 0;
printf("press a key to start\n");
getchar();
const char* vendorSDK = btOpenCLUtils::getSdkVendorName();
printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK);
cl_device_type deviceType = CL_DEVICE_TYPE_GPU;//CL_DEVICE_TYPE_ALL
void* glCtx=0;
void* glDC = 0;
printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n");
g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext);
if (numDev>0)
{
int deviceIndex=0;
cl_device_id device;
device = btOpenCLUtils::getDevice(g_cxMainContext,deviceIndex);
btOpenCLDeviceInfo clInfo;
btOpenCLUtils::getDeviceInfo(device,clInfo);
btOpenCLUtils::printDeviceInfo(device);
const char* globalAtomicsKernelStringPatched = globalAtomicsKernelString;
if (!strstr(clInfo.m_deviceExtensions,"cl_ext_atomic_counters_32"))
{
globalAtomicsKernelStringPatched = findAndReplace(globalAtomicsKernelString,"counter32_t", "volatile __global int*");
}
// create a command-queue
g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
cl_mem counterBuffer = clCreateBuffer(g_cxMainContext, CL_MEM_READ_WRITE, sizeof(int), NULL, &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
char* kernelMethods[] =
{
"globalAtomicKernelOpenCL1_1",
"counterAtomicKernelExt",
"globalAtomicKernelExt",
"globalAtomicKernelCounters32Broken"
};
int numKernelMethods = sizeof(kernelMethods)/sizeof(char*);
for (int i=0;i<numKernelMethods;i++)
{
int myCounter = 0;
//write to counterBuffer
int deviceOffset=0;
int hostOffset=0;
ciErrNum = clEnqueueWriteBuffer(g_cqCommandQue, counterBuffer,CL_FALSE, deviceOffset, sizeof(int), &myCounter, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
g_atomicsKernel = btOpenCLUtils::compileCLKernelFromString(g_cxMainContext,device,globalAtomicsKernelStringPatched,kernelMethods[i], &ciErrNum);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
ciErrNum = clSetKernelArg(g_atomicsKernel, 0, sizeof(cl_mem),(void*)&counterBuffer);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
size_t numWorkItems = workGroupSize*((NUM_OBJECTS + (workGroupSize-1)) / workGroupSize);
ciErrNum = clEnqueueNDRangeKernel(g_cqCommandQue, g_atomicsKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
clFinish(g_cqCommandQue);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
//read from counterBuffer
ciErrNum = clEnqueueReadBuffer(g_cqCommandQue, counterBuffer, CL_TRUE, deviceOffset, sizeof(int), &myCounter, 0, NULL, NULL);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
if (myCounter != NUM_OBJECTS)
{
printf("%s is broken, expected %d got %d\n",kernelMethods[i],NUM_OBJECTS,myCounter);
} else
{
printf("%s success, got %d\n",kernelMethods[i],myCounter);
}
}
clReleaseCommandQueue(g_cqCommandQue);
oclCHECKERROR(ciErrNum, CL_SUCCESS);
}
clReleaseContext(g_cxMainContext);
//.........这里部分代码省略.........
开发者ID:Lunavast,项目名称:bullet-physics,代码行数:101,代码来源:main.cpp
示例7: cl_launch_kernel
//.........这里部分代码省略.........
// t_start = rtclock();
// Set the arguments of the kernel
err_code = clSetKernelArg(clKernel_mean, 0, sizeof(cl_mem), (void *)&mean_mem_obj);
err_code |= clSetKernelArg(clKernel_mean, 1, sizeof(cl_mem), (void *)&data_mem_obj);
err_code |= clSetKernelArg(clKernel_mean, 2, sizeof(DATA_TYPE), (void *)&float_n);
err_code |= clSetKernelArg(clKernel_mean, 3, sizeof(int), (void *)&m);
err_code |= clSetKernelArg(clKernel_mean, 4, sizeof(int), (void *)&n);
if(err_code != CL_SUCCESS)
{
printf("Error in seting arguments1\n");
exit(1);
}
// Execute the OpenCL kernel
err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_mean, 1, NULL, globalWorkSize_Kernel1, localWorkSize_Kernel1, 0, NULL, NULL);
if(err_code != CL_SUCCESS)
{
printf("Error in launching kernel1\n");
exit(1);
}
clEnqueueBarrier(clCommandQue);
// Set the arguments of the kernel
err_code = clSetKernelArg(clKernel_std, 0, sizeof(cl_mem), (void *)&mean_mem_obj);
err_code = clSetKernelArg(clKernel_std, 1, sizeof(cl_mem), (void *)&stddev_mem_obj);
err_code |= clSetKernelArg(clKernel_std, 2, sizeof(cl_mem), (void *)&data_mem_obj);
err_code |= clSetKernelArg(clKernel_std, 3, sizeof(DATA_TYPE), (void *)&float_n);
err_code |= clSetKernelArg(clKernel_std, 4, sizeof(DATA_TYPE), (void *)&eps);
err_code |= clSetKernelArg(clKernel_std, 5, sizeof(int), (void *)&m);
err_code |= clSetKernelArg(clKernel_std, 6, sizeof(int), (void *)&n);
if(err_code != CL_SUCCESS)
{
printf("Error in seting arguments2\n");
exit(1);
}
// Execute the OpenCL kernel
err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_std, 1, NULL, globalWorkSize_Kernel2, localWorkSize_Kernel2, 0, NULL, NULL);
if(err_code != CL_SUCCESS)
{
printf("Error in launching kernel2\n");
exit(1);
}
clEnqueueBarrier(clCommandQue);
// Set the arguments of the kernel
err_code = clSetKernelArg(clKernel_reduce, 0, sizeof(cl_mem), (void *)&mean_mem_obj);
err_code = clSetKernelArg(clKernel_reduce, 1, sizeof(cl_mem), (void *)&stddev_mem_obj);
err_code |= clSetKernelArg(clKernel_reduce, 2, sizeof(cl_mem), (void *)&data_mem_obj);
err_code |= clSetKernelArg(clKernel_reduce, 3, sizeof(DATA_TYPE), (void *)&float_n);
err_code |= clSetKernelArg(clKernel_reduce, 4, sizeof(int), (void *)&m);
err_code |= clSetKernelArg(clKernel_reduce, 5, sizeof(int), (void *)&n);
if(err_code != CL_SUCCESS)
{
printf("Error in seting arguments3\n");
exit(1);
}
// Execute the OpenCL kernel
err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_reduce, 2, NULL, globalWorkSize_Kernel3, localWorkSize_Kernel3, 0, NULL, NULL);
if(err_code != CL_SUCCESS)
{
printf("Error in launching kernel3\n");
exit(1);
}
clEnqueueBarrier(clCommandQue);
// Set the arguments of the kernel
err_code = clSetKernelArg(clKernel_corr, 0, sizeof(cl_mem), (void *)&symmat_mem_obj);
err_code |= clSetKernelArg(clKernel_corr, 1, sizeof(cl_mem), (void *)&data_mem_obj);
err_code |= clSetKernelArg(clKernel_corr, 2, sizeof(int), (void *)&m);
err_code |= clSetKernelArg(clKernel_corr, 3, sizeof(int), (void *)&n);
if(err_code != CL_SUCCESS)
{
printf("Error in seting arguments4\n");
exit(1);
}
// Execute the OpenCL kernel
err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_corr, 1, NULL, globalWorkSize_Kernel4, localWorkSize_Kernel4, 0, NULL, NULL);
if(err_code != CL_SUCCESS)
{
printf("Error in launching kernel4\n");
exit(1);
}
clEnqueueBarrier(clCommandQue);
clEnqueueWriteBuffer(clCommandQue, symmat_mem_obj, CL_TRUE, ((M)*(M+1) + (M))*sizeof(DATA_TYPE), sizeof(DATA_TYPE), &val, 0, NULL, NULL);
clFinish(clCommandQue);
// t_end = rtclock();
// fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start);
}
开发者ID:gfursin,项目名称:cm-ctuning-code-source,代码行数:101,代码来源:correlation.c
示例8: perpendicular
///Builds the row of `width` fixed anchor points for the cape and uploads
///them to a freshly created device buffer of width * 3 floats (x, y, z per
///point).
///
///p1 and p3 are the two end points the row is stretched between. p2 and rot
///are accepted but not read by this function.
///
///Returns the device buffer; the upload is enqueued non-blocking.
compute::buffer cape::fighter_to_fixed_vec(vec3f p1, vec3f p2, vec3f p3, vec3f rot)
{
    ///pull both end points 12% of the way in towards each other
    const float shrink = 0.12f;

    vec3f diff = (p3 - p1) * shrink;

    p3 = p3 - diff;
    p1 = p1 + diff;

    vec3f lpos = p1;
    vec3f rpos = p3;

    ///approximation
    ///could also use body scaling
    float depth = (p3 - p1).length() / 3.f;

    ///we should move perpendicularly away, not zdistance away
    ///so work with the (x, z) components of the p1 -> p3 direction
    vec2f p3_xz = {p3.v[0], p3.v[2]};
    vec2f p1_xz = {p1.v[0], p1.v[2]};

    vec2f ldir = p3_xz - p1_xz;

    vec2f perp = perpendicular(ldir.norm());

    vec3f perp3 = {perp.v[0], 0.f, perp.v[1]};

    lpos = lpos + perp3 * depth;
    rpos = rpos + perp3 * depth;

    lpos.v[1] += bodypart::scale / 4;
    rpos.v[1] += bodypart::scale / 4;

    ///dir could also just be (p3 - p1).rot ???
    vec3f dir = rpos - lpos;

    int len = width;

    ///len - 1 intervals between len points
    vec3f step = dir / (float)(len - 1);

    vec3f current = lpos;

    compute::buffer buf = compute::buffer(cl::context, sizeof(float)*width*3, CL_MEM_READ_WRITE, nullptr);

    ///the host staging vector is sized once, on first use
    if(!cape_init)
    {
        gpu_cape.resize(width * 3);

        cape_init = true;
    }

    float sag = bodypart::scale/32.f;

    for(int i=0; i<len; i++)
    {
        float xf = (float)i / len;

        ///vertical offset: parabola in xf scaled by sag, plus a sine ripple
        float yval = 4 * xf * (xf - 1) * sag + sin(xf * 30);

        gpu_cape[i*3 + 0] = current.v[0];
        gpu_cape[i*3 + 1] = current.v[1] + yval;
        gpu_cape[i*3 + 2] = current.v[2];

        current = current + step;
    }

    ///NOTE(review): non-blocking write out of gpu_cape, which this function
    ///overwrites on every call - confirm the previous transfer has always
    ///completed before the next call touches the vector
    clEnqueueWriteBuffer(cl::cqueue.get(), buf.get(), CL_FALSE, 0, sizeof(cl_float) * width * 3, gpu_cape.data(), 0, NULL, NULL);

    return buf;
}
开发者ID:20k,项目名称:SwordFight,代码行数:82,代码来源:cape.cpp
示例9: init_cl_radix_sort
void init_cl_radix_sort(
int nkeys){
cl_int err;
cl_int status;
/**/
nkeys_rounded=nkeys;
// check some conditions
assert(_TOTALBITS % _BITS == 0);
assert(nkeys % (_GROUPS * _ITEMS) == 0);
assert( (_GROUPS * _ITEMS * _RADIX) % _HISTOSPLIT == 0);
assert(pow(2,(int) log2(_GROUPS)) == _GROUPS);
assert(pow(2,(int) log2(_ITEMS)) == _ITEMS);
// init the timers
histo_time=0;
scan_time=0;
reorder_time=0;
transpose_time=0;
//printf("Construct the random list\n");
// construction of a random list
uint maxint=_MAXINT;
assert(_MAXINT != 0);
h_checkKeys = (uint*)malloc(sizeof(uint)*nkeys);
h_Permut = (uint*)malloc(sizeof(uint)*nkeys);
// construction of the initial permutation
for(uint i = 0; i < nkeys; i++){
//printf("%d, ",i);
h_Permut[i] = i;
h_checkKeys[i]=h_keys[i];
}
printf("Send to the GPU\n");
// copy on the GPU
d_inKeys = clCreateBuffer(context,
CL_MEM_READ_WRITE,
sizeof(uint)* nkeys ,
NULL,
&err);
assert(err == CL_SUCCESS);
d_outKeys = clCreateBuffer(context,
CL_MEM_READ_WRITE,
sizeof(uint)* nkeys ,
NULL,
&err);
assert(err == CL_SUCCESS);
d_inPermut = clCreateBuffer(context,
CL_MEM_READ_WRITE,
sizeof(uint)* nkeys ,
NULL,
&err);
assert(err == CL_SUCCESS);
d_outPermut = clCreateBuffer(context,
CL_MEM_READ_WRITE,
sizeof(uint)* nkeys ,
NULL,
&err);
assert(err == CL_SUCCESS);
////////////////////////////////////////////////////////////////////////////////
//copy the two previous vectors to the device
//cl_radix_host2gpu();
////////////////////////////////////////////////////////////////////////////////
status = clEnqueueWriteBuffer( command_que,
d_inKeys,
CL_TRUE, 0,
sizeof(uint) * nkeys,
h_keys,
0, NULL, NULL );
//.........这里部分代码省略.........
开发者ID:Quadra-H,项目名称:QH-GPU,代码行数:101,代码来源:cl_radix_sort.c
示例10: materializeCol
void * materializeCol(struct materializeNode * mn, struct clContext * context, struct statistic * pp){
struct timespec start,end;
clock_gettime(CLOCK_REALTIME,&start);
cl_event ndrEvt;
cl_ulong startTime, endTime;
struct tableNode *tn = mn->table;
char * res;
cl_mem gpuResult;
cl_mem gpuAttrSize;
long totalSize = tn->tupleNum * tn->tupleSize;
cl_int error = 0;
cl_mem gpuContent = clCreateBuffer(context->context, CL_MEM_READ_ONLY, totalSize, NULL, &error);
gpuResult = clCreateBuffer(context->context, CL_MEM_READ_WRITE, totalSize, NULL, &error);
gpuAttrSize = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*tn->totalAttr,NULL,&error);
clEnqueueWriteBuffer(context->queue,gpuAttrSize,CL_TRUE,0,sizeof(int)*tn->totalAttr,tn->attrSize,0,0,&ndrEvt);
clWaitForEvents(1, &ndrEvt);
clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
pp->pcie += 1e-6 * (endTime - startTime);
res = (char *) malloc(totalSize);
long offset = 0;
long *colOffset = (long*)malloc(sizeof(long)*tn->totalAttr);
for(int i=0;i<tn->totalAttr;i++){
colOffset[i] = offset;
int size = tn->tupleNum * tn->attrSize[i];
if(tn->dataPos[i] == MEM){
clEnqueueWriteBuffer(context->queue,gpuContent,CL_TRUE,offset,size,tn->content[i],0,0,&ndrEvt);
clWaitForEvents(1, &ndrEvt);
clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
pp->pcie += 1e-6 * (endTime - startTime);
}else
clEnqueueCopyBuffer(context->queue,(cl_mem)tn->content[i],gpuContent,0,offset,size,0,0,0);
offset += size;
}
cl_mem gpuColOffset = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(long)*tn->totalAttr,NULL,&error);
clEnqueueWriteBuffer(context->queue,gpuColOffset,CL_TRUE,0,sizeof(long)*tn->totalAttr,colOffset,0,0,&ndrEvt);
clWaitForEvents(1, &ndrEvt);
clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
pp->pcie += 1e-6 * (endTime - startTime);
size_t globalSize = 512;
size_t localSize = 128;
context->kernel = clCreateKernel(context->program,"materialize",0);
clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent);
clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuColOffset);
clSetKernelArg(context->kernel,2,sizeof(int), (void*)&tn->totalAttr);
clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuAttrSize);
clSetKernelArg(context->kernel,4,sizeof(long), (void*)&tn->tupleNum);
clSetKernelArg(context->kernel,5,sizeof(int), (void*)&tn->tupleSize);
clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuResult);
clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,0);
clEnqueueReadBuffer(context->queue,gpuResult,CL_TRUE,0,totalSize,res,0,0,&ndrEvt);
clWaitForEvents(1, &ndrEvt);
clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0);
clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0);
pp->pcie += 1e-6 * (endTime - startTime);
free(colOffset);
clFinish(context->queue);
clReleaseMemObject(gpuColOffset);
clReleaseMemObject(gpuContent);
clReleaseMemObject(gpuAttrSize);
clReleaseMemObject(gpuResult);
clock_gettime(CLOCK_REALTIME,&end);
double timeE = (end.tv_sec - start.tv_sec)* BILLION + end.tv_nsec - start.tv_nsec;
printf("Materialization Time: %lf\n", timeE/(1000*1000));
return res;
}
开发者ID:CodingCat,项目名称:gpudb,代码行数:92,代码来源:materialize.cpp
示例11: main
int main(int argc, char **argv)
{
cl_int err = 0;
cl_context context = 0;
cl_device_id * devices = NULL;
cl_command_queue queue = 0;
cl_program program = 0;
cl_mem cl_a = 0, cl_b = 0, cl_res = 0;
cl_kernel adder = 0;
cl_event event;
// The iteration variable
int i;
// Define our data set
cl_float a[DATA_SIZE], b[DATA_SIZE], res[DATA_SIZE];
// Initialize array
srand(time(0));
for (i = 0; i < DATA_SIZE; i++) {
a[i] = (rand() % 100) / 100.0;
b[i] = (rand() % 100) / 100.0;
res[i] = 0;
}
check_release(get_cl_context(&context, &devices, 0) == false,
"Fail to create context");
// Specify the queue to be profile-able
queue = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, 0);
check_release(queue == NULL, "Can't create command queue");
program = load_program(context, devices[0], "shader.cl");
check_release(program == NULL, "Fail to build program");
cl_a =
clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);
cl_b =
clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);
cl_res = clCreateBuffer(
context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL);
if (cl_a == 0 || cl_b == 0 || cl_res == 0) {
printf("Can't create OpenCL buffer\n");
goto release;
}
check_release(clEnqueueWriteBuffer(
queue, cl_a, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, a, 0, 0, 0),
"Write Buffer 1");
check_release(clEnqueueWriteBuffer(
queue, cl_b, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, b, 0, 0, 0),
"Write Buffer 2");
adder = clCreateKernel(program, "adder", &err);
if (err == CL_INVALID_KERNEL_NAME) printf("CL_INVALID_KERNEL_NAME\n");
check_release(adder == NULL, "Can't load kernel");
clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a);
clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_b);
clSetKernelArg(adder, 2, sizeof(cl_mem), &cl_res);
size_t work_size = DATA_SIZE;
check_release(clEnqueueNDRangeKernel(queue, adder, 1, 0, &work_size, 0, 0, 0, &event),
"Can't enqueue kernel");
check_release(
clEnqueueReadBuffer(
queue, cl_res, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, res, 0, 0, 0),
"Can't enqueue read buffer");
clWaitForEvents(1, &event);
printf("Execution Time: %.04lf ms\n\n", get_event_exec_time(event));
// Make sure everything is done before we do anything
clFinish(queue);
err = 0;
for (i = 0; i < DATA_SIZE; i++) {
if (res[i] != a[i] + b[i]) {
printf("%f + %f = %f(answer %f)\n", a[i], b[i], res[i], a[i] + b[i]);
err++;
}
}
if (err == 0)
printf("Validation passed\n");
else
printf("Validation failed\n");
printf("------\n");
//--------------------------------
// Second test
for (i = 0; i < DATA_SIZE; i++) {
a[i] = i;
b[i] = i;
res[i] = 0;
}
check_err(clEnqueueWriteBuffer(
queue, cl_a, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, a, 0, 0, 0),
"Write Buffer 1");
check_err(clEnqueueWriteBuffer(
queue, cl_b, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, b, 0, 0, 0),
"Write Buffer 2");
//.........这里部分代码省略.........
开发者ID:MedicineYeh,项目名称:vector_add_sample,代码行数:101,代码来源:main.c
示例12: main
//.........这里部分代码省略.........
(const char **) &source, &program_length, &ciErrNum);
shrCheckError(ciErrNum, CL_SUCCESS);
// build the program
ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-mad-enable", NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
    // write out standard error, Build Log and PTX, then cleanup and exit
    shrLog(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
    oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
    oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx");
    shrCheckError(ciErrNum, CL_SUCCESS);
}

// create the kernel
ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum);
shrCheckError(ciErrNum, CL_SUCCESS);

// set the args values
// Args 0-2 are buffer objects. Args 3-8 pass a size with a NULL value, which in
// the OpenCL API is how the size of a __local kernel argument is specified.
ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]);
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]);
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]);
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(float) * 4 * 16, NULL);
ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(float) * 4 * 16, NULL);
ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int) * 64, NULL);
ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(float) * 16 * 6, NULL);
ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(unsigned int) * 160, NULL);
ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(int) * 16, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);

shrLog(LOGBOTH, 0, "Running DXT Compression on %u x %u image...\n\n", width, height);

// Upload the image (non-blocking write). The returned status is checked like
// every other OpenCL call in this function, so a failed upload is reported
// instead of being silently followed by a kernel launch on stale data.
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0,
                                sizeof(cl_uint) * width * height, block_image,
                                0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);

// set work-item dimensions
szGlobalWorkSize[0] = width * height * (NUM_THREADS/16);
szLocalWorkSize[0]= NUM_THREADS;

#ifdef GPU_PROFILING
int numIterations = 100;
// i starts at -1 so that the first pass is an untimed warmup.
for (int i = -1; i < numIterations; ++i) {
    if (i == 0) { // start timing only after the first warmup iteration
        clFinish(cqCommandQueue); // flush command queue
        shrDeltaT(0); // start timer
    }
#endif
    // execute kernel
    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL,
                                      szGlobalWorkSize, szLocalWorkSize,
                                      0, NULL, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
#ifdef GPU_PROFILING
}
// Wait for all timed launches to complete before reading the timer.
clFinish(cqCommandQueue);
double dAvgTime = shrDeltaT(0) / (double)numIterations;
shrLog(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %i\n",
       (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1);
#endif

// blocking read output
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0,
                               compressedSize, h_result, 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
开发者ID:ajaykumarkannan,项目名称:simulation-opencl,代码行数:66,代码来源:oclDXTCompression.cpp
示例13: main
//.........这里部分代码省略.........
return 0;
}
}
// Create the context, command queue and program; each step reports the OpenCL
// error code and leaves main on failure.
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if (err != CL_SUCCESS) {
    printf("Unable to create context. Error: %d\n", err);
    return 0;
}
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
if (err != CL_SUCCESS) {
    printf("Unable to create command queue. Error: %d\n", err);
    return 0;
}
program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err);
if (err != CL_SUCCESS) {
    printf("Unable to create program. Error: %d\n", err);
    return 0;
}

// Build the program exactly once. On failure print the build log followed by
// the error code. (The original built the program a second time after a
// successful first build, which was redundant.)
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err != CL_SUCCESS) {
    size_t size = 0;
    // 1. Query the length of the build log.
    clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size);
    char *log = (char *)malloc(size + 1);
    if (log) {
        // 2. Fetch the build log and make sure it is NUL-terminated.
        clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL);
        log[size] = '\0';
        printf("%s", log);
        free(log);
    }
    printf("Error building program. Error: %d\n", err);
    return 1;
}

kernel = clCreateKernel(program, "matmult_ocl", &err);
if (err != CL_SUCCESS) {
    printf("Error setting kernel. Error: %d\n", err);
    return 0;
}

// Device buffers: two input matrices (d1*d2 and d2*d3 floats), four ints taken
// from d, and the d1*d3 float result. err is checked after every allocation so
// that a later success cannot mask an earlier failure.
input1 = clCreateBuffer(context, CL_MEM_READ_ONLY, d1*d2*sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
    printf("Unable to create buffer input1. Error: %d\n", err);
    return 0;
}
input2 = clCreateBuffer(context, CL_MEM_READ_ONLY, d2*d3*sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
    printf("Unable to create buffer input2. Error: %d\n", err);
    return 0;
}
input3 = clCreateBuffer(context, CL_MEM_READ_ONLY, 4 * sizeof(int), NULL, &err);
if (err != CL_SUCCESS) {
    printf("Unable to create buffer input3. Error: %d\n", err);
    return 0;
}
output = clCreateBuffer(context, CL_MEM_READ_WRITE, d1*d3*sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
    printf("Unable to create buffer output. Error: %d\n", err);
    return 0;
}

// The timed region covers the uploads, the kernel run and the download.
start_time = omp_get_wtime();

// Blocking uploads; evaluation stops at the first failing call so err holds
// that call's status.
if ((err = clEnqueueWriteBuffer(command_queue, input1, CL_TRUE, 0, d1*d2*sizeof(float), *A, 0, NULL, NULL)) != CL_SUCCESS ||
    (err = clEnqueueWriteBuffer(command_queue, input2, CL_TRUE, 0, d2*d3*sizeof(float), *B, 0, NULL, NULL)) != CL_SUCCESS ||
    (err = clEnqueueWriteBuffer(command_queue, input3, CL_TRUE, 0, 4 * sizeof(int), d, 0, NULL, NULL)) != CL_SUCCESS) {
    printf("Unable to write input buffers. Error: %d\n", err);
    return 0;
}

if ((err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input1)) != CL_SUCCESS ||
    (err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &input2)) != CL_SUCCESS ||
    (err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input3)) != CL_SUCCESS ||
    (err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &output)) != CL_SUCCESS) {
    printf("Unable to set kernel arguments. Error: %d\n", err);
    return 0;
}

err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL);
if (err != CL_SUCCESS) {
    printf("Unable to enqueue kernel. Error: %d\n", err);
    return 0;
}
err = clFinish(command_queue);
if (err != CL_SUCCESS) {
    printf("Error while running kernel. Error: %d\n", err);
    return 0;
}
err = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, d1*d3*sizeof(float), *C, 0, NULL, NULL);
if (err != CL_SUCCESS) {
    printf("Unable to read result buffer. Error: %d\n", err);
    return 0;
}
openCL_time = omp_get_wtime() - start_time;

// Release all OpenCL objects before the serial reference run.
clReleaseMemObject(input1);
clReleaseMemObject(input2);
clReleaseMemObject(input3);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);

// Time the serial implementation on the same inputs and compare the results.
printf("Running serial algorithm...\n");
start_time = omp_get_wtime();
serialC = mult_mat(A, B, d1, d2, d3);
serial_time = omp_get_wtime() - start_time;

printf("Checking results... ");
is_correct(C, serialC, d1, d3);

printf("Showing stats...\n");
printf(" serial runtime = %f\n", serial_time);
printf(" OpenCL runtime = %f\n", openCL_time);
printf(" Speedup = %f\n", serial_time / openCL_time);
return 0;
}
开发者ID:yorrickslr,项目名称:pvs6,代码行数:101,代码来源:matmult_var4.cpp
示例14: main
//.........这里部分代码省略.........
checkError(err, "creating context");

queue = clCreateCommandQueue(context, device, 0, &err);
checkError(err, "creating command queue");

program = clCreateProgramWithSource(context, 1, &KERNEL_SOURCE, NULL, &err);
checkError(err, "creating program");

// Build for the single device; on a compile failure print the build log to
// stderr before checkError sees the status.
err = clBuildProgram(program, 1, &device, "", NULL, NULL);
if (err == CL_BUILD_PROGRAM_FAILURE)
{
    // Size query: the destination is NULL, so the value size is passed as 0.
    size_t sz = 0;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                          0, NULL, &sz);

    // One extra byte guarantees NUL termination; the log is freed after
    // printing (the original leaked it and did not check the allocation).
    char *buildLog = malloc(sz + 1);
    if (buildLog)
    {
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              sz, buildLog, NULL);
        buildLog[sz] = '\0';
        fprintf(stderr, "%s\n", buildLog);
        free(buildLog);
    }
}
checkError(err, "building program");

kernel = clCreateKernel(program, "vecadd", &err);
checkError(err, "creating kernel");

size_t dataSize = N*sizeof(cl_float);

// Initialise host data
srand(0);
h_a = malloc(dataSize);
h_b = malloc(dataSize);
h_c = malloc(dataSize);
if (!h_a || !h_b || !h_c)
{
    // Do not index into a failed allocation; free(NULL) is a no-op.
    fprintf(stderr, "failed to allocate host buffers\n");
    free(h_a);
    free(h_b);
    free(h_c);
    return 1;
}
for (int i = 0; i < N; i++)
{
    h_a[i] = rand()/(float)RAND_MAX;
    h_b[i] = rand()/(float)RAND_MAX;
    h_c[i] = 0;
}

// Device buffers: two read-only inputs and one write-only output.
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
checkError(err, "creating d_a buffer");
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err);
checkError(err, "creating d_b buffer");
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &err);
checkError(err, "creating d_c buffer");

// Non-blocking uploads; the host arrays stay alive until after clFinish below.
err = clEnqueueWriteBuffer(queue, d_a, CL_FALSE,
                           0, dataSize, h_a, 0, NULL, NULL);
checkError(err, "writing d_a data");
err = clEnqueueWriteBuffer(queue, d_b, CL_FALSE,
                           0, dataSize, h_b, 0, NULL, NULL);
checkError(err, "writing d_b data");
err = clEnqueueWriteBuffer(queue, d_c, CL_FALSE,
                           0, dataSize, h_c, 0, NULL, NULL);
checkError(err, "writing d_c data");

err  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
checkError(err, "setting kernel args");

err = clEnqueueNDRangeKernel(queue, kernel,
                             1, NULL, &global, NULL, 0, NULL, NULL);
checkError(err, "enqueuing kernel");

err = clFinish(queue);
checkError(err, "running kernel");

// Blocking download of the result.
err = clEnqueueReadBuffer(queue, d_c, CL_TRUE,
                          0, dataSize, h_c, 0, NULL, NULL);
checkError(err, "reading d_c data");

// Check results: compare every element with the host-side sum, printing at
// most MAX_ERRORS mismatches but counting all of them.
int errors = 0;
for (int i = 0; i < N; i++)
{
    float ref = h_a[i] + h_b[i];
    if (fabs(ref - h_c[i]) > TOL)
    {
        if (errors < MAX_ERRORS)
        {
            fprintf(stderr, "%4d: %.4f != %.4f\n", i, h_c[i], ref);
        }
        errors++;
    }
}
printf("%d errors detected\n", errors);

free(h_a);
free(h_b);
free(h_c);
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);

// Exit status is non-zero when any mismatch was found.
return (errors != 0);
}
开发者ID:lalanne,项目名 |
请发表评论