本文整理汇总了C++中clCreateBuffer函数的典型用法代码示例。如果您正苦于以下问题:C++ clCreateBuffer函数的具体用法?C++ clCreateBuffer怎么用?C++ clCreateBuffer使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了clCreateBuffer函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。
示例1: help
//.........这里部分代码省略.........
std::cerr << "Retrieving OpenCL platforms" << std::endl;
error = clGetPlatformIDs(4, platforms, &platformsNumber);
if(error != CL_SUCCESS) {
throw OpenclError(error, "Unable to retrieve the OpenCL platforms");
}
if(platformId >= platformsNumber) {
throw std::runtime_error("No platform found with the provided id");
}
std::cerr << "Retrieving OpenCL GPU devices" << std::endl;
error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber);
if(error != CL_SUCCESS) {
throw OpenclError(error, "Unable to retrieve the OpenCL devices");
}
if(deviceId >= devicesNumber) {
throw std::runtime_error("No device found with the provided id");
}
std::cerr << "Creating OpenCL context" << std::endl;
context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error);
if(error != CL_SUCCESS) {
throw OpenclError(error, "Unable to create the OpenCL context");
}
std::cerr << "Creating OpenCL command queue" << std::endl;
commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error);
if(error != CL_SUCCESS) {
throw OpenclError(error, "Unable to create the OpenCL command queue");
}
std::cerr << "Creating OpenCL GPU generation buffer" << std::endl;
bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error);
if(error != CL_SUCCESS) {
throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer");
}
std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl;
bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error);
if(error != CL_SUCCESS) {
throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer");
}
std::cerr << "Creating OpenCL program" << std::endl;
std::string source = loadSource("kernel/nonce.cl");
const char* sources[] = {source.c_str()};
size_t sourcesLength[] = {source.length()};
program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error);
if(error != CL_SUCCESS) {
throw OpenclError(error, "Unable to create the OpenCL program");
}
std::cerr << "Building OpenCL program" << std::endl;
error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0);
if(error != CL_SUCCESS) {
size_t logSize;
clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize);
char* log = new char[logSize];
clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0);
std::cerr << log << std::endl;
delete[] log;
throw OpenclError(error, "Unable to build the OpenCL program");
}
开发者ID:coventry,项目名称:BurstGPUPlotter,代码行数:67,代码来源:CommandGenerate.cpp
示例2: bpnn_train_kernel
//.........这里部分代码省略.........
float *input_weights_one_dim;
float *input_weights_prev_one_dim;
float * partial_sum;
float sum;
float num_blocks = in / BLOCK_SIZE;
input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
// set global and local workitems
size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 };
size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
// todo: fix mem allocation
int m = 0;
for (int k = 0; k <= in; k++) {
for (int j = 0; j <= hid; j++) {
input_weights_one_dim[m] = net->input_weights[k][j];
input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
m++;
}
}
cl_mem input_hidden_ocl;
cl_mem input_ocl;
cl_mem output_hidden_ocl;
cl_mem hidden_partial_sum;
cl_mem hidden_delta_ocl;
cl_mem input_prev_weights_ocl;
input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
printf("Performing GPU computation\n");
//write buffers
err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0);
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0);
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
clSetKernelArg(kernel1, 4, sizeof(float) * HEIGHT, (void*)NULL );
clSetKernelArg(kernel1, 5, sizeof(float ) * HEIGHT * WIDTH, (void*)NULL );
clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 3, NULL, global_work, local_work, 0, NULL, 0);
if(err == CL_INVALID_KERNEL) {printf("Error is invalid kernel\n");}
if(err != CL_SUCCESS) { printf("ERROR: 1 kernel1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
开发者ID:shvo,项目名称:Rodinia-FPGA,代码行数:67,代码来源:backprop_ocl_fpga.cpp
示例3: execute
void execute(float *grid, size_t gridSize, unsigned int width, unsigned int workGroupSize, unsigned int iterations, bool printResult) {
cl_context context;
cl_command_queue commandQueue;
cl_program program;
cl_kernel kernel;
size_t dataBytes, kernelLength;
cl_int errorCode;
cl_mem gridBuffer;
cl_device_id* devices;
cl_device_id gpu;
cl_uint numPlatforms;
errorCode = clGetPlatformIDs(0, NULL, &numPlatforms);
cl_platform_id platforms[numPlatforms];
errorCode = clGetPlatformIDs(numPlatforms, platforms, NULL);
checkError(errorCode);
cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (int) platforms[0], 0};
context = clCreateContextFromType(properties, CL_DEVICE_TYPE_ALL, 0, NULL, &errorCode);
checkError(errorCode);
errorCode = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes);
devices = malloc(dataBytes);
errorCode |= clGetContextInfo(context, CL_CONTEXT_DEVICES, dataBytes, devices, NULL);
gpu = devices[0];
commandQueue = clCreateCommandQueue(context, gpu, 0, &errorCode);
checkError(errorCode);
gridBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, gridSize, grid, &errorCode);
checkError(errorCode);
const char* programBuffer = readFile("kernel.cl");
kernelLength = strlen(programBuffer);
program = clCreateProgramWithSource(context, 1, (const char **)&programBuffer, &kernelLength, &errorCode);
checkError(errorCode);
errorCode = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (errorCode == CL_BUILD_PROGRAM_FAILURE) {
// Determine the size of the log
size_t log_size;
clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
// Allocate memory for the log
char *log = (char *) malloc(log_size);
// Get the log
clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
// Print the log
free(log);
printf("%s\n", log);
}
checkError(errorCode);
kernel = clCreateKernel(program, "diffuse", &errorCode);
checkError(errorCode);
size_t localWorkSize[2] = {workGroupSize, workGroupSize}, globalWorkSize[2] = {width, width};
errorCode |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&gridBuffer);
errorCode |= clSetKernelArg(kernel, 1, sizeof(float) * workGroupSize * workGroupSize, NULL);
errorCode |= clSetKernelArg(kernel, 2, sizeof(int), (void *)&width);
errorCode |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&workGroupSize);
errorCode |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&iterations);
checkError(errorCode);
errorCode = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL);
checkError(errorCode);
errorCode = clEnqueueReadBuffer(commandQueue, gridBuffer, CL_TRUE, 0, gridSize, grid, 0, NULL, NULL);
checkError(errorCode);
free(devices);
free((void *)programBuffer);
clReleaseContext(context);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(commandQueue);
}
开发者ID:scrblnrd3,项目名称:GPGPU-Diffusion,代码行数:91,代码来源:main.c
示例4: initGPU
/*
 * Sets up the file-level OpenCL state used by the rest of the program:
 * selects a device (GPU if available, otherwise CPU), creates the context and
 * command queue, builds kernel.cl, creates the "add" kernel and allocates
 * five device buffers of n ints each.
 *
 * n - number of int elements in every device buffer.
 *
 * Returns the last OpenCL status code (CL_SUCCESS when everything worked).
 * Failures are caught with assert(), so they only stop the program in builds
 * where assertions are enabled.
 */
int initGPU(int n)
{
    #pragma mark Device Information
    // Find the CPU CL device, as a fallback
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL);
    assert(err == CL_SUCCESS);

    // Find the GPU CL device, this is what we really want.
    // If no GPU device is CL capable, fall back to the CPU. The failed lookup
    // must not stay in err, otherwise the next assert fires and the fallback
    // can never take effect.
    err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        device = cpu;
        err = CL_SUCCESS;
    }
    assert(device);

    // Get some information about the returned device
    cl_char vendor_name[1024] = {0};
    cl_char device_name[1024] = {0};
    err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size);
    err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size);
    assert(err == CL_SUCCESS);
    printf("Connecting to %s %s...", vendor_name, device_name);

    #pragma mark Context and Command Queue
    // Now create a context to perform our calculation with the
    // specified device
    context = clCreateContext(0, 1, &device, NULL, NULL, &err);
    assert(err == CL_SUCCESS);

    // And also a command queue for the context (status is now checked too)
    cmd_queue = clCreateCommandQueue(context, device, 0, &err);
    assert(err == CL_SUCCESS);

    #pragma mark Program and Kernel Creation
    // Load the program source from disk
    // The kernel/program is the project directory and in Xcode the executable
    // is set to launch from that directory hence we use a relative path
    const char * filename = "kernel.cl";
    char *program_source = load_program_source(filename);
    assert(program_source != NULL);
    program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, &err);
    assert(err == CL_SUCCESS);

    err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
    assert(err == CL_SUCCESS);

    // Now create the kernel "objects" that we want to use in the example file
    kernel[0] = clCreateKernel(program[0], "add", &err);
    assert(err == CL_SUCCESS);

    #pragma mark Memory Allocation
    // Allocate memory on the device to hold our data and store the results into.
    // Each call overwrites err, so every allocation is checked individually.
    buffer_size = sizeof(int) * n;

    mem_c_position = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
    assert(err == CL_SUCCESS);
    mem_c_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
    assert(err == CL_SUCCESS);
    mem_p_angle = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
    assert(err == CL_SUCCESS);
    mem_p_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err);
    assert(err == CL_SUCCESS);

    mem_fitness = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size, NULL, &err);
    assert(err == CL_SUCCESS);

    // Get all of the stuff written and allocated
    clFinish(cmd_queue);

    printf(" done\n");

    return err; // CL_SUCCESS
}
开发者ID:Mikulas,项目名称:PoleBalanceGPU,代码行数:66,代码来源:main.c
示例5: magma_cgeqrf2_2q_gpu
//.........这里部分代码省略.........
} else if (n < 0) {
*info = -2;
} else if (ldda < max(1,m)) {
*info = -4;
}
if (*info != 0) {
magma_xerbla( __func__, -(*info) );
return *info;
}
k = min(m,n);
if (k == 0)
return MAGMA_SUCCESS;
nb = magma_get_cgeqrf_nb(m);
lwork = (m+n) * nb;
lhwork = lwork - (m)*nb;
if ( MAGMA_SUCCESS != magma_cmalloc( &dwork, n*nb )) {
*info = MAGMA_ERR_DEVICE_ALLOC;
return *info;
}
/*
if ( MAGMA_SUCCESS != magma_cmalloc_cpu( &work, lwork ) ) {
*info = MAGMA_ERR_HOST_ALLOC;
magma_free( dwork );
return *info;
}
*/
cl_mem buffer = clCreateBuffer(gContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(magmaFloatComplex)*lwork, NULL, NULL);
work = (magmaFloatComplex*)clEnqueueMapBuffer(queues[0], buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, lwork*sizeof(magmaFloatComplex), 0, NULL, NULL, NULL);
nbmin = 2;
nx = 2*nb;
ldwork = m;
lddwork= n;
if (nb >= nbmin && nb < k && nx < k) {
/* Use blocked code initially */
old_i = 0; old_ib = nb;
for (i = 0; i < k-nx; i += nb) {
ib = min(k-i, nb);
rows = m -i;
magma_cgetmatrix_async(rows, ib, dA(i, i), ldda, work_ref(i), 0, ldwork, queues[0], NULL);
clFlush(queues[0]);
if (i>0){
/* Apply H' to A(i:m,i+2*ib:n) from the left */
magma_clarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
m-old_i, n-old_i-2*old_ib, old_ib,
dA(old_i, old_i ), ldda, dwork,0, lddwork,
dA(old_i, old_i+2*old_ib), ldda, dwork,old_ib, lddwork, queues[1]);
magma_csetmatrix_async( old_ib, old_ib, work_ref(old_i), 0, ldwork,
dA(old_i, old_i), ldda, queues[1], NULL);
clFlush(queues[1]);
}
magma_queue_sync(queues[0]);
lapackf77_cgeqrf(&rows, &ib, work_ref(i), &ldwork, tau+i, hwork, &lhwork, info);
/* Form the triangular factor of the block reflector
开发者ID:EmergentOrder,项目名称:clmagma,代码行数:67,代码来源:cgeqrf2_2q_gpu.cpp
示例6: clGetPlatformIDs
//.........这里部分代码省略.........
maxWorkItemSizes = (size_t*)malloc(maxDimensions*sizeof(size_t));
status = clGetDeviceInfo(devices[deviceId],
CL_DEVICE_MAX_WORK_ITEM_SIZES,
sizeof(size_t)*maxDimensions,
(void *)maxWorkItemSizes,
NULL);
if(!sampleCommon->checkVal(status,
CL_SUCCESS,
"clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed."))
return SDK_FAILURE;
{
/* The block is to move the declaration of prop closer to its use */
cl_command_queue_properties prop = 0;
prop |= CL_QUEUE_PROFILING_ENABLE;
commandQueue = clCreateCommandQueue(context,
devices[deviceId],
prop,
&status);
if(!sampleCommon->checkVal(status,
0,
"clCreateCommandQueue failed."))
return SDK_FAILURE;
}
/* Input buffer */
inputBuffer = clCreateBuffer(context,
CL_MEM_READ_ONLY,
sizeof(cl_float4) * length,
0,
&status);
if(!sampleCommon->checkVal(status,
CL_SUCCESS,
"clCreateBuffer failed. (inputBuffer)"))
return SDK_FAILURE;
/* Write data to buffer */
status = clEnqueueWriteBuffer(commandQueue,
inputBuffer,
1,
0,
sizeof(cl_float4) * length,
input,
0,
0,
0);
if(!sampleCommon->checkVal(status,
CL_SUCCESS,
"clEnqueueWriteBuffer failed. (inputBuffer)"))
return SDK_FAILURE;
outputBuffer = clCreateBuffer(context,
CL_MEM_WRITE_ONLY,
sizeof(cl_float4) * length,
0,
&status);
if(!sampleCommon->checkVal(status,
CL_SUCCESS,
开发者ID:pbains,项目名称:m2s-bench-amdapp-2.5,代码行数:67,代码来源:MemoryOptimizations.cpp
示例7: main
//.........这里部分代码省略.........
return 2;
}
INTG iTypicalWorkgroupNo = TheGPAK->TheMaxWorkGroupSizes[0];
INTG iExpOutputSize = ioutsize(iGlobalSize, iTypicalWorkgroupNo);
FLPT * fExpDotProdResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT));
FLPT * fExpReduceResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT));
fdotprodexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpDotProdResult);
freduceexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpReduceResult);
// printvector("dot prod", iExpOutputSize, fExpDotProdResult);
// printvector("reduce", iExpOutputSize, fExpReduceResult);
FLPT* inputDataF = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
SetFIncrease(iGlobalSize, inputDataF);
// For the dot product.
FLPT* outputDataD = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
SetFNull(iGlobalSize, outputDataD);
// For the reduction.
FLPT* outputDataR = (FLPT *) malloc(iGlobalSize * sizeof(FLPT));
SetFNull(iGlobalSize, outputDataR);
struct timespec start[iNoKernels];
struct timespec end[iNoKernels];
// create buffers for the input and ouput
int err;
cl_mem inputF, outputF, outputAll;
inputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_READ_ONLY, iGlobalSize * sizeof(FLPT), NULL, &err);
if (err != CL_SUCCESS)
{
printf("Error allocating for F");
return 3;
}
outputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err);
if (err != CL_SUCCESS)
{
printf("Error allocating for output 7");
return 9;
}
outputAll = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err);
if (err != CL_SUCCESS)
{
printf("Error allocating for output 8");
return 9;
}
clEnqueueWriteBuffer(TheGCAQ->TheQueue, inputF, CL_TRUE, 0, iGlobalSize * sizeof(FLPT), inputDataF, 0, NULL, NULL);
int iRep;
int iKernel;
int i;
int iLengthTotal = iGlobalSize;
size_t iGlobalWorkThing = iGlobalSize;
int iSomething = 1;
for (iKernel = 0; iKernel < iNoKernels; iKernel++)
{
for (i = 0; i < iLengthTotal; i++)
{
开发者ID:peterkmurphy,项目名称:thesis-control,代码行数:67,代码来源:dotproducttest.c
示例8: clCreateBuffer
// Create the data array in device memory for our calculation
//
cl_mem device_$arg_ref = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof($arg_ref[0]) * grid_size, NULL, NULL);
if (!device_$arg_ref)
{
printf("Error: Failed to allocate device memory!\n");
return err;
}
// Write our data set into the data array in device memory
//
err = clEnqueueWriteBuffer(commands, device_$arg_ref, CL_TRUE, 0, sizeof($arg_ref[0]) * grid_size, $arg_ref, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
return err;
}
// Set the arguments to our compute kernel
//
err = clSetKernelArg(kernel, $arg_index, sizeof(cl_mem), &device_$arg_ref);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
return err;
}
开发者ID:lowks,项目名称:stencil_code,代码行数:26,代码来源:OclLoadGrid.tmpl.c
示例9: test_csrmv
//.........这里部分代码省略.........
}
if (typeid(T) == typeid(cl_double) )
{
status = clsparseDcsrmv(&gAlpha, &CSRE::csrDMatrix, &gX,
&gBeta, &gY, CLSE::control);
ASSERT_EQ(clsparseSuccess, status);
double* vals = (double*)&CSRE::ublasDCsr.value_data()[0];
int* rows = &CSRE::ublasDCsr.index1_data()[0];
int* cols = &CSRE::ublasDCsr.index2_data()[0];
for (int row = 0; row < CSRE::n_rows; row++)
{
// Summation done using a compensated summation to decrease
// summation errors from rounding. This allows us to get
// smaller errors without requiring quad precision support.
// This method is like performing summation at quad precision and
// casting down to double in the end.
hY[row] *= hBeta;
int row_end = rows[row+1];
double temp_sum;
temp_sum = hY[row];
T sumk_err = 0.;
for (int i = rows[row]; i < rows[row+1]; i++)
{
// Perform: hY[row] += hAlpha * vals[i] * hX[cols[i]];
temp_sum = two_sum(temp_sum, hAlpha*vals[i]*hX[cols[i]], &sumk_err);
}
hY[row] = temp_sum + sumk_err;
}
T* host_result = (T*) ::clEnqueueMapBuffer(CLSE::queue, gY.values,
CL_TRUE, CL_MAP_READ,
0, gY.num_values * sizeof(T),
0, nullptr, nullptr, &cl_status);
ASSERT_EQ(CL_SUCCESS, cl_status);
uint64_t max_ulps = 0;
uint64_t min_ulps = ULLONG_MAX;
uint64_t total_ulps = 0;
for (int i = 0; i < hY.size(); i++)
{
long long int intDiff = (long long int)boost::math::float_distance(hY[i], host_result[i]);
intDiff = llabs(intDiff);
total_ulps += intDiff;
if (max_ulps < intDiff)
max_ulps = intDiff;
if (min_ulps > intDiff)
min_ulps = intDiff;
// Debug printouts.
//std::cout << "Row " << i << " Double Ulps: " << intDiff << std::endl;
//std::cout.precision(17);
//std::cout << "\tDouble hY[" << i << "] = " << std::scientific << hY[i] << " (0x" << std::hex << *(uint64_t *)&hY[i] << "), " << std::dec;
//std::cout << "host_result[" << i << "] = " << std::scientific << host_result[i] << " (0x" << std::hex << *(uint64_t *)&host_result[i] << ")" << std::dec << std::endl;
}
if (extended_precision)
{
#ifndef NDEBUG
std::cout << "Double Min ulps: " << min_ulps << std::endl;
std::cout << "Double Max ulps: " << max_ulps << std::endl;
std::cout << "Double Total ulps: " << total_ulps << std::endl;
std::cout << "Double Average ulps: " << (double)total_ulps/(double)hY.size() << " (Size: " << hY.size() << ")" << std::endl;
#endif
for (int i = 0; i < hY.size(); i++)
{
double compare_val = fabs(hY[i]*1e-14);
if (compare_val < 10*DBL_EPSILON)
compare_val = 10*DBL_EPSILON;
ASSERT_NEAR(hY[i], host_result[i], compare_val);
}
}
else
{
for (int i = 0; i < hY.size(); i++)
{
double compare_val = 0.;
if (boost::math::isnormal(hY[i]))
compare_val = fabs(hY[i]*0.1);
if (compare_val < 10*DBL_EPSILON)
compare_val = 10*DBL_EPSILON;
ASSERT_NEAR(hY[i], host_result[i], compare_val);
}
}
cl_status = ::clEnqueueUnmapMemObject(CLSE::queue, gY.values,
host_result, 0, nullptr, nullptr);
ASSERT_EQ(CL_SUCCESS, cl_status);
}
// Reset output buffer for next test.
::clReleaseMemObject(gY.values);
clsparseInitVector(&gY);
gY.values = clCreateBuffer(CLSE::context,
CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
hY.size() * sizeof(T), hY.data().begin(),
&cl_status);
gY.num_values = hY.size();
ASSERT_EQ(CL_SUCCESS, cl_status);
}
开发者ID:kvaragan,项目名称:clSPARSE,代码行数:101,代码来源:test-blas2.cpp
示例10: vector_sum
inline void vector_sum(const int arraySize,
const double* inputA,
const double* inputB,
double* output)
{
/* Allocate memory buffers */
/*
* Ask the OpenCL implementation to allocate buffers for the data.
* We ask the OpenCL implemenation to allocate memory rather than
* allocating it on the CPU to avoid having to copy the data later.
* The read/write flags relate to accesses to the memory from within
* the kernel.
*/
bool createMemoryObjectSuccess = true;
int numberOfMemoryObjects = 3;
cl_mem memoryObjects[3] = {0, 0, 0};
int errorNumber = 0;
int bufferSize = arraySize*sizeof(double);
memoryObjects[0] = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
bufferSize, (void*)inputA, &errorNumber);
checkErr(errorNumber, "Failed to create buffer, 1.");
memoryObjects[1] = clCreateBuffer(context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
bufferSize, (void*)inputB, &errorNumber);
checkErr(errorNumber, "Failed to create buffer, 2.");
memoryObjects[2] = clCreateBuffer(context,
CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
bufferSize, output, &errorNumber);
checkErr(errorNumber, "Failed to create buffer, 3.");
/* Enqueue commands and kernels */
/* Enqueue to the command queues the commands that control the sequence
* and synchronization of kernel execution, reading and writing of data,
* and manipulation of memory objects
*/
/* Execute a kernel function */
/* Call clSetKernelArg() for each parameter in the kernel */
bool setKernelArgumentsSuccess = true;
setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 0,
sizeof(cl_mem), &memoryObjects[0]));
setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 1,
sizeof(cl_mem), &memoryObjects[1]));
setKernelArgumentsSuccess &= checkSuccess(clSetKernelArg(kernel, 2,
sizeof(cl_mem), &memoryObjects[2]));
if (not setKernelArgumentsSuccess) {
cleanUpOpenCL();
std::cerr << "Failed setting OpenCL kernel arguments. " << __FILE__
<< ":"<< __LINE__ << std::endl;
exit(1);
}
/* Determine the work-group size and index space for the kernel */
const size_t globalWorkSize[1] = {arraySize};
const size_t localWorkSize[1] = { 1 };
/* Enqueue the kernel for execution in the command queue */
//for (int j = 0; j < ITER; j++) {
if (not checkSuccess(clEnqueueNDRangeKernel(commandQueue, kernel, 1,
NULL, globalWorkSize, localWorkSize, 0, NULL, NULL))) {
cleanUpOpenCL();
std::cerr << "Failed enqueuing the kernel. " << __FILE__ << ":"
<< __LINE__ <<std::endl;
exit(1);
}
//}
/* Get a pointer to the output data */
output = (double*)clEnqueueMapBuffer(commandQueue,
memoryObjects[2], CL_TRUE, CL_MAP_READ, 0,
arraySize, 0, NULL, NULL, &errorNumber);
if (not checkSuccess(errorNumber)) {
cleanUpOpenCL();
std::cerr << "Failed to map buffer " << __FILE__ << ":"
<< __LINE__ << std::endl;
exit(1);
}
/* Wait for kernel execution */
if (not checkSuccess(clFinish(commandQueue))) {
cleanUpOpenCL();
std::cerr << "Failed waiting for kernel execution to finish. "
<< __FILE__ << ":"<< __LINE__ << std::endl;
exit(1);
}
/* Unmap the memory objects as we finished using them in the CPU */
if (not checkSuccess(clReleaseMemObject(memoryObjects[0]))) {
//.........这里部分代码省略.........
开发者ID:bagustris,项目名称:LowPowerSupercomputer,代码行数:101,代码来源:es2.cpp
示例11: main
int main() {
// START:context
cl_platform_id platform;
clGetPlatformIDs(1, &platform, NULL);
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
// END:context
// START:queue
cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL);
// END:queue
// START:kernel
char* source = read_source("multiply_arrays.cl");
cl_program program = clCreateProgramWithSource(context, 1,
(const char**)&source, NULL, NULL);
free(source);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL);
// END:kernel
// START:buffers
cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS];
random_fill(a, NUM_ELEMENTS);
random_fill(b, NUM_ELEMENTS);
cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_float) * NUM_ELEMENTS, a, NULL);
cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
sizeof(cl_float) * NUM_ELEMENTS, b, NULL);
cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL);
// END:buffers
// START:execute
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);
size_t work_units = NUM_ELEMENTS;
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units, NULL, 0, NULL, NULL);
// END:execute
// START:results
cl_float results[NUM_ELEMENTS];
clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS,
results, 0, NULL, NULL);
// END:results
// START:cleanup
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
// END:cleanup
for (int i = 0; i < NUM_ELEMENTS; ++i) {
printf("%f * %f = %f\n", a[i], b[i], results[i]);
}
return 0;
}
开发者ID:kaizenoh,项目名称:book-source,代码行数:67,代码来源:multiply_arrays.c
示例12: init_kernel_platform
int32_t init_kernel_platform() {
cl_uint plat_num;
cl_platform_id plat_id = NULL;
cl_uint dev_num = 0;
cl_device_id *devices;
ret = clGetPlatformIDs(0, NULL, &plat_num);
if (ret < 0) {
LOGD("MU1 Error: Getting plat_ids!\n");
return -1;
}
if(plat_num > 0)
{
cl_platform_id* plat_ids = (cl_platform_id* )malloc(plat_num* sizeof(cl_platform_id));
ret = clGetPlatformIDs(plat_num, plat_ids, NULL);
plat_id = plat_ids[0];
free(plat_ids);
}
ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, 0, NULL, &dev_num);
if (dev_num == 0) {
LOGD("MU1: No GPU device available.\n");
LOGD("MU1: Choose CPU as default device.\n");
ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, 0, NULL, &dev_num);
devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id));
ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_CPU, dev_num, devices, NULL);
} else {
LOGD("MU1: Choose GPU as default device. dev_num %d\n", dev_num);
devices = (cl_device_id*)malloc(dev_num * sizeof(cl_device_id));
ret = clGetDeviceIDs(plat_id, CL_DEVICE_TYPE_GPU, dev_num, devices, NULL);
}
context = clCreateContext(NULL,1, devices,NULL,NULL,NULL);
commandQueue = clCreateCommandQueue(context, devices[0], 0, NULL);
char filename[] = "/data/mu1_kernel.cl";
char file_context[10*1024]={0};
const char *source = &file_context[0];
ret = read_cl(filename, &file_context[0]);
size_t sourceSize[10] = {strlen(source)};
cl_program program = clCreateProgramWithSource(context, 1, &source, &sourceSize[0], NULL);
ret = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
if(ret < 0) {
LOGD("MU1 Error: clBuildProgram error\n");
return 0;
}
kernel = clCreateKernel(program, "process_iq", NULL);
inputBuffer_i = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
512*1024*4, (void *)(&table_i[0][0]), NULL);
inputBuffer_q = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
512*1024*4, (void *)(&table_q[0][0]), NULL);
inputBuffer_o = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
512*1024*4, (void *)(&table_o[0][0]), NULL);
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer_i);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputBuffer_q);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&inputBuffer_o);
if(devices != NULL) { free(devices);}
LOGD("MU1: init cl plat success");
return 0;
}
开发者ID:chengyake,项目名称:karch,代码行数:74,代码来源:opencl_mu1.c
示例13: main
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
cl_platform_id cpPlatform; //OpenCL platform
cl_device_id cdDevice; //OpenCL device
cl_context cxGPUContext; //OpenCL context
cl_command_queue cqCommandQueue; //OpenCL command que
cl_mem d_Input, d_Output; //OpenCL memory buffer objects
cl_int ciErrNum;
float *h_Input, *h_OutputCPU, *h_OutputGPU;
const uint
imageW = 2048,
imageH = 2048,
stride = 2048;
const int dir = DCT_FORWARD;
shrQAStart(argc, argv);
// set logfile name and start logs
shrSetLogFileName ("oclDCT8x8.txt");
shrLog("%s Starting...\n\n", argv[0]);
shrLog("Allocating and initializing host memory...\n");
h_Input = (float *)malloc(imageH * stride * sizeof(float));
h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float));
h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float));
srand(2009);
for(uint i = 0; i < imageH; i++)
for(uint j = 0; j < imageW; j++)
h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX;
shrLog("Initializing OpenCL...\n");
//Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckError(ciErrNum, CL_SUCCESS);
//Get a GPU device
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create the context
cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Initializing OpenCL DCT 8x8...\n");
initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv);
shrLog("Creating OpenCL memory objects...\n");
d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride * sizeof(cl_float), h_Input, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW);
//Just a single iteration or a warmup iteration
DCT8x8(
cqCommandQueue,
d_Output,
d_Input,
stride,
imageH,
imageW,
dir
);
#define GPU_PROFILING 1
#ifdef GPU_PROFILING
const int numIterations = 16;
cl_event startMark, endMark;
ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
ciErrNum |= clFinish(cqCommandQueue);
shrCheckError(ciErrNum, CL_SUCCESS);
shrDeltaT(0);
for(int iter = 0; iter < numIterations; iter++)
DCT8x8(
NULL,
d_Output,
d_Input,
stride,
imageH,
imageW,
dir
);
ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark);
ciErrNum |= clFinish(cqCommandQueue);
shrCheckError(ciErrNum, CL_SUCCESS);
//Calculate performance metrics by wallclock time
//.........这里部分代码省略.........
开发者ID:NatTuck,项目名称:cakemark,代码行数:101,代码来源:main.cpp
示例14: main
//.........这里部分代码省略.........
context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err));
cl_command_queue queue;
queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err));
cl_kernel kernel = 0;
cl_mem memObjects[2] = {0,0};
// Create OpenCL program - first attempt to load cached binary.
// If that is not available, then create the program from source
// and store the binary for future use.
std::cout << "Attempting to create program from binary..." << std::endl;
cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin");
if (program == NULL)
{
std::cout << "Binary not loaded, create from source..." << std::endl;
program = CreateProgram(context, devices[1], "kernel.cl");
if (program == NULL)
{
Cleanup(context, queue, program, kernel, memObjects);
return 1;
}
std::cout << "Save program binary for future run..." << std::endl;
if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false)
{
std::cerr << "Failed to write program binary" << std::endl;
Cleanup(context, queue, program, kernel, memObjects);
return 1;
}
}
else
{
std::cout << "Read program from binary." << std::endl;
}
printf("attempting to create input buffer\n");
fflush(stdout);
cl_mem input_buffer;
input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*NUM_DATA, NULL, &_err));
printf("attempting to create output buffer\n");
fflush(stdout);
cl_mem output_buffer;
output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double)*NUM_DATA, NULL, &_err));
memObjects[0] = input_buffer;
memObjects[1] = output_buffer;
double factor = ((double)rand()/(double)(RAND_MAX)) * 100.0;;
printf("attempting to create kernel\n");
fflush(stdout);
kernel = CL_CHECK_ERR(clCreateKernel(program, "daxpy", &_err));
printf("setting up kernel args cl_mem:%lx \n",input_buffer);
fflush(stdout);
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));
printf("attempting to enqueue write buffer\n");
fflush(stdout);
for (int i=0; i<NUM_DATA; i++) {
double in = ((double)rand()/(double)(RAND_MAX)) * 100.0;;
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(double), 8, &in, 0, NULL, NULL));
}
cl_event kernel_completion;
size_t global_work_size[1] = { NUM_DATA };
printf("attempting to enqueue kernel\n");
fflush(stdout);
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion));
printf("Enqueue'd kerenel\n");
fflush(stdout);
cl_ulong time_start, time_end;
CL_CHECK(clWaitForEvents(1, &kernel_completion));
CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL));
CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL));
double elapsed = time_end - time_start;
printf("time(ns):%lg\n",elapsed);
CL_CHECK(clReleaseEvent(kernel_completion));
printf("Result:");
for (int i=0; i<NUM_DATA; i++) {
double data;
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(double), 8, &data, 0, NULL, NULL));
//printf(" %lg", data);
}
printf("\n");
CL_CHECK(clReleaseMemObject(memObjects[0]));
CL_CHECK(clReleaseMemObject(memObjects[1]));
CL_CHECK(clReleaseKernel(kernel));
CL_CHECK(clReleaseProgram(program));
CL_CHECK(clReleaseContext(context));
return 0;
}
开发者ID:JamesLinus,项目名称:opencl-kernels,代码行数:101,代码来源:daxpy.c
示例15: setup_buffer
//.........这里部分代码省略.........
}
else if (lda < K)
{
std::cerr << "lda:wrong size\n";
exit(1);
}
else
{
buffer_.lda_ = lda;
}
}
else
{
buffer_.a_num_vectors_ = K;
if (transA_option == 1)
{
buffer_.trans_a_ = clblasTrans;
}
else if (transA_option == 2)
{
buffer_.trans_a_ = clblasConjTrans;
|
请发表评论