本文整理汇总了C++中clFinish函数的典型用法代码示例。如果您正苦于以下问题：C++ clFinish函数的具体用法？C++ clFinish怎么用？C++ clFinish使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了clFinish函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。
示例1: main
//.........这里部分代码省略.........
break;
dev -= devs[platform];
}
workSize[i] = (i < sizeMod) ? sizePerGPU+1 : sizePerGPU;
check2(d_A[i] = clCreateBuffer(ctx[platform], CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WA * sizeof(TYPE), &A_data[workOffset[i] * WA], &err));
check2(d_C[i] = clCreateBuffer(ctx[platform], CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, workSize[i] * WC * sizeof(TYPE), &C_data[workOffset[i] * WC], &err));
check(clSetKernelArg(multiplicationKernel[platform], 0, sizeof(cl_int), &workSize[i]));
check(clSetKernelArg(multiplicationKernel[platform], 1, sizeof(cl_int), &workSize[i]));
check(clSetKernelArg(multiplicationKernel[platform], 2, sizeof(cl_int), &workSize[i]));
check(clSetKernelArg(multiplicationKernel[platform], 3, sizeof(cl_mem), (void *) &d_A[i]));
check(clSetKernelArg(multiplicationKernel[platform], 4, sizeof(cl_mem), (void *) &d_B[d]));
check(clSetKernelArg(multiplicationKernel[platform], 5, sizeof(cl_mem), (void *) &d_C[i]));
size_t globalWorkSize[] = {roundUp(BLOCK_SIZE,WC), roundUp(BLOCK_SIZE,workSize[i])};
check(clEnqueueNDRangeKernel(commandQueue[platform][dev], multiplicationKernel[platform], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &GPUExecution[i]));
// Non-blocking copy of result from device to host
cqs[i] = commandQueue[platform][dev];
check2(ptrs[i] = clEnqueueMapBuffer(cqs[i], d_C[i], CL_FALSE, CL_MAP_READ, 0, WC * sizeof(TYPE) * workSize[i], 1, &GPUExecution[i], &GPUDone[i], &err));
if(i+1 < BLOCKS)
workOffset[i + 1] = workOffset[i] + workSize[i];
}
// CPU sync with GPU
for (p=0; p<platform_count;p++) {
cl_uint dev;
for (dev=0; dev<devs[p]; dev++) {
clFinish(commandQueue[p][dev]);
}
}
gettimeofday(&end, NULL);
double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
double dSeconds = timing/1000/1000;
double dNumOps = 2.0 * (double)WA * (double)HA * (double)WB;
double gflops = 1.0e-9 * dNumOps/dSeconds;
printf("Throughput = %.4f GFlops/s, Time = %.5f s, Size = %.0f, NumDevsUsed = %d, Blocks = %ld, Workgroup = %zu\n",
gflops, dSeconds, dNumOps, device_count, BLOCKS, localWorkSize[0] * localWorkSize[1]);
// compute reference solution
if (check) {
printf("Comparing results with CPU computation... ");
TYPE* reference = (TYPE*)malloc(C_mem_size);
computeReference(reference, A_data, B_data, HA, WA, WB);
// check result
int res = shrCompareL2fe(reference, C_data, C_size, 1.0e-6f);
if (res == 0) {
printf("\n\n");
printDiff(reference, C_data, WC, HC, 100, 1.0e-5f);
}
else printf("PASSED\n\n");
free(reference);
}
for(i = 0; i < BLOCKS; i++)
{
clEnqueueUnmapMemObject(cqs[i], d_C[i], ptrs[i], 0, NULL, NULL);
开发者ID:joao-lima,项目名称:starpu-1.2.0rc2,代码行数:67,代码来源:matmul.c
示例2: runProgram
//.........这里部分代码省略.........
kernel[0] = clCreateKernel(program, "kernel_a", &err);
OCL_CHECK(err);
// memory on device
cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL);
cl_mem Aout_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL);
// copy data to device
err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL , &event[0]);
OCL_CHECK(err);
size_t localsize[2];
size_t globalsize[2];
localsize[0] = 16;
localsize[1] = 16;
globalsize[0] = N;
globalsize[1] = N;
err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d);
if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d);
if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);}
err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL);
OCL_CHECK(err);
clFinish(queue);
// read device data back to host
clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]);
err = clWaitForEvents(1,&event[1]);
OCL_CHECK(err);
err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL);
OCL_CHECK(err);
err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL);
OCL_CHECK(err);
gpuTime = (double)(gend -gstart)/1000000000.0;
//check_1d_f(sum, blks+1);
#ifdef DEBUG
puts("Output");
check_2d_f(Aout,N,N);
#endif
printf("oclTime = %lf (s)\n", gpuTime );
// free
clReleaseMemObject(A_d);
clReleaseMemObject(Aout_d);
// // check
开发者ID:Anmol-007,项目名称:oclKernels,代码行数:67,代码来源:template.c
示例3: runSummarization
static cl_int runSummarization(CLInfo* ci,
SeparationCLMem* cm,
const IntegralArea* ia,
cl_uint which,
Kahan* resultOut)
{
cl_int err = CL_SUCCESS;
cl_mem buf;
cl_uint offset;
size_t global[1];
size_t local[1];
real result[2] = { -1.0, -1.0 };
cl_uint nElements = ia->r_steps * ia->mu_steps;
cl_mem sumBufs[2] = { cm->summarizationBufs[0], cm->summarizationBufs[1] };
if (which == 0)
{
buf = cm->outBg;
offset = 0;
}
else
{
buf = cm->outStreams;
offset = (which - 1) * nElements;
}
/* First call reads from an offset into one of the output buffers */
err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]);
err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &buf);
err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements);
err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset);
if (err != CL_SUCCESS)
{
mwPerrorCL(err, "Error setting summarization kernel arguments");
return err;
}
local[0] = _summarizationWorkgroupSize;
global[0] = mwNextMultiple(local[0], nElements);
err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1,
NULL, global, local,
0, NULL, NULL);
if (err != CL_SUCCESS)
{
mwPerrorCL(err, "Error enqueuing summarization kernel");
return err;
}
/* Why is this necessary? It seems to frequently break on the 7970 and nowhere else without it */
err = clFinish(ci->queue);
//err = clFlush(ci->queue);
if (err != CL_SUCCESS)
{
mwPerrorCL(err, "Error finishing summarization kernel");
return err;
}
/* Later calls swap between summarization buffers without an offset */
nElements = (cl_uint) mwDivRoundup(global[0], local[0]);
offset = 0;
err |= clSetKernelArg(_summarizationKernel, 3, sizeof(cl_uint), &offset);
if (err != CL_SUCCESS)
{
mwPerrorCL(err, "Error setting summarization kernel offset argument");
return err;
}
while (nElements > 1)
{
/* Swap old summarization buffer to the input and shrink the range */
swapBuffers(sumBufs);
global[0] = mwNextMultiple(local[0], nElements);
err |= clSetKernelArg(_summarizationKernel, 0, sizeof(cl_mem), &sumBufs[0]);
err |= clSetKernelArg(_summarizationKernel, 1, sizeof(cl_mem), &sumBufs[1]);
err |= clSetKernelArg(_summarizationKernel, 2, sizeof(cl_uint), &nElements);
if (err != CL_SUCCESS)
{
mwPerrorCL(err, "Error setting summarization kernel arguments");
return err;
}
/*
err = clEnqueueBarrier(ci->queue);
if (err != CL_SUCCESS)
{
mwPerrorCL(err, "Error enqueuing summarization barrier");
return err;
}
*/
err = clEnqueueNDRangeKernel(ci->queue, _summarizationKernel, 1,
NULL, global, local,
0, NULL, NULL);
if (err != CL_SUCCESS)
{
mwPerrorCL(err, "Error enqueuing summarization kernel");
//.........这里部分代码省略.........
开发者ID:LocutusOfBorg,项目名称:milkywayathome_client,代码行数:101,代码来源:run_cl.c
示例4: main
//.........这里部分代码省略.........
delete[] kernelSource;
// Create the compute kernel in the program we wish to run
kernel_compute_flux = clCreateKernel(program, "compute_flux", &err);
CHKERR(err, "Failed to create a compute kernel!");
// Create the reduce kernel in the program we wish to run
kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err);
CHKERR(err, "Failed to create a compute_flux_contributions kernel!");
// Create the reduce kernel in the program we wish to run
kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err);
CHKERR(err, "Failed to create a compute_step_factor kernel!");
// Create the reduce kernel in the program we wish to run
kernel_time_step = clCreateKernel(program, "time_step", &err);
CHKERR(err, "Failed to create a time_step kernel!");
// Create the reduce kernel in the program we wish to run
kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err);
CHKERR(err, "Failed to create a initialize_variables kernel!");
// Create arrays and set initial conditions
cl_mem variables = alloc<cl_float>(context, nelr*NVAR);
err = 0;
err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables);
err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
CHKERR(err, "Failed to set kernel arguments!");
// Get the maximum work group size for executing the kernel on the device
//err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
local_size = 1;//std::min(local_size, (size_t)nelr);
global_size = nelr;
err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
err = clFinish(commands);
START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
END_TIMER(ocdTempTimer)
CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0");
cl_mem old_variables = alloc<float>(context, nelr*NVAR);
cl_mem fluxes = alloc<float>(context, nelr*NVAR);
cl_mem step_factors = alloc<float>(context, nelr);
clFinish(commands);
cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM);
cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM);
cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM);
cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM);
clFinish(commands);
// make sure all memory is floatly allocated before we start timing
err = 0;
err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables);
err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable);
CHKERR(err, "Failed to set kernel arguments!");
// Get the maximum work group size for executing the kernel on the device
err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL);
CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!");
err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent);
clFinish(commands);
START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer)
END_TIMER(ocdTempTimer)
CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1");
err = 0;
err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr);
err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes);
开发者ID:CharudattaSChitale,项目名称:OpenDwarfs,代码行数:67,代码来源:cfd.cpp
示例5: finish
// Waits for the work queued on `commands` by calling clFinish on it and
// hands the resulting OpenCL status code back to the caller unchanged.
int finish()
{
   const int status = clFinish(commands);
   return status;
}
开发者ID:gitter-badger,项目名称:OpenPV,代码行数:1,代码来源:CLKernel.hpp
示例6: ops_par_loop_advec_mom_kernel_mass_flux_z
//.........这里部分代码省略.........
buildOpenCLKernels_advec_mom_kernel_mass_flux_z(xdim0, ydim0, xdim1, ydim1);
// set up OpenCL thread blocks
size_t globalWorkSize[3] = {
((x_size - 1) / OPS_block_size_x + 1) * OPS_block_size_x,
((y_size - 1) / OPS_block_size_y + 1) * OPS_block_size_y,
((z_size - 1) / OPS_block_size_z + 1) * OPS_block_size_z};
size_t localWorkSize[3] = {OPS_block_size_x, OPS_block_size_y,
OPS_block_size_z};
// set up initial pointers
int d_m[OPS_MAX_DIM];
#ifdef OPS_MPI
for (int d = 0; d < dim; d++)
d_m[d] =
args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d];
#else
for (int d = 0; d < dim; d++)
d_m[d] = args[0].dat->d_m[d];
#endif
int base0 = 1 * 1 * (start[0] * args[0].stencil->stride[0] -
args[0].dat->base[0] - d_m[0]);
base0 = base0 +
args[0].dat->size[0] * 1 * (start[1] * args[0].stencil->stride[1] -
args[0].dat->base[1] - d_m[1]);
base0 = base0 +
args[0].dat->size[0] * 1 * args[0].dat->size[1] * 1 *
(start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] -
d_m[2]);
#ifdef OPS_MPI
for (int d = 0; d < dim; d++)
d_m[d] =
args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d];
#else
for (int d = 0; d < dim; d++)
d_m[d] = args[1].dat->d_m[d];
#endif
int base1 = 1 * 1 * (start[0] * args[1].stencil->stride[0] -
args[1].dat->base[0] - d_m[0]);
base1 = base1 +
args[1].dat->size[0] * 1 * (start[1] * args[1].stencil->stride[1] -
args[1].dat->base[1] - d_m[1]);
base1 = base1 +
args[1].dat->size[0] * 1 * args[1].dat->size[1] * 1 *
(start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] -
d_m[2]);
ops_H_D_exchanges_device(args, 2);
ops_halo_exchanges(args, 2, range);
ops_H_D_exchanges_device(args, 2);
if (OPS_diags > 1) {
ops_timers_core(&c2, &t2);
OPS_kernels[134].mpi_time += t2 - t1;
}
if (globalWorkSize[0] > 0 && globalWorkSize[1] > 0 && globalWorkSize[2] > 0) {
clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 0, sizeof(cl_mem),
(void *)&arg0.data_d));
clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 1, sizeof(cl_mem),
(void *)&arg1.data_d));
clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 2, sizeof(cl_int),
(void *)&base0));
clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 3, sizeof(cl_int),
(void *)&base1));
clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 4, sizeof(cl_int),
(void *)&x_size));
clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 5, sizeof(cl_int),
(void *)&y_size));
clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[134], 6, sizeof(cl_int),
(void *)&z_size));
// call/enque opencl kernel wrapper function
clSafeCall(clEnqueueNDRangeKernel(
OPS_opencl_core.command_queue, OPS_opencl_core.kernel[134], 3, NULL,
globalWorkSize, localWorkSize, 0, NULL, NULL));
}
if (OPS_diags > 1) {
clSafeCall(clFinish(OPS_opencl_core.command_queue));
}
if (OPS_diags > 1) {
ops_timers_core(&c1, &t1);
OPS_kernels[134].time += t1 - t2;
}
ops_set_dirtybit_device(args, 2);
ops_set_halo_dirtybit3(&args[0], range);
if (OPS_diags > 1) {
// Update kernel record
ops_timers_core(&c2, &t2);
OPS_kernels[134].mpi_time += t2 - t1;
OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg0);
OPS_kernels[134].transfer += ops_compute_transfer(dim, start, end, &arg1);
}
}
开发者ID:gihanmudalige,项目名称:OPS,代码行数:101,代码来源:advec_mom_kernel_mass_flux_z_opencl_kernel.cpp
示例7: ocl_call
//.........这里部分代码省略.........
case INTSXP:
ptr = INTEGER(arg);
al = sizeof(int);
break;
case LGLSXP:
ptr = LOGICAL(arg);
al = sizeof(int);
break;
case RAWSXP:
if (inherits(arg, "clFloat")) {
ptr = RAW(arg);
ndiv = al = sizeof(float);
break;
}
default:
Rf_error("only numeric or logical kernel arguments are supported");
/* no-ops but needed to make the compiler happy */
ptr = 0;
al = 0;
}
n = LENGTH(arg);
if (ndiv != 1) n /= ndiv;
if (n == 1) {/* scalar */
if ((last_ocl_error = clSetKernelArg(kernel, an++, al, ptr)) != CL_SUCCESS)
Rf_error("Failed to set scalar kernel argument %d (size=%d, error code %d)", an, al, last_ocl_error);
} else {
cl_mem input = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, al * n, ptr, &last_ocl_error);
if (!input)
Rf_error("Unable to create buffer (%d elements, %d bytes each) for vector argument %d (oclError %d)", n, al, an, last_ocl_error);
if (!occ->mem_objects)
occ->mem_objects = arg_alloc(0, 32);
arg_add(occ->mem_objects, input);
#if 0 /* we used this before CL_MEM_USE_HOST_PTR */
if ((last_ocl_error = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, al * n, ptr, 0, NULL, NULL)) != CL_SUCCESS)
Rf_error("Failed to transfer data (%d elements) for vector argument %d (oclError %d)", n, an, last_ocl_error);
#endif
if ((last_ocl_error = clSetKernelArg(kernel, an++, sizeof(cl_mem), &input)) != CL_SUCCESS)
Rf_error("Failed to set vector kernel argument %d (size=%d, length=%d, error %d)", an, al, n, last_ocl_error);
/* clReleaseMemObject(input); */
}
args = CDR(args);
}
if ((last_ocl_error = clEnqueueNDRangeKernel(commands, kernel, wdim, NULL, wdims, NULL, 0, NULL, async ? &occ->event : NULL)) != CL_SUCCESS)
ocl_err("Kernel execution");
if (async) { /* asynchronous call -> get out and return the context */
#if USE_OCL_COMPLETE_CALLBACK
last_ocl_error = clSetEventCallback(occ->event, CL_COMPLETE, ocl_complete_callback, occ);
#endif
clFlush(commands); /* the specs don't guarantee execution unless clFlush is called */
occ->ftres = ftres;
occ->ftype = ftype;
occ->on = on;
Rf_setAttrib(octx, R_ClassSymbol, mkString("clCallContext"));
UNPROTECT(1);
return octx;
}
clFinish(commands);
occ->finished = 1;
/* we can release input memory objects now */
if (occ->mem_objects) {
arg_free(occ->mem_objects, (afin_t) clReleaseMemObject);
occ->mem_objects = 0;
}
if (float_args) {
arg_free(float_args, 0);
float_args = occ->float_args = 0;
}
res = ftres ? Rf_allocVector(RAWSXP, on * sizeof(float)) : Rf_allocVector(REALSXP, on);
if (ftype == FT_SINGLE) {
if (ftres) {
if ((last_ocl_error = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * on, RAW(res), 0, NULL, NULL )) != CL_SUCCESS)
Rf_error("Unable to transfer result vector (%d float elements, oclError %d)", on, last_ocl_error);
PROTECT(res);
Rf_setAttrib(res, R_ClassSymbol, mkString("clFloat"));
UNPROTECT(1);
} else {
/* float - need a temporary buffer */
float *fr = (float*) malloc(sizeof(float) * on);
double *r = REAL(res);
int i;
if (!fr)
Rf_error("unable to allocate memory for temporary single-precision output buffer");
occ->float_out = fr;
if ((last_ocl_error = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * on, fr, 0, NULL, NULL )) != CL_SUCCESS)
Rf_error("Unable to transfer result vector (%d float elements, oclError %d)", on, last_ocl_error);
for (i = 0; i < on; i++)
r[i] = fr[i];
}
} else if ((last_ocl_error = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(double) * on, REAL(res), 0, NULL, NULL )) != CL_SUCCESS)
Rf_error("Unable to transfer result vector (%d double elements, oclError %d)", on, last_ocl_error);
ocl_call_context_fin(octx);
UNPROTECT(1);
return res;
}
开发者ID:mprymek,项目名称:OpenCL,代码行数:101,代码来源:ocl.c
示例8: gws_test
//Do the proper test using different sizes.
static cl_ulong gws_test(size_t num, struct fmt_main * self) {
cl_event myEvent;
cl_int ret_code;
cl_uint *tmpbuffer;
cl_ulong startTime, endTime, runtime;
int i, loops;
//Prepare buffers.
create_clobj(num, self);
tmpbuffer = mem_alloc(sizeof(sha512_hash) * num);
if (tmpbuffer == NULL) {
fprintf(stderr, "Malloc failure in find_best_gws\n");
exit(EXIT_FAILURE);
}
queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id],
CL_QUEUE_PROFILING_ENABLE, &ret_code);
HANDLE_CLERROR(ret_code, "Failed in clCreateCommandQueue");
// Set salt.
set_salt(get_salt("$6$saltstring$"));
salt->initial = salt->rounds - get_multiple(salt->rounds, HASH_LOOPS);
// Set keys
for (i = 0; i < num; i++) {
set_key("aaabaabaaa", i);
}
//** Get execution time **//
HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, salt_buffer, CL_FALSE, 0,
sizeof(sha512_salt), salt, 0, NULL, &myEvent),
"Failed in clEnqueueWriteBuffer");
HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
sizeof(cl_ulong), &startTime, NULL),
"Failed in clGetEventProfilingInfo I");
HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &endTime, NULL),
"Failed in clGetEventProfilingInfo II");
HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
runtime = endTime - startTime;
//** Get execution time **//
HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0,
sizeof(sha512_password) * num, plaintext, 0, NULL, &myEvent),
"Failed in clEnqueueWriteBuffer");
HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
sizeof(cl_ulong), &startTime, NULL),
"Failed in clGetEventProfilingInfo I");
HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &endTime, NULL),
"Failed in clGetEventProfilingInfo II");
HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
runtime += endTime - startTime;
//** Get execution time **//
if (gpu(source_in_use) || use_local(source_in_use)) {
ret_code = clEnqueueNDRangeKernel(queue_prof, prepare_kernel,
1, NULL, &num, &local_work_size, 0, NULL, &myEvent);
HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
sizeof(cl_ulong), &startTime, NULL),
"Failed in clGetEventProfilingInfo I");
HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &endTime, NULL),
"Failed in clGetEventProfilingInfo II");
HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
runtime += endTime - startTime;
}
loops = gpu(source_in_use) || use_local(source_in_use) ? (salt->rounds / HASH_LOOPS) : 1;
//** Get execution time **//
for (i = 0; i < loops; i++)
{
ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel,
1, NULL, &num, &local_work_size, 0, NULL, &myEvent);
HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish");
HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT,
sizeof(cl_ulong), &startTime, NULL),
"Failed in clGetEventProfilingInfo I");
HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &endTime, NULL),
"Failed in clGetEventProfilingInfo II");
HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent");
runtime += endTime - startTime;
}
//** Get execution time **//
HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, hash_buffer, CL_FALSE, 0,
sizeof(sha512_hash) * num, tmpbuffer, 0, NULL, &myEvent),
"Failed in clEnqueueReadBuffer");
//.........这里部分代码省略.........
开发者ID:bhargavz,项目名称:pac4mac,代码行数:101,代码来源:opencl_cryptsha512_fmt.c
示例9: opencl_scanhash
static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
int64_t __maybe_unused max_nonce)
{
const int thr_id = thr->id;
struct opencl_thread_data *thrdata = thr->cgpu_data;
struct cgpu_info *gpu = thr->cgpu;
_clState *clState = clStates[thr_id];
const cl_kernel *kernel = &clState->kernel;
const int dynamic_us = opt_dynamic_interval * 1000;
cl_bool blocking;
cl_int status;
size_t globalThreads[1];
size_t localThreads[1] = { clState->wsize };
unsigned int threads;
int64_t hashes;
if (gpu->dynamic)
blocking = CL_TRUE;
else
blocking = CL_FALSE;
/* This finish flushes the readbuffer set with CL_FALSE later */
if (!blocking)
clFinish(clState->commandQueue);
if (gpu->dynamic) {
struct timeval diff;
suseconds_t gpu_us;
gettimeofday(&gpu->tv_gpuend, NULL);
timersub(&gpu->tv_gpuend, &gpu->tv_gpustart, &diff);
gpu_us = diff.tv_sec * 1000000 + diff.tv_usec;
if (likely(gpu_us >= 0)) {
gpu->gpu_us_average = (gpu->gpu_us_average + gpu_us * 0.63) / 1.63;
/* Try to not let the GPU be out for longer than
* opt_dynamic_interval in ms, but increase
* intensity when the system is idle in dynamic mode */
if (gpu->gpu_us_average > dynamic_us) {
if (gpu->intensity > MIN_INTENSITY)
--gpu->intensity;
} else if (gpu->gpu_us_average < dynamic_us / 2) {
if (gpu->intensity < MAX_INTENSITY)
++gpu->intensity;
}
}
}
set_threads_hashes(clState->vwidth, &threads, &hashes, globalThreads,
localThreads[0], gpu->intensity);
if (hashes > gpu->max_hashes)
gpu->max_hashes = hashes;
status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]);
if (unlikely(status != CL_SUCCESS)) {
applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
return -1;
}
/* MAXBUFFERS entry is used as a flag to say nonces exist */
if (thrdata->res[FOUND]) {
/* Clear the buffer again */
status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0,
BUFFERSIZE, blank_res, 0, NULL, NULL);
if (unlikely(status != CL_SUCCESS)) {
applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
return -1;
}
if (unlikely(thrdata->last_work)) {
applog(LOG_DEBUG, "GPU %d found something in last work?", gpu->device_id);
postcalc_hash_async(thr, thrdata->last_work, thrdata->res);
thrdata->last_work = NULL;
} else {
applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id);
postcalc_hash_async(thr, work, thrdata->res);
}
memset(thrdata->res, 0, BUFFERSIZE);
if (!blocking)
clFinish(clState->commandQueue);
}
gettimeofday(&gpu->tv_gpustart, NULL);
if (clState->goffset) {
size_t global_work_offset[1];
global_work_offset[0] = work->blk.nonce;
status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, global_work_offset,
globalThreads, localThreads, 0, NULL, NULL);
} else
status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
globalThreads, localThreads, 0, NULL, NULL);
if (unlikely(status != CL_SUCCESS)) {
applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
return -1;
}
status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0,
BUFFERSIZE, thrdata->res, 0, NULL, NULL);
if (unlikely(status != CL_SUCCESS)) {
//.........这里部分代码省略.........
开发者ID:nushor,项目名称:cgminer,代码行数:101,代码来源:driver-opencl.c
示例10: runKernel
/**
 * Run one OpenCL pass over the buffers shared with OpenGL.
 *
 * Steps, all issued on command_queue:
 *   1. wait for OpenGL (glFinish) and acquire the two GL objects in cl_pbos;
 *   2. launch `kernel` over a cl_width x cl_height 2-D range;
 *   3. launch `kernel_max` over a 16-item 1-D range;
 *   4. do a blocking 16-byte read of `mobj` (the data itself is discarded);
 *   5. release the GL objects and wait for the queue to drain so OpenGL can
 *      use them again.
 *
 * Takes no arguments and returns nothing; every OpenCL status code is passed
 * to checkError() together with a label naming the failing call.
 */
void runKernel(void)
{
    cl_int err;
    size_t global_item_size_max = 16;
    size_t global_item_size2[] = {cl_width, cl_height};
    /*
     * Work-group shape (work-items per group in each dimension).
     * OpenCL 1.x requires each global size to be a multiple of the
     * corresponding local size.
     */
    size_t local_item_size2[] = {64, 8};
    /* Small fixed-size scratch target for the read-back; no heap needed. */
    char output[4 * 4];

    /* Make sure OpenGL is done using the shared buffers. */
    glFinish();

    /* Hand the shared GL objects over to OpenCL. */
    err = clEnqueueAcquireGLObjects(command_queue, 2, cl_pbos, 0, NULL, NULL);
    checkError("acquireGLObjects", err);
    err = clFinish(command_queue);
    checkError("clFinish after acquireGLObjects", err);

    /*
     * No completion events are requested: nothing here waits on them, and an
     * event that is returned but never released leaks.
     */
    err = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
                                 global_item_size2,
                                 local_item_size2,
                                 0, NULL, NULL);
    checkError("clEnqueueNDRangeKernel", err);

    /* Local size left NULL so the implementation picks the work-group size. */
    err = clEnqueueNDRangeKernel(command_queue, kernel_max, 1, NULL,
                                 &global_item_size_max,
                                 NULL,
                                 0, NULL, NULL);
    checkError("clEnqueueNDRangeKernel max", err);

    /*
     * Blocking transfer of the result to the host. The bytes are not used
     * afterwards; presumably the call is kept as a synchronisation point.
     */
    err = clEnqueueReadBuffer(command_queue, mobj, CL_TRUE, 0, sizeof output, output, 0, NULL, NULL);
    checkError("clEnqueueReadBuffer", err);

    /*
     * Release the shared objects so OpenGL can use them again. The status
     * must be captured here, otherwise the check below would re-test the
     * result of the read above.
     */
    err = clEnqueueReleaseGLObjects(command_queue, 2, cl_pbos, 0, NULL, NULL);
    checkError("releaseGLObjects", err);

    /* clFinish submits everything still queued and waits for completion. */
    err = clFinish(command_queue);
    checkError("clFinish", err);
}
开发者ID:meyr,项目名称:clglPlayer,代码行数:65,代码来源:algorithm.c
示例11: main
//.........这里部分代码省略.........
// Create the input and output arrays in device memory for our calculation
//
input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
timer t = createTimer();
for(int i =0;i<rep;i++){
initData(data);
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Get the maximum work group size for executing the kernel on the device
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
global = count;
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (err)
{
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clFinish(commands);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
}
double timeEnd = getTime(t);
// Validate our results
//
correct = 0;
for(int i = 0; i < arraySize; i++)
{
if(results[i] >= 0){
correct++;
if(i==0){
printf("%d",results[i]);
}else{
printf(",%d",results[i]);
}
}
}
printf("\n");
// Print a brief summary detailing the results
printf("Computed '%d/%d' values to 1!\n", correct, arraySize);
printf("TIME- %f\n",timeEnd);
// Shutdown and cleanup
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
开发者ID:TrevorDev,项目名称:CollatzConjecture-openCL,代码行数:101,代码来源:openCL-allToOne.c
示例12: Extrae_OpenCL_clCreateCommandQueue
void Extrae_OpenCL_clCreateCommandQueue (cl_command_queue queue,
cl_device_id device, cl_command_queue_properties properties)
{
if (!Extrae_OpenCL_lookForOpenCLQueue (queue, NULL))
{
cl_int err;
char _threadname[THREAD_INFO_NAME_LEN];
char _hostname[HOST_NAME_MAX];
char *_device_type;
int prev_threadid, found, idx;
cl_device_type device_type;
cl_event event;
idx = nCommandQueues;
CommandQueues = (RegisteredCommandQueue_t*) realloc (
CommandQueues,
sizeof(RegisteredCommandQueue_t)*(nCommandQueues+1));
if (CommandQueues == NULL)
{
fprintf (stderr, PACKAGE_NAME": Fatal error! Failed to allocate memory for OpenCL Command Queues\n");
exit (-1);
}
CommandQueues[idx].queue = queue;
CommandQueues[idx].isOutOfOrder =
(properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
err = clGetDeviceInfo (device, CL_DEVICE_TYPE, sizeof(device_type), &device_type, NULL);
if (err == CL_SUCCESS)
{
if (device_type == CL_DEVICE_TYPE_GPU)
_device_type = "GPU";
else if (device_type == CL_DEVICE_TYPE_CPU)
_device_type = "CPU";
else
_device_type = "Other";
}
else
_device_type = "Unknown";
/* Was the thread created before (i.e. did we executed a cudadevicereset?) */
if (gethostname(_hostname, HOST_NAME_MAX) == 0)
sprintf (_threadname, "OpenCL-%s-CQ%d-%s", _device_type, 1+idx,
_hostname);
else
sprintf (_threadname, "OpenCL-%s-CQ%d-%s", _device_type, 1+idx,
"unknown-host");
prev_threadid = Extrae_search_thread_name (_threadname, &found);
if (found)
{
/* If thread name existed, reuse its thread id */
CommandQueues[idx].threadid = prev_threadid;
}
else
{
/* For timing purposes we change num of threads here instead of doing Backend_getNumberOfThreads() + CUDAdevices*/
Backend_ChangeNumberOfThreads (Backend_getNumberOfThreads() + 1);
CommandQueues[idx].threadid = Backend_getNumberOfThreads()-1;
/* Set thread name */
Extrae_set_thread_name (CommandQueues[idx].threadid, _threadname);
}
CommandQueues[idx].nevents = 0;
#ifdef CL_VERSION_1_2
err = clEnqueueBarrierWithWaitList (queue, 0, NULL, &event);
#else
err = clEnqueueBarrier (queue);
if (err == CL_SUCCESS)
err = clEnqueueMarker (queue, &event);
#endif
CommandQueues[idx].host_reference_time = TIME;
if (err == CL_SUCCESS)
{
err = clFinish(queue);
if (err != CL_SUCCESS)
{
fprintf (stderr, PACKAGE_NAME": Error in clFinish (error = %d)! Dying...\n", err);
exit (-1);
}
err = clGetEventProfilingInfo (event, CL_PROFILING_COMMAND_SUBMIT,
sizeof(cl_ulong), &(CommandQueues[idx].device_reference_time),
NULL);
if (err != CL_SUCCESS)
{
fprintf (stderr, PACKAGE_NAME": Error in clGetEventProfilingInfo (error = %d)! Dying...\n", err);
exit (-1);
}
}
else
{
fprintf (stderr, PACKAGE_NAME": Error while looking for clock references in host & accelerator\n");
exit (-1);
}
//.........这里部分代码省略.........
开发者ID:polca-project,项目名称:polca-toolbox,代码行数:101,代码来源:opencl_common.c
示例13: main
//.........这里部分代码省略.........
exit(1);
}
// Get the preferred workgroup size multiple
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Get the preferred workgroup size
//
err = clGetKernelWorkGroupInfo(kernel, device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to retrieve kernel work group info! %d\n", err);
exit(1);
}
// Create the output array in device memory for our calculation
//
output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL);
if (!input || !output)
{
printf("Error: Failed to allocate device memory!\n");
exit(1);
}
// Write our data set into the input array in device memory
//
err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
exit(1);
}
// Set the arguments to our compute kernel
//
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
exit(1);
}
// Execute the kernel over the entire range of our 1d input data set
// using the maximum number of work group items for this device
//
// global = count;
global = local;
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if (err)
{
printf("Error: Failed to execute kernel!\n");
return EXIT_FAILURE;
}
// Wait for the command commands to get serviced before reading back results
//
clFinish(commands);
// Read back the results from the device to verify the output
//
err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL );
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
exit(1);
}
// Validate our results
//
correct = 0;
for(i = 0; i < count; i++)
{
if((results[i]) == data[i] * data[i])
correct++;
}
// Print a brief summary detailing the results
//
printf("Computed '%d/%d' correct values!\n", correct, count);
// Shutdown and cleanup
//
clReleaseMemObject(input);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
}
开发者ID:InternetofAwesome,项目名称:opencl_demo,代码行数:101,代码来源:main.c
示例14: main
int main()
{
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobj = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
char string[MEM_SIZE];
FILE *fp;
char fileName[] = "./hello.cl";
char *source_str;
size_t source_size;
/* Load the source code containing the kernel*/
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
/* Get Platform and Device Info */
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
/* Create OpenCL context */
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
/* Create Command Queue */
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
/* Create Memory Buffer */
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE,MEM_SIZE * sizeof(char), NULL, &ret);
/* Create Kernel Program from the source */
program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
(const size_t *)&source_size, &ret);
/* Build Kernel Program */
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
/* Create OpenCL Kernel */
kernel = clCreateKernel(program, "hello", &ret);
/* Set OpenCL Kernel Parameters */
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj);
/* Execute OpenCL Kernel */
ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
/* Copy results from the memory buffer */
ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0,
MEM_SIZE * sizeof(char),string, 0, NULL, NULL);
/* Display Result */
puts(string);
/* Finalization */
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(memobj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(source_str);
system("Pause");
return 0;
}
开发者ID:daves14,项目名称:workspace,代码行数:80,代码来源:main.cpp
示例15: btParticlesDynamicsWorld::runIntegrateMotionKernel
void btParticlesDynamicsWorld::runIntegrateMotionKernel()
{
cl_int ciErrNum;
if(m_useCpuControls[SIMSTAGE_INTEGRATE_MOTION]->m_active)
{
// CPU version
#if 1
// read from GPU
unsigned int memSize = sizeof(btVector3) * m_numParticles;
ciErrNum = clEnqueueReadBuffer(m_cqCommandQue, m_dPos, CL_TRUE, 0, memSize, &(m_hPos[0]), 0, NU
|
请发表评论