本文整理汇总了C++中clWaitForEvents函数的典型用法代码示例。如果您正苦于以下问题:C++ clWaitForEvents函数的具体用法?C++ clWaitForEvents怎么用?C++ clWaitForEvents使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了clWaitForEvents函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。
示例1: oclLaunchKernel
/*
 * Enqueue kernel `k` on queue `q` as a 1D NDRange sized for `nbobj` items and
 * block until it has completed.
 *
 * The work-group size starts from oclGetMaxWorkSize() for the kernel on the
 * queue's device, is combined with `nbthread` through MIN(), and is finally
 * passed through oclMultiple() together with the kernel's preferred
 * work-group size multiple.
 *
 * `fname` / `line` identify the call site and are forwarded to oclCheckErrF().
 *
 * Returns the value oclChronoElaps() reports for the kernel's event.
 */
double
oclLaunchKernel(cl_kernel k, cl_command_queue q, int nbobj, int nbthread, const char *fname, const int line)
{
    cl_int status = 0;
    dim3 globalSize, localSize;
    cl_event kernelDone;
    double elapsed;
    int threads = 0;
    size_t preferredMultiple = 32;
    cl_device_id device = oclGetDeviceOfCQueue(q);

    threads = oclGetMaxWorkSize(k, device);
    threads = MIN(threads, nbthread);

    /* Ask the runtime which work-group size multiple this kernel prefers. */
    status = clGetKernelWorkGroupInfo(k, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                                      sizeof(preferredMultiple), &preferredMultiple, NULL);
    oclCheckErr(status, "clGetKernelWorkGroupInfo CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE");

    /* Honour that multiple; the original author notes an AMD 7970 crashes otherwise. */
    threads = oclMultiple(threads, preferredMultiple);

    oclMkNDrange(nbobj, threads, NDR_1D, globalSize, localSize);

    status = clEnqueueNDRangeKernel(q, k, NDR_1D, NULL, globalSize, localSize, 0, NULL, &kernelDone);
    oclCheckErrF(status, "clEnqueueNDRangeKernel", fname, line);

    /* Synchronous launch: wait for the single event before timing it. */
    status = clWaitForEvents(1, &kernelDone);
    oclCheckErrF(status, "clWaitForEvents", fname, line);

    elapsed = oclChronoElaps(kernelDone);

    status = clReleaseEvent(kernelDone);
    oclCheckErrF(status, "clReleaseEvent", fname, line);

    return elapsed;
}
开发者ID:kghoracle,项目名称:Hydro,代码行数:39,代码来源:ocltools.c
示例2: RunRoutine
// Runs the CLBlast HBMV routine on the selected back-end and blocks until the
// work has finished. Returns the status code produced by Hbmv().
static StatusCode RunRoutine(const Arguments<T> &args, Buffers<T> &buffers, Queue &queue) {
  #ifdef OPENCL_API
    // OpenCL path: Hbmv() hands back an event that we wait on and then release.
    auto raw_queue = queue();
    auto done = cl_event{};
    auto status = Hbmv(args.layout, args.triangle,
                       args.n, args.kl, args.alpha,
                       buffers.a_mat(), args.a_offset, args.a_ld,
                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                       buffers.y_vec(), args.y_offset, args.y_inc,
                       &raw_queue, &done);
    if (status == StatusCode::kSuccess) {
      clWaitForEvents(1, &done);
      clReleaseEvent(done);
    }
  #elif CUDA_API
    // CUDA path: no event is returned, so synchronise the stream instead.
    auto status = Hbmv(args.layout, args.triangle,
                       args.n, args.kl, args.alpha,
                       buffers.a_mat(), args.a_offset, args.a_ld,
                       buffers.x_vec(), args.x_offset, args.x_inc, args.beta,
                       buffers.y_vec(), args.y_offset, args.y_inc,
                       queue.GetContext()(), queue.GetDevice()());
    cuStreamSynchronize(queue());
  #endif
  return status;
}
开发者ID:gpu,项目名称:CLBlast,代码行数:23,代码来源:xhbmv.hpp
示例3: PerformanceAnalyser::analyzeEvent
// Builds a TimelineEntry from the profiling counters of `event`.
// The counters are divided by 1e9 before being stored (OpenCL reports them in
// nanoseconds, so the stored values are seconds).
PerformanceAnalyser::TimelineEntry PerformanceAnalyser::analyzeEvent(cl_event &event) {
    const double kTicksPerSecond = 1000000000.0;

    // Profiling data is only queried once the command has completed.
    clWaitForEvents(1, &event);

    TimelineEntry entry;
    cl_ulong ticks;  // scratch value reused for each query

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &ticks, NULL);
    entry.start_time = static_cast<double>(ticks) / kTicksPerSecond;

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ticks, NULL);
    entry.end_time = static_cast<double>(ticks) / kTicksPerSecond;

    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ticks, NULL);
    const double device_start = static_cast<double>(ticks) / kTicksPerSecond;

    // Derived intervals: queued -> start is overhead, start -> end is execution.
    entry.execution_time = entry.end_time - device_start;
    entry.api_overhead   = device_start - entry.start_time;
    entry.total_time     = entry.end_time - entry.start_time;

    // Whatever part of the time since m_time is not covered by the command itself.
    entry.cpu_time = (getTime() - m_time) - entry.total_time;
    return entry;
}
开发者ID:babrodtk,项目名称:ocls-core,代码行数:22,代码来源:PerformanceAnalyser.cpp
示例4: read_value
/*
 * Copies REC_N cl_int values from the device buffer d_output into h_output
 * and prints them to stdout. Terminates the process with exit(1) if the read
 * cannot be enqueued.
 *
 * The read is blocking (CL_TRUE), so h_output is fully populated when
 * clEnqueueReadBuffer() returns. The original code additionally requested an
 * event, waited on it, and never released it, which leaked one cl_event per
 * call. No event is requested any more, so there is nothing to wait on or
 * release.
 */
void read_value(){
    int err;

    err = clEnqueueReadBuffer(commands, d_output, CL_TRUE, 0,
                              REC_N * sizeof(cl_int),
                              h_output, 0, NULL, NULL);
    if (err != CL_SUCCESS)
    {
        printf("Error: Failed to read output array! %d\n", err);
        printf("Test failed\n");
        exit(1);
    }

    printf("\n[host] outputs:\n");
    for (int i = 0; i < REC_N; ++i) {
        printf("%d ", h_output[i]);
    }
    printf("\n");
}
开发者ID:ericfukuda,项目名称:sdaccel_samples,代码行数:22,代码来源:host.cpp
示例5: mat_mul_cl_clblas
/*
 * Computes C = A * B for n-by-n row-major matrices through clblasSgemm.
 *
 * A and B are uploaded with blocking writes into cache->buf_a / cache->buf_b,
 * the product is written to cache->buf_c, and the result is read back into C
 * with a blocking read. All commands go to cache->common.command_queue.
 *
 * Fixes relative to the previous version:
 *  - `event` starts as NULL, so a failed clblasSgemm() call (which leaves the
 *    event untouched) no longer makes us wait on an uninitialised handle.
 *  - the event is released after the wait; it used to leak on every call.
 */
void mat_mul_cl_clblas(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    cl_event event = NULL;
    size_t mat_sizeof;

    mat_sizeof = n * n * sizeof(F);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clblasSgemm(
        clblasRowMajor,
        clblasNoTrans,
        clblasNoTrans,
        n,
        n,
        n,
        1.0,
        cache->buf_a,
        0,
        n,
        cache->buf_b,
        0,
        n,
        0.0,
        cache->buf_c,
        0,
        n,
        1,
        &(cache->common.command_queue),
        0,
        NULL,
        &event
    );
    /* Only wait when the library actually produced an event. */
    if (event != NULL) {
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
    }
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}
开发者ID:cirosantilli,项目名称:cpp-cheat,代码行数:39,代码来源:matmul.c
示例6: mwWaitReleaseEvent
/*
 * Blocks until *ev has completed and then releases it.
 *
 * Returns CL_SUCCESS when both steps succeed. Otherwise the failing OpenCL
 * status is reported through mwPerrorCL() and returned; if the wait itself
 * fails the event is not released.
 */
cl_int mwWaitReleaseEvent(cl_event* ev)
{
    cl_int status;

    assert(ev);

    status = clWaitForEvents(1, ev);
    if (status == CL_SUCCESS)
    {
        status = clReleaseEvent(*ev);
        if (status == CL_SUCCESS)
        {
            return CL_SUCCESS;
        }

        mwPerrorCL(status, "Failed to release event");
        return status;
    }

    mwPerrorCL(status, "Failed to wait for event");
    return status;
}
开发者ID:LocutusOfBorg,项目名称:milkywayathome_client,代码行数:23,代码来源:milkyway_cl_util.c
示例7: QCLBuffer::copyTo
/*!
Copies the contents of this buffer, starting at \a offset to
\a rect within \a dest. Returns true if the copy was successful;
false otherwise.
This function will block until the request finishes.
The request is executed on the active command queue for context().
\sa copyToAsync()
*/
bool QCLBuffer::copyTo
(size_t offset, const QCLImage2D &dest, const QRect &rect)
{
const size_t dst_origin[3] = {static_cast<size_t>(rect.x()),
static_cast<size_t>(rect.y()), 0
};
const size_t region[3] = {static_cast<size_t>(rect.width()),
static_cast<size_t>(rect.height()), 1
};
cl_event event;
cl_int error = clEnqueueCopyBufferToImage
(context()->activeQueue(), memoryId(), dest.memoryId(),
offset, dst_origin, region, 0, 0, &event);
context()->reportError("QCLBuffer::copyTo(QCLImage2D):", error);
if (error == CL_SUCCESS) {
clWaitForEvents(1, &event);
clReleaseEvent(event);
return true;
} else {
return false;
}
}
开发者ID:radrad350,项目名称:QtOpenCL,代码行数:32,代码来源:qclbuffer.cpp
示例8: testScanImpl
// Test driver for scanImpl(): allocates host and device buffers for rLen ints,
// fills the input through generateRandInt(), uploads it, runs the scan,
// reads the result back, waits for the last recorded event and releases
// everything that was allocated here.
//
// rLen: number of int elements in the input and output buffers.
void testScanImpl(int rLen)
{
// Passed unchanged as the last argument of every helper call below;
// presumably selects the CPU or GPU device -- confirm against the helpers.
int _CPU_GPU=0;
// Event slots and running index shared with the cl_writebuffer / scanImpl /
// cl_readbuffer helpers (both are passed so the helpers can update them).
cl_event eventList[2];
int index=0;
cl_kernel Kernel;
int CPU_GPU;
double burden;
// NOTE(review): `result` is never read in this function.
int result=0;
int memSize=sizeof(int)*rLen;
int outSize=sizeof(int)*rLen;
void *Rin;
HOST_MALLOC(Rin, memSize);
generateRandInt((int*)Rin, rLen,rLen,0);
void *Rout;
HOST_MALLOC(Rout, outSize);
cl_mem d_Rin;
CL_MALLOC(&d_Rin, memSize);
cl_mem d_Rout;
CL_MALLOC(&d_Rout, outSize);
cl_writebuffer(d_Rin, Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
ScanPara *SP;
SP=(ScanPara*)malloc(sizeof(ScanPara));
initScan(rLen,SP);
scanImpl(d_Rin,rLen,d_Rout,&index,eventList,&Kernel,&CPU_GPU,&burden,SP,_CPU_GPU);
cl_readbuffer(Rout, d_Rout, outSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
// Wait on the slot selected by (index-1)%2 -- presumably the most recently
// recorded event. NOTE(review): if `index` were still 0 here this would
// evaluate to eventList[-1]; confirm the helpers always advance it.
clWaitForEvents(1,&eventList[(index-1)%2]);
// NOTE(review): SP was malloc'ed above and is not freed in this function;
// verify that closeScan() takes care of it.
closeScan(SP);
deschedule(CPU_GPU,burden);
//validateScan( (int*)Rin, rLen, (int*)Rout );
HOST_FREE(Rin);
HOST_FREE(Rout);
CL_FREE(d_Rin);
CL_FREE(d_Rout);
clReleaseKernel(Kernel);
clReleaseEvent(eventList[0]);
clReleaseEvent(eventList[1]);
}
开发者ID:johnspaul92,项目名称:omnidb-paralleldbonapu,代码行数:38,代码来源:testScan.cpp
示例9: acc_event_synchronize
/*
 * Blocks until the OpenCL event behind the opaque handle `event` (a pointer
 * to a cl_event) has completed.
 *
 * The OpenCL status is stored in the file-level `cl_error` and checked with
 * acc_opencl_error_check(). Returns 0 on success and -1 when that check
 * reports an error. Trace messages go to stdout when `verbose_print` is set.
 */
int acc_event_synchronize (void* event){
  /* the handle is really a pointer to the cl_event to wait for */
  cl_event *ev = (cl_event *) event;

  if (verbose_print){
    fprintf(stdout, "\n ... EVENT SYNCHRONIZATION ... \n");
    fprintf(stdout, " ---> Entering: acc_event_synchronize.\n");
  }

  /* the event must belong to the same context (see original author's note) */
  cl_error = clWaitForEvents((cl_uint) 1, ev);
  if (acc_opencl_error_check(cl_error, __LINE__)){
    return -1;
  }

  if (verbose_print){
    fprintf(stdout, " ---> Leaving: acc_event_synchronize.\n");
  }

  return 0;
}
开发者ID:rzk1,项目名称:cp2k-mcgill,代码行数:23,代码来源:acc_opencl_event.c
示例10: pclu_call_kernel
/*
 * Creates kernel `name` from pgm->program, binds `argc` arguments and
 * enqueues it on pgm->pclu->queue over `range`.
 *
 * The variadic tail must contain exactly `argc` (size_t size, void* value)
 * pairs, which are handed to clSetKernelArg() in order. Every OpenCL status
 * is passed to pclu_check_call().
 *
 * Event handling (fixes two leaks in the previous version):
 *  - With NO_CL_EVENTS defined (the default, set just below) the kernel is
 *    enqueued without requesting an event. Previously an event was requested
 *    and then never released.
 *  - Without NO_CL_EVENTS the enqueue's own event is waited on and released.
 *    The old code first created a user event that the enqueue overwrote, so
 *    that user event leaked, and the enqueue event was never released either.
 */
void
pclu_call_kernel(pclu_program* pgm, const char* name, pclu_range range, size_t argc, ...)
{
    cl_int errcode;
    cl_kernel kern = clCreateKernel(pgm->program, name, &errcode);
    pclu_check_call("clCreateKernel", errcode);

    va_list ap;
    va_start(ap, argc);

    for (cl_uint ii = 0; ii < argc; ++ii) {
        size_t size = va_arg(ap, size_t);
        void*  arg  = va_arg(ap, void*);
        pclu_check_call("clSetKernelArg", clSetKernelArg(kern, ii, size, arg));
    }

    va_end(ap);

#define NO_CL_EVENTS 1

#ifdef NO_CL_EVENTS
    /* No completion event requested: nothing to wait for, nothing to release. */
    errcode = clEnqueueNDRangeKernel(pgm->pclu->queue, kern, range.nd, 0,
                                     range.global, 0, 0, 0, 0);
    pclu_check_call("clEnqueueNDRangeKernel", errcode);
#else
    cl_event kernel_done = 0;
    errcode = clEnqueueNDRangeKernel(pgm->pclu->queue, kern, range.nd, 0,
                                     range.global, 0, 0, 0, &kernel_done);
    pclu_check_call("clEnqueueNDRangeKernel", errcode);

    pclu_check_call("clWaitForEvents", clWaitForEvents(1, &kernel_done));
    pclu_check_call("clReleaseEvent", clReleaseEvent(kernel_done));
#endif

    /* Releasing the kernel handle here only drops this function's reference. */
    pclu_check_call("clReleaseKernel", clReleaseKernel(kern));
}
开发者ID:NatTuck,项目名称:pocl-0.7x,代码行数:37,代码来源:pclu.c
示例11: CL_GroupBy
// C-linkage entry point for the group-by operator.
//
// Uploads rLen records from h_Rin, runs groupByImpl(), then reads the
// per-group start positions and the output records back to the host.
//
// h_Rin      : host input, rLen records.
// rLen       : number of records.
// h_Rout     : host output buffer; sizeof(Record)*rLen bytes are read into it.
// h_startPos : out-parameter; *h_startPos is malloc'ed here with numGroup ints
//              and filled from the device. It is not freed in this function.
// numThread, numBlock : forwarded unchanged to groupByImpl().
// _CPU_GPU   : forwarded unchanged to every helper; presumably selects the
//              device -- confirm against the helpers.
//
// Returns the group count reported by groupByImpl().
extern "C" int CL_GroupBy(Record * h_Rin, int rLen, Record* h_Rout, int** h_startPos,
int numThread, int numBlock , int _CPU_GPU)
{
cl_mem d_Rin;
cl_mem d_Rout;
// Not allocated here: its address is passed to groupByImpl(), which is
// expected to set it before it is read from and freed below.
cl_mem d_startPos;
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Event slots and running index shared with the helper calls below.
cl_event eventList[2];
int index=0;
cl_kernel Kernel;
int CPU_GPU;
double burden;
int memSize = sizeof(Record)*rLen;
CL_MALLOC( &d_Rin, memSize );
CL_MALLOC(&d_Rout, memSize );
cl_writebuffer( d_Rin, h_Rin, memSize,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
int numGroup = 0;
numGroup= groupByImpl(d_Rin, rLen, d_Rout, &d_startPos, numThread, numBlock,&index,eventList,&Kernel,&CPU_GPU,&burden,_CPU_GPU);
// NOTE(review): the malloc result is not checked before it is used.
(*h_startPos) = (int*)malloc( sizeof(int)*numGroup );
cl_readbuffer( *h_startPos, d_startPos, sizeof(int)*numGroup,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
cl_readbuffer( h_Rout, d_Rout, sizeof(Record)*rLen,&index,eventList,&CPU_GPU,&burden,_CPU_GPU);
// Wait on the slot selected by (index-1)%2 -- presumably the most recently
// recorded event; confirm the helpers always advance `index` past 0.
clWaitForEvents(1,&eventList[(index-1)%2]);
deschedule(CPU_GPU,burden);
CL_FREE( d_Rin );
CL_FREE( d_Rout );
CL_FREE( d_startPos );
clReleaseKernel(Kernel);
clReleaseEvent(eventList[0]);
clReleaseEvent(eventList[1]);
printf("CL_GroupBy\n");
return numGroup;
}
开发者ID:johnspaul92,项目名称:omnidb-paralleldbonapu,代码行数:37,代码来源:GroupBy.cpp
示例12: deathray::SingleFrameExecute
void deathray::SingleFrameExecute() {
cl_uint wait_list_length = 0;
cl_event wait_list[3];
result status;
if (temporal_radius_Y_ == 0 && h_Y_ > 0.f) {
status = g_SingleFrame_Y.CopyTo(srcpY_);
if (status != FILTER_OK) env_->ThrowError("Deathray: Copy Y to device status=%d and OpenCL status=%d", status, g_last_cl_error);
}
if (temporal_radius_UV_ == 0 && h_UV_ > 0.f) {
status = g_SingleFrame_U.CopyTo(srcpU_);
if (status != FILTER_OK) env_->ThrowError("Deathray: Copy U to device status=%d and OpenCL status=%d", status, g_last_cl_error);
status = g_SingleFrame_V.CopyTo(srcpV_);
if (status != FILTER_OK) env_->ThrowError("Deathray: Copy V to device status=%d and OpenCL status=%d", status, g_last_cl_error);
}
if (temporal_radius_Y_ == 0 && h_Y_ > 0.f) {
status = g_SingleFrame_Y.Execute();
if (status != FILTER_OK) env_->ThrowError("Deathray: Execute Y kernel status=%d and OpenCL status=%d", status, g_last_cl_error);
status = g_SingleFrame_Y.CopyFrom(dstpY_, wait_list);
if (status != FILTER_OK) env_->ThrowError("Deathray: Copy Y to host status=%d and OpenCL status=%d", status, g_last_cl_error);
++wait_list_length;
}
if (temporal_radius_UV_ == 0 && h_UV_ > 0.f) {
g_SingleFrame_U.Execute();
if (status != FILTER_OK) env_->ThrowError("Deathray: Execute U kernel status=%d and OpenCL status=%d", status, g_last_cl_error);
g_SingleFrame_U.CopyFrom(dstpU_, wait_list + wait_list_length++);
if (status != FILTER_OK) env_->ThrowError("Deathray: Copy U to host status=%d and OpenCL status=%d", status, g_last_cl_error);
g_SingleFrame_V.Execute();
if (status != FILTER_OK) env_->ThrowError("Deathray: Execute V kernel status=%d and OpenCL status=%d", status, g_last_cl_error);
g_SingleFrame_V.CopyFrom(dstpV_, wait_list + wait_list_length++);
if (status != FILTER_OK) env_->ThrowError("Deathray: Copy V to host status=%d and OpenCL status=%d", status, g_last_cl_error);
}
clWaitForEvents(wait_list_length, wait_list);
}
开发者ID:chappjc,项目名称:Deathray,代码行数:37,代码来源:deathray.cpp
示例13: copyhostptr_roundtrip_func
void copyhostptr_roundtrip_func()
{
timer.Start(timer_id);
//set up buffer
cl_int err;
buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
(buffer_.lda_ * buffer_.a_num_vectors_ +
buffer_.offA_) * sizeof(T),
buffer_.a_, &err);
buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(T),
buffer_.b_, &err);
//call func
xTrsm_Function(false);
//read gpu buffer
err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
sizeof(T),
buffer_.b_, 0, NULL, &event_);
clWaitForEvents(1, &event_);
timer.Stop(timer_id);
}
开发者ID:nagyist,项目名称:clBLAS,代码行数:24,代码来源:clfunc_xtrsm.hpp
示例14: write_to_buffer
// Appends cCandidate to e's candidate buffer and triggers scoring once the
// buffer is full.
//
// When the buffer is empty on entry, the function first waits for
// e->clEventSent, optionally adds its START->END profiling interval to the
// device's totalSendTime (only when profiling is enabled and both queries
// succeed), and releases that event.
void write_to_buffer(eObj* e, cObj cCandidate) {
    Tempest::data.lNumPSMs += 1;

    const bool bufferIsEmpty = (e->iNumBufferedCandidates == 0);
    if (bufferIsEmpty) {
        clWaitForEvents(1, &(e->clEventSent));

        if (Tempest::config.profile) {
            cl_ulong sendStart;
            cl_ulong sendEnd;
            int status;

            status  = clGetEventProfilingInfo(e->clEventSent, CL_PROFILING_COMMAND_START,
                                              sizeof(cl_ulong), &sendStart, NULL);
            status |= clGetEventProfilingInfo(e->clEventSent, CL_PROFILING_COMMAND_END,
                                              sizeof(cl_ulong), &sendEnd, NULL);
            if (status == 0) {
                e->device->totalSendTime += (sendEnd - sendStart);
            }
        }

        clReleaseEvent(e->clEventSent);
    }

    // Store the candidate in the next free slot, then bump both counters.
    e->candidateBuffer[e->iNumBufferedCandidates] = cCandidate;
    e->iNumBufferedCandidates++;
    e->iNumCandidates++;

    // A full buffer is handed to the device for scoring.
    if (e->iNumBufferedCandidates == e->candidateBufferSize) {
        e->device->scoreCandidates(e);
    }
}
开发者ID:markadamo,项目名称:tempest,代码行数:24,代码来源:theoretical.cpp
示例15: Dsyrk_internal
/*
 * Runs clblasDsyrk (column-major) on host matrices `a` and `c` using the
 * first queue of `env`, and reads the updated C back into `c`.
 *
 * C is only uploaded when beta != 0; otherwise a device buffer is created
 * without host data. The inner dimension passed to clBLAS is `ar` for
 * clblasNoTrans and `ac` otherwise. Each OpenCL/clBLAS status goes through
 * the CHECK() macro; CL_SUCCESS is returned at the end.
 */
cl_int Dsyrk_internal(
    cl_env *env, double *a, double *c, double alpha, double beta,
    clblasTranspose transA, clblasUplo uplo, int ar, int ac, int n, int size_a, int size_c)
{
    CHECK(clblasSetup());

    cl_event events[NEVENTS];
    int pending = 0;   /* number of upload events recorded so far */

    /* Upload A; its event is always recorded. */
    cl_mem dev_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, &(events[pending++]));

    /* Upload C only if it contributes to the result. */
    cl_mem dev_c;
    if (beta != 0) {
        dev_c = create_mem(env, c, size_c, CL_MEM_READ_WRITE, &(events[pending++]));
    } else {
        dev_c = create_mem(env, NULL, size_c, CL_MEM_READ_WRITE, NULL);
    }

    int inner_dim;
    if (transA == clblasNoTrans) {
        inner_dim = ar;
    } else {
        inner_dim = ac;
    }

    /* The kernel waits on the uploads and signals events[pending]. */
    cl_int status = clblasDsyrk(clblasColumnMajor, uplo, transA,
                                n, inner_dim, alpha, dev_a, 0, ac, beta, dev_c, 0, n,
                                1, &(env->queues[0]), pending, events, &(events[pending]));
    CHECK(status);

    /* The read-back depends on the kernel event; wait for the read to finish. */
    events[pending + 1] = *read_mem(env, dev_c, c, size_c, 1, &(events[pending]));
    CHECK(clWaitForEvents(1, &(events[pending + 1])));

    CHECK(clReleaseMemObject(dev_a));
    CHECK(clReleaseMemObject(dev_c));
    clblasTeardown();
    return CL_SUCCESS;
}
开发者ID:yeomii,项目名称:RclBLAS,代码行数:24,代码来源:blas3-wrapper.c
示例16: main
/*
 * clBLAS SROT example: applies a plane rotation to the file-level vectors
 * X and Y on the first GPU device of the first platform and prints the
 * vectors (via printResult()) before and after.
 *
 * Returns 0 on success and 1 if setup or the clblasSrot() call fails.
 *
 * Fixes relative to the previous version:
 *  - The result of clGetPlatformIDs() is checked right away. Before, it was
 *    overwritten by an extra CL_DEVICE_TYPE_CPU device query whose failure
 *    was reported as a clGetPlatformIDs() failure, so the example could not
 *    run on systems without a CPU device even though only the GPU device is
 *    used. That stray query is removed.
 *  - The event returned by clblasSrot() is released after the wait; it used
 *    to leak.
 */
int
main(void)
{
    cl_int err;
    cl_platform_id platform = 0;
    cl_device_id device = 0;
    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
    cl_context ctx = 0;
    cl_command_queue queue = 0;
    cl_mem bufX, bufY;
    cl_event event = NULL;
    int ret = 0;
    int lenX = 1 + (N-1)*abs(incx);
    int lenY = 1 + (N-1)*abs(incy);

    /* Setup OpenCL environment. */
    err = clGetPlatformIDs(1, &platform, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetPlatformIDs() failed with %d\n", err );
        return 1;
    }

    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
    if (err != CL_SUCCESS) {
        printf( "clGetDeviceIDs() failed with %d\n", err );
        return 1;
    }

    props[1] = (cl_context_properties)platform;
    ctx = clCreateContext(props, 1, &device, NULL, NULL, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateContext() failed with %d\n", err );
        return 1;
    }

    queue = clCreateCommandQueue(ctx, device, 0, &err);
    if (err != CL_SUCCESS) {
        printf( "clCreateCommandQueue() failed with %d\n", err );
        clReleaseContext(ctx);
        return 1;
    }

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        printf("clblasSetup() failed with %d\n", err);
        clReleaseCommandQueue(queue);
        clReleaseContext(ctx);
        return 1;
    }

    /* Prepare OpenCL memory objects and place vectors inside them. */
    bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err);
    bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err);

    err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL);

    printResult();

    /* Call clblas function. */
    err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event);
    if (err != CL_SUCCESS) {
        printf("clblasSrot() failed with %d\n", err);
        ret = 1;
    }
    else {
        /* Wait for calculations to be finished, then drop our event reference. */
        err = clWaitForEvents(1, &event);
        clReleaseEvent(event);

        /* Fetch results of calculations from GPU memory. */
        err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)),
                                  Y, 0, NULL, NULL);
        err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)),
                                  X, 0, NULL, NULL);

        /* At this point the rotated vectors are back in X and Y. */
        printResult();
    }

    /* Release OpenCL memory objects. */
    clReleaseMemObject(bufY);
    clReleaseMemObject(bufX);

    /* Finalize work with clblas. */
    clblasTeardown();

    /* Release OpenCL working objects. */
    clReleaseCommandQueue(queue);
    clReleaseContext(ctx);

    return ret;
}
开发者ID:BenjaminCoquelle,项目名称:clBLAS,代码行数:98,代码来源:example_srot.c
示例17: groupBy
struct tableNode * groupBy(struct groupByNode * gb, struct clContext * context, struct statistic * pp){
struct timespec start,end;
clock_gettime(CLOCK_REALTIME,&start);
cl_event ndrEvt;
cl_ulong startTime,endTime;
struct tableNode * res = NULL;
long gpuTupleNum;
int gpuGbColNum;
cl_mem gpuGbIndex;
cl_mem gpuGbType, gpuGbSize;
cl_mem gpuGbKey;
cl_mem gpuContent;
int gbCount; // the number of groups
int gbConstant = 0; // whether group by constant
cl_int error = 0;
res = (struct tableNode *) malloc(sizeof(struct tableNode));
CHECK_POINTER(res);
res->tupleSize = gb->tupleSize;
res->totalAttr = gb->outputAttrNum;
res->attrType = (int *) malloc(sizeof(int) * res->totalAttr);
CHECK_POINTER(res->attrType);
res->attrSize = (int *) malloc(sizeof(int) * res->totalAttr);
CHECK_POINTER(res->attrSize);
res->attrTotalSize = (int *) malloc(sizeof(int) * res->totalAttr);
CHECK_POINTER(res->attrTotalSize);
res->dataPos = (int *) malloc(sizeof(int) * res->totalAttr);
CHECK_POINTER(res->dataPos);
res->dataFormat = (int *) malloc(sizeof(int) * res->totalAttr);
CHECK_POINTER(res->dataFormat);
res->content = (char **) malloc(sizeof(char **) * res->totalAttr);
CHECK_POINTER(res->content);
for(int i=0;i<res->totalAttr;i++){
res->attrType[i] = gb->attrType[i];
res->attrSize[i] = gb->attrSize[i];
res->dataFormat[i] = UNCOMPRESSED;
}
gpuTupleNum = gb->table->tupleNum;
gpuGbColNum = gb->groupByColNum;
if(gpuGbColNum == 1 && gb->groupByIndex[0] == -1){
gbConstant = 1;
}
size_t localSize = 128;
size_t globalSize = 1024*128;
int blockNum = gb->table->tupleNum / localSize + 1;
if(blockNum < 1024)
globalSize = blockNum * 128;
cl_mem gpu_hashNum;
cl_mem gpu_psum;
cl_mem gpuGbCount;
long * cpuOffset = (long *)malloc(sizeof(long) * gb->table->totalAttr);
CHECK_POINTER(cpuOffset);
long offset = 0;
long totalSize = 0;
for(int i=0;i<gb->table->totalAttr;i++){
int attrSize = gb->table->attrSize[i];
int size = attrSize * gb->table->tupleNum;
cpuOffset[i] = offset;
/*align each column*/
if(size % 4 !=0){
size += 4 - (size%4);
}
offset += size;
totalSize += size;
}
gpuContent = clCreateBuffer(context->context,CL_MEM_READ_ONLY, totalSize,NULL,&error);
for(int i=0;i<gb->table->totalAttr;i++){
int attrSize = gb->table->attrSize[i];
int size = attrSize * gb->table->tupleNum;
if(gb->table->dataPos[i]==MEM){
error = clEnqueueWriteBuffer(context->queue, gpuContent, CL_TRUE, cpuOffset[i], size, gb->table->content[i],0,0,&ndrEvt);
#ifdef OPENCL_PROFILE
clWaitForEvents(1, &ndrEvt);
//.........这里部分代码省略.........
开发者ID:arnaudperin,项目名称:gpudb,代码行数:101,代码来源:groupby.cpp
示例18: main
int main(int argc, char **argv) {
cl_int status;
const char *platform_name = "NVIDIA";
if (!find_platform(platform_name, &platform)) {
fprintf(stderr,"Error: Platform \"%s\" not found\n", platform_name);
print_platforms();
teardown(-1);
}
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
checkError (status, "Error: could not query devices");
context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
checkError(status, "could not create context");
const char name[] = KERNELDIR "/reduce.cl";
unsigned char *source;
size_t size;
if (!load_file(name, &source, &size)) {
teardown(-1);
}
program = clCreateProgramWithSource(context, 1, (const char **) &source, &size, &status);
checkError(status, "Error: failed to create program %s: ", name);
status = clBuildProgram(program, 1, &device, "-I.", NULL, NULL);
if (status != CL_SUCCESS) {
print_build_log(program, device);
checkError(status, "Error: failed to create build %s: ", name);
}
free(source);
print_device_info(device, 0);
queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
checkError(status, "could not create command queue");
cl_ulong start, end;
cl_event event;
size_t width = 1024+1024;
size_t buf_size = width*sizeof(cl_float);
kernel = clCreateKernel(program, "reduce", &status);
checkError(status, "could not create kernel");
size_t work_size = width;
size_t local_size = 64;
size_t local_buf_size = local_size * sizeof(cl_float);
size_t groups = width / local_size;
size_t res_buf_size = groups * sizeof(cl_float);
float *data_in = malloc(buf_size);
float *data_out = malloc(res_buf_size);
if (!data_in || !data_out) {
fprintf(stderr,"\nError: malloc failed\n");
teardown(-1);
}
for (unsigned int i = 0; i < width; ++i) {
data_in[i] = (float) (i % 16);
}
buffer_in = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &status);
checkError(status, "Error: could not create buffer_in");
buffer_out = clCreateBuffer(context, CL_MEM_READ_WRITE, res_buf_size, NULL, &status);
checkError(status, "Error: could not create buffer_out");
status = clEnqueueWriteBuffer(queue, buffer_in, CL_FALSE, 0, buf_size, data_in, 0, NULL, NULL);
checkError(status, "Error: could not copy data into device");
// execute kernel
int arg = 0;
status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_in);
status = clSetKernelArg(kernel, arg++, sizeof(cl_mem), &buffer_out);
status = clSetKernelArg(kernel, arg++, local_buf_size, NULL);
status = clSetKernelArg(kernel, arg++, sizeof(cl_int), &width);
checkError(status, "Error: could not set args");
status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_size, &local_size, 0, NULL, &event);
checkError(status, "Error: could not enqueue kernel");
status = clWaitForEvents(1, &event);
checkError(status, "Error: could not wait for event");
status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
checkError(status, "Error: could not get start profile information");
status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
checkError(status, "Error: could not get end profile information");
status = clReleaseEvent(event);
checkError(status, "Error: could not release event");
//.........这里部分代码省略.........
开发者ID:sifrrich,项目名称:ocl-examples,代码行数:101,代码来源:reduce.c
示例19: clEnqueueNDRangeKernel
//.........这里部分代码省略.........
status = clEnqueueNDRangeKernel(
commandQueue,
kernel,
1,
NULL,
globalThreads,
localThreads,
0,
NULL,
NULL);
if(!sampleCommon->checkVal(
status,
CL_SUCCESS,
"clEnqueueNDRangeKernel failed."))
{
return SDK_FAILURE;
}
status = clFinish(commandQueue);
if(!sampleCommon->checkVal(
status,
CL_SUCCESS,
"clFinish failed."))
{
return SDK_FAILURE;
}
/* Copy data from new to old */
status = clEnqueueCopyBuffer(commandQueue,
newPos,
currPos,
0,
0,
sizeof(cl_float4) * numBodies,
0,
0,
0);
if(!sampleCommon->checkVal(
status,
CL_SUCCESS,
"clEnqueueCopyBuffer failed.(newPos->oldPos)"))
{
return SDK_FAILURE;
}
status = clEnqueueCopyBuffer(commandQueue,
newVel,
currVel,
0,
0,
sizeof(cl_float4) * numBodies,
0,
0,
0);
if(!sampleCommon->checkVal(
status,
CL_SUCCESS,
"clEnqueueCopyBuffer failed.(newVel->oldVels)"))
{
return SDK_FAILURE;
}
status = clFinish(commandQueue);
if(!sampleCommon->checkVal(
status,
CL_SUCCESS,
"clFinish failed."))
{
return SDK_FAILURE;
}
/* Enqueue readBuffer*/
status = clEnqueueReadBuffer(
commandQueue,
currPos,
CL_TRUE,
0,
numBodies* sizeof(cl_float4),
pos,
0,
NULL,
&events[0]);
if(!sampleCommon->checkVal(
status,
CL_SUCCESS,
"clEnqueueReadBuffer failed."))
return SDK_FAILURE;
/* Wait for the read buffer to finish execution */
status = clWaitForEvents(1, &events[0]);
if(!sampleCommon->checkVal(
status,
CL_SUCCESS,
"clWaitForEvents failed."))
return SDK_FAILURE;
clReleaseEvent(events[0]);
return SDK_SUCCESS;
}
开发者ID:KlozeKao,项目名称:wfvopencl-benchmarks,代码行数:101,代码来源:NBody.cpp
示例20: crossprod_clblas
//.........这里部分代码省略.........
} else {
cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
ncol * ncol * sizeof(double), output_matrix_d, &err);
}
}
// ++++++++++++
const clblasOrder order = clblasColumnMajor;
const clblasTranspose transA = clblasTrans;
const size_t lda = nrow;
const size_t ldc = ncol;
const cl_float alpha = 1.0;
clblasUplo uplo = clblasUpper;
cl_event event = NULL;
if (err == CL_SUCCESS) {
if (use_float) {
if (debug) {
result << "clblasSsyrk:" << std::endl;
}
status = clblasSsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda, 0.0,
cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
if (status != CL_SUCCESS && debug) {
result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl;
}
} else {
if (debug) {
result << "clblasDsyrk:" << std::endl;
}
status = clblasDsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda, 0.0,
cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
if (status != CL_SUCCESS) {
if (debug) {
result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl;
}
err = status;
}
}
}
if (err == CL_SUCCESS) {
/* Wait for calculations to be finished. */
if (debug) {
result << "clWaitForEvents:" << std::endl;
}
err = clWaitForEvents(1, &event);
}
// retrieve result
if (err == CL_SUCCESS) {
if (debug) {
result << "Retrieve result:" << std::endl;
}
if (use_float) {
clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, ncol * ncol * sizeof(float), output_matrix_f, 0, NULL, NULL);
symmetrizeSquare_f(output_matrix_f, ncol);
} else {
clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, ncol * ncol * sizeof(double), output_matrix_d, 0, NULL, NULL);
symmetrizeSquare_d(output_matrix_d, ncol);
}
}
std::string err_str = clErrorToString(err);
result << std::endl << err_str << std::endl;
// cleanup
clReleaseMemObject(cl_output_matrix);
cl_output_matrix = NULL;
clReleaseMemObject(cl_input_matrix);
cl_input_matrix = NULL;
clReleaseCommandQueue(queue);
queue = NULL;
clReleaseContext(context);
context = NULL;
if (debug) {
CERR << result.str();
}
ErrorStatus errorStatus = { err, status };
// return status != CL_SUCCESS ? clblasErrorToString(status) : clErrorToString(err);
return errorStatus;
}
开发者ID:quadrivio,项目名称:multiblas,代码行数:101,代码来源:crossprod_clblas.cpp
注:本文中的clWaitForEvents函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论