本文整理汇总了Python中pyopencl.enqueue_nd_range_kernel函数的典型用法代码示例。如果您正苦于以下问题：Python enqueue_nd_range_kernel函数的具体用法？Python enqueue_nd_range_kernel怎么用？Python enqueue_nd_range_kernel使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了enqueue_nd_range_kernel函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: do_opencl_pow
def do_opencl_pow(hash, target):
    """Run the ``kernel_sha512`` OpenCL kernel until it reports a result.

    Uses the module-level ``ctx``, ``queue``, ``program`` and ``hash_dt``.

    hash: hex string; 16 zero digits are prepended before it is hex-decoded
        into the ``v`` field of the kernel input record.
    target: value stored in the ``target`` field of the input record.

    Returns the first field of the single output record: the non-zero value
    written by the kernel, or 0 right away when ``ctx == False``.
    """
    found = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if ctx == False:
        return found[0][0]

    request = numpy.zeros(1, dtype=hash_dt, order='C')
    request[0]['v'] = ("0000000000000000" + hash).decode("hex")
    request[0]['target'] = target

    mem = cl.mem_flags
    hash_buf = cl.Buffer(ctx, mem.READ_ONLY | mem.COPY_HOST_PTR, hostbuf=request)
    dest_buf = cl.Buffer(ctx, mem.WRITE_ONLY, found.nbytes)

    kernel = program.kernel_sha512
    # The work-group size is queried for device index 1 of platform index 0.
    device = cl.get_platforms()[0].get_devices()[1]
    worksize = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, device)

    for index, buf in enumerate((hash_buf, dest_buf)):
        kernel.set_arg(index, buf)

    started = time.time()
    tried = 0
    batch = worksize * 2000  # work items per launch
    while found[0][0] == 0:
        # Argument 2 is the running offset, packed as a little-endian uint64.
        kernel.set_arg(2, pack("<Q", tried))
        cl.enqueue_nd_range_kernel(queue, kernel, (batch,), (worksize,))
        cl.enqueue_read_buffer(queue, dest_buf, found)
        queue.finish()
        tried += batch
        elapsed = time.time() - started
        print("%s %s hashes/sec" % (elapsed, tried / elapsed))
    total = time.time() - started
    print("%s %s" % (tried, total))
    return found[0][0]
开发者ID:N0U,项目名称:PyBitmessage,代码行数:33,代码来源:openclpow.py
示例2: do_opencl_pow
def do_opencl_pow(hash, target):
    """Run the ``kernel_sha512`` OpenCL kernel until it reports a result.

    Reads the module-level ``ctx``, ``queue``, ``program``, ``gpus`` and
    ``hash_dt`` (they are only read, so no ``global`` statement is needed).

    hash: hex string; 16 zero digits are prepended before it is hex-decoded
        into the ``v`` field of the kernel input record.
    target: value stored in the ``target`` field of the input record.

    Returns the first field of the single output record: the non-zero value
    written by the kernel, or 0 right away when ``ctx == False``.
    """
    output = numpy.zeros(1, dtype=[("v", numpy.uint64, 1)])
    if ctx == False:
        return output[0][0]

    data = numpy.zeros(1, dtype=hash_dt, order="C")
    data[0]["v"] = ("0000000000000000" + hash).decode("hex")
    data[0]["target"] = target

    mem = cl.mem_flags
    hash_buf = cl.Buffer(ctx, mem.READ_ONLY | mem.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, mem.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, gpus[0])

    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    progress = 0
    globamt = worksize * 2000  # work items per launch
    while output[0][0] == 0:
        # Argument 2 is the running offset, packed as a little-endian uint64.
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        cl.enqueue_read_buffer(queue, dest_buf, output)
        queue.finish()
        progress += globamt
    return output[0][0]
开发者ID:Basti1993,项目名称:PyBitmessage,代码行数:35,代码来源:openclpow.py
示例3: do_opencl_pow
def do_opencl_pow(hash, target):
    """Run the ``kernel_sha512`` OpenCL kernel until it reports a result.

    Reads the module-level ``ctx``, ``queue``, ``program``, ``enabledGpus``,
    ``hash_dt`` and ``shutdown``.

    hash: hex string; 16 zero digits are prepended before it is hex-decoded
        into the ``v`` field of the kernel input record.
    target: value stored in the ``target`` field of the input record.

    Returns the first field of the single output record: the non-zero value
    written by the kernel, or 0 right away when ``enabledGpus`` is empty.

    Raises Exception("Interrupted") when ``shutdown`` is non-zero after the
    search loop ends.
    """
    output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if len(enabledGpus) == 0:
        return output[0][0]

    data = numpy.zeros(1, dtype=hash_dt, order='C')
    data[0]['v'] = ("0000000000000000" + hash).decode("hex")
    data[0]['target'] = target

    mem = cl.mem_flags
    hash_buf = cl.Buffer(ctx, mem.READ_ONLY | mem.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, mem.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, enabledGpus[0])

    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    progress = 0
    globamt = worksize * 2000  # work items per launch
    # Stop when the kernel writes a result or when shutdown is requested.
    while output[0][0] == 0 and shutdown == 0:
        # Argument 2 is the running offset, packed as a little-endian uint64.
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        cl.enqueue_read_buffer(queue, dest_buf, output)
        queue.finish()
        progress += globamt
    if shutdown != 0:
        raise Exception("Interrupted")
    return output[0][0]
开发者ID:Bitmessage,项目名称:PyBitmessage,代码行数:35,代码来源:openclpow.py
示例4: max_length_real4
def max_length_real4(ipt):
    """Return ``max_reduce`` applied to the output of the length kernel.

    A ``CLReal`` with ``len(ipt)`` elements is allocated as the kernel output.
    The kernel held by ``_lengthkern_real4`` is enqueued on ``ipt``'s queue
    with a global size of ``len(ipt)`` and no explicit local size, and the
    output is handed to ``max_reduce``.
    """
    n_items = len(ipt)
    lengths = CLReal(n_items)
    kernel = _lengthkern_real4.kern
    for position, buf in enumerate((ipt._buffer, lengths._buffer)):
        kernel.set_arg(position, buf)
    cl.enqueue_nd_range_kernel(ipt._ctrl.clqueue, kernel, (n_items,), None)
    return max_reduce(lengths)
开发者ID:hagisgit,项目名称:SLIC,代码行数:7,代码来源:tools.py
示例5: __call__
def __call__(self, thread_count, work_group_size, *args):
    """Compile the kernel, bind ``args`` positionally and wait for the run.

    thread_count: global work size (one dimension).
    work_group_size: local work size (one dimension).
    args: kernel arguments, bound to argument indices 0, 1, 2, ...
    """
    kernel = self.compile()
    position = 0
    for value in args:
        kernel.set_arg(position, value)
        position += 1
    global_size = (thread_count,)
    local_size = (work_group_size,)
    with timed_region("ParLoop kernel"):
        event = cl.enqueue_nd_range_kernel(_queue, kernel, global_size,
                                           local_size, g_times_l=False)
        event.wait()
开发者ID:GitPaean,项目名称:PyOP2,代码行数:7,代码来源:opencl.py
示例6: filterPrepare
def filterPrepare(self, e, data, keys, ndata, events):
    """Enqueue the ``filterPrepare`` kernel over ``data`` and ``keys``.

    e: events the kernel must wait for, or None.
    data, keys: host arrays or existing ``cl.Buffer`` objects. Host arrays
        are uploaded into new read-only buffers; buffers are used as is.
    ndata: ignored; the element count is always taken from ``data.size``.
    events: collection extended in place with the kernel event.

    Returns ``(e, data_buf, keys_buf, filt_buf)`` where ``e`` is a
    one-element list holding the kernel event.

    Raises ValueError (a subclass of the bare Exception raised previously)
    when ``keys.size`` differs from ``data.size``.
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    # NOTE(review): for a cl.Buffer argument ``.size`` is presumably a byte
    # count rather than an element count - confirm against the callers.
    ndata = data.size
    if keys.size != ndata:
        raise ValueError(
            "keys.size (%s) does not match data.size (%s)" % (keys.size, ndata))
    filtbytes = np.bool_(False).nbytes * ndata

    if isinstance(data, cl.Buffer):
        data_buf = data
    else:
        data_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
    if isinstance(keys, cl.Buffer):
        keys_buf = keys
    else:
        keys_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=keys)
    filt_buf = cl.Buffer(self.ctx, mf.READ_WRITE, filtbytes)

    kernel = self.prg.filterPrepare
    kernel.set_args(data_buf, keys_buf, np.uint64(ndata), np.uint8(33), np.uint8(66), filt_buf)
    global_dims = self.get_global(self.get_grid_dims(ndata))
    print("filterPrepare")

    # wait_for=None is the default, so one call covers both the "no
    # dependency" and the "wait for e" cases.
    e = [cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                    self.localDims, wait_for=e)]
    events += e
    return (e, data_buf, keys_buf, filt_buf)
开发者ID:Kobtul,项目名称:documents,代码行数:34,代码来源:filter.py
示例7: prefixSumUp
def prefixSumUp(self, e, data, ndata, data2, ndata2, events):
    """Enqueue the ``prefixSumUp`` kernel on ``data`` and ``data2``.

    e: events the kernel must wait for, or None.
    data, data2: host arrays or existing ``cl.Buffer`` objects. Host arrays
        are uploaded into new read-write buffers; buffers are used as is.
    ndata, ndata2: passed to the kernel as ``uint64`` scalars; ``ndata``
        also determines the launch grid.
    events: collection extended in place with the kernel event.

    Returns ``(e, data_buf, data2_buf)`` where ``e`` is a one-element tuple
    holding the kernel event.
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    if isinstance(data, cl.Buffer):
        data_buf = data
    else:
        data_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data)
    if isinstance(data2, cl.Buffer):
        data2_buf = data2
    else:
        data2_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data2)

    kernel = self.prg.prefixSumUp
    kernel.set_args(data_buf, np.uint64(ndata), data2_buf, np.uint64(ndata2))
    global_dims = self.get_global(self.get_grid_dims(ndata))
    print("prefixSumUp")

    # The original test was inverted: it passed wait_for only when e was None
    # and dropped the dependency whenever real events were supplied.
    # wait_for=None is the default, so always forwarding e is correct.
    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                    self.localDims, wait_for=e),)
    events += e
    return (e, data_buf, data2_buf)
开发者ID:Kobtul,项目名称:documents,代码行数:28,代码来源:filter.py
示例8: exec_lsz_safe
def exec_lsz_safe(self, localsize):
    """Execute the kernel with a specific local size.

    Also safe for kernels with local variables.

    ``self.localsize`` is set to ``localsize`` for the duration of the
    launch, the queue is drained with ``finish()``, and the previous local
    size is restored afterwards - including when the launch raises.
    """
    previous = int(self._localsize)
    self.localsize = localsize
    try:
        cl.enqueue_nd_range_kernel(self._solverobj.clqueue, self._clkernel,
                                   (self.globalsize,), (self.localsize,))
        self._solverobj.clqueue.finish()
    finally:
        # Restore even on failure so a bad launch does not leave the
        # temporary local size in place.
        self.localsize = previous
开发者ID:hagisgit,项目名称:qcl,代码行数:8,代码来源:QclKernel.py
示例9: test_algorithm
def test_algorithm(self):
    """Run the PBRS kernel against the CSV test bench and print the outcome.

    For each of 6 test cases, 8 lines are read from
    ``test_bench_pbrs_input.csv`` and ``test_bench_pbrs_output.csv`` and
    joined with commas. The input bytes are copied to the device, the kernel
    is run with global and local size 8, and the output bytes are compared
    with the reference.

    Returns True when all 6 cases pass and False otherwise. Returns None
    when ``self.kernelname`` is empty.
    """
    print("\n**************************")
    print("test_pbrs:")
    buffersize_in = 188*8
    buffersize_out = 188*8
    # opencl buffers of uint (4 bytes per element)
    self.inputbuffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=buffersize_in*4)
    self.outputbuffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=buffersize_out*4)

    # NOTE(review): the function returns inside this loop, so only the first
    # kernel name is exercised - confirm that this is intended.
    for k in self.kernelname:
        kernel = self.load_kernel(self.filename, k)
        passed = 0
        # "with" closes both files even if a case raises.
        with open('test_bench_pbrs_input.csv', 'r') as fd_input, \
                open('test_bench_pbrs_output.csv', 'r') as fd_output:
            self.fd_input = fd_input
            self.fd_output = fd_output
            for j in range(0, 6):
                # "//" keeps the element count an integer on Python 2 and 3.
                encoded_data = numpy.zeros(buffersize_out // 4, dtype=numpy.uint32)

                # One test case spans 8 consecutive CSV lines.
                input_text = ",".join(
                    fd_input.readline().replace('\n', '') for _ in range(8))
                reference_text = ",".join(
                    fd_output.readline().replace('\n', '') for _ in range(8))

                # Parse as bytes, then reinterpret the bytes as uint32 words.
                input_bytes = numpy.fromstring(input_text, dtype=numpy.uint8, sep=",")
                data_to_encode = numpy.fromstring(input_bytes.tostring(), dtype=numpy.uint32)
                reference_data = numpy.fromstring(reference_text, dtype=numpy.uint8, sep=",")

                cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode).wait()
                kernel.set_args(self.inputbuffer, self.outputbuffer)
                cl.enqueue_nd_range_kernel(self.queue, kernel, (8,), (8,), None).wait()
                cl.enqueue_copy(self.queue, encoded_data, self.outputbuffer).wait()
                encoded_data = numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8)

                if encoded_data.tostring() == reference_data.tostring():
                    passed += 1
                    print("Test %d PASSED" % (j+1))
                else:
                    print("Test %d FAILED" % (j+1))
                    print("input data:")
                    print(numpy.fromstring(data_to_encode.tostring(), dtype=numpy.uint8))
                    print("encoded data:")
                    print(numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
                    print("reference data:")
                    print(reference_data)
                    print("error data:")
                    print(reference_data - numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
        print("%d pass out of 6" % passed)
        if passed == 6:
            print("All pbrs tests PASS\n")
            return True
        else:
            print("at least one pbrs test FAILED\n")
            return False
开发者ID:das-labor,项目名称:dvbt,代码行数:56,代码来源:create_pbrs_kernel.py
示例10: max_reduce_real4
def max_reduce_real4(ipt):
    """Return a 3-tuple of ``max_reduce`` results, one per split output.

    Three ``CLReal`` arrays of ``len(ipt)`` elements are allocated. The
    kernel held by ``_splitkern_real4`` is enqueued on ``ipt``'s queue with
    ``ipt`` as argument 0 and the three arrays as arguments 1-3, using a
    global size of ``len(ipt)`` and no explicit local size. Each array is
    then reduced with ``max_reduce``.
    """
    n_items = len(ipt)
    parts = [CLReal(n_items) for _ in range(3)]
    kernel = _splitkern_real4.kern
    kernel.set_arg(0, ipt._buffer)
    for position, part in enumerate(parts, 1):
        kernel.set_arg(position, part._buffer)
    cl.enqueue_nd_range_kernel(ipt._ctrl.clqueue, kernel, (n_items,), None)
    first, second, third = parts
    return max_reduce(first), max_reduce(second), max_reduce(third)
开发者ID:hagisgit,项目名称:SLIC,代码行数:11,代码来源:tools.py
示例11: prefixSum
def prefixSum(self, e, data, keys, ndata, low, hi, events):
    """Enqueue the ``prefixSumDown`` kernel and the follow-up passes.

    e: events the first kernel must wait for, or None.
    data, keys: host arrays or existing ``cl.Buffer`` objects. Host arrays
        are uploaded into new read-only buffers; buffers are used as is.
    ndata: element count; determines the grid and the size of ``psum_buf``.
    low, hi: converted with ``PrefixSum.HOST_TYPE_KEYS`` and passed to the
        kernel.
    events: collection extended in place with every enqueued event.

    Returns ``(e, data_buf, keys_buf, psum_buf, bsum_buf, nbsum_buf,
    ndata2)``.
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    if isinstance(data, cl.Buffer):
        data_buf = data
    else:
        data_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=data)
    if isinstance(keys, cl.Buffer):
        keys_buf = keys
    else:
        keys_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=keys)

    grid_dims = self.get_grid_dims(ndata)
    item_bytes = np.uint64(0).nbytes
    psumbytes = ndata * item_bytes
    bsumbytes = int(np.prod(grid_dims) * item_bytes)
    nbsumbytes = item_bytes
    psum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, psumbytes)
    bsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, bsumbytes)
    nbsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, nbsumbytes)

    low = PrefixSum.HOST_TYPE_KEYS(low)
    hi = PrefixSum.HOST_TYPE_KEYS(hi)
    kernel = self.prg.prefixSumDown
    kernel.set_args(data_buf, keys_buf, np.uint64(ndata), low, hi, psum_buf, bsum_buf, nbsum_buf)
    global_dims = self.get_global(grid_dims)
    print("prefixSumDown %s %s" % (str(global_dims), str(self.localDims)))

    # The original test was inverted: it passed wait_for only when e was None
    # and dropped the dependency whenever real events were supplied.
    # wait_for=None is the default, so always forwarding e is correct.
    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                    self.localDims, wait_for=e),)
    events += e

    nbsum = np.zeros(1, dtype=np.uint64)
    events += (cl.enqueue_copy(self.queue, nbsum, nbsum_buf, wait_for=e),)
    if nbsum > 1:
        (e, bsum_buf, bsum1_buf, nbsum1_buf, ndata2) = self.prefixSumDownInplace(e, bsum_buf, nbsum.item(), events)
    else:
        ndata2 = np.zeros(1, dtype=np.uint64)
        events += (cl.enqueue_copy(self.queue, ndata2, bsum_buf, wait_for=e),)
        ndata2 = ndata2.item()
    # NOTE(review): the source lost its indentation; the three statements
    # below are assumed to run on both branches - confirm against upstream.
    print(ndata2)
    self.prefixSumUp(e, psum_buf, ndata, bsum_buf, nbsum, events)
    return (e, data_buf, keys_buf, psum_buf, bsum_buf, nbsum_buf, ndata2)
开发者ID:Kobtul,项目名称:documents,代码行数:53,代码来源:filter.py
示例12: solve
def solve(self, puzzle, simulations=16384, iterations=35, workGroupSize=128):
    """Run the ``montecarlo`` kernel and return the best solution found.

    puzzle: mapping with at least ``'width'`` and ``'height'``; it is also
        passed to ``self.initBuffers``.
    simulations: global work size of the kernel launch.
    iterations: passed to the kernel as an ``int32``.
    workGroupSize: local work size of the kernel launch.

    Returns a list of ``{'X': col, 'Y': row, 'Size': s}`` dicts built from
    the solution whose entry in ``self.groupLengths`` is smallest.
    """
    self.simulations = simulations
    self.iterations = iterations
    self.workGroupSize = workGroupSize
    self.workGroups = int(self.simulations / self.workGroupSize)
    self.width = np.int8(puzzle['width'])
    self.height = np.int8(puzzle['height'])

    # Initialise the device buffers, then build the kernel and bind its arguments.
    self.initBuffers(puzzle)
    self.kernel = cl.Kernel(self.program, "montecarlo")
    self.kernel.set_args(
        self.lengthsBuffer, self.groupLengthsBuffer, self.puzzlesBuffer,
        self.solutionsBuffer, self.height, self.width,
        np.int32(self.iterations))

    # Launch one work item per simulation.
    cl.enqueue_nd_range_kernel(
        self.queue, self.kernel, (self.simulations,), (self.workGroupSize,))

    # Map the group-lengths buffer and fetch it as a host array.
    cl.enqueue_map_buffer(
        self.queue, self.groupLengthsBuffer, cl.map_flags.WRITE, 0,
        self.groupLengths.shape, self.groupLengths.dtype)
    self.groupLengths = self.groupLengthsBuffer.get_host_array(
        self.groupLengths.shape, dtype=self.groupLengths.dtype)

    # Same for the solutions buffer.
    cl.enqueue_map_buffer(
        self.queue, self.solutionsBuffer, cl.map_flags.WRITE, 0,
        self.solutionsFlattened.shape, self.solutions.dtype)
    self.solutions = self.solutionsBuffer.get_host_array(
        self.solutions.shape, dtype=self.solutions.dtype)

    # Release the device buffers in the original order.
    self.lengthsBuffer.release()
    self.groupLengthsBuffer.release()
    self.puzzlesBuffer.release()
    self.solutionsBuffer.release()

    # Work on a copy of the solution with the smallest group length.
    best_index = self.groupLengths.argmin()
    bestSolution = np.array(self.solutions[best_index])

    # Convert the grid into the list-of-squares format.
    solution = []
    for row in range(0, puzzle['height']):
        for col in range(0, puzzle['width']):
            if bestSolution[row][col] == -1:
                continue
            s = bestSolution[row][col]
            solution.append({'X': int(col), 'Y': int(row), 'Size': int(s)})
            # Blank the s-by-s square so its cells are not reported again.
            for d_row in range(0, s):
                for d_col in range(0, s):
                    bestSolution[row + d_row][col + d_col] = -1
    return solution
开发者ID:ohlord,项目名称:cimpress,代码行数:52,代码来源:CLSolve.py
示例13: filter
def filter(self, data, keys, low, hi, events):
    """Run the prefix sum and the ``filter`` kernel, then read back results.

    data, keys: inputs forwarded to ``self.prefixSum``; ``data.size`` is the
        element count.
    low, hi: converted with ``PrefixSum.HOST_TYPE_KEYS`` and passed to the
        kernel.
    events: collection extended in place with every enqueued event.

    Returns ``(filt, indices, data2, keys2)`` host arrays. ``filt`` is only
    filled from the device when ``PrefixSum.RETURN_FILTER == 1``.
    """
    import numpy as np
    import pyopencl as cl
    mf = cl.mem_flags

    ndata = data.size
    (e, data_buf, keys_buf, indices_buf, bsum_buf, nbsum_buf, ndata2) = self.prefixSum(None, data, keys, ndata, low, hi, events)

    with_filter = PrefixSum.RETURN_FILTER == 1

    filt = np.zeros(ndata, dtype=np.bool_)
    indices = np.zeros(ndata, dtype=np.uint64)
    data2 = np.zeros(ndata2, dtype=PrefixSum.HOST_TYPE_DATA)
    keys2 = np.zeros(ndata2, dtype=PrefixSum.HOST_TYPE_KEYS)
    ndata2bytes = np.uint64(0).nbytes

    if with_filter:
        filt_buf = cl.Buffer(self.ctx, mf.READ_WRITE, filt.nbytes)
    print(data2.nbytes)
    data2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, data2.nbytes)
    keys2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, keys2.nbytes)
    ndata2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, ndata2bytes)

    low = PrefixSum.HOST_TYPE_KEYS(low)
    hi = PrefixSum.HOST_TYPE_KEYS(hi)
    kernel = self.prg.filter
    if with_filter:
        kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata), low, hi, filt_buf, data2_buf, keys2_buf, ndata2_buf)
    else:
        kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata), low, hi, data2_buf, keys2_buf, ndata2_buf)
    global_dims = self.get_global(self.get_grid_dims(ndata))
    print("filter")

    # The original test was inverted: it passed wait_for only when e was None
    # and dropped the dependency on the prefix-sum events otherwise.
    # wait_for=None is the default, so always forwarding e is correct.
    e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                    self.localDims, wait_for=e),)
    events += e

    # Read the results back once the kernel has finished.
    if with_filter:
        events += (cl.enqueue_copy(self.queue, filt, filt_buf, wait_for=e),)
    events += (cl.enqueue_copy(self.queue, indices, indices_buf, wait_for=e),
               cl.enqueue_copy(self.queue, data2, data2_buf, wait_for=e),
               cl.enqueue_copy(self.queue, keys2, keys2_buf, wait_for=e))
    return (filt, indices, data2, keys2)
开发者ID:Kobtul,项目名称:documents,代码行数:52,代码来源:filter.py
示例14: _exec_chunked_unsafe
def _exec_chunked_unsafe(self, chunksize=0):
    """Run the kernel chunk by chunk. Unsafe for kernels with local variables.

    chunksize: when positive, ``self._prep_chunked_exec(chunksize)`` is
        called first; otherwise the current ``self._cnksz`` is used as is.

    For each chunk the chunk index is stored on the solver object under
    ``self._cnk_name``, the kernel is launched and the queue is drained. The
    last chunk uses the remainder as its global size when the array length
    is not a multiple of the chunk size.
    """
    if chunksize > 0:
        self._prep_chunked_exec(chunksize)

    total = self.leadingvar.length
    n_chunks = int(ceil(float(total) / float(self._cnksz)))
    global_size = self._cnksz
    solver = self._solverobj
    last = n_chunks - 1
    for chunk_index in range(n_chunks):
        remainder = total % global_size
        if chunk_index == last and remainder != 0:
            global_size = remainder
        setattr(solver, self._cnk_name, chunk_index)
        cl.enqueue_nd_range_kernel(solver.clqueue, self._clkernel,
                                   (global_size,), None)
        solver.clqueue.finish()
开发者ID:hagisgit,项目名称:qcl,代码行数:13,代码来源:QclKernel.py
示例15: change_display
def change_display(image):
    """Run the ``add`` kernel from ``image`` into the shared GL buffer.

    Uses the module-level ``ctx``, ``queue``, ``prog``, ``mf`` and ``buf``.
    ``image`` is uploaded into a read-only device buffer, the GL buffer is
    acquired, the kernel is launched over ``image.shape``, and the GL buffer
    is released again before the queue and the GL pipeline are flushed.
    """
    source = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=image)
    target = cl.GLBuffer(ctx, mf.WRITE_ONLY, numpy.float32(buf))
    shared = [target]

    cl.enqueue_acquire_gl_objects(queue, shared)
    kernel = prog.add
    kernel.set_args(source, target)
    cl.enqueue_nd_range_kernel(queue, kernel, image.shape, None)
    cl.enqueue_release_gl_objects(queue, [target])

    queue.finish()
    glFlush()
开发者ID:Blother,项目名称:Python_Interop,代码行数:13,代码来源:simple_interop.py
示例16: test_algorithm
def test_algorithm(self):
    """Run the Reed-Solomon kernel against the CSV test bench.

    Every line of ``test_bench_rs_input.csv`` is one test case. Its bytes
    are copied to the device, the kernel is run with a global size of 1,
    and the output is compared with the matching line of
    ``test_bench_rs_output.csv``.

    Returns True when every case passes and False otherwise. Returns None
    when ``self.kernelname`` is empty.
    """
    print("\n**************************")
    print("test_reedsolomon:")
    passed = 0
    linecnt = 1
    # opencl buffers of uint (4 bytes per element)
    self.inputbuffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=48*4)
    self.outputbuffer = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE, size=51*4)

    # NOTE(review): the function returns inside this loop, so only the first
    # kernel name is exercised - confirm that this is intended.
    for k in self.kernelname:
        kernel = self.load_kernel(self.filename, k)
        # "with" closes both files even if a case raises.
        with open('test_bench_rs_input.csv', 'r') as fd_input, \
                open('test_bench_rs_output.csv', 'r') as fd_output:
            self.fd_input = fd_input
            self.fd_output = fd_output
            for line in fd_input:
                # Parse as bytes, then reinterpret the bytes as uint32 words.
                input_bytes = numpy.fromstring(line, dtype=numpy.uint8, sep=",")
                data_to_encode = numpy.fromstring(input_bytes.tostring(), dtype=numpy.uint32)
                encoded_data = numpy.zeros(51, dtype=numpy.uint32)
                reference_data = numpy.fromstring(fd_output.readline(), dtype=numpy.uint8, sep=",")

                cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode).wait()
                kernel.set_args(self.inputbuffer, self.outputbuffer)
                cl.enqueue_nd_range_kernel(self.queue, kernel, (1,), None).wait()
                cl.enqueue_copy(self.queue, encoded_data, self.outputbuffer).wait()

                if encoded_data.tostring() == reference_data.tostring():
                    passed += 1
                    print("Test %d PASSED" % linecnt)
                else:
                    print("Test %d FAILED" % linecnt)
                    print("input data:")
                    print(numpy.fromstring(data_to_encode.tostring(), dtype=numpy.uint8))
                    print("encoded data:")
                    print(numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
                    print("reference data:")
                    print(reference_data)
                    print("error data:")
                    print(reference_data - numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
                linecnt += 1
        print("%d pass out of %d" % (passed, (linecnt-1)))
        if passed == (linecnt-1):
            print("All reedsolomon tests PASS\n")
            return True
        else:
            print("at least one reedsolomon test FAILED\n")
            return False
开发者ID:das-labor,项目名称:dvbt,代码行数:50,代码来源:create_rs_kernel.py
示例17: updateEt_vanilla
def updateEt_vanilla(self, algo="SHG"):
    """Update ``self.Et_cla`` using the vanilla algorithm.

    algo: ``"SD"`` selects the SD kernel / square-root path; any other value
        takes the SHG path.

    With ``self.useCL == True`` the inverse FFT, the summation kernel and
    the normalisation kernel are run on the device. Otherwise the same
    steps are done on the host with numpy. The elapsed time is logged at
    debug level.
    """
    root.debug("Updating Et using vanilla algorithm")
    t0 = time.clock()
    use_sd = algo == "SD"

    if self.useCL == True:
        for fft_event in self.Esig_t_tau_p_fft.enqueue(forward=False):
            fft_event.wait()

        # The summation kernel has the same name as its program entry.
        sum_name = "updateEtVanillaSumSD" if use_sd else "updateEtVanillaSumSHG"
        sum_krn = getattr(self.progs.progs[sum_name], sum_name)
        sum_krn.set_scalar_arg_dtypes((None, None, np.int32))
        sum_krn.set_args(self.Esig_t_tau_p_cla.data, self.Et_cla.data, self.N)
        cl.enqueue_nd_range_kernel(self.q, sum_krn, self.Et.shape, None).wait()

        if use_sd:
            # SD only: replace Et by its negated complex conjugate.
            Et = self.Et_cla.get()
            self.Et_cla.set(-np.conj(Et).astype(self.dtype_c).copy())

        # NOTE(review): the source lost its indentation; the normalisation
        # is assumed to run for both algorithms, as the host path below does.
        norm_krn = self.progs.progs["updateEtVanillaNorm"].updateEtVanillaNorm
        norm_krn.set_scalar_arg_dtypes((None, np.int32))
        norm_krn.set_args(self.Et_cla.data, self.N)
        cl.enqueue_nd_range_kernel(self.q, norm_krn, [1], None).wait()
    else:
        inverse = np.fft.ifft(self.Esig_w_tau_cla.get(), axis=1)
        self.Esig_t_tau_p_cla.set(inverse.astype(self.dtype_c).copy())
        Esig_t_tau_p = self.Esig_t_tau_p_cla.get()
        Et = Esig_t_tau_p.sum(axis=0)
        if use_sd:
            Et = np.sqrt(Et)
        # Normalise to a peak magnitude of 1.
        Et = Et / np.abs(Et).max()
        self.Et_cla.set(Et)

    root.debug("".join(("Time spent: ", str(time.clock() - t0))))
开发者ID:filiplindau,项目名称:Frog,代码行数:50,代码来源:FrogCalculationCL.py
示例18: calc_weights_gradient
def calc_weights_gradient(self):
    """
    Calculate the gradient of the weights.

    This method should be called only for processed layers, because it uses
    the inputs array, which is valid only at processing time.

    Unprocessed next layers are handled first. Then the layer-gradient
    kernel is enqueued for this layer, followed by one error-propagation
    kernel per previous layer; each previous layer is made to wait for its
    propagation event.
    """
    for link in self._next_layers:
        following = link[0]
        if not following.processed:
            following.calc_weights_gradient()

    ocl = self.opencl
    queue = ocl.queue
    max_local = ocl.max_local_size[0]

    # Gradient kernel for this layer.
    kernel = ocl.kernel_calc_layer_gradient
    local_bytes = int(
        4 * (self._inputs_per_neuron + 1 + max_local // self._inputs_per_neuron))
    gradient_args = (
        (2, self._inputs_offset),
        (3, self._neurons_offset),
        (4, self._inputs_per_neuron),
        (5, self._weights_offset),
        (7, self._weights_count),
        (8, pyopencl.LocalMemory(local_bytes)),
    )
    for index, value in gradient_args:
        kernel.set_arg(index, value)
    self._calc_gradient_event = pyopencl.enqueue_nd_range_kernel(
        queue, kernel,
        (int(self._weights_buf_size),), (max_local,),
        wait_for=self._calc_gradient_wait_for)
    # The pending-event list is emptied in place once it has been consumed.
    del self._calc_gradient_wait_for[:]

    # Error propagation towards every previous layer.
    kernel = ocl.kernel_propagate_errors
    kernel.set_arg(2, self._neurons_offset)
    kernel.set_arg(5, self._neuron_count)
    kernel.set_arg(7, self._inputs_per_neuron)

    weight_shift = numpy.int32(1)
    for link in self._prev_layers:
        kernel.set_arg(3, link[0]._neurons_offset + link[1])
        kernel.set_arg(4, link[2])
        kernel.set_arg(6, self._weights_offset + weight_shift)
        propagate_event = pyopencl.enqueue_nd_range_kernel(
            queue, kernel,
            (int(link[2] * 64),), (64,),
            wait_for=(self._calc_gradient_event,))
        link[0]._calc_gradient_wait_for.append(propagate_event)
        weight_shift += link[2]

    self._processed = True
开发者ID:remtcs,项目名称:gpgpu-neuralnet,代码行数:48,代码来源:layer.py
示例19: gpu_amend_values
def gpu_amend_values(queue, kernels, gpu_params, buffers, amendments):
    """
    Transfers requested amendments (after collision detection check) to the GPU,
    where a kernel applies them to the data.

    queue: command queue used for the copies and the kernel launch.
    kernels: mapping that provides ``"k_update_values"``.
    gpu_params: mapping that provides ``"preferred_multiple"``, used as the
        local work size.
    buffers: mapping that provides the three ``global_amendment*`` buffers.
    amendments: object exposing ``get_packet()``, ``amount`` and the
        ``amount_i`` / ``indices_i`` / ``values_i`` packet indices.

    Returns a list holding the kernel event, or an empty list when the
    packet reports no amendments.
    """
    intermediary_events = []
    packet = amendments.get_packet()
    if packet[amendments.amount_i] > 0:
        uploads = (
            ("global_amendments_n", amendments.amount_i),
            ("global_amendment_indices", amendments.indices_i),
            ("global_amendment_values", amendments.values_i),
        )
        events = [cl.enqueue_copy(queue, buffers[name], packet[index])
                  for name, index in uploads]

        # Round the amendment count up to a whole number of work groups.
        # float() forces true division: under Python 2 an int / int division
        # floors before np.ceil sees it, which rounded the global size down
        # and skipped the trailing amendments.
        multiple = gpu_params["preferred_multiple"]
        groups = np.ceil(float(amendments.amount) / multiple)
        global_size = int(groups * multiple)

        intermediary_events.append(
            cl.enqueue_nd_range_kernel(
                queue, kernels["k_update_values"],
                (global_size,), (multiple,),
                global_work_offset=None, wait_for=events))
    return intermediary_events
开发者ID:naummo,项目名称:swarm_maze_opencl_solver,代码行数:25,代码来源:opencl_computations_demo.py
示例20: calc_chi2
def calc_chi2(self, queue, interspace, q, Iq,
        rind, rxyz, lind, lxyz, origin, voxelspacing, fifj, targetIq, sq, chi2):
    """Bind the arguments of the ``calc_chi2`` kernel and enqueue it.

    The global size is ``max_compute_units * 16 * 512`` and the local size
    is 16. A local-memory argument of ``4 * q.shape[0] * 16`` bytes is
    passed in the fourth position. ``interspace`` and ``fifj`` are each
    accompanied by an ``int32`` vector of four entries holding their shape
    followed by their total size.

    Returns the event returned by ``cl.enqueue_nd_range_kernel``.
    """
    def shape_and_size(array):
        # Four int32 slots: the array's shape in the leading entries, its
        # total element count in the last one.
        packed = np.zeros(4, dtype=np.int32)
        packed[:-1] = array.shape
        packed[-1] = array.size
        return packed

    kernel = self.kernels.calc_chi2

    workgroupsize = 16
    gws = (queue.device.max_compute_units * workgroupsize * 512,)
    lws = (workgroupsize,)

    floatsize = 4
    tmpIq = cl.LocalMemory(floatsize * q.shape[0] * workgroupsize)

    shape = shape_and_size(interspace)
    nq = np.int32(q.shape[0])
    nind1 = np.int32(rind.shape[0])
    nind2 = np.int32(lind.shape[0])
    fifj_shape = shape_and_size(fifj)

    kernel.set_args(
        interspace.data, q.data, Iq.data, tmpIq, rind.data, rxyz.data,
        lind.data, lxyz.data, origin, voxelspacing, fifj.data,
        targetIq.data, sq.data, chi2.data,
        shape, nq, nind1, nind2, fifj_shape)

    return cl.enqueue_nd_range_kernel(queue, kernel, gws, lws)
开发者ID:latrocinia,项目名称:saxstools,代码行数:30,代码来源:kernels.py
注：本文中的pyopencl.enqueue_nd_range_kernel函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论