This article collects typical usage examples of the Python function numbapro.cuda.to_device. If you are wondering what to_device does, how to call it, or what real-world uses look like, the curated examples below should help.
The following sections show 20 code examples of the to_device function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
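Before the examples, here is a minimal sketch of the round trip that almost every snippet below follows: copy a NumPy array to the GPU with cuda.to_device, launch a kernel on the device array, and copy the result back with copy_to_host. The kernel name double_kernel and the array size are illustrative only, and the sketch assumes numba.cuda, which exposes the same to_device/copy_to_host API as the now-deprecated NumbaPro.

import numpy as np
from numba import cuda  # assumption: numba.cuda stands in for the deprecated numbapro.cuda

@cuda.jit
def double_kernel(arr):
    # illustrative kernel: double every element in place
    i = cuda.grid(1)
    if i < arr.shape[0]:
        arr[i] *= 2.0

host = np.arange(1024, dtype=np.float32)
d_arr = cuda.to_device(host)                      # copy host -> device
threads_per_block = 128
blocks = (host.size + threads_per_block - 1) // threads_per_block
double_kernel[blocks, threads_per_block](d_arr)   # kernel operates on the device array
result = d_arr.copy_to_host()                     # copy device -> host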
Example 1: evaluation_function
def evaluation_function(factors, opts):
    start = timer()
    longest_wavelet, target_samples = opts['longest_wavelet'], opts['target_samples']
    window_width = len(target_samples)
    full_width = window_width + longest_wavelet
    num_wavelengths = longest_wavelet - 2
    offsets_per_wavelet = full_width / num_wavelengths
    num_rows = offsets_per_wavelet * num_wavelengths
    result = np.zeros(window_width, dtype=np.float32)
    d_factors = cuda.to_device(factors)
    d_result = cuda.to_device(result)
    griddim = full_width, 1
    blockdim = 4, 1, 1
    compute_samples_configured = compute_sample_kernel.configure(griddim, blockdim)
    compute_samples_configured(d_factors, longest_wavelet, offsets_per_wavelet, d_result, num_rows)
    d_result.to_host()
    generated_samples_sum = sum(result)
    factors_sum = sum(factors)
    difference_from_target = math.fabs(sum(target_samples - result))
    non_zero_factors = filter(lambda x: x != 0.0, result)
    fun_value = difference_from_target + 10 * math.fabs(len(non_zero_factors))
    print("Value " + str(fun_value) + " generated in " + str(timer() - start) + " seconds. Sample sum: " +
          str(generated_samples_sum) + ". Factors sum: " + str(factors_sum))
    return fun_value
Author: RelentlessResults, Project: decompose001, Lines: 34, Source: precomputed_representation.py
Example 2: run_GPU
def run_GPU(grid, adjGrid, steps, delay, initDelay, printInd, indSteps):
    """ Runs the Command-Line interface for a specified number of steps,
    or forever if the number of steps is specified to be -1.
    Note that here, grid and adjGrid must be explicitly specified as
    opposed to passed in as a Game, to enable everything to be run on the
    GPU. Returns the final grid state. """
    step = 0
    dim = grid.shape
    # move arrays to GPU
    d_grid = cuda.to_device(grid)
    d_adjGrid = cuda.to_device(adjGrid)
    blockDim = (32, 16)
    gridDim = (32, 8)
    while step < steps or steps == -1:
        # print grid
        if printInd != -1 and step % printInd == 0:
            # in order to print grid, first need memory back in CPU
            d_grid.to_host()
            printGrid(grid, step, dim)
        # print index
        if indSteps != -1 and step % indSteps == 0:
            print("Step = " + str(step))
        newGrid = np.zeros_like(grid)
        d_newGrid = cuda.to_device(newGrid)
        evolve2D_kernel[gridDim, blockDim](d_grid, d_adjGrid, d_newGrid)
        d_grid = d_newGrid
        grid = newGrid
        sleep(delay)
        if step == 0:
            # allow initial position to be more easily visible
            sleep(initDelay)
        step += 1
    d_grid.to_host()
    return grid
Author: goldenratio1618, Project: gameoflife, Lines: 34, Source: cmdline.py
Example 3: train
def train(self, ds, epochs, batch_size=10):
    for epoch in range(epochs):
        start = timer()
        count = 0.
        correct = 0.
        for i in range(len(ds)/batch_size):
            count += 1.
            x = encode(ds[i*batch_size][0], gpu=False)
            t = encode(ds[i*batch_size][1], gpu=False)
            for b in range(batch_size-1):
                x = np.concatenate((x, encode(ds[i*batch_size + b+1][0], gpu=False)))
                t = np.concatenate((t, encode(ds[i*batch_size + b+1][1], gpu=False)))
            x = cuda.to_device(x)
            t = cuda.to_device(t)
            assert x.shape[1] == self.layers[0]
            assert t.shape[1] == self.layers[2]
            print(x.shape)
            self.forward(x)
            print('output', decode(self.output))
            if decode(self.output) == decode(t):
                correct += 1.
            self.backward(t)
        print("Epoch", epoch, "Time:", timer()-start, 'output', decode(self.output), 'Accuracy:', correct/count)
        if correct/count > 0.99:
            break
Author: ZhangAustin, Project: PyCuNN, Lines: 26, Source: nn.py
Example 4: fit
def fit(self, X, Budget=None, W=None):
    self.X = cuda.to_device(X.astype(np.float64, order='F'))
    self.Budget = cuda.device_array((self.budgetSize, self.X.shape[1]), dtype=np.float64, order='F')
    self.kx = cuda.device_array((self.budgetSize, self.X.shape[0]), dtype=np.float64, order='F')
    self.Wkx = cuda.device_array((self.latentTopics, self.X.shape[0]), dtype=np.float64, order='F')
    self.H = cuda.device_array((self.latentTopics, self.X.shape[0]), dtype=np.float64, order='F')
    if Budget is None:
        permutation = np.random.permutation(self.X.shape[0])
        self.permutation = cuda.to_device(permutation)
        initBudget(self.X, self.permutation, self.Budget)
    else:
        self.Budget = cuda.to_device(Budget.astype(np.float64, order='F'))
    self.calculateKB()
    self.calculateKX()
    if W is None:
        self.initW()
    else:
        self.W = cuda.to_device(W.astype(np.float64, order='F'))
    self.t = 0
    for i in xrange(self.epochs):
        print "Epoch " + str(i)
        samples, features = self.X.shape
        permutation = getPermutation(samples, self.miniBatchSize)
        self.permutation = cuda.to_device(permutation)
        for j in xrange((samples + self.miniBatchSize) / self.miniBatchSize):
            loadBatch(self.kx, self.permutation, j, self.kxi)
            self.nextW()
            self.t += 1
    self.predictH()
Author: ejake, Project: tensor-factorization, Lines: 29, Source: cudaOKMF.py
Example 5: getIdx
def getIdx(fill_word, reduced_literal, reduced_length):  # step 5: build the index by interleaving fill_word and literal (also removing all-zero words)
    bin_length = max(len(bin(reduced_length-1)), len(bin(tpb-1)))  # number of bits in the binary form of the array length
    thread_num = numpy.int64(math.pow(2, bin_length))  # Blelloch_scan needs the scanned array length to be a multiple of threads per block
    compact_flag = numpy.ones(thread_num, dtype='int64')
    print thread_num
    index = numpy.ones(2*reduced_length, dtype='uint32')
    d_index = cuda.to_device(index)
    d_fill_word = cuda.to_device(fill_word)
    d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
    d_compact_flag = cuda.to_device(compact_flag)
    #print fill_word
    getIdx_gpu[1, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length)
    compact_flag = d_compact_flag.copy_to_host()
    #print compact_flag[0:28]
    useless_array = numpy.zeros(thread_num, dtype='int64')
    radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)
    out_index_length = d_compact_flag.copy_to_host()[2*reduced_length-1] + 1
    print d_compact_flag.copy_to_host()[0:2*reduced_length]
    print out_index_length
    out_index = numpy.zeros(out_index_length, dtype='uint32')
    scatter_index[1, tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length)
    #for i in out_index:
    #    print bin(i)
    return out_index
Author: DarinSSC, Project: WAH_on_GPU, Lines: 25, Source: bitmap_constructor_gpu.py
Example 6: radix_sort
def radix_sort(arr, rid):
    length = numpy.int64(len(arr))
    bin_length = max(len(bin(length-1)), len(bin(TPB_MAX-1)))  # number of bits in the binary form of the array length
    thread_num = numpy.int64(math.pow(2, bin_length))
    block_num = max(thread_num/TPB_MAX, 1)
    stream = cuda.stream()
    one_list = numpy.zeros(shape=(thread_num), dtype='int64')
    zero_list = numpy.zeros(shape=(thread_num), dtype='int64')
    iter_num = len(bin(ATTR_CARD_MAX))
    for i in range(iter_num):
        d_arr = cuda.to_device(arr, stream)
        d_rid = cuda.to_device(rid, stream)
        d_zero_list = cuda.to_device(zero_list, stream)
        d_one_list = cuda.to_device(one_list, stream)
        get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)  # get one_list and zero_list
        d_one_list.to_host(stream)
        d_zero_list.to_host(stream)
        stream.synchronize()
        base_reduction_block_num = block_num
        base_reduction_block_size = TPB_MAX
        tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
        d_tmp_out = cuda.to_device(tmp_out, stream)
        sum_reduction[base_reduction_block_num, base_reduction_block_size](d_zero_list, d_tmp_out)
        d_tmp_out.to_host(stream)
        stream.synchronize()
        base = 0  # base for the scan of one_list
        for j in xrange(base_reduction_block_num):
            base += tmp_out[j]
        Blelloch_scan_caller(d_zero_list, d_one_list, base)
        array_adjust[block_num, TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length)
Author: DarinSSC, Project: BitmapIndex_GUI, Lines: 35, Source: radix_sort.py
Example 7: tests
def tests():
    a = np.random.rand(300, 500)
    b = np.random.rand(500, 300)
    start = timer()
    c = np.dot(a, b)
    nptime = timer()-start
    print('nptime', nptime)
    x = np.array(np.random.rand(600, 1500), dtype='float32', order='F')
    y = np.array(np.random.rand(1500, 300), dtype='float32', order='F')
    z = np.zeros((1000, 1000), order='F', dtype='float32')
    stream = cuda.stream()
    dx = cuda.to_device(x)
    dy = cuda.to_device(y)
    dz = cuda.to_device(z)
    start = timer()
    blas.gemm('N', 'N', 1000, 1500, 1000, 1.0, dx, dy, 0.0, dz)
    cutime = timer()-start
    print('cutime', cutime)
    #dz.copy_to_host(z)
    print(dz[0])
    c = np.ones((1000, 1000), order='F', dtype='float32')
    print(c.shape)
    dc = cuda.to_device(c)
    # blockDim = (256,256)
    # gridDim = (((1000 + blockDim[0]-1)/blockDim[0]),((1000 + blockDim[1]-1)/blockDim[1]))
    blockDim = (30, 30)
    gridDim = ((((c.shape[0] + blockDim[0]) - 1) / blockDim[0]), (((c.shape[1] + blockDim[1]) - 1) / blockDim[1]))
    start = timer()
    mtanh[gridDim, blockDim, stream](dc)
    tantime = timer() - start
    print('tantime', tantime)
    dc.copy_to_host(c, stream=stream)
    stream.synchronize()
    print(c)
    y = cm.CUDAMatrix(np.ones((1000, 1000)))
    start = timer()
    cm.tanh(y)
    cmtan = timer()-start
    print('cmtan', cmtan)
    x = cm.CUDAMatrix(np.random.rand(1000, 1500))
    y = cm.CUDAMatrix(np.random.rand(1500, 1000))
    start = timer()
    cm.dot(x, y)
    cmtime = timer()-start
    print('cmtime', cmtime)
Author: ZhangAustin, Project: PyCuNN, Lines: 60, Source: numbaprotests.py
Example 8: test_scan
def test_scan():
    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    out_h = np.zeros(NUM_ELEMENTS, dtype=np.uint32)
    for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS - i - 1  # randint(0, 100)
    tac1 = time()
    in_d = cuda.to_device(in_h)
    out_d = cuda.to_device(out_h)
    cuda.synchronize()
    tac2 = time()
    tk1 = time()
    for i in range(0, 32):
        tk1 = time()
        preScan(out_d, in_d, NUM_ELEMENTS)
        cuda.synchronize()
        tk2 = time()
        print i, tk2 - tk1
    tk2 = time()
    th1 = time()
    out_d.copy_to_host(out_h)
    cuda.synchronize()
    #print "Last = ", out_h[-1] + in_h[-1]
    th2 = time()
Author: jalatif, Project: Python_Massively_Parallel_FP_Tree, Lines: 33, Source: exclusive_scan.py
Example 9: reduce_by_key
def reduce_by_key(input_data, chunk_id, literal, length):  # step 3
    flag = numpy.ones(length, dtype='int32')
    stream = cuda.stream()
    d_flag = cuda.to_device(flag, stream)
    d_chunk_id = cuda.to_device(chunk_id, stream)
    d_literal = cuda.to_device(literal, stream)
    produce_flag[1, tpb](input_data, d_chunk_id, length, d_flag)
    d_flag.to_host(stream)
    print 'flag:'
    print flag
    stream.synchronize()
    is_finish = numpy.zeros(length, dtype='int32')
    hop = 1
    while hop < 32:  # only 32 because the length of a word in binary form is 32
        reduce_by_key_gpu[1, tpb](d_literal, d_flag, is_finish, hop, length)
        hop *= 2
    d_literal.to_host(stream)
    d_chunk_id.to_host(stream)
    stream.synchronize()
    reduced_input_data = []
    reduced_chunk_id = []
    reduced_literal = []
    for i in xrange(length):
        if flag[i]:
            reduced_input_data.append(input_data[i])
            reduced_chunk_id.append(chunk_id[i])
            reduced_literal.append(literal[i])
    return numpy.array(reduced_input_data), numpy.array(reduced_chunk_id), reduced_literal
Author: DarinSSC, Project: WAH_on_GPU, Lines: 29, Source: bitmap_constructor_gpu.py
Example 10: getIdx
def getIdx(fill_word, reduced_literal, reduced_length, head, cardinality):  # step 5: build the index by interleaving fill_word and literal (also removing all-zero words)
    bin_length = max(len(bin(2*reduced_length-1)), len(bin(tpb-1)))  # number of bits in the binary form of the array length
    thread_num = numpy.int64(math.pow(2, bin_length))  # Blelloch_scan needs the scanned array length to be a multiple of threads per block
    compact_flag = numpy.ones(thread_num, dtype='int64')
    index = numpy.ones(2*reduced_length, dtype='uint32')
    d_index = cuda.to_device(index)
    d_fill_word = cuda.to_device(fill_word)
    d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
    d_compact_flag = cuda.to_device(compact_flag)
    block_num = reduced_length/tpb + 1
    getIdx_gpu[block_num, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length)
    compact_flag = d_compact_flag.copy_to_host()
    useless_array = numpy.zeros(thread_num, dtype='int64')
    radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)
    out_index_length = d_compact_flag.copy_to_host()[2*reduced_length-1] + 1
    out_index = numpy.zeros(out_index_length, dtype='uint32')
    offsets = []
    new_block_num = 2*reduced_length/tpb + 1
    scatter_index[new_block_num, tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length)
    for i in xrange(reduced_length):
        if head[i]:
            offsets.append(d_compact_flag.copy_to_host()[2*i])
    key_length = numpy.zeros(cardinality, dtype='int64')
    for i in xrange(cardinality-1):
        key_length[i] = offsets[i+1] - offsets[i]
    key_length[cardinality-1] = out_index_length - offsets[cardinality-1]
    return out_index, numpy.array(offsets), numpy.array(key_length)
Author: DarinSSC, Project: BitmapIndex_GUI, Lines: 35, Source: bitmap_pickle.py
Example 11: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    mm = MM(shape=n, dtype=np.double, prealloc=5)
    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))
    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)
    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)
    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    step_cfg = step[gridsz, blksz, stream]
    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last, stream=stream)
        d_last = d_paths
    stream.synchronize()
Author: Aahung, Project: numbapro-examples, Lines: 31, Source: pricer_cuda.py
Example 12: main
def main():
    N = 2048 * 2048
    # Allocate host memory arrays
    a = np.empty(N)
    b = np.empty(N)
    c = np.empty(N)
    # Initialize host memory
    a.fill(2)
    b.fill(1)
    c.fill(0)
    # Allocate and copy GPU/device memory
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.to_device(c)
    threads_per_block = 128
    number_of_blocks = N / 128 + 1
    saxpy[number_of_blocks, threads_per_block](d_a, d_b, d_c)
    d_c.copy_to_host(c)
    # Print out the first and last 5 values of c for a quality check
    print c[:5]
    print c[-5:]
Author: mlanier, Project: Mlanier_Master, Lines: 28, Source: CUDA_Python2.py
Example 13: gpumulti
def gpumulti(X, mu):
    device = cuda.get_current_device()
    n = len(X)
    X = np.array(X)
    x1 = np.array(X.T[0])
    x2 = np.array(X.T[1])
    bmk = np.arange(len(x1))
    mu = np.array(mu)
    dx1 = cuda.to_device(x1)
    dx2 = cuda.to_device(x2)
    dmu = cuda.to_device(mu)
    dbmk = cuda.to_device(bmk)
    # Set up enough threads for kernel
    tpb = device.WARP_SIZE
    bpg = int(np.ceil(float(n)/tpb))
    cu_worker[bpg, tpb](dx1, dx2, dmu, dbmk)
    bestmukey = dbmk.copy_to_host()
    return bestmukey
Author: lmwalkowicz, Project: KeplerML, Lines: 26, Source: km_outliers.py
Example 14: main
def main():
    NN = 4096
    NM = 4096
    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)
    n = NN
    m = NM
    iter_max = 1000
    tol = 1.0e-6
    error = 1.0
    for j in range(n):
        A[j, 0] = 1.0
        Anew[j, 0] = 1.0
    print "Jacobi relaxation Calculation: %d x %d mesh" % (n, m)
    timer = time.time()
    iter = 0
    blockdim = (tpb, tpb)
    griddim = (NN/blockdim[0], NM/blockdim[1])
    error_grid = np.zeros(griddim)
    stream = cuda.stream()
    dA = cuda.to_device(A, stream)        # to device and don't come back
    dAnew = cuda.to_device(Anew, stream)  # to device and don't come back
    derror_grid = cuda.to_device(error_grid, stream)
    while error > tol and iter < iter_max:
        assert error_grid.dtype == np.float64
        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)
        derror_grid.to_host(stream)
        # error_grid is available on host
        stream.synchronize()
        error = np.abs(error_grid).max()
        # swap dA and dAnew
        tmp = dA
        dA = dAnew
        dAnew = tmp
        if iter % 100 == 0:
            print "%5d, %0.6f (elapsed: %f s)" % (iter, error, time.time()-timer)
        iter += 1
    runtime = time.time() - timer
    print " total: %f s" % runtime
Author: Aahung, Project: numbapro-examples, Lines: 59, Source: laplace2d-numbapro-gpu-improve.py
Example 15: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]
    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)
    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]
    strmlist = [cuda.stream() for _ in range(num_streams)]
    prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]
    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)
    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]
    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]
    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)
        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]
        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)
        for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist
    for strm in strmlist:
        strm.synchronize()
Author: Aahung, Project: numbapro-examples, Lines: 58, Source: pricer_cuda_overlap.py
Example 16: kern_CUDA_dense
def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation
    of forward-euler integration.

    Function requires a working :mod:`numbapro` installation. It is typically slower
    compared to :func:`kern_MKL_sparse` but it depends on your hardware.

    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)`
      prog_bar (object, optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """
    calc_precision = None
    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")
    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda, float32  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libraries not " +
                        "installed.\nCan not use GPU.")
    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)
    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)
        cubl.gemv(trans='T', m=m, n=n, alpha=float32(1.0), A=cu_int_m,
                  x=cu_curr_phi, beta=float32(0.0), y=cu_delta_phi)
        cubl.gemv(trans='T', m=m, n=n, alpha=float32(rho_inv[step]),
                  A=cu_dec_m, x=cu_curr_phi, beta=float32(1.0), y=cu_delta_phi)
        cubl.axpy(alpha=float32(dX[step]), x=cu_delta_phi, y=cu_curr_phi)
    return cu_curr_phi.copy_to_host()
Author: elim723, Project: MCEq, Lines: 54, Source: kernels.py
Example 17: get_indexList
def get_indexList(path, attr_selected):
    path1, path2, attr_num = bitmap_pickle.get_pic_path(path)
    f1 = open(path1, 'rb')  # read data_map.pkl
    try:
        attr_map = pickle.load(f1)
        attr_list = pickle.load(f1)
        attr_total = pickle.load(f1)
    finally:
        f1.close()
    f2 = open(path2, 'rb')  # read bitmap_pic.pkl
    try:
        lists = pickle.load(f2)
        key = pickle.load(f2)
        offset = pickle.load(f2)
    finally:
        f2.close()
    # attr_input is a list that stores the numbers of input attributes
    # attr_num is the total number of attributes
    # attr_total is the total number of data/31
    attr_input = [[] for i in xrange(attr_num)]
    for i in xrange(attr_num):
        for attri in attr_selected[i]:
            if attri in attr_map[i]:
                attr_input[i].append(attr_map[i][attri])
            elif attri == 'All':
                attr_input[i].append(-1)
        if len(attr_input[i]) > 1 and (-1 in attr_input[i]):
            attr_input[i].remove(-1)
    print attr_input
    search_start_time = time.time()
    if len(attr_input) != attr_num:  # there might be a wrong input in input_test.py
        print 'No eligible projects'
    else:
        tpb = 1024
        blocknum = 1
        attr_mul = (attr_total + (tpb * blocknum - 1))/(tpb * blocknum)
        # attr_mul is the number of elements each thread needs to process
        #print '---index----\nattr_num:%d\nattr_total:%d\nattr_mul:%d\n----------' % (attr_num, attr_total, attr_mul)
        # attr_num = 1
        index_list = numpy.zeros(attr_total*31, dtype='int32')
        bitmap_list = get_attr(attr_input, attr_num, attr_total, lists, key, offset)
        stream = cuda.stream()
        d_bitmap_list = cuda.to_device(numpy.array(bitmap_list), stream)
        d_index_list = cuda.to_device(numpy.array(index_list), stream)
        index_gpu[blocknum, tpb, stream](d_bitmap_list, d_index_list, attr_num, attr_total, attr_mul)
        index_list = d_index_list.copy_to_host()
        stream.synchronize()
    search_end_time = time.time()
    return index_list, search_end_time - search_start_time
Author: DarinSSC, Project: BitmapIndex_GUI, Lines: 52, Source: bitmap_index2.py
Example 18: flush
def flush(self, metric_opt, supp_opt):
    if not self.Vcs:
        # Nothing to do
        return metric_opt, supp_opt
    k = self.k
    V = self.V
    topk_list = []
    nodect = V.shape[0]
    numseg = len(self.Vcs)
    assert nodect
    assert numseg
    eachsize = nodect * numseg
    D = np.zeros(eachsize, dtype=np.float32)
    # Fill buffer for segmented sort
    for i, Vc in enumerate(self.Vcs):
        D[i * nodect:(i + 1) * nodect] = Vc[:, 0]
    # Prepare for GPU segmented sort
    dD = cuda.to_device(D)
    dI = cuda.device_array((numseg, nodect), dtype=np.uint32)
    blksz = 32
    init_indices[(divup(dI.shape[0], blksz),
                  divup(dI.shape[1], blksz)),
                 (blksz, blksz)](dI)
    if numseg == 1:
        segments = np.arange(1, dtype=np.int32)
    else:
        segments = (np.arange(numseg - 1, dtype=np.int32) + 1) * nodect
    segmented_sort(dD, dI, cuda.to_device(segments))
    for i in range(numseg):
        topk = dI[i, -k:].copy_to_host()
        topk_list.append(topk)
    # Reduce
    for topk in topk_list:
        # Assume A is huge
        metric = np.linalg.norm(V[topk, :]) ** 2
        if metric > metric_opt:
            metric_opt = metric
            supp_opt = topk
    # Clear all Vc
    self.Vcs.clear()
    return metric_opt, supp_opt
Author: ContinuumIO, Project: numbapro-spca, Lines: 52, Source: dks.py
Example 19: encode
def encode(word, gpu=True):
    if isinstance(word, basestring):
        if using_embeddings == True:
            return cuda.to_device(vocab[word])
        else:
            x = np.zeros((1, word_idx), dtype='float32')
            x[0][vocab[word]] = 1.
            if gpu == True:
                return cuda.to_device(x)
            else:
                return x
    else:
        return word
Author: ZhangAustin, Project: PyCuNN, Lines: 13, Source: utils.py
Example 20: convolve
def convolve():
    # Build Filter
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()
    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)
    image = get_image()
    print "Image size: %s" % (image.shape,)
    response = np.zeros_like(image)
    response[:5, :5] = laplacian
    # CPU
    # Use SciPy to perform the FFT convolution
    ts = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    te = timer()
    print 'CPU: %.2fs' % (te - ts)
    # GPU
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))
    # Initialize the cuFFT system.
    cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)
    # Start GPU timer
    ts = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)
    d_image_complex = cuda.to_device(image_complex)
    d_response_complex = cuda.to_device(response_complex)
    task1(cufft, d_image_complex, d_response_complex)
    cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)
    te = timer()
    print 'GPU: %.2fs' % (te - ts)
    return cvimage_cpu, cvimage_gpu
Author: Aahung, Project: numbapro-examples, Lines: 50, Source: convolve.py
Note: The numbapro.cuda.to_device examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and any distribution or use should follow the corresponding project's license. Please do not reproduce without permission.