本文整理汇总了Python中numba.cuda.device_array函数的典型用法代码示例。如果您正苦于以下问题:Python device_array函数的具体用法?Python device_array怎么用?Python device_array使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了device_array函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: _concat
def _concat(cls, objs):
    """Concatenate a sequence of columns into a single new column.

    Parameters
    ----------
    objs : sequence
        Columns to join. Must be non-empty (``objs[0]`` is used as the
        template) and every element must be type-equivalent to the first.

    Returns
    -------
    The result of ``head.replace(data=..., mask=..., null_count=...)`` where
    ``head`` is ``objs[0]``.

    Raises
    ------
    ValueError
        If any element is not type-equivalent to ``objs[0]``.
    """
    head = objs[0]
    for o in objs:
        if not o.is_type_equivalent(head):
            raise ValueError("All series must be of same type")

    newsize = sum(map(len, objs))

    # Concatenate data
    mem = cuda.device_array(shape=newsize, dtype=head.data.dtype)
    data = Buffer.from_empty(mem)
    for o in objs:
        data.extend(o.data.to_gpu_array())

    # Concatenate mask if present.
    # NOTE(review): the mask is only kept when *every* input has one; if only
    # some inputs carry a null mask it is dropped and null_count becomes 0.
    # Confirm that this is intended.
    if all(o.has_null_mask for o in objs):
        # FIXME: Inefficient
        # Use np.bool_: the bare ``np.bool`` alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24.
        mem = cuda.device_array(shape=newsize, dtype=np.bool_)
        mask = Buffer.from_empty(mem)
        null_count = 0
        for o in objs:
            mask.extend(o._get_mask_as_series().to_gpu_array())
            null_count += o._null_count
        mask = Buffer(utils.boolmask_to_bitmask(mask.to_array()))
    else:
        mask = None
        null_count = 0

    col = head.replace(data=data, mask=mask, null_count=null_count)
    return col
开发者ID:xennygrimmato,项目名称:pygdf,代码行数:29,代码来源:column.py
示例2: test_profiling
def test_profiling(self):
    """Allocate and drop a device array inside two successive profiling contexts."""
    for length in (10, 100):
        with cuda._profiling():
            a = cuda.device_array(length)
            del a
开发者ID:meego,项目名称:numba,代码行数:8,代码来源:test_profiler.py
示例3: getGraphFromEdges_gpu
def getGraphFromEdges_gpu(dest, weight, fe, od, edges, n_edges = None,
                          MAX_TPB = 512, stream = None):
    """
    Build a new graph (ndest, nweight, nfe, nod) from a selection of edges.

    All inputs except MAX_TPB and stream are device arrays.

    edges : array with the IDs of the edges that will be part of the new graph
    n_edges : one-element array holding the number of valid entries in
              ``edges``; when it is smaller than the size of ``edges`` the
              trailing entries are ignored. If omitted, all of ``edges`` is
              used.
    MAX_TPB : threads-per-block value handed to every kernel launch
    stream : stream to run on; a new one is created when omitted

    Returns the tuple ``(ndest, nweight, nfe, nod)``.
    """
    # Work out how many edges are valid, creating the device-side counter
    # when the caller did not supply one.
    if n_edges is not None:
        edges_size = int(n_edges.getitem(0))
    else:
        edges_size = edges.size
        n_edges = cuda.to_device(np.array([edges_size], dtype = np.int32))

    # Use the caller's stream, or make a private one.
    work_stream = cuda.stream() if stream is None else stream

    doubled_edge_count = edges_size * 2

    # Storage for the new graph.
    ndest = cuda.device_array(doubled_edge_count, dtype = dest.dtype,
                              stream = work_stream)
    nweight = cuda.device_array(doubled_edge_count, dtype = weight.dtype,
                                stream = work_stream)
    nfe = cuda.device_array_like(fe, stream = work_stream)
    nod = cuda.device_array_like(od, stream = work_stream)

    # Zero the new out-degree array.
    vertex_grid = compute_cuda_grid_dim(nod.size, MAX_TPB)
    memSet[vertex_grid, MAX_TPB, work_stream](nod, 0)

    # Count the selected edges into the new out-degree array.
    edge_grid = compute_cuda_grid_dim(edges_size, MAX_TPB)
    countEdges[edge_grid, MAX_TPB, work_stream](edges, n_edges, dest, fe, od,
                                                nod)

    # Derive the new first_edge array from the new out-degree.
    nfe.copy_to_device(nod, stream = work_stream)
    ex_prefix_sum_gpu(nfe, MAX_TPB = MAX_TPB, stream = work_stream)

    # A copy of first_edge serves as the insertion pointer while adding edges.
    insert_ptr = cuda.device_array_like(nfe, stream = work_stream)
    insert_ptr.copy_to_device(nfe, stream = work_stream)

    addEdges[edge_grid, MAX_TPB, work_stream](edges, n_edges, dest, weight,
                                              fe, od, insert_ptr, ndest,
                                              nweight)

    del insert_ptr

    return ndest, nweight, nfe, nod
开发者ID:Chiroptera,项目名称:masters_code,代码行数:56,代码来源:build.py
示例4: prescan_test
def prescan_test():
    """Smoke-run ``scan.prescan`` on a 2048-element int32 ramp.

    Host-side references are produced with ``scan.exprefixsumNumba`` for the
    full array and for each 1024-element half, then the kernel is launched
    with 2 blocks of 512 threads and the device buffers are copied back.

    NOTE(review): the references and the copied-back results are never
    compared, so this only checks that the calls run without raising.
    """
    a = np.arange(2048).astype(np.int32)
    reference = np.empty_like(a)
    ref_sum = scan.exprefixsumNumba(a, reference)

    a1 = np.arange(1024).astype(np.int32)
    a2 = np.arange(1024, 2048).astype(np.int32)
    ref1 = np.empty_like(a1)
    ref2 = np.empty_like(a2)
    ref_sum1 = scan.exprefixsumNumba(a1, ref1)
    ref_sum2 = scan.exprefixsumNumba(a2, ref2)

    dAux = cuda.device_array(2, dtype = np.int32)
    dA = cuda.to_device(a)

    # Fourth launch parameter: 1024 elements' worth of bytes.
    sm_size = 1024 * a.dtype.itemsize
    scan.prescan[2, 512, 0, sm_size](dA, dAux)

    aux = dAux.copy_to_host()
    a_gpu = dA.copy_to_host()

    # Function-call form works on both Python 2 and Python 3; the original
    # ``print "finish"`` statement is a SyntaxError on Python 3.
    print("finish")
开发者ID:Chiroptera,项目名称:ThesisWriting,代码行数:27,代码来源:test_scan.py
示例5: test_gufunc_stream
def test_gufunc_stream(self):
    """Run ``gufunc`` on a stream, compare with ``ut.matrix_multiply`` and log the speed-up."""
    matrix_ct = 1001  # an odd number to test thread/block division in CUDA
    lhs_shape = (matrix_ct, 2, 4)
    rhs_shape = (matrix_ct, 4, 5)
    A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(lhs_shape)
    B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(rhs_shape)

    # Timed GPU section: transfers, launch, copy back and synchronisation.
    started = time()
    stream = cuda.stream()
    dA = cuda.to_device(A, stream)
    dB = cuda.to_device(B, stream)
    dC = cuda.device_array(shape=(matrix_ct, 2, 5), dtype=A.dtype,
                           stream=stream)
    dC = gufunc(dA, dB, out=dC, stream=stream)
    C = dC.copy_to_host(stream=stream)
    stream.synchronize()
    tcuda = time() - started

    # Timed host reference.
    started = time()
    Gold = ut.matrix_multiply(A, B)
    tcpu = time() - started

    stream_speedups.append(tcpu / tcuda)
    self.assertTrue(np.allclose(C, Gold))
开发者ID:GaZ3ll3,项目名称:numba,代码行数:27,代码来源:test_gufunc.py
示例6: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance every column of ``paths`` in place on a single CUDA stream.

    Column 0 of ``paths`` is the starting state. For each later column ``j``
    a fresh batch of normal draws is generated, ``step`` is invoked with the
    previous column as input, and the result is copied back into
    ``paths[:, j]``. The stream is synchronised before returning.
    """
    num_paths = paths.shape[0]
    num_columns = paths.shape[1]

    mm = MM(shape=num_paths, dtype=np.double, prealloc=5)

    # Launch geometry is computed here but not used below.
    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(num_paths) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Device buffer receiving the normal draws.
    d_normdist = cuda.device_array(num_paths, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_previous = cuda.to_device(paths[:, 0], to=mm.get())
    for col in range(1, num_columns):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_current = cuda.to_device(paths[:, col], stream=stream, to=mm.get())
        step(d_previous, dt, c0, c1, d_normdist, out=d_current, stream=stream)
        d_current.copy_to_host(paths[:, col], stream=stream)
        # Hand the old buffer back and carry the new one into the next round.
        mm.free(d_previous)
        d_previous = d_current

    stream.synchronize()
开发者ID:ContinuumIO,项目名称:numbapro-examples,代码行数:27,代码来源:pricer_cuda_vectorize.py
示例7: gather
def gather(data, index, out=None):
    """Launch ``gpu_gather`` to compute ``out = data[index]`` on the GPU.

    When ``out`` is omitted, a device array of ``index.size`` elements with
    the dtype of ``data`` is allocated. The output array is returned.
    """
    count = index.size
    if out is None:
        out = cuda.device_array(shape=count, dtype=data.dtype)
    launcher = gpu_gather.forall(count)
    launcher(data, index, out)
    return out
开发者ID:xennygrimmato,项目名称:pygdf,代码行数:7,代码来源:cudautils.py
示例8: arange
def arange(start, stop=None, step=1, dtype=np.int64):
    """Create a device array holding an integer range.

    Parameters
    ----------
    start, stop, step : int
        Range bounds and stride. With a single positional argument the range
        is ``[0, start)``, mirroring the builtin ``range``. ``step`` may be
        negative.
    dtype : numpy dtype
        Element type of the returned device array.

    Returns
    -------
    Device array with ``ceil((stop - start) / step)`` elements, or an empty
    device array when the range is empty.

    Raises
    ------
    ZeroDivisionError
        If ``step`` is 0.

    Notes
    -----
    The previous length formula ``(stop - start + (step - 1)) // step`` was
    only correct for a positive ``step`` with ``stop >= start``: a negative
    step over-counted (``arange(5, 0, -1)`` gave 7 instead of 5) and an empty
    ascending range produced a negative length.
    """
    if stop is None:
        start, stop = 0, start
    # Integer ceil((stop - start) / step), valid for either sign of step;
    # clamp at 0 so an empty range yields a zero-length array.
    size = max(0, -((start - stop) // step))
    out = cuda.device_array(size, dtype=dtype)
    if size:
        # presumably the kernel writes out[i] = start + i * step -- confirm
        # against gpu_arange.
        gpu_arange.forall(size)(start, size, step, out)
    return out
开发者ID:xennygrimmato,项目名称:pygdf,代码行数:7,代码来源:cudautils.py
示例9: astype
def astype(ary, dtype):
    """Return ``ary`` converted to ``dtype``.

    The input is returned untouched when it already has the requested dtype;
    otherwise a new device array of the same shape is allocated and filled by
    the ``gpu_copy`` kernel.
    """
    wanted = np.dtype(dtype)
    if ary.dtype == wanted:
        return ary

    converted = cuda.device_array(shape=ary.shape, dtype=dtype)
    gpu_copy.forall(converted.size)(ary, converted)
    return converted
开发者ID:xennygrimmato,项目名称:pygdf,代码行数:8,代码来源:cudautils.py
示例10: sum_parts
def sum_parts(data):
    """
    Host-side driver for the ``gpu_single_block_sum`` kernel.

    ``data`` is converted to a float32 array, the kernel is launched with one
    block of ``gpu_block_sum_max_blockdim`` threads, and element 0 of the
    one-element output is returned.
    """
    host_values = np.asarray(data, dtype=np.float32)
    d_result = cuda.device_array(1, dtype=np.float32)
    kernel = gpu_single_block_sum[1, gpu_block_sum_max_blockdim]
    kernel(host_values, d_result)
    h_result = d_result.copy_to_host()
    return h_result[0]
开发者ID:Alexhuszagh,项目名称:numba,代码行数:8,代码来源:cuda_dask.py
示例11: __init__
def __init__(self, shape, dtype, prealloc):
    """Pre-allocate ``prealloc`` device arrays and give each its own event.

    Every buffer has the given ``shape`` and ``dtype``, is appended to
    ``self.freelist`` and is mapped to a non-timing CUDA event in
    ``self.events``.
    """
    self.device = cuda.get_current_device()
    self.freelist = deque()
    self.events = {}
    for _ in range(prealloc):
        buf = cuda.device_array(shape=shape, dtype=dtype)
        self.freelist.append(buf)
        self.events[buf] = cuda.event(timing=False)
开发者ID:XiaoxiaSun,项目名称:numbapro-examples,代码行数:8,代码来源:cuda_helper.py
示例12: apply_reduce
def apply_reduce(fn, inp):
    """Call the reduction ``fn`` on ``inp`` and return element 0 of its output buffer.

    The output/temporary buffer has the length reported by
    ``libgdf.gdf_reduce_optimal_output_size()`` and the dtype of ``inp``.
    """
    scratch_len = libgdf.gdf_reduce_optimal_output_size()
    scratch = cuda.device_array(scratch_len, dtype=inp.dtype)
    fn(inp.cffi_view, unwrap_devary(scratch), scratch_len)
    # The first slot is what callers receive.
    return scratch[0]
开发者ID:xennygrimmato,项目名称:pygdf,代码行数:8,代码来源:_gdf.py
示例13: monte_carlo_pricer
def monte_carlo_pricer(paths, dt, interest, volatility):
    """Advance every column of ``paths`` in place, split across two CUDA streams.

    The rows of ``paths`` are divided into ``num_streams`` contiguous
    partitions, each with its own stream, PRNG, normal-draw buffer and
    configured ``cu_step`` kernel. Column 0 is the starting state; for each
    later column ``j`` new normal draws are generated, the kernel is launched
    per partition, and the result is copied back into ``paths[s:e, j]``.
    All streams are synchronised before returning.

    Fix over the earlier version: the copy-back loop called
    ``mm.free(d_last, ...)`` with ``d_last`` left over from the preceding
    kernel-launch loop, so the *last* partition's buffer was freed once per
    stream while the other partitions' buffers were never returned. Each
    partition's previous buffer is now freed on its own stream.
    """
    n = paths.shape[0]
    num_streams = 2

    # Split the rows into num_streams contiguous [begin, end) partitions.
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]
    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist,
                              zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        # Copy each partition back and release the buffer that partition
        # used for the previous column, on that partition's stream.
        for d_last, d_paths, strm, (s, e) in zip(d_lastlist, d_pathslist,
                                                 strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)

        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
开发者ID:AngelBerihuete,项目名称:numbapro-examples,代码行数:58,代码来源:pricer_cuda_overlap.py
示例14: test_stream_bind
def test_stream_bind(self):
    """A device array created on a stream reports that stream, before and after ``bind``."""
    stream = cuda.stream()
    with stream.auto_synchronize():
        arr = cuda.device_array((3, 3), dtype=np.float64, stream=stream)
        rebound = arr.bind(stream)
        self.assertEqual(rebound.stream, stream)
        self.assertEqual(arr.stream, stream)
开发者ID:esc,项目名称:numba,代码行数:9,代码来源:test_cuda_ndarray.py
示例15: mask_assign_slot
def mask_assign_slot(size, mask):
    """Expand ``mask`` into a ``size``-element array and prefix-sum it.

    Returns ``(slots, sz)`` where ``slots`` is the result of ``prefixsum`` on
    the expanded mask and ``sz`` is its last element as a Python int.
    """
    # expand bits into bytes; int32 is used unless size needs 64-bit indices
    if size < 2 ** 31:
        dtype = np.int32
    else:
        dtype = np.int64
    expanded_mask = cuda.device_array(size, dtype=dtype)
    numtasks = min(64 * 128, expanded_mask.size)
    expand = gpu_expand_mask_bits.forall(numtasks)
    expand(mask, expanded_mask)

    # compute prefixsum
    slots = prefixsum(expanded_mask)
    last_index = slots.size - 1
    sz = int(slots[last_index])
    return slots, sz
开发者ID:xennygrimmato,项目名称:pygdf,代码行数:11,代码来源:cudautils.py
示例16: test_device_array_interface
def test_device_array_interface(self):
    """Check the CUDA ndarray interface on three kinds of device array.

    Covers an array from ``cuda.device_array``, one transferred from a
    100-element host array, and one transferred from a 0-d host array (which
    is expected to report ``ndim == 1`` on the device).
    """
    dary = cuda.device_array(shape=100)
    devicearray.verify_cuda_ndarray_interface(dary)

    ary = np.empty(100)
    dary = cuda.to_device(ary)
    devicearray.verify_cuda_ndarray_interface(dary)

    ary = np.asarray(1.234)
    dary = cuda.to_device(ary)
    # assertEqual, not assertEquals: the latter is a deprecated alias that
    # was removed in Python 3.12.
    self.assertEqual(dary.ndim, 1)
    devicearray.verify_cuda_ndarray_interface(dary)
开发者ID:esc,项目名称:numba,代码行数:12,代码来源:test_cuda_ndarray.py
示例17: test_event_elapsed
def test_event_elapsed(self):
    """Record events around a host-to-device copy and print the elapsed time."""
    N = 32
    dary = cuda.device_array(N, dtype=np.double)
    start_evt, end_evt = cuda.event(), cuda.event()

    start_evt.record()
    cuda.to_device(np.arange(N), to=dary)
    end_evt.record()

    end_evt.wait()
    end_evt.synchronize()
    elapsed = start_evt.elapsed_time(end_evt)
    print(elapsed)
开发者ID:ASPP,项目名称:numba,代码行数:12,代码来源:test_events.py
示例18: find_segments
def find_segments(arr):
    """Locate the first index of every run of equal values in ``arr``.

    Returns
    -------
    starting_indices : device array
        Start index of each segment; its length is the number of segments.
    """
    from . import _gdf

    # Flag the positions where a new segment starts.
    markers = cuda.device_array(arr.size, dtype=np.int32)
    n_items = markers.size
    gpu_mark_segment_begins.forall(n_items)(arr, markers)

    # The prefix sum gives each flagged position its output slot; the last
    # entry is used as the segment count.
    slots = prefixsum(markers)
    segment_count = slots[slots.size - 1]
    scanned = slots[:-1]

    # Scatter the flagged positions into a compact array.
    begins = cuda.device_array(shape=int(segment_count), dtype=np.intp)
    scatter = gpu_scatter_segment_begins.forall(markers.size)
    scatter(markers, scanned, begins)
    return begins
开发者ID:xennygrimmato,项目名称:pygdf,代码行数:22,代码来源:cudautils.py
示例19: test_event_elapsed_stream
def test_event_elapsed_stream(self):
    """Same as the non-stream event test, with every operation issued on an explicit stream."""
    N = 32
    stream = cuda.stream()
    dary = cuda.device_array(N, dtype=np.double)
    start_evt, end_evt = cuda.event(), cuda.event()

    start_evt.record(stream=stream)
    cuda.to_device(np.arange(N), to=dary, stream=stream)
    end_evt.record(stream=stream)

    end_evt.wait(stream=stream)
    end_evt.synchronize()

    # Exercise the code path; the measured value itself is not checked.
    start_evt.elapsed_time(end_evt)
开发者ID:cpcloud,项目名称:numba,代码行数:14,代码来源:test_events.py
示例20: run_gather
def run_gather(self, arr, diffs):
    """Collect values of ``arr`` selected via ``diffs`` into a queue bounded by ``self._maxk``.

    The ``gpu_insert_if_masked`` kernel is launched over ``arr.size`` tasks
    with a one-element host counter and a device queue of ``self._maxk``
    slots. The leading ``qsz`` entries of the queue are returned, where
    ``qsz`` is the counter value after the launch.

    Raises
    ------
    NotImplementedError
        If ``self._maxk`` is negative (unbounded k is unsupported). This is
        checked up front; previously the queue was allocated with
        ``shape=self._maxk`` and the kernel launched before the check, so a
        negative k could never reach this error.
    ValueError
        If the kernel reports more entries than ``self._maxk``.
    """
    if self._maxk < 0:
        raise NotImplementedError('k is unbounded')

    h_out_idx = np.zeros(1, dtype=np.intp)
    out_queue = cuda.device_array(shape=self._maxk, dtype=arr.dtype)
    gpu_insert_if_masked.forall(arr.size)(arr, diffs, h_out_idx, out_queue)

    qsz = h_out_idx[0]
    if qsz > self._maxk:
        msg = 'too many unique value: unique values ({}) > k ({})'
        raise ValueError(msg.format(qsz, self._maxk))

    # qsz <= self._maxk is guaranteed here, so qsz is the slice end.
    vals = out_queue[:qsz]
    return vals
开发者ID:xennygrimmato,项目名称:pygdf,代码行数:14,代码来源:cudautils.py
注:本文中的numba.cuda.device_array函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论