This article collects typical usage examples of the numba.cuda.syncthreads function in Python. If you have been wondering what syncthreads actually does, how to call it, or what real code that uses it looks like, the curated examples below should help.
The sections below present 20 code examples of syncthreads, ordered by popularity by default.
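Before the individual examples, here is a minimal, self-contained sketch of the pattern they all share: every thread in a block writes to shared memory, cuda.syncthreads() acts as a block-wide barrier so those writes become visible to the whole block, and only then do threads read slots written by their neighbours. The kernel name, block size, and data below are illustrative only and do not come from any of the examples.

import numpy as np
from numba import cuda, float32

@cuda.jit
def reverse_within_block(arr):
    # One shared-memory slot per thread; the block size is fixed at 64 here.
    buf = cuda.shared.array(64, float32)
    tid = cuda.threadIdx.x
    i = cuda.grid(1)
    if i < arr.size:
        buf[tid] = arr[i]
    # Barrier: every write to `buf` above is visible to the whole block below.
    cuda.syncthreads()
    if i < arr.size:
        # Read a slot written by another thread; safe only after the barrier.
        arr[i] = buf[cuda.blockDim.x - 1 - tid]

data = np.arange(64, dtype=np.float32)
reverse_within_block[1, 64](data)   # one block of 64 threads
print(data[:4])                     # [63. 62. 61. 60.]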
Example 1: jacobi_relax_core
def jacobi_relax_core(A, Anew, error):
    smem = cuda.shared.array(shape=(32 + 2, 32 + 2), dtype=f8)
    n = A.shape[0]
    m = A.shape[1]

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    j = ty + cuda.blockIdx.y * cuda.blockDim.y
    i = tx + cuda.blockIdx.x * cuda.blockDim.x

    sy = ty + 1
    sx = tx + 1

    # load the interior point and, on the block edges, the halo cells
    smem[sy, sx] = A[j, i]
    if tx == 0 and i >= 1:
        smem[sy, 0] = A[j, i - 1]
    if ty == 0 and j >= 1:
        smem[0, sx] = A[j - 1, i]
    if tx == 31 and i < m - 1:
        smem[sy, 33] = A[j, i + 1]
    if ty == 31 and j < n - 1:
        smem[33, sx] = A[j + 1, i]

    cuda.syncthreads()  # ensure smem is visible by all threads in the block

    if j >= 1 and j < n - 1 and i >= 1 and i < m - 1:
        Anew[j, i] = 0.25 * (smem[sy, sx + 1] + smem[sy, sx - 1]
                             + smem[sy - 1, sx] + smem[sy + 1, sx])
        error[j, i] = Anew[j, i] - A[j, i]
Author: FedericoStra, Project: numba, Lines: 33, Source: laplace2d-numba-cuda-smem.py
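A hedged sketch of how this kernel might be launched, assuming it is compiled with @cuda.jit and that f8 (numba's float64 shorthand) is imported where the kernel is defined. The (32 + 2, 32 + 2) shared tile implies 32x32 thread blocks, and because the interior load of A[j, i] is unguarded, the array dimensions are kept multiples of 32 here. The grid size, boundary values, and array shape are assumptions, not part of the original example.

import numpy as np
from numba import cuda, f8   # f8 is the dtype used by the shared-memory tile

n = m = 512                              # multiples of 32 (see note above)
A = np.zeros((n, m), dtype=np.float64)
A[0, :] = 1.0                            # an arbitrary hot boundary
Anew = np.zeros_like(A)
error = np.zeros_like(A)

threads = (32, 32)                       # matches the 32x32 interior of the tile
blocks = (m // 32, n // 32)              # grid x covers columns (i), grid y covers rows (j)
jacobi_relax_core[blocks, threads](A, Anew, error)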
Example 2: _getOccupancyCUDAkernel
def _getOccupancyCUDAkernel(occus, coords, centers, channelsigmas, trunc):
    centeridx = cuda.blockIdx.x
    blockidx = cuda.blockIdx.y
    atomidx = cuda.threadIdx.x + (cuda.blockDim.x * blockidx)

    if atomidx >= coords.shape[0] or centeridx >= centers.shape[0]:
        return

    # TODO: Can remove this. Barely any speedup
    centcoor = cuda.shared.array(shape=(3), dtype=numba.float32)
    centcoor[0] = centers[centeridx, 0]
    centcoor[1] = centers[centeridx, 1]
    centcoor[2] = centers[centeridx, 2]
    cuda.syncthreads()

    dx = coords[atomidx, 0] - centcoor[0]
    dy = coords[atomidx, 1] - centcoor[1]
    dz = coords[atomidx, 2] - centcoor[2]
    d2 = dx * dx + dy * dy + dz * dz
    if d2 >= trunc:
        return
    d1 = 1 / sqrt(d2)

    for h in range(channelsigmas.shape[1]):
        if channelsigmas[atomidx, h] == 0:
            continue
        x = channelsigmas[atomidx, h] * d1
        value = 1 - exp(-(x ** 12))
        cuda.atomic.max(occus, (centeridx, h), value)
Author: alejandrovr, Project: htmd, Lines: 29, Source: voxeldescriptors.py
Example 3: cu_square_matrix_mul
def cu_square_matrix_mul(A, B, C):
    sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
    sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x
    by = cuda.blockIdx.y
    bw = cuda.blockDim.x
    bh = cuda.blockDim.y

    x = tx + bx * bw
    y = ty + by * bh

    acc = float32(0)  # forces all the math to be f32
    for i in range(bpg):
        if x < n and y < n:
            sA[ty, tx] = A[y, tx + i * tpb]
            sB[ty, tx] = B[ty + i * tpb, x]

        cuda.syncthreads()

        if x < n and y < n:
            for j in range(tpb):
                acc += sA[ty, j] * sB[j, tx]

        cuda.syncthreads()

    if x < n and y < n:
        C[y, x] = acc
Author: stuartarchibald, Project: numba, Lines: 30, Source: test_matmul.py
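The kernel above closes over the module-level constants SM_SIZE, tpb, bpg, and n at compile time. A hedged sketch of how those constants and a launch might look, assuming the kernel is decorated with @cuda.jit; the concrete values here are illustrative, not taken from the original test.

import numpy as np
from numba import cuda, float32

tpb = 16                  # threads per block, per dimension
bpg = 8                   # blocks per grid, per dimension
n = tpb * bpg             # square matrix dimension
SM_SIZE = (tpb, tpb)      # shape of the shared tile sA

A = np.random.rand(n, n).astype(np.float32)
B = np.random.rand(n, n).astype(np.float32)
C = np.zeros((n, n), dtype=np.float32)

cu_square_matrix_mul[(bpg, bpg), (tpb, tpb)](A, B, C)
print(np.allclose(C, A @ B, atol=1e-3))   # True, up to float32 rounding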
Example 4: simple_smem
def simple_smem(ary):
    sm = cuda.shared.array(N, int32)
    i = cuda.grid(1)
    if i == 0:
        for j in range(N):
            sm[j] = j
    cuda.syncthreads()
    ary[i] = sm[i]
Author: ASPP, Project: numba, Lines: 8, Source: test_globals.py
Example 5: atomic_add_double
def atomic_add_double(idx, ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, float64)
    sm[tid] = 0.0
    cuda.syncthreads()
    bin = idx[tid] % 32
    cuda.atomic.add(sm, bin, 1.0)
    cuda.syncthreads()
    ary[tid] = sm[tid]
Author: cpcloud, Project: numba, Lines: 9, Source: test_atomics.py
Example 6: atomic_add3
def atomic_add3(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), uint32)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.add(sm, (tx, uint64(ty)), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]
Author: MJJoyce, Project: numba, Lines: 9, Source: test_atomics.py
Example 7: atomic_add
def atomic_add(ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, uint32)
    sm[tid] = 0
    cuda.syncthreads()
    bin = ary[tid] % 32
    cuda.atomic.add(sm, bin, 1)
    cuda.syncthreads()
    ary[tid] = sm[tid]
Author: MJJoyce, Project: numba, Lines: 9, Source: test_atomics.py
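A hedged driver for the test kernel above, assuming it is decorated with @cuda.jit: launched as a single block of 32 threads, it overwrites the input with a 32-bin histogram of value % 32, which can be checked against np.bincount. The input data here is made up for illustration.

import numpy as np
from numba import cuda

ary = np.random.randint(0, 32, size=32).astype(np.uint32)
expected = np.bincount(ary % 32, minlength=32)

atomic_add[1, 32](ary)                   # one block of 32 threads
print(np.array_equal(ary, expected))     # True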
Example 8: atomic_add_float
def atomic_add_float(ary):
    tid = cuda.threadIdx.x
    sm = cuda.shared.array(32, float32)
    sm[tid] = 0
    cuda.syncthreads()
    bin = int(ary[tid] % 32)
    cuda.atomic.add(sm, bin, 1.0)
    cuda.syncthreads()
    ary[tid] = sm[tid]
Author: cpcloud, Project: numba, Lines: 9, Source: test_atomics.py
Example 9: atomic_add_float_2
def atomic_add_float_2(ary):
    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    sm = cuda.shared.array((4, 8), float32)
    sm[tx, ty] = ary[tx, ty]
    cuda.syncthreads()
    cuda.atomic.add(sm, (tx, ty), 1)
    cuda.syncthreads()
    ary[tx, ty] = sm[tx, ty]
Author: cpcloud, Project: numba, Lines: 9, Source: test_atomics.py
Example 10: idx_kernel
def idx_kernel(arr):
    s = cuda.shared.array(shape=maxThread, dtype=int32)

    idx = cuda.grid(1)

    if idx < arr.shape[0]:
        s[cuda.threadIdx.x] = 1
    cuda.syncthreads()

    if idx < arr.shape[0]:
        cuda.atomic.add(arr, s[cuda.threadIdx.x], 1)
Author: CyberIntelMafia, Project: KaggleMalware, Lines: 11, Source: test_sync_cuda.py
Example 11: atomic_max_double_shared
def atomic_max_double_shared(res, ary):
    tid = cuda.threadIdx.x
    smary = cuda.shared.array(32, float64)
    smary[tid] = ary[tid]
    smres = cuda.shared.array(1, float64)
    if tid == 0:
        smres[0] = res[0]
    cuda.syncthreads()
    cuda.atomic.max(smres, 0, smary[tid])
    cuda.syncthreads()
    if tid == 0:
        res[0] = smres[0]
Author: cpcloud, Project: numba, Lines: 12, Source: test_atomics.py
Example 12: problematic
def problematic(x, y):
    tid = cuda.threadIdx.x
    ntid = cuda.blockDim.x

    if tid > 12:
        for i in range(ntid):
            y[i] += x[i] // y[i]

    cuda.syncthreads()

    if tid < 17:
        for i in range(ntid):
            x[i] += x[i] // y[i]
Author: cpcloud, Project: numba, Lines: 12, Source: test_exception.py
Example 13: device_reduce_full_block
def device_reduce_full_block(arr, partials, sm_partials):
    """
    Partially reduce `arr` into `partials` using `sm_partials` as working
    space.  The algorithm goes like:

        array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                    block-0:  |   x |     |     |   x |     |
                    block-1:  |     |   x |     |     |   x |
                    block-2:  |     |     |   x |     |     |

    The array is divided into chunks of 128 (the size of a threadblock).
    The threadblocks consume the chunks in round-robin scheduling.
    First, a threadblock loads a chunk into temp memory.  Then, all
    subsequent chunks are combined into the temp memory.

    Once all chunks are processed, an inner-block reduction is performed
    on the temp memory, so that there is just one scalar result per block.
    The result from each block is stored to `partials` at the dedicated
    slot.
    """
    tid = cuda.threadIdx.x
    blkid = cuda.blockIdx.x
    blksz = cuda.blockDim.x
    gridsz = cuda.gridDim.x

    # block strided loop to compute the reduction
    start = tid + blksz * blkid
    stop = arr.size
    step = blksz * gridsz

    # load first value
    tmp = arr[start]
    # loop over all values in block-stride
    for i in range(start + step, stop, step):
        tmp = reduce_op(tmp, arr[i])

    cuda.syncthreads()
    # inner-warp reduction
    inner_warp_reduction(sm_partials, tmp)

    cuda.syncthreads()
    # at this point, only the first slot for each warp in sm_partials
    # is valid.

    # finish up block reduction
    # warning: this is assuming 4 warps.
    # assert numwarps == 4
    if tid < 2:
        sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
                                        sm_partials[tid + 2, 0])
    if tid == 0:
        partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
Author: cpcloud, Project: numba, Lines: 52, Source: reduction.py
Example 14: oracle
def oracle(x, y):
    tid = cuda.threadIdx.x
    ntid = cuda.blockDim.x

    if tid > 12:
        for i in range(ntid):
            if y[i] != 0:
                y[i] += x[i] // y[i]

    cuda.syncthreads()

    if tid < 17:
        for i in range(ntid):
            if y[i] != 0:
                x[i] += x[i] // y[i]
Author: cpcloud, Project: numba, Lines: 14, Source: test_exception.py
Example 15: gpu_unique_k
def gpu_unique_k(arr, k, out, outsz_ptr):
    """
    Note: run with small blocks.
    """
    tid = cuda.threadIdx.x
    blksz = cuda.blockDim.x
    base = 0

    # shared memory
    vset_size = 0
    sm_mem_size = MAX_FAST_UNIQUE_K
    vset = cuda.shared.array(sm_mem_size, dtype=nbtype)
    share_vset_size = cuda.shared.array(1, dtype=int32)
    share_loaded = cuda.shared.array(sm_mem_size, dtype=nbtype)
    sm_mem_size = min(k, sm_mem_size)

    while vset_size < sm_mem_size and base < arr.size:
        pos = base + tid
        valid_load = min(blksz, arr.size - base)
        # load
        if tid < valid_load:
            share_loaded[tid] = arr[pos]
        # wait for load to complete
        cuda.syncthreads()
        # thread-0 inserts
        if tid == 0:
            for i in range(valid_load):
                val = share_loaded[i]
                new_size = gpu_unique_set_insert(vset, vset_size, val)
                if new_size >= 0:
                    vset_size = new_size
                else:
                    vset_size = sm_mem_size + 1
            share_vset_size[0] = vset_size
        # wait until the insert is done
        cuda.syncthreads()
        vset_size = share_vset_size[0]
        # increment
        base += blksz

    # output
    if vset_size <= sm_mem_size:
        for i in range(tid, vset_size, blksz):
            out[i] = vset[i]
        if tid == 0:
            outsz_ptr[0] = vset_size
    else:
        outsz_ptr[0] = -1
Author: xennygrimmato, Project: pygdf, Lines: 48, Source: cudautils.py
Example 16: kernel
def kernel(input, output):
    tile = cuda.shared.array(shape=tile_shape, dtype=dt)

    tx = cuda.threadIdx.x
    ty = cuda.threadIdx.y
    bx = cuda.blockIdx.x * cuda.blockDim.x
    by = cuda.blockIdx.y * cuda.blockDim.y
    x = by + tx
    y = bx + ty

    if by + ty < input.shape[0] and bx + tx < input.shape[1]:
        tile[ty, tx] = input[by + ty, bx + tx]
    cuda.syncthreads()
    if y < output.shape[0] and x < output.shape[1]:
        output[y, x] = tile[tx, ty]
Author: Alexhuszagh, Project: numba, Lines: 16, Source: transpose.py
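The transpose kernel above captures tile_shape and dt from its enclosing scope when it is compiled. A hedged sketch of those definitions and a launch, assuming the kernel carries a @cuda.jit decorator; the tile size, the extra padding column (a common trick against shared-memory bank conflicts), and the input shape are assumptions rather than values from the original module.

import numpy as np
from numba import cuda, float32

tile_dim = 32
tile_shape = (tile_dim, tile_dim + 1)   # assumed: one padding column
dt = float32                            # numba dtype used by cuda.shared.array

inp = np.arange(64 * 96, dtype=np.float32).reshape(64, 96)
out = np.zeros((96, 64), dtype=np.float32)

blocks = ((inp.shape[1] + tile_dim - 1) // tile_dim,   # grid x spans input columns
          (inp.shape[0] + tile_dim - 1) // tile_dim)   # grid y spans input rows
kernel[blocks, (tile_dim, tile_dim)](inp, out)
print(np.array_equal(out, inp.T))                      # True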
Example 17: argmax_lvl0
def argmax_lvl0(ary, reduce_max, reduce_arg):
    """
    This only works for positive values arrays.
    Shared memory must be initialized with double the size of
    the block size.
    """
    sm_ary = cuda.shared.array(shape=0, dtype=ary.dtype)

    # each thread will process two elements
    tgid = cuda.grid(1)
    thid = cuda.threadIdx.x

    # pointer to value and argument side of shared memory
    val_pointer = 0
    arg_pointer = sm_ary.size // 2

    # when the global thread id is greater than or equal to the ary size
    # it means that the block is incomplete; in this case we just
    # fill the rest of the block with -1 so it is smaller than all
    # other elements; this only works for positive arrays
    if tgid < ary.size:
        sm_ary[val_pointer + thid] = ary[tgid]
        sm_ary[arg_pointer + thid] = tgid
    else:
        sm_ary[val_pointer + thid] = 0
        sm_ary[arg_pointer + thid] = -1
    cuda.syncthreads()

    s = cuda.blockDim.x // 2
    while s > 0:
        index = 2 * s * thid
        if thid < s:
            # only change if the left element is smaller than the right one
            if sm_ary[val_pointer + thid] < sm_ary[val_pointer + thid + s]:
                sm_ary[val_pointer + thid] = sm_ary[val_pointer + thid + s]
                sm_ary[arg_pointer + index] = sm_ary[arg_pointer + index + s]
        cuda.syncthreads()
        s //= 2  # halve the stride each pass of the tree reduction

    if thid == 0:
        reduce_max[cuda.blockIdx.x] = sm_ary[val_pointer]
        reduce_arg[cuda.blockIdx.x] = sm_ary[arg_pointer]
Author: Chiroptera, Project: masters_code, Lines: 45, Source: singlelink.py
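Because the kernel above declares its shared memory with shape=0, the actual buffer size comes from the fourth element of the launch configuration. A hedged launch sketch, assuming the kernel is decorated with @cuda.jit and following the docstring's requirement of two shared-memory elements per thread (values in the first half, argument indices in the second); the array size and block size are made up for illustration.

import numpy as np
from numba import cuda

ary = np.random.rand(10_000).astype(np.float32)   # positive values only
block = 128
grid = (ary.size + block - 1) // block
reduce_max = np.zeros(grid, dtype=np.float32)     # per-block maxima
reduce_arg = np.zeros(grid, dtype=np.float32)     # per-block argmax (stored as floats)

shared_bytes = 2 * block * ary.dtype.itemsize     # dynamic shared memory, in bytes
argmax_lvl0[grid, block, 0, shared_bytes](ary, reduce_max, reduce_arg)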
Example 18: gpu_single_block_sum
def gpu_single_block_sum(arr, out):
    """
    A naive single threadblock sum reduction
    """
    temp = cuda.shared.array(gpu_block_sum_max_blockdim, dtype=float32)
    tid = cuda.threadIdx.x
    blksz = cuda.blockDim.x
    temp[tid] = 0
    # block stride loop to sum-reduce cooperatively
    for i in range(tid, arr.size, blksz):
        temp[tid] += arr[i]
    cuda.syncthreads()
    # naive intra block sum that uses a single thread
    if tid == 0:
        for i in range(1, blksz):
            temp[0] += temp[i]
        # store result
        out[0] = temp[0]
Author: Alexhuszagh, Project: numba, Lines: 18, Source: cuda_dask.py
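gpu_block_sum_max_blockdim above is a module-level constant that fixes both the shared-buffer size and the largest usable block size. A hedged launch sketch, assuming the kernel is decorated with @cuda.jit and using an assumed value for that constant; the kernel is meant to run as a single block, with the block-stride loop covering the whole array.

import numpy as np
from numba import cuda, float32

gpu_block_sum_max_blockdim = 512          # assumed value of the module constant

arr = np.random.rand(100_000).astype(np.float32)
out = np.zeros(1, dtype=np.float32)
gpu_single_block_sum[1, gpu_block_sum_max_blockdim](arr, out)   # exactly one block
print(out[0], arr.sum())                  # the two sums agree closely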
Example 19: max_kernel
def max_kernel(a, b):
    "Simple implementation of reduction kernel"
    # Allocate static shared memory of 256.
    # This limits the maximum block size to 256.
    sa = cuda.shared.array(shape=(256,), dtype=int32)
    tx = cuda.threadIdx.x
    bx = cuda.blockIdx.x
    bw = cuda.blockDim.x
    i = tx + bx * bw
    if i < a.shape[0]:
        sa[tx] = a[i]
        if tx == 0:
            # Uses the first thread of each block to perform the actual
            # reduction
            m = sa[tx]
            cuda.syncthreads()
            for j in range(1, bw):
                m = mymax(m, sa[j])
            b[bx] = m
Author: AngelBerihuete, Project: numbapro-examples, Lines: 19, Source: max.py
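A hedged sketch of the pieces the kernel above relies on but does not show: a mymax device function (assumed here to be a plain two-argument maximum) and a launch in which the per-block results in b are reduced once more on the host. The block size must not exceed 256 because of the static shared array, and the input length is kept a multiple of the block size so every shared slot gets written; all concrete values are illustrative.

import numpy as np
from numba import cuda, int32

@cuda.jit(device=True)
def mymax(a, b):
    # assumed definition of the device helper used by max_kernel
    return a if a > b else b

a = np.random.randint(0, 1_000_000, size=256 * 40).astype(np.int32)
block = 256                               # matches the static shared array size
grid = a.size // block
b = np.zeros(grid, dtype=np.int32)

max_kernel[grid, block](a, b)             # one partial maximum per block
print(b.max(), a.max())                   # final reduction is done on the host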
Example 20: device_reduce_partial_block
def device_reduce_partial_block(arr, partials, sm_partials):
    """
    This computes reduction on `arr`.
    This device function must be used by 1 threadblock only.
    The blocksize must match `arr.size` and must not be greater than 128.
    """
    tid = cuda.threadIdx.x
    blkid = cuda.blockIdx.x
    blksz = cuda.blockDim.x
    warpid = tid // _WARPSIZE
    laneid = tid % _WARPSIZE

    size = arr.size
    # load first value
    tid = cuda.threadIdx.x
    value = arr[tid]
    sm_partials[warpid, laneid] = value

    cuda.syncthreads()

    if (warpid + 1) * _WARPSIZE < size:
        # fully populated warps
        inner_warp_reduction(sm_partials, value)
    else:
        # partially populated warps
        # NOTE: this uses a very inefficient sequential algorithm
        if laneid == 0:
            sm_this = sm_partials[warpid, :]
            base = warpid * _WARPSIZE
            for i in range(1, size - base):
                sm_this[0] = reduce_op(sm_this[0], sm_this[i])

    cuda.syncthreads()
    # finish up
    if tid == 0:
        num_active_warps = (blksz + _WARPSIZE - 1) // _WARPSIZE

        result = sm_partials[0, 0]
        for i in range(1, num_active_warps):
            result = reduce_op(result, sm_partials[i, 0])

        partials[blkid] = result
Author: cpcloud, Project: numba, Lines: 42, Source: reduction.py
Note: the numba.cuda.syncthreads examples in this article were compiled by 纯净天空 from source code and documentation hosted on platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by their original developers; copyright remains with those authors, and any redistribution or reuse should follow the corresponding project's license. Do not reproduce without permission.