Python cuda.to_device Function Code Examples


This article collects typical usage examples of the numbapro.cuda.to_device function in Python. If you are wondering what to_device does, how to call it, or what real-world usage looks like, the curated code examples below should help.



A total of 20 code examples of the to_device function are collected below, ordered roughly by popularity; a minimal warm-up sketch of the basic host-to-device round trip comes first.
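Before the collected examples, here is a minimal, self-contained sketch of the round-trip pattern most of them follow: copy a NumPy array to the GPU with cuda.to_device, launch a kernel on the resulting device array, and copy the result back with copy_to_host. It is written against numba.cuda (NumbaPro is deprecated and the same CUDA API lives on in Numba); the add_one kernel and the array sizes are illustrative assumptions, not code from any of the projects below.

import numpy as np
from numba import cuda  # NumbaPro is deprecated; the equivalent API is numba.cuda


@cuda.jit
def add_one(arr):
    # One thread per element: increment each value on the device.
    i = cuda.grid(1)
    if i < arr.size:
        arr[i] += 1.0


def round_trip_demo(n=1024):
    host = np.zeros(n, dtype=np.float32)
    d_arr = cuda.to_device(host)                 # host -> device copy
    threads_per_block = 128
    blocks = (n + threads_per_block - 1) // threads_per_block
    add_one[blocks, threads_per_block](d_arr)    # kernel launch on the device array
    return d_arr.copy_to_host()                  # device -> host copy


if __name__ == '__main__':
    print(round_trip_demo()[:5])                 # expected: [1. 1. 1. 1. 1.]

The same host-to-device / launch / device-to-host shape appears in almost every example below, differing mainly in whether an explicit cuda.stream() is passed to to_device and copy_to_host for asynchronous transfers.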

Example 1: evaluation_function

def evaluation_function(factors, opts):

    start = timer()

    longest_wavelet, target_samples = opts['longest_wavelet'], opts['target_samples']
    window_width = len(target_samples)
    full_width = window_width + longest_wavelet
    num_wavelengths = longest_wavelet-2
    offsets_per_wavelet = full_width / num_wavelengths
    num_rows = offsets_per_wavelet * num_wavelengths

    result = np.zeros(window_width, dtype=np.float32)

    d_factors = cuda.to_device(factors)
    d_result = cuda.to_device(result)

    griddim = full_width, 1
    blockdim = 4, 1, 1

    compute_samples_configured = compute_sample_kernel.configure(griddim, blockdim)
    compute_samples_configured(d_factors, longest_wavelet, offsets_per_wavelet, d_result, num_rows)

    d_result.to_host()
    generated_samples_sum = sum(result)
    factors_sum = sum(factors)
    difference_from_target = math.fabs(sum(target_samples - result))
    non_zero_factors = filter(lambda x: x != 0.0, result)

    fun_value = difference_from_target + 10 * math.fabs(len(non_zero_factors))

    print("Value "+str(fun_value)+" generated in " + str((timer() - start)) + " seconds. Sample sum: " +
          str(generated_samples_sum)+". Factors sum: "+str(factors_sum))

    return fun_value
Developer: RelentlessResults, Project: decompose001, Lines of code: 34, Source file: precomputed_representation.py


Example 2: run_GPU

def run_GPU(grid, adjGrid, steps, delay, initDelay, printInd, indSteps):
    """ Runs the Command-Line interface for a specified number of steps,
        or forever if the number of steps is specified to be -1.
        Note that here, grid and adjGrid must be explicitly specified as
        opposed to passed in as a Game, to enable everything to be run on the
        GPU. Returns the final grid state. """
    step = 0
    dim = grid.shape
    # move arrays to GPU
    d_grid = cuda.to_device(grid)
    d_adjGrid = cuda.to_device(adjGrid)
    blockDim = (32,16)
    gridDim = (32,8)
    while step < steps or steps == -1:
        # print grid
        if printInd is not -1 and step % printInd is 0:
            # in order to print grid, first need memory back in CPU
            d_grid.to_host()
            printGrid(grid, step, dim)
        # print index
        if indSteps is not -1 and step % indSteps is 0:
            print("Step = " + str(step))
        newGrid = np.zeros_like(grid)
        d_newGrid = cuda.to_device(newGrid)
        evolve2D_kernel[gridDim, blockDim](d_grid, d_adjGrid, d_newGrid)
        d_grid = d_newGrid
        grid = newGrid
        sleep(delay)
        if step == 0:
            # allow initial position to be more easily visible
            sleep(initDelay)
        step += 1
    d_grid.to_host()
    return grid
Developer: goldenratio1618, Project: gameoflife, Lines of code: 34, Source file: cmdline.py


Example 3: train

	def train(self,ds,epochs,batch_size=10):

		for epoch in range(epochs):
			start = timer()
			count = 0.
			correct = 0.
			for i in range(len(ds)/batch_size):
				count += 1.
				x = encode(ds[i*batch_size][0],gpu=False)
				t = encode(ds[i*batch_size][1],gpu=False)
				for b in range(batch_size-1):
					x = np.concatenate((x,encode(ds[i*batch_size + b+1][0],gpu=False)))
					t = np.concatenate((t,encode(ds[i*batch_size + b+1][1],gpu=False)))
				x = cuda.to_device(x)
				t = cuda.to_device(t)
				assert x.shape[1] == self.layers[0]
				assert t.shape[1] == self.layers[2]
				print(x.shape)
				self.forward(x)
				print('output',decode(self.output))
				if decode(self.output) == decode(t):
					correct += 1.
				self.backward(t)
			print("Epoch",epoch,"Time:",timer()-start,'output',decode(self.output), 'Accuracy:',correct/count)
			if correct/count > 0.99:
				break
Developer: ZhangAustin, Project: PyCuNN, Lines of code: 26, Source file: nn.py


Example 4: fit

 def fit(self,X,Budget=None,W=None):
     self.X = cuda.to_device(X.astype(np.float64,order='F'))
     self.Budget = cuda.device_array((self.budgetSize,self.X.shape[1]),dtype=np.float64,order='F')
     self.kx = cuda.device_array((self.budgetSize,self.X.shape[0]),dtype=np.float64,order='F')
     self.Wkx = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F')
     self.H = cuda.device_array((self.latentTopics,self.X.shape[0]),dtype=np.float64,order='F')
     if Budget is None:
         permutation = np.random.permutation(self.X.shape[0])
         self.permutation = cuda.to_device(permutation)
         initBudget(self.X,self.permutation,self.Budget)
     else:
         self.Budget = cuda.to_device(Budget.astype(np.float64,order='F'))
     self.calculateKB()
     self.calculateKX()
     if W is None:
         self.initW()
     else:
         self.W = cuda.to_device(W.astype(np.float64,order='F'))
     self.t = 0
     for i in xrange(self.epochs):
         print "Epoch " + str(i)
         samples,features = self.X.shape
         permutation = getPermutation(samples,self.miniBatchSize)
         self.permutation = cuda.to_device(permutation)
         for j in xrange((samples + self.miniBatchSize) / self.miniBatchSize):
             loadBatch(self.kx,self.permutation,j,self.kxi)
             self.nextW()
             self.t += 1
     self.predictH()
Developer: ejake, Project: tensor-factorization, Lines of code: 29, Source file: cudaOKMF.py


Example 5: getIdx

def getIdx(fill_word,reduced_literal, reduced_length):#step 5: get index by interleaving fill_word and literal(also remove all-zeros word)
	bin_length = max(len(bin(reduced_length-1)),len(bin(tpb-1)))#the bit number of binary form of array length
	thread_num = numpy.int64(math.pow(2,bin_length))#Blelloch_scan need the length of scanned array to be even multiple of thread_per_block
	compact_flag = 	numpy.ones(thread_num, dtype='int64')
	print thread_num
	index = numpy.ones(2*reduced_length, dtype='uint32')
	d_index = cuda.to_device(index)
	d_fill_word = cuda.to_device(fill_word)
	d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
	d_compact_flag = cuda.to_device(compact_flag)
	#print fill_word
	getIdx_gpu[1, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length)
	compact_flag = d_compact_flag.copy_to_host()
	#print compact_flag[0:28]

	useless_array = numpy.zeros(thread_num, dtype='int64')
	radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)
	out_index_length = d_compact_flag.copy_to_host()[2*reduced_length-1] + 1
	print d_compact_flag.copy_to_host()[0:2*reduced_length]
	print out_index_length
	out_index = numpy.zeros(out_index_length, dtype='uint32')
	scatter_index[1,tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length)
	#for i in out_index:
	#	print bin(i)
	return out_index
Developer: DarinSSC, Project: WAH_on_GPU, Lines of code: 25, Source file: bitmap_constructor_gpu.py


Example 6: radix_sort

def radix_sort(arr, rid):
    length = numpy.int64(len(arr))
    bin_length = max(len(bin(length-1)),len(bin(TPB_MAX-1)))#the bit number of binary form of array length
    thread_num = numpy.int64(math.pow(2,bin_length))
    block_num = max(thread_num/TPB_MAX,1)

    stream = cuda.stream()
    one_list = numpy.zeros(shape=(thread_num), dtype='int64')
    zero_list = numpy.zeros(shape=(thread_num), dtype='int64')

    iter_num = len(bin(ATTR_CARD_MAX))
    for i in range(iter_num):
        d_arr = cuda.to_device(arr, stream)
        d_rid = cuda.to_device(rid, stream)
        d_zero_list = cuda.to_device(zero_list,stream)
        d_one_list = cuda.to_device(one_list,stream)
        get_list[block_num, TPB_MAX](arr, length, i, d_zero_list, d_one_list)#get one_list and zero_list
        d_one_list.to_host(stream)
        d_zero_list.to_host(stream)
        stream.synchronize()
        
        base_reduction_block_num = block_num
        base_reduction_block_size = TPB_MAX
        tmp_out = numpy.zeros(base_reduction_block_num, dtype='int64')
        d_tmp_out = cuda.to_device(tmp_out, stream)
        sum_reduction[base_reduction_block_num, base_reduction_block_size](d_zero_list, d_tmp_out)
        d_tmp_out.to_host(stream)
        stream.synchronize()
        base = 0 #base for the scan of one_list
        for j in xrange(base_reduction_block_num):
            base += tmp_out[j]

        Blelloch_scan_caller(d_zero_list, d_one_list, base)

        array_adjust[block_num,TPB_MAX](arr, d_arr, rid, d_rid, zero_list, one_list, d_zero_list, d_one_list, length)
Developer: DarinSSC, Project: BitmapIndex_GUI, Lines of code: 35, Source file: radix_sort.py


Example 7: tests

def tests():
    a = np.random.rand(300,500)
    b = np.random.rand(500,300)

    start = timer()
    c = np.dot(a,b)
    nptime = timer()-start
    print('nptime',nptime)

    x = np.array(np.random.rand(600,1500),dtype='float32',order='F')
    y = np.array(np.random.rand(1500,300),dtype='float32',order='F')
    z = np.zeros((1000,1000),order='F',dtype='float32')

    stream = cuda.stream()

    dx = cuda.to_device(x)
    dy = cuda.to_device(y)
    dz = cuda.to_device(z)

    start = timer()
    blas.gemm('N','N',1000,1500,1000,1.0,dx,dy,0.0,dz)
    cutime = timer()-start
    print('cutime',cutime)

    #dz.copy_to_host(z)
    print(dz[0])

    c = np.ones((1000,1000),order='F',dtype='float32')
    print(c.shape)
    dc = cuda.to_device(c)

   # blockDim = (256,256)
    #gridDim = (((1000 + blockDim[0]-1)/blockDim[0]),((1000 + blockDim[1]-1)/blockDim[1]))

    blockDim = (30,30)
    gridDim = ((((c.shape[0] + blockDim[0]) - 1) / blockDim[0]), (((c.shape[1] + blockDim[1]) - 1) / blockDim[1]))

    start = timer()
    mtanh[gridDim,blockDim,stream](dc)
    tantime = timer() - start
    print('tantime',tantime)

    dc.copy_to_host(c,stream=stream)
    stream.synchronize()
    print(c)

    y = cm.CUDAMatrix(np.ones((1000,1000)))

    start = timer()
    cm.tanh(y)
    cmtan = timer()-start
    print('cmtan',cmtan)

    x = cm.CUDAMatrix(np.random.rand(1000,1500))
    y = cm.CUDAMatrix(np.random.rand(1500,1000))

    start = timer()
    cm.dot(x,y)
    cmtime = timer()-start
    print('cmtime',cmtime)
Developer: ZhangAustin, Project: PyCuNN, Lines of code: 60, Source file: numbaprotests.py


Example 8: test_scan

def test_scan():

    in_h = np.empty(NUM_ELEMENTS, dtype=np.uint32)
    out_h = np.zeros(NUM_ELEMENTS, dtype=np.uint32)

    for i in range(0, NUM_ELEMENTS):
        in_h[i] = NUM_ELEMENTS -i - 1#randint(0, 100)

    tac1 = time()

    in_d = cuda.to_device(in_h)
    out_d = cuda.to_device(out_h)
    cuda.synchronize()

    tac2 = time()

    tk1 = time()

    for i in range(0, 32):
        tk1 = time()
        preScan(out_d, in_d, NUM_ELEMENTS)
        cuda.synchronize()
        tk2 = time()
        print i, tk2 - tk1
    tk2 = time()

    th1 = time()

    out_d.copy_to_host(out_h)
    cuda.synchronize()
    #print "Last = ", out_h[-1] + in_h[-1]

    th2 = time()
Developer: jalatif, Project: Python_Massively_Parallel_FP_Tree, Lines of code: 33, Source file: exclusive_scan.py


Example 9: reduce_by_key

def reduce_by_key(input_data, chunk_id, literal, length):#step 3
	flag = numpy.ones(length, dtype='int32')
	stream = cuda.stream()
	d_flag = cuda.to_device(flag, stream)
	d_chunk_id = cuda.to_device(chunk_id, stream)
	d_literal = cuda.to_device(literal, stream)
	produce_flag[1,tpb](input_data, d_chunk_id, length, d_flag)
	d_flag.to_host(stream)
	print 'flag:'
	print flag
	stream.synchronize()	
	is_finish = numpy.zeros(length, dtype='int32')
	hop = 1
	while hop<32:#only 32 because the length of a word in binary form is 32
		reduce_by_key_gpu[1,tpb](d_literal, d_flag, is_finish, hop, length)
		hop *= 2
	d_literal.to_host(stream)
	d_chunk_id.to_host(stream)
	stream.synchronize()

	reduced_input_data = []
	reduced_chunk_id = []
	reduced_literal =[]
	for i in xrange(length):
		if flag[i]:
			reduced_input_data.append(input_data[i])
			reduced_chunk_id.append(chunk_id[i])
			reduced_literal.append(literal[i])
	return numpy.array(reduced_input_data), numpy.array(reduced_chunk_id), reduced_literal
Developer: DarinSSC, Project: WAH_on_GPU, Lines of code: 29, Source file: bitmap_constructor_gpu.py


Example 10: getIdx

def getIdx(fill_word,reduced_literal, reduced_length, head, cardinality):#step 5: get index by interleaving fill_word and literal(also remove all-zeros word)
	bin_length = max(len(bin(2*reduced_length-1)),len(bin(tpb-1)))#the bit number of binary form of array length
	thread_num = numpy.int64(math.pow(2,bin_length))#Blelloch_scan need the length of scanned array to be even multiple of thread_per_block
	compact_flag = numpy.ones(thread_num, dtype='int64')
	index = numpy.ones(2*reduced_length, dtype='uint32')
	d_index = cuda.to_device(index)
	d_fill_word = cuda.to_device(fill_word)
	d_reduced_literal = cuda.to_device(numpy.array(reduced_literal))
	d_compact_flag = cuda.to_device(compact_flag)

	block_num = reduced_length/tpb + 1

	getIdx_gpu[block_num, tpb](d_fill_word, d_reduced_literal, d_index, d_compact_flag, reduced_length)
	compact_flag = d_compact_flag.copy_to_host()

	useless_array = numpy.zeros(thread_num, dtype='int64')
	radix_sort.Blelloch_scan_caller(d_compact_flag, useless_array, 0)
	out_index_length = d_compact_flag.copy_to_host()[2*reduced_length-1] + 1
	out_index = numpy.zeros(out_index_length, dtype='uint32')
	offsets = []
	
	new_block_num = 2*reduced_length/tpb + 1

	scatter_index[new_block_num, tpb](d_index, d_compact_flag, compact_flag, out_index, reduced_length)
	for i in xrange(reduced_length):
		if head[i]:
			offsets.append(d_compact_flag.copy_to_host()[2*i])

	key_length = numpy.zeros(cardinality, dtype='int64')

	for i in xrange(cardinality-1):
		key_length[i] = offsets[i+1] - offsets[i]
	key_length[cardinality-1] = out_index_length - offsets[cardinality-1]

	return out_index, numpy.array(offsets), numpy.array(key_length)	
Developer: DarinSSC, Project: BitmapIndex_GUI, Lines of code: 35, Source file: bitmap_pickle.py


Example 11: monte_carlo_pricer

def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = curand.PRNG(curand.PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)
    
    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    step_cfg = step[gridsz, blksz, stream]
    
    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step_cfg(d_last, d_paths, dt, c0, c1, d_normdist)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last, stream=stream)
        d_last = d_paths

    stream.synchronize()
Developer: Aahung, Project: numbapro-examples, Lines of code: 31, Source file: pricer_cuda.py


Example 12: main

def main():
    N = 2048 * 2048

    # Allocate host memory arrays
    a = np.empty(N)
    b = np.empty(N)
    c = np.empty(N)

    # Initialize host memory
    a.fill(2)
    b.fill(1)
    c.fill(0)

    # Allocate and copy GPU/device memory
    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    d_c = cuda.to_device(c)

    threads_per_block = 128
    number_of_blocks = N / 128 + 1

    saxpy [ number_of_blocks, threads_per_block ] ( d_a, d_b, d_c )

    d_c.copy_to_host(c)

    # Print out the first and last 5 values of c for a quality check
    print c[:5]
    print c[-5:]
Developer: mlanier, Project: Mlanier_Master, Lines of code: 28, Source file: CUDA_Python2.py


Example 13: gpumulti

def gpumulti(X,mu):
    device = cuda.get_current_device()
    
    n=len(X)
    X=np.array(X)
    x1 = np.array(X.T[0])
    x2 = np.array(X.T[1])
    
    bmk = np.arange(len(x1))
    
    mu = np.array(mu)
    
    dx1 = cuda.to_device(x1)
    dx2 = cuda.to_device(x2)
    dmu = cuda.to_device(mu)
    dbmk = cuda.to_device(bmk)
    
    # Set up enough threads for kernel
    tpb = device.WARP_SIZE
    bpg = int(np.ceil(float(n)/tpb))
        
    cu_worker[bpg,tpb](dx1,dx2,dmu,dbmk)
    
    bestmukey = dbmk.copy_to_host()
    
    return bestmukey
Developer: lmwalkowicz, Project: KeplerML, Lines of code: 26, Source file: km_outliers.py


Example 14: main

def main():
    NN = 4096
    NM = 4096

    A = np.zeros((NN, NM), dtype=np.float64)
    Anew = np.zeros((NN, NM), dtype=np.float64)

    n = NN
    m = NM
    iter_max = 1000

    tol = 1.0e-6
    error = 1.0

    for j in range(n):
        A[j, 0] = 1.0
        Anew[j, 0] = 1.0

    print "Jacobi relaxation Calculation: %d x %d mesh" % (n, m)

    timer = time.time()
    iter = 0

    blockdim = (tpb, tpb)
    griddim = (NN/blockdim[0], NM/blockdim[1])
        
    error_grid = np.zeros(griddim)
    
    stream = cuda.stream()

    dA = cuda.to_device(A, stream)          # to device and don't come back
    dAnew = cuda.to_device(Anew, stream)    # to device and don't come back
    derror_grid = cuda.to_device(error_grid, stream)
    
    while error > tol and iter < iter_max:
        assert error_grid.dtype == np.float64
        
        jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)
        
        derror_grid.to_host(stream)
        
        
        # error_grid is available on host
        stream.synchronize()
        
        error = np.abs(error_grid).max()
        
        # swap dA and dAnew
        tmp = dA
        dA = dAnew
        dAnew = tmp

        if iter % 100 == 0:
            print "%5d, %0.6f (elapsed: %f s)" % (iter, error, time.time()-timer)

        iter += 1

    runtime = time.time() - timer
    print " total: %f s" % runtime
Developer: Aahung, Project: numbapro-examples, Lines of code: 59, Source file: laplace2d-numbapro-gpu-improve.py


Example 15: monte_carlo_pricer

def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2
    
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [curand.PRNG(curand.PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
               for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in xrange(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Developer: Aahung, Project: numbapro-examples, Lines of code: 58, Source file: pricer_cuda_overlap.py


Example 16: kern_CUDA_dense

def kern_CUDA_dense(nsteps, dX, rho_inv, int_m, dec_m,
                    phi, grid_idcs, prog_bar=None):
    """`NVIDIA CUDA cuBLAS <https://developer.nvidia.com/cublas>`_ implementation 
    of forward-euler integration.
    
    Function requires a working :mod:`numbapro` installation. It is typically slower
    compared to :func:`kern_MKL_sparse` but it depends on your hardware.
    
    Args:
      nsteps (int): number of integration steps
      dX (numpy.array[nsteps]): vector of step-sizes :math:`\\Delta X_i` in g/cm**2
      rho_inv (numpy.array[nsteps]): vector of density values :math:`\\frac{1}{\\rho(X_i)}`
      int_m (numpy.array): interaction matrix :eq:`int_matrix` in dense or sparse representation
      dec_m (numpy.array): decay  matrix :eq:`dec_matrix` in dense or sparse representation
      phi (numpy.array): initial state vector :math:`\\Phi(X_0)` 
      prog_bar (object,optional): handle to :class:`ProgressBar` object
    Returns:
      numpy.array: state vector :math:`\\Phi(X_{nsteps})` after integration
    """
    
    calc_precision = None
    if config['CUDA_precision'] == 32:
        calc_precision = np.float32
    elif config['CUDA_precision'] == 64:
        calc_precision = np.float64
    else:
        raise Exception("kern_CUDA_dense(): Unknown precision specified.")    
    
    #=======================================================================
    # Setup GPU stuff and upload data to it
    #=======================================================================
    try:
        from numbapro.cudalib.cublas import Blas  # @UnresolvedImport
        from numbapro import cuda, float32  # @UnresolvedImport
    except ImportError:
        raise Exception("kern_CUDA_dense(): Numbapro CUDA libaries not " + 
                        "installed.\nCan not use GPU.")
    cubl = Blas()
    m, n = int_m.shape
    stream = cuda.stream()
    cu_int_m = cuda.to_device(int_m.astype(calc_precision), stream)
    cu_dec_m = cuda.to_device(dec_m.astype(calc_precision), stream)
    cu_curr_phi = cuda.to_device(phi.astype(calc_precision), stream)
    cu_delta_phi = cuda.device_array(phi.shape, dtype=calc_precision)
    for step in xrange(nsteps):
        if prog_bar:
            prog_bar.update(step)
        cubl.gemv(trans='T', m=m, n=n, alpha=float32(1.0), A=cu_int_m,
            x=cu_curr_phi, beta=float32(0.0), y=cu_delta_phi)
        cubl.gemv(trans='T', m=m, n=n, alpha=float32(rho_inv[step]),
            A=cu_dec_m, x=cu_curr_phi, beta=float32(1.0), y=cu_delta_phi)
        cubl.axpy(alpha=float32(dX[step]), x=cu_delta_phi, y=cu_curr_phi)

    return cu_curr_phi.copy_to_host()
Developer: elim723, Project: MCEq, Lines of code: 54, Source file: kernels.py


Example 17: get_indexList

def get_indexList(path, attr_selected):
    path1, path2, attr_num = bitmap_pickle.get_pic_path(path)
    f1 = open(path1, 'rb')  # read data_map.pkl
    try:
        attr_map = pickle.load(f1)
        attr_list = pickle.load(f1)
        attr_total = pickle.load(f1)
    finally:
        f1.close()

    f2 = open(path2, 'rb')  # read bitmap_pic.pkl
    try:
        lists = pickle.load(f2)
        key = pickle.load(f2)
        offset = pickle.load(f2)
    finally:
        f2.close()

    # attr_input is a list that stores the numbers of input attributes
    # attr_num is the total number of attributes
    # attr_total is the total number of data/31
    attr_input = [[] for i in xrange(attr_num)]
    for i in xrange(attr_num):
        for attri in attr_selected[i]:
            if attri in attr_map[i]:
                attr_input[i].append(attr_map[i][attri])
            elif attri == 'All':
                attr_input[i].append(-1)
        if len(attr_input[i]) > 1 and (-1 in attr_input[i]):
            attr_input[i].remove(-1)
    print attr_input

    search_start_time = time.time()
    if len(attr_input) != attr_num:  # there might be a wrong input in input_test.py
        print 'No eligible projects'
    else:
        tpb = 1024
        blocknum = 1
        attr_mul = (attr_total + (tpb * blocknum - 1))/(tpb * blocknum)
        # attr_mul is the number that each thread need to be performed
        #print '---index----\nattr_num:%d\nattr_total:%d\nattr_mul:%d\n----------' % (attr_num, attr_total, attr_mul)
        # attr_num = 1
        index_list = numpy.zeros(attr_total*31, dtype='int32')
        bitmap_list = get_attr(attr_input, attr_num, attr_total, lists, key, offset)
        stream = cuda.stream()
        d_bitmap_list = cuda.to_device(numpy.array(bitmap_list), stream)
        d_index_list = cuda.to_device(numpy.array(index_list), stream)
        index_gpu[blocknum, tpb, stream](d_bitmap_list, d_index_list, attr_num, attr_total, attr_mul)
        index_list = d_index_list.copy_to_host()
        stream.synchronize()
    search_end_time = time.time()
    return index_list, search_end_time-search_start_time
Developer: DarinSSC, Project: BitmapIndex_GUI, Lines of code: 52, Source file: bitmap_index2.py


Example 18: flush

    def flush(self, metric_opt, supp_opt):
        if not self.Vcs:
            # Nothing to do
            return metric_opt, supp_opt

        k = self.k
        V = self.V

        topk_list = []

        nodect = V.shape[0]
        numseg = len(self.Vcs)
        assert nodect
        assert numseg
        eachsize = nodect * numseg
        D = np.zeros(eachsize, dtype=np.float32)

        # Fill buffer for segmented sort
        for i, Vc in enumerate(self.Vcs):
            D[i * nodect:(i + 1) * nodect] = Vc[:, 0]

        # Prepare for GPU segmented sort
        dD = cuda.to_device(D)
        dI = cuda.device_array((numseg, nodect), dtype=np.uint32)

        blksz = 32
        init_indices[(divup(dI.shape[0], blksz),
                      divup(dI.shape[1], blksz)),
                     (blksz, blksz)](dI)

        if numseg == 1:
            segments = np.arange(1, dtype=np.int32)
        else:
            segments = (np.arange(numseg - 1, dtype=np.int32) + 1) * nodect

        segmented_sort(dD, dI, cuda.to_device(segments))

        for i in range(numseg):
            topk = dI[i, -k:].copy_to_host()
            topk_list.append(topk)

        # Reduce
        for topk in topk_list:
            # Assume A is huge
            metric = np.linalg.norm(V[topk, :]) ** 2
            if metric > metric_opt:
                metric_opt = metric
                supp_opt = topk

        # Clear all Vc
        self.Vcs.clear()
        return metric_opt, supp_opt
Developer: ContinuumIO, Project: numbapro-spca, Lines of code: 52, Source file: dks.py


Example 19: encode

def encode(word,gpu=True):
	if isinstance(word,basestring):
		if using_embeddings == True:
			return cuda.to_device(vocab[word])
		else:
			x = np.zeros((1,word_idx),dtype='float32')
			x[0][vocab[word]] = 1.
			if gpu == True:
				return cuda.to_device(x)
			else:
				return x
	else:
		return word
Developer: ZhangAustin, Project: PyCuNN, Lines of code: 13, Source file: utils.py


Example 20: convolve

def convolve():
    # Build Filter
    laplacian_pts = '''
    -4 -1 0 -1 -4
    -1 2 3 2 -1
    0 3 4 3 0
    -1 2 3 2 -1
    -4 -1 0 -1 -4
    '''.split()

    laplacian = np.array(laplacian_pts, dtype=np.float32).reshape(5, 5)

    image = get_image()

    print "Image size: %s" % (image.shape,)

    response = np.zeros_like(image)
    response[:5, :5] = laplacian

    # CPU
    # Use SciPy to perform the FFT convolution
    ts = timer()
    cvimage_cpu = fftconvolve(image, laplacian, mode='same')
    te = timer()
    print 'CPU: %.2fs' % (te - ts)

    # GPU
    threadperblock = 32, 8
    blockpergrid = best_grid_size(tuple(reversed(image.shape)), threadperblock)
    print('kernel config: %s x %s' % (blockpergrid, threadperblock))

    # Initialize the cuFFT system.
    cufft.FFTPlan(shape=image.shape, itype=np.complex64, otype=np.complex64)

    # Start GPU timer
    ts = timer()
    image_complex = image.astype(np.complex64)
    response_complex = response.astype(np.complex64)

    d_image_complex = cuda.to_device(image_complex)
    d_response_complex = cuda.to_device(response_complex)

    task1(cufft, d_image_complex, d_response_complex)

    cvimage_gpu = d_image_complex.copy_to_host().real / np.prod(image.shape)

    te = timer()
    print 'GPU: %.2fs' % (te - ts)

    return cvimage_cpu, cvimage_gpu
Developer: Aahung, Project: numbapro-examples, Lines of code: 50, Source file: convolve.py



Note: The numbapro.cuda.to_device examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The code snippets are selected from open-source projects contributed by their respective developers; copyright remains with the original authors, and distribution or use should follow each project's license. Do not repost without permission.

