
Python driver.pagelocked_zeros Function Code Examples


This article collects and summarizes typical usage examples of the Python function pycuda.driver.pagelocked_zeros. If you are wondering what pagelocked_zeros does, how to call it, or what real-world uses look like, the curated examples below should help.



The following presents 20 code examples of the pagelocked_zeros function, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code examples.
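
Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below; all names are illustrative) of what pagelocked_zeros is for: it allocates a zero-initialized page-locked ("pinned") host array, which is what the asynchronous copy functions memcpy_htod_async and memcpy_dtoh_async need in order to actually overlap transfers with computation.

import numpy as np
import pycuda.autoinit  # creates a CUDA context on the default device
import pycuda.driver as drv

# Pinned, zero-initialized host array; with ordinary pageable memory the
# async copies below could not be overlapped with kernel execution.
a = drv.pagelocked_zeros((1024,), dtype=np.float32)
a[:] = np.arange(1024, dtype=np.float32)

a_gpu = drv.mem_alloc(a.nbytes)
strm = drv.Stream()
drv.memcpy_htod_async(a_gpu, a, strm)    # host-to-device copy on the stream
out = drv.pagelocked_empty_like(a)
drv.memcpy_dtoh_async(out, a_gpu, strm)  # device-to-host copy on the stream
strm.synchronize()
assert np.array_equal(out, a)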

Example 1: test_streamed_kernel

    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = drv.SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        import numpy
        shape = (32,8)
        a = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        b = drv.pagelocked_zeros(shape, dtype=numpy.float32)
        a[:] = numpy.random.randn(*shape)
        b[:] = numpy.random.randn(*shape)

        strm = drv.Stream()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(
                drv.Out(dest), drv.In(a), drv.In(b),
                block=shape+(1,), stream=strm)
        strm.synchronize()

        self.assert_(la.norm(dest-a*b) == 0)
Developer: berlinguyinca, Project: pycuda, Lines of code: 31, Source: test_driver.py


Example 2: __call__

 def __call__(self):
     spikes = self.collected_spikes[:self.nspikes]
     total_neurons = self.net.total_neurons
     if self.use_gpu:
         if not hasattr(self, 'spikes_gpu'):
             spikes_bool = drv.pagelocked_zeros(total_neurons, dtype=uint32)
             spikes_bool[spikes] = True
             spikes_gpu = pycuda.gpuarray.to_gpu(spikes_bool)
             spikes_gpu_ptr = int(int(spikes_gpu.gpudata))
             self.spikes_bool = spikes_bool
             self.spikes_gpu = spikes_gpu
             self.spikes_gpu_ptr = spikes_gpu_ptr
         else:
             spikes_bool = self.spikes_bool
             spikes_bool[:] = False
             spikes_bool[spikes] = True
             spikes_gpu = self.spikes_gpu
             pycuda.driver.memcpy_htod(spikes_gpu.gpudata, spikes_bool)
             spikes_gpu_ptr = self.spikes_gpu_ptr
         acc_ptr = self.net.nemo_sim.propagate(self.synapse_type,
                                               spikes_gpu_ptr, total_neurons)
         if not hasattr(self, 'acc'):
             self.acc = acc = drv.pagelocked_zeros(total_neurons, dtype=float32)
         else:
             acc = self.acc
         pycuda.driver.memcpy_dtoh(acc, acc_ptr)
     else:
         spikes_ptr = spikes.ctypes.data
         spikes_len = len(spikes)
         acc_ptr = self.net.nemo_sim.propagate(self.synapse_type,
                                               spikes_ptr, spikes_len)
         acc = numpy_array_from_memory(acc_ptr, total_neurons, float32)
     for _, targetvar, targetslice in self.net.nemo_propagate_targets:
         targetvar += acc[targetslice]
     self.nspikes = 0
Developer: JoErNanO, Project: brian, Lines of code: 35, Source: briantonemo.py


Example 3: _allocate_arrays

  def _allocate_arrays(self):
    #allocate gpu arrays and numpy arrays.
    if self.max_features < 4:
      imp_size = 4
    else:
      imp_size = self.max_features
    
    #allocate gpu arrays
    self.impurity_left = gpuarray.empty(imp_size, dtype = np.float32)
    self.impurity_right = gpuarray.empty(self.max_features, dtype = np.float32)
    self.min_split = gpuarray.empty(self.max_features, dtype = self.dtype_counts)
    self.label_total = gpuarray.empty(self.n_labels, self.dtype_indices)  
    self.label_total_2d = gpuarray.zeros(self.max_features * (self.MAX_BLOCK_PER_FEATURE + 1) * self.n_labels, 
        self.dtype_indices)
    self.impurity_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE * 2, np.float32)
    self.min_split_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE, self.dtype_counts)
    self.features_array_gpu = gpuarray.empty(self.n_features, np.uint16)
    self.mark_table = gpuarray.empty(self.stride, np.uint8) 

    #allocate numpy arrays
    self.idx_array = np.zeros(2 * self.n_samples, dtype = np.uint32)
    self.si_idx_array = np.zeros(self.n_samples, dtype = np.uint8)
    self.nid_array = np.zeros(self.n_samples, dtype = np.uint32)
    self.values_idx_array = np.zeros(2 * self.n_samples, dtype = self.dtype_indices)
    self.values_si_idx_array = np.zeros(2 * self.n_samples, dtype = np.uint8)
    self.threshold_value_idx = np.zeros(2, self.dtype_indices)
    self.min_imp_info = driver.pagelocked_zeros(4, dtype = np.float32)  
    self.features_array = driver.pagelocked_zeros(self.n_features, dtype = np.uint16)
    self.features_array[:] = np.arange(self.n_features, dtype = np.uint16)
Developer: pjankiewicz, Project: CudaTree, Lines of code: 29, Source: random_forest.py


Example 4: getRT

    def getRT(self, s_map, srt_gpu, srt_nsamp, srt_npairs, npairs, store_rt=False):
        """
        Computes the rank template

        s_map (sample map) - a list of 1s and 0s of length nsamples, where 1 means use this sample
            to compute the rank template
        srt_gpu - cuda memory object containing srt(sample rank template) array on gpu
        srt_nsamp, srt_npairs - shape(buffered) of srt_gpu object
        npairs - true number of gene pairs being compared
        b_size - size of the blocks for computation
        store_rt - determines the RETURN value
            False (default) = returns a numpy array of shape (npairs,) containing the rank template
            True = returns the rt_gpu object and the padded size of the rt_gpu object: (rt_gpu, npairs_padded)
        """

        b_size = self.b_size
        s_map_buff = self.s_map_buff = cuda.pagelocked_zeros((int(srt_nsamp),), np.int32,  mem_flags=cuda.host_alloc_flags.DEVICEMAP)

        s_map_buff[:len(s_map)] =  np.array(s_map,dtype=np.int32)

        s_map_gpu = np.intp(s_map_buff.base.get_device_pointer())
        #cuda.memcpy_htod(s_map_gpu, s_map_buff)
        
        #sample blocks
        g_y_sz = self.getGrid( srt_nsamp)
        #pair blocks
        g_x_sz = self.getGrid( srt_npairs )
        
        block_rt_gpu =  cuda.mem_alloc(int(g_y_sz*srt_npairs*(np.uint32(1).nbytes)) ) 


        grid = (g_x_sz, g_y_sz)

        func1,func2 = self.getrtKern(g_y_sz)

        shared_size = b_size*b_size*np.uint32(1).nbytes

        func1( srt_gpu, np.uint32(srt_nsamp), np.uint32(srt_npairs), s_map_gpu, block_rt_gpu, np.uint32(g_y_sz), block=(b_size,b_size,1), grid=grid, shared=shared_size)

        rt_buffer =self.rt_buffer = cuda.pagelocked_zeros((int(srt_npairs),), np.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
        rt_gpu = np.intp(rt_buffer.base.get_device_pointer())

        func2( block_rt_gpu, rt_gpu, np.int32(s_map_buff.sum()), block=(b_size,1,1), grid=(g_x_sz,))

        
        if store_rt:
            #this is in case we want to run further stuff without 
            #transferring back and forth
            return (rt_gpu, srt_npairs)
        else:
            #rt_buffer = np.zeros((srt_npairs ,), dtype=np.int32)
            #cuda.memcpy_dtoh(rt_buffer, rt_gpu)
            #rt_gpu.free()
            return rt_buffer[:npairs]
Developer: JohnCEarls, Project: tcDirac, Lines of code: 54, Source: gpu.py


Example 5: prepare

 def prepare(self, P):
     n = len(P.state_(self.eqs._diffeq_names_nonzero[0]))
     var_len  = len(dict.fromkeys(self.eqs._diffeq_names))+1 # +1 needed to store t
     
     for index,varname in enumerate(self.eqs._diffeq_names):
         self.index_to_varname.append(varname)
         self.varname_to_index[varname]= index
         if varname in self.eqs._diffeq_names_nonzero :
             self.index_nonzero.append(index)
     
     self.S_in = cuda.pagelocked_zeros((n,var_len),numpy.float64)
     
     self.S_out = cuda.pagelocked_zeros((n,var_len),numpy.float64)
     
     nbytes = n * var_len * numpy.dtype(numpy.float64).itemsize
     self.S_in_gpu = cuda.mem_alloc(nbytes)
     self.S_out_gpu = cuda.mem_alloc(nbytes)
     
     Z = zeros((n,var_len))
     self.A_gpu = cuda.mem_alloc(nbytes)
     cuda.memcpy_htod(self.A_gpu, Z)
     self.B_gpu = cuda.mem_alloc(nbytes)
     cuda.memcpy_htod(self.B_gpu, Z)
     self.S_temp_gpu = cuda.mem_alloc(nbytes)
     
     modFun={}
     self.applyFun = {}
     for x in self.index_nonzero:
         s = self.eqs._function_C_String[self.index_to_varname[x]]
         args_fun =[]
         for i in xrange(var_len):
             args_fun.append("S_temp["+str(i)+" + blockIdx.x * var_len]")
         modFun[x] = SourceModule("""
             __device__ double f"""+ s +"""
             
             __global__ void applyFun(double *A,double *B,double *S_in,double *S_temp, int x, int var_len)
             { 
                 
                 int idx = x + blockIdx.x * var_len;
                 S_temp[idx] = 0;
                 B[idx] = f("""+",".join(args_fun)+""");
                 S_temp[idx] = 1;
                 A[idx] = f("""+",".join(args_fun)+""") - B[idx];
                 B[idx] /= A[idx];
                 S_temp[idx] = S_in[idx];
             }
             """)
         self.applyFun[x] = modFun[x].get_function("applyFun")
         self.applyFun[x].prepare(['P','P','P','P','i','i'],block=(1,1,1))
     
     self.calc_dict = {}
     self.already_calc = {}
Developer: JoErNanO, Project: brian, Lines of code: 52, Source: gpustateupdater.py


Example 6: _initialize_gpu_ds

    def _initialize_gpu_ds(self):
        """
        Setup GPU arrays.
        """

        self.synapse_state = garray.zeros(
            max(int(self.total_synapses) + len(self.input_neuron_list), 1),
            np.float64)

        if self.total_num_gpot_neurons>0:
            # self.V = garray.zeros(
            #     int(self.total_num_gpot_neurons),
            #     np.float64)
            self.V_host = drv.pagelocked_zeros(
                int(self.total_num_gpot_neurons),
                np.float64, mem_flags=drv.host_alloc_flags.DEVICEMAP)
            self.V = garray.GPUArray(self.V_host.shape,
                                     self.V_host.dtype,
                                     gpudata=self.V_host.base.get_device_pointer())
        else:
            self.V = None

        if self.total_num_spike_neurons > 0:
            # self.spike_state = garray.zeros(int(self.total_num_spike_neurons),
            #                                 np.int32)
            self.spike_state_host = drv.pagelocked_zeros(int(self.total_num_spike_neurons),
                            np.int32, mem_flags=drv.host_alloc_flags.DEVICEMAP)
            self.spike_state = garray.GPUArray(self.spike_state_host.shape,
                                               self.spike_state_host.dtype,
                                               gpudata=self.spike_state_host.base.get_device_pointer())
        self.block_extract = (256, 1, 1)
        if len(self.out_ports_ids_gpot) > 0:
            self.out_ports_ids_gpot_g = garray.to_gpu(self.out_ports_ids_gpot)
            self.sel_out_gpot_ids_g = garray.to_gpu(self.sel_out_gpot_ids)

            self._extract_gpot = self._extract_projection_gpot_func()

        if len(self.out_ports_ids_spk) > 0:
            self.out_ports_ids_spk_g = garray.to_gpu(
                (self.out_ports_ids_spk).astype(np.int32))
            self.sel_out_spk_ids_g = garray.to_gpu(self.sel_out_spk_ids)

            self._extract_spike = self._extract_projection_spike_func()

        if self.ports_in_gpot_mem_ind is not None:
            inds = self.sel_in_gpot_ids
            self.inds_gpot = garray.to_gpu(inds)

        if self.ports_in_spk_mem_ind is not None:
            inds = self.sel_in_spk_ids
            self.inds_spike = garray.to_gpu(inds)
Developer: neurokernel, Project: neurodriver-benchmark, Lines of code: 51, Source: neurodriver_demo.py


Example 7: getBuff

 def getBuff(self, frm, new_r, new_c, b_dtype):
     """
     Generates a numpy array of shape (new_r, new_c) and dtype
         b_dtype that contains the np array frm, such that
         frm[i,j] == new[i,j], where new holds zeros wherever
         frm[i,j] is out of bounds.
     """
     try:
         old_r,old_c =  frm.shape
         buff = cuda.pagelocked_zeros((new_r,new_c),b_dtype, mem_flags=cuda.host_alloc_flags.DEVICEMAP)#np.zeros((new_r,new_c),dtype=b_dtype)
         buff[:old_r,:old_c] = frm
     except ValueError:
         #oned
         old_r = frm.shape[0]
         buff = cuda.pagelocked_zeros((new_r,), b_dtype,mem_flags=cuda.host_alloc_flags.DEVICEMAP)# np.zeros((new_r,),dtype=b_dtype)
         buff[:old_r] = frm
     return buff
Developer: JohnCEarls, Project: tcDirac, Lines of code: 17, Source: gpu.py
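
Several of the examples on this page (4, 7, 8, 11, 18 and 19) pass mem_flags=cuda.host_alloc_flags.DEVICEMAP to pagelocked_zeros so that the pinned buffer is also mapped into the device address space ("zero-copy"). Here is a condensed sketch of that recurring pattern; it is illustrative only and assumes a context created with the MAP_HOST flag, as in example 18 below.

import numpy as np
import pycuda.driver as drv

drv.init()
ctx = drv.Device(0).make_context(drv.ctx_flags.MAP_HOST)
try:
    # Host array that the device can address directly (zero-copy).
    buf = drv.pagelocked_zeros((16,), np.float32,
                               mem_flags=drv.host_alloc_flags.DEVICEMAP)
    # Device-side alias of the same physical memory; pass this to a kernel
    # in place of a mem_alloc'd pointer.
    buf_dev = np.intp(buf.base.get_device_pointer())
    # ... launch kernels that read/write buf_dev here; after synchronizing,
    # the results are visible in buf with no explicit memcpy_dtoh.
    drv.Context.synchronize()
finally:
    ctx.pop()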


Example 8: GenerateFractal

def GenerateFractal(dimensions,position,zoom,iterations,block=(20,20,1), report=False, silent=False):
	chunkSize = numpy.array([dimensions[0]/block[0],dimensions[1]/block[1]],dtype=numpy.int32)
	zoom = numpy.float32(zoom)
	iterations = numpy.int32(iterations)
	blockDim = numpy.array([block[0],block[1]],dtype=numpy.int32)
	result = numpy.zeros(dimensions,dtype=numpy.int32)

	#Center position
	position = Vector(position[0]*zoom,position[1]*zoom)
	position = position - (Vector(result.shape[0],result.shape[1])/2)
	position = numpy.array([int(position.x),int(position.y)]).astype(numpy.float32)

	#For progress reporting:
	ppc = cuda.pagelocked_zeros((1,1),numpy.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP) #pagelocked progress counter
	ppc[0,0] = 0
	ppc_ptr = numpy.intp(ppc.base.get_device_pointer()) #pagelocked memory counter, device pointer to
	#End progress reporting

	#Copy parameters over to device
	chunkS = In(chunkSize)
	posit = In(position)
	blockD = In(blockDim)
	zoo = In(zoom)
	iters = In(iterations)
	res = In(result)

	if not silent:
		print("Calling CUDA function. Starting timer. progress starting at: "+str(ppc[0,0]))
	start_time = time.time()

	genChunk(chunkS, posit, blockD, zoo, iters, res, ppc_ptr, block=(1,1,1), grid=block)
	
	if report:
		total = (dimensions[0]*dimensions[1])
		print "Reporting up to "+str(total)+", "+str(ppc[0,0])
		while ppc[0,0] < ((dimensions[0]*dimensions[1])):
			pct = (ppc[0,0]*100)/(total)
			hashes = "#"*pct
			dashes = "-"*(100-pct)
			print "\r["+hashes+dashes+"] "+locale.format("%i",ppc[0,0],grouping=True)+"/"+locale.format("%i",total,grouping=True),
			time.sleep(0.00001)


	cuda.Context.synchronize()
	if not silent:
		print "Done. "+str(ppc[0,0])

	#Copy result back from device
	cuda.memcpy_dtoh(result, res)

	if not silent: 
		end_time = time.time()
		elapsed_time = end_time-start_time
		print("Done with call. Took "+str(elapsed_time)+" seconds. Here's the repr'd arary:\n")
		print(result)
		
	result[result.shape[0]/2,result.shape[1]/2]=iterations+1 #mark center of image
	return result
Developer: jshearer, Project: cudafractal, Lines of code: 58, Source: fractal.py


Example 9: find_component_device

def find_component_device(d_v, d_D,  length):
    """

    :param d_v:
    :param d_D:
    :param length:
    :return:
    """
    import eulercuda.pyencode as enc
    logger = logging.getLogger('eulercuda.pycomponent.find_component_device')
    logger.info("started.")
    mem_size = length
    d_prevD = np.zeros(mem_size, dtype=np.uintc)
    d_Q = np.zeros_like(d_prevD)
    d_t1 = np.zeros_like(d_prevD)
    d_t2 = np.zeros_like(d_prevD)
    d_val1 = np.zeros_like(d_prevD)
    d_val2 = np.zeros_like(d_prevD)
    sp = np.uintc(0)

    s = np.uintc

    d_D, d_Q = component_step_init(d_v, d_D, d_Q, length)
    s, sp = 1, 1

    sptemp = drv.pagelocked_zeros(4, dtype=np.intc, mem_flags=drv.host_alloc_flags.DEVICEMAP)
    d_sptemp = np.intp(sptemp.base.get_device_pointer())

    while s == sp:
        d_D, d_prevD = d_prevD, d_D

        d_D = component_step1_shortcutting_p1(d_v, d_prevD, d_D, d_Q, length, s)

        d_Q = component_step1_shortcutting_p2(d_v, d_prevD, d_D, d_Q, length, s)

        d_t1, d_t2, d_val1, d_val2 = component_Step2_P1(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)

        d_D, d_Q = component_Step2_P2(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)

        d_t1, d_t2, d_val1, d_val2 = component_Step3_P1(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)

        d_D = component_Step3_P2(d_v, d_prevD, d_D, d_Q, d_t1, d_val1, d_t2, d_val2, length, s)

        d_val1 = component_step4_P1(d_v, d_D, d_val1, length)

        d_D = component_step4_P2(d_v, d_D, d_val1, length)

        sptemp[0] = 0

        d_sptemp = (d_Q, length, d_sptemp, s)

        sp += sptemp[0]

        s += 1

    logger.info("Finished. Leaving.")
    return d_D
Developer: zenlc2000, Project: pycuda-euler, Lines of code: 57, Source: pycomponent.py


Example 10: test_streamed_kernel

    def test_streamed_kernel(self):
        # this differs from the "simple_kernel" case in that *all* computation
        # and data copying is asynchronous. Observe how this necessitates the
        # use of page-locked memory.

        mod = SourceModule("""
        __global__ void multiply_them(float *dest, float *a, float *b)
        {
          const int i = threadIdx.x*blockDim.y + threadIdx.y;
          dest[i] = a[i] * b[i];
        }
        """)

        multiply_them = mod.get_function("multiply_them")

        shape = (32, 8)
        a = drv.pagelocked_zeros(shape, dtype=np.float32)
        b = drv.pagelocked_zeros(shape, dtype=np.float32)
        a[:] = np.random.randn(*shape)
        b[:] = np.random.randn(*shape)

        a_gpu = drv.mem_alloc(a.nbytes)
        b_gpu = drv.mem_alloc(b.nbytes)

        strm = drv.Stream()
        drv.memcpy_htod_async(a_gpu, a, strm)
        drv.memcpy_htod_async(b_gpu, b, strm)
        strm.synchronize()

        dest = drv.pagelocked_empty_like(a)
        multiply_them(
                drv.Out(dest), a_gpu, b_gpu,
                block=shape+(1,), stream=strm)
        strm.synchronize()

        drv.memcpy_dtoh_async(a, a_gpu, strm)
        drv.memcpy_dtoh_async(b, b_gpu, strm)
        strm.synchronize()

        assert la.norm(dest-a*b) == 0
Developer: davidweichiang, Project: pycuda, Lines of code: 40, Source: test_driver.py


Example 11: getRMS

 def getRMS(self, rt_gpu, srt_gpu, padded_samples, padded_npairs, samp_id, npairs):
     """
     Returns the rank matching score
     rt_gpu - rank template gpu object (padded_npairs,)
     srt_gpu - sample rank template gpu object (padded_npairs, padded_samples)
     samp_id - the sample id to compare srt to rt
     npairs - true number of pairs
     b_size - the block size for gpu computation.
     """
     b_size = self.b_size
     gsize = int(padded_npairs/b_size)
     result = self.result= cuda.pagelocked_zeros((gsize,), dtype=np.int32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)
     result_gpu = np.intp(result.base.get_device_pointer()) #cuda.mem_alloc(result.nbytes)
      
     func = self.getrmsKern()
     func( rt_gpu, srt_gpu, np.int32(samp_id), np.int32(padded_samples), np.int32(npairs), result_gpu, block=(b_size,1,1), grid=(int(gsize),), shared=b_size*np.uint32(1).nbytes )
     self.ctx.synchronize()
     
     return result.sum()/float(npairs)
Developer: JohnCEarls, Project: tcDirac, Lines of code: 19, Source: gpu.py


Example 12: __init__

 def __init__(self, N, model, threshold=None, reset=NoReset(),
              init=None, refractory=0 * msecond, level=0,
              clock=None, order=1, implicit=False, unit_checking=True,
              max_delay=0 * msecond, compile=False, freeze=False, method=None,
              precision='double', maxblocksize=512, forcesync=False, pagelocked_mem=True,
              gpu_to_cpu_vars=None, cpu_to_gpu_vars=None):
     eqs = model
     eqs.prepare()
     NeuronGroup.__init__(self, N, eqs, threshold=threshold, reset=reset,
                          init=init, refractory=refractory, level=level,
                          clock=clock, order=order, compile=compile, freeze=freeze, method=method)
     self.precision = precision
     if self.precision == 'double':
         self.precision_dtype = float64
         self.precision_nbytes = 8
     else:
         self.precision_dtype = float32
         self.precision_nbytes = 4
     self.clock = guess_clock(clock)
     if gpu_to_cpu_vars is None and cpu_to_gpu_vars is None:
         self._state_updater = GPUNonlinearStateUpdater(eqs, clock=self.clock, precision=precision, maxblocksize=maxblocksize,
                                                        forcesync=forcesync)
     else:
         cpu_to_gpu_vars = [(self.get_var_index(var) * len(self) * self.precision_nbytes,
                             self.get_var_index(var) * len(self),
                             (self.get_var_index(var) + 1) * len(self)) for var in cpu_to_gpu_vars]
         gpu_to_cpu_vars = [(self.get_var_index(var) * len(self) * self.precision_nbytes,
                             self.get_var_index(var) * len(self),
                             (self.get_var_index(var) + 1) * len(self)) for var in gpu_to_cpu_vars]
         self._state_updater = UserControlledGPUNonlinearStateUpdater(eqs, clock=self.clock, precision=precision, maxblocksize=maxblocksize,
                                                        gpu_to_cpu_vars=gpu_to_cpu_vars, cpu_to_gpu_vars=cpu_to_gpu_vars)
     if pagelocked_mem:
         self._S = GPUBufferedArray(drv.pagelocked_zeros(self._S.shape, dtype=self.precision_dtype))
     else:
         self._S = GPUBufferedArray(array(self._S, dtype=self.precision_dtype))
     self._gpuneurongroup_init_finished = True
Developer: brian-team, Project: brian, Lines of code: 36, Source: gpucodegen.py


Example 13: __init__

 def __init__(self, source, b, a, samplerate=None,
              precision='double', forcesync=True, pagelocked_mem=True, unroll_filterorder=None):
     # Automatically duplicate mono input to fit the desired output shape
     if b.shape[0]!=source.nchannels:
         if source.nchannels!=1:
             raise ValueError('Can only automatically duplicate source channels for mono sources, use RestructureFilterbank.')
         source = RestructureFilterbank(source, b.shape[0])
     Filterbank.__init__(self, source)
     if pycuda.context is None:
         set_gpu_device(0)
     self.precision=precision
     if self.precision=='double':
         self.precision_dtype=float64
     else:
         self.precision_dtype=float32
     self.forcesync=forcesync
     self.pagelocked_mem=pagelocked_mem
     n, m, p=b.shape
     self.filt_b=b
     self.filt_a=a
     filt_b_gpu=array(b, dtype=self.precision_dtype)
     filt_a_gpu=array(a, dtype=self.precision_dtype)
     filt_state=zeros((n, m-1, p), dtype=self.precision_dtype)
     if pagelocked_mem:
         filt_y=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
         self.pre_x=drv.pagelocked_zeros((n,), dtype=self.precision_dtype)
     else:
         filt_y=zeros(n, dtype=self.precision_dtype)
         self.pre_x=zeros(n, dtype=self.precision_dtype)
     self.filt_b_gpu=gpuarray.to_gpu(filt_b_gpu.T.flatten()) # transform to Fortran order for better GPU mem
     self.filt_a_gpu=gpuarray.to_gpu(filt_a_gpu.T.flatten()) # access speeds
     self.filt_state=gpuarray.to_gpu(filt_state.T.flatten())
     self.unroll_filterorder = unroll_filterorder
     if unroll_filterorder is None:
         if m<=32:
             unroll_filterorder = True
         else:
             unroll_filterorder = False
     # TODO: improve code, check memory access patterns, maybe use local memory
     code='''
     #define x(s,i) _x[(s)*n+(i)]
     #define y(s,i) _y[(s)*n+(i)]
     #define a(i,j,k) _a[(i)+(j)*n+(k)*n*m]
     #define b(i,j,k) _b[(i)+(j)*n+(k)*n*m]
     #define zi(i,j,k) _zi[(i)+(j)*n+(k)*n*(m-1)]
     __global__ void filt(SCALAR *_b, SCALAR *_a, SCALAR *_x, SCALAR *_zi, SCALAR *_y, int numsamples)
     {
         int j = blockIdx.x * blockDim.x + threadIdx.x;
         if(j>=n) return;
         for(int s=0; s<numsamples; s++)
         {
     '''
     for k in range(p):
         loopcode='''
         y(s,j) = b(j,0,k)*x(s,j) + zi(j,0,k);
         '''
         if unroll_filterorder:
             for i in range(m-2):
                 loopcode+=re.sub('\\bi\\b', str(i), '''
                 zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
                 ''')
         else:
             loopcode+='''
             for(int i=0;i<m-2;i++)
                 zi(j,i,k) = b(j,i+1,k)*x(s,j) + zi(j,i+1,k) - a(j,i+1,k)*y(s,j);
             '''
         loopcode+='''
         zi(j,m-2,k) = b(j,m-1,k)*x(s,j) - a(j,m-1,k)*y(s,j);
         '''
         if k<p-1:
             loopcode+='''
             x(s,j) = y(s,j);
             '''
         loopcode=re.sub('\\bk\\b', str(k), loopcode)
         code+=loopcode
     code+='''
         }
     }
     '''
     code=code.replace('SCALAR', self.precision)
      code=re.sub("\\bp\\b", str(p), code) # replace the variables by their values
     code=re.sub("\\bm\\b", str(m), code)
     code=re.sub("\\bn\\b", str(n), code)
     #print code
     self.gpu_mod=pycuda.compiler.SourceModule(code)
     self.gpu_filt_func=self.gpu_mod.get_function("filt")
     blocksize=256
     if n<blocksize:
         blocksize=n
     if n%blocksize==0:
         gridsize=n/blocksize
     else:
         gridsize=n/blocksize+1
     self.block=(blocksize, 1, 1)
     self.grid=(gridsize, 1)
     self.gpu_filt_func.prepare((intp, intp, intp, intp, intp, int32), self.block)
     self._has_run_once=False
Developer: sivaven, Project: brian, Lines of code: 97, Source: gpulinearfilterbank.py


Example 14: main


#......... part of the code is omitted here .........
    #dest = numpy.arange(GenomeDim*4).astype(numpy.uint8)
    #for i in range(0, GenomeDim/4):
        #dest[i*8 + 0] = int('0b00100101',2) #CRASHES
        #dest[i*8 + 1] = int('0b00010000',2) #CRASHES
        #dest[i*8 + 0] = int('0b00101000',2)
        #dest[i*8 + 1] = int('0b00000000',2)
        #dest[i*8 + 2] = int('0b00000000',2)
        #dest[i*8 + 3] = int('0b00000000',2)
        #dest[i*8 + 4] = int('0b00000000',2)
        #dest[i*8 + 5] = int('0b00000000',2)
        #dest[i*8 + 6] = int('0b00000000',2)
        #dest[i*8 + 7] = int('0b00000000',2)
    #    dest[i*4 + 0] = 40
    #    dest[i*4 + 1] = 0
    #    dest[i*4 + 2] = 0
    #    dest[i*4 + 3] = 0

    dest_h = drv.mem_alloc(GenomeDim*AlignedByteLengthGenome) #dest.nbytes)
    #drv.memcpy_htod(dest_h, dest)
    #print "Genomes before: "
    #print dest

    #Set-up grids
    #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids    

    #Set-up fitness values
    #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32)
    #fitness_h = drv.mem_alloc(fitness.nbytes)
    #fitness_size = numpy.zeros(FitnessValDim).astype(numpy.uint32)
    fitness_size = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0)
    fitness_size_h = drv.mem_alloc(fitness_size.nbytes)
    #fitness_hash = numpy.zeros(FitnessValDim).astype(numpy.uint32)
    fitness_hash = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0)
    fitness_hash_h = drv.mem_alloc(fitness_hash.nbytes)
    #drv.memcpy_htod(fitness_h, fitness)
    #print "Fitness values:"
    #print fitness

    #Set-up grids
    #grids = numpy.zeros((GenomeDim, DimGridX, DimGridY)).astype(numpy.uint8) #TEST
    grids = drv.pagelocked_zeros((GenomeDim, DimGridX, DimGridY), numpy.uint8, "C", 0)
    grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST
    
    #drv.memcpy_htod(grids_h, grids)
    #print "Grids:"
    #print grids 

    #Set-up curand
    #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8);
    #curand_h = drv.mem_alloc(curand.nbytes)
    curand_h = drv.mem_alloc(40*GenomeDim)

    #SearchSpace control
    #SearchSpaceSize = 2**24
    #BlockDimY = SearchSpaceSize / (2**16)
    #BlockDimX = SearchSpaceSize / (BlockDimY)
    #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")"
   
    #Schedule kernel calls
    #MaxBlockDim = 100
    OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*WarpSize)
    MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*WarpSize)
Developer: schroeder-dewitt, Project: polyomino-self-assembly, Lines of code: 67, Source: main.py


Example 15: prepare

	def prepare(self):
		'''
		From Hines 1984 paper, discrete formula is:
		A_plus*V(i+1)-(A_plus+A_minus)*V(i)+A_minus*V(i-1)=Cm/dt*(V(i,t+dt)-V(i,t))+gtot(i)*V(i)-I0(i)
       
		A_plus: i->i+1
		A_minus: i->i-1
		
        This gives the following tridiagonal system:
        A_plus*V(i+1)-(Cm/dt+gtot(i)+A_plus+A_minus)*V(i)+A_minus*V(i-1)=-Cm/dt*V(i,t)-I0(i)
        
        Boundaries, one simple possibility (sealed ends):
        -(Cm/dt+gtot(n)+A_minus)*V(n)+A_minus*V(n-1)=-Cm/dt*V(n,t)-I0(n)
        A_plus*V(1)-(Cm/dt+gtot(0)+A_plus)*V(0)=-Cm/dt*V(0,t)-I0(0)
        '''
		mid_diameter = zeros(len(self.neuron)) # mid(i) : (i-1) <-> i
		mid_diameter[1:] = .5*(self.neuron.diameter[:-1]+self.neuron.diameter[1:])
		
		self.Aplus = zeros(len(self.neuron)) # A+ i -> j = Aplus(j)
		self.Aminus = zeros(len(self.neuron)) # A- i <- j = Aminus(j)
		self.Aplus[1]= mid_diameter[1]**2/(4*self.neuron.diameter[1]*self.neuron.length[1]**2*self.neuron.Ri)
		self.Aplus[2:]=mid_diameter[2:]**2/(4*self.neuron.diameter[1:-1]*self.neuron.length[1:-1]**2*self.neuron.Ri)
		self.Aminus[1:]=mid_diameter[1:]**2/(4*self.neuron.diameter[1:]*self.neuron.length[1:]**2*self.neuron.Ri) 
		
		self.neuron.index = zeros(len(self.neuron),int) # gives the index of the branch containing the current compartment
		self.neuron.branches = [] # (i,j,bp,ante,ante_index,pointType)
		# i is the first compartment
		# bp is the last, a branch point
		# j is the end of the "inner branch". j = bp-1
		# ante is the branch point to which i is connected
		
		self.neuron.BPcount = 0 # number of branch points (or branches). = len(self.neuron.branches)
		self.neuron.long_branches_count = 0 # number of branches with len(branch) > 1
		
		#self.vL = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		#self.vR = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		#self.d = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		
		self.bL = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		self.bR = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		#self.bd = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		self.ab = zeros((3,len(self.neuron)))
		self.ab0 = zeros(len(self.neuron))
		self.ab1 = cuda.pagelocked_zeros((len(self.neuron)),numpy.float64)
		self.ab2 = zeros(len(self.neuron))
		self.ab1_base = zeros(len(self.neuron))
		#self.res = cuda.pagelocked_zeros((3 * len(self.neuron)),numpy.float64)
		
		self.mTrunc = 0 # used to truncate vL and vR
		self.delta_list = zeros(len(self.neuron)) #used to find mTrunc
		
		# prepare_branch : fill neuron.index, neuron.branches, changes Aplus & Aminus
		self.prepare_branch(self.neuron.morphology, mid_diameter,0)
		
		# linear system P V = B used to deal with the voltage at branch points and take boundary conditions into account.
		self.P = zeros((self.neuron.BPcount,self.neuron.BPcount))
		self.B = zeros(self.neuron.BPcount)
		self.solution_bp = zeros(self.neuron.BPcount)
		
		self.gtot = zeros(len(self.neuron))
		self.I0 = zeros(len(self.neuron))
		self.i_list = []
		self.j_list = []
		self.i_list_bis = []
		self.j_list_bis = []
		new_tridiag = True
		self.bp_list = []
		self.pointType_list = []
		self.pointTypeAnte_list = []
		self.index_ante_list0 = []
		self.index_ante_list1 = []
		self.index_ante_list2 = []
		self.ante_list = []
		self.post_list = []
		self.ante_list_idx = []
		self.post_list_idx = []
		self.id = []
		self.test_list = []
		temp = zeros(self.neuron.BPcount)
		self.ind0 = []
		self.ind_bctype_0 = []
		for index,(i,j,bp,ante,index_ante,pointType) in enumerate(self.neuron.branches) :
			self.i_list.append(i)
			self.j_list.append(j)
			if new_tridiag:
				self.i_list_bis.append(i)
				ii = i
			else:
				ii = self.i_list[-1]
			if j-ii+1>2:
				self.j_list_bis.append(j)
				new_tridiag = True
			else :
				new_tridiag = False
			self.bp_list.append(bp)
			self.pointType_list.append(max(1,pointType))
			self.pointTypeAnte_list.append(max(1,self.neuron.bc[ante]))
			temp[index] = index_ante
			self.id.append(index)
			if (j-i+2>1):
#......... part of the code is omitted here .........
Developer: JoErNanO, Project: brian, Lines of code: 101, Source: spatialstateupdater_linear.py


Example 16: alloc_async_host_buf

 def alloc_async_host_buf(self, shape, dtype):
     """Allocates a buffer that can be used for asynchronous data
     transfers."""
     return cuda.pagelocked_zeros(shape, dtype=dtype)
Developer: mjanusz, Project: sailfish, Lines of code: 4, Source: backend_cuda.py


Example 17: show

        sys.stdout.flush()
        start.record()

if rank == 0:
    print "\navg: %1.2f GFLOPS" % flops[2:-2].mean()

if rank == 1:
    total = np.zeros(tmax)
    for key in exec_time.iterkeys():
        total[:] += exec_time[key][:]
    for key in exec_time.iterkeys():
        print key, ":\t %1.2f %%" % (exec_time[key][2:-2].sum() / total[2:-2].sum() * 100)

    print "%1.2f GFLOPS\r" % ((tmax - 4) * 3 * nx * ny * nz * 30 / total[2:-2].sum() * 1e-6)

g = cuda.pagelocked_zeros((nx, ny, nz), "f")
cuda.memcpy_dtoh(g, ez_gpu)
if rank != 0:
    comm.Send(g, 0, 24)
else:
    lg = np.zeros((3 * nx, ny), "f")
    lg[:nx, :] = g[:, :, nz / 2]
    comm.Recv(g, 1, 24)
    lg[nx:-nx, :] = g[:, :, nz / 2]
    comm.Recv(g, 2, 24)
    lg[2 * nx :, :] = g[:, :, nz / 2]
    imsh.set_array(lg.T ** 2)
    show()  # draw()
    # savefig('./png-wave/%.5d.png' % tstep)

    stop.record()
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 31, Source: 150-nGPU-func.py


Example 18: range

		result = cuda.from_device(a_gpu, (nx,ny), 'float32')
		print ngpu
		for i in range(1,ngpu): 
			result = np.concatenate((result, mpi.world.recv(i,10)))
		for i in xrange(ny):
			print result[:nx,i],'\t',result[nx:2*nx,i],'\t',result[2*nx:,i]


if __name__ == '__main__':
	cuda.init()
	ngpu = cuda.Device.count()
	ctx = cuda.Device(mpi.rank).make_context(cuda.ctx_flags.MAP_HOST)
	
	nx, ny = 6, 5

	a_side_f = cuda.pagelocked_zeros(ny, np.float32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)

	a = np.zeros((nx,ny),'f')
	if mpi.rank == 0: 
		a[-2,:] = 1.5
	elif mpi.rank == 1: 
		a[1,:] = 2.0
		a[-2,:] = 2.5
	elif mpi.rank == 2: 
		a[1,:] = 3.0
	a_gpu = cuda.to_device(a)

	if mpi.rank == 0: print 'dev 0','\t'*5,'dev 1','\t'*5,'dev 2'
	print_arr_gpus(ngpu, nx, ny, a_gpu)

	if mpi.rank == 0:
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 31, Source: 011-mpi-exchange-zero_copy.py


Example 19: set_c

		ey_gpu = cuda.to_device(f)
		ez_gpu = cuda.to_device(f)
		hx_gpu = cuda.to_device(f)
		hy_gpu = cuda.to_device(f)
		hz_gpu = cuda.to_device(f)

		cex_gpu = cuda.to_device( set_c(f,(None,-1,-1)) )
		cey_gpu = cuda.to_device( set_c(f,(-1,None,-1)) )
		cez_gpu = cuda.to_device( set_c(f,(-1,-1,None)) )
		chx_gpu = cuda.to_device( set_c(f,(None,0,0)) )
		chy_gpu = cuda.to_device( set_c(f,(0,None,0)) )
		chz_gpu = cuda.to_device( set_c(f,(0,0,None)) )

		# pinned memory allocation for zero-copy
		if myrank != 1:
			ex_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			ey_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			hx_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			hy_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			'''
			ex_send_map = ex_send.get_device_pointer()
			ey_send_map = ey_send.get_device_pointer()
			hx_recv_map = hx_recv.get_device_pointer()
			hy_recv_map = hy_recv.get_device_pointer()
			'''
		if myrank != 3:
			ex_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			ey_recv = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			hx_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			hy_send = cuda.pagelocked_zeros((nx,ny), np.float32, order='F',  mem_flags=cuda.host_alloc_flags.DEVICEMAP)
			'''
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 31, Source: 040-block1d-texture-smem-if-over_maxgrid-mpi-zero_copy.py


Example 20: SourceModule

import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

# A context created with MAP_HOST is required for the DEVICEMAP
# page-locked allocation used below.
cuda.init()
ctx = cuda.Device(0).make_context(cuda.ctx_flags.MAP_HOST)

# Setup the kernel
mod = SourceModule("""
__global__ void add(float *a, float *b, float *c, float *c_map) {
	int idx = blockIdx.x*blockDim.x + threadIdx.x;
	float val;

	val = a[idx] + b[idx];
	c[idx] = val;
	c_map[idx] = val;
}
""")
add = mod.get_function("add")

# Memory allocation
nx = 1024
a = np.random.randn(nx).astype(np.float32)
b = np.random.randn(nx).astype(np.float32)
c = np.zeros_like(a)

a_gpu = cuda.to_device(a)
b_gpu = cuda.to_device(b)

# Page-locked host memory allocation for zero-copy
c_map = cuda.pagelocked_zeros(nx, np.float32, mem_flags=cuda.host_alloc_flags.DEVICEMAP)

add( a_gpu, b_gpu, cuda.Out(c), cuda.Out(c_map), block=(256,1,1), grid=(4,1) )
assert( np.linalg.norm( (a+b)-c ) == 0 )
assert( np.linalg.norm( (a+b)-c_map ) == 0 )

ctx.pop()
Developer: wbkifun, Project: fdtd_accelerate, Lines of code: 30, Source: zero_copy-test-01.py



Note: The pycuda.driver.pagelocked_zeros function examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many programmers; copyright remains with the original authors. For distribution and use, please follow the license of the corresponding project; do not reproduce without permission.

