
Python driver.mem_alloc Function Code Examples


This article collects typical usage examples of the pycuda.driver.mem_alloc function in Python. If you have been wondering how exactly mem_alloc is used, what to pass it, or what working calls look like, the hand-picked code examples below should help.



Twenty code examples of the mem_alloc function are shown below, ordered by popularity.
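Before the examples, here is a minimal sketch of the pattern nearly all of them follow: size the allocation from a NumPy array, copy host to device, launch work, and copy the result back. This sketch is illustrative only and not drawn from the examples below.

import numpy as np
import pycuda.autoinit          # creates a CUDA context on import
import pycuda.driver as cuda

a = np.random.randn(4, 4).astype(np.float32)

a_gpu = cuda.mem_alloc(a.nbytes)   # raw, untyped device allocation
cuda.memcpy_htod(a_gpu, a)         # host -> device
# ... launch kernels that read/write a_gpu here ...
result = np.empty_like(a)
cuda.memcpy_dtoh(result, a_gpu)    # device -> host
a_gpu.free()                       # optional; also freed when a_gpu is garbage-collected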

Example 1: diffuse_pycuda

def diffuse_pycuda(u):
    
    nx,ny = np.int32(u.shape)
    alpha = np.float32(0.645)
    dx = np.float32(3.5/(nx-1))
    dy = np.float32(3.5/(ny-1))
    dt = np.float32(1e-05)
    time = np.float32(0.4)
    nt = np.int32(np.ceil(time/dt))
#     print(nt)
    
    u[0,:]=200
    u[:,0]=200  
    
    u = u.astype(np.float32)
    
    u_prev = u.copy()    
    
    u_d = cuda.mem_alloc(u.size*u.dtype.itemsize)
    u_prev_d = cuda.mem_alloc(u_prev.size*u_prev.dtype.itemsize)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)

    BLOCKSIZE = 16
    # one 16x16 block per tile; note ny (not nx) for the second grid axis so
    # non-square inputs are fully covered
    gridSize = (int(np.ceil(nx/BLOCKSIZE)), int(np.ceil(ny/BLOCKSIZE)), 1)
    blockSize = (BLOCKSIZE, BLOCKSIZE, 1)

    for t in range(nt+1):
        copy_array(u_d, u_prev_d, nx, np.int32(BLOCKSIZE), block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, np.int32(BLOCKSIZE), block=blockSize, grid=gridSize)
    
    cuda.memcpy_dtoh(u, u_d)
    
    return u
Developer: htapia, Project: lania.pd, Lines: 34, Source: diffuse.py
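As an aside, pycuda.gpuarray can fold the mem_alloc/memcpy_htod pair used in this example into a single call; a minimal sketch of the equivalent transfer, under the same variable names:

import pycuda.gpuarray as gpuarray

u_d = gpuarray.to_gpu(u)   # allocates device memory and copies u in one step
# kernels accept u_d.gpudata wherever a mem_alloc pointer is expected
u = u_d.get()              # allocates a host array and copies the result back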


Example 2: get_spharms_l_eq_2

def get_spharms_l_eq_2(theta, phi, selected_modes, rslt_gpu):
    modelist = np.array(sorted([mode[1] for mode in selected_modes])).astype(np.int32)

    modelist_gpu = cuda.mem_alloc(modelist.nbytes)

    # Lengths are wrapped in 1-element int32 arrays so they can be copied to the device
    nsamps = np.array(len(theta), ndmin=1).astype(np.int32)
    nmodeslen = np.array(len(modelist), ndmin=1).astype(np.int32)
    nsamps_gpu = cuda.mem_alloc(nsamps.nbytes)
    nmodes_gpu = cuda.mem_alloc(nmodeslen.nbytes)

    cuda.memcpy_htod(nsamps_gpu, nsamps)
    cuda.memcpy_htod(nmodes_gpu, nmodeslen)
    cuda.memcpy_htod(modelist_gpu, modelist)

    # Get the compiled cuda function and launch it; cuda.In copies the host
    # arrays to the device for the duration of the call
    sph = mod.get_function("compute_sph_harmonics_l_eq_2")
    blk = (1024, 1, 1)
    grd = (1, 1, 1)
    sph(cuda.In(theta), cuda.In(phi), modelist_gpu, nmodes_gpu, nsamps_gpu, rslt_gpu, block=blk, grid=grd)
    return
Developer: brandonbm00, Project: rapidpe_gpu, Lines: 30, Source: sph_harmonics_cu.py


Example 3: main

def main():
    (h, w), d = (826,1169), 3 #img1.size, len(img1_arr[0][0])
    if LINEAR:
        thread_x, thread_y, thread_z = 128,1,1
        block_x, block_y = (w*h*d) // thread_x, 1
        if (w*h*d)%thread_x:
            block_x += 1
    else:
        thread_x, thread_y, thread_z = 16, 8, d
        block_x, block_y = h // thread_x, w // thread_y
        if h % thread_x:
            block_x += 1
        if w % thread_y:
            block_y += 1
    #print (h,w,d), (thread_x,thread_y,thread_z), (block_x,block_y)

    image_data_size = 2896782 * 4  # 826*1169*3 float32 values, 4 bytes each
    a_gpu = cuda.mem_alloc(image_data_size)
    b_gpu = cuda.mem_alloc(image_data_size)
    c_gpu = cuda.mem_alloc(image_data_size)
    
    image_path_pairs = []
    for i in range(50):
        page_num = i + 1
        path1, path2 = 'form1.%d.png'%page_num, 'form2.%d.png'%page_num
        image_path_pairs.append((path1,path2))
    
    do_work(image_path_pairs, a_gpu, b_gpu, c_gpu, (thread_x, thread_y, thread_z), (block_x, block_y))
Developer: B-Rich, Project: python_scripts, Lines: 28, Source: accelerated.py


Example 4: __compute_sub_gaussian_gpu

    def __compute_sub_gaussian_gpu(self, sub_partitions):
        if sub_partitions < 1:
            raise Exception("You can't have less than 1 partition")
        elif sub_partitions > self.pts.shape[0]:
            raise Exception("sub partitions need to be smaller than pts size")
        # Delta Partitions
        d_part = self.pts.shape[0] // sub_partitions

        # Does the correct partitioning
        alloc_size = self.pts.shape[0] // sub_partitions * 2 * self.pts.itemsize
        self.pts_gpu = cuda.mem_alloc(alloc_size)
        self.pts[:, 0] = (self.pts[:, 0] - self.axis[0])/(self.axis[1] - self.axis[0])
        self.pts[:, 1] = (self.pts[:, 1] - self.axis[2])/(self.axis[3] - self.axis[2])

        for partition in range(sub_partitions):
            sub_pts = self.pts[partition*d_part:(partition+1)*d_part, :]
            self.__compute_guassian_on_pts(sub_pts)
        self.pts_gpu.free()

        # Handle any remainder of points left over by the partitioning
        if self.pts.shape[0] % sub_partitions:
            alloc_size = (self.pts.shape[0] % sub_partitions) * (2 * self.pts.itemsize)
            self.pts_gpu = cuda.mem_alloc(alloc_size)
            self.__compute_guassian_on_pts(self.pts[sub_partitions*d_part:, :])
            self.pts_gpu.free()
Developer: SCIInstitute, Project: MLM, Lines: 25, Source: gaussian_gpu.py


Example 5: calc_psd

    def calc_psd(self,bitloads,xtalk):
        #Number of expected permutations
        Ncombinations=self.K
        
        #Check if this is getting hairy and assign grid/block dimensions
        (warpcount,warpperblock,threadCount,blockCount) = self._workload_calc(Ncombinations)

        #How many individual lk's
        memdim=blockCount*threadCount

        threadshare_grid=(blockCount,1)
        threadshare_block=(threadCount,1,1)
        
        #Memory (We get away with the NCombinations because calpsd checks against it)
        d_a=cuda.mem_alloc(np.zeros((Ncombinations*self.N*self.N)).astype(self.type).nbytes)
        d_p=cuda.mem_alloc(np.zeros((Ncombinations*self.N)).astype(self.type).nbytes)
        d_bitload=cuda.mem_alloc(np.zeros((self.K*self.N)).astype(np.int32).nbytes)
        d_XTG=cuda.mem_alloc(np.zeros((self.K*self.N*self.N)).astype(self.type).nbytes)
        h_p=np.zeros((self.K,self.N)).astype(self.type)
        cuda.memcpy_htod(d_bitload,util.mat2arr(bitloads).astype(np.int32))
        cuda.memcpy_htod(d_XTG,xtalk.astype(self.type))
        #Go solve
        #__global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N){

        self.k_calcpsd(d_a,d_p,d_XTG,d_bitload,np.int32(Ncombinations),block=threadshare_block,grid=threadshare_grid)
        cuda.Context.synchronize()
        cuda.memcpy_dtoh(h_p,d_p)
        d_a.free()
        d_bitload.free()
        d_XTG.free()
        d_p.free()
        return h_p.astype(np.float64)
Developer: andrewbolster, Project: multiuserDSM, Lines: 32, Source: gpu.py


Example 6: cuda_crossOver

def cuda_crossOver(sola, solb):
    """Cross over two solutions on the GPU and return the new pair."""
    sol_len = len(sola)

    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)

    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)

    func = mod.get_function("crossOver")
    func(a_gpu, b_gpu, block=(sol_len, 1, 1))

    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)

    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)

    if debug:
        print("a:", sola)
        print("b:", solb)
        print("new a:", a_new)
        print("new b:", b_new)

    return a_new, b_new
Developer: adamuas, Project: coevondm, Lines: 27, Source: cudaInterface.py


Example 7: alloc

    def alloc(self, dim, stream=None):
        """
        Ensure that this object's framebuffers are large enough to handle the
        given dimensions, allocating new ones if not.

        If ``stream`` is not None and a reallocation is necessary, the stream
        will be synchronized before the old buffers are deallocated.
        """
        nbins = dim.ah * dim.astride
        if self.nbins >= nbins:
            return
        if self.nbins is not None:
            self.free()
        try:
            self.d_front = cuda.mem_alloc(16 * nbins)
            self.d_back = cuda.mem_alloc(16 * nbins)
            self.d_side = cuda.mem_alloc(16 * nbins)
            self.nbins = nbins
        except cuda.MemoryError as e:
            # If a frame that's too large sneaks by the task distributor, we
            # don't want to kill the server, but we also don't want to leave
            # it stuck without any free memory to complete the next alloc.
            # TODO: measure free mem and only take tasks that fit (but that
            # should be done elsewhere)
            self.free(stream)
            raise e
Developer: vincentmele, Project: cuburn, Lines: 26, Source: render.py
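The TODO above asks for measuring free memory before accepting a task; cuda.mem_get_info() reports exactly that. A minimal sketch of such a pre-check, reusing the 16-bytes-per-bin sizing from the example (the threshold logic is an assumption, not part of the original code):

free_bytes, total_bytes = cuda.mem_get_info()
needed = 3 * 16 * nbins   # d_front + d_back + d_side
if needed > free_bytes:
    raise RuntimeError("frame too large for available device memory")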


Example 8: calc_blob_blob_forces_pycuda

def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
   
  # Determine number of threads and blocks for the GPU
  number_of_blobs = np.int32(len(r_vectors))
  threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs)

  # Get parameters from arguments
  L = kwargs.get('periodic_length')
  eps = kwargs.get('repulsion_strength')
  b = kwargs.get('debye_length')
  blob_radius = kwargs.get('blob_radius')

  # Reshape arrays
  x = np.reshape(r_vectors, number_of_blobs * 3)
  f = np.empty_like(x)
        
  # Allocate GPU memory
  x_gpu = cuda.mem_alloc(x.nbytes)
  f_gpu = cuda.mem_alloc(f.nbytes)
    
  # Copy data to the GPU (host to device)
  cuda.memcpy_htod(x_gpu, x)
    
  # Get blob-blob force function
  force = mod.get_function("calc_blob_blob_force")

  # Compute mobility force product
  force(x_gpu, f_gpu, np.float64(eps), np.float64(b), np.float64(blob_radius), np.float64(L[0]), np.float64(L[1]), np.float64(L[2]), number_of_blobs, block=(threads_per_block, 1, 1), grid=(num_blocks, 1)) 
   
  # Copy data from GPU to CPU (device to host)
  cuda.memcpy_dtoh(f, f_gpu)

  return np.reshape(f, (number_of_blobs, 3))
Developer: stochasticHydroTools, Project: RigidMultiblobsWall, Lines: 33, Source: forces_pycuda_user_defined.py


Example 9: __init__

    def __init__(self, max_size, offsets=None):
        """
        Create a sorter. The sorter will hold on to internal resources for as
        long as it is alive, including an 'offsets' array of size 4*max_size.
        To share this cost, you may pass in an array of at least this size to
        __init__ (to, for instance, share across different bit-widths in a
        multi-pass sort).
        """
        self.init_mod()
        self.max_size = max_size
        assert max_size % self.group_size == 0
        max_grids = max_size // self.group_size

        if offsets is None:
            self.doffsets = cuda.mem_alloc(self.max_size * 4)
        else:
            self.doffsets = offsets
        self.dpfxs = cuda.mem_alloc(max_grids * self.radix_size * 4)
        self.dlocals = cuda.mem_alloc(max_grids * self.radix_size * 4)

        # There are probably better ways to choose how many condensation
        # groups to launch. TODO: maybe pick one if I care
        self.ncond = 32
        self.dcond = cuda.mem_alloc(self.radix_size * self.ncond * 4)
        self.dglobal = cuda.mem_alloc(self.radix_size * 4)
Developer: gijzelaerr, Project: cuburn, Lines: 25, Source: sort.py


Example 10: prepare_device_arrays

    def prepare_device_arrays(self):

        self.maxLayers  = self.grid_prop.GetMaxLayers()
        nczbins_fine    = len(self.czcen_fine)
        numLayers       = np.zeros(nczbins_fine,dtype=np.int32)
        densityInLayer  = np.zeros((nczbins_fine*self.maxLayers),dtype=self.FTYPE)
        distanceInLayer = np.zeros((nczbins_fine*self.maxLayers),dtype=self.FTYPE)

        self.grid_prop.GetNumberOfLayers(numLayers)
        self.grid_prop.GetDensityInLayer(densityInLayer)
        self.grid_prop.GetDistanceInLayer(distanceInLayer)

        # Copy all these earth info arrays to device:
        self.d_numLayers       = cuda.mem_alloc(numLayers.nbytes)
        self.d_densityInLayer  = cuda.mem_alloc(densityInLayer.nbytes)
        self.d_distanceInLayer = cuda.mem_alloc(distanceInLayer.nbytes)
        cuda.memcpy_htod(self.d_numLayers,numLayers)
        cuda.memcpy_htod(self.d_densityInLayer,densityInLayer)
        cuda.memcpy_htod(self.d_distanceInLayer,distanceInLayer)

        self.d_ecen_fine = cuda.mem_alloc(self.ecen_fine.nbytes)
        self.d_czcen_fine = cuda.mem_alloc(self.czcen_fine.nbytes)
        cuda.memcpy_htod(self.d_ecen_fine,self.ecen_fine)
        cuda.memcpy_htod(self.d_czcen_fine,self.czcen_fine)

        return
Developer: gkrueckl, Project: pisa, Lines: 26, Source: Prob3GPUOscillationService.py


Example 11: poisson_parallel

def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    # create Cheetah template and fill in variables for the Poisson kernel
    template = Template(poisson_blending_source)
    template.BLOCK_DIM_X = b_size[0]
    template.BLOCK_DIM_Y = b_size[1]
    template.WIDTH = dest_im.shape[1]
    template.HEIGHT = dest_im.shape[0]
    template.RGB = RGB
    template.NEIGHBORS = neighbors

    # compile the CUDA kernel
    poisson_blending_kernel = cuda_compile(template, "poisson_blending_kernel")

    # allocate memory on the GPU and copy the inputs over
    out_image = np.array(dest_im, dtype=np.uint8)
    d_source = cu.mem_alloc(source_im.nbytes)
    d_destination = cu.mem_alloc(dest_im.nbytes)
    d_buffer = cu.mem_alloc(interior_buffer.nbytes)
    cu.memcpy_htod(d_source, source_im)
    cu.memcpy_htod(d_destination, dest_im)
    cu.memcpy_htod(d_buffer, interior_buffer)

    # run the Poisson blending kernel n times
    for i in range(n):
        poisson_blending_kernel(d_source, d_destination, d_buffer, block=b_size, grid=g_size)

    # retrieve the final output image and return it
    cu.memcpy_dtoh(out_image, d_destination)
    return out_image
Developer: JMTing, Project: cs205, Lines: 27, Source: parallel_poisson.py


Example 12: __init__

    def __init__(self, init_data, n_generators):

        self.ctx = curr_gpu.make_context()
        self.module = pycuda.compiler.SourceModule(kernels_cuda_src, no_extern_c=True)
        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))
        print(("Global free memory :%i Mo free" % (free / 10 ** 6)))

        ################################################################################################################

        self.width_mat = np.int32(init_data.shape[0])
        #        self.gpu_init_data = ga.to_gpu(init_data)
        self.gpu_init_data = cuda.mem_alloc(init_data.nbytes)
        cuda.memcpy_htod(self.gpu_init_data, init_data)

        self.cpu_new_data = np.zeros_like(init_data, dtype=np.float32)
        print("size new data = ", self.cpu_new_data.nbytes / 10 ** 6)
        (free, total) = cuda.mem_get_info()
        print(("Global memory occupancy:%f%% free" % (free * 100 / total)))
        print(("Global free memory :%i Mo free" % (free / 10 ** 6)))

        self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes)
        cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data)
        #        self.gpu_new_data = ga.to_gpu(self.cpu_new_data)

        self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32)
        self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes)
        cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum)
        #        self.gpu_vect_sum = ga.to_gpu(self.cpu_vect_sum)
        ################################################################################################################
        self.init_rng = self.module.get_function("init_rng")
        self.gen_rand_mat = self.module.get_function("gen_rand_mat")
        self.sum_along_axis = self.module.get_function("sum_along_axis")
        self.norm_along_axis = self.module.get_function("norm_along_axis")
        self.init_vect_sum = self.module.get_function("init_vect_sum")
        self.copy_mat = self.module.get_function("copy_mat")
        ################################################################################################################
        self.n_generators = n_generators
        seed = 1
        self.rng_states = cuda.mem_alloc(
            n_generators
            * characterize.sizeof("curandStateXORWOW", "#include <curand_kernel.h>")
        )
        self.init_rng(
            np.int32(n_generators),
            self.rng_states,
            np.uint64(seed),
            np.uint64(0),
            block=(64, 1, 1),
            grid=(n_generators // 64 + 1, 1),
        )
        (free, total) = cuda.mem_get_info()

        size_block_x = 32
        size_block_y = 32
        n_blocks_x = int(self.width_mat) // (size_block_x) + 1
        n_blocks_y = int(self.width_mat) // (size_block_y) + 1
        self.grid = (n_blocks_x, n_blocks_y, 1)
        self.block = (size_block_x, size_block_y, 1)
Developer: koszullab, Project: centroID, Lines: 59, Source: cuda_lib.py


Example 13: confirmInitialization

def confirmInitialization(featuresForSOM,somMatrix):
    #allocate memory for the somcuda on the device
    somMatrixPtr = pycuda.mem_alloc(somMatrix.nbytes)
    somBytesPerRow = np.int32(somMatrix.strides[0])
    somNumberOfRows = np.int32(somMatrix.shape[0])
    somNumberOfColumns = np.int32(somMatrix.shape[1])
    pycuda.memcpy_htod(somMatrixPtr,somMatrix)
    #allocate space for bmu distances
    bmu = np.zeros(somMatrixRows).astype(np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr,bmu)
    bmuIndex = np.zeros(somMatrixRows).astype(np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr,bmuIndex)
    intraDayOffset = features.columns.get_loc('Ret_121')
    dayOffset = features.columns.get_loc('Ret_PlusOne')
    objVal = 0.0
    objSampSize = 0.0
    r = [[[0.0 for k in range(0,3)] for i in range(somMatrixColumns)] for j in range (somMatrixRows)] 
    nodeHitMatrix = np.array(r).astype(np.float32)
    hitCountDict = defaultdict(list)
    samples = [x for x in range (0, somMatrixRows*somMatrixColumns)]
    if len(samples) >= len(featuresForSOM):
        samples = [x for x in range (0, len(featuresForSOM))]       
    for i in samples:
        feats = featuresForSOM.loc[i].as_matrix().astype(np.float32)
        featuresPtr = pycuda.mem_alloc(feats.nbytes)
        pycuda.memcpy_htod(featuresPtr,feats)
        #find the BMU
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr, np.int32(len(featuresForSOM.columns)),  somBytesPerRow, somNumberOfRows, somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),np.int32(metric),block=(blk,1,1),grid=(somNumberOfRows,1))
        pycuda.memcpy_dtoh(bmu,bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex,bmuIndexPtr)
        block = np.argmin(bmu)
        thread = bmuIndex[block]
        val = hitCountDict[(block,thread)]
        if val == None or len(val) == 0:
            hitCountDict[(block,thread)] = [1,i]
        else:
            hitCountDict[(block,thread)][0] += 1
        val = np.int32(hitCountDict[(block,thread)])[0]
        if val == 1:
            val = 0x0000ff00
        elif val <= 10:
            val = 0x000000ff
        elif val <= 100:
            val = 0x00ff0000
        else:
            val = 0x00ffffff
        bval = (val & 0x000000ff)
        gval = ((val & 0x0000ff00) >> 8)
        rval = ((val & 0x00ff0000) >> 16)
        nodeHitMatrix[block][thread] = [rval/255.0,gval/255.0,bval/255.0]
    fig20 = plt.figure(20,figsize=(6*3.13,4*3.13))
    fig20.suptitle('Train Node Hit Counts. Black: 0 Green: 1 Blue: <=10 Red: <=100 White >100', fontsize=20)
    ax = plt.subplot(111)
    somplot = plt.imshow(nodeHitMatrix,interpolation="none")
    plt.show()
    plt.pause(0.1)
Developer: kdkoadd, Project: Self-Organizing-Map, Lines: 58, Source: sommap.py


Example 14: computeAvgDistancetoBMU

def computeAvgDistancetoBMU(currentIter,iterationDistance, features, nodeHitMatrix, somMatrixPtr, somMatrix, featureStatsMatrix, featuresPtr, featureCount, somBytesPerRow, somNumberOfRows, somNumberOfColumns):
    adjustNodes = {}
    sampSize = 0
    cumDistance = 0.0
    nodeHitMatrix.fill(0)
    hitCountDict.clear()
    if len(featuresForSOM) < 100:
        sampSize = len(featuresForSOM)
    elif currentIter < len(featuresForSOM):
        sampSize = int(currentIter)
        if sampSize == 0:
            sampSize = min(somNumberOfRows*somNumberOfColumns,len(featuresForSOM))
    else:
        sampSize = len(featuresForSOM)
    samples = [x for x in range (0,sampSize)]
    #allocate space for bmu
    bmu = np.zeros(somMatrixRows).astype(np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr,bmu)
    #allocate space for bmu index
    bmuIndex = np.zeros(somMatrixRows).astype(np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr,bmuIndex)
    for i in samples:
        feats = featuresForSOM.loc[i].as_matrix().astype(np.float32)
        featuresPtr = pycuda.mem_alloc(feats.nbytes)
        pycuda.memcpy_htod(featuresPtr,feats)
        #find the BMU
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr, np.int32(featureCount),  somBytesPerRow, somNumberOfRows, somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),np.int32(metric),block=(blk,1,1),grid=(somNumberOfRows,1))
        pycuda.memcpy_dtoh(bmu,bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex,bmuIndexPtr)
        cumDistance += np.min(bmu)
        block = np.argmin(bmu)
        thread = bmuIndex[block]
        adjustNodes[i]=[block,thread]
        val = hitCountDict[(block,thread)]
        if val == None or len(val) == 0:
            hitCountDict[(block,thread)] = [1,i]
        else:
            hitCountDict[(block,thread)][0] += 1
        val = np.int32(hitCountDict[(block,thread)])[0]
        if val == 1:
            val = 0x0000ff00
        elif val <= 10:
            val = 0x000000ff
        elif val <= 100:
            val = 0x00ff0000
        else:
            val = 0x00ffffff
        bval = (val & 0x000000ff)
        gval = ((val & 0x0000ff00) >> 8)
        rval = ((val & 0x00ff0000) >> 16)
        nodeHitMatrix[block][thread] = [rval/255.0,gval/255.0,bval/255.0]
    iterationDistance.append(cumDistance/sampSize)
    iterationCount.append(currentIter)
    return cumDistance/sampSize
Developer: kdkoadd, Project: Self-Organizing-Map, Lines: 56, Source: sommap.py


Example 15: set_refsmiles

    def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{
        """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*,
        and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude
        vector; else, the magnitude vector will be computed (on the GPU) from the count matrix.

        Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have
        no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles.
        """

        # Set up lingo and count matrices on device #{{{
        if self.usePycudaArray:
            # Set up using PyCUDA CUDAArray support
            self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat,order='C')
            self.gpu.rcounts = cuda.matrix_to_array(refcountsmat,order='C')
            self.gpu.tex2lr.set_array(self.gpu.rsmiles)
            self.gpu.tex2cr.set_array(self.gpu.rcounts)
        else:
            # Manually handle setup
            temprlmat = self._padded_array(refsmilesmat)
            if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768:
                raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%temprlmat.shape)
            self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rsmiles,temprlmat,stream=self.gpu.stream)

            temprcmat = self._padded_array(refcountsmat)
            self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes)
            cuda.memcpy_htod_async(self.gpu.rcounts,temprcmat,stream=self.gpu.stream)

            descriptor = cuda.ArrayDescriptor()
            descriptor.width  = temprcmat.shape[1]
            descriptor.height = temprcmat.shape[0]
            descriptor.format = cuda.array_format.UNSIGNED_INT32
            descriptor.num_channels = 1
            self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles,descriptor,temprlmat.strides[0])
            self.gpu.tex2cr.set_address_2d(self.gpu.rcounts,descriptor,temprcmat.strides[0])
            self.gpu.stream.synchronize()
            del temprlmat
            del temprcmat
        #}}}

        self.rlengths = reflengths
        self.rshape = refsmilesmat.shape
        self.nref = refsmilesmat.shape[0]

        # Copy reference lengths to GPU
        self.gpu.rl_gpu = cuda.to_device(reflengths)

        # Allocate buffers for query set magnitudes
        self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes)
        if refmags is not None:
            cuda.memcpy_htod(self.gpu.rmag_gpu,refmags)
        else:
            # Calculate query set magnitudes on GPU
            magthreads = 256
            self.gpu.refMagKernel(self.gpu.rmag_gpu,self.gpu.rl_gpu,numpy.int32(self.nref),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cr])
        return
Developer: ihaque, Project: SIML, Lines: 56, Source: GPULingo.py
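One caveat about the memcpy_htod_async calls in this example: an asynchronous copy only overlaps with other work when the host array is page-locked; from ordinary pageable memory it degrades to a synchronous copy. A hedged sketch of staging through pinned memory, using the example's variable names:

pinned = cuda.pagelocked_empty(temprlmat.shape, temprlmat.dtype)
pinned[:] = temprlmat   # stage the padded matrix in page-locked memory
cuda.memcpy_htod_async(self.gpu.rsmiles, pinned, stream=self.gpu.stream)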


Example 16: gfx_init

	def gfx_init( self ) :
		try :
			print('compiling')
			self.prog = sh.compile_program_vfg( 'shad/balls' )

			print('compiled')

			self.loc_mmv = sh.get_loc(self.prog,'modelview' )
			self.loc_mp  = sh.get_loc(self.prog,'projection')
			self.l_color = sh.get_loc(self.prog,'color'     )
			self.l_size  = sh.get_loc(self.prog,'ballsize'  )

		except ValueError as ve :
			print "Shader compilation failed: " + str(ve)
			sys.exit(0)    

#        glUseProgram( self.prog )
#        glUniform1i( pointsid , 0 );
#        glUseProgram( 0 )

		#
		# cuda init
		#
		self.grid = (int(self.BOX),int(self.BOX))
		self.block = (1,1,int(self.BOX))

		print('CUDA: block %s , grid %s' % (str(self.block),str(self.grid)))
#        print cuda_driver.device_attribute.MAX_THREADS_PER_BLOCK
#        print cuda_driver.device_attribute.MAX_BLOCK_DIM_X
#        print cuda_driver.device_attribute.MAX_BLOCK_DIM_Y
#        print cuda_driver.device_attribute.MAX_BLOCK_DIM_Z

		floatbytes = np.dtype(np.float32).itemsize

		self.gpos = glGenBuffers(1)
		glBindBuffer( GL_ARRAY_BUFFER , self.gpos )
		glBufferData( GL_ARRAY_BUFFER , self.pos.nbytes, self.pos, GL_STREAM_DRAW )
		glBindBuffer( GL_ARRAY_BUFFER , 0 )

		self.df1 = cuda_driver.mem_alloc( self.f.nbytes )
		self.df2 = cuda_driver.mem_alloc( self.f.nbytes )

		cuda_driver.memcpy_htod( self.df1 , self.f )
		cuda_driver.memset_d32( self.df2 , 0 , self.NUM*self.Q )

		mod = cuda_driver.module_from_file( 'lbm_kernel.cubin' )

		self.collision = mod.get_function("collision_step")
		self.collision.prepare( "Piii" )

		self.streaming = mod.get_function("streaming_step")
		self.streaming.prepare( "PPiii" )

		self.colors = mod.get_function("colors")
		self.colors.prepare( "PPiii" )
Developer: jkotur, Project: particles, Lines: 55, Source: lbm.py


Example 17: _initME

    def _initME(self):
        """Initializes the MotionEnergy CUDA functions."""
        logging.debug('initME')

        # register all device functions for easy access
        # imported from motion_energy_device.py
        self.dev_conv1 = mod.get_function("dev_conv1")
        self.dev_convn = mod.get_function("dev_convn")
        self.dev_accumDiffStims = mod.get_function("dev_accumDiffStims")
        self.dev_filt2dir = mod.get_function("dev_filt2dir")
        self.dev_edges = mod.get_function("dev_edges")
        self.dev_fullRect2 = mod.get_function("dev_fullRect2")
        self.dev_mean3 = mod.get_function("dev_mean3")
        self.dev_normalize = mod.get_function("dev_normalize")
        self.dev_split_gray = mod.get_function("dev_split_gray")
        self.dev_split_RGB = mod.get_function("dev_split_RGB")
        self.dev_sub = mod.get_function("dev_sub")
        self.dev_ave = mod.get_function("dev_ave")
        self.dev_sum = mod.get_function("dev_sum")
        self.dev_scaleHalfRect = mod.get_function("dev_scaleHalfRect")
        self.dev_scale = mod.get_function("dev_scale")
        self.dev_memcpy_dtod = mod.get_function("dev_memcpy_dtod")

        # for quick access: the size in bytes of nrX*nrY floats
        self.szXY = self.sizeofFloat * self.nrX * self.nrY

        # V1 filter responses
        self.d_resp = cuda.mem_alloc(self.szXY*self.nrFilters*self.nrScales)

        # V1 complex cell responses
        self.d_respV1c = cuda.mem_alloc(self.szXY*self.nrDirs)

        # stim frame
        self.d_stim = cuda.mem_alloc(self.szXY*self.nrC)

        # stim frame buffer (last nrT frames)
        self.d_stimBuf = cuda.mem_alloc(self.szXY*self.nrT)
        # I'm not sure if this memset works as expected... for now, memcpy an
        # array of zeros
        # cuda.memset_d32(self.d_stimBuf, 0, self.nrX*self.nrY*self.nrT)
        tmp = np.zeros(self.nrX*self.nrY*self.nrT).astype(np.float32)
        cuda.memcpy_htod(self.d_stimBuf, tmp)

        self.d_diffV1GausBufT = cuda.mem_alloc(self.szXY*self.v1GaussFiltSize)

        self.d_scalingStimBuf = cuda.mem_alloc(self.szXY*self.nrT)
        self.d_v1GausBuf = cuda.mem_alloc(self.szXY*self.v1GaussFiltSize)
        self.d_diffV1GausBuf = cuda.mem_alloc(self.szXY*self.v1GaussFiltSize)
        self.d_pop = cuda.mem_alloc(self.szXY*self.nrScales)

        self.d_scalingFilt = mod.get_global("d_scalingFilt")[0]
        self.d_v1GaussFilt = mod.get_global("d_v1GaussFilt")[0]
        self.d_complexV1Filt = mod.get_global("d_complexV1Filt")[0]
        self.d_normV1filt = mod.get_global("d_normV1filt")[0]
        self.d_diff1filt = mod.get_global("d_diff1filt")[0]
        self.d_diff2filt = mod.get_global("d_diff2filt")[0]
        self.d_diff3filt = mod.get_global("d_diff3filt")[0]
Developer: UCI-CARL, Project: MotionEnergy, Lines: 59, Source: motionenergy.py
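Unlike the mem_alloc buffers above, the d_*filt pointers come from mod.get_global, which returns a (device_pointer, size_in_bytes) pair for a __device__ symbol compiled into the module. Uploading filter taps into such a symbol looks roughly like this (the filter values here are hypothetical):

filt = np.array([0.25, 0.5, 0.25], dtype=np.float32)
d_ptr, nbytes = mod.get_global("d_scalingFilt")
cuda.memcpy_htod(d_ptr, filt)   # write into the module-level __device__ array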


Example 18: CudaRPN

def CudaRPN(inPath, outPath, mycode, mydata, **kw):
    """CudaRPN implements the interface to the CUDA run environment.
    """
    verbose = kw.get('verbose', False)
    BLOCK_SIZE = 1024  # Kernel grid and block size
    STACK_SIZE = 64
    # OFFSETS = 64
    # unary_operator_names = {'plus': '+', 'minus': '-'}
    function = Function(
        start=len(hardcase),
        bss=64,
        handcode=kw.get('handcode'))

    with Timing('Total execution time'):
        with Timing('Get and convert image data to gpu ready'):
            im = Image.open(inPath)
            px = array(im).astype(float32)
            function.assemble(mycode, mydata, verbose=True)
            function.disassemble(verbose=True)
            cx = array(function.final).astype(int32)
            dx = array(function.data).astype(float32)
        with Timing('Allocate mem to gpu'):
            d_px = mem_alloc(px.nbytes)
            memcpy_htod(d_px, px)
            d_cx = mem_alloc(cx.nbytes)
            memcpy_htod(d_cx, cx)
            d_dx = mem_alloc(dx.nbytes)
            memcpy_htod(d_dx, dx)
        with Timing('Kernel execution time'):
            block = (BLOCK_SIZE, 1, 1)
            checkSize = int32(im.size[0]*im.size[1])
            grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1)
            kernel = INCLUDE + HEAD + function.body + convolve + TAIL
            sourceCode = kernel % {
                'pixelwidth': 3,
                'stacksize': STACK_SIZE,
                'case': function.case}
            with open("RPN_sourceCode.c", "w") as target:
                target.write(sourceCode)
            module = SourceModule(sourceCode)
            func = module.get_function("RPN")
            func(d_px, d_cx, d_dx, checkSize, block=block, grid=grid)
        with Timing('Get data from gpu and convert'):
            RPNPx = empty_like(px)
            memcpy_dtoh(RPNPx, d_px)
            RPNPx = uint8(RPNPx)
        with Timing('Save image time'):
            pil_im = Image.fromarray(RPNPx, mode="RGB")
            pil_im.save(outPath)
    # Output final statistics
    if verbose:
        print('%40s: %s%s' % ('Target image', outPath, im.size))
        print(Timing.text)
Developer: jlettvin, Project: shmathd, Lines: 53, Source: gpu11.py


Example 19: add

 def add(slice_a, slice_b):
     slice_c = np.empty_like(slice_a)
     a_gpu = cuda.mem_alloc(slice_a.nbytes)
     cuda.memcpy_htod(a_gpu, slice_a)
     b_gpu = cuda.mem_alloc(slice_b.nbytes)
     cuda.memcpy_htod(b_gpu, slice_b)
     c_gpu = cuda.mem_alloc(slice_c.nbytes)
     start = time.time()
     func(a_gpu, b_gpu, c_gpu, block=(BLOCK_SIZE, BLOCK_SIZE, 1))
     end = time.time()
     cuda.memcpy_dtoh(slice_c, c_gpu)
     return (slice_c, end-start)
Developer: mpitx, Project: psychic-octo-ninja, Lines: 12, Source: vector_sum_cudampi.py
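A caveat on the timing in this example: kernel launches are asynchronous, so the time.time() pair around func() mostly measures launch overhead rather than kernel runtime. CUDA events give the real figure; a minimal sketch:

start, end = cuda.Event(), cuda.Event()
start.record()
func(a_gpu, b_gpu, c_gpu, block=(BLOCK_SIZE, BLOCK_SIZE, 1))
end.record()
end.synchronize()                  # block until the kernel has finished
elapsed_ms = start.time_till(end)  # elapsed time in milliseconds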


Example 20: stepN

    def stepN(self,positions,velocities,n):
        x_gpu = cuda.mem_alloc(positions.nbytes)
        v_gpu = cuda.mem_alloc(velocities.nbytes)

        cuda.memcpy_htod(x_gpu,positions)
        cuda.memcpy_htod(v_gpu,velocities)

        import numpy as np  # local import kept from the original snippet
        self.cuBoris(x_gpu, v_gpu, np.int32(n), block=(1024, 1, 1), grid=(self.numParts // 1024 + 1, 1))

        cuda.memcpy_dtoh(positions,x_gpu)
        cuda.memcpy_dtoh(velocities,v_gpu)
Developer: npbarnes, Project: Cuda-hybrid-PIC, Lines: 12, Source: Electromagnetism.py



Note: The pycuda.driver.mem_alloc examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs; the snippets were selected from open-source projects contributed by their original authors. Copyright of the source code remains with those authors; consult each project's License before redistributing or reusing the code. Do not reproduce without permission.


