
Python cuda.to_device Function Code Examples


This article collects typical usage examples of the numba.cuda.to_device function in Python. If you are wondering what to_device does, how to call it, or what real-world code that uses it looks like, the curated examples below should help.



Twenty code examples of the to_device function are shown below, ordered roughly by their popularity in open-source projects.
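
Before the collected examples, here is a minimal, self-contained sketch of the basic round trip (assuming a CUDA-capable GPU and a working numba installation; the scale kernel is a toy written just for this illustration): copy a host NumPy array to the device with cuda.to_device, launch a kernel on the resulting device array, and copy the result back with copy_to_host.

import numpy as np
from numba import cuda

@cuda.jit
def scale(arr, factor):
    # each thread scales one element in place
    i = cuda.grid(1)
    if i < arr.size:
        arr[i] *= factor

host = np.arange(16, dtype=np.float32)
d_arr = cuda.to_device(host)      # explicit host-to-device copy
scale[1, 32](d_arr, 2.0)          # kernel operates on the device array
result = d_arr.copy_to_host()     # explicit device-to-host copy

Passing the device array to the kernel avoids the implicit transfer numba would otherwise perform for a plain NumPy argument on every launch; most of the examples below exist to exploit exactly that.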

Example 1: test_func

    def test_func(self):
        A = np.array(np.random.random((n, n)), dtype=np.float32)
        B = np.array(np.random.random((n, n)), dtype=np.float32)
        C = np.empty_like(A)

        print("N = %d x %d" % (n, n))

        s = time()
        stream = cuda.stream()
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream)
            dB = cuda.to_device(B, stream)
            dC = cuda.to_device(C, stream)
            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
            dC.copy_to_host(C, stream)

        e = time()
        tcuda = e - s

        # Host compute
        Amat = np.matrix(A)
        Bmat = np.matrix(B)

        s = time()
        Cans = Amat * Bmat
        e = time()
        tcpu = e - s

        print('cpu:  %f' % tcpu)
        print('cuda: %f' % tcuda)
        print('cuda speedup: %.2fx' % (tcpu / tcuda))

        # Check result
        self.assertTrue(np.allclose(C, Cans))
Author: ASPP | Project: numba | Lines: 34 | Source: test_matmul.py


Example 2: test_laplace_small

    def test_laplace_small(self):
        NN = 256
        NM = 256

        A = np.zeros((NN, NM), dtype=np.float64)
        Anew = np.zeros((NN, NM), dtype=np.float64)

        n = NN
        m = NM
        iter_max = 1000

        tol = 1.0e-6
        error = 1.0

        for j in range(n):
            A[j, 0] = 1.0
            Anew[j, 0] = 1.0

        print("Jacobi relaxation Calculation: %d x %d mesh" % (n, m))

        timer = time.time()
        iter = 0

        blockdim = (tpb, tpb)
        griddim = (NN // blockdim[0], NM // blockdim[1])

        error_grid = np.zeros(griddim)

        stream = cuda.stream()

        dA = cuda.to_device(A, stream)          # to device and don't come back
        dAnew = cuda.to_device(Anew, stream)    # to device and don't come back
        derror_grid = cuda.to_device(error_grid, stream)

        while error > tol and iter < iter_max:
            self.assertTrue(error_grid.dtype == np.float64)

            jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

            derror_grid.copy_to_host(error_grid, stream=stream)


            # error_grid is available on host
            stream.synchronize()

            error = np.abs(error_grid).max()

            # swap dA and dAnew
            tmp = dA
            dA = dAnew
            dAnew = tmp

            if iter % 100 == 0:
                print("%5d, %0.6f (elapsed: %f s)" %
                      (iter, error, time.time() - timer))

            iter += 1

        runtime = time.time() - timer
        print(" total: %f s" % runtime)
Author: ASPP | Project: numba | Lines: 60 | Source: test_laplace.py


Example 3: monte_carlo_pricer

def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
Author: ContinuumIO | Project: numbapro-examples | Lines: 27 | Source: pricer_cuda_vectorize.py


Example 4: test_func

    def test_func(self):
        np.random.seed(42)
        A = np.array(np.random.random((n, n)), dtype=np.float32)
        B = np.array(np.random.random((n, n)), dtype=np.float32)
        C = np.empty_like(A)

        s = time()
        stream = cuda.stream()
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream)
            dB = cuda.to_device(B, stream)
            dC = cuda.to_device(C, stream)
            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
            dC.copy_to_host(C, stream)

        e = time()
        tcuda = e - s

        # Host compute
        s = time()
        Cans = np.dot(A, B)
        e = time()
        tcpu = e - s

        # Check result
        np.testing.assert_allclose(C, Cans, rtol=1e-5)
Author: GaZ3ll3 | Project: numba | Lines: 26 | Source: test_matmul.py


Example 5: stupidconv_gpu

def stupidconv_gpu(img, filt, padval):
    """
    does convolution without using FFT because FFT is pissing me off and giving me weird answers
    :param img:
    :param filt:
    :param padval:
    :return:
    """
    cuda.close()
    cuda.select_device(1)
    # get the number of nonzero entries in the filter for later averaging of result
    filt_nnz = np.count_nonzero(filt)

    # pad the images
    s_filt = filt.shape
    s_img = img.shape

    # appropriate padding depends on context
    # pad with filt size all around img
    pad_img = np.ones((s_img[0] + (2 * s_filt[0]), s_img[1] + (2 * s_filt[1])), dtype=np.float32) * padval

    pad_img[s_filt[0]: s_img[0] + s_filt[0], s_filt[1]: s_img[1] + s_filt[1]] = img

    output = np.zeros(pad_img.shape, dtype=np.float32)

    d_pad_img = cuda.to_device(pad_img)
    d_filt = cuda.to_device(filt)
    d_output = cuda.to_device(output)

    stupidconv_gpu_helper(d_pad_img, d_filt, s_img[0], s_img[1], s_filt[0], s_filt[1], d_output)

    output = d_output.copy_to_host()
    output = output[s_filt[0]:s_filt[0] + s_img[0], s_filt[1]:s_filt[1] + s_img[1]]

    return output / filt_nnz
Author: e1morganUCSD | Project: pyLapdog | Lines: 35 | Source: gpufunc.py


Example 6: test_with_context

    def test_with_context(self):

        @cuda.jit
        def vector_add_scalar(arr, val):
            i = cuda.grid(1)
            if i < arr.size:
                arr[i] += val


        hostarr = np.arange(10, dtype=np.float32)
        with cuda.gpus[0]:
            arr1 = cuda.to_device(hostarr)

        with cuda.gpus[1]:
            arr2 = cuda.to_device(hostarr)

        with cuda.gpus[0]:
            vector_add_scalar[1, 10](arr1, 1)

        with cuda.gpus[1]:
            vector_add_scalar[1, 10](arr2, 2)

        with cuda.gpus[0]:
            np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 1))

        with cuda.gpus[1]:
            np.testing.assert_equal(arr2.copy_to_host(), (hostarr + 2))

        with cuda.gpus[0]:
            # Transfer from GPU1 to GPU0
            arr1.copy_to_device(arr2)
            np.testing.assert_equal(arr1.copy_to_host(), (hostarr + 2))
Author: Alexhuszagh | Project: numba | Lines: 32 | Source: test_multigpu.py


Example 7: test_for_pre

    def test_for_pre(self):
        """Test issue with loop not running due to bad sign-extension at the for loop
        precondition.
        """

        @cuda.jit(argtypes=[float32[:, :], float32[:, :], float32[:]])
        def diagproduct(c, a, b):
            startX, startY = cuda.grid(2)
            gridX = cuda.gridDim.x * cuda.blockDim.x
            gridY = cuda.gridDim.y * cuda.blockDim.y
            height = c.shape[0]
            width = c.shape[1]

            for x in range(startX, width, (gridX)):
                for y in range(startY, height, (gridY)):
                    c[y, x] = a[y, x] * b[x]

        N = 8

        A, B = generate_input(N)

        F = np.empty(A.shape, dtype=A.dtype)

        blockdim = (32, 8)
        griddim = (1, 1)

        dA = cuda.to_device(A)
        dB = cuda.to_device(B)
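        # copy=False allocates device memory matching F's shape and dtype without copying the uninitialized host data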
        dF = cuda.to_device(F, copy=False)
        diagproduct[griddim, blockdim](dF, dA, dB)

        E = np.dot(A, np.diag(B))
        np.testing.assert_array_almost_equal(dF.copy_to_host(), E)
Author: cpcloud | Project: numba | Lines: 33 | Source: test_nondet.py


Example 8: setup

 def setup(self):
     self.stream = cuda.stream()
     self.f32 = np.zeros(self.n, dtype=np.float32)
     self.d_f32 = cuda.to_device(self.f32, self.stream)
     self.f64 = np.zeros(self.n, dtype=np.float64)
     self.d_f64 = cuda.to_device(self.f64, self.stream)
     self.stream.synchronize()
Author: gmarkall | Project: numba-benchmark | Lines: 7 | Source: bench_cuda.py


Example 9: test_gufunc_stream

    def test_gufunc_stream(self):
        #cuda.driver.flush_pending_free()
        matrix_ct = 1001 # an odd number to test thread/block division in CUDA
        A = np.arange(matrix_ct * 2 * 4, dtype=np.float32).reshape(matrix_ct, 2,
                                                                   4)
        B = np.arange(matrix_ct * 4 * 5, dtype=np.float32).reshape(matrix_ct, 4,
                                                                   5)

        ts = time()
        stream = cuda.stream()
        dA = cuda.to_device(A, stream)
        dB = cuda.to_device(B, stream)

        dC = cuda.device_array(shape=(1001, 2, 5), dtype=A.dtype, stream=stream)
        dC = gufunc(dA, dB, out=dC, stream=stream)
        C = dC.copy_to_host(stream=stream)
        stream.synchronize()

        tcuda = time() - ts

        ts = time()
        Gold = ut.matrix_multiply(A, B)
        tcpu = time() - ts

        stream_speedups.append(tcpu / tcuda)

        self.assertTrue(np.allclose(C, Gold))
Author: GaZ3ll3 | Project: numba | Lines: 27 | Source: test_gufunc.py


Example 10: driver

def driver(niters, seed):
    curr = seed
    nxt = np.zeros(len(seed))
    nxt[0] = seed[0]
    nxt[-1] = seed[-1]

    start_time = time.time()

    threads_per_block = 256
    blocks_per_grid = int(math.ceil(float(len(curr) - 2) / threads_per_block))

    d_nxt = cuda.to_device(nxt)
    d_curr = cuda.to_device(curr)
    for iter in range(niters):
        kernel[blocks_per_grid, threads_per_block](d_nxt, d_curr, len(curr) - 2)

        tmp = d_nxt
        d_nxt = d_curr
        d_curr = tmp
    d_curr.copy_to_host(curr)
    elapsed_time = time.time() - start_time

    print('Elapsed time for N=' + str(len(seed) - 2) + ', # iters=' +
            str(niters) + ' is ' + str(elapsed_time) + ' s')
    print(str(float(niters) / elapsed_time) + ' iters / s')

    return curr
Author: agrippa | Project: hpc-bootcamp | Lines: 27 | Source: 1d_iter_avg_solution.py


Example 11: test_func

    def test_func(self):

        @cuda.jit(argtypes=[float32[:, ::1], float32[:, ::1], float32[:, ::1]])
        def cu_square_matrix_mul(A, B, C):
            sA = cuda.shared.array(shape=SM_SIZE, dtype=float32)
            sB = cuda.shared.array(shape=(tpb, tpb), dtype=float32)

            tx = cuda.threadIdx.x
            ty = cuda.threadIdx.y
            bx = cuda.blockIdx.x
            by = cuda.blockIdx.y
            bw = cuda.blockDim.x
            bh = cuda.blockDim.y

            x = tx + bx * bw
            y = ty + by * bh

            acc = float32(0)  # forces all the math to be f32
            for i in range(bpg):
                if x < n and y < n:
                    sA[ty, tx] = A[y, tx + i * tpb]
                    sB[ty, tx] = B[ty + i * tpb, x]

                cuda.syncthreads()

                if x < n and y < n:
                    for j in range(tpb):
                        acc += sA[ty, j] * sB[j, tx]

                cuda.syncthreads()

            if x < n and y < n:
                C[y, x] = acc

        np.random.seed(42)
        A = np.array(np.random.random((n, n)), dtype=np.float32)
        B = np.array(np.random.random((n, n)), dtype=np.float32)
        C = np.empty_like(A)

        s = time()
        stream = cuda.stream()
        with stream.auto_synchronize():
            dA = cuda.to_device(A, stream)
            dB = cuda.to_device(B, stream)
            dC = cuda.to_device(C, stream)
            cu_square_matrix_mul[(bpg, bpg), (tpb, tpb), stream](dA, dB, dC)
            dC.copy_to_host(C, stream)

        e = time()
        tcuda = e - s

        # Host compute
        s = time()
        Cans = np.dot(A, B)
        e = time()
        tcpu = e - s

        # Check result
        np.testing.assert_allclose(C, Cans, rtol=1e-5)
Author: Alexhuszagh | Project: numba | Lines: 59 | Source: test_matmul.py


Example 12: fork_test

def fork_test(q):
    from numba.cuda.cudadrv.error import CudaDriverError
    try:
        cuda.to_device(np.arange(1))
    except CudaDriverError as e:
        q.put(e)
    else:
        q.put(None)
Author: cpcloud | Project: numba | Lines: 8 | Source: test_multiprocessing.py


Example 13: test_devicearray_replace

 def test_devicearray_replace(self):
     N = 100
     array = np.arange(N, dtype=np.int32)
     original = array.copy()
     gpumem = cuda.to_device(array)
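     # to=gpumem copies the new data into the existing device allocation instead of creating a new one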
     cuda.to_device(array * 2, to=gpumem)
     gpumem.copy_to_host(array)
     self.assertTrue((array == original * 2).all())
Author: ASPP | Project: numba | Lines: 8 | Source: test_cuda_ndarray.py


Example 14: test_devicearray_replace

 def test_devicearray_replace(self):
     N = 100
     array = np.arange(N, dtype=np.int32)
     original = array.copy()
     gpumem = cuda.to_device(array)
     cuda.to_device(array * 2, to=gpumem)
     gpumem.copy_to_host(array)
     np.testing.assert_array_equal(array, original * 2)
Author: esc | Project: numba | Lines: 8 | Source: test_cuda_ndarray.py


Example 15: monte_carlo_pricer

def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2
    
    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm)
                for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
               for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)
        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
Author: AngelBerihuete | Project: numbapro-examples | Lines: 58 | Source: pricer_cuda_overlap.py


Example 16: test_devicearray_contiguous_device_strided

    def test_devicearray_contiguous_device_strided(self):
        d = cuda.to_device(np.arange(20))
        arr = np.arange(20)

        with self.assertRaises(ValueError) as e:
            d.copy_to_device(cuda.to_device(arr)[::2])
        self.assertEqual(
            devicearray.errmsg_contiguous_buffer,
            str(e.exception))
Author: esc | Project: numba | Lines: 9 | Source: test_cuda_ndarray.py


Example 17: test_laplace_small

    def test_laplace_small(self):
        if config.ENABLE_CUDASIM:
            NN, NM = 4, 4
            iter_max = 20
        else:
            NN, NM = 256, 256
            iter_max = 1000

        A = np.zeros((NN, NM), dtype=np.float64)
        Anew = np.zeros((NN, NM), dtype=np.float64)

        n = NN
        m = NM

        tol = 1.0e-6
        error = 1.0

        for j in range(n):
            A[j, 0] = 1.0
            Anew[j, 0] = 1.0

        timer = time.time()
        iter = 0

        blockdim = (tpb, tpb)
        griddim = (NN // blockdim[0], NM // blockdim[1])

        error_grid = np.zeros(griddim)

        stream = cuda.stream()

        dA = cuda.to_device(A, stream)          # to device and don't come back
        dAnew = cuda.to_device(Anew, stream)    # to device and don't come back
        derror_grid = cuda.to_device(error_grid, stream)

        while error > tol and iter < iter_max:
            self.assertTrue(error_grid.dtype == np.float64)

            jocabi_relax_core[griddim, blockdim, stream](dA, dAnew, derror_grid)

            derror_grid.copy_to_host(error_grid, stream=stream)


            # error_grid is available on host
            stream.synchronize()

            error = np.abs(error_grid).max()

            # swap dA and dAnew
            tmp = dA
            dA = dAnew
            dAnew = tmp

            iter += 1

        runtime = time.time() - timer
Author: CaptainAL | Project: Spyder | Lines: 56 | Source: test_laplace.py


Example 18: __init__

 def __init__(self, positions, weights):
     self.calculate_forces = cuda.jit(
         argtypes=(float32[:,:], float32[:], float32[:,:])
         )(calculate_forces)
     self.accelerations = np.zeros_like(positions)
     self.n_bodies = len(weights)
     self.stream = cuda.stream()
     self.d_pos = cuda.to_device(positions, self.stream)
     self.d_wei = cuda.to_device(weights, self.stream)
     self.d_acc = cuda.to_device(self.accelerations, self.stream)
     self.stream.synchronize()
Author: gmarkall | Project: numba-benchmark | Lines: 11 | Source: bench_cuda.py


Example 19: test_contigous_2d

    def test_contigous_2d(self):
        ary = np.arange(10)
        cary = ary.reshape(2, 5)
        fary = np.asfortranarray(cary)

        dcary = cuda.to_device(cary)
        dfary = cuda.to_device(fary)
        self.assertTrue(dcary.is_c_contigous())
        self.assertTrue(not dfary.is_c_contigous())
        self.assertTrue(not dcary.is_f_contigous())
        self.assertTrue(dfary.is_f_contigous())
Author: wojons | Project: numba | Lines: 11 | Source: test_array_attr.py


Example 20: lapconv

def lapconv(img, filt, padval):
    """
    Performs FFT-based convolution of the image with the filter.

    :param numpy.core.multiarray.ndarray img: stimulus image to be convolved
    :param numpy.core.multiarray.ndarray filt: filter to convolve with
    :param float padval: value with which to pad the img before convolution
    :return: result of convolution
    :rtype: numpy.core.multiarray.ndarray
    """

    # get the number of nonzero entries in the filter for later dividing of the results
    filt_nnz = np.count_nonzero(filt)

    # pad the images
    s_filt = filt.shape
    s_img = img.shape

    # appropriate padding depends on context
    pad_img = np.ones((s_img[0] + s_filt[0], s_img[1] + s_filt[1])) * padval

    pad_img[0: s_img[0], 0: s_img[1]] = img

    pad_filt = np.zeros((s_img[0] + s_filt[0], s_img[1] + s_filt[1]))

    pad_filt[0: s_filt[0], 0: s_filt[1]] = filt

    # initialize the GPU
    FFTPlan(shape=pad_img.shape, itype=np.complex64, otype=np.complex64)

    # create temporary arrays for holding FFT values
    normtemp1 = np.zeros(pad_img.shape, dtype=np.complex64)
    normtemp2 = np.zeros(pad_img.shape, dtype=np.complex64)

    d_pad_filt = cuda.to_device(pad_filt.astype(np.complex64))
    d_pad_img = cuda.to_device(pad_img.astype(np.complex64))
    d_normtemp1 = cuda.to_device(normtemp1)
    d_normtemp2 = cuda.to_device(normtemp2)

    fft(d_pad_filt, d_normtemp1)
    fft(d_pad_img, d_normtemp2)
    vmult(d_normtemp1, d_normtemp2, out=d_normtemp1)
    ifft(d_normtemp1, d_normtemp2)
    # temp_out = (cuda.fft.ifft_inplace(cuda.fft.fft_inplace(pad_img)) * cuda.fft.fft_inplace(pad_filt)).real
    temp_out = d_normtemp2.copy_to_host().real

    # extract the appropriate portion of the filtered image
    filtered = temp_out[(s_filt[0] // 2): (s_filt[0] // 2) + s_img[0], (s_filt[1] // 2): (s_filt[1] // 2) + s_img[1]]

    # divide each value by the number of nonzero entries in the filter (and image?!?), so we get an average of all the
    # values
    filtered /= (filt_nnz * s_img[0] * s_img[1])

    return filtered
Author: e1morganUCSD | Project: pyLapdog | Lines: 54 | Source: gpufunc.py



Note: The numba.cuda.to_device examples in this article were compiled by the 纯净天空 site from source code and documentation platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers; copyright remains with the original authors, and reuse or redistribution should follow each project's license. Do not repost without permission.

