C++ CUDA_CHECK Function Code Examples


This article collects typical usage examples of the CUDA_CHECK function in C++. If you have been wondering exactly how C++ CUDA_CHECK works, how to use it, or what real code that uses it looks like, the hand-picked examples below should help.



A total of 20 CUDA_CHECK code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better C++ code examples.
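
None of the examples below show the definition of CUDA_CHECK itself. For reference, the following is a minimal, self-contained sketch of a runtime-API error-checking macro; projects such as Caffe route the failure through their logging framework (for example glog's CHECK_EQ together with cudaGetErrorString) instead of fprintf/abort, but the overall structure is the same.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Minimal sketch of a runtime-API error-checking macro (not the exact
// definition used by any particular project below).
#define CUDA_CHECK(condition)                                               \
  do {                                                                      \
    cudaError_t error_ = (condition);                                       \
    if (error_ != cudaSuccess) {                                            \
      std::fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(error_));                             \
      std::abort();                                                         \
    }                                                                       \
  } while (0)

// Usage mirrors the examples below: wrap every CUDA runtime-API call.
//   CUDA_CHECK(cudaSetDevice(0));
//   CUDA_CHECK(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));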

Example 1: StopInternalThread

void InternalThread::StartInternalThread() {
  // TODO switch to failing once Caffe prefetch thread is persistent.
  // Threads should not be started and stopped repeatedly.
  // CHECK(!is_started());
  StopInternalThread();

#ifndef CPU_ONLY
  CUDA_CHECK(cudaGetDevice(&device_));
#endif
  mode_ = Caffe::mode();
  rand_seed_ = caffe_rng_rand();
  solver_count_ = Caffe::solver_count();
  root_solver_ = Caffe::root_solver();

  try {
    thread_.reset(new boost::thread(&InternalThread::entry, this));
  } catch (std::exception& e) {
    CHECK(false) << e.what();
  }
}
Developer: XinLiuNvidia, Project: caffe, Lines: 20, Source: internal_thread.cpp


Example 2: transform

        void transform(Param<T> out, CParam<T> in, CParam<float> tf,
                       const bool inverse)
        {
            const dim_type nimages = in.dims[2];
            // Multiplied in src/backend/transform.cpp
            const dim_type ntransforms = out.dims[2] / in.dims[2];

            // Copy transform to constant memory.
            CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
                                          cudaMemcpyDeviceToDevice));

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            if (nimages > 1)     { blocks.x *= nimages;   }
            if (ntransforms > 1) { blocks.y *= ntransforms; }

            if(inverse) {
                transform_kernel<T, true><<<blocks, threads>>>(out, in, nimages, ntransforms);
            } else {
Developer: EasonYi, Project: arrayfire, Lines: 20, Source: transform.hpp


Example 3: LOG

float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
  if (Caffe::mode() == Caffe::GPU) {
#ifndef CPU_ONLY
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
#else
      NO_GPU;
#endif
  } else {
    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
  }
  return elapsed_milliseconds_;
}
Developer: azrael417, Project: caffe, Lines: 20, Source: benchmark.cpp


Example 4: CUDA_CHECK

void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensor4dDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensor4dDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }
}
Developer: 13331151, Project: deeplab-public, Lines: 41, Source: cudnn_conv_layer.cpp


Example 5: morph

Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");

    if (mdims[0] > 19) CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
        cudaMemcpyDeviceToDevice, cuda::getActiveStream()));

    if (isDilation)
        kernel::morph<T, true>(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
Developer: 9prady9, Project: arrayfire, Lines: 21, Source: morph_impl.hpp


Example 6: caffe_copy

void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
  if (X != Y) {
    // If there is more than one OpenMP thread (i.e. we are inside an active
    // parallel region), then checking Caffe::mode() can create an additional
    // GPU context.
    if (
#ifdef _OPENMP
        (omp_in_parallel() == 0) &&
#endif
        (Caffe::mode() == Caffe::GPU)) {
#ifndef CPU_ONLY
      // NOLINT_NEXT_LINE(caffe/alt_fn)
      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
#else
      NO_GPU;
#endif
    } else {
      caffe_cpu_copy<Dtype>(N, X, Y);
    }
  }
}
Developer: crobertob, Project: caffe, Lines: 21, Source: math_functions.cpp


Example 7: switch

// Move the data to the CPU
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;
    break;
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
      own_cpu_data_ = true;
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
Developer: clarencezhang, Project: caffe-windows-multilabels, Lines: 22, Source: syncedmem.cpp


Example 8: normalizeGPULaunch

/*
// Launch GPU kernel of normalize
//
// API
// int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
           CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
           CUstream stream);
// INPUT
// alfa
// dev_map_in
// dev_norm
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
        CUstream stream)
{
    int sizeX, sizeY;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;

    void *normalize_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map,
            (void *) &dev_map_out->map, (void *) &sizeX, (void *) &sizeY,
            (void *) &alfa, };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = NUM_SECTOR * 2;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(normalizeAndTruncate_func[0], block_num_x, block_num_y,
            block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, normalize_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(normalizeAndTruncate)");

    return LATENT_SVM_OK;
}
Developer: ZenzouFuruta, Project: Autoware, Lines: 57, Source: featurepyramid_gpu.cpp
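
Note that the Autoware examples (this one, Example 9, and Example 13) use a two-argument CUDA_CHECK(res, msg) that validates a CUresult returned by the driver API (cuLaunchKernel), rather than wrapping a runtime-API call directly. A minimal sketch of that style is shown below, assuming the driver-API helper cuGetErrorString; the actual macro in the Autoware sources may differ.

#include <cstdio>
#include <cstdlib>
#include <cuda.h>

// Minimal sketch of a driver-API style check: report and exit on any
// CUresult other than CUDA_SUCCESS.
#define CUDA_CHECK(res, msg)                                      \
  do {                                                            \
    CUresult status_ = (res);                                     \
    if (status_ != CUDA_SUCCESS) {                                \
      const char* err_str_ = NULL;                                \
      cuGetErrorString(status_, &err_str_);                       \
      std::fprintf(stderr, "%s failed: %s\n", (msg),              \
                   err_str_ ? err_str_ : "unknown error");        \
      std::exit(EXIT_FAILURE);                                    \
    }                                                             \
  } while (0)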


Example 9: PCAFeatureMapsAddNullableBorderGPULaunch

/*
// Launch GPU kernel of PCA feature maps
//
// API
// int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
           CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
           CUstream stream);
// INPUT
// dev_map_in
// bx
// by
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
        CUstream stream)
{
    int sizeX, sizeY, p;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    p = dev_map_in->numFeatures;

    void *pca_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_map_out->map, (void *) &sizeX,
            (void *) &sizeY, (void *) &p, (void *) &bx, (void *) &by };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(PCAFeatureMapsAddNullableBorder_func[0], block_num_x,
            block_num_y, block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, pca_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(PCAFeatureMaps)");

    return LATENT_SVM_OK;
}
Developer: ZenzouFuruta, Project: Autoware, Lines: 57, Source: featurepyramid_gpu.cpp


Example 10: switch

inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    CHECK(cpu_ptr_ != 0) << "size " << size_;
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    break;
#if 0
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
#endif
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
Developer: Devy001, Project: Caffe-mini, Lines: 22, Source: syncedmem.cpp


Example 11: pinnedAlloc

    T* pinnedAlloc(const size_t &elements)
    {
        managerInit();
        T* ptr = NULL;
        // Round the allocation up to the next megabyte. Creating pinned
        // memory has a high overhead, so we prefer larger, reusable blocks.
        size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576;

        if (elements > 0) {

            // FIXME: Add better checks for garbage collection
            // Perhaps look at total memory available as a metric
            if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
                pinnedGarbageCollect();
            }

            for(mem_iter iter = pinned_maps.begin();
                iter != pinned_maps.end(); ++iter) {

                mem_info info = iter->second;
                if (info.is_free && info.bytes == alloc_bytes) {
                    iter->second.is_free = false;
                    pinned_used_bytes += alloc_bytes;
                    return (T *)iter->first;
                }
            }

            // Perform garbage collection if memory can not be allocated
            if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
                pinnedGarbageCollect();
                CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
            }

            mem_info info = {false, false, alloc_bytes};
            pinned_maps[ptr] = info;
            pinned_used_bytes += alloc_bytes;
        }
        return (T*)ptr;
    }
Developer: hxiaox, Project: arrayfire, Lines: 39, Source: memory.cpp


Example 12: LOG

float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING)<< "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
#ifdef USE_CUDA
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
            stop_gpu_));
    // CUDA only measures milliseconds
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
  } else {
#endif
    elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
#ifdef USE_CUDA
  }
#endif
  return elapsed_microseconds_;
}
Developer: rickyHong, Project: CaffeForOpenCL, Lines: 22, Source: benchmark.cpp


Example 13: calculateNormGPULaunch

/*
// Launch GPU kernel of calculate norm
//
// API
//int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
          CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
// INPUT
// dev_map_in
// stream
// OUTPUT
// dev_norm
// RESULT
// Error status
*/
int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
{
    int sizeX, sizeY, xp;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    xp = dev_map_in->numFeatures;

    void *calc_norm_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map, (void *) &sizeX,
            (void *) &sizeY, (void *) &xp, };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(calculateNorm_func[0], block_num_x, block_num_y,
            block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, calc_norm_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(calcuateNorm)");

    return LATENT_SVM_OK;
}
Developer: ZenzouFuruta, Project: Autoware, Lines: 53, Source: featurepyramid_gpu.cpp


Example 14: CUDA_CHECK

void MPIComm::ThreadFunc(int device){
#ifndef CPU_ONLY
  //LOG(ERROR)<<"device_id is "<<device;
  CUDA_CHECK(cudaSetDevice(device));
#endif
  started_.store(true);
  MPIJob job;
  while (true){
    mutex::scoped_lock lock(queue_mutex_);
    while( task_queue_.empty() && IsRunning()){
      DLOG(INFO)<<"no job running, waiting on cond";
      cond_work_.wait(lock);
    }
    lock.unlock();

    DLOG(INFO)<<"Cond fulfilled, dispatching job";
    if (IsRunning()){
      job = task_queue_.front();
      DLOG(INFO)<<task_queue_.size();
      DispatchJob(job);
      mutex::scoped_lock pop_lock(queue_mutex_);
      task_queue_.pop();
      pop_lock.unlock();
      cond_finish_.notify_one();
      DLOG(INFO)<<"job finished, poped taskqueue";
    }else{
      break;
    }

  }

  // finish remaining jobs
  while (!task_queue_.empty()){
    boost::lock_guard<mutex> lock(queue_mutex_);
    job = task_queue_.front();
    task_queue_.pop();
    DispatchJob(job);
  }
}
Developer: xiangqiaolxq, Project: caffe-parallel, Lines: 39, Source: mpijob.cpp


Example 15: morph

Array<T>  morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        AF_ERROR("Only square masks are supported in cuda morph currently", AF_ERR_SIZE);
    if (mdims[0] > 19)
        AF_ERROR("Upto 19x19 square kernels are only supported in cuda currently", AF_ERR_SIZE);

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T),
                                       0, cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    if (isDilation)
        kernel::morph<T, true >(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
Developer: hxiaox, Project: arrayfire, Lines: 23, Source: morph_impl.hpp


Example 16: memAlloc

    T* memAlloc(const size_t &elements)
    {
        int n = getActiveDeviceId();
        T* ptr = NULL;
        size_t alloc_bytes = divup(sizeof(T) * elements, 1024) * 1024;

        if (elements > 0) {

            // FIXME: Add better checks for garbage collection
            // Perhaps look at total memory available as a metric
            if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes >= MAX_BYTES) {
                garbageCollect();
            }

            for(mem_iter iter = memory_maps[n].begin();
                iter != memory_maps[n].end(); iter++) {

                mem_info info = iter->second;
                if (info.is_free && info.bytes == alloc_bytes) {
                    iter->second.is_free = false;
                    used_bytes += alloc_bytes;
                    return (T *)iter->first;
                }
            }

            // Perform garbage collection if memory can not be allocated
            if (cudaMalloc((void **)&ptr, alloc_bytes) != cudaSuccess) {
                garbageCollect();
                CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes));
            }

            mem_info info = {false, alloc_bytes};
            memory_maps[n][ptr] = info;
            used_bytes += alloc_bytes;
        }
        return ptr;
    }
Developer: maolingao, Project: arrayfire, Lines: 37, Source: memory.cpp


Example 17: CUDA_CHECK

SocketBuffer* SocketBuffer::Read(bool data) {
  // Pop the message from local queue
  QueuedMessage* qm = NULL;
  if(data) {
    qm = reinterpret_cast<QueuedMessage*>
      (this->channel_->receive_queue.pop());
#ifndef CPU_ONLY
    // Copy the received buffer to GPU memory
    CUDA_CHECK(cudaMemcpy(this->addr(), qm->buffer,  // NOLINT(caffe/alt_fn)
               qm->size, cudaMemcpyHostToDevice));  // NOLINT(caffe/alt_fn)
#else
    //caffe_copy(qm->size, qm->buffer, this->addr_);
    memcpy(this->addr_, qm->buffer, qm->size);
#endif
  } else {
    qm = reinterpret_cast<QueuedMessage*>
      (this->channel_->receive_queue_ctrl.pop());
  }
  // Free up the buffer and the wrapper object
  if(data)
    delete qm->buffer;
  delete qm;
  return this;
}
Developer: Aravindreddy986, Project: CaffeOnSpark, Lines: 24, Source: socket.cpp


Example 18: CUDA_CHECK

P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
    int initial_device;
    CUDA_CHECK(cudaGetDevice(&initial_device));
    const int self = solver_->param().device_id();
    CUDA_CHECK(cudaSetDevice(self));

    if (parent_) {
        CUDA_CHECK(cudaFree(parent_grads_));
        const int peer = parent_->solver_->param().device_id();
        int access;
        CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
        if (access) {
            CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
        }
    }

    CUDA_CHECK(cudaSetDevice(initial_device));
#endif
}
Developer: flair2005, Project: Caffe-Solution, Lines: 20, Source: parallel.cpp


Example 19: remaining

void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {
#ifndef CPU_ONLY
    vector<int> remaining(devices);

    // Depth for reduction tree
    int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));

    // Group GPUs by board
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            for (int j = i + 1; j < remaining.size(); ++j) {
                cudaDeviceProp a, b;
                CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));
                CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));
                if (a.isMultiGpuBoard && b.isMultiGpuBoard) {
                    if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {
                        pairs->push_back(DevicePair(remaining[i], remaining[j]));
                        DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j];
                        remaining.erase(remaining.begin() + j);
                        break;
                    }
                }
            }
        }
    }
    ostringstream s;
    for (int i = 0; i < remaining.size(); ++i) {
        s << (i ? ", " : "") << remaining[i];
    }
    DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();

    // Group by P2P accessibility
    remaining_depth = ceil(log2(remaining.size()));
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            for (int j = i + 1; j < remaining.size(); ++j) {
                int access;
                CUDA_CHECK(
                    cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j]));
                if (access) {
                    pairs->push_back(DevicePair(remaining[i], remaining[j]));
                    DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j];
                    remaining.erase(remaining.begin() + j);
                    break;
                }
            }
        }
    }
    s.str("");
    for (int i = 0; i < remaining.size(); ++i) {
        s << (i ? ", " : "") << remaining[i];
    }
    DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();

    // Group remaining
    remaining_depth = ceil(log2(remaining.size()));
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            pairs->push_back(DevicePair(remaining[i], remaining[i + 1]));
            DLOG(INFO) << "Remaining pair: " << remaining[i] << ":"
                       << remaining[i + 1];
            remaining.erase(remaining.begin() + i + 1);
        }
    }

    // Should only be the parent node remaining
    CHECK_EQ(remaining.size(), 1);

    pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));

    CHECK(pairs->size() == devices.size());
    for (int i = 0; i < pairs->size(); ++i) {
        CHECK((*pairs)[i].parent() != (*pairs)[i].device());
        for (int j = i + 1; j < pairs->size(); ++j) {
            CHECK((*pairs)[i].device() != (*pairs)[j].device());
        }
    }
#else
    NO_GPU;
#endif
}
Developer: flair2005, Project: Caffe-Solution, Lines: 81, Source: parallel.cpp


Example 20: orb

void orb(unsigned* out_feat,
         float** d_x,
         float** d_y,
         float** d_score,
         float** d_ori,
         float** d_size,
         unsigned** d_desc,
         std::vector<unsigned>& feat_pyr,
         std::vector<float*>& d_x_pyr,
         std::vector<float*>& d_y_pyr,
         std::vector<unsigned>& lvl_best,
         std::vector<float>& lvl_scl,
         std::vector<CParam<T> >& img_pyr,
         const float fast_thr,
         const unsigned max_feat,
         const float scl_fctr,
         const unsigned levels)
{
    unsigned patch_size = REF_PAT_SIZE;

    unsigned max_levels = feat_pyr.size();

    // In future implementations, the user will be capable of passing his
    // distribution instead of using the reference one
    //CUDA_CHECK(cudaMemcpyToSymbol(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0, cudaMemcpyHostToDevice));

    std::vector<float*> d_score_pyr(max_levels);
    std::vector<float*> d_ori_pyr(max_levels);
    std::vector<float*> d_size_pyr(max_levels);
    std::vector<unsigned*> d_desc_pyr(max_levels);
    std::vector<unsigned*> d_idx_pyr(max_levels);

    unsigned total_feat = 0;

    // Calculate a separable Gaussian kernel
    unsigned gauss_len = 9;
    convAccT* h_gauss = new convAccT[gauss_len];
    gaussian1D(h_gauss, gauss_len, 2.f);
    Param<convAccT> gauss_filter;
    gauss_filter.dims[0] = gauss_len;
    gauss_filter.strides[0] = 1;

    for (int k = 1; k < 4; k++) {
        gauss_filter.dims[k] = 1;
        gauss_filter.strides[k] = gauss_filter.dims[k - 1] * gauss_filter.strides[k - 1];
    }

    dim_type gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
    gauss_filter.ptr = memAlloc<convAccT>(gauss_elem);
    CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(convAccT), cudaMemcpyHostToDevice));

    delete[] h_gauss;

    for (int i = 0; i < (int)max_levels; i++) {
        if (feat_pyr[i] == 0 || lvl_best[i] == 0) {
            if (i > 0)
                memFree((T*)img_pyr[i].ptr);
            continue;
        }

        unsigned* d_usable_feat = memAlloc<unsigned>(1);
        CUDA_CHECK(cudaMemset(d_usable_feat, 0, sizeof(unsigned)));

        float* d_x_harris = memAlloc<float>(feat_pyr[i]);
        float* d_y_harris = memAlloc<float>(feat_pyr[i]);
        float* d_score_harris = memAlloc<float>(feat_pyr[i]);

        // Calculate Harris responses
        // Good block_size >= 7 (must be an odd number)
        dim3 threads(THREADS_X, THREADS_Y);
        dim3 blocks(divup(feat_pyr[i], threads.x), 1);
        harris_response<T,false><<<blocks, threads>>>(d_x_harris, d_y_harris, d_score_harris, NULL,
                                                      d_x_pyr[i], d_y_pyr[i], NULL,
                                                      feat_pyr[i], d_usable_feat,
                                                      img_pyr[i], 7, 0.04f, patch_size);
        POST_LAUNCH_CHECK();

        unsigned usable_feat = 0;
        CUDA_CHECK(cudaMemcpy(&usable_feat, d_usable_feat, sizeof(unsigned), cudaMemcpyDeviceToHost));

        memFree(d_x_pyr[i]);
        memFree(d_y_pyr[i]);
        memFree(d_usable_feat);

        feat_pyr[i] = usable_feat;

        if (feat_pyr[i] == 0) {
            memFree(d_x_harris);
            memFree(d_y_harris);
            memFree(d_score_harris);
            if (i > 0)
                memFree((T*)img_pyr[i].ptr);
            continue;
        }

        Param<float> harris_sorted;
        Param<unsigned> harris_idx;

        harris_sorted.dims[0] = harris_idx.dims[0] = feat_pyr[i];
        harris_sorted.strides[0] = harris_idx.strides[0] = 1;
//......... part of the code is omitted here .........
Developer: pavanky, Project: arrayfire, Lines: 101, Source: orb.hpp



Note: The CUDA_CHECK examples in this article were collected from source code hosted on platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors, and distribution or use should follow each project's license. Do not republish without permission.

