C++ CUDA_CHECK Function Code Examples


This article collects typical usage examples of the CUDA_CHECK function in C++. If you have been wondering exactly how C++ CUDA_CHECK works, how to use it, or what real code that uses it looks like, the hand-picked examples below should help.



A total of 20 CUDA_CHECK code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better C++ code examples.
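
None of the examples below show the definition of CUDA_CHECK itself. For reference, the following is a minimal, self-contained sketch of a runtime-API error-checking macro; projects such as Caffe route the failure through their logging framework (for example glog's CHECK_EQ together with cudaGetErrorString) instead of fprintf/abort, but the overall structure is the same.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Minimal sketch of a runtime-API error-checking macro (not the exact
// definition used by any particular project below).
#define CUDA_CHECK(condition)                                               \
  do {                                                                      \
    cudaError_t error_ = (condition);                                       \
    if (error_ != cudaSuccess) {                                            \
      std::fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                   cudaGetErrorString(error_));                             \
      std::abort();                                                         \
    }                                                                       \
  } while (0)

// Usage mirrors the examples below: wrap every CUDA runtime-API call.
//   CUDA_CHECK(cudaSetDevice(0));
//   CUDA_CHECK(cudaMemcpy(dst, src, bytes, cudaMemcpyDeviceToHost));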

Example 1: StopInternalThread

void InternalThread::StartInternalThread() {
  // TODO switch to failing once Caffe prefetch thread is persistent.
  // Threads should not be started and stopped repeatedly.
  // CHECK(!is_started());
  StopInternalThread();

#ifndef CPU_ONLY
  CUDA_CHECK(cudaGetDevice(&device_));
#endif
  mode_ = Caffe::mode();
  rand_seed_ = caffe_rng_rand();
  solver_count_ = Caffe::solver_count();
  root_solver_ = Caffe::root_solver();

  try {
    thread_.reset(new boost::thread(&InternalThread::entry, this));
  } catch (std::exception& e) {
    CHECK(false) << e.what();
  }
}
Developer: XinLiuNvidia, Project: caffe, Lines: 20, Source: internal_thread.cpp


Example 2: transform

        void transform(Param<T> out, CParam<T> in, CParam<float> tf,
                       const bool inverse)
        {
            const dim_type nimages = in.dims[2];
            // Multiplied in src/backend/transform.cpp
            const dim_type ntransforms = out.dims[2] / in.dims[2];

            // Copy transform to constant memory.
            CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
                                          cudaMemcpyDeviceToDevice));

            dim3 threads(TX, TY, 1);
            dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));

            if (nimages > 1)     { blocks.x *= nimages;   }
            if (ntransforms > 1) { blocks.y *= ntransforms; }

            if(inverse) {
                transform_kernel<T, true><<<blocks, threads>>>(out, in, nimages, ntransforms);
            } else {
Developer: EasonYi, Project: arrayfire, Lines: 20, Source: transform.hpp


Example 3: LOG

float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
  if (Caffe::mode() == Caffe::GPU) {
#ifndef CPU_ONLY
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
#else
      NO_GPU;
#endif
  } else {
    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
  }
  return elapsed_milliseconds_;
}
Developer: azrael417, Project: caffe, Lines: 20, Source: benchmark.cpp


Example 4: CUDA_CHECK

void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);
  // Initialize CUDA streams and cuDNN.
  stream_         = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_         = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];

  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; g++) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Set the indexing parameters.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Create filter descriptor.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // Create tensor descriptor(s) for data and corresponding convolution(s).
  for (int i = 0; i < bottom.size(); i++) {
    cudnnTensor4dDescriptor_t bottom_desc;
    cudnn::createTensor4dDesc<Dtype>(&bottom_desc);
    bottom_descs_.push_back(bottom_desc);
    cudnnTensor4dDescriptor_t top_desc;
    cudnn::createTensor4dDesc<Dtype>(&top_desc);
    top_descs_.push_back(top_desc);
    cudnnConvolutionDescriptor_t conv_desc;
    cudnn::createConvolutionDesc<Dtype>(&conv_desc);
    conv_descs_.push_back(conv_desc);
  }

  // Tensor descriptor for bias.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }
}
Developer: 13331151, Project: deeplab-public, Lines: 41, Source: cudnn_conv_layer.cpp


Example 5: morph

Array<T> morph(const Array<T> &in, const Array<T> &mask) {
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        CUDA_NOT_SUPPORTED("Rectangular masks are not supported");

    if (mdims[0] > 19) CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(
        kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
        cudaMemcpyDeviceToDevice, cuda::getActiveStream()));

    if (isDilation)
        kernel::morph<T, true>(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
Developer: 9prady9, Project: arrayfire, Lines: 21, Source: morph_impl.hpp


Example 6: caffe_copy

void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
  if (X != Y) {
    // If there is more than one OpenMP thread (i.e. we are inside an active
    // parallel region), then checking Caffe::mode() can create an additional
    // GPU context.
    if (
#ifdef _OPENMP
        (omp_in_parallel() == 0) &&
#endif
        (Caffe::mode() == Caffe::GPU)) {
#ifndef CPU_ONLY
      // NOLINT_NEXT_LINE(caffe/alt_fn)
      CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
#else
      NO_GPU;
#endif
    } else {
      caffe_cpu_copy<Dtype>(N, X, Y);
    }
  }
}
Developer: crobertob, Project: caffe, Lines: 21, Source: math_functions.cpp


Example 7: switch

// Move the data to the CPU
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;
    break;
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
      own_cpu_data_ = true;
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
Developer: clarencezhang, Project: caffe-windows-multilabels, Lines: 22, Source: syncedmem.cpp


Example 8: normalizeGPULaunch

/*
// Launch GPU kernel of normalize
//
// API
// int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
           CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
           CUstream stream);
// INPUT
// alfa
// dev_map_in
// dev_norm
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
        CUstream stream)
{
    int sizeX, sizeY;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;

    void *normalize_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map,
            (void *) &dev_map_out->map, (void *) &sizeX, (void *) &sizeY,
            (void *) &alfa, };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = NUM_SECTOR * 2;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(normalizeAndTruncate_func[0], block_num_x, block_num_y,
            block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, normalize_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(normalizeAndTruncate)");

    return LATENT_SVM_OK;
}
Developer: ZenzouFuruta, Project: Autoware, Lines: 57, Source: featurepyramid_gpu.cpp
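
Note that the Autoware examples (this one, Example 9, and Example 13) use a two-argument CUDA_CHECK(res, msg) that validates a CUresult returned by the driver API (cuLaunchKernel), rather than wrapping a runtime-API call directly. A minimal sketch of that style is shown below, assuming the driver-API helper cuGetErrorString; the actual macro in the Autoware sources may differ.

#include <cstdio>
#include <cstdlib>
#include <cuda.h>

// Minimal sketch of a driver-API style check: report and exit on any
// CUresult other than CUDA_SUCCESS.
#define CUDA_CHECK(res, msg)                                      \
  do {                                                            \
    CUresult status_ = (res);                                     \
    if (status_ != CUDA_SUCCESS) {                                \
      const char* err_str_ = NULL;                                \
      cuGetErrorString(status_, &err_str_);                       \
      std::fprintf(stderr, "%s failed: %s\n", (msg),              \
                   err_str_ ? err_str_ : "unknown error");        \
      std::exit(EXIT_FAILURE);                                    \
    }                                                             \
  } while (0)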


Example 9: PCAFeatureMapsAddNullableBorderGPULaunch

/*
// Launch GPU kernel of PCA feature maps
//
// API
// int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
           CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
           CUstream stream);
// INPUT
// dev_map_in
// bx
// by
// stream
// OUTPUT
// dev_map_out
// RESULT
// Error status
*/
int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
        CUstream stream)
{
    int sizeX, sizeY, p;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    p = dev_map_in->numFeatures;

    void *pca_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_map_out->map, (void *) &sizeX,
            (void *) &sizeY, (void *) &p, (void *) &bx, (void *) &by };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(PCAFeatureMapsAddNullableBorder_func[0], block_num_x,
            block_num_y, block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, pca_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(PCAFeatureMaps)");

    return LATENT_SVM_OK;
}
Developer: ZenzouFuruta, Project: Autoware, Lines: 57, Source: featurepyramid_gpu.cpp


Example 10: switch

inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    CHECK(cpu_ptr_ != 0) << "size " << size_;
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    break;
#if 0
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
#endif
  case HEAD_AT_CPU:
  case SYNCED:
    break;
  }
}
Developer: Devy001, Project: Caffe-mini, Lines: 22, Source: syncedmem.cpp


Example 11: pinnedAlloc

    T* pinnedAlloc(const size_t &elements)
    {
        managerInit();
        T* ptr = NULL;
        // Round the allocation up to the next megabyte. Creating pinned
        // memory has a high overhead, so we prefer larger, reusable blocks.
        size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576;

        if (elements > 0) {

            // FIXME: Add better checks for garbage collection
            // Perhaps look at total memory available as a metric
            if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
                pinnedGarbageCollect();
            }

            for(mem_iter iter = pinned_maps.begin();
                iter != pinned_maps.end(); ++iter) {

                mem_info info = iter->second;
                if (info.is_free && info.bytes == alloc_bytes) {
                    iter->second.is_free = false;
                    pinned_used_bytes += alloc_bytes;
                    return (T *)iter->first;
                }
            }

            // Perform garbage collection if memory can not be allocated
            if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
                pinnedGarbageCollect();
                CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
            }

            mem_info info = {false, false, alloc_bytes};
            pinned_maps[ptr] = info;
            pinned_used_bytes += alloc_bytes;
        }
        return (T*)ptr;
    }
Developer: hxiaox, Project: arrayfire, Lines: 39, Source: memory.cpp


Example 12: LOG

float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING)<< "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
#ifdef USE_CUDA
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
            stop_gpu_));
    // CUDA only measures milliseconds
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
  } else {
#endif
    elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
#ifdef USE_CUDA
  }
#endif
  return elapsed_microseconds_;
}
Developer: rickyHong, Project: CaffeForOpenCL, Lines: 22, Source: benchmark.cpp


Example 13: calculateNormGPULaunch

/*
// Launch GPU kernel of calculate norm
//
// API
//int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
          CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
// INPUT
// dev_map_in
// stream
// OUTPUT
// dev_norm
// RESULT
// Error status
*/
int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
{
    int sizeX, sizeY, xp;
    int thread_num_x, thread_num_y, thread_num_z;
    int block_num_x, block_num_y, block_num_z;
    int sharedMemBytes;
    CUresult res;

    sizeX = dev_map_in->sizeX;
    sizeY = dev_map_in->sizeY;
    xp = dev_map_in->numFeatures;

    void *calc_norm_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map, (void *) &sizeX,
            (void *) &sizeY, (void *) &xp, };

    thread_num_x =
            (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    thread_num_y =
            (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    thread_num_z = 1;
    block_num_x = sizeX / thread_num_x;
    block_num_y = sizeY / thread_num_y;
    block_num_z = 1;
    if (sizeX % thread_num_x != 0)
        block_num_x++;
    if (sizeY % thread_num_y != 0)
        block_num_y++;

    sharedMemBytes = 0;

    res = cuLaunchKernel(calculateNorm_func[0], block_num_x, block_num_y,
            block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, calc_norm_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(calcuateNorm)");

    return LATENT_SVM_OK;
}
Developer: ZenzouFuruta, Project: Autoware, Lines: 53, Source: featurepyramid_gpu.cpp


Example 14: CUDA_CHECK

void MPIComm::ThreadFunc(int device){
#ifndef CPU_ONLY
  //LOG(ERROR)<<"device_id is "<<device;
  CUDA_CHECK(cudaSetDevice(device));
#endif
  started_.store(true);
  MPIJob job;
  while (true){
    mutex::scoped_lock lock(queue_mutex_);
    while( task_queue_.empty() && IsRunning()){
      DLOG(INFO)<<"no job running, waiting on cond";
      cond_work_.wait(lock);
    }
    lock.unlock();

    DLOG(INFO)<<"Cond fulfilled, dispatching job";
    if (IsRunning()){
      job = task_queue_.front();
      DLOG(INFO)<<task_queue_.size();
      DispatchJob(job);
      mutex::scoped_lock pop_lock(queue_mutex_);
      task_queue_.pop();
      pop_lock.unlock();
      cond_finish_.notify_one();
      DLOG(INFO)<<"job finished, poped taskqueue";
    }else{
      break;
    }

  }

  // finish remaining jobs
  while (!task_queue_.empty()){
    boost::lock_guard<mutex> lock(queue_mutex_);
    job = task_queue_.front();
    task_queue_.pop();
    DispatchJob(job);
  }
}
Developer: xiangqiaolxq, Project: caffe-parallel, Lines: 39, Source: mpijob.cpp


Example 15: morph

Array<T>  morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1])
        AF_ERROR("Only square masks are supported in cuda morph currently", AF_ERR_SIZE);
    if (mdims[0] > 19)
        AF_ERROR("Upto 19x19 square kernels are only supported in cuda currently", AF_ERR_SIZE);

    Array<T> out = createEmptyArray<T>(in.dims());

    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T),
                                       0, cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    if (isDilation)
        kernel::morph<T, true >(out, in, mdims[0]);
    else
        kernel::morph<T, false>(out, in, mdims[0]);

    return out;
}
Developer: hxiaox, Project: arrayfire, Lines: 23, Source: morph_impl.hpp


Example 16: memAlloc

    T* memAlloc(const size_t &elements)
    {
        int n = getActiveDeviceId();
        T* ptr = NULL;
        size_t alloc_bytes = divup(sizeof(T) * elements, 1024) * 1024;

        if (elements > 0) {

            // FIXME: Add better checks for garbage collection
            // Perhaps look at total memory available as a metric
            if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes >= MAX_BYTES) {
                garbageCollect();
            }

            for(mem_iter iter = memory_maps[n].begin();
                iter != memory_maps[n].end(); iter++) {

                mem_info info = iter->second;
                if (info.is_free && info.bytes == alloc_bytes) {
                    iter->second.is_free = false;
                    used_bytes += alloc_bytes;
                    return (T *)iter->first;
                }
            }

            // Perform garbage collection if memory can not be allocated
            if (cudaMalloc((void **)&ptr, alloc_bytes) != cudaSuccess) {
                garbageCollect();
                CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes));
            }

            mem_info info = {false, alloc_bytes};
            memory_maps[n][ptr] = info;
            used_bytes += alloc_bytes;
        }
        return ptr;
    }
Developer: maolingao, Project: arrayfire, Lines: 37, Source: memory.cpp


Example 17: CUDA_CHECK

SocketBuffer* SocketBuffer::Read(bool data) {
  // Pop the message from local queue
  QueuedMessage* qm = NULL;
  if(data) {
    qm = reinterpret_cast<QueuedMessage*>
      (this->channel_->receive_queue.pop());
#ifndef CPU_ONLY
    // Copy the received buffer to GPU memory
    CUDA_CHECK(cudaMemcpy(this->addr(), qm->buffer,  // NOLINT(caffe/alt_fn)
               qm->size, cudaMemcpyHostToDevice));  // NOLINT(caffe/alt_fn)
#else
    //caffe_copy(qm->size, qm->buffer, this->addr_);
    memcpy(this->addr_, qm->buffer, qm->size);
#endif
  } else {
    qm = reinterpret_cast<QueuedMessage*>
      (this->channel_->receive_queue_ctrl.pop());
  }
  // Free up the buffer and the wrapper object
  if(data)
    delete qm->buffer;
  delete qm;
  return this;
}
Developer: Aravindreddy986, Project: CaffeOnSpark, Lines: 24, Source: socket.cpp


Example 18: CUDA_CHECK

P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
    int initial_device;
    CUDA_CHECK(cudaGetDevice(&initial_device));
    const int self = solver_->param().device_id();
    CUDA_CHECK(cudaSetDevice(self));

    if (parent_) {
        CUDA_CHECK(cudaFree(parent_grads_));
        const int peer = parent_->solver_->param().device_id();
        int access;
        CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
        if (access) {
            CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
        }
    }

    CUDA_CHECK(cudaSetDevice(initial_device));
#endif
}
Developer: flair2005, Project: Caffe-Solution, Lines: 20, Source: parallel.cpp


Example 19: remaining

void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {
#ifndef CPU_ONLY
    vector<int> remaining(devices);

    // Depth for reduction tree
    int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));

    // Group GPUs by board
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            for (int j = i + 1; j < remaining.size(); ++j) {
                cudaDeviceProp a, b;
                CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));
                CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));
                if (a.isMultiGpuBoard && b.isMultiGpuBoard) {
                    if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {
                        pairs->push_back(DevicePair(remaining[i], remaining[j]));
                        DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j];
                        remaining.erase(remaining.begin() + j);
                        break;
                    }
                }
            }
        }
    }
    ostringstream s;
    for (int i = 0; i < remaining.size(); ++i) {
        s << (i ? ", " : "") << remaining[i];
    }
    DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();

    // Group by P2P accessibility
    remaining_depth = ceil(log2(remaining.size()));
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            for (int j = i + 1; j < remaining.size(); ++j) {
                int access;
                CUDA_CHECK(
                    cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j]));
                if (access) {
                    pairs->push_back(DevicePair(remaining[i], remaining[j]));
                    DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j];
                    remaining.erase(remaining.begin() + j);
                    break;
                }
            }
        }
    }
    s.str("");
    for (int i = 0; i < remaining.size(); ++i) {
        s << (i ? ", " : "") << remaining[i];
    }
    DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();

    // Group remaining
    remaining_depth = ceil(log2(remaining.size()));
    for (int d = 0; d < remaining_depth; ++d) {
        for (int i = 0; i < remaining.size(); ++i) {
            pairs->push_back(DevicePair(remaining[i], remaining[i + 1]));
            DLOG(INFO) << "Remaining pair: " << remaining[i] << ":"
                       << remaining[i + 1];
            remaining.erase(remaining.begin() + i + 1);
        }
    }

    // Should only be the parent node remaining
    CHECK_EQ(remaining.size(), 1);

    pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));

    CHECK(pairs->size() == devices.size());
    for (int i = 0; i < pairs->size(); ++i) {
        CHECK((*pairs)[i].parent() != (*pairs)[i].device());
        for (int j = i + 1; j < pairs->size(); ++j) {
            CHECK((*pairs)[i].device() != (*pairs)[j].device());
        }
    }
#else
    NO_GPU;
#endif
}
Developer: flair2005, Project: Caffe-Solution, Lines: 81, Source: parallel.cpp


Example 20: orb

void orb(unsigned* out_feat,
         float** d_x,
         float** d_y,
         float** d_score,
         float** d_ori,
         float** d_size,
         unsigned** d_desc,
         std::vector<unsigned>& feat_pyr,
         std::vector<float*>& d_x_pyr,
         std::vector<float*>& d_y_pyr,
         std::vector<unsigned>& lvl_best,
         std::vector<float>& lvl_scl,
         std::vector<CParam<T> >& img_pyr,
         const float fast_thr,
         const unsigned max_feat,
         const float scl_fctr,
         const unsigned levels)
{
    unsigned patch_size = REF_PAT_SIZE;

    unsigned max_levels = feat_pyr.size();

    // In future implementations, the user will be capable of passing his
    // distribution instead of using the reference one
    //CUDA_CHECK(cudaMemcpyToSymbol(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0, cudaMemcpyHostToDevice));

    std::vector<float*> d_score_pyr(max_levels);
    std::vector<float*> d_ori_pyr(max_levels);
    std::vector<float*> d_size_pyr(max_levels);
    std::vector<unsigned*> d_desc_pyr(max_levels);
    std::vector<unsigned*> d_idx_pyr(max_levels);

    unsigned total_feat = 0;

    // Calculate a separable Gaussian kernel
    unsigned gauss_len = 9;
    convAccT* h_gauss = new convAccT[gauss_len];
    gaussian1D(h_gauss, gauss_len, 2.f);
    Param<convAccT> gauss_filter;
    gauss_filter.dims[0] = gauss_len;
    gauss_filter.strides[0] = 1;

    for (int k = 1; k < 4; k++) {
        gauss_filter.dims[k] = 1;
        gauss_filter.strides[k] = gauss_filter.dims[k - 1] * gauss_filter.strides[k - 1];
    }

    dim_type gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
    gauss_filter.ptr = memAlloc<convAccT>(gauss_elem);
    CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(convAccT), cudaMemcpyHostToDevice));

    delete[] h_gauss;

    for (int i = 0; i < (int)max_levels; i++) {
        if (feat_pyr[i] == 0 || lvl_best[i] == 0) {
            if (i > 0)
                memFree((T*)img_pyr[i].ptr);
            continue;
        }

        unsigned* d_usable_feat = memAlloc<unsigned>(1);
        CUDA_CHECK(cudaMemset(d_usable_feat, 0, sizeof(unsigned)));

        float* d_x_harris = memAlloc<float>(feat_pyr[i]);
        float* d_y_harris = memAlloc<float>(feat_pyr[i]);
        float* d_score_harris = memAlloc<float>(feat_pyr[i]);

        // Calculate Harris responses
        // Good block_size >= 7 (must be an odd number)
        dim3 threads(THREADS_X, THREADS_Y);
        dim3 blocks(divup(feat_pyr[i], threads.x), 1);
        harris_response<T,false><<<blocks, threads>>>(d_x_harris, d_y_harris, d_score_harris, NULL,
                                                      d_x_pyr[i], d_y_pyr[i], NULL,
                                                      feat_pyr[i], d_usable_feat,
                                                      img_pyr[i], 7, 0.04f, patch_size);
        POST_LAUNCH_CHECK();

        unsigned usable_feat = 0;
        CUDA_CHECK(cudaMemcpy(&usable_feat, d_usable_feat, sizeof(unsigned), cudaMemcpyDeviceToHost));

        memFree(d_x_pyr[i]);
        memFree(d_y_pyr[i]);
        memFree(d_usable_feat);

        feat_pyr[i] = usable_feat;

        if (feat_pyr[i] == 0) {
            memFree(d_x_harris);
            memFree(d_y_harris);
            memFree(d_score_harris);
            if (i > 0)
                memFree((T*)img_pyr[i].ptr);
            continue;
        }

        Param<float> harris_sorted;
        Param<unsigned> harris_idx;

        harris_sorted.dims[0] = harris_idx.dims[0] = feat_pyr[i];
        harris_sorted.strides[0] = harris_idx.strides[0] = 1;
//......... part of the code is omitted here .........
Developer: pavanky, Project: arrayfire, Lines: 101, Source: orb.hpp



Note: The CUDA_CHECK examples in this article were collected from source code hosted on platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors, and distribution or use should follow each project's license. Do not republish without permission.

