本文整理汇总了C++中CUDA_CHECK函数的典型用法代码示例。如果您正苦于以下问题：C++ CUDA_CHECK函数的具体用法？C++ CUDA_CHECK怎么用？C++ CUDA_CHECK使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了CUDA_CHECK函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的C++代码示例。
示例1: StopInternalThread
void InternalThread::StartInternalThread() {
// TODO switch to failing once Caffe prefetch thread is persistent.
// Threads should not be started and stopped repeatedly.
// CHECK(!is_started());
StopInternalThread();
#ifndef CPU_ONLY
CUDA_CHECK(cudaGetDevice(&device_));
#endif
mode_ = Caffe::mode();
rand_seed_ = caffe_rng_rand();
solver_count_ = Caffe::solver_count();
root_solver_ = Caffe::root_solver();
try {
thread_.reset(new boost::thread(&InternalThread::entry, this));
} catch (std::exception& e) {
CHECK(false) << e.what();
}
}
开发者ID:XinLiuNvidia,项目名称:caffe,代码行数:20,代码来源:internal_thread.cpp
示例2: transform
void transform(Param<T> out, CParam<T> in, CParam<float> tf,
const bool inverse)
{
const dim_type nimages = in.dims[2];
// Multiplied in src/backend/transform.cpp
const dim_type ntransforms = out.dims[2] / in.dims[2];
// Copy transform to constant memory.
CUDA_CHECK(cudaMemcpyToSymbol(c_tmat, tf.ptr, ntransforms * 6 * sizeof(float), 0,
cudaMemcpyDeviceToDevice));
dim3 threads(TX, TY, 1);
dim3 blocks(divup(out.dims[0], threads.x), divup(out.dims[1], threads.y));
if (nimages > 1) { blocks.x *= nimages; }
if (ntransforms > 1) { blocks.y *= ntransforms; }
if(inverse) {
transform_kernel<T, true><<<blocks, threads>>>(out, in, nimages, ntransforms);
} else {
开发者ID:EasonYi,项目名称:arrayfire,代码行数:20,代码来源:transform.hpp
示例3: LOG
// Returns the elapsed time in milliseconds and caches it in
// elapsed_milliseconds_.
//
// If the timer has never been run, a warning is logged and 0 is returned.
// A timer that is still running is stopped before the reading is taken.
// In GPU mode the value comes from cudaEventElapsedTime() on the start/stop
// events (or NO_GPU in a CPU_ONLY build); otherwise it is the difference of
// the CPU timestamps.
float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }

  if (running()) {
    Stop();
  }

  if (Caffe::mode() != Caffe::GPU) {
    // CPU path: difference of the two recorded CPU timestamps.
    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
  } else {
#ifndef CPU_ONLY
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
#else
    NO_GPU;
#endif
  }

  return elapsed_milliseconds_;
}
开发者ID:azrael417,项目名称:caffe,代码行数:20,代码来源:benchmark.cpp
示例4: CUDA_CHECK
// Sets up the cuDNN resources of the layer.
//
// Runs the base ConvolutionLayer setup, then:
//   * allocates group_ * CUDNN_STREAMS_PER_GROUP CUDA streams and cuDNN
//     handles and binds each handle to the stream with the same index;
//   * computes the per-group weight and bias offsets;
//   * creates the filter descriptor;
//   * creates one bottom, one top and one convolution descriptor per bottom
//     blob and appends them to the corresponding member vectors;
//   * creates the bias descriptor when bias_term_ is set.
//
// bottom: input blobs; only their count is used in this function.
// top:    output blobs; forwarded to the base-class setup.
void CuDNNConvolutionLayer<Dtype>::LayerSetUp(
    const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) {
  ConvolutionLayer<Dtype>::LayerSetUp(bottom, top);

  // Initialize CUDA streams and cuDNN: every handle gets its own stream.
  stream_ = new cudaStream_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  handle_ = new cudnnHandle_t[this->group_ * CUDNN_STREAMS_PER_GROUP];
  for (int g = 0; g < this->group_ * CUDNN_STREAMS_PER_GROUP; ++g) {
    CUDA_CHECK(cudaStreamCreate(&stream_[g]));
    CUDNN_CHECK(cudnnCreate(&handle_[g]));
    CUDNN_CHECK(cudnnSetStream(handle_[g], stream_[g]));
  }

  // Indexing parameters: element counts used to step from one group to the
  // next inside the weight and bias buffers.
  weight_offset_ = (this->num_output_ / this->group_)
      * (this->channels_ / this->group_) * this->kernel_h_ * this->kernel_w_;
  bias_offset_ = (this->num_output_ / this->group_);

  // Filter descriptor, sized for a single group.
  cudnn::createFilterDesc<Dtype>(&filter_desc_,
      this->num_output_ / this->group_, this->channels_ / this->group_,
      this->kernel_h_, this->kernel_w_);

  // One set of tensor/convolution descriptors for each bottom blob.
  for (int n = 0; n < bottom.size(); ++n) {
    cudnnTensor4dDescriptor_t input_desc;
    cudnn::createTensor4dDesc<Dtype>(&input_desc);
    bottom_descs_.push_back(input_desc);

    cudnnTensor4dDescriptor_t output_desc;
    cudnn::createTensor4dDesc<Dtype>(&output_desc);
    top_descs_.push_back(output_desc);

    cudnnConvolutionDescriptor_t convolution_desc;
    cudnn::createConvolutionDesc<Dtype>(&convolution_desc);
    conv_descs_.push_back(convolution_desc);
  }

  // Tensor descriptor for bias, only needed when the layer has a bias term.
  if (this->bias_term_) {
    cudnn::createTensor4dDesc<Dtype>(&bias_desc_);
  }
}
开发者ID:13331151,项目名称:deeplab-public,代码行数:41,代码来源:cudnn_conv_layer.cpp
示例5: morph
// Runs the morphology kernel on `in` using `mask` as structuring element
// and returns the result as a new array with the same dimensions as `in`.
//
// Only masks whose first two dimensions are equal and at most 19 are
// accepted; anything else is reported through CUDA_NOT_SUPPORTED.
// The mask is copied device-to-device into kernel::cFilter on the active
// stream before the kernel wrapper is invoked.
// `isDilation` is not declared in this block (presumably a template
// parameter of the enclosing declaration -- confirm); it selects the
// <T, true> or <T, false> instantiation of kernel::morph.
Array<T> morph(const Array<T> &in, const Array<T> &mask) {
  const dim4 mdims = mask.dims();

  if (mdims[0] != mdims[1]) {
    CUDA_NOT_SUPPORTED("Rectangular masks are not supported");
  }
  if (mdims[0] > 19) {
    CUDA_NOT_SUPPORTED("Kernels > 19x19 are not supported");
  }

  Array<T> out = createEmptyArray<T>(in.dims());

  // Upload the mask to the filter symbol read by the kernel.
  CUDA_CHECK(cudaMemcpyToSymbolAsync(
      kernel::cFilter, mask.get(), mdims[0] * mdims[1] * sizeof(T), 0,
      cudaMemcpyDeviceToDevice, cuda::getActiveStream()));

  if (!isDilation) {
    kernel::morph<T, false>(out, in, mdims[0]);
  } else {
    kernel::morph<T, true>(out, in, mdims[0]);
  }

  return out;
}
开发者ID:9prady9,项目名称:arrayfire,代码行数:21,代码来源:morph_impl.hpp
示例6: caffe_copy
// Copies N elements of type Dtype from X to Y.
//
// Nothing happens when X and Y are the same pointer. Otherwise the copy is
// done with cudaMemcpy(cudaMemcpyDefault) when Caffe is in GPU mode, and with
// caffe_cpu_copy in every other case. When built with OpenMP, the GPU path is
// only considered outside of a parallel region, and Caffe::mode() is not
// queried at all inside one.
//
// N: number of elements (not bytes).
// X: source buffer.
// Y: destination buffer.
void caffe_copy(const int N, const Dtype* X, Dtype* Y) {
  if (X == Y) {
    return;
  }

  // If there are more than one openmp thread (we are in active region)
  // then checking Caffe::mode can create additional GPU Context, so the
  // mode query below must stay behind the short-circuit.
#ifdef _OPENMP
  const bool outside_parallel_region = (omp_in_parallel() == 0);
#else
  const bool outside_parallel_region = true;
#endif

  if (outside_parallel_region && (Caffe::mode() == Caffe::GPU)) {
#ifndef CPU_ONLY
    // NOLINT_NEXT_LINE(caffe/alt_fn)
    CUDA_CHECK(cudaMemcpy(Y, X, sizeof(Dtype) * N, cudaMemcpyDefault));
#else
    NO_GPU;
#endif
  } else {
    caffe_cpu_copy<Dtype>(N, X, Y);
  }
}
开发者ID:crobertob,项目名称:caffe,代码行数:21,代码来源:math_functions.cpp
示例7: switch
// Makes the data available on the CPU side.
//
// Behaviour depends on head_:
//   HEAD_AT_CPU / SYNCED : nothing to do.
//   UNINITIALIZED        : allocates size_ bytes of host memory, zero-fills
//                          it and marks the head as HEAD_AT_CPU.
//   HEAD_AT_GPU          : allocates the host buffer if there is none yet,
//                          copies size_ bytes from gpu_ptr_ and marks the
//                          state as SYNCED.
// own_cpu_data_ is set whenever this function allocates the host buffer.
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case HEAD_AT_CPU:
  case SYNCED:
    // Host copy is already current.
    break;
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    own_cpu_data_ = true;
    break;
  case HEAD_AT_GPU:
    if (!cpu_ptr_) {
      CaffeMallocHost(&cpu_ptr_, size_);
      own_cpu_data_ = true;
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
  }
}
开发者ID:clarencezhang,项目名称:caffe-windows-multilabels,代码行数:22,代码来源:syncedmem.cpp
示例8: normalizeGPULaunch
/*
// Launch the normalizeAndTruncate GPU kernel
//
// API
// int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
//     CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
//     CUstream stream);
// INPUT
// alfa        - scalar forwarded unchanged to the kernel
// dev_map_in  - input feature map (sizeX, sizeY and map are read)
// dev_norm    - norm map (map is read)
// stream      - CUDA stream the kernel is launched on
// OUTPUT
// dev_map_out - output feature map (its map pointer is passed to the kernel)
// RESULT
// LATENT_SVM_OK once the launch result has been passed to CUDA_CHECK
//
// Launch configuration: a 2D thread block of at most
// sqrt(max_threads_num) x sqrt(max_threads_num) threads, enough blocks in
// x/y to cover sizeX x sizeY, NUM_SECTOR * 2 blocks in z, and no dynamic
// shared memory.
*/
int normalizeGPULaunch(const float alfa, CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CvLSVMFeatureMapGPU *dev_map_out,
        CUstream stream)
{
    int sizeX = dev_map_in->sizeX;
    int sizeY = dev_map_in->sizeY;

    // cuLaunchKernel expects an array holding the address of every argument.
    void *normalize_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map,
      (void *) &dev_map_out->map, (void *) &sizeX, (void *) &sizeY,
      (void *) &alfa, };

    // Block edge: the map extent, capped at sqrt(max_threads_num).
    const int thread_num_x =
        (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    const int thread_num_y =
        (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    const int thread_num_z = 1;

    // Grid size: one extra block when the extent is not a multiple of the
    // block edge.
    const int block_num_x =
        sizeX / thread_num_x + ((sizeX % thread_num_x != 0) ? 1 : 0);
    const int block_num_y =
        sizeY / thread_num_y + ((sizeY % thread_num_y != 0) ? 1 : 0);
    const int block_num_z = NUM_SECTOR * 2;

    const int sharedMemBytes = 0;

    CUresult res = cuLaunchKernel(normalizeAndTruncate_func[0], block_num_x,
            block_num_y, block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, normalize_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(normalizeAndTruncate)");

    return LATENT_SVM_OK;
}
开发者ID:ZenzouFuruta,项目名称:Autoware,代码行数:57,代码来源:featurepyramid_gpu.cpp
示例9: PCAFeatureMapsAddNullableBorderGPULaunch
/*
// Launch the PCAFeatureMapsAddNullableBorder GPU kernel
//
// API
// int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
//     CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
//     CUstream stream);
// INPUT
// dev_map_in  - input feature map (sizeX, sizeY, numFeatures and map are read)
// bx          - forwarded unchanged to the kernel
// by          - forwarded unchanged to the kernel
// stream      - CUDA stream the kernel is launched on
// OUTPUT
// dev_map_out - output feature map (its map pointer is passed to the kernel)
// RESULT
// LATENT_SVM_OK once the launch result has been passed to CUDA_CHECK
//
// Launch configuration: a 2D thread block of at most
// sqrt(max_threads_num) x sqrt(max_threads_num) threads, enough blocks in
// x/y to cover sizeX x sizeY, a single block in z, and no dynamic shared
// memory.
*/
int PCAFeatureMapsAddNullableBorderGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_map_out, const int bx, const int by,
        CUstream stream)
{
    int sizeX = dev_map_in->sizeX;
    int sizeY = dev_map_in->sizeY;
    int p = dev_map_in->numFeatures;

    // cuLaunchKernel expects an array holding the address of every argument.
    void *pca_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_map_out->map, (void *) &sizeX,
      (void *) &sizeY, (void *) &p, (void *) &bx, (void *) &by };

    // Block edge: the map extent, capped at sqrt(max_threads_num).
    const int thread_num_x =
        (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    const int thread_num_y =
        (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    const int thread_num_z = 1;

    // Grid size: one extra block when the extent is not a multiple of the
    // block edge.
    const int block_num_x =
        sizeX / thread_num_x + ((sizeX % thread_num_x != 0) ? 1 : 0);
    const int block_num_y =
        sizeY / thread_num_y + ((sizeY % thread_num_y != 0) ? 1 : 0);
    const int block_num_z = 1;

    const int sharedMemBytes = 0;

    CUresult res = cuLaunchKernel(PCAFeatureMapsAddNullableBorder_func[0],
            block_num_x, block_num_y, block_num_z, thread_num_x, thread_num_y,
            thread_num_z, sharedMemBytes, stream, pca_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(PCAFeatureMaps)");

    return LATENT_SVM_OK;
}
开发者ID:ZenzouFuruta,项目名称:Autoware,代码行数:57,代码来源:featurepyramid_gpu.cpp
示例10: switch
// Makes the data available on the CPU side.
//
// HEAD_AT_CPU and SYNCED need no work. For UNINITIALIZED memory a host
// buffer of size_ bytes is allocated (the allocation is CHECKed), zeroed,
// and the head is moved to HEAD_AT_CPU.
// The HEAD_AT_GPU branch is compiled out with `#if 0`, so in that state the
// switch matches no case and the function returns without doing anything.
inline void SyncedMemory::to_cpu() {
  switch (head_) {
  case HEAD_AT_CPU:
  case SYNCED:
    // Host copy is already current.
    break;
  case UNINITIALIZED:
    CaffeMallocHost(&cpu_ptr_, size_);
    CHECK(cpu_ptr_ != 0) << "size " << size_;
    memset(cpu_ptr_, 0, size_);
    head_ = HEAD_AT_CPU;
    break;
#if 0
  // Disabled device-to-host path, kept for reference.
  case HEAD_AT_GPU:
    if (cpu_ptr_ == NULL) {
      CaffeMallocHost(&cpu_ptr_, size_);
    }
    CUDA_CHECK(cudaMemcpy(cpu_ptr_, gpu_ptr_, size_, cudaMemcpyDeviceToHost));
    head_ = SYNCED;
    break;
#endif
  }
}
开发者ID:Devy001,项目名称:Caffe-mini,代码行数:22,代码来源:syncedmem.cpp
示例11: pinnedAlloc
// Returns a pinned (page-locked) host buffer able to hold `elements`
// objects of type T, or NULL when `elements` is 0.
//
// The request is rounded up to a multiple of 1 MiB. A free cached buffer of
// exactly that size is reused when one exists in pinned_maps; otherwise a
// new buffer is obtained with cudaMallocHost. If that allocation fails, the
// cache is garbage-collected and the allocation is retried once under
// CUDA_CHECK. Every buffer handed out is recorded in pinned_maps and counted
// in pinned_used_bytes.
T* pinnedAlloc(const size_t &elements)
{
    managerInit();

    T* ptr = NULL;
    // Round up to whole megabytes: creating pinned memory is comparatively
    // expensive, so coarser sizes make buffers more reusable.
    size_t alloc_bytes = divup(sizeof(T) * elements, 1048576) * 1048576;

    if (elements == 0) {
        return (T*)ptr;
    }

    // FIXME: Add better checks for garbage collection
    // Perhaps look at total memory available as a metric
    if (pinned_maps.size() >= MAX_BUFFERS || pinned_used_bytes >= MAX_BYTES) {
        pinnedGarbageCollect();
    }

    // Look for a cached buffer that is free and has exactly the wanted size.
    for (mem_iter it = pinned_maps.begin(); it != pinned_maps.end(); ++it) {
        const mem_info &cached = it->second;
        if (!(cached.is_free && cached.bytes == alloc_bytes)) {
            continue;
        }
        it->second.is_free = false;
        pinned_used_bytes += alloc_bytes;
        return (T *)it->first;
    }

    // Perform garbage collection if memory can not be allocated
    if (cudaMallocHost((void **)&ptr, alloc_bytes) != cudaSuccess) {
        pinnedGarbageCollect();
        CUDA_CHECK(cudaMallocHost((void **)(&ptr), alloc_bytes));
    }

    // Register the new buffer as in use.
    mem_info info = {false, false, alloc_bytes};
    pinned_maps[ptr] = info;
    pinned_used_bytes += alloc_bytes;

    return (T*)ptr;
}
开发者ID:hxiaox,项目名称:arrayfire,代码行数:39,代码来源:memory.cpp
示例12: LOG
// Returns the elapsed time in microseconds and caches it in
// elapsed_microseconds_.
//
// If the timer has never been run, a warning is logged and 0 is returned.
// A timer that is still running is stopped before the reading is taken.
// With USE_CUDA defined and Caffe in GPU mode, the value is the CUDA event
// time (milliseconds) multiplied by 1000; in every other case it is the
// difference of the CPU timestamps.
float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING)<< "Timer has never been run before reading time.";
    return 0;
  }

  if (running()) {
    Stop();
  }

#ifdef USE_CUDA
  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_,
                                    stop_gpu_));
    // Cuda only measure milliseconds
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
    return elapsed_microseconds_;
  }
#endif

  // CPU path (also the only path when USE_CUDA is not defined).
  elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
  return elapsed_microseconds_;
}
开发者ID:rickyHong,项目名称:CaffeForOpenCL,代码行数:22,代码来源:benchmark.cpp
示例13: calculateNormGPULaunch
/*
// Launch the calculateNorm GPU kernel
//
// API
// int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
//     CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
// INPUT
// dev_map_in - input feature map (sizeX, sizeY, numFeatures and map are read)
// stream     - CUDA stream the kernel is launched on
// OUTPUT
// dev_norm   - norm map (its map pointer is passed to the kernel)
// RESULT
// LATENT_SVM_OK once the launch result has been passed to CUDA_CHECK
//
// Launch configuration: a 2D thread block of at most
// sqrt(max_threads_num) x sqrt(max_threads_num) threads, enough blocks in
// x/y to cover sizeX x sizeY, a single block in z, and no dynamic shared
// memory.
*/
int calculateNormGPULaunch(CvLSVMFeatureMapGPU *dev_map_in,
        CvLSVMFeatureMapGPU *dev_norm, CUstream stream)
{
    int sizeX = dev_map_in->sizeX;
    int sizeY = dev_map_in->sizeY;
    int xp = dev_map_in->numFeatures;

    // cuLaunchKernel expects an array holding the address of every argument.
    void *calc_norm_kernel_arg[] =
    { (void *) &dev_map_in->map, (void *) &dev_norm->map, (void *) &sizeX,
      (void *) &sizeY, (void *) &xp, };

    // Block edge: the map extent, capped at sqrt(max_threads_num).
    const int thread_num_x =
        (sizeX < std::sqrt(max_threads_num)) ? sizeX : std::sqrt(max_threads_num);
    const int thread_num_y =
        (sizeY < std::sqrt(max_threads_num)) ? sizeY : std::sqrt(max_threads_num);
    const int thread_num_z = 1;

    // Grid size: one extra block when the extent is not a multiple of the
    // block edge.
    const int block_num_x =
        sizeX / thread_num_x + ((sizeX % thread_num_x != 0) ? 1 : 0);
    const int block_num_y =
        sizeY / thread_num_y + ((sizeY % thread_num_y != 0) ? 1 : 0);
    const int block_num_z = 1;

    const int sharedMemBytes = 0;

    CUresult res = cuLaunchKernel(calculateNorm_func[0], block_num_x,
            block_num_y, block_num_z, thread_num_x, thread_num_y, thread_num_z,
            sharedMemBytes, stream, calc_norm_kernel_arg, NULL);
    CUDA_CHECK(res, "cuLaunchKernel(calcuateNorm)");

    return LATENT_SVM_OK;
}
开发者ID:ZenzouFuruta,项目名称:Autoware,代码行数:53,代码来源:featurepyramid_gpu.cpp
示例14: CUDA_CHECK
// Worker-thread body: binds the thread to `device` (GPU builds only), marks
// the worker as started, then repeatedly waits for jobs on task_queue_ and
// dispatches them until IsRunning() turns false. Jobs still queued at that
// point are dispatched before the function returns.
//
// device: CUDA device id passed to cudaSetDevice.
void MPIComm::ThreadFunc(int device){
#ifndef CPU_ONLY
//LOG(ERROR)<<"device_id is "<<device;
CUDA_CHECK(cudaSetDevice(device));
#endif
started_.store(true);
MPIJob job;
while (true){
// Sleep on cond_work_ while there is nothing to do and we are still running.
mutex::scoped_lock lock(queue_mutex_);
while( task_queue_.empty() && IsRunning()){
DLOG(INFO)<<"no job running, waiting on cond";
cond_work_.wait(lock);
}
lock.unlock();
DLOG(INFO)<<"Cond fulfilled, dispatching job";
if (IsRunning()){
// NOTE(review): front() and size() are read here after queue_mutex_ was
// released above; only the pop() below re-acquires it. Confirm that no other
// thread can modify the queue concurrently.
job = task_queue_.front();
DLOG(INFO)<<task_queue_.size();
DispatchJob(job);
// The job is removed only after it has been dispatched.
mutex::scoped_lock pop_lock(queue_mutex_);
task_queue_.pop();
pop_lock.unlock();
cond_finish_.notify_one();
DLOG(INFO)<<"job finished, poped taskqueue";
}else{
break;
}
}
// finish remaining jobs
// NOTE(review): empty() is tested before the lock_guard is taken, and
// DispatchJob runs while queue_mutex_ is held in this loop (unlike the main
// loop above) -- confirm this is intended.
while (!task_queue_.empty()){
boost::lock_guard<mutex> lock(queue_mutex_);
job = task_queue_.front();
task_queue_.pop();
DispatchJob(job);
}
}
开发者ID:xiangqiaolxq,项目名称:caffe-parallel,代码行数:39,代码来源:mpijob.cpp
示例15: morph
// Runs the morphology kernel on `in` using `mask` as structuring element
// and returns the result as a new array with the same dimensions as `in`.
//
// Only masks whose first two dimensions are equal and at most 19 are
// accepted; anything else raises AF_ERROR with AF_ERR_SIZE.
// The mask is copied device-to-device into kernel::cFilter on the stream of
// the active device before the kernel wrapper is invoked.
// `isDilation` is not declared in this block (presumably a template
// parameter of the enclosing declaration -- confirm); it selects the
// <T, true> or <T, false> instantiation of kernel::morph.
Array<T> morph(const Array<T> &in, const Array<T> &mask)
{
    const dim4 mdims = mask.dims();

    if (mdims[0] != mdims[1]) {
        AF_ERROR("Only square masks are supported in cuda morph currently", AF_ERR_SIZE);
    }
    if (mdims[0] > 19) {
        AF_ERROR("Upto 19x19 square kernels are only supported in cuda currently", AF_ERR_SIZE);
    }

    Array<T> out = createEmptyArray<T>(in.dims());

    // Upload the mask to the filter symbol read by the kernel.
    CUDA_CHECK(cudaMemcpyToSymbolAsync(kernel::cFilter, mask.get(),
                                       mdims[0] * mdims[1] * sizeof(T),
                                       0, cudaMemcpyDeviceToDevice,
                                       cuda::getStream(cuda::getActiveDeviceId())));

    if (!isDilation) {
        kernel::morph<T, false>(out, in, mdims[0]);
    } else {
        kernel::morph<T, true >(out, in, mdims[0]);
    }

    return out;
}
开发者ID:hxiaox,项目名称:arrayfire,代码行数:23,代码来源:morph_impl.hpp
示例16: memAlloc
// Returns a device buffer able to hold `elements` objects of type T, or
// NULL when `elements` is 0.
//
// The request is rounded up to a multiple of 1024 bytes. A free cached
// buffer of exactly that size in the active device's map is reused when one
// exists; otherwise a new buffer is obtained with cudaMalloc. If that
// allocation fails, the cache is garbage-collected and the allocation is
// retried once under CUDA_CHECK. Every buffer handed out is recorded in
// memory_maps[<active device>] and counted in used_bytes.
T* memAlloc(const size_t &elements)
{
    int n = getActiveDeviceId();
    T* ptr = NULL;
    size_t alloc_bytes = divup(sizeof(T) * elements, 1024) * 1024;

    if (elements == 0) {
        return ptr;
    }

    // FIXME: Add better checks for garbage collection
    // Perhaps look at total memory available as a metric
    if (memory_maps[n].size() >= MAX_BUFFERS || used_bytes >= MAX_BYTES) {
        garbageCollect();
    }

    // Look for a cached buffer that is free and has exactly the wanted size.
    for (mem_iter it = memory_maps[n].begin(); it != memory_maps[n].end(); it++) {
        const mem_info &cached = it->second;
        if (!(cached.is_free && cached.bytes == alloc_bytes)) {
            continue;
        }
        it->second.is_free = false;
        used_bytes += alloc_bytes;
        return (T *)it->first;
    }

    // Perform garbage collection if memory can not be allocated
    if (cudaMalloc((void **)&ptr, alloc_bytes) != cudaSuccess) {
        garbageCollect();
        CUDA_CHECK(cudaMalloc((void **)(&ptr), alloc_bytes));
    }

    // Register the new buffer as in use.
    mem_info info = {false, alloc_bytes};
    memory_maps[n][ptr] = info;
    used_bytes += alloc_bytes;

    return ptr;
}
开发者ID:maolingao,项目名称:arrayfire,代码行数:37,代码来源:memory.cpp
示例17: CUDA_CHECK
// Consumes one message from the channel's local receive queue.
//
// data == true : pops from receive_queue, copies the payload into this
//                buffer (cudaMemcpy host-to-device in GPU builds, memcpy in
//                CPU_ONLY builds), then frees the payload and the message.
// data == false: pops from receive_queue_ctrl and frees only the message
//                wrapper.
// Returns this.
SocketBuffer* SocketBuffer::Read(bool data) {
  QueuedMessage* qm = NULL;

  if (!data) {
    // Control message: nothing to copy.
    qm = reinterpret_cast<QueuedMessage*>
        (this->channel_->receive_queue_ctrl.pop());
  } else {
    // Pop the message from local queue
    qm = reinterpret_cast<QueuedMessage*>
        (this->channel_->receive_queue.pop());
#ifndef CPU_ONLY
    // Copy the received buffer to GPU memory
    CUDA_CHECK(cudaMemcpy(this->addr(), qm->buffer,  // NOLINT(caffe/alt_fn)
        qm->size, cudaMemcpyHostToDevice));  // NOLINT(caffe/alt_fn)
#else
    // caffe_copy(qm->size, qm->buffer, this->addr_);
    memcpy(this->addr_, qm->buffer, qm->size);
#endif
    // Free the payload now that it has been copied.
    // NOTE(review): scalar delete is used; if the buffer was allocated with
    // new[], this should be delete[] -- verify against the allocation site.
    delete qm->buffer;
  }

  // Free the wrapper object.
  delete qm;
  return this;
}
开发者ID:Aravindreddy986,项目名称:CaffeOnSpark,代码行数:24,代码来源:socket.cpp
示例18: CUDA_CHECK
// Releases the peer-to-peer resources of this solver.
//
// In GPU builds the destructor temporarily switches to the solver's own
// device; when a parent exists it frees parent_grads_ and disables peer
// access to the parent's device if such access is possible. The device that
// was current on entry is restored afterwards. In CPU_ONLY builds the
// destructor body is empty.
P2PSync<Dtype>::~P2PSync() {
#ifndef CPU_ONLY
  // Remember the caller's device so it can be restored at the end.
  int initial_device;
  CUDA_CHECK(cudaGetDevice(&initial_device));

  const int self = solver_->param().device_id();
  CUDA_CHECK(cudaSetDevice(self));

  if (parent_) {
    CUDA_CHECK(cudaFree(parent_grads_));

    const int peer = parent_->solver_->param().device_id();
    int access;
    CUDA_CHECK(cudaDeviceCanAccessPeer(&access, self, peer));
    if (access != 0) {
      CUDA_CHECK(cudaDeviceDisablePeerAccess(peer));
    }
  }

  CUDA_CHECK(cudaSetDevice(initial_device));
#endif
}
开发者ID:flair2005,项目名称:Caffe-Solution,代码行数:20,代码来源:parallel.cpp
示例19: remaining
// Builds the list of device pairs that forms the reduction tree over
// `devices` and stores it in `pairs`.
//
// Devices are paired in three passes, each removing the second device of
// every pair from the working set:
//   1. devices on the same multi-GPU board (same multiGpuBoardGroupID);
//   2. devices that report peer-to-peer access to each other;
//   3. whatever is left, paired with its neighbour in the working set.
// Exactly one device must remain at the end; it is inserted at the front of
// `pairs` as DevicePair(-1, device). The function then CHECKs that `pairs`
// has one entry per device, that no entry has parent() == device(), and that
// no device() value occurs twice.
// (The constructor is presumably DevicePair(parent, device) -- confirm
// against the class declaration.)
//
// devices: ids of the GPUs to arrange.
// pairs:   output list; entries are appended, the root entry is inserted
//          at the front.
//
// In CPU_ONLY builds this only invokes NO_GPU.
void DevicePair::compute(const vector<int> devices, vector<DevicePair>* pairs) {
#ifndef CPU_ONLY
  vector<int> remaining(devices);

  // Depth for reduction tree
  int remaining_depth = static_cast<int>(ceil(log2(remaining.size())));

  // Group GPUs by board
  for (int d = 0; d < remaining_depth; ++d) {
    for (int i = 0; i < remaining.size(); ++i) {
      for (int j = i + 1; j < remaining.size(); ++j) {
        cudaDeviceProp a, b;
        CUDA_CHECK(cudaGetDeviceProperties(&a, remaining[i]));
        CUDA_CHECK(cudaGetDeviceProperties(&b, remaining[j]));
        if (a.isMultiGpuBoard && b.isMultiGpuBoard) {
          if (a.multiGpuBoardGroupID == b.multiGpuBoardGroupID) {
            pairs->push_back(DevicePair(remaining[i], remaining[j]));
            DLOG(INFO) << "GPU board: " << remaining[i] << ":" << remaining[j];
            // j is now a child of i; drop it and move on to the next i.
            remaining.erase(remaining.begin() + j);
            break;
          }
        }
      }
    }
  }
  ostringstream s;
  for (int i = 0; i < remaining.size(); ++i) {
    s << (i ? ", " : "") << remaining[i];
  }
  DLOG(INFO) << "GPUs paired by boards, remaining: " << s.str();

  // Group by P2P accessibility
  remaining_depth = ceil(log2(remaining.size()));
  for (int d = 0; d < remaining_depth; ++d) {
    for (int i = 0; i < remaining.size(); ++i) {
      for (int j = i + 1; j < remaining.size(); ++j) {
        int access;
        CUDA_CHECK(
            cudaDeviceCanAccessPeer(&access, remaining[i], remaining[j]));
        if (access) {
          pairs->push_back(DevicePair(remaining[i], remaining[j]));
          DLOG(INFO) << "P2P pair: " << remaining[i] << ":" << remaining[j];
          remaining.erase(remaining.begin() + j);
          break;
        }
      }
    }
  }
  s.str("");
  for (int i = 0; i < remaining.size(); ++i) {
    s << (i ? ", " : "") << remaining[i];
  }
  DLOG(INFO) << "GPUs paired by P2P access, remaining: " << s.str();

  // Group remaining
  remaining_depth = ceil(log2(remaining.size()));
  for (int d = 0; d < remaining_depth; ++d) {
    // Stop while a right-hand neighbour still exists. The previous bound
    // (i < size) read remaining[i + 1] and erased begin() + i + 1 past the
    // end of the vector whenever an odd number of devices was left over.
    for (int i = 0; i + 1 < remaining.size(); ++i) {
      pairs->push_back(DevicePair(remaining[i], remaining[i + 1]));
      DLOG(INFO) << "Remaining pair: " << remaining[i] << ":"
                 << remaining[i + 1];
      remaining.erase(remaining.begin() + i + 1);
    }
  }

  // Should only be the parent node remaining
  CHECK_EQ(remaining.size(), 1);

  pairs->insert(pairs->begin(), DevicePair(-1, remaining[0]));

  // Sanity checks on the resulting tree.
  CHECK(pairs->size() == devices.size());
  for (int i = 0; i < pairs->size(); ++i) {
    CHECK((*pairs)[i].parent() != (*pairs)[i].device());
    for (int j = i + 1; j < pairs->size(); ++j) {
      CHECK((*pairs)[i].device() != (*pairs)[j].device());
    }
  }
#else
  NO_GPU;
#endif
}
开发者ID:flair2005,项目名称:Caffe-Solution,代码行数:81,代码来源:parallel.cpp
示例20: orb
void orb(unsigned* out_feat,
float** d_x,
float** d_y,
float** d_score,
float** d_ori,
float** d_size,
unsigned** d_desc,
std::vector<unsigned>& feat_pyr,
std::vector<float*>& d_x_pyr,
std::vector<float*>& d_y_pyr,
std::vector<unsigned>& lvl_best,
std::vector<float>& lvl_scl,
std::vector<CParam<T> >& img_pyr,
const float fast_thr,
const unsigned max_feat,
const float scl_fctr,
const unsigned levels)
{
unsigned patch_size = REF_PAT_SIZE;
unsigned max_levels = feat_pyr.size();
// In future implementations, the user will be capable of passing his
// distribution instead of using the reference one
//CUDA_CHECK(cudaMemcpyToSymbol(d_ref_pat, h_ref_pat, 256 * 4 * sizeof(int), 0, cudaMemcpyHostToDevice));
std::vector<float*> d_score_pyr(max_levels);
std::vector<float*> d_ori_pyr(max_levels);
std::vector<float*> d_size_pyr(max_levels);
std::vector<unsigned*> d_desc_pyr(max_levels);
std::vector<unsigned*> d_idx_pyr(max_levels);
unsigned total_feat = 0;
// Calculate a separable Gaussian kernel
unsigned gauss_len = 9;
convAccT* h_gauss = new convAccT[gauss_len];
gaussian1D(h_gauss, gauss_len, 2.f);
Param<convAccT> gauss_filter;
gauss_filter.dims[0] = gauss_len;
gauss_filter.strides[0] = 1;
for (int k = 1; k < 4; k++) {
gauss_filter.dims[k] = 1;
gauss_filter.strides[k] = gauss_filter.dims[k - 1] * gauss_filter.strides[k - 1];
}
dim_type gauss_elem = gauss_filter.strides[3] * gauss_filter.dims[3];
gauss_filter.ptr = memAlloc<convAccT>(gauss_elem);
CUDA_CHECK(cudaMemcpy(gauss_filter.ptr, h_gauss, gauss_elem * sizeof(convAccT), cudaMemcpyHostToDevice));
delete[] h_gauss;
for (int i = 0; i < (int)max_levels; i++) {
if (feat_pyr[i] == 0 || lvl_best[i] == 0) {
if (i > 0)
memFree((T*)img_pyr[i].ptr);
continue;
}
unsigned* d_usable_feat = memAlloc<unsigned>(1);
CUDA_CHECK(cudaMemset(d_usable_feat, 0, sizeof(unsigned)));
float* d_x_harris = memAlloc<float>(feat_pyr[i]);
float* d_y_harris = memAlloc<float>(feat_pyr[i]);
float* d_score_harris = memAlloc<float>(feat_pyr[i]);
// Calculate Harris responses
// Good block_size >= 7 (must be an odd number)
dim3 threads(THREADS_X, THREADS_Y);
dim3 blocks(divup(feat_pyr[i], threads.x), 1);
harris_response<T,false><<<blocks, threads>>>(d_x_harris, d_y_harris, d_score_harris, NULL,
d_x_pyr[i], d_y_pyr[i], NULL,
feat_pyr[i], d_usable_feat,
img_pyr[i], 7, 0.04f, patch_size);
POST_LAUNCH_CHECK();
unsigned usable_feat = 0;
CUDA_CHECK(cudaMemcpy(&usable_feat, d_usable_feat, sizeof(unsigned), cudaMemcpyDeviceToHost));
memFree(d_x_pyr[i]);
memFree(d_y_pyr[i]);
memFree(d_usable_feat);
feat_pyr[i] = usable_feat;
if (feat_pyr[i] == 0) {
memFree(d_x_harris);
memFree(d_y_harris);
memFree(d_score_harris);
if (i > 0)
memFree((T*)img_pyr[i].ptr);
continue;
}
Param<float> harris_sorted;
Param<unsigned> harris_idx;
harris_sorted.dims[0] = harris_idx.dims[0] = feat_pyr[i];
harris_sorted.strides[0] = harris_idx.strides[0] = 1;
//.........这里部分代码省略.........
开发者ID:pavanky,项目名称:arrayfire,代码行数:101,代码来源:orb.hpp
注：本文中的CUDA_CHECK函数示例整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论