• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

C++ command_queue类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了C++中command_queue的典型用法代码示例。如果您正苦于以下问题:C++ command_queue类的具体用法?C++ command_queue怎么用?C++ command_queue使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了command_queue类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的C++代码示例。

示例1: bitonic_block_sort

inline size_t bitonic_block_sort(KeyIterator keys_first,
                                 ValueIterator values_first,
                                 Compare compare,
                                 const size_t count,
                                 const bool sort_by_key,
                                 command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    meta_kernel k("bitonic_block_sort");
    size_t count_arg = k.add_arg<const uint_>("count");

    size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys");
    size_t local_vals_arg = 0;
    if(sort_by_key) {
        local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx");
    }

    k <<
        // Work item global and local ids
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n";

    // declare my_key and my_value
    k <<
        k.decl<key_type>("my_key") << ";\n";
    // Instead of copying values (my_value) in local memory with keys
    // we save local index (uchar) and copy my_value at the end at
    // final index. This saves local memory.
    if(sort_by_key)
    {
        k <<
            k.decl<uchar_>("my_index") << " = (uchar)(lid);\n";
    }

    // load key
    k <<
        "if(gid < count) {\n" <<
            k.var<key_type>("my_key") <<  " = " <<
                keys_first[k.var<const uint_>("gid")] << ";\n" <<
        "}\n";

    // load key and index to local memory
    k <<
        "lkeys[lid] = my_key;\n";
    if(sort_by_key)
    {
        k <<
            "lidx[lid] = my_index;\n";
    }
    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" <<
        k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n";

    // When work group size is a power of 2 bitonic sorter can be used;
    // otherwise, slower odd-even sort is used.

    k <<
        // check if n is power of 2
        "if(((n != 0) && ((n & (~n + 1)) == n))) {\n";

    // bitonic sort, not stable
    k <<
        // wait for keys and vals to be stored in local memory
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        "#pragma unroll\n" <<
        "for(" <<
            k.decl<uint_>("length") << " = 1; " <<
            "length < n; " <<
            "length <<= 1" <<
        ") {\n" <<
            // direction of sort: false -> asc, true -> desc
            k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" <<
            "for(" <<
                k.decl<uint_>("k") << " = length; " <<
                "k > 0; " <<
                "k >>= 1" <<
            ") {\n" <<

            // sibling to compare with my key
            k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" <<
            k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
            k.decl<bool>("compare") << " = " <<
                compare(k.var<key_type>("sibling_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            k.decl<bool>("equal") << " = !(compare || " <<
                compare(k.var<key_type>("my_key"),
                        k.var<key_type>("sibling_key")) << ");\n" <<
            k.decl<bool>("swap") <<
                " = compare ^ (sibling_idx < lid) ^ direction;\n" <<
            "swap = equal ? false : swap;\n" <<
            "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key)
    {
        k <<
            "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k <<
//.........这里部分代码省略.........
开发者ID:boostorg,项目名称:compute,代码行数:101,代码来源:merge_sort_on_gpu.hpp


示例2: find_extrema_with_reduce

InputIterator find_extrema_with_reduce(InputIterator first,
                                       InputIterator last,
                                       ::boost::compute::less<
                                           typename std::iterator_traits<
                                               InputIterator
                                           >::value_type
                                       >
                                       compare,
                                       const bool find_minimum,
                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
    typedef typename std::iterator_traits<InputIterator>::value_type input_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    // Getting information about used queue and device
    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();

    const size_t count = detail::iterator_range_size(first, last);

    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
        + type_name<input_type>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // get preferred work group size and preferred number
    // of work groups per compute unit
    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);

    // calculate work group size and number of work groups
    work_group_size = (std::min)(max_work_group_size, work_group_size);
    size_t work_groups_no = compute_units_no * work_groups_per_cu;
    work_groups_no = (std::min)(
        work_groups_no,
        static_cast<size_t>(std::ceil(float(count) / work_group_size))
    );

    // phase I: finding candidates for extremum

    // device buffors for extremum candidates and their indices
    // each work-group computes its candidate
    // zero-copy buffers are used to eliminate copying data back to host
    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
        candidates(work_groups_no, context);
    vector<uint_, ::boost::compute::pinned_allocator <uint_> >
        candidates_idx(work_groups_no, context);

    // finding candidates for first extremum and their indices
    find_extrema_with_reduce(
        first, count, candidates.begin(), candidates_idx.begin(),
        work_groups_no, work_group_size, compare, find_minimum, queue
    );

    // phase II: finding extremum from among the candidates

    // mapping candidates and their indices to host
    input_type* candidates_host_ptr =
        static_cast<input_type*>(
            queue.enqueue_map_buffer(
                candidates.get_buffer(), command_queue::map_read,
                0, work_groups_no * sizeof(input_type)
            )
        );

    uint_* candidates_idx_host_ptr =
        static_cast<uint_*>(
            queue.enqueue_map_buffer(
                candidates_idx.get_buffer(), command_queue::map_read,
                0, work_groups_no * sizeof(uint_)
            )
        );

    input_type* i = candidates_host_ptr;
    uint_* idx = candidates_idx_host_ptr;
    uint_* extremum_idx = idx;
    input_type extremum = *candidates_host_ptr;
    i++; idx++;

    // find extremum (serial) from among the candidates on host
    if(!find_minimum) {
        while(idx != (candidates_idx_host_ptr + work_groups_no)) {
            input_type next = *i;
            bool compare_result =  next > extremum;
            bool equal = next == extremum;
            extremum = compare_result ? next : extremum;
            extremum_idx = compare_result ? idx : extremum_idx;
            extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
            idx++, i++;
        }
    }
    else {
        while(idx != (candidates_idx_host_ptr + work_groups_no)) {
            input_type next = *i;
            bool compare_result = next < extremum;
//.........这里部分代码省略.........
开发者ID:2bbb,项目名称:compute,代码行数:101,代码来源:find_extrema_with_reduce.hpp


示例3: reduce

size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();
    size_t block_count = count / 2 / block_size;
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "    uint mask = (i << 1) - 1;\n" <<
            "    if((lid & mask) == 0){\n" <<
            "        block[lid] = " <<
                         function(k.expr<input_type>("block[lid]"),
                                  k.expr<input_type>("block[lid+i]")) << ";\n" <<
            "    }\n" <<
            "}\n" <<

            // write block result to global output
            "if(lid == 0)\n" <<
            "    output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            "    result = " <<
                     function(k.var<result_type>("result"),
                              first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}
开发者ID:junmuz,项目名称:compute,代码行数:88,代码来源:reduce.hpp


示例4: get_context_id

/// Returns raw context id for the given queue.
inline context_id get_context_id(const command_queue &q) {
    return q.get_context().get();
}
开发者ID:ddemidov,项目名称:vexcl,代码行数:4,代码来源:context.hpp


示例5: inplace_reduce

inline void inplace_reduce(Iterator first,
                           Iterator last,
                           BinaryFunction function,
                           command_queue &queue)
{
    typedef typename
        std::iterator_traits<Iterator>::value_type
        value_type;

    size_t input_size = iterator_range_size(first, last);
    if(input_size < 2){
        return;
    }

    const context &context = queue.get_context();

    size_t block_size = 64;
    size_t values_per_thread = 8;
    size_t block_count = input_size / (block_size * values_per_thread);
    if(block_count * block_size * values_per_thread != input_size)
        block_count++;

    vector<value_type> output(block_count, context);

    meta_kernel k("inplace_reduce");
    size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input");
    size_t input_size_arg = k.add_arg<const uint_>("input_size");
    size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output");
    size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch");
    k <<
        "const uint gid = get_global_id(0);\n" <<
        "const uint lid = get_local_id(0);\n" <<
        "const uint values_per_thread =\n"
            << uint_(values_per_thread) << ";\n" <<

        // thread reduce
        "const uint index = gid * values_per_thread;\n" <<
        "if(index < input_size){\n" <<
            k.decl<value_type>("sum") << " = input[index];\n" <<
            "for(uint i = 1;\n" <<
                 "i < values_per_thread && (index + i) < input_size;\n" <<
                 "i++){\n" <<
            "    sum = " <<
                     function(k.var<value_type>("sum"),
                              k.var<value_type>("input[index+i]")) << ";\n" <<
            "}\n" <<
            "scratch[lid] = sum;\n" <<
        "}\n" <<

        // local reduce
        "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" <<
        "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "    uint mask = (i << 1) - 1;\n" <<
        "    uint next_index = (gid + i) * values_per_thread;\n"
        "    if((lid & mask) == 0 && next_index < input_size){\n" <<
        "        scratch[lid] = " <<
                     function(k.var<value_type>("scratch[lid]"),
                              k.var<value_type>("scratch[lid+i]")) << ";\n" <<
        "    }\n" <<
        "}\n" <<

        // write output for block
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n"
        ;

    const buffer *input_buffer = &first.get_buffer();
    const buffer *output_buffer = &output.get_buffer();

    kernel kernel = k.compile(context);

    while(input_size > 1){
        kernel.set_arg(input_arg, *input_buffer);
        kernel.set_arg(input_size_arg, static_cast<uint_>(input_size));
        kernel.set_arg(output_arg, *output_buffer);
        kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);

        input_size =
            static_cast<size_t>(
                std::ceil(float(input_size) / (block_size * values_per_thread)
            )
        );

        block_count = input_size / (block_size * values_per_thread);
        if(block_count * block_size * values_per_thread != input_size)
            block_count++;

        std::swap(input_buffer, output_buffer);
    }

    if(input_buffer != &first.get_buffer()){
        ::boost::compute::copy(output.begin(),
                               output.begin() + 1,
                               first,
//.........这里部分代码省略.........
开发者ID:junmuz,项目名称:compute,代码行数:101,代码来源:inplace_reduce.hpp


示例6: duplicate_queue

/// Create command queue on the same context and device as the given one.
inline command_queue duplicate_queue(const command_queue &q) {
    return command_queue(q.get_context(), q.get_device(), q.get_properties());
}
开发者ID:ddemidov,项目名称:vexcl,代码行数:4,代码来源:context.hpp


示例7: get_device

/// Returns device associated with the given queue.
inline device get_device(const command_queue &q) {
    return q.get_device();
}
开发者ID:ddemidov,项目名称:vexcl,代码行数:4,代码来源:context.hpp


示例8: scan_on_cpu

inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    if(first == last){
        return result;
    }

    typedef typename
        std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();

    // create scan kernel
    meta_kernel k("scan_on_cpu");

    // Arguments
    size_t n_arg = k.add_arg<ulong_>("n");
    size_t init_arg = k.add_arg<output_type>("initial_value");

    if(!exclusive){
        k <<
            k.decl<const ulong_>("start_idx") << " = 1;\n" <<
            k.decl<output_type>("sum") << " = " << first[0] << ";\n" <<
            result[0] << " = sum;\n";
    }
    else {
        k <<
            k.decl<const ulong_>("start_idx") << " = 0;\n" <<
            k.decl<output_type>("sum") << " = initial_value;\n";
    }

    k <<
        "for(ulong i = start_idx; i < n; i++){\n" <<
        k.decl<const input_type>("x") << " = "
            << first[k.var<ulong_>("i")] << ";\n";

    if(exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "    sum = "
        << op(k.var<output_type>("sum"), k.var<output_type>("x"))
        << ";\n";

    if(!exclusive){
        k << result[k.var<ulong_>("i")] << " = sum;\n";
    }

    k << "}\n";

    // compile scan kernel
    kernel scan_kernel = k.compile(context);

    // setup kernel arguments
    size_t n = detail::iterator_range_size(first, last);
    scan_kernel.set_arg<ulong_>(n_arg, n);
    scan_kernel.set_arg<output_type>(init_arg, static_cast<output_type>(init));

    // execute the kernel
    queue.enqueue_1d_range_kernel(scan_kernel, 0, 1, 1);

    // return iterator pointing to the end of the result range
    return result + n;
}
开发者ID:2bbb,项目名称:compute,代码行数:71,代码来源:scan_on_cpu.hpp


示例9: radix_sort_impl

inline void radix_sort_impl(const buffer_iterator<T> first,
                            const buffer_iterator<T> last,
                            const buffer_iterator<T2> values_first,
                            const bool ascending,
                            command_queue &queue)
{

    typedef T value_type;
    typedef typename radix_sort_value_type<sizeof(T)>::type sort_type;

    const device &device = queue.get_device();
    const context &context = queue.get_context();


    // if we have a valid values iterator then we are doing a
    // sort by key and have to set up the values buffer
    bool sort_by_key = (values_first.get_buffer().get() != 0);

    // load (or create) radix sort program
    std::string cache_key =
        std::string("__boost_radix_sort_") + type_name<value_type>();

    if(sort_by_key){
        cache_key += std::string("_with_") + type_name<T2>();
    }

    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // sort parameters
    const uint_ k = parameters->get(cache_key, "k", 4);
    const uint_ k2 = 1 << k;
    const uint_ block_size = parameters->get(cache_key, "tpb", 128);

    // sort program compiler options
    std::stringstream options;
    options << "-DK_BITS=" << k;
    options << " -DT=" << type_name<sort_type>();
    options << " -DBLOCK_SIZE=" << block_size;

    if(boost::is_floating_point<value_type>::value){
        options << " -DIS_FLOATING_POINT";
    }

    if(boost::is_signed<value_type>::value){
        options << " -DIS_SIGNED";
    }

    if(sort_by_key){
        options << " -DSORT_BY_KEY";
        options << " -DT2=" << type_name<T2>();
        options << enable_double<T2>();
    }

    if(ascending){
        options << " -DASC";
    }

    // load radix sort program
    program radix_sort_program = cache->get_or_build(
        cache_key, options.str(), radix_sort_source, context
    );

    kernel count_kernel(radix_sort_program, "count");
    kernel scan_kernel(radix_sort_program, "scan");
    kernel scatter_kernel(radix_sort_program, "scatter");

    size_t count = detail::iterator_range_size(first, last);

    uint_ block_count = static_cast<uint_>(count / block_size);
    if(block_count * block_size != count){
        block_count++;
    }

    // setup temporary buffers
    vector<value_type> output(count, context);
    vector<T2> values_output(sort_by_key ? count : 0, context);
    vector<uint_> offsets(k2, context);
    vector<uint_> counts(block_count * k2, context);

    const buffer *input_buffer = &first.get_buffer();
    uint_ input_offset = static_cast<uint_>(first.get_index());
    const buffer *output_buffer = &output.get_buffer();
    uint_ output_offset = 0;
    const buffer *values_input_buffer = &values_first.get_buffer();
    uint_ values_input_offset = static_cast<uint_>(values_first.get_index());
    const buffer *values_output_buffer = &values_output.get_buffer();
    uint_ values_output_offset = 0;

    for(uint_ i = 0; i < sizeof(sort_type) * CHAR_BIT / k; i++){
        // write counts
        count_kernel.set_arg(0, *input_buffer);
        count_kernel.set_arg(1, input_offset);
        count_kernel.set_arg(2, static_cast<uint_>(count));
        count_kernel.set_arg(3, counts);
        count_kernel.set_arg(4, offsets);
        count_kernel.set_arg(5, block_size * sizeof(uint_), 0);
        count_kernel.set_arg(6, i * k);
//.........这里部分代码省略.........
开发者ID:2bbb,项目名称:compute,代码行数:101,代码来源:radix_sort.hpp


示例10: scan_on_cpu

inline OutputIterator scan_on_cpu(InputIterator first,
                                  InputIterator last,
                                  OutputIterator result,
                                  bool exclusive,
                                  T init,
                                  BinaryOperator op,
                                  command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type input_type;
    typedef typename
        std::iterator_traits<OutputIterator>::value_type output_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    const size_t compute_units = queue.get_device().compute_units();

    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    std::string cache_key =
        "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));

    // for inputs smaller than serial_scan_threshold
    // serial_scan algorithm is used
    uint_ serial_scan_threshold =
        parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
    serial_scan_threshold =
        (std::max)(serial_scan_threshold, uint_(compute_units));

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return result;
    }
    else if(count < serial_scan_threshold) {
        return serial_scan(first, last, result, exclusive, init, op, queue);
    }

    buffer block_partial_sums(context, sizeof(output_type) * compute_units );

    // create scan kernel
    meta_kernel k("scan_on_cpu_block_scan");

    // Arguments
    size_t count_arg = k.add_arg<uint_>("count");
    size_t init_arg = k.add_arg<output_type>("initial_value");
    size_t block_partial_sums_arg =
        k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");

    k <<
        "uint block = " <<
            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
        "uint index = get_global_id(0) * block;\n" <<
        "uint end = min(count, index + block);\n";

    if(!exclusive){
        k <<
            k.decl<output_type>("sum") << " = " <<
                first[k.var<uint_>("index")] << ";\n" <<
            result[k.var<uint_>("index")] << " = sum;\n" <<
            "index++;\n";
    }
    else {
        k <<
            k.decl<output_type>("sum") << ";\n" <<
            "if(index == 0){\n" <<
                "sum = initial_value;\n" <<
            "}\n" <<
            "else {\n" <<
                "sum = " << first[k.var<uint_>("index")] << ";\n" <<
                "index++;\n" <<
            "}\n";
    }

    k <<
        "while(index < end){\n" <<
            // load next value
            k.decl<const input_type>("value") << " = "
                << first[k.var<uint_>("index")] << ";\n";

    if(exclusive){
        k <<
            "if(get_global_id(0) == 0){\n" <<
                result[k.var<uint_>("index")] << " = sum;\n" <<
            "}\n";
    }
    k <<
            "sum = " << op(k.var<output_type>("sum"),
                           k.var<output_type>("value")) << ";\n";

    if(!exclusive){
        k <<
            "if(get_global_id(0) == 0){\n" <<
                result[k.var<uint_>("index")] << " = sum;\n" <<
            "}\n";
    }

    k <<
            "index++;\n" <<
        "}\n" << // end while
//.........这里部分代码省略.........
开发者ID:3Jade,项目名称:Sprawl,代码行数:101,代码来源:scan_on_cpu.hpp


示例11: duplicate_queue

/// Create command queue on the same context and device as the given one.
inline command_queue duplicate_queue(const command_queue &q) {
    return command_queue(q.context(), q.device(), q.flags());
}
开发者ID:,项目名称:,代码行数:4,代码来源:


示例12: select_context

/// Binds the specified CUDA context to the calling CPU thread.
inline void select_context(const command_queue &q) {
    q.context().set_current();
}
开发者ID:,项目名称:,代码行数:4,代码来源:


示例13: pad_vector

static inline std::vector<uint>
pad_vector(command_queue &q, const V &v, uint x) {
   std::vector<uint> w { v.begin(), v.end() };
   w.resize(q.device().max_block_size().size(), x);
   return w;
}
开发者ID:freedesktop-unofficial-mirror,项目名称:mesa__mesa,代码行数:6,代码来源:kernel.cpp


示例14: build_sources

/// Create and build a program from source string.
inline vex::backend::program build_sources(
        const command_queue &queue, const std::string &source,
        const std::string &options = ""
        )
{
#ifdef VEXCL_SHOW_KERNELS
    std::cout << source << std::endl;
#else
    if (getenv("VEXCL_SHOW_KERNELS"))
        std::cout << source << std::endl;
#endif

    std::string compile_options = options + " " + get_compile_options(queue);

    queue.context().set_current();

    auto cc = queue.device().compute_capability();
    std::ostringstream ccstr;
    ccstr << std::get<0>(cc) << std::get<1>(cc);

    sha1_hasher sha1;
    sha1.process(source)
        .process(queue.device().name())
        .process(compile_options)
        .process(ccstr.str())
        ;

    std::string hash = static_cast<std::string>(sha1);

    // Write source to a .cu file
    std::string basename = program_binaries_path(hash, true) + "kernel";
    std::string ptxfile  = basename + ".ptx";

    if ( !boost::filesystem::exists(ptxfile) ) {
        std::string cufile = basename + ".cu";

        {
            std::ofstream f(cufile);
            f << source;
        }

        // Compile the source to ptx.
        std::ostringstream cmdline;
        cmdline
            << "nvcc -ptx -O3"
            << " -arch=sm_" << std::get<0>(cc) << std::get<1>(cc)
            << " " << compile_options
            << " -o " << ptxfile << " " << cufile;
        if (0 != system(cmdline.str().c_str()) ) {
#ifndef VEXCL_SHOW_KERNELS
            std::cerr << source << std::endl;
#endif

            vex::detail::print_backtrace();
            throw std::runtime_error("nvcc invocation failed");
        }
    }

    // Load the compiled ptx.
    CUmodule prg;
    cuda_check( cuModuleLoad(&prg, ptxfile.c_str()) );

    return program(queue.context(), prg);
}
开发者ID:,项目名称:,代码行数:65,代码来源:


示例15: operator

 bool operator()(const command_queue &a, const command_queue &b) const {
     return a.get() < b.get();
 }
开发者ID:ddemidov,项目名称:vexcl,代码行数:3,代码来源:context.hpp


示例16: num_workgroups

 /// Standard number of workgroups to launch on a device.
 static inline size_t num_workgroups(const command_queue &q) {
     return 8 * q.device().multiprocessor_count();
 }
开发者ID:ds283,项目名称:vexcl,代码行数:4,代码来源:kernel.hpp


示例17: is_cpu

/// Checks if the compute device is CPU.
inline bool is_cpu(const command_queue &q) {
    return q.get_device().get_info<cl_device_type>(CL_DEVICE_TYPE) & CL_DEVICE_TYPE_CPU;
}
开发者ID:ddemidov,项目名称:vexcl,代码行数:4,代码来源:context.hpp


示例18: max_shared_memory_per_block

 /// The size in bytes of shared memory per block available for this kernel.
 size_t max_shared_memory_per_block(const command_queue &q) const {
     return q.device().max_shared_memory_per_block() -
         shared_size_bytes();
 }
开发者ID:ds283,项目名称:vexcl,代码行数:5,代码来源:kernel.hpp


示例19: get_device_id

/// Returns id of the device associated with the given queue.
inline device_id get_device_id(const command_queue &q) {
    return q.get_device().get();
}
开发者ID:ddemidov,项目名称:vexcl,代码行数:4,代码来源:context.hpp


示例20: serial_reduce_by_key

inline size_t serial_reduce_by_key(InputKeyIterator keys_first,
                                   InputKeyIterator keys_last,
                                   InputValueIterator values_first,
                                   OutputKeyIterator keys_result,
                                   OutputValueIterator values_result,
                                   BinaryFunction function,
                                   BinaryPredicate predicate,
                                   command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputValueIterator>::value_type value_type;
    typedef typename
        std::iterator_traits<InputKeyIterator>::value_type key_type;
    typedef typename
        ::boost::compute::result_of<BinaryFunction(value_type, value_type)>::type result_type;

    const context &context = queue.get_context();
    size_t count = detail::iterator_range_size(keys_first, keys_last);
    if(count < 1){
        return count;
    }

    meta_kernel k("serial_reduce_by_key");
    size_t count_arg = k.add_arg<uint_>("count");
    size_t result_size_arg = k.add_arg<uint_ *>(memory_object::global_memory,
                                                "result_size");

    k <<
        k.decl<result_type>("result") <<
            " = " << values_first[0] << ";\n" <<
        k.decl<key_type>("previous_key") << " = " << keys_first[0] << ";\n" <<
        k.decl<result_type>("value") << ";\n" <<
        k.decl<key_type>("key") << ";\n" <<

        k.decl<uint_>("size") << " = 1;\n" <<

        keys_result[0] << " = previous_key;\n" <<
        values_result[0] << " = result;\n" <<

        "for(ulong i = 1; i < count; i++) {\n" <<
        "    value = " << values_first[k.var<uint_>("i")] << ";\n" <<
        "    key = " << keys_first[k.var<uint_>("i")] << ";\n" <<
        "    if (" << predicate(k.var<key_type>("previous_key"),
                                k.var<key_type>("key")) << ") {\n" <<

        "        result = " << function(k.var<result_type>("result"),
                                        k.var<result_type>("value")) << ";\n" <<
        "    }\n " <<
        "    else { \n" <<
                 keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n" <<
                 values_result[k.var<uint_>("size - 1")] << " = result;\n" <<
        "        result = value;\n" <<
        "        size++;\n" <<
        "    } \n" <<
        "    previous_key = key;\n" <<
        "}\n" <<
        keys_result[k.var<uint_>("size - 1")] << " = previous_key;\n" <<
        values_result[k.var<uint_>("size - 1")] << " = result;\n" <<
        "*result_size = size;";

    kernel kernel = k.compile(context);

    scalar<uint_> result_size(context);
    kernel.set_arg(result_size_arg, result_size.get_buffer());
    kernel.set_arg(count_arg, static_cast<uint_>(count));

    queue.enqueue_task(kernel);

    return static_cast<size_t>(result_size.read(queue));
}
开发者ID:AbhinavJain13,项目名称:turicreate,代码行数:70,代码来源:serial_reduce_by_key.hpp



注:本文中的command_queue类示例由纯净天空整理自Github/MSDocs等源码及文档


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
C++ command_reader类代码示例发布时间:2022-05-31
下一篇:
C++ command_line_parser类代码示例发布时间:2022-05-31
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap