
Python job.MRJob Class Code Examples


This article collects typical usage examples of the mrjob.job.MRJob class in Python. If you have been wondering exactly what the MRJob class does, how to use it, or where to find working examples, the curated class code examples below should help.



A total of 20 code examples of the MRJob class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; that feedback helps the site surface better Python code examples.
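For readers new to mrjob, a minimal word-count job helps frame the examples below. This is only a sketch modeled on mrjob's own tutorial example; the class name and the script name mr_word_count.py are illustrative.

from mrjob.job import MRJob


class MRWordFrequencyCount(MRJob):

    def mapper(self, _, line):
        # Emit one (word, 1) pair per whitespace-separated token.
        for word in line.split():
            yield word.lower(), 1

    def reducer(self, word, counts):
        # Sum the per-word counts emitted by all mappers.
        yield word, sum(counts)


if __name__ == '__main__':
    MRWordFrequencyCount.run()

Saved as mr_word_count.py, it can be run locally with: python mr_word_count.py input.txt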

Example 1: reducer

    def reducer(self, n, vars):
        MRJob.set_status(self, "=============>  reducer called")

        samples_from_mappers = []
        counts_from_mappers = []

        # First read all the counts from the different mappers so we know the total number of items and can give
        # each of the sets coming from different mappers its appropriate weight
        total_counts_from_mappers = 0

        for x in vars:
            input = json.loads(x)
            total_counts_from_mappers += input[0]

            counts_from_mappers.append(input[0])
            samples_from_mappers.append(input[1])

        # Now, based on the number of samples in each mapper, select the appropriate number of samples from
        # samples_from_mappers
        i = 0
        for sample_set in samples_from_mappers:
            weight = counts_from_mappers[i] * 1.0 / total_counts_from_mappers
            number_of_needed_samples = int(round(weight * self.options.sample_size))

            for j in range(number_of_needed_samples):
                yield 1, sample_set.pop()

            i += 1
Developer: ddehghan | Project: machine_learning_class | Lines: 28 | Source: mrSample.py
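The reducer above reads self.options.sample_size, which is not defined in this snippet. A minimal sketch of the missing wiring, assumed rather than taken from the original project (configure_args()/add_passthru_arg() are the mrjob 0.6+ names; older releases used configure_options()/add_passthrough_option()):

from mrjob.job import MRJob


class MRSample(MRJob):  # hypothetical class name

    def configure_args(self):
        super(MRSample, self).configure_args()
        # Exposes --sample-size on the command line as self.options.sample_size.
        self.add_passthru_arg(
            '--sample-size', type=int, default=100,
            help='total number of samples to keep across all mappers')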


Example 2: main

def main(cl_args=None):
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet,
                         verbose=options.verbose)

    # max_hours_idle -> max_mins_idle
    max_mins_idle = options.max_mins_idle
    if max_mins_idle is None and options.max_hours_idle is not None:
        log.warning('--max-hours-idle is deprecated and will be removed'
                    ' in v0.7.0. Please use --max-mins-idle instead.')
        max_mins_idle = options.max_hours_idle * 60

    if options.mins_to_end_of_hour is not None:
        log.warning('--mins-to-end-of-hour is deprecated as of v0.6.0'
                    ' and does nothing')

    _maybe_terminate_clusters(
        dry_run=options.dry_run,
        max_mins_idle=max_mins_idle,
        unpooled_only=options.unpooled_only,
        now=_boto3_now(),
        pool_name=options.pool_name,
        pooled_only=options.pooled_only,
        max_mins_locked=options.max_mins_locked,
        quiet=options.quiet,
        **_runner_kwargs(options)
    )
Developer: okomestudio | Project: mrjob | Lines: 29 | Source: terminate_idle_clusters.py


Example 3: test_cmd_line_options

    def test_cmd_line_options(self):
        mr_job = MRJob(
            ["--partitioner", "java.lang.Object", "--partitioner", "org.apache.hadoop.mapreduce.Partitioner"]
        )

        # second option takes priority
        self.assertEqual(mr_job.job_runner_kwargs()["partitioner"], "org.apache.hadoop.mapreduce.Partitioner")
Developer: ndimiduk | Project: mrjob | Lines: 7 | Source: test_job.py
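The --partitioner switch tested above has a class-attribute counterpart. A hedged sketch (PARTITIONER is a documented MRJob attribute; the value below is a standard Hadoop partitioner class, and the job name is illustrative):

from mrjob.job import MRJob


class MRHashPartitioned(MRJob):

    # Ask Hadoop to route keys to reducers with its default hash partitioner.
    PARTITIONER = 'org.apache.hadoop.mapred.lib.HashPartitioner'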


Example 4: reducer_final

    def reducer_final(self):

        MRJob.set_status(self, "=============>  reducer final called")

        for label in self.output:
            stratum_samples = self.output[label]
            yield label, (len(stratum_samples), stratum_samples)
Developer: marionleborgne | Project: machine_learning | Lines: 7 | Source: mrStratify.py


Example 5: test_spark

    def test_spark(self):
        job = MRJob(["--spark", "input_dir", "output_dir"])
        job.spark = MagicMock()

        job.execute()

        job.spark.assert_called_once_with("input_dir", "output_dir")
Developer: davidmarin | Project: mrjob | Lines: 7 | Source: test_job.py
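The test above mocks out job.spark; in a real job that method receives the input and output paths and drives Spark directly. A sketch along the lines of the word-count example in mrjob's documentation, assuming pyspark is available where the step runs:

from mrjob.job import MRJob


class MRSparkWordcount(MRJob):

    def spark(self, input_path, output_path):
        # Import pyspark lazily so the job can still be instantiated
        # on machines that do not have Spark installed.
        from pyspark import SparkContext

        sc = SparkContext(appName='mrjob spark wordcount')
        counts = (sc.textFile(input_path)
                    .flatMap(lambda line: line.split())
                    .map(lambda word: (word, 1))
                    .reduceByKey(lambda a, b: a + b))
        counts.saveAsTextFile(output_path)
        sc.stop()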


Example 6: mapper_final

    def mapper_final(self):
        MRJob.set_status(self, "=============>  mapper final called")

        out = [self.count, self.samples]
        jOut = json.dumps(out)

        yield 1, jOut
Developer: AshKash | Project: kit-sink | Lines: 7 | Source: mrSample.py


Example 7: test_bytes_value_protocol

    def test_bytes_value_protocol(self):
        job = MRJob()
        job.OUTPUT_PROTOCOL = BytesValueProtocol

        self.assertEqual(
            job.parse_output_line(b'one two\n'),
            (None, b'one two\n'))
Developer: okomestudio | Project: mrjob | Lines: 7 | Source: test_job.py
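The test assigns OUTPUT_PROTOCOL on an instance; in ordinary jobs it is declared as a class attribute. A minimal sketch with an illustrative class name:

from mrjob.job import MRJob
from mrjob.protocol import BytesValueProtocol


class MRRawLines(MRJob):

    # Write each output value as raw bytes: no key, no JSON encoding.
    OUTPUT_PROTOCOL = BytesValueProtocol

    def mapper(self, _, line):
        # Input lines arrive as text; encode them before handing them to
        # BytesValueProtocol, which expects bytes.
        yield None, line.encode('utf_8')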


Example 8: test_spark_method

    def test_spark_method(self):
        j = MRJob(["--no-conf"])
        j.spark = MagicMock()

        self.assertEqual(j.steps(), [SparkStep(j.spark)])

        self.assertEqual(j._steps_desc(), [dict(type="spark", spark_args=[])])
Developer: davidmarin | Project: mrjob | Lines: 7 | Source: test_job.py


Example 9: test_default_protocol

    def test_default_protocol(self):
        job = MRJob()

        data = iter([b'1\t2', b'\n{"3": ', b'4}\t"fi', b've"\n'])
        self.assertEqual(
            list(job.parse_output(data)),
            [(1, 2), ({'3': 4}, 'five')])
Developer: okomestudio | Project: mrjob | Lines: 7 | Source: test_job.py
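parse_output() is normally fed the bytes streamed back from a runner once the job has finished. A minimal sketch of that call site (MRWordFrequencyCount is the illustrative class sketched near the top of this page; cat_output() is the mrjob 0.6+ name, older releases paired stream_output() with parse_output_line()):

mr_job = MRWordFrequencyCount(args=['input.txt'])
with mr_job.make_runner() as runner:
    runner.run()
    for key, value in mr_job.parse_output(runner.cat_output()):
        # key and value are already decoded by the job's output protocol.
        print(key, value)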


Example 10: main

def main(args):
    # parse command-line args
    usage = '%prog [options]'
    description = "Collect EMR stats from active jobflows. "
    description += "Active jobflows are those in states of: "
    description += "BOOTSTRAPPING, RUNNING, STARTING, and WAITING. "
    description += "Collected stats include total number of active jobflows"
    description += "and total number of Amazon EC2 instances used to execute"
    description += "these jobflows. The instance counts are not separated by"
    description += "instance type."
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        "-p", "--pretty-print",
        action="store_true", dest="pretty_print", default=False,
        help=('Pretty print the collected stats'))
    add_basic_opts(option_parser)

    options, args = option_parser.parse_args(args)
    if args:
        option_parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)
    log.info('collecting EMR active jobflows...')
    job_flows = collect_active_job_flows(options.conf_paths)
    log.info('compiling stats from collected jobflows...')
    stats = job_flows_to_stats(job_flows)

    if options.pretty_print:
        pretty_print(stats)
    else:
        print(json.dumps(stats))
Developer: tempcyc | Project: mrjob | Lines: 31 | Source: collect_emr_stats.py


Example 11: main

def main(args=None):
    now = _boto3_now()

    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    if not options.exclude:
        filtered_cluster_summaries = cluster_summaries
    else:
        filtered_cluster_summaries = _filter_clusters(
            cluster_summaries, emr_client, options.exclude)

    job_info = _find_long_running_jobs(
        emr_client, filtered_cluster_summaries, min_time, now=now)

    _print_report(job_info)
Developer: Affirm | Project: mrjob | Lines: 27 | Source: report_long_jobs.py


Example 12: __init__

    def __init__(self, *args, **kwargs):
        MRJob.__init__(self, *args, **kwargs)

        ## load entities from json file
        log("loading entity list")
        entities = json.load(urllib.urlopen("https://s3.amazonaws.com/trec-kba-2012/entity-urlnames.json"))
        self.entity_representations = toy_kba_algorithm.prepare_entities(entities)
Developer: SHENbeyond | Project: kba-tools | Lines: 7 | Source: toy_kba_mrjob.py


Example 13: reducer

    def reducer(self, n, vars):
        MRJob.set_status(self, "=============>  reducer called")

        print "reducer:", vars
        samples_from_mappers = []
        counts_from_mappers = []

        # First read all the counts from the different mappers so we know the total number of items and can give
        # each of the sets coming from different mappers its appropriate weight
        total_counts_from_mappers = 0

        for x in vars:
            input = json.loads(x)
            total_counts_from_mappers += input[0]

            counts_from_mappers.append(input[0])
            samples_from_mappers.append(input[1])

        # Now, based on the number of samples in each mapper, select the appropriate number of samples from
        # samples_from_mappers
        i = 0

        fileOut = open(os.path.join(PROJECT_ROOT, 'output.txt'), "w")

        for sample_set in samples_from_mappers:
            weight = counts_from_mappers[i] * 1.0 / total_counts_from_mappers
            number_of_needed_samples = int(round(weight * self.options.sample_size))

            for j in range(number_of_needed_samples):
                fileOut.write(str(sample_set.pop()) + '\n')


            i += 1
        fileOut.close()
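        # The dead 'yield' below never runs; it only exists so this reducer is
        # still a generator even though its real output goes to output.txt.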
        if False: yield 1,2
Developer: AshKash | Project: kit-sink | Lines: 35 | Source: mrSample.py


Example 14: test_wrong_type_of_step

    def test_wrong_type_of_step(self):
        mr_job = MRJob()
        mr_job.spark = MagicMock()

        self.assertRaises(TypeError, mr_job.run_mapper)
        self.assertRaises(TypeError, mr_job.run_combiner)
        self.assertRaises(TypeError, mr_job.run_reducer)
Developer: okomestudio | Project: mrjob | Lines: 7 | Source: test_job.py


Example 15: test_deprecated_mapper_final_positional_arg

    def test_deprecated_mapper_final_positional_arg(self):
        def mapper(k, v):
            pass

        def reducer(k, v):
            pass

        def mapper_final():
            pass

        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.job', stderr)
            step = MRJob.mr(mapper, reducer, mapper_final)

        # should be allowed to specify mapper_final as a positional arg,
        # but we log a warning
        self.assertEqual(
            step,
            MRJob.mr(
                mapper=mapper, reducer=reducer, mapper_final=mapper_final))
        self.assertIn('mapper_final should be specified', stderr.getvalue())

        # can't specify mapper_final as a positional and keyword arg
        self.assertRaises(
            TypeError,
            MRJob.mr,
            mapper,
            reducer,
            mapper_final,
            mapper_final=mapper_final)
Developer: bchess | Project: mrjob | Lines: 31 | Source: test_job.py


Example 16: test_empty

    def test_empty(self):
        mr_job = MRJob()

        self.assertEqual(mr_job._runner_kwargs()['hadoop_input_format'],
                         None)
        self.assertEqual(mr_job._runner_kwargs()['hadoop_output_format'],
                         None)
Developer: okomestudio | Project: mrjob | Lines: 7 | Source: test_job.py
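The runner kwargs checked above are normally populated from the HADOOP_INPUT_FORMAT and HADOOP_OUTPUT_FORMAT class attributes. A hedged sketch (the job name is illustrative; the format class names are standard Hadoop classes):

from mrjob.job import MRJob


class MRSequenceFileJob(MRJob):

    # Read SequenceFiles as text and write SequenceFiles back out.
    HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.SequenceFileAsTextInputFormat'
    HADOOP_OUTPUT_FORMAT = 'org.apache.hadoop.mapred.SequenceFileOutputFormat'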


Example 17: test_mr

    def test_mr(self):

        def mapper(k, v):
            pass

        def mapper_init():
            pass

        def mapper_final():
            pass

        def reducer(k, vs):
            pass

        def reducer_init():
            pass

        def reducer_final():
            pass

        # make sure it returns the format we currently expect
        self.assertEqual(MRJob.mr(mapper, reducer),
                         stepdict(mapper, reducer))
        self.assertEqual(MRJob.mr(mapper, reducer,
                                  mapper_init=mapper_init,
                                  mapper_final=mapper_final,
                                  reducer_init=reducer_init,
                                  reducer_final=reducer_final),
                         stepdict(mapper, reducer,
                                  mapper_init=mapper_init,
                                  mapper_final=mapper_final,
                                  reducer_init=reducer_init,
                                  reducer_final=reducer_final))
        self.assertEqual(MRJob.mr(mapper),
                         stepdict(mapper))
Developer: DrMavenRebe | Project: mrjob | Lines: 35 | Source: test_job.py


Example 18: test_verbose

    def test_verbose(self):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging(verbose=True)
            log = logging.getLogger('__main__')
            log.info('INFO')
            log.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
Developer: Yelp | Project: mrjob | Lines: 7 | Source: test_launch.py


Example 19: main

def main(cl_args=None):
    parser = _make_arg_parser()
    options = parser.parse_args(cl_args)

    runner_alias = options.runner or _DEFAULT_RUNNER
    runner_class = _runner_class(runner_alias)

    if options.help or not options.script_or_jar:
        _print_help(options, runner_class)
        sys.exit(0)

    MRJob.set_up_logging(
        quiet=options.quiet,
        verbose=options.verbose,
    )

    kwargs = _get_runner_opt_kwargs(options, runner_class)
    kwargs.update(_HARD_CODED_OPTS)

    kwargs['input_paths'] = [os.devnull]

    step = _get_step(options, parser, cl_args)
    kwargs['steps'] = [step.description()]

    runner = runner_class(**kwargs)

    try:
        runner.run()
    finally:
        runner.cleanup()
Developer: Affirm | Project: mrjob | Lines: 30 | Source: spark_submit.py


Example 20: test_default_options

    def test_default_options(self):
        with no_handlers_for_logger('__main__'):
            with patch.object(sys, 'stderr', StringIO()) as stderr:
                MRJob.set_up_logging()
                log = logging.getLogger('__main__')
                log.info('INFO')
                log.debug('DEBUG')
                self.assertEqual(stderr.getvalue(), 'INFO\n')
Developer: etiennebatise | Project: mrjob | Lines: 8 | Source: test_launch.py



Note: the mrjob.job.MRJob class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by their respective developers, and copyright remains with the original authors; consult each project's license before redistributing or reusing the code. Please do not reproduce this article without permission.

