
Python utils.make_classification_data Function Code Examples


This article collects and summarizes typical usage examples of the Python function utils.make_classification_data. If you have been wondering exactly how make_classification_data is used, or are looking for concrete examples of calling it, the curated code samples below should help.



A total of 20 code examples of the make_classification_data function are shown below, sorted by popularity by default. You can upvote the examples you find useful; that feedback helps surface better Python code examples. A minimal sketch of a typical call is shown first, followed by the collected examples.
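
Before diving into the collected examples, here is a minimal sketch of a typical call, inferred purely from the usages below. Note that make_classification_data is a helper defined in the SKLL project's test suite (tests/utils.py), not a documented public API, so the exact signature and return values shown here are assumptions based on these examples; it builds a synthetic classification FeatureSet pair for testing.

# Minimal usage sketch (assumes the SKLL test-utilities module is importable
# as `utils`, e.g. when running from the skll tests directory).
from utils import make_classification_data

# Generate synthetic classification data: 100 examples, 4 numeric features,
# 3 labels, split 80/20 into training and test feature sets.
train_fs, test_fs = make_classification_data(num_examples=100,
                                             num_features=4,
                                             num_labels=3,
                                             train_test_ratio=0.8)

print(len(train_fs), len(test_fs))         # number of instances in each set
print(train_fs.vectorizer.feature_names_)  # e.g. ['f01', 'f02', 'f03', 'f04']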

Example 1: test_merge_missing_labels

def test_merge_missing_labels():
    """
    Test to ensure that labels are successfully copied when merging
    """

    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a different feature set with no labels specified
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      empty_labels=True,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # merge the two featuresets in different orders
    fs12 = fs1 + fs2
    fs21 = fs2 + fs1

    # make sure that the labels are the same after merging
    assert_array_equal(fs12.labels, fs1.labels)
    assert_array_equal(fs21.labels, fs1.labels)
Developer: BK-University, Project: skll, Lines: 26, Source: test_featureset.py


Example 2: test_subtract

def test_subtract():
    """
    Test to ensure that subtraction works
    """

    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=1234)

    # create a second feature set with two features whose names (f01, f02)
    # also appear in fs1 but whose values are different
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=2,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=5678)

    # subtract fs2 from fs1, i.e., the features that appear in fs2
    # should be removed from fs1 but nothing else should change
    fs = fs1 - fs2

    # ensure that the labels are the same in fs and fs1
    assert_array_equal(fs.labels, fs1.labels)

    # ensure that there are only two features left
    eq_(fs.features.shape[1], 2)

    # and that they are f3 and f4
    assert_array_equal(np.array(fs.vectorizer.feature_names_), ['f03', 'f04'])
Developer: BK-University, Project: skll, Lines: 32, Source: test_featureset.py


Example 3: check_print_model_weights

def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output',
                      'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weight command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
Developer: MechCoder, Project: skll, Lines: 60, Source: test_utilities.py


Example 4: test_string_feature

def test_string_feature():
    """
    Test to make sure that string-valued features are properly
    encoded as binary features
    """
    # create a featureset that is derived from an original
    # set of features containing 3 numeric features and
    # one string-valued feature that can take six possible
    # values from 'a' to 'f'. This means that the
    # featureset will have 3 numeric + 6 binary features.
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     one_string_feature=True,
                                     num_string_values=6,
                                     train_test_ratio=1.0)

    # confirm that the number of features is as expected
    eq_(fs.features.shape, (100, 9))

    # confirm the feature names
    eq_(fs.vectorizer.feature_names_, ['f01', 'f02', 'f03',
                                       'f04=a', 'f04=b', 'f04=c',
                                       'f04=d', 'f04=e', 'f04=f'])

    # confirm that the final six features are binary
    assert_array_equal(fs.features[:, [3, 4, 5, 6, 7, 8]].data, 1)
Developer: BK-University, Project: skll, Lines: 27, Source: test_featureset.py


Example 5: test_learner_api_load_into_existing_instance

def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` works as expected
    """

    # create a LinearSVC instance and train it on some data
    learner1 = Learner('LinearSVC')
    (train_fs,
     test_fs) = make_classification_data(num_examples=200,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # now use `load()` to replace the existing instance with a
    # different saved learner
    other_model_file = join(_my_dir, 'other', 'test_load_saved_model.{}.model'.format(sys.version_info[0]))
    learner1.load(other_model_file)

    # now load the saved model into another instance using the class method
    # `from_file()`
    learner2 = Learner.from_file(other_model_file)

    # check that the two instances are now basically the same
    eq_(learner1.model_type, learner2.model_type)
    eq_(learner1.model_params, learner2.model_params)
    eq_(learner1.model_kwargs, learner2.model_kwargs)
Developer: EducationalTestingService, Project: skll, Lines: 27, Source: test_classification.py


Example 6: check_train_and_score_function

def check_train_and_score_function(model_type):
    """
    Check that the _train_and_score() function works as expected
    """

    # create train and test data
    (train_fs,
     test_fs) = make_classification_data(num_examples=500,
                                         train_test_ratio=0.7,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)

    # call _train_and_score() on this data
    estimator_name = 'LogisticRegression' if model_type == 'classifier' else 'Ridge'
    metric = 'accuracy' if model_type == 'classifier' else 'pearson'
    learner1 = Learner(estimator_name)
    train_score1, test_score1 = _train_and_score(learner1, train_fs, test_fs, metric)

    # this should yield identical results when training another instance
    # of the same learner without grid search and shuffling and evaluating
    # that instance on the train and the test set
    learner2 = Learner(estimator_name)
    learner2.train(train_fs, grid_search=False, shuffle=False)
    train_score2 = learner2.evaluate(train_fs, output_metrics=[metric])[-1][metric]
    test_score2 = learner2.evaluate(test_fs, output_metrics=[metric])[-1][metric]

    eq_(train_score1, train_score2)
    eq_(test_score1, test_score2)
Developer: EducationalTestingService, Project: skll, Lines: 29, Source: test_classification.py


Example 7: check_filter_labels

def check_filter_labels(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=1000,
                                     num_features=4,
                                     num_labels=5,
                                     train_test_ratio=1.0)

    # keep just the instances with labels 0, 1 and 2
    labels_to_filter = [0, 1, 2]

    # do the actual filtering
    fs.filter(labels=labels_to_filter, inverse=inverse)

    # make sure that we removed the right things
    if inverse:
        ids_kept = fs.ids[np.where(np.logical_not(np.in1d(fs.labels,
                                                          labels_to_filter)))]
    else:
        ids_kept = fs.ids[np.where(np.in1d(fs.labels, labels_to_filter))]

    assert_array_equal(fs.ids, np.array(ids_kept))

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
Developer: BK-University, Project: skll, Lines: 26, Source: test_featureset.py


Example 8: make_single_file_featureset_data

def make_single_file_featureset_data():
    """
    Write a training file and a test file for tests that check whether
    specifying train_file and test_file actually works.
    """
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=2,
                                                 num_features=3,
                                                 non_negative=False)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'train_single_file.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_single_file.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # Also write another test feature set that has fewer features than the training set
    test_fs.filter(features=['f01', 'f02'])
    test_path = join(_my_dir, 'test', 'test_single_file_subset.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()
Developer: EducationalTestingService, Project: skll, Lines: 26, Source: test_classification.py


Example 9: test_skll_convert_libsvm_map

def test_skll_convert_libsvm_map():
    """
    Test to check whether the --reuse_libsvm_map option works for skll_convert
    """

    # create some simple classification data
    orig_fs, _ = make_classification_data(train_test_ratio=1.0,
                                          one_string_feature=True)

    # now write out this feature set as a libsvm file
    orig_libsvm_file = join(_my_dir, 'other',
                            'test_skll_convert_libsvm_map.libsvm')
    writer = LibSVMWriter(orig_libsvm_file, orig_fs, quiet=True)
    writer.write()

    # now make a copy of the dataset
    swapped_fs = copy.deepcopy(orig_fs)

    # now modify this new featureset to swap the first two columns
    del swapped_fs.vectorizer.vocabulary_['f01']
    del swapped_fs.vectorizer.vocabulary_['f02']
    swapped_fs.vectorizer.vocabulary_['f01'] = 1
    swapped_fs.vectorizer.vocabulary_['f02'] = 0
    tmp = swapped_fs.features[:, 0]
    swapped_fs.features[:, 0] = swapped_fs.features[:, 1]
    swapped_fs.features[:, 1] = tmp

    # now write out this new feature set as a MegaM file
    swapped_megam_file = join(_my_dir, 'other',
                              'test_skll_convert_libsvm_map.megam')
    writer = MegaMWriter(swapped_megam_file, swapped_fs, quiet=True)
    writer.write()

    # now run skll_convert to convert this into a libsvm file
    # but using the mapping specified in the first libsvm file
    converted_libsvm_file = join(_my_dir, 'other',
                                 'test_skll_convert_libsvm_map2.libsvm')

    # now call skll convert's main function
    skll_convert_cmd = ['--reuse_libsvm_map', orig_libsvm_file,
                        '--quiet', orig_libsvm_file,
                        converted_libsvm_file]
    err = ''
    try:
        old_stderr = sys.stderr
        sys.stderr = mystderr = StringIO()
        sk.main(skll_convert_cmd)
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        print(err)

    # now read the converted libsvm file into a featureset
    reader = LibSVMReader(converted_libsvm_file, quiet=True)
    converted_fs = reader.read()

    # now ensure that this new featureset and the original
    # featureset are the same
    eq_(orig_fs, converted_fs)
Developer: MechCoder, Project: skll, Lines: 59, Source: test_utilities.py


Example 10: check_generate_predictions_console

def check_generate_predictions_console(use_threshold=False):

    # create some simple classification data without feature hashing
    train_fs, test_fs = make_classification_data(num_examples=1000,
                                                 num_features=5)

    # save the test feature set to an NDJ file
    input_file = join(_my_dir, 'test',
                      'test_generate_predictions.jsonlines')
    writer = NDJWriter(input_file, test_fs)
    writer.write()

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output',
                      'test_generate_predictions_console.model')
    learner.save(model_file)

    # now call main() from generate_predictions.py
    generate_cmd = []
    if use_threshold:
        generate_cmd.append('-t {}'.format(threshold))
    generate_cmd.extend([model_file, input_file])

    # we need to capture stdout since that's what main() writes to
    err = ''
    try:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = mystdout = StringIO()
        sys.stderr = mystderr = StringIO()
        gp.main(generate_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
        predictions_after_saving = [int(x) for x in out.strip().split('\n')]
        eq_(predictions, predictions_after_saving)
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        print(err)
Developer: MechCoder, Project: skll, Lines: 57, Source: test_utilities.py


Example 11: test_custom_learner_model_loading

def test_custom_learner_model_loading():
    num_labels = 10

    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_model_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_model_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # run the configuration that trains the custom model and saves it
    cfgfile = 'test_model_save_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    # save the predictions from disk into memory
    # and delete the predictions file
    outprefix = 'test_model_custom_learner'
    pred_file = join(_my_dir, 'output',
                     '{}_{}_CustomLogisticRegressionWrapper'
                     '.predictions'.format(outprefix,
                                           outprefix))
    preds1 = read_predictions(pred_file)
    os.unlink(pred_file)

    # run the configuration that loads the saved model
    # and generates the predictions again
    cfgfile = 'test_model_load_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, overwrite=False, quiet=True)

    # load the newly generated predictions
    preds2 = read_predictions(pred_file)

    # make sure that they are the same as before
    assert_array_equal(preds1, preds2)
Developer: BK-University, Project: skll, Lines: 54, Source: test_custom_learner.py


Example 12: test_merge_different_vectorizers

def test_merge_different_vectorizers():
    """
    Test to ensure rejection of merging featuresets with different vectorizers
    """

    # create a featureset each with a DictVectorizer
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create another featureset using hashing
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True)
    # This should raise a ValueError
    fs1 + fs2
Developer: BK-University, Project: skll, Lines: 20, Source: test_featureset.py


Example 13: test_length

def test_length():
    """
    Test whether len() returns the number of instances
    """

    # create a featureset
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    eq_(len(fs), 100)
Developer: BK-University, Project: skll, Lines: 12, Source: test_featureset.py


Example 14: test_empty_labels

def test_empty_labels():
    """
    Test to check behaviour when labels is None
    """

    # create a feature set with empty labels
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     empty_labels=True,
                                     train_test_ratio=1.0)
    assert np.isnan(fs.labels).all()
Developer: BK-University, Project: skll, Lines: 12, Source: test_featureset.py


Example 15: test_write_hashed_featureset

def test_write_hashed_featureset():
    """
    Test to check that hashed featuresets cannot be written out
    """
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     use_feature_hashing=True,
                                     feature_bins=2,
                                     random_state=1234)
    output_dir = join(_my_dir, 'output')
    writer = NDJWriter(join(output_dir, 'foo.jsonlines'), fs)
    writer.write()
Developer: EducationalTestingService, Project: skll, Lines: 12, Source: test_featureset.py


Example 16: check_learner_api_grid_search_no_objective

def check_learner_api_grid_search_no_objective(task='train'):

    (train_fs,
     test_fs) = make_classification_data(num_examples=500,
                                         train_test_ratio=0.7,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)
    learner = Learner('LogisticRegression')
    if task == 'train':
        _ = learner.train(train_fs)
    else:
        _ = learner.cross_validate(train_fs)
Developer: EducationalTestingService, Project: skll, Lines: 13, Source: test_classification.py


Example 17: test_all_new_labels_in_test

def test_all_new_labels_in_test():
    """
    Test classification with all labels in test set unseen
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change all test labels
    test_fs.labels = test_fs.labels + 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 6, [3, 4, 5]
    yield assert_almost_equal, res[1], 0
Developer: EducationalTestingService, Project: skll, Lines: 14, Source: test_classification.py


Example 18: test_merge_different_hashers

def test_merge_different_hashers():
    """
    Test to ensure rejection of merging featuresets with different FeatureHashers
    """

    # create a feature set with 4 feature hashing bins
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=4)

    # create a second feature set with 3 feature hashing bins
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=3)
    # This should raise a ValueError
    fs1 + fs2
Developer: BK-University, Project: skll, Lines: 23, Source: test_featureset.py


Example 19: test_new_labels_in_test_set

def test_new_labels_in_test_set():
    """
    Test classification experiment with an unseen label in the test set.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # add new labels to the test set
    test_fs.labels[-3:] = 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 4, [3]
    yield assert_almost_equal, res[1], 0.3
Developer: EducationalTestingService, Project: skll, Lines: 14, Source: test_classification.py


Example 20: check_predict

def check_predict(model, use_feature_hashing=False):
    """
    This tests whether the predict task runs and generates the same
    number of predictions as there are samples in the test set. The specified
    model indicates whether to generate random regression
    or classification data.
    """

    # create the random data for the given model
    if model._estimator_type == 'regressor':
        train_fs, test_fs, _ = \
            make_regression_data(use_feature_hashing=use_feature_hashing,
                                 feature_bins=5)
    # feature hashing will not work for Naive Bayes since it requires
    # non-negative feature values
    elif model.__name__ == 'MultinomialNB':
        train_fs, test_fs = \
            make_classification_data(use_feature_hashing=False,
                                     non_negative=True)
    else:
        train_fs, test_fs = \
            make_classification_data(use_feature_hashing=use_feature_hashing,
                                     feature_bins=25)

    # create the learner with the specified model
    learner = Learner(model.__name__)

    # now train the learner on the training data and use feature hashing when
    # specified and when we are not using a Naive Bayes model
    learner.train(train_fs, grid_search=False)

    # now make predictions on the test set
    predictions = learner.predict(test_fs)

    # make sure we have the same number of outputs as the
    # number of test set samples
    eq_(len(predictions), test_fs.features.shape[0])
Developer: EducationalTestingService, Project: skll, Lines: 37, Source: test_classification.py



Note: The utils.make_classification_data examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by their authors; copyright remains with the original authors, and distribution and use must follow the corresponding project's license. Do not reproduce without permission.

