
Python model_selection.train_test_split Function Code Examples


This article collects typical usage examples of Python's sklearn.model_selection.train_test_split function. If you are trying to work out exactly how to call train_test_split, how it behaves, or what real-world uses look like, the curated examples below should help.



Twenty code examples of the train_test_split function are shown below, drawn from open-source projects and ordered by popularity.
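Before the collected examples, here is a minimal, self-contained sketch of the two calls that recur throughout them: a single train/test split, followed by a chained second split that carves a validation set out of the training half. The toy X and y arrays are made up purely for illustration.

    import numpy as np
    from sklearn.model_selection import train_test_split

    X = np.arange(40).reshape(20, 2)   # 20 samples, 2 features (toy data)
    y = np.array([0, 1] * 10)          # balanced binary labels

    # Hold out 25% for testing; random_state makes the split reproducible,
    # and stratify=y keeps the class ratio equal in both halves.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y)

    # A second split carves a validation set out of the training half,
    # the same pattern several of the examples below use.
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42)

    print(X_train.shape, X_val.shape, X_test.shape)  # (12, 2) (3, 2) (5, 2)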

Example 1: learning

	def learning( self):

		X = self.X
		y = self.y
		print( "Shape of X and y are", X.shape, y.shape)

		X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
			test_size=0.2, random_state=42)
		X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train,
														  test_size=0.2, random_state=42)

		val_monitor = skflow.monitors.ValidationMonitor(X_val, y_val,
														early_stopping_rounds=200)
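		# note: skflow (pre-1.0 TensorFlow Learn) is a legacy API; TensorFlowDNNRegressor is not available in modern TensorFlow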
		model = skflow.TensorFlowDNNRegressor(hidden_units=[100, 50, 10], steps=5000)
		model.fit(X_train, y_train, val_monitor)

		yP = model.predict(X_test)
		score_r2 = metrics.r2_score(y_test, yP)
		score_MedAE = metrics.median_absolute_error(y_test, yP)
		print('Accuracy')
		print('--------')
		print('R2: {0:f}, MedAE: {1:f}'.format(score_r2, score_MedAE))

		if self.graph:
			kutil.regress_show4( y_test, yP)
Developer: jskDr, Project: jamespy_py3, Lines: 25, Source: jmultidk.py


Example 2: lda_tuner

def lda_tuner(ingroup_otu, best_models):

    best_score = -1*np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0

    for topics in topic_series: 
        for dtp in dtp_series:
            for twp in twp_series:
                eval_counter +=1
                X_train, X_test = train_test_split(X, test_size=0.5)
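                # note: n_topics was renamed to n_components in scikit-learn 0.19 and removed in 0.21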
                lda = LatentDirichletAllocation(n_topics=topics, 
                                                doc_topic_prior=dtp, 
                                                topic_word_prior=twp, 
                                                learning_method='batch',
                                                random_state=42,
                                                max_iter=20)
                lda.fit(X_train)
                this_score = lda.score(X_test)
                this_perplexity = lda.perplexity(X_test)
                if this_score > best_score:
                    best_score = this_score
                    print "New Max Likelihood: {}".format(best_score)

                print "#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(eval_counter, 
                                                                 topics, dtp, twp,
                                                                 this_score, this_perplexity)

                best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                    'score': this_score, 'perp': this_perplexity})
                if (dtp == dtp_series[-1]) and (twp == twp_series[-1]):
                    eval_counter +=1
                    X_train, X_test = train_test_split(X, test_size=0.5)
                    lda = LatentDirichletAllocation(n_topics=topics, 
                                                    doc_topic_prior=1./topics, 
                                                    topic_word_prior=1./topics, 
                                                    learning_method='batch',
                                                    random_state=42,
                                                    max_iter=20)
                    lda.fit(X_train)
                    this_score = lda.score(X_test)
                    this_perplexity = lda.perplexity(X_test)
                    if this_score > best_score:
                        best_score = this_score
                        print "New Max Likelihood: {}".format(best_score)

                    print "#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}".format(eval_counter, 
                                                                                topics, 
                                                                                (1./topics), 
                                                                                (1./topics),
                                                                                this_score,
                                                                                this_perplexity)

                    best_models.append({'n': topics, 'dtp': (1./topics), 
                                        'twp': (1./topics), 'score': this_score,
                                        'perp': this_perplexity})
    return best_models
Developer: karoraw1, Project: GLM_Wrapper, Lines: 60, Source: otu_ts_support.py


Example 3: test_base_estimator

def test_base_estimator():
    # Check base_estimator and its default values.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=rng)

    ensemble = BaggingClassifier(None, n_jobs=3, random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))

    ensemble = BaggingClassifier(DecisionTreeClassifier(), n_jobs=3, random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))

    ensemble = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, Perceptron))

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=rng)

    ensemble = BaggingRegressor(None, n_jobs=3, random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))

    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(X_train, y_train)

    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))

    ensemble = BaggingRegressor(SVR(), n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, SVR))
Developer: agamemnonc, Project: scikit-learn, Lines: 32, Source: test_bagging.py


Example 4: stacking

def stacking():
    X_train,X_test,Y_train,Y_test =train_test_split(x,y,
                                                        random_state=35,
                                                        test_size=0.2)
    x1_test = np.zeros((X_test.shape[0], len(classifiers)))  # stores the first-layer models' outputs on the test set
    x1_train = np.zeros((X_train.shape[0], len(classifiers)))
    print('x1.shape', np.shape(x1_train))
    print('y....', np.shape(Y_train))
    accuracy = np.zeros(len(classifiers))  # accuracy of each model
    for train_index, test_index in sss.split(X_train, Y_train):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf_num = 0
        for clf in classifiers:
            clf_name = clf.__class__.__name__
            clf.fit(x_train, y_train)
            x1_train[test_index, clf_num] = clf.predict(x_test)  # the next layer trains on this layer's predictions for its held-out fold
            x1_test[:, clf_num] += clf.predict(X_test)  # predict directly on the test set; summed over the ten folds (to be averaged)
            accuracy[clf_num] += (y_test == x1_train[test_index, clf_num]).mean()  # this model's accuracy, averaged over the ten folds
            clf_num += 1


    print(np.shape(x1_train))
    print(np.shape(y_train))
    x2_train,x2_test,y2_train,y2_test =train_test_split(x1_train,Y_train,test_size=0.1)
    lr =LogisticRegression()
    lr.fit(x2_train,y2_train)
    print(lr.predict(x1_test))
    print(Y_test)
Developer: Xls1994, Project: DeepLearning, Lines: 29, Source: stackmodel.py


Example 5: test_thresholded_scorers

def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), clf, X_test, y_test)
Developer: AlexanderFabisch, Project: scikit-learn, Lines: 35, Source: test_score_objects.py


Example 6: train_test_split_mock_pandas

def train_test_split_mock_pandas():
    # X mock dataframe
    X_df = MockDataFrame(X)
    X_train, X_test = train_test_split(X_df)
    assert_true(isinstance(X_train, MockDataFrame))
    assert_true(isinstance(X_test, MockDataFrame))
    X_train_arr, X_test_arr = train_test_split(X_df)
Developer: absolutelyNoWarranty, Project: scikit-learn, Lines: 7, Source: test_split.py


Example 7: read

def read(d):
    data = pd.read_table(path+uni+"_"+d+".txt",delimiter='\t')
    data['label'] = 0
    for i in range(len(data.index)):
        if data.iloc[i,3]<1000:
            data.iloc[i,len(data.columns)-1]=1
        else:
            data.iloc[i,len(data.columns)-1]=0
    X_0 = data.iloc[:,7:len(data.columns)-1]
    y_0 = data.iloc[:,len(data.columns)-1]    
    X_0,X_,y_0,y_ = train_test_split(X_0,y_0,test_size=0.0,random_state=3421)
    X_1,X_test,y_1,y_test = train_test_split(X_0,y_0,test_size=0.2,random_state=1257)
    X_2,X_3,y_2,y_3 = train_test_split(X_1,y_1,test_size=1-label_rate,random_state=11)

##############  Whole-set prediction and cross-validation  ###########
#    scores_all = cross_val_score(RandomForestClassifier(n_estimators=500), X_1, y_1, cv=5, scoring='accuracy')
#    score_all_mean =scores_all.mean()
#    print(d+' 5-fold cross-validation: '+str(score_all_mean))
#    rf_all = RandomForestClassifier(n_estimators=500).fit(X_1,y_1)
#    answer_rf_all = rf_all.predict(X_test)
#    accuracy_all = metrics.accuracy_score(y_test,answer_rf_all)
#    print(d+' whole-set prediction: '+str(accuracy_all))
################################################
    
    return data,X_2,y_2,X_3,y_3,X_test,y_test
Developer: IamCatkin, Project: Learning-Python, Lines: 25, Source: SSL-2.py


Example 8: reduce_dataset

def reduce_dataset(uid):
    ds = load_validation_dataframe(uid)
    X_train, X_valid, X_test, y_train, y_valid, y_test = ds

    X=pd.concat((X_train,X_valid,X_test))
    y=np.concatenate((y_train,y_valid,y_test))

    if len(y) > 5000:
        neg_inds = [i for i, v in enumerate(y) if v==0]
        pos_inds = [i for i, v in enumerate(y) if v==1]

        n_neg = 5000 - len(pos_inds)
        neg_inds = sample(neg_inds, n_neg)
        inds = sorted(neg_inds + pos_inds)
        X = X.iloc[inds,:]
        y = y[inds]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.66666, random_state=42)

    Xtrain_fname = join(DATAFRAMES_FOLDER, "dfXtrain_%d_small.pickle" % uid)
    Xvalid_fname = join(DATAFRAMES_FOLDER, "dfXvalid_%d_small.pickle" % uid)
    Xtest_fname = join(DATAFRAMES_FOLDER, "dfXtestv_%d_small.pickle" % uid)
    ys_fname = join(DATAFRAMES_FOLDER, "ysv_%d_small.pickle" % uid)

    X_train.to_pickle(Xtrain_fname)
    X_valid.to_pickle(Xvalid_fname)
    X_test.to_pickle(Xtest_fname)
    pickle.dump((y_train, y_valid, y_test), open(ys_fname, 'wb'))

    return X_train, X_valid, X_test, y_train, y_valid, y_test
Developer: pablocelayes, Project: sna_classifier, Lines: 30, Source: datasets.py


Example 9: test_classification_with_validation

    def test_classification_with_validation(self):
        tol_places = 4
        data_x, data_y = make_classification(n_samples=100, n_features=7,
                                             n_redundant=0, n_informative=7,
                                             n_clusters_per_class=2,
                                             random_state=3227)
        label_y = np.where(data_y == 0, 'A', 'B')

        train_x, test_x, train_y, test_y = train_test_split(data_x, label_y,
                                                            test_size=0.25,
                                                            random_state=3227)

        train_x, validate_x, train_y, validate_y = train_test_split(
            train_x, train_y, test_size=0.5, random_state=3227)

        params = {
            'ref_functions': ('linear_cov',),
            'criterion_type': 'bias_retrain',
            'criterion_minimum_width': 5,
            'max_layer_count': 5,
            'verbose': 0,
            'n_jobs': 'max'
        }
        model = Classifier(**params)
        model.fit(train_x, train_y, validation_data=(validate_x, validate_y))
        pred_y = model.predict_proba(test_x)
        roc_auc = roc_auc_score(model.le.transform(test_y), pred_y)
        self.assertAlmostEqual(roc_auc, 0.76, places=tol_places)

        no1 = model.predict_neuron_output(test_x, 0, 0)
        no2 = model.predict_neuron_output(test_x, 1, 0)
Developer: kvoyager, Project: GmdhPy, Lines: 31, Source: test_model.py


Example 10: __init__

    def __init__(self, root, train=True, val=False, color_space='lab', transform=None, test_size=0.9, val_size=0.125, location='cpu'):
        """
			color_space: 'rgb' or 'lab'
        """
        self.root_dir = root
        all_files = []
        for r, _, files in walk(self.root_dir):
            for f in files:
                if f.endswith('.jpg'):
                    all_files.append(join(r, f))
        train_val_files, test_files = train_test_split(
            all_files, test_size=test_size, random_state=69)
        train_files, val_files = train_test_split(train_val_files,
                                                  test_size=val_size, random_state=69)
        if (train and val):
            self.filenames = val_files
        elif train:
            self.filenames = train_files
        else:
            self.filenames = test_files

        self.color_space = color_space
        if (self.color_space not in ['rgb', 'lab']):
            raise(NotImplementedError)
        self.transform = transform
        self.location = location
        self.nnenc = NNEncode(location=self.location)
        self.train = train
Developer: stanleynguyen, Project: corolization, Lines: 28, Source: dataset.py


Example 11: main

def main(_):

    if FLAGS.dataset == 'cifar10':
        (X_train, y_train), (_, _) = cifar10.load_data()
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
    else:
        with open('data/train.p', mode='rb') as f:
            train = pickle.load(f)
        X_train, X_val, y_train, y_val = train_test_split(train['features'], train['labels'], test_size=0.33, random_state=0)

    train_output_file = "{}_{}_{}.p".format(FLAGS.network, FLAGS.dataset, 'bottleneck_features_train')
    validation_output_file = "{}_{}_{}.p".format(FLAGS.network, FLAGS.dataset, 'bottleneck_features_validation')

    print("Resizing to", (w, h, ch))
    print("Saving to ...")
    print(train_output_file)
    print(validation_output_file)

    with tf.Session() as sess:
        K.set_session(sess)
        K.set_learning_phase(1)

        model = create_model()

        print('Bottleneck training')
        train_gen = gen(sess, X_train, y_train, batch_size)
        bottleneck_features_train = model.predict_generator(train_gen(), X_train.shape[0])
        data = {'features': bottleneck_features_train, 'labels': y_train}
        pickle.dump(data, open(train_output_file, 'wb'))

        print('Bottleneck validation')
        val_gen = gen(sess, X_val, y_val, batch_size)
        bottleneck_features_validation = model.predict_generator(val_gen(), X_val.shape[0])
        data = {'features': bottleneck_features_validation, 'labels': y_val}
        pickle.dump(data, open(validation_output_file, 'wb'))
Developer: AbdulTheProgrammer, Project: CarND-Transfer-Learning-Lab, Lines: 35, Source: run_bottleneck.py


Example 12: split_data

def split_data(data):
    X_train, X_test, Y_train, Y_test = train_test_split(data.loc[:, data.columns != label], data[label],
                                                        train_size=train_size + validation_size, test_size=test_size,
                                                        shuffle=False, random_state=0)
    X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train,
                                                      train_size=train_size / (train_size + validation_size),
                                                      test_size=validation_size / (train_size + validation_size),
                                                      shuffle=False, random_state=0)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
Developer: michaeltur3, Project: ML_HW3, Lines: 9, Source: prepare_data.py


Example 13: test_split

  def test_split(self):
    ds = self.create_dataset()
    indexes = list(range(len(ds)))
    train, test = train_test_split(indexes)
    train, valid = train_test_split(train)

    splitter = SpecifiedIndexSplitter(train, valid, test)
    train_ds, valid_ds, test_ds = splitter.train_valid_test_split(ds)

    self.assertTrue(np.all(train_ds.X == ds.X[train]))
    self.assertTrue(np.all(valid_ds.X == ds.X[valid]))
    self.assertTrue(np.all(test_ds.X == ds.X[test]))
Developer: ktaneishi, Project: deepchem, Lines: 12, Source: test_specified_index_splitter.py


Example 14: get_train_valid_test_split

def get_train_valid_test_split(n, train=0.7, valid=0.1, test=0.2, shuffle=False):
    other_split = valid+test
    if train+other_split!=1:
        raise ValueError("Train, Valid, Test splits should sum to 1")
    train_set, other_set = train_test_split(range(1,n+1), 
                                            train_size=train, test_size=other_split, shuffle=shuffle)
    valid_set, test_set = train_test_split(other_set, 
                                           train_size=valid/other_split, 
                                           test_size=test/other_split,
                                           shuffle=False)
    print("train:{} valid:{} test:{}".format(len(train_set), len(valid_set), len(test_set)))
    return train_set, valid_set, test_set
Developer: chesterxgchen, Project: DeepLearningFrameworks, Lines: 12, Source: utils.py


Example 15: preprocess

def preprocess(data, test_size, sample=None, scale=True):

    data_frame_all = pandas.read_table(data)
    df = data_frame_all

    # for simplicity, and since only 11093 rows (<3% of the data) contain nulls, we just drop those rows
    no_null_df = df.dropna(axis=0, how='any')

    # this shows us that we no longer have null values
    no_null_df.isnull().values.any()

    # let's rename our new data frame df again.  we're left with 238907 rows
    df = no_null_df
    df_unprocessed = df

    if sample:
        df = df.sample(frac=sample)
        print("sampled")

    df = df[['order_estimated_driving_time_min','order_estimated_shopping_time_min']]
    df['total_time_min'] = df.sum(axis=1)
    df['time_in_hours'] = df.total_time_min.divide(60)


    target = df.time_in_hours * 15
    df = df.drop(['time_in_hours', 'total_time_min'], axis=1)


    s1 = target.std()
    s2 = 7.5 #our chosen std deviation

    m1 = target.mean()
    m2 = 15 #our chosen mean

    target = m2 + (target - m1) * s2/s1  # rescale the target to mean 15 and standard deviation 7.5



    X = df
    y = target

    if scale:
        df_pp = preprocessing.scale(df)
        print("scaled")

        X_train, X_test, y_train, y_test = train_test_split(df_pp, target, test_size=test_size, random_state=42)

    else:
        df_pp = None
        X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=test_size, random_state=42)


    return df_unprocessed, df, df_pp, target, X, X_train, X_test, y, y_train, y_test
Developer: alexjacobs08, Project: ShiptProject, Lines: 53, Source: order_pay_model.py


Example 16: resample

def resample(X, y, sample_fraction=0.1, test_size=0.3):
    X_columns = X.columns
    y_columns = y.columns
    n = len(X_columns)

    print('~' * 80)
    print('@@-\n', y.converted.value_counts())
    print('@@0 - Original')
    show_balance(y.values)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    print('@@2 - y_train')
    show_balance(y_train)
    print('@@2 -  y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    ros = RandomOverSampler(random_state=42)
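    # note: fit_sample is the old imbalanced-learn API; newer releases name it fit_resample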
    X_train, y_train = ros.fit_sample(X_train, y_train)
    X_test, y_test = ros.fit_sample(X_test, y_test)
    print('@@3 - Oversampled y_train')
    show_balance(y_train)
    print('@@3 - Oversampled y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    if sample_fraction < 1.0:
        _, X_train, _, y_train = train_test_split(X_train, y_train, test_size=sample_fraction, random_state=43)
        _, X_test, _, y_test = train_test_split(X_test, y_test, test_size=sample_fraction, random_state=44)
        print('@@2 - Downsampled y_train')
        show_balance(y_train)
        print('@@2 - Downsampled y_test')
        show_balance(y_test)
        assert len(X_train.shape) == 2 and len(X_test.shape) == 2, (X_train.shape, X_test.shape)
        assert X_train.shape[1] == n and X_test.shape[1] == n, (X_train.shape, X_test.shape)

    print('X_columns=%d %s' % (len(X_columns), X_columns))
    print('y_columns=%d %s' % (len(y_columns), y_columns))
    print('X_train=%-10s y_train=%s' % (list(X_train.shape), list(y_train.shape)))
    print('X_test =%-10s y_test =%s' % (list(X_test.shape), list(y_test.shape)))
    assert X_train.shape[1] == n and X_test.shape[1] == n

    X_train = pd.DataFrame(X_train, columns=X_columns)
    y_train = pd.DataFrame(y_train, columns=y_columns, index=X_train.index)
    X_test = pd.DataFrame(X_test, columns=X_columns)
    y_test = pd.DataFrame(y_test, columns=y_columns, index=X_test.index)
    print('@@+ y_train\n', y_train.converted.value_counts(), flush=True)
    print('@@+ y_test\n', y_test.converted.value_counts(), flush=True)

    return (X_train, y_train), (X_test, y_test)
Developer: peterwilliams97, Project: Butt-Head-Astronomer, Lines: 50, Source: feature_select.py


Example 17: test_thresholded_scorers

def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)
Developer: srinivasreddy, Project: scikit-learn, Lines: 49, Source: test_score_objects.py


Example 18: filter_split_data

def filter_split_data(X_raw, y_raw, metadatas, max_cloud_cover=1, timespan_before=np.inf, test_fraction=0.3, val_fraction=0.3, random_seed=0, normalized=True, balanced_classes=True, filter_center_cloudy=False):
    X, y, metadata_filtered = filter_data(X_raw, y_raw, metadatas, max_cloud_cover=max_cloud_cover, timespan_before=timespan_before, random_seed=random_seed, normalized=normalized, balanced_classes=balanced_classes, filter_center_cloudy=filter_center_cloudy)

    X, y, metadata_filtered=shuffle(X, y, metadata_filtered, random_state=random_seed)

    X_train, X_test, y_train, y_test, metadata_train, metadata_test=train_test_split(
        X, y, metadata_filtered, test_size=test_fraction, random_state=random_seed)

    X_train, X_val, y_train, y_val, metadata_train, metadata_val=train_test_split(
        X_train, y_train, metadata_train, test_size=val_fraction, random_state=random_seed)
#     print(X_train.shape,y_train.shape, len(metadata_train))
#     print(X_test.shape,y_test.shape, len(metadata_test))
#     print(X_val.shape,y_val.shape, len(metadata_val))

    return X_train, y_train, metadata_train, X_val, y_val, metadata_val, X_test, y_test, metadata_test
Developer: kinect59, Project: satellite_leak_detection, Lines: 15, Source: filters.py


Example 19: test_feature_importance_regression

def test_feature_importance_regression():
    """Test that Gini importance is calculated correctly.

    This test follows the example from [1]_ (pg. 373).

    .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
       of statistical learning. New York: Springer series in statistics.
    """
    california = fetch_california_housing()
    X, y = california.data, california.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    reg = GradientBoostingRegressor(loss='huber', learning_rate=0.1,
                                    max_leaf_nodes=6, n_estimators=100,
                                    random_state=0)
    reg.fit(X_train, y_train)
    sorted_idx = np.argsort(reg.feature_importances_)[::-1]
    sorted_features = [california.feature_names[s] for s in sorted_idx]

    # The most important feature is the median income by far.
    assert sorted_features[0] == 'MedInc'

    # The three subsequent features are the following. Their relative ordering
    # might change a bit depending on the randomness of the trees and the
    # train / test split.
    assert set(sorted_features[1:4]) == {'Longitude', 'AveOccup', 'Latitude'}
Developer: amueller, Project: scikit-learn, Lines: 26, Source: test_gradient_boosting.py


Example 20: test_gradient_boosting_validation_fraction

def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check if n_estimators_ increase monotonically with n_iter_no_change
    # Set validation
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
Developer: amueller, Project: scikit-learn, Lines: 34, Source: test_gradient_boosting.py



Note: The sklearn.model_selection.train_test_split examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by their respective developers; copyright remains with the original authors, and redistribution and use are subject to each project's License. Please do not republish without permission.

