This article collects and organizes typical usage examples of the Python function sklearn.model_selection.train_test_split. If you are unsure what train_test_split does, how to call it, or what real-world usage looks like, the curated examples below should help.
The sections that follow present 20 code examples of train_test_split, sorted by popularity by default.
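Before the project-sourced examples, here is a minimal, self-contained sketch of the canonical calls. The dataset and variable names below are illustrative assumptions, not taken from any of the projects:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic data stands in for whatever feature matrix X and label vector y you have.
X, y = make_classification(n_samples=1000, n_features=10, random_state=0)

# Canonical call: an 80/20 split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# stratify=y preserves the class ratio in both splits, which is useful
# for imbalanced classification problems.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# A train/validation/test split, as in several examples below, is just two
# chained calls: 60% train, then the held-out 40% split evenly into 20%/20%.
X_train, X_tmp, y_train, y_tmp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_tmp, y_tmp, test_size=0.5, random_state=42)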
Example 1: learning

def learning(self):
    X = self.X
    y = self.y
    print("Shape of X and y are", X.shape, y.shape)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
                                                                        test_size=0.2,
                                                                        random_state=42)
    X_train, X_val, y_train, y_val = model_selection.train_test_split(X_train, y_train,
                                                                      test_size=0.2,
                                                                      random_state=42)
    # skflow is the legacy TensorFlow Learn API; calls kept as in the source project
    val_monitor = skflow.monitors.ValidationMonitor(X_val, y_val,
                                                    early_stopping_rounds=200)
    model = skflow.TensorFlowDNNRegressor(hidden_units=[100, 50, 10], steps=5000)
    model.fit(X_train, y_train, val_monitor)
    yP = model.predict(X_test)
    score_r2 = metrics.r2_score(y_test, yP)
    score_MedAE = metrics.median_absolute_error(y_test, yP)
    print('Accuracy')
    print('--------')
    print('R2: {0:f}, MedAE: {1:f}'.format(score_r2, score_MedAE))
    if self.graph:
        kutil.regress_show4(y_test, yP)

Author: jskDr | Project: jamespy_py3 | Lines: 25 | Source: jmultidk.py
Example 2: lda_tuner

def lda_tuner(ingroup_otu, best_models):
    best_score = -1*np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0
    for topics in topic_series:
        for dtp in dtp_series:
            for twp in twp_series:
                eval_counter += 1
                X_train, X_test = train_test_split(X, test_size=0.5)
                # n_topics was renamed n_components in scikit-learn 0.19
                lda = LatentDirichletAllocation(n_topics=topics,
                                                doc_topic_prior=dtp,
                                                topic_word_prior=twp,
                                                learning_method='batch',
                                                random_state=42,
                                                max_iter=20)
                lda.fit(X_train)
                this_score = lda.score(X_test)
                this_perplexity = lda.perplexity(X_test)
                if this_score > best_score:
                    best_score = this_score
                    print("New Max Likelihood: {}".format(best_score))
                print("#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(
                    eval_counter, topics, dtp, twp, this_score, this_perplexity))
                best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                    'score': this_score, 'perp': this_perplexity})
                if (dtp == dtp_series[-1]) and (twp == twp_series[-1]):
                    # also evaluate the symmetric 1/n_topics priors as a baseline
                    eval_counter += 1
                    X_train, X_test = train_test_split(X, test_size=0.5)
                    lda = LatentDirichletAllocation(n_topics=topics,
                                                    doc_topic_prior=1./topics,
                                                    topic_word_prior=1./topics,
                                                    learning_method='batch',
                                                    random_state=42,
                                                    max_iter=20)
                    lda.fit(X_train)
                    this_score = lda.score(X_test)
                    this_perplexity = lda.perplexity(X_test)
                    if this_score > best_score:
                        best_score = this_score
                        print("New Max Likelihood: {}".format(best_score))
                    print("#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}".format(
                        eval_counter, topics, 1./topics, 1./topics,
                        this_score, this_perplexity))
                    best_models.append({'n': topics, 'dtp': (1./topics),
                                        'twp': (1./topics), 'score': this_score,
                                        'perp': this_perplexity})
    return best_models

Author: karoraw1 | Project: GLM_Wrapper | Lines: 60 | Source: otu_ts_support.py
Example 3: test_base_estimator

def test_base_estimator():
    # Check base_estimator and its default values.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target,
                                                        random_state=rng)

    ensemble = BaggingClassifier(None, n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))

    ensemble = BaggingClassifier(DecisionTreeClassifier(), n_jobs=3,
                                 random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeClassifier))

    ensemble = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, Perceptron))

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target,
                                                        random_state=rng)

    ensemble = BaggingRegressor(None, n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))

    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3,
                                random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, DecisionTreeRegressor))

    ensemble = BaggingRegressor(SVR(), n_jobs=3, random_state=0).fit(X_train, y_train)
    assert_true(isinstance(ensemble.base_estimator_, SVR))

Author: agamemnonc | Project: scikit-learn | Lines: 32 | Source: test_bagging.py
Example 4: stacking

def stacking():
    # x, y, classifiers, and sss (a cross-validation splitter) are
    # module-level definitions not shown in this excerpt
    X_train, X_test, Y_train, Y_test = train_test_split(x, y,
                                                        random_state=35,
                                                        test_size=0.2)
    # stores the first-layer predictions on the held-out test set
    x1_test = np.zeros((X_test.shape[0], len(classifiers)))
    x1_train = np.zeros((X_train.shape[0], len(classifiers)))
    print('x1.shape', np.shape(x1_train))
    print('y....', np.shape(Y_train))
    accuracy = np.zeros(len(classifiers))  # accuracy of each model
    for train_index, test_index in sss.split(X_train, Y_train):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf_num = 0
        for clf in classifiers:
            clf_name = clf.__class__.__name__
            clf.fit(x_train, y_train)
            # the second layer's training input is each first-layer model's
            # prediction on the corresponding held-out fold
            x1_train[test_index, clf_num] = clf.predict(x_test)
            # predict the test set directly in each of the ten folds, to be averaged
            x1_test[:, clf_num] += clf.predict(X_test)
            # this model's accuracy, averaged over the ten folds
            accuracy[clf_num] += (y_test == x1_train[test_index, clf_num]).mean()
            clf_num += 1
    print(np.shape(x1_train))
    print(np.shape(y_train))
    x2_train, x2_test, y2_train, y2_test = train_test_split(x1_train, Y_train, test_size=0.1)
    lr = LogisticRegression()
    lr.fit(x2_train, y2_train)
    print(lr.predict(x1_test))
    print(Y_test)

Author: Xls1994 | Project: DeepLearning | Lines: 29 | Source: stackmodel.py
Example 5: test_thresholded_scorers

def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    # 'log_loss' is the scorer name in older scikit-learn releases;
    # it was later renamed 'neg_log_loss' (compare Example 17)
    logscore = get_scorer('log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    assert_raises(ValueError, get_scorer('roc_auc'), clf, X_test, y_test)

Author: AlexanderFabisch | Project: scikit-learn | Lines: 35 | Source: test_score_objects.py
Example 6: train_test_split_mock_pandas

def train_test_split_mock_pandas():
    # X mock dataframe
    X_df = MockDataFrame(X)
    X_train, X_test = train_test_split(X_df)
    assert_true(isinstance(X_train, MockDataFrame))
    assert_true(isinstance(X_test, MockDataFrame))
    X_train_arr, X_test_arr = train_test_split(X_df)

Author: absolutelyNoWarranty | Project: scikit-learn | Lines: 7 | Source: test_split.py
Example 7: read

def read(d):
    # path, uni, and label_rate are module-level settings not shown in this excerpt
    data = pd.read_table(path + uni + "_" + d + ".txt", delimiter='\t')
    data['label'] = 0
    for i in range(len(data.index)):
        if data.iloc[i, 3] < 1000:
            data.iloc[i, len(data.columns)-1] = 1
        else:
            data.iloc[i, len(data.columns)-1] = 0
    X_0 = data.iloc[:, 7:len(data.columns)-1]
    y_0 = data.iloc[:, len(data.columns)-1]
    # test_size=0.0 appears intended only to shuffle; recent scikit-learn
    # releases reject a float test_size of 0.0
    X_0, X_, y_0, y_ = train_test_split(X_0, y_0, test_size=0.0, random_state=3421)
    X_1, X_test, y_1, y_test = train_test_split(X_0, y_0, test_size=0.2, random_state=1257)
    X_2, X_3, y_2, y_3 = train_test_split(X_1, y_1, test_size=1-label_rate, random_state=11)
    ########## Overall prediction and cross-validation ##########
    # scores_all = cross_val_score(RandomForestClassifier(n_estimators=500), X_1, y_1, cv=5, scoring='accuracy')
    # score_all_mean = scores_all.mean()
    # print(d + ' 5-fold cross-validation: ' + str(score_all_mean))
    # rf_all = RandomForestClassifier(n_estimators=500).fit(X_1, y_1)
    # answer_rf_all = rf_all.predict(X_test)
    # accuracy_all = metrics.accuracy_score(y_test, answer_rf_all)
    # print(d + ' overall prediction: ' + str(accuracy_all))
    ##############################################################
    return data, X_2, y_2, X_3, y_3, X_test, y_test

Author: IamCatkin | Project: Learning-Python | Lines: 25 | Source: SSL-2.py
Example 8: reduce_dataset

def reduce_dataset(uid):
    ds = load_validation_dataframe(uid)
    X_train, X_valid, X_test, y_train, y_valid, y_test = ds
    X = pd.concat((X_train, X_valid, X_test))
    y = np.concatenate((y_train, y_valid, y_test))

    if len(y) > 5000:
        neg_inds = [i for i, v in enumerate(y) if v == 0]
        pos_inds = [i for i, v in enumerate(y) if v == 1]
        n_neg = 5000 - len(pos_inds)
        neg_inds = sample(neg_inds, n_neg)
        inds = sorted(neg_inds + pos_inds)
        X = X.iloc[inds, :]
        y = y[inds]

    # 70/10/20 overall: hold out 30%, then split it roughly 1:2 into valid/test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_test, y_test, test_size=0.66666, random_state=42)

    Xtrain_fname = join(DATAFRAMES_FOLDER, "dfXtrain_%d_small.pickle" % uid)
    Xvalid_fname = join(DATAFRAMES_FOLDER, "dfXvalid_%d_small.pickle" % uid)
    Xtest_fname = join(DATAFRAMES_FOLDER, "dfXtestv_%d_small.pickle" % uid)
    ys_fname = join(DATAFRAMES_FOLDER, "ysv_%d_small.pickle" % uid)

    X_train.to_pickle(Xtrain_fname)
    X_valid.to_pickle(Xvalid_fname)
    X_test.to_pickle(Xtest_fname)
    pickle.dump((y_train, y_valid, y_test), open(ys_fname, 'wb'))

    return X_train, X_valid, X_test, y_train, y_valid, y_test

Author: pablocelayes | Project: sna_classifier | Lines: 30 | Source: datasets.py
Example 9: test_classification_with_validation

def test_classification_with_validation(self):
    tol_places = 4
    data_x, data_y = make_classification(n_samples=100, n_features=7,
                                         n_redundant=0, n_informative=7,
                                         n_clusters_per_class=2,
                                         random_state=3227)
    label_y = np.where(data_y == 0, 'A', 'B')
    train_x, test_x, train_y, test_y = train_test_split(data_x, label_y,
                                                        test_size=0.25,
                                                        random_state=3227)
    train_x, validate_x, train_y, validate_y = train_test_split(
        train_x, train_y, test_size=0.5, random_state=3227)

    params = {
        'ref_functions': ('linear_cov',),
        'criterion_type': 'bias_retrain',
        'criterion_minimum_width': 5,
        'max_layer_count': 5,
        'verbose': 0,
        'n_jobs': 'max'
    }
    model = Classifier(**params)
    model.fit(train_x, train_y, validation_data=(validate_x, validate_y))

    pred_y = model.predict_proba(test_x)
    roc_auc = roc_auc_score(model.le.transform(test_y), pred_y)
    self.assertAlmostEqual(roc_auc, 0.76, places=tol_places)

    no1 = model.predict_neuron_output(test_x, 0, 0)
    no2 = model.predict_neuron_output(test_x, 1, 0)

Author: kvoyager | Project: GmdhPy | Lines: 31 | Source: test_model.py
Example 10: __init__

def __init__(self, root, train=True, val=False, color_space='lab',
             transform=None, test_size=0.9, val_size=0.125, location='cpu'):
    """
    color_space: 'rgb' or 'lab'
    """
    self.root_dir = root
    all_files = []
    for r, _, files in walk(self.root_dir):
        for f in files:
            if f.endswith('.jpg'):
                all_files.append(join(r, f))

    train_val_files, test_files = train_test_split(
        all_files, test_size=test_size, random_state=69)
    train_files, val_files = train_test_split(train_val_files,
                                              test_size=val_size, random_state=69)
    if train and val:
        self.filenames = val_files
    elif train:
        self.filenames = train_files
    else:
        self.filenames = test_files

    self.color_space = color_space
    if self.color_space not in ['rgb', 'lab']:
        raise NotImplementedError
    self.transform = transform
    self.location = location
    self.nnenc = NNEncode(location=self.location)
    self.train = train

Author: stanleynguyen | Project: corolization | Lines: 28 | Source: dataset.py
Example 11: main

def main(_):
    if FLAGS.dataset == 'cifar10':
        (X_train, y_train), (_, _) = cifar10.load_data()
        X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                          test_size=0.2, random_state=0)
    else:
        with open('data/train.p', mode='rb') as f:
            train = pickle.load(f)
        X_train, X_val, y_train, y_val = train_test_split(train['features'], train['labels'],
                                                          test_size=0.33, random_state=0)

    train_output_file = "{}_{}_{}.p".format(FLAGS.network, FLAGS.dataset, 'bottleneck_features_train')
    validation_output_file = "{}_{}_{}.p".format(FLAGS.network, FLAGS.dataset, 'bottleneck_features_validation')

    print("Resizing to", (w, h, ch))
    print("Saving to ...")
    print(train_output_file)
    print(validation_output_file)

    with tf.Session() as sess:
        K.set_session(sess)
        K.set_learning_phase(1)

        model = create_model()

        print('Bottleneck training')
        train_gen = gen(sess, X_train, y_train, batch_size)
        bottleneck_features_train = model.predict_generator(train_gen(), X_train.shape[0])
        data = {'features': bottleneck_features_train, 'labels': y_train}
        pickle.dump(data, open(train_output_file, 'wb'))

        print('Bottleneck validation')
        val_gen = gen(sess, X_val, y_val, batch_size)
        bottleneck_features_validation = model.predict_generator(val_gen(), X_val.shape[0])
        data = {'features': bottleneck_features_validation, 'labels': y_val}
        pickle.dump(data, open(validation_output_file, 'wb'))

Author: AbdulTheProgrammer | Project: CarND-Transfer-Learning-Lab | Lines: 35 | Source: run_bottleneck.py
Example 12: split_data

def split_data(data):
    X_train, X_test, Y_train, Y_test = train_test_split(
        data.loc[:, data.columns != label], data[label],
        train_size=train_size + validation_size, test_size=test_size,
        shuffle=False, random_state=0)
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train, Y_train,
        train_size=train_size / (train_size + validation_size),
        test_size=validation_size / (train_size + validation_size),
        shuffle=False, random_state=0)
    return X_train, X_val, X_test, Y_train, Y_val, Y_test

Author: michaeltur3 | Project: ML_HW3 | Lines: 9 | Source: prepare_data.py
Example 13: test_split

def test_split(self):
    ds = self.create_dataset()
    indexes = list(range(len(ds)))
    train, test = train_test_split(indexes)
    train, valid = train_test_split(train)

    splitter = SpecifiedIndexSplitter(train, valid, test)
    train_ds, valid_ds, test_ds = splitter.train_valid_test_split(ds)

    self.assertTrue(np.all(train_ds.X == ds.X[train]))
    self.assertTrue(np.all(valid_ds.X == ds.X[valid]))
    self.assertTrue(np.all(test_ds.X == ds.X[test]))

Author: ktaneishi | Project: deepchem | Lines: 12 | Source: test_specified_index_splitter.py
Example 14: get_train_valid_test_split

def get_train_valid_test_split(n, train=0.7, valid=0.1, test=0.2, shuffle=False):
    other_split = valid + test
    # note: exact float equality is fragile here; math.isclose would be safer
    if train + other_split != 1:
        raise ValueError("Train, Valid, Test splits should sum to 1")
    train_set, other_set = train_test_split(range(1, n+1),
                                            train_size=train,
                                            test_size=other_split,
                                            shuffle=shuffle)
    valid_set, test_set = train_test_split(other_set,
                                           train_size=valid/other_split,
                                           test_size=test/other_split,
                                           shuffle=False)
    print("train:{} valid:{} test:{}".format(len(train_set), len(valid_set), len(test_set)))
    return train_set, valid_set, test_set

Author: chesterxgchen | Project: DeepLearningFrameworks | Lines: 12 | Source: utils.py
Example 15: preprocess

def preprocess(data, test_size, sample=None, scale=True):
    data_frame_all = pandas.read_table(data)
    df = data_frame_all
    # for simplicity for now -- and since only 11093, or <3%, of our rows are
    # affected -- we just drop the rows with null values
    no_null_df = df.dropna(axis=0, how='any')
    # this shows us that we no longer have null values
    no_null_df.isnull().values.any()
    # rename our new data frame df again; we're left with 238907 rows
    df = no_null_df
    df_unprocessed = df

    if sample:
        df = df.sample(frac=sample)
        print("sampled")

    df = df[['order_estimated_driving_time_min', 'order_estimated_shopping_time_min']]
    df['total_time_min'] = df.sum(axis=1)
    df['time_in_hours'] = df.total_time_min.divide(60)
    target = df.time_in_hours * 15
    df = df.drop(['time_in_hours', 'total_time_min'], axis=1)

    s1 = target.std()
    s2 = 7.5  # our chosen standard deviation
    m1 = target.mean()
    m2 = 15   # our chosen mean
    target = m2 + (target - m1) * s2/s1  # rescale the target to mean 15, std 7.5

    X = df
    y = target

    if scale:
        df_pp = preprocessing.scale(df)
        print("scaled")
        X_train, X_test, y_train, y_test = train_test_split(df_pp, target, test_size=test_size, random_state=42)
    else:
        df_pp = None
        X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=test_size, random_state=42)

    return df_unprocessed, df, df_pp, target, X, X_train, X_test, y, y_train, y_test

Author: alexjacobs08 | Project: ShiptProject | Lines: 53 | Source: order_pay_model.py
Example 16: resample

def resample(X, y, sample_fraction=0.1, test_size=0.3):
    X_columns = X.columns
    y_columns = y.columns
    n = len(X_columns)

    print('~' * 80)
    print('@@-\n', y.converted.value_counts())
    print('@@0 - Original')
    show_balance(y.values)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    print('@@2 - y_train')
    show_balance(y_train)
    print('@@2 - y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    ros = RandomOverSampler(random_state=42)
    # fit_sample is the old imbalanced-learn API; newer releases call it fit_resample
    X_train, y_train = ros.fit_sample(X_train, y_train)
    X_test, y_test = ros.fit_sample(X_test, y_test)
    print('@@3 - Oversampled y_train')
    show_balance(y_train)
    print('@@3 - Oversampled y_test')
    show_balance(y_test)
    assert X_train.shape[1] == n and X_test.shape[1] == n

    if sample_fraction < 1.0:
        _, X_train, _, y_train = train_test_split(X_train, y_train, test_size=sample_fraction, random_state=43)
        _, X_test, _, y_test = train_test_split(X_test, y_test, test_size=sample_fraction, random_state=44)
        print('@@2 - Downsampled y_train')
        show_balance(y_train)
        print('@@2 - Downsampled y_test')
        show_balance(y_test)
        assert len(X_train.shape) == 2 and len(X_test.shape) == 2, (X_train.shape, X_test.shape)
        assert X_train.shape[1] == n and X_test.shape[1] == n, (X_train.shape, X_test.shape)

    print('X_columns=%d %s' % (len(X_columns), X_columns))
    print('y_columns=%d %s' % (len(y_columns), y_columns))
    print('X_train=%-10s y_train=%s' % (list(X_train.shape), list(y_train.shape)))
    print('X_test =%-10s y_test =%s' % (list(X_test.shape), list(y_test.shape)))
    assert X_train.shape[1] == n and X_test.shape[1] == n

    X_train = pd.DataFrame(X_train, columns=X_columns)
    y_train = pd.DataFrame(y_train, columns=y_columns, index=X_train.index)
    X_test = pd.DataFrame(X_test, columns=X_columns)
    y_test = pd.DataFrame(y_test, columns=y_columns, index=X_test.index)
    print('@@+ y_train\n', y_train.converted.value_counts(), flush=True)
    print('@@+ y_test\n', y_test.converted.value_counts(), flush=True)

    return (X_train, y_train), (X_test, y_test)

Author: peterwilliams97 | Project: Butt-Head-Astronomer | Lines: 50 | Source: feature_select.py
Example 17: test_thresholded_scorers

def test_thresholded_scorers():
    # Test scorers that take thresholds.
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = LogisticRegression(random_state=0)
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    score3 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)
    assert_almost_equal(score1, score3)

    logscore = get_scorer('neg_log_loss')(clf, X_test, y_test)
    logloss = log_loss(y_test, clf.predict_proba(X_test))
    assert_almost_equal(-logscore, logloss)

    # same for an estimator without decision_function
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    assert_almost_equal(score1, score2)

    # test with a regressor (no decision_function)
    reg = DecisionTreeRegressor()
    reg.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(reg, X_test, y_test)
    score2 = roc_auc_score(y_test, reg.predict(X_test))
    assert_almost_equal(score1, score2)

    # Test that an exception is raised on more than two classes
    X, y = make_blobs(random_state=0, centers=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf.fit(X_train, y_train)
    with pytest.raises(ValueError, match="multiclass format is not supported"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # test error is raised with a single class present in model
    # (predict_proba shape is not suitable for binary auc)
    X, y = make_blobs(random_state=0, centers=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    clf = DecisionTreeClassifier()
    clf.fit(X_train, np.zeros_like(y_train))
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('roc_auc')(clf, X_test, y_test)

    # for proba scorers
    with pytest.raises(ValueError, match="need classifier with two classes"):
        get_scorer('neg_log_loss')(clf, X_test, y_test)

Author: srinivasreddy | Project: scikit-learn | Lines: 49 | Source: test_score_objects.py
def filter_split_data(X_raw, y_raw, metadatas, max_cloud_cover=1, timespan_before=np.inf, test_fraction=0.3, val_fraction=0.3, random_seed=0, normalized=True, balanced_classes=True, filter_center_cloudy=False):
X, y, metadata_filtered = filter_data(X_raw, y_raw, metadatas, max_cloud_cover=max_cloud_cover, timespan_before=timespan_before, random_seed=random_seed, normalized=normalized, balanced_classes=balanced_classes, filter_center_cloudy=filter_center_cloudy)
X, y, metadata_filtered=shuffle(X, y, metadata_filtered, random_state=random_seed)
X_train, X_test, y_train, y_test, metadata_train, metadata_test=train_test_split(
X, y, metadata_filtered, test_size=test_fraction, random_state=random_seed)
X_train, X_val, y_train, y_val, metadata_train, metadata_val=train_test_split(
X_train, y_train, metadata_train, test_size=val_fraction, random_state=random_seed)
# print(X_train.shape,y_train.shape, len(metadata_train))
# print(X_test.shape,y_test.shape, len(metadata_test))
# print(X_val.shape,y_val.shape, len(metadata_val))
return X_train, y_train, metadata_train, X_val, y_val, metadata_val, X_test, y_test, metadata_test
开发者ID:kinect59,项目名称:satellite_leak_detection,代码行数:15,代码来源:filters.py
Example 19: test_feature_importance_regression

def test_feature_importance_regression():
    """Test that Gini importance is calculated correctly.

    This test follows the example from [1]_ (pg. 373).

    .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
       of statistical learning. New York: Springer series in statistics.
    """
    california = fetch_california_housing()
    X, y = california.data, california.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    reg = GradientBoostingRegressor(loss='huber', learning_rate=0.1,
                                    max_leaf_nodes=6, n_estimators=100,
                                    random_state=0)
    reg.fit(X_train, y_train)
    sorted_idx = np.argsort(reg.feature_importances_)[::-1]
    sorted_features = [california.feature_names[s] for s in sorted_idx]

    # The most important feature is the median income by far.
    assert sorted_features[0] == 'MedInc'

    # The three subsequent features are the following. Their relative ordering
    # might change a bit depending on the randomness of the trees and the
    # train / test split.
    assert set(sorted_features[1:4]) == {'Longitude', 'AveOccup', 'Latitude'}

Author: amueller | Project: scikit-learn | Lines: 26 | Source: test_gradient_boosting.py
Example 20: test_gradient_boosting_validation_fraction

def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check if n_estimators_ increases monotonically with n_iter_no_change
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_

Author: amueller | Project: scikit-learn | Lines: 34 | Source: test_gradient_boosting.py
Note: the sklearn.model_selection.train_test_split examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets are drawn from open-source projects contributed by their respective authors; copyright remains with the original authors, and any use or redistribution is subject to each project's License. Do not reproduce without permission.