This article collects typical usage examples of the Pipeline class from Python's sklearn.pipeline module. If you have been wondering what the Pipeline class does, how to use it, or what working code looks like, the hand-picked class examples below should help.
Twenty code examples of the Pipeline class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps our system recommend better Python code examples.
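Before the examples, here is a minimal, self-contained sketch of what a typical Pipeline looks like. The dataset and step choices are illustrative only and are not taken from any of the projects below:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# A Pipeline chains transformers with a final estimator: fit() runs
# fit_transform() through every step, and predict() runs transform()
# through every step before the final estimator predicts.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))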
Example 1: simple_classification_without_cross_fold_validation
def simple_classification_without_cross_fold_validation(x, y, estimator, scoring):
    '''
    Run normal SVM classification without cross-fold validation.
    '''
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)  # 30% reserved for validation

    # feature selection since we have a small sample space
    fs = SelectPercentile(scoring, percentile=20)

    pipeline = Pipeline([('featureselector', fs), ('scaler', StandardScaler()), ('estimator', estimator)])
    pipeline = OneVsRestClassifier(pipeline)

    clfer = pipeline.fit(x_train, y_train)
    y_predict_train = clfer.predict(x_train)
    print "%% Accuracy on training set: %2.3f" % metrics.accuracy_score(y_train, y_predict_train)

    y_predict_test = clfer.predict(x_test)
    print "\n%% Accuracy on testing set: %2.3f" % metrics.accuracy_score(y_test, y_predict_test)

    print "\nClassification Report:"
    print metrics.classification_report(y_test, y_predict_test)

    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_predict_test)
Author: neerajrao | Project: hybrid-svm-author-attribution | Lines: 27 | Source: svmAuthorRec.py
Example 2: test_set_pipeline_step_none
def test_set_pipeline_step_none():
    # Test setting Pipeline steps to None
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])

    pipeline = make()

    exp = 2 * 3 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline.set_params(m3=None)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_dict_equal(
        pipeline.get_params(deep=True),
        {"steps": pipeline.steps, "m2": mult2, "m3": None, "last": mult5, "m2__mult": 2, "last__mult": 5},
    )

    pipeline.set_params(m2=None)
    exp = 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    # for other methods, ensure no AttributeErrors on None:
    other_methods = ["predict_proba", "predict_log_proba", "decision_function", "transform", "score"]
    for method in other_methods:
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    exp = 2 * 5
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    pipeline = make()
    pipeline.set_params(last=None)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    assert_raise_message(AttributeError, "'NoneType' object has no attribute 'predict'", getattr, pipeline, "predict")

    # Check None step at construction time
    exp = 2 * 5
    pipeline = Pipeline([("m2", mult2), ("m3", None), ("last", mult5)])
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal([exp], pipeline.fit(X).predict(X))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
Author: cheral | Project: scikit-learn | Lines: 60 | Source: test_pipeline.py
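A usage note on Example 2: it comes from scikit-learn's own test suite and shows that assigning None to a step name removes that step from the composition while keeping the parameter interface intact. In current scikit-learn releases the documented spelling for the same idea is 'passthrough'; a minimal sketch (the pipeline here is invented for illustration):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])
# Disable the scaling step without rebuilding the pipeline
# (recent scikit-learn versions accept 'passthrough' here):
pipe.set_params(scaler='passthrough')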
Example 3: train_clf
def train_clf(self):
    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True)),
        ("svc", LinearSVC(C=100))
    ])
    pipeline.fit(self.dataset.data, self.dataset.target)
    return pipeline
Author: fuxes | Project: twitter-sentiment-clustering | Lines: 7 | Source: sentiment.py
Example 4: main
def main():
    corpus = capitalCorpus()
    transformer = textTransformer()
    continents = np.array(os.listdir('txt/'))

    for continent_dir in enumerate(continents):
        corpus = getText(continent_dir, corpus, transformer)

    # Split corpus into training set and test set
    train_X, test_X, train_Y, test_Y = train_test_split(corpus.data,
                                                        corpus.target, test_size=0.25, random_state=54321)

    # Build a pipeline
    clf = MultinomialNB()
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer(use_idf=True)

    clf_pipe = Pipeline(
        [
            ('vectorizer', count_vect),
            ('tfidf', tfidf_transformer),
            ('classifier', clf)
        ]
    ).fit(train_X, train_Y)

    predicted = clf_pipe.predict(test_X)
    print(classification_report(test_Y, predicted))
Author: Thanuka7777 | Project: Text-Classification-using-ScikitLearn | Lines: 30 | Source: Classify.py
Example 5: Regressor
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
Author: agramfort | Project: el_nino_ramp | Lines: 27 | Source: regressor.py
Example 6: create_union_model
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Author: greatabel | Project: MachineLearning | Lines: 25 | Source: i14combine+classify.py
Example 7: svcDictVector
def svcDictVector():
    recipeData = getRecipeData()

    labels = [recipe['cuisine'] for recipe in recipeData]
    ingredientsFixtures = [sorted(set(e['ingredients'])) for e in recipeData]
    for i, w in enumerate(ingredientsFixtures):
        ingredientsFixtures[i] = dict(zip(w, [1] * len(w)))

    pipeline = Pipeline([
        ('dict', DictVectorizer()),
        ('variance', VarianceThreshold()),
        ('tfidf', TfidfTransformer()),
        ('bayes', svm.LinearSVC()),
    ])

    pipeline.fit(ingredientsFixtures, labels)
    print pipeline

    testRecipes = getTestData()
    testIngredientsFixtures = [sorted(set(e['ingredients'])) for e in testRecipes]
    for i, w in enumerate(testIngredientsFixtures):
        testIngredientsFixtures[i] = dict(zip(w, [1] * len(w)))

    predictions = pipeline.predict(testIngredientsFixtures)
    outputPercentCorrect(predictions)
    copyAndOutput(predictions, testRecipes)
Author: adatta02 | Project: whats-cooking | Lines: 26 | Source: fit.py
Example 8: classify
def classify(text, label):
    #~ Testing purpose: 10-fold cross validation
    cv = KFold(n=len(label), n_folds=10)
    n_c = [100, 200, 500, 1000, 2000, 5000, 10000]
    for i in n_c:
        clf = Pipeline([
            ('vect',
             TfidfVectorizer(
                 analyzer='word',
                 ngram_range=(1, 1),
                 stop_words='english',
                 lowercase=True,
                 token_pattern=r'\b\w+\b',
                 tokenizer=tokenize_doc,
                 min_df=1)),
            ('dim_reduction',
             TruncatedSVD(n_components=i)),
            #~ ('feature_selection',
            #~  SelectKBest(
            #~      chi2,
            #~      k=35)),
            ('classification',
             LogisticRegression())
            #~ SVC(kernel='linear'))
        ])
        print "len(label) ", len(label), " | text ", len(text)
        print ""
        clf.fit(np.asarray(text), np.asarray(label))
        cv_score = cross_val_score(clf, text, label, cv=cv, verbose=1)
        print "Log Reg | n_c = ", i
        print "Accuracy List ", cv_score, " | Avg Accuracy ", np.mean(cv_score)
Author: CUBigDataClass | Project: Disaster-Analysis | Lines: 35 | Source: tweet_classification.py
Example 9: ModelPipeline
class ModelPipeline(object):
    def __init__(self, clf):
        self.columns = []
        self.pipeline = Pipeline([
            ('clf', clf)
        ])

    def fit(self, X_train, y_train):
        self.pipeline.fit(X_train, y_train)
        self.columns = list(X_train.columns)

    def predict(self, X_test):
        return self.pipeline.predict(X_test)

    def feat_importances(self, n=10, string=True):
        imp = self.pipeline.steps[0][1].feature_importances_
        if string:
            return ''.join('%s: %s%%\n' % (self.columns[feat], round(
                imp[feat] * 100, 3)) for feat in np.argsort(imp)[-1:-(n+1):-1])
        else:
            return self.columns[np.argsort(imp)[-1:-(n+1):-1]], \
                sorted(imp)[-1:-(n+1):-1]

    def grid_search(self, X, y):
        parameters = {
            'clf__n_estimators': [100, 200, 300],
            'clf__max_features': ['sqrt', 50, 80],
            'clf__max_depth': [None, 50, 100],
            'clf__oob_score': [False, True],
            'clf__random_state': [29],
            'clf__class_weight': ['balanced', None, 'balanced_subsample'],
            'clf__min_samples_split': [2, 10, 20]
        }
        grid_search = GridSearchCV(self.pipeline, parameters, n_jobs=-1, verbose=1, scoring="recall")
        print("Performing grid search...")
        print("pipeline:", [name for name, _ in self.pipeline.steps])
        print("parameters:")
        pprint(parameters)
        t0 = time()
        grid_search.fit(X, y)
        print("done in %0.3fs" % (time() - t0))
        print()
        print("Best score: %0.3f" % grid_search.best_score_)
        print("Best parameters set:")
        best_parameters = grid_search.best_estimator_.get_params()
        for param_name in sorted(parameters.keys()):
            print("\t%s: %r" % (param_name, best_parameters[param_name]))
        return best_parameters
Author: nmoraesmunter | Project: BeTheChange | Lines: 60 | Source: verified_victory_pipeline.py
Example 10: Vectorizer
class Vectorizer():
    def __init__(self, hash=False, min_df=0.015, max_df=0.9):
        """
        `min_df` is set to filter out extremely rare words,
        since we don't want those to dominate the distance metric.

        `max_df` is set to filter out extremely common words,
        since they don't convey much information.
        """
        if hash:
            args = [
                ('vectorizer', HashingVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer())),
                ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
                ('feature_reducer', TruncatedSVD(n_components=400)),
                ('normalizer', Normalizer(copy=False))
            ]
        else:
            args = [
                ('vectorizer', CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer(), min_df=min_df, max_df=max_df)),
                ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
                ('normalizer', Normalizer(copy=False))
            ]
        self.pipeline = Pipeline(args)

    def vectorize(self, docs, train=False):
        if train:
            return self.pipeline.fit_transform(docs)
        else:
            return self.pipeline.transform(docs)

    @property
    def vocabulary(self):
        return self.pipeline.named_steps['vectorizer'].get_feature_names()
Author: frnsys | Project: eclair | Lines: 35 | Source: text.py
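The min_df/max_df reasoning in the docstrings of Examples 10 and 11 is easy to verify in isolation. A small sketch with a toy corpus (invented here, not taken from the projects above):

from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: "the" appears in every document, most words in only one.
docs = ["the cat sat", "the dog sat", "the bird flew", "the fish swam"]

# max_df=0.9 drops "the" (document frequency 1.0 > 0.9);
# min_df=2 drops every word seen in a single document.
vec = CountVectorizer(min_df=2, max_df=0.9)
vec.fit(docs)
print(sorted(vec.vocabulary_))  # ['sat']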
Example 11: train
def train(docs):
    """
    Trains and serializes (pickles) a vectorizing pipeline
    based on training data.

    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=Tokenizer(), min_df=0.015, max_df=0.9)),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('feature_reducer', TruncatedSVD(n_components=100)),
        ('normalizer', Normalizer(copy=False))
    ])

    print('Training on {0} docs...'.format(len(docs)))
    pipeline.fit(docs)

    PIPELINE = pipeline

    print('Serializing pipeline to {0}'.format(PIPELINE_PATH))
    pipeline_file = open(PIPELINE_PATH, 'wb')
    pickle.dump(pipeline, pipeline_file)
    print('Training complete.')
Author: frnsys | Project: news_automata | Lines: 27 | Source: vectorize.py
Example 12: test_multiple_cols_numbers_ignored
def test_multiple_cols_numbers_ignored(self):
    t = bt.Split_transform(input_features=["a", "b"], ignore_numbers=True, output_feature="res")
    df = pd.DataFrame.from_dict([{"a": "a b", "b": "c 1", "c": 3}])
    transformers = [("split_transform", t)]
    p = Pipeline(transformers)
    df2 = p.transform(df)
    self.assertTrue(len(df2["res"][0]) == 3)
Author: pk359 | Project: seldon-server | Lines: 7 | Source: test_basic_transforms.py
Example 13: test_multiple_cols
def test_multiple_cols(self):
    t = bt.Split_transform(input_features=["a", "b"], output_feature="res")
    df = pd.DataFrame.from_dict([{"a": "a b", "b": "c d", "c": 3}, {"a": "word1", "b": "word2"}])
    transformers = [("split_transform", t)]
    p = Pipeline(transformers)
    df2 = p.transform(df)
    self.assertTrue(len(df2["res"][0]) == 4)
Author: pk359 | Project: seldon-server | Lines: 7 | Source: test_basic_transforms.py
Example 14: test_sklearn_pipeline
def test_sklearn_pipeline(self):
    df = pd.DataFrame.from_dict([{"a": "something", "b": 1}, {"a": "something2"}])
    t = bt.Exclude_features_transform(excluded=["b"])
    transformers = [("exclude_transform", t)]
    p = Pipeline(transformers)
    df2 = p.fit_transform(df)
    self.assertEquals(len(df2.columns), 1)
Author: pk359 | Project: seldon-server | Lines: 7 | Source: test_basic_transforms.py
Example 15: train_optimal_classifier
def train_optimal_classifier(clf, X, y, params, scale=False, folds=1000):
    pipeline = 0
    combined_features = FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])
    if scale:
        pipeline = Pipeline([("minmax", MinMaxScaler()),
                             ("features", combined_features), ("clf", clf)])
    else:
        pipeline = Pipeline([("features", combined_features), ("clf", clf)])

    param_grid = dict(features__pca__n_components=[0, 1, 3, 6, 9, 12, 15],
                      features__univ_select__k=list(range(0, len(X[0]))))
    for k, v in params.iteritems():
        param_grid["clf__" + k] = v

    grid_search = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=cross_validation.StratifiedShuffleSplit(y, folds),
        verbose=1,
        scoring='f1',
        error_score=0,
        refit=True,
    )
    grid_search.fit(X, y)
    return (grid_search.best_estimator_, grid_search.best_score_, pipeline.fit(X, y))
Author: pcasaretto | Project: machinelearning-finalproject | Lines: 28 | Source: exploration.py
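Example 15 relies on scikit-learn's double-underscore parameter routing: features__pca__n_components reaches the PCA nested inside the FeatureUnion, and clf__<name> reaches the final estimator. A minimal check of which names exist (toy pipeline, illustrative only):

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import SVC

combined = FeatureUnion([("pca", PCA()), ("univ_select", SelectKBest())])
pipe = Pipeline([("features", combined), ("clf", SVC())])

# Every key returned by get_params() is a legal param_grid key for GridSearchCV.
keys = pipe.get_params().keys()
print("features__pca__n_components" in keys, "clf__C" in keys)  # True True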
Example 16: train_regressor
def train_regressor(data, X_columns, y_show=y_init+y_curr):
    X = data.loc[:, X_columns]
    ys = data.loc[:, [i for i in y_show if i not in X_columns]]
    print()
    for n_trees in [256]:
        # other grids tried: list(range(4, 16)) + [18, 20] + [2**n for n in range(4, 12)]
        # [n for n in range(4, 64)], [2**n for n in range(1, 12)]
        forest = Pipeline(steps=[
            ('forest', ExtraTreesRegressor(  # RandomForestRegressor(
                n_estimators=n_trees,
                n_jobs=min(n_trees, 62),
                oob_score=True, bootstrap=True))])
        start = time()
        forest.fit(X, ys)  # new_ys
        end = time()
        print(n_trees, forest.steps[0][1].oob_score_, end-start)
        print()
    print("%.5g seconds to train regressor" % (end-start))
    print()
    y_names = ys.columns
    X_names = X.columns
    return [forest, y_names, X_names]
Author: earlbellinger | Project: asteroseismology | Lines: 26 | Source: learn.py
Example 17: train
def train(param_search=False):
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]

    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = Pipeline([('tfidf', TfidfVectorizer(min_df=2, dtype=float,
                                              sublinear_tf=True,
                                              ngram_range=(1, 2),
                                              strip_accents='unicode')),
                    ('lr', LogisticRegression(random_state=623, C=5000))])

    if param_search:
        params = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                  'lr__C': [1000, 5000, 10000]}

        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)

        print("Parameters found:")
        pprint(gs.best_params_)
        print("Cross-validation accuracy: %.3f" % gs.best_score_)

        return gs.best_estimator_
    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
Author: fanfannothing | Project: xtas | Lines: 33 | Source: _polarity.py
Example 18: predict
def predict():
    pipeline = Pipeline([
        ('min/max scaler', MinMaxScaler(feature_range=(0.0, 1.0))),
        ('neural network', Classifier(layers=[Layer("ExpLin", units=5), Layer("Softmax")], n_iter=25))])
    X = np.load('All_features.npz')['arr_0']
    D = np.load('Akunin_features.npz')['arr_0']
    all_samples = [1]*141 + [0]*123
    y = np.array(all_samples)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.0, random_state=0)
    pipeline.fit(X_train, y_train)
    pickle.dump(pipeline, open('NeuralNet_model.pkl', 'wb'))

    prediction = pipeline.predict(D)
    probs = pipeline.predict_proba(D)
    gradation = {1.01: 5, 0.9: 4, 0.8: 3, 0.7: 2, 0.6: 1}
    ress1 = []
    simple_predicts = []
    scale_predicts = []
    for i in prediction:
        simple_predicts.append(i[0])
    for i in probs:
        scale_predicts.append(i[1]*10)
        compare = []
        for u in gradation:
            if i[1] < u:
                compare.append(gradation[u])
        ress1.append(min(compare))
    return simple_predicts, scale_predicts
Author: IraPS | Project: Suspense | Lines: 35 | Source: net_predict.py
Example 19: clasificador
def clasificador(self, X_train, y_train, X_test, target_names, y_test, all_labels):
    lb = preprocessing.MultiLabelBinarizer()
    Y = lb.fit_transform(y_train)
    classifier = Pipeline([
        ('vectorizer', CountVectorizer(strip_accents='unicode')),
        ('tfidf', TfidfTransformer()),
        ('to_dense', DenseTransformer()),
        ('clf', OneVsRestClassifier(GaussianNB()))])
    classifier.fit(X_train, Y)
    predicted = classifier.predict(X_test)
    etiquetas = lb.inverse_transform(predicted)
    for i in range(0, len(etiquetas)):
        etiquetas[i] = list(etiquetas[i])
    valoresMacro = self.macro(etiquetas, y_test)
    valoresMicro = self.micro(etiquetas, y_test)
Author: josearcosaneas | Project: RepositorioPara-la-entrega-del-TFG | Lines: 28 | Source: resumen+mas+extractoGB.py
Example 20: op_machine_predict
def op_machine_predict(self):
    """
    Unlike machine_predict, op_machine_predict picks the final parameters
    by grid search inside every window. For each window of data the model
    is trained and used to predict as follows:
    1) Standardize the input ta_factors
    2) Feature selection (the method is configurable)
    3) Reduce dimensionality with PCA
    4) Train and run the grid search
    """
    ta_factors, labels = self.set_factors_labels()
    svc = SVC(kernel='linear')
    min_max_scaler = preprocessing.MinMaxScaler()
    pre = pd.DataFrame(index=ta_factors.index[self.window_size:], columns=['pre_label', 'pre_actual'])
    Cs = range(10, 100, 10)
    gammas = range(5, 100, 5)
    n_s = self.window_size
    for num in range(0, len(ta_factors)-n_s):
        ta_factors_scaled = min_max_scaler.fit_transform(ta_factors.ix[num:num+n_s+1])
        x_train = ta_factors_scaled[:-1]
        x_test = ta_factors_scaled[-1:]
        y_train = labels[num:num+n_s]
        y_test = labels[num+n_s]
        # ta_factors_scaled_pca = pca.fit_transform(ta_factors_scaled)
        rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y_train, 2))
        clf = Pipeline([('feature_select', rfecv), ('svm', SVC())])
        # estimator = GridSearchCV(clf, dict(svm__C=Cs, svm__gamma=gammas))
        pre_model = clf.fit(x_train, y_train)
        pre['pre_label'][num] = pre_model.predict(x_test).item()
        pre['pre_actual'][num] = y_test
    pre['pre_acu'] = pre['pre_label'] == pre['pre_actual']
    self.prediction_results = pre
    return pre
Author: FayolChang | Project: mlp | Lines: 35 | Source: index_prediction.py
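Note that the docstring in Example 20 promises a grid search over the SVM's C and gamma, but the GridSearchCV line in the loop is commented out, so each window is actually fit with default SVC parameters. A sketch of what enabling that step might look like: the x_train/y_train here are random placeholders for one window's data, the C/gamma values are a small subset of the Cs/gammas grids above, and StratifiedKFold(n_splits=2) is the current scikit-learn spelling of the original's pre-0.18 StratifiedKFold(y_train, 2).

import numpy as np
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Placeholder window data, invented for illustration.
rng = np.random.RandomState(0)
x_train = rng.rand(60, 8)
y_train = rng.randint(0, 2, 60)

rfecv = RFECV(estimator=SVC(kernel='linear'), step=1, cv=StratifiedKFold(n_splits=2))
clf = Pipeline([('feature_select', rfecv), ('svm', SVC())])

# Route the SVM grid through the pipeline step name, as the
# commented-out line in the original intended.
param_grid = {'svm__C': [10, 50, 90], 'svm__gamma': [5, 50, 95]}
estimator = GridSearchCV(clf, param_grid)
estimator.fit(x_train, y_train)
pre_model = estimator.best_estimator_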
Note: The sklearn.pipeline.Pipeline class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. Consult the corresponding project's license before redistributing or reusing the code; do not reproduce this compilation without permission.