This article collects typical usage examples of the sklearn.datasets.load_files function in Python. If you have been wondering what exactly load_files does, how to call it, or what it looks like in real code, the curated examples below may help.
Twenty load_files code examples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
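Before the collected examples, here is a minimal, self-contained sketch of the basic call, for orientation. The directory name ./txt_sentoken/ and its category sub-folders are hypothetical; load_files only requires a container folder with one sub-folder per class, each holding plain-text files.

from sklearn.datasets import load_files

# load_files reads one sub-folder per class: file contents become .data,
# folder names become .target_names, and integer labels become .target
dataset = load_files('./txt_sentoken/',   # hypothetical container path
                     encoding='utf-8',
                     decode_error='ignore',
                     shuffle=True,
                     random_state=42)
print(len(dataset.data), dataset.target_names)

With the returned Bunch in hand, dataset.data is typically passed to a vectorizer and dataset.target to a classifier, as the examples below show.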
Example 1: export_classifier
def export_classifier():
    # note that this data is not in the git repo
    train_small = load_files('./training_data/')
    test_small = load_files('./test_data/')
    # Turn the text documents into vectors of word frequencies
    vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 2),
                                 stop_words='english',
                                 strip_accents='ascii')
    X_train = vectorizer.fit_transform(train_small.data)
    y_train = train_small.target
    # Fit a classifier on the training set
    classifier = LogisticRegression(penalty='l2', tol=0.0001, C=1.0,
                                    fit_intercept=True, intercept_scaling=1,
                                    ).fit(X_train, y_train)
    print("Training score: {0:.1f}%".format(
        classifier.score(X_train, y_train) * 100))
    # Evaluate the classifier on the testing set
    X_test = vectorizer.transform(test_small.data)
    y_test = test_small.target
    print("Testing score: {0:.1f}%".format(
        classifier.score(X_test, y_test) * 100))
    export_pickle('LRclassifier.txt', classifier)
    export_pickle('LRvectorizer.txt', vectorizer)
Author: sazlin | Project: reTOracle | Lines: 26 | Source: LR.py
Example 2: getData
def getData():
    train_data = load_files('training')
    test_data = load_files("test")
    count_Vec = TfidfVectorizer(min_df=1, decode_error="replace")
    doc_train = count_Vec.fit_transform(train_data.data)
    doc_test = count_Vec.transform(test_data.data)  # note: transform here, not fit_transform
    return doc_train.toarray(), train_data.target, doc_test.toarray()
Author: chen33 | Project: nlp | Lines: 7 | Source: index.py
Example 3: getData
def getData():
    train_data = load_files('dataset/train')
    test_data = load_files("dataset/test")
    count_Vec = TfidfVectorizer(min_df=1, decode_error="replace")
    doc_train = count_Vec.fit_transform(train_data.data)
    doc_test = count_Vec.transform(test_data.data)
    return doc_train.toarray(), train_data.target, doc_test.toarray(), test_data.target
Author: chen33 | Project: nlp | Lines: 7 | Source: lr.py
Example 4: createDataSet
def createDataSet(train_path, test_path, category, k):
    """
    Create vectorized text features.
    '0' refers to 'atheism'
    '1' refers to 'sports'
    """
    train_set = datasets.load_files(train_path, categories=category,
                                    load_content=True, shuffle=True,
                                    encoding='utf-8', decode_error='ignore', random_state=0)
    count_vect = CountVectorizer(encoding='utf-8', lowercase=True,
                                 decode_error='ignore', analyzer='word',
                                 ngram_range=(2, 4), min_df=1)
    tfidf_vecter = TfidfVectorizer(max_df=0.8, stop_words='english')
    test_set = datasets.load_files(test_path, categories=category,
                                   load_content=True, shuffle=True,
                                   encoding='utf-8', decode_error='ignore', random_state=0)
    X_train_tfidf = tfidf_vecter.fit_transform(train_set.data)
    X_train_counts = count_vect.fit_transform(train_set.data)
    X_test_tfidf = tfidf_vecter.transform(test_set.data)
    X_test_counts = count_vect.transform(test_set.data)
    # Relabel: class k becomes +1, every other class becomes -1
    for i in range(X_train_counts.shape[0]):
        if train_set.target[i] == k:
            train_set.target[i] = 1
        else:
            train_set.target[i] = -1
    for i in range(X_test_counts.shape[0]):
        if test_set.target[i] == k:
            test_set.target[i] = 1
        else:
            test_set.target[i] = -1
    #X_train_normalize = preprocessing.normalize(X_train_counts, norm='l2')
    #print train_set.target_names
    #print train_set.target
    #print size
    #print len(train_set.target)
    #print X_train_tfidf.shape
    #print X_train_counts
    #print X_train_normalize
    return X_train_counts, train_set.target, X_train_counts.shape, X_test_counts, test_set.target, X_test_counts.shape
Author: zoezou2015 | Project: ML_hm1 | Lines: 59 | Source: document_vectorize.py
Example 5: load
def load(dataset, categories):
    if dataset == 'full':
        train = load_files('aclImdb/aggregate/', categories=categories)
        return train
    elif dataset == 'split':
        train = load_files('aclImdb/train/', categories=categories)
        test = load_files('aclImdb/test/', categories=categories)
        return (train, test)
Author: aakashjain | Project: ReviewClassification | Lines: 9 | Source: data_loader.py
Example 6: vector_for_input_binary
def vector_for_input_binary(train_file_path="/mnt/hgfs/temp/machine learning/train",
                            test_file_path="/mnt/hgfs/temp/machine learning/test", categories=None):
    train_data = load.load_files(train_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    vectorized = feature_extraction.CountVectorizer(min_df=1, binary=True)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])
    return train_input, train_data['target'], test_input, test_data['target']
Author: zoezou2015 | Project: ML_hm1 | Lines: 10 | Source: Homework_1.py
Example 7: test_grid_search_cv_on_newsgroup
def test_grid_search_cv_on_newsgroup():
    ## load newsgroup data
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
    twenty_train_small = load_files('./data/20news-bydate-train/',
                                    categories=categories, charset='latin-1')
    twenty_test_small = load_files('./data/20news-bydate-test/',
                                   categories=categories, charset='latin-1')
    ## model pipeline using tfidf and passive aggressive
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', PassiveAggressiveClassifier(C=1)),
    ))
    param_grid = {
        'vec__min_df': [1, 2],
        'vec__max_df': [0.8, 1.0],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__use_idf': [True, False]
    }
    X, y = twenty_train_small.data, twenty_train_small.target
    ## cross validation with n_iter = 5
    grid_searcher = meta_search.GridSearch()
    # persist only once
    grid_searcher.persist_cv_splits('text_classification', X, y, './tmp/')
    grid_searcher.search(pipeline, param_grid)
    import time
    while not grid_searcher.isready():
        print time.sleep(2)
        print 'progress:', grid_searcher.progress()
        print 'best result:', grid_searcher.best_params_so_far()
        if grid_searcher.best_params_so_far():
            pass  # grid_searcher.abort()
    print len(grid_searcher.partial_result())
    ## run again with naive Bayes
    ## no need to persist_cv_splits again
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', MultinomialNB()),
    ))
    grid_searcher10 = meta_search.GridSearch(datafiles=grid_searcher.datafiles)
    grid_searcher10.search(pipeline, param_grid)
    while not grid_searcher10.isready():
        print time.sleep(2)
        print 'progress:', grid_searcher10.progress()
        print 'best result:', grid_searcher10.best_params_so_far()
        if grid_searcher10.best_params_so_far():
            pass  # grid_searcher10.abort()
    print len(grid_searcher10.partial_result())
Author: dolaameng | Project: machine-learning-toolkit | Lines: 52 | Source: test_meta_search.py
Example 8: main
def main():
    #buildTrainSet()
    #buildTestSet()
    train = load_files('model/train', encoding='utf-8')
    test = load_files('model/test', encoding='utf-8')
    print train.cc
    # for l in train.target_names:
    #     print l
    # for l in train.target:
    #     print l
    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    # note: the original code passes the Bunch objects directly to the
    # vectorizer; load_files returns a Bunch whose .data holds the documents
    X_train = vectorizer.fit(train)
    X_test = vectorizer.fit_transform(test)
    print X_train.get_feature_names()
Author: titopsur | Project: python_test | Lines: 14 | Source: test.py
Example 9: vector_for_input
def vector_for_input(train_file_path=path1,
                     test_file_path=path2, categories=None):
    train_data = load.load_files(train_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    # vectorized_normalized = feature_extraction.TfidfVectorizer(min_df=1)
    # train_input_normalized = vectorized_normalized.fit_transform(train_data['data'])
    # test_input_normalized = vectorized_normalized.transform(test_data['data'])
    vectorized = feature_extraction.CountVectorizer(min_df=1)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])
    return train_input, train_data['target'], test_input, test_data['target']
Author: zoezou2015 | Project: ML_hm1 | Lines: 14 | Source: Homework_1.py
Example 10: load_data
def load_data():
    # Download the data and unpack it into the ./data/txt_sentoken folder:
    # "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
    dataset = load_files('./data/txt_sentoken', shuffle=False)
    print("n_samples: %d" % len(dataset.data))
    return dataset
Author: yazquez | Project: poc-machine-learning | Lines: 7 | Source: MyGensim.py
Example 11: testdata_stats
def testdata_stats():
    test_dataset = datasets.load_files(project_root + "/testdata",
                                       encoding='utf-8',
                                       decode_error='ignore')
    # save_thing_to_file(test_dataset, "test_dataset.txt")
    bayes = get_thing_from_file("bayes.txt")
    bayes.fit(test_dataset.data, test_dataset.target)
    predicted_nb = bayes.predict(test_dataset.data)
    print "*****BAYESIAN STATS*****"
    print "average accuracy = " + \
        str(numpy.mean(predicted_nb == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_nb,
                                        target_names=test_dataset.target_names))
    print "*****BAYESIAN CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_nb)
    svm = get_thing_from_file("svm.txt")
    svm.fit(test_dataset.data, test_dataset.target)
    predicted_svm = svm.predict(test_dataset.data)
    print "*****SVM STATS*****"
    print "average accuracy = " + \
        str(numpy.mean(predicted_svm == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_svm,
                                        target_names=test_dataset.target_names))
    print "*****SVM CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_svm)
Author: colinricardo28 | Project: Peepl | Lines: 31 | Source: analysis.py
Example 12: load_SRAA
def load_SRAA(AVI_HOME='./SRAA/partition1/data', percent=1./3, rnd=2342,
              vect=CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))):
    data = load_files(AVI_HOME, encoding="latin1", load_content=True, random_state=rnd)
    data.data = [remove_header_subject(text) for text in data.data]
    indices = ShuffleSplit(len(data.data), n_iter=1, test_size=percent, indices=True, random_state=rnd)
    for train_ind, test_ind in indices:
        data = bunch.Bunch(train=bunch.Bunch(data=[data.data[i] for i in train_ind], target=data.target[train_ind]),
                           test=bunch.Bunch(data=[data.data[i] for i in test_ind], target=data.target[test_ind]))
    X_tr = vect.fit_transform(data.train.data)
    y_tr = data.train.target
    X_te = vect.transform(data.test.data)
    y_te = data.test.target
    # cache the files
    pickle.dump(X_tr, open('SRAA_X_train.pickle', 'wb'))
    pickle.dump(y_tr, open('SRAA_y_train.pickle', 'wb'))
    pickle.dump(X_te, open('SRAA_X_test.pickle', 'wb'))
    pickle.dump(y_te, open('SRAA_y_test.pickle', 'wb'))
    pickle.dump(data.train.data, open('SRAA_X_train_corpus.pickle', 'wb'))
    pickle.dump(data.test.data, open('SRAA_X_test_corpus.pickle', 'wb'))
    pickle.dump(vect.get_feature_names(), open('SRAA_feature_names.pickle', 'wb'))
    return (X_tr, y_tr, X_te, y_te, data.train.data, data.test.data)
Author: dzhuang2 | Project: active_learn | Lines: 26 | Source: load_SRAA.py
Example 13: text_sentiment
def text_sentiment(docs_new):
    docs_new = [docs_new]
    twenty_train = load_files('./Sentiment')  # the complete data is in this directory, like comp.graphics etc.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(twenty_train.data)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
    # Fit a classifier on the training set
    #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
    #f = open('my_classifier.pickle', 'wb')
    #pickle.dump(clf, f)
    #f = open('my_classifier.pickle',)
    #clf = pickle.load(f)
    #f.close()
    # save the classifier
    #with open('my_sentiment.pkl', 'wb') as fid:
    #    cPickle.dump(clf, fid)
    # load it again
    with open('my_sentiment.pkl', 'rb') as fid:
        clf = cPickle.load(fid)
    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = clf.predict(X_new_tfidf)
    return twenty_train.target_names[predicted]
Author: amangarg078 | Project: TextGenius | Lines: 29 | Source: sentiment.py
Example 14: text_classifly_twang
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size=0.2, random_state=0)
    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]
    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit a MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print 'Accuracy: ', acc
    return acc
Author: ZHAOTING | Project: WebDataMining_Kaggle | Lines: 26 | Source: feature_selection_test.py
Example 15: __init__
def __init__(self, file_path):
    self.training_documents = load_files(container_path='./20news-bydate/20news-bydate-train',
                                         categories=CATEGORIES,
                                         decode_error='ignore',
                                         shuffle=True,
                                         encoding='utf-8',
                                         random_state=42)
    self.test_documents = load_files(container_path='./20news-bydate/20news-bydate-test',
                                     categories=CATEGORIES,
                                     decode_error='ignore',
                                     shuffle=True,
                                     encoding='utf-8',
                                     random_state=42)
    self.file_path = file_path
Author: sherkin735 | Project: dmsapp | Lines: 16 | Source: Classifier.py
Example 16: runClassifiers
def runClassifiers(dataDir):
    data = load_files(dataDir)
    nbClassifier = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('classifier', MultinomialNB())])
    parameters = {'vect__ngram_range': [(1, 1), (2, 2), (3, 3), (1, 2), (1, 3)],
                  'vect__binary': [True, False],
                  'tfidf__use_idf': [True, False],
                  'classifier__alpha': [1e-2, 1e-3]}
    gs = GridSearchCV(nbClassifier, parameters, n_jobs=-1, verbose=1)
    gs.fit(data.data, data.target)
    best_parameters = gs.best_estimator_.get_params()
    print("Best score: %0.3f" % gs.best_score_)
    for params, mean_score, scores in gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print("Done")
    pass
Author: maya-ramanath | Project: OP | Lines: 26 | Source: nb.py
Example 17: load_docs
def load_docs(path):
    dataset = load_files(args.train_path)
    docs = []
    for raw_data in dataset.data:
        docs.append(json.loads(raw_data))
    dataset.data = docs
    return dataset
Author: rolando-archive | Project: yatiri | Lines: 7 | Source: run_classifier.py
Example 18: test_load_files_w_categories_desc_and_encoding
def test_load_files_w_categories_desc_and_encoding():
    category = os.path.abspath(TEST_CATEGORY_DIR1).split("/").pop()
    res = load_files(LOAD_FILES_ROOT, description="test", categories=category, encoding="utf-8")
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 1)
    assert_equal(res.DESCR, "test")
    assert_equal(res.data, [u("Hello World!\n")])
Author: smorfopoulou | Project: viral_denovo_pipeline | Lines: 7 | Source: test_base.py
Example 19: test_docs
def test_docs(dir):
    # Load documents
    docs = datasets.load_files(container_path="../../sklearn_data/" + dir)
    X, y = docs.data, docs.target
    baseline = 1 / float(len(list(np.unique(y))))
    # Select features via a bag-of-words approach without stop words
    #X = CountVectorizer(charset_error='ignore', stop_words='english', strip_accents='unicode', ).fit_transform(X)
    X = TfidfVectorizer(charset_error='ignore', stop_words='english', analyzer='char', ngram_range=(2, 4), strip_accents='unicode', sublinear_tf=True, max_df=0.5).fit_transform(X)
    n_samples, n_features = X.shape
    # sklearn's grid search
    parameters = {'alpha': np.logspace(-100, 0, 10)}
    bv = Bootstrap(n_samples, n_iter=10, test_size=0.3, random_state=42)
    mnb_gv = GridSearchCV(MultinomialNB(), parameters, cv=bv,)
    #scores = cross_val_score(mnb_gv, X, y, cv=bv)
    mnb_gv.fit(X, y)
    mnb_gv_best_params = mnb_gv.best_params_.values()[0]
    print mnb_gv.best_score_
    print mnb_gv_best_params
    # CV with Bootstrap
    mnb = MultinomialNB(alpha=mnb_gv_best_params)
    boot_scores = cross_val_score(mnb, X, y, cv=bv)
    print mean_sem(boot_scores)
    improvement = (mnb_gv.best_score_ - baseline) / baseline
    rand_baseline.append(baseline)
    test_results.append([mnb_gv.best_score_])
    com_results.append(improvement)
    sem_results.append(sem(boot_scores))
Author: dropofwill | Project: author-attr-experiments | Lines: 35 | Source: multidoc_mnb.py
Example 20: train
def train(param_search=False):
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]
    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = make_pipeline(TfidfVectorizer(min_df=2, dtype=float,
                                        sublinear_tf=True,
                                        ngram_range=(1, 2),
                                        strip_accents='unicode'),
                        LogisticRegression(random_state=623, C=5000))
    if param_search:
        params = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                  'lr__C': [1000, 5000, 10000]}
        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)
        print("Parameters found:")
        pprint(gs.best_params_)
        print("Cross-validation accuracy: %.3f" % gs.best_score_)
        return gs.best_estimator_
    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
Author: PaulHuygen | Project: xtas | Lines: 33 | Source: _polarity.py
Note: The sklearn.datasets.load_files examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation hosting platforms. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. For distribution and use, please refer to the license of the corresponding project. Do not repost without permission.