Python datasets.load_files function code examples


This article collects typical usage examples of the load_files function from Python's sklearn.datasets module. If you have been wondering what load_files does, how to call it, or what real-world usage looks like, the curated examples below should help.



The following presents 20 code examples of the load_files function, ordered by popularity by default. Upvoting the examples you find useful helps surface better Python code samples.
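As a point of reference before the examples, here is a minimal sketch of a typical load_files call. The directory name ./txt_sentoken and the two-class folder layout are assumed purely for illustration (they mirror the movie-review layout used in several examples below); adjust the path to your own data.

from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer

# load_files expects one sub-directory per class, e.g.
#   ./txt_sentoken/pos/*.txt  and  ./txt_sentoken/neg/*.txt
# (./txt_sentoken is a hypothetical path used only for illustration)
dataset = load_files('./txt_sentoken', encoding='utf-8', decode_error='ignore',
                     shuffle=True, random_state=42)

print(len(dataset.data))       # raw document strings
print(dataset.target_names)    # class names, taken from the folder names
print(dataset.target[:10])     # integer labels aligned with dataset.data

# the returned Bunch plugs directly into any scikit-learn text vectorizer
X = TfidfVectorizer(min_df=2).fit_transform(dataset.data)
print(X.shape)

Every example that follows is a variation on this pattern: load a folder of labelled text files, vectorize dataset.data, and train a classifier against dataset.target.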

Example 1: export_classifier

def export_classifier():
    #note that this data is not in the git repo
    train_small = load_files('./training_data/')
    test_small = load_files('./test_data/')

    # Turn the text documents into vectors of word frequencies
    vectorizer = CountVectorizer(min_df=5, ngram_range=(1, 2),
                                 stop_words='english',
                                 strip_accents='ascii')
    X_train = vectorizer.fit_transform(train_small.data)
    y_train = train_small.target

    # Fit a classifier on the training set
    classifier = LogisticRegression(penalty='l2', tol=0.0001, C=1.0,
                                    fit_intercept=True, intercept_scaling=1,
                                    ).fit(X_train, y_train)
    print("Training score: {0:.1f}%".format(
        classifier.score(X_train, y_train) * 100))

    # Evaluate the classifier on the testing set
    X_test = vectorizer.transform(test_small.data)
    y_test = test_small.target
    print("Testing score: {0:.1f}%".format(
        classifier.score(X_test, y_test) * 100))
    export_pickle('LRclassifier.txt', classifier)
    export_pickle('LRvectorizer.txt', vectorizer)
Developer: sazlin; project: reTOracle; lines of code: 26; source file: LR.py


Example 2: getData

def getData():
	train_data= load_files('training')    
	test_data=load_files("test")
	count_Vec=TfidfVectorizer(min_df=1,decode_error="replace")
	doc_train=count_Vec.fit_transform(train_data.data)
	doc_test = count_Vec.transform(test_data.data)  # note: transform (not fit_transform), so the training vocabulary is reused
	return doc_train.toarray(),train_data.target,doc_test.toarray()
Developer: chen33; project: nlp; lines of code: 7; source file: index.py


Example 3: getData

def getData():
	train_data= load_files('dataset/train')    
	test_data=load_files("dataset/test")
	count_Vec=TfidfVectorizer(min_df=1,decode_error="replace")
	doc_train=count_Vec.fit_transform(train_data.data)
	doc_test=count_Vec.transform(test_data.data)
	return doc_train.toarray(),train_data.target,doc_test.toarray(),test_data.target
Developer: chen33; project: nlp; lines of code: 7; source file: lr.py


Example 4: createDataSet

def createDataSet(train_path,test_path,category,k):
	"""
	Create vectorized text features.
	'0' refers to 'atheism'
	'1' refers to 'sports'
	"""
	train_set = datasets.load_files(train_path,categories=category, 
	load_content=True, shuffle=True, encoding='utf-8', decode_error='ignore', random_state=0)

	count_vect = CountVectorizer(encoding = 'utf-8',lowercase = True,
	 decode_error = 'ignore',  analyzer = 'word', ngram_range = (2,4),min_df = 1)
	
	tfidf_vecter = TfidfVectorizer( max_df = 0.8, stop_words = 'english')

	test_set = datasets.load_files(test_path,categories=category, 
	load_content=True, shuffle=True, encoding='utf-8',  decode_error='ignore', random_state=0)

	

	X_train_tfidf = tfidf_vecter.fit_transform(train_set.data)
	X_train_counts = count_vect.fit_transform(train_set.data)

	X_test_tfidf = tfidf_vecter.transform(test_set.data)
	X_test_counts = count_vect.transform(test_set.data)

	for i in range(X_train_counts.shape[0]):
		if train_set.target[i] == k:
			train_set.target[i] = 1
		else:
			train_set.target[i] = -1

	for i in range(X_test_counts.shape[0]):
		if test_set.target[i] == k:
			test_set.target[i] = 1
		else:
			test_set.target[i] = -1

	#X_train_normalize = preprocessing.normalize(X_train_counts, norm = 'l2')

	#print train_set.target_names
	#print train_set.target
	#print size
	#print len(train_set.target)
	#print X_train_tfidf.shape
	#print X_train_counts
	#print X_train_normalize

	return X_train_counts, train_set.target, X_train_counts.shape, X_test_counts, test_set.target, X_test_counts.shape
Developer: zoezou2015; project: ML_hm1; lines of code: 59; source file: document_vectorize.py


Example 5: load

def load(dataset, categories):
    if dataset == 'full':
        train = load_files('aclImdb/aggregate/', categories=categories)
        return train

    elif dataset == 'split':    
        train = load_files('aclImdb/train/', categories=categories)
        test = load_files('aclImdb/test/', categories=categories)
        return (train, test)
Developer: aakashjain; project: ReviewClassification; lines of code: 9; source file: data_loader.py


Example 6: vector_for_input_binary

def vector_for_input_binary(train_file_path="/mnt/hgfs/temp/machine learning/train",
                            test_file_path="/mnt/hgfs/temp/machine learning/test", categories=None):
    train_data = load.load_files(train_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories, encoding='utf-8', decode_error='ignore')

    vectorized = feature_extraction.CountVectorizer(min_df=1, binary=True)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])

    return train_input, train_data['target'], test_input, test_data['target']
Developer: zoezou2015; project: ML_hm1; lines of code: 10; source file: Homework_1.py


Example 7: test_grid_search_cv_on_newsgroup

def test_grid_search_cv_on_newsgroup():
    ## load news group data
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]
    twenty_train_small = load_files('./data/20news-bydate-train/',
        categories=categories, charset='latin-1')
    twenty_test_small = load_files('./data/20news-bydate-test/',
        categories=categories, charset='latin-1')
    ## model pipeline using tfidf and passive aggresive
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', PassiveAggressiveClassifier(C=1)),
    ))
    param_grid = {
        'vec__min_df': [1, 2],
        'vec__max_df': [0.8, 1.0],
        'vec__ngram_range': [(1, 1), (1, 2)],
        'vec__use_idf': [True, False]
    }
    X, y = twenty_train_small.data, twenty_train_small.target
    ## cross validation on n_iter = 5
    grid_searcher = meta_search.GridSearch()
    # persist only once
    grid_searcher.persist_cv_splits('text_classification', X, y, './tmp/')
    grid_searcher.search(pipeline, param_grid)
    import time
    while not grid_searcher.isready():
        print time.sleep(2)
        print 'progress:', grid_searcher.progress()
        print 'best result:', grid_searcher.best_params_so_far()
        if grid_searcher.best_params_so_far():
            pass#grid_searcher.abort()
    print len(grid_searcher.partial_result())
    ## run again with naive bayesian
    ## no need to persist_cv_splits
    pipeline = Pipeline((
        ('vec', TfidfVectorizer(min_df=1, max_df=0.8, use_idf=True)),
        ('clf', MultinomialNB()),
    ))
    grid_searcher10 = meta_search.GridSearch(datafiles = grid_searcher.datafiles)
    grid_searcher10.search(pipeline, param_grid)
    while not grid_searcher10.isready():
        print time.sleep(2)
        print 'progress:', grid_searcher10.progress()
        print 'best result:', grid_searcher10.best_params_so_far()
        if grid_searcher10.best_params_so_far():
            pass#grid_searcher10.abort()
    print len(grid_searcher10.partial_result())    
Developer: dolaameng; project: machine-learning-toolkit; lines of code: 52; source file: test_meta_search.py


Example 8: main

def main():
    #buildTrainSet()
    #buildTestSet()
    train = load_files('model/train', encoding='utf-8')
    test = load_files('model/test', encoding='utf-8')
    print train.target_names  # the Bunch returned by load_files has no 'cc' attribute; target_names lists the classes
#    for l in train.target_names:
#        print l
#    for l in train.target:
#        print l
    vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
    X_train = vectorizer.fit_transform(train.data)  # learn the vocabulary from the training texts
    X_test = vectorizer.transform(test.data)        # reuse that vocabulary on the test texts
    print vectorizer.get_feature_names()            # feature names live on the vectorizer, not on the matrix
Developer: titopsur; project: python_test; lines of code: 14; source file: test.py


Example 9: vector_for_input

def vector_for_input(train_file_path=path1,
                     test_file_path=path2, categories=None):
    train_data = load.load_files(train_file_path, categories=categories, encoding='utf-8', decode_error='ignore')
    test_data = load.load_files(test_file_path, categories=categories, encoding='utf-8', decode_error='ignore')

    # vectorized_normalized = feature_extraction.TfidfVectorizer(min_df=1)
    # train_input_normalized = vectorized_normalized.fit_transform(train_data['data'])
    # test_input_normalized = vectorized_normalized.transform(test_data['data'])

    vectorized = feature_extraction.CountVectorizer(min_df=1)
    train_input = vectorized.fit_transform(train_data['data'])
    test_input = vectorized.transform(test_data['data'])

    return train_input, train_data['target'], test_input, test_data['target']
Developer: zoezou2015; project: ML_hm1; lines of code: 14; source file: Homework_1.py


Example 10: load_data

def load_data():
    # Download the data and unpack it into the folder ./data/txt_sentoken
    # Source: "http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"
    dataset = load_files('./data/txt_sentoken', shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    return dataset
Developer: yazquez; project: poc-machine-learning; lines of code: 7; source file: MyGensim.py


Example 11: testdata_stats

def testdata_stats():
    test_dataset = datasets.load_files(project_root+"/testdata",
                                     encoding='utf-8',
                                  decode_error='ignore')

    # save_thing_to_file(test_dataset, "test_dataset.txt")

    bayes = get_thing_from_file("bayes.txt")
    bayes.fit(test_dataset.data, test_dataset.target)
    predicted_nb = bayes.predict(test_dataset.data)

    print "*****BAYESIAN STATS****"
    print "average accuracy = " + \
            str(numpy.mean(predicted_nb == test_dataset.target))

    print(metrics.classification_report(test_dataset.target, predicted_nb,
    target_names=test_dataset.target_names))
    print "*****BAYESIAN CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_nb)

    svm = get_thing_from_file("svm.txt")
    svm.fit(test_dataset.data, test_dataset.target)
    predicted_svm = svm.predict(test_dataset.data)

    print "*****SVM STATS*****"
    print "average accuracy = " + \
            str(numpy.mean(predicted_svm == test_dataset.target))
    print(metrics.classification_report(test_dataset.target, predicted_svm,
    target_names=test_dataset.target_names))
    print "*****SVM CONFUSION MATRIX*****"
    print metrics.confusion_matrix(test_dataset.target, predicted_svm)
Developer: colinricardo28; project: Peepl; lines of code: 31; source file: analysis.py


Example 12: load_SRAA

def load_SRAA(AVI_HOME='./SRAA/partition1/data', percent=1./3, rnd=2342, \
              vect=CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))):
    data = load_files(AVI_HOME, encoding="latin1", load_content=True, random_state=rnd)
    data.data = [remove_header_subject(text) for text in data.data]

    indices = ShuffleSplit(len(data.data), n_iter=1, test_size=percent, indices=True, random_state=rnd)
    for train_ind, test_ind in indices:
        data = bunch.Bunch(train=bunch.Bunch(data=[data.data[i] for i in train_ind], target=data.target[train_ind]),
                              test=bunch.Bunch(data=[data.data[i] for i in test_ind], target=data.target[test_ind]))

    X_tr = vect.fit_transform(data.train.data)
    y_tr = data.train.target

    X_te = vect.transform(data.test.data)
    y_te = data.test.target
    
    # cache the files
    pickle.dump(X_tr, open('SRAA_X_train.pickle', 'wb'))
    pickle.dump(y_tr, open('SRAA_y_train.pickle', 'wb'))
    pickle.dump(X_te, open('SRAA_X_test.pickle', 'wb'))
    pickle.dump(y_te, open('SRAA_y_test.pickle', 'wb'))
    pickle.dump(data.train.data, open('SRAA_X_train_corpus.pickle', 'wb'))
    pickle.dump(data.test.data, open('SRAA_X_test_corpus.pickle', 'wb'))
    pickle.dump(vect.get_feature_names(), open('SRAA_feature_names.pickle', 'wb'))
    
    return (X_tr, y_tr, X_te, y_te, data.train.data, data.test.data)
Developer: dzhuang2; project: active_learn; lines of code: 26; source file: load_SRAA.py


Example 13: text_sentiment

def text_sentiment(docs_new):
   docs_new=[docs_new]
   twenty_train= load_files('./Sentiment')  #the complete data is in this directory; like comp.graphics etc
   count_vect = CountVectorizer()
   X_train_counts = count_vect.fit_transform(twenty_train.data)
   tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
   X_train_tf = tf_transformer.transform(X_train_counts)
   tfidf_transformer = TfidfTransformer()
   X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

   # Fit a classifier on the training set
   #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
   #f = open('my_classifier.pickle', 'wb')
   #pickle.dump(clf, f)
   #f = open('my_classifier.pickle',)
   #clf = pickle.load(f)
   #f.close()
   # save the classifier
   #with open('my_sentiment.pkl', 'wb') as fid:
      #cPickle.dump(clf, fid)    

   # load it again
   with open('my_sentiment.pkl', 'rb') as fid:
      clf = cPickle.load(fid)
   X_new_counts = count_vect.transform(docs_new)
   X_new_tfidf = tfidf_transformer.transform(X_new_counts)

   predicted = clf.predict(X_new_tfidf)
   return twenty_train.target_names[predicted]
Developer: amangarg078; project: TextGenius; lines of code: 29; source file: sentiment.py


Example 14: text_classifly_twang

def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)  
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
    
    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary = True)   
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]
    
    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec= vectorizer.transform(doc_str_list_test)
    
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit a MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    
    acc = np.mean(doc_test_predicted == doc_class_list_test)  
    print 'Accuracy: ', acc
    
    return acc
Developer: ZHAOTING; project: WebDataMining_Kaggle; lines of code: 26; source file: feature_selection_test.py


Example 15: __init__

    def __init__(self, file_path):
        self.training_documents = load_files(container_path='./20news-bydate/20news-bydate-train',
                                       categories=CATEGORIES,
                                       decode_error='ignore',
                                       shuffle=True,
                                       encoding='utf-8',
                                       random_state=42)

        self.test_documents = load_files(container_path='./20news-bydate/20news-bydate-test',
                                       categories=CATEGORIES,
                                       decode_error='ignore',
                                       shuffle=True,
                                       encoding='utf-8',
                                       random_state=42)

        self.file_path = file_path
Developer: sherkin735; project: dmsapp; lines of code: 16; source file: Classifier.py


Example 16: runClassifiers

def runClassifiers (dataDir):
    
    data = load_files(dataDir)

    nbClassifier = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('classifier', MultinomialNB())])
    
    parameters = {'vect__ngram_range': [(1,1),(2,2),(3,3),(1,2),(1,3)],
                  'vect__binary': [True, False],
                  'tfidf__use_idf': [True, False],
                  'classifier__alpha': [1e-2, 1e-3]}
    
    gs = GridSearchCV(nbClassifier, parameters, n_jobs=-1, verbose=1)
    gs.fit(data.data, data.target)
    best_parameters = gs.best_estimator_.get_params()
    
    print("Best score: %0.3f" % gs.best_score_)
    for params, mean_score, scores in gs.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() * 2, params))
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    print("Done")
    
    pass
Developer: maya-ramanath; project: OP; lines of code: 26; source file: nb.py


Example 17: load_docs

def load_docs(path):
    dataset = load_files(path)  # use the path argument; the original read a global args.train_path here
    docs = []
    for raw_data in dataset.data:
        docs.append(json.loads(raw_data))
    dataset.data = docs
    return dataset
Developer: rolando-archive; project: yatiri; lines of code: 7; source file: run_classifier.py


Example 18: test_load_files_w_categories_desc_and_encoding

def test_load_files_w_categories_desc_and_encoding():
    category = os.path.abspath(TEST_CATEGORY_DIR1).split("/").pop()
    res = load_files(LOAD_FILES_ROOT, description="test", categories=category, encoding="utf-8")
    assert_equal(len(res.filenames), 1)
    assert_equal(len(res.target_names), 1)
    assert_equal(res.DESCR, "test")
    assert_equal(res.data, [u("Hello World!\n")])
Developer: smorfopoulou; project: viral_denovo_pipeline; lines of code: 7; source file: test_base.py


Example 19: test_docs

def test_docs(dir):
	# Load documents
	docs = datasets.load_files(container_path="../../sklearn_data/"+dir)
	X, y = docs.data, docs.target

	baseline = 1/float(len(list(np.unique(y))))

	# Select Features via Bag of Words approach without stop words
	#X = CountVectorizer(charset_error='ignore', stop_words='english', strip_accents='unicode', ).fit_transform(X)
	X = TfidfVectorizer(charset_error='ignore', stop_words='english', analyzer='char', ngram_range=(2,4), strip_accents='unicode', sublinear_tf=True, max_df=0.5).fit_transform(X)
	n_samples, n_features = X.shape


	# sklearn's grid search
	parameters = { 'alpha': np.logspace(-100,0,10)}

	bv = Bootstrap(n_samples, n_iter=10, test_size=0.3, random_state=42)
	mnb_gv = GridSearchCV(MultinomialNB(), parameters, cv=bv,)
	#scores = cross_val_score(mnb_gv, X, y, cv=bv)
	mnb_gv.fit(X, y)
	mnb_gv_best_params = mnb_gv.best_params_.values()[0]
	print mnb_gv.best_score_
	print mnb_gv_best_params

	# CV with Bootstrap
	mnb = MultinomialNB(alpha=mnb_gv_best_params)
	boot_scores = cross_val_score(mnb, X, y, cv=bv)
	print mean_sem(boot_scores)

	improvement = (mnb_gv.best_score_ - baseline) / baseline

	rand_baseline.append(baseline)
	test_results.append([mnb_gv.best_score_])
	com_results.append(improvement)
	sem_results.append(sem(boot_scores))
Developer: dropofwill; project: author-attr-experiments; lines of code: 35; source file: multidoc_mnb.py


Example 20: train

def train(param_search=False):
    data = load_files(download())
    y = [data.target_names[t] for t in data.target]

    # The random state on the LR estimator is fixed to the most arbitrary value
    # that I could come up with. It is biased toward the middle number keys on
    # my keyboard.
    clf = make_pipeline(TfidfVectorizer(min_df=2, dtype=float,
                                        sublinear_tf=True,
                                        ngram_range=(1, 2),
                                        strip_accents='unicode'),
                        LogisticRegression(random_state=623, C=5000))

    if param_search:
        params = {'tfidf__ngram_range': [(1, 1), (1, 2)],
                  'lr__C': [1000, 5000, 10000]}

        print("Starting parameter search for review sentiment classification")
        # We ignore the original folds in the data, preferring a simple 5-fold
        # CV instead; this is intended to get a working model, not results for
        # publication.
        gs = GridSearchCV(clf, params, cv=5, refit=True, n_jobs=-1, verbose=2)
        gs.fit(data.data, y)

        print("Parameters found:")
        pprint(gs.best_params_)
        print("Cross-validation accuracy: %.3f" % gs.best_score_)

        return gs.best_estimator_

    else:
        print("Training logistic regression for movie review polarity")
        return clf.fit(data.data, y)
Developer: PaulHuygen; project: xtas; lines of code: 33; source file: _polarity.py



Note: the sklearn.datasets.load_files examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by their respective authors, and copyright remains with those authors; consult each project's license before redistributing or reusing the code. Do not reproduce without permission.

