
Python datasets.fetch_20newsgroups Function Code Examples


This article collects typical usage examples of the sklearn.datasets.fetch_20newsgroups function in Python. If you have been wondering how fetch_20newsgroups is used in practice, or what a real call looks like, the curated code examples below should help.



The following presents 20 code examples of the fetch_20newsgroups function, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
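As a quick orientation before the examples, here is a minimal sketch of a typical fetch_20newsgroups call (the category names are standard 20 Newsgroups labels; the variable names are illustrative and not taken from any project below):

from sklearn.datasets import fetch_20newsgroups

# Load the training split (downloaded and cached on first use), restricted
# to two categories, with headers, signatures, and quoted replies removed.
train = fetch_20newsgroups(subset='train',                           # 'train', 'test', or 'all'
                           categories=['sci.space', 'rec.autos'],    # None loads all 20 groups
                           remove=('headers', 'footers', 'quotes'),  # strip metadata that can leak the label
                           shuffle=True, random_state=42)

print(len(train.data))      # raw article texts (list of str)
print(train.target[:10])    # integer labels indexing into train.target_names
print(train.target_names)   # ['rec.autos', 'sci.space']

Most of the examples below vary only these parameters: subset, categories, remove, shuffle, and random_state.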

Example 1: load_20newsgroups

def load_20newsgroups(category=None, shuffle=True, rnd=1):
    categories = {'religion': ['alt.atheism', 'talk.religion.misc'],
                  'graphics': ['comp.graphics', 'comp.windows.x'],
                  'hardware': ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  'baseball': ['rec.sport.baseball', 'sci.crypt']}
    cat = None
    if category is not None:
        cat = categories[category]

    data = bunch.Bunch()
    data.train = fetch_20newsgroups(subset='train', categories=cat, remove=('headers', 'footers', 'quotes'),
                                    shuffle=shuffle, random_state=rnd)

    # data.train.data = np.array([keep_header_subject(text) for text in data.train.data], dtype=object)
    data.train.data = np.array(data.train.data, dtype=object)
    data.test = fetch_20newsgroups(subset='test', categories=cat, remove=('headers', 'footers', 'quotes'),
                                   shuffle=shuffle, random_state=rnd)

    # data.test.data = np.array([keep_header_subject(text) for text in data.test.data], dtype=object)
    data.test.data = np.array(data.test.data, dtype=object)
    data = minimum_size(data)

    if shuffle:
        random_state = np.random.RandomState(rnd)
        indices = np.arange(data.train.target.shape[0])
        random_state.shuffle(indices)
        data.train.filenames = data.train.filenames[indices]
        data.train.target = data.train.target[indices]
        # Use an object array to shuffle: avoids memory copy
        data_lst = np.array(data.train.data, dtype=object)
        data_lst = data_lst[indices]
        data.train.data = data_lst

    return data
Author: mramire8, Project: structured, Lines: 34, Source: datautils.py


Example 2: get_data

def get_data():
    newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories= ['talk.politics.guns', 'talk.politics.mideast','alt.atheism','talk.politics.misc', 'talk.religion.misc'])
    newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),categories= ['talk.politics.guns', 'talk.politics.mideast','alt.atheism','talk.politics.misc', 'talk.religion.misc'])
    # texts_train, target_train = newsgroups_train.data, newsgroups_train.target
    # texts_test, target_test = newsgroups_test.data,newsgroups_test.target
    #return texts_train, target_train, newsgroups_train.filenames, texts_test, target_test, newsgroups_test.filenames
    return newsgroups_train, newsgroups_test
Author: t0mst0ne, Project: wordvectors, Lines: 7, Source: 20newsgroup.py


Example 3: export_20ng

def export_20ng(remove_headers=False, remove_footers=False, remove_quotes=False, categories=None):
    output_dir = os.path.join('..', 'datasets', '20ng', 'data')
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    remove = []
    if remove_headers:
        remove.append('headers')
    if remove_footers:
        remove.append('footers')
    if remove_quotes:
        remove.append('quotes')

    print categories

    ng_train = fetch_20newsgroups(subset='train', remove=remove, categories=categories)
    keys = ['train' + str(i) for i in range(len(ng_train.data))]
    print len(keys)
    train_text = dict(zip(keys, ng_train.data))
    fh.write_to_json(train_text, os.path.join(output_dir, 'train.json'))

    train_labels = pd.DataFrame(ng_train.target, columns=['target'], index=keys)
    train_labels.to_csv(os.path.join(output_dir, 'train.csv'))
    print train_labels.shape

    ng_test = fetch_20newsgroups(subset='test', remove=remove, categories=categories)
    keys = ['test' + str(i) for i in range(len(ng_test.data))]
    test_text = dict(zip(keys, ng_test.data))
    fh.write_to_json(test_text, os.path.join(output_dir, 'test.json'))

    test_labels = pd.DataFrame(ng_test.target, columns=['target'], index=keys)
    test_labels.to_csv(os.path.join(output_dir, 'test.csv'))
Author: anukat2015, Project: ARKcat, Lines: 32, Source: format_20ng_data.py


Example 4: test_lime_text_tabular_not_equal_random_state

    def test_lime_text_tabular_not_equal_random_state(self):
        categories = ['alt.atheism', 'soc.religion.christian']
        newsgroups_train = fetch_20newsgroups(subset='train',
                                              categories=categories)
        newsgroups_test = fetch_20newsgroups(subset='test',
                                             categories=categories)
        class_names = ['atheism', 'christian']
        vectorizer = TfidfVectorizer(lowercase=False)
        train_vectors = vectorizer.fit_transform(newsgroups_train.data)
        test_vectors = vectorizer.transform(newsgroups_test.data)
        nb = MultinomialNB(alpha=.01)
        nb.fit(train_vectors, newsgroups_train.target)
        pred = nb.predict(test_vectors)
        f1_score(newsgroups_test.target, pred, average='weighted')
        c = make_pipeline(vectorizer, nb)

        explainer = LimeTextExplainer(
            class_names=class_names, random_state=10)
        exp_1 = explainer.explain_instance(newsgroups_test.data[83],
                                           c.predict_proba, num_features=6)

        explainer = LimeTextExplainer(
            class_names=class_names, random_state=20)
        exp_2 = explainer.explain_instance(newsgroups_test.data[83],
                                           c.predict_proba, num_features=6)

        self.assertFalse(exp_1.as_map() == exp_2.as_map())
Author: marcotcr, Project: lime, Lines: 27, Source: test_lime_text.py


Example 5: News

def News():
    from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
    newsgroups_train = datasets.fetch_20newsgroups(subset='train')
    vectorizer = CountVectorizer(encoding='latin-1', max_features=30000)
    #vectorizer = HashingVectorizer(encoding='latin-1')
    x_train = vectorizer.fit_transform(newsgroups_train.data)
    x_train = numpy.asarray(x_train.todense(), dtype='float32')
    y_train = numpy.asarray(newsgroups_train.target, dtype='int32')
    newsgroups_test = datasets.fetch_20newsgroups(subset='test')
    x_test = vectorizer.transform(newsgroups_test.data)
    x_test = numpy.asarray(x_test.todense(), dtype='float32')
    y_test = numpy.asarray(newsgroups_test.target, dtype='int32')
    dnn=RegularizedNet(numpy_rng=numpy.random.RandomState(123), theano_rng=None, 
            n_ins=x_train.shape[1],
            layers_types=[ReLU, ReLU, LogisticRegression],
            layers_sizes=[1000, 1000],
            n_outs=len(set(y_train)),
            rho=0.95, 
            eps=1.E-6,
            max_norm=0.,
            debugprint=False,
            L1_reg=0.,
            L2_reg=1./x_train.shape[0])
    print len(set(y_train))
    dnn.fit(x_train, y_train, max_epochs=30, method='adadelta_nesterov', verbose=True, plot=False)
    test_error = dnn.score(x_test, y_test)
    print("score: %f" % (1. - test_error))
Author: KayneWest, Project: Stuff, Lines: 27, Source: nesterov_dnn.py


Example 6: Load20NG

def Load20NG():
    cats = ['alt.atheism', 'soc.religion.christian']
    newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
    newsgroups_test = fetch_20newsgroups(subset='test', categories=cats)
    train, train_labels = newsgroups_train.data, newsgroups_train.target
    test, test_labels = newsgroups_test.data, newsgroups_test.target
    return train, train_labels, test, test_labels
Author: UW-MODE, Project: naacl16-demo, Lines: 7, Source: learn_models.py


Example 7: test_20news_vectorized

def test_20news_vectorized():
    try:
        datasets.fetch_20newsgroups(subset='all',
                                    download_if_missing=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # test subset = train
    bunch = datasets.fetch_20newsgroups_vectorized(subset="train")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314, 130107))
    assert_equal(bunch.target.shape[0], 11314)
    assert_equal(bunch.data.dtype, np.float64)

    # test subset = test
    bunch = datasets.fetch_20newsgroups_vectorized(subset="test")
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (7532, 130107))
    assert_equal(bunch.target.shape[0], 7532)
    assert_equal(bunch.data.dtype, np.float64)

    # test return_X_y option
    fetch_func = partial(datasets.fetch_20newsgroups_vectorized, subset='test')
    check_return_X_y(bunch, fetch_func)

    # test subset = all
    bunch = datasets.fetch_20newsgroups_vectorized(subset='all')
    assert_true(sp.isspmatrix_csr(bunch.data))
    assert_equal(bunch.data.shape, (11314 + 7532, 130107))
    assert_equal(bunch.target.shape[0], 11314 + 7532)
    assert_equal(bunch.data.dtype, np.float64)
Author: AlexisMignon, Project: scikit-learn, Lines: 31, Source: test_20news.py


Example 8: train_20_news

def train_20_news(n_jobs, n_folds):
    from sklearn.datasets import fetch_20newsgroups
    train = fetch_20newsgroups(subset='train', shuffle=False, random_state=100,
                               remove=('headers', 'footers', 'quotes'))
    test = fetch_20newsgroups(subset='test', shuffle=False, random_state=100,
                              remove=('headers', 'footers', 'quotes'))

    x_train = map(dt.clean_str, train.data)
    x_test = map(dt.clean_str, test.data)

    text_clf = Pipeline([
                         # ('clean', Cleaner()),
                         ('vect', CountVectorizer(ngram_range=(1, 2), stop_words='english')),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('clf', SGDClassifier(fit_intercept=True, random_state=0))
                         ])

    SGDClassifier_params = {
        'clf__alpha': np.arange(4e-5, 2e-3, 2e-5),
        'clf__loss': ('squared_loss', 'hinge', 'squared_hinge'),
        'clf__penalty': ('l2', 'elasticnet'),
    }

    gs_clf = GridSearchCV(text_clf, SGDClassifier_params, n_jobs=n_jobs, cv=n_folds, refit=True, verbose=3)
    gs_clf.fit(x_train, train.target)

    result_str = list()
    result_str.append('\n')
    result_str.append('best params:')
    result_str.append(str(gs_clf.best_params_))
    result_str.append('best score = %f' % gs_clf.best_score_)
    result_str = '\n'.join(result_str)
    print result_str

    print "test score = " % gs_clf.score(x_test, test.target)
Author: irina-goltsman, Project: ConvolutionNeuralNetwork, Lines: 35, Source: base_experiments.py


Example 9: file

def file():
    cats = ["alt.atheism", "sci.electronics"]

    newsgroups_train = fetch_20newsgroups(subset="train", categories=cats)

    newsgroups_test = fetch_20newsgroups(subset="test", categories=cats)
    vectorizer = TfidfVectorizer()  # tokenize all documents and collect term statistics

    vectors_train = vectorizer.fit_transform(newsgroups_train.data)
    vectors = vectorizer.transform(newsgroups_test.data)
    print vectors.shape[1]
    # f=open('test_all.txt','wb')
    for j in range(0, vectors.shape[0]):
        item_id = list()
        tokens = vectorizer.build_tokenizer()(newsgroups_test.data[j])  # extract the tokenization of this document
        # print tokens

        word_sort = np.argsort(-vectors[j].data)
        print "顶点" + str(j)
        for i in range(0, len(word_sort)):
            word = vectorizer.get_feature_names()[vectors[j].indices[word_sort[i]]]  # the tf-idf term
            for line in range(0, len(tokens)):
                if tokens[line].lower() == word:
                    item_id.append((line, word_sort[i]))

        pos_item = sorted(item_id, key=lambda jj: jj[0], reverse=True)  # order the tf-idf terms by position

        word_word = np.zeros([len(word_sort), len(word_sort)])
        for p in range(0, len(pos_item)):
            if p < (len(pos_item) - 1):
                ki = word_sort[pos_item[p][1]]
                kj = word_sort[pos_item[p + 1][1]]
                word_word[ki, kj] = word_word[ki, kj] + 1
Author: yanshengli, Project: DBN_Learning, Lines: 33, Source: file_to_graph1_test.py


Example 10: load_sklearn_data

    def load_sklearn_data(self,name):
        if name == "digits":
            training = fetch_20newsgroups(subset='train',shuffle=True,random_state=42);
            testing = fetch_20newsgroups(subset='test',shuffle=True,random_state=100);
            validation = fetch_20newsgroups(subset='test',shuffle=True,random_state=200);
            categories = training.target_names
            data_train_size_mb = size_mb(training.data)
            data_test_size_mb = size_mb(testing.data)
            data_valid_size_mb = size_mb(validation.data)
            
            print("%d documents - %0.3fMB (training set)" % (
                len(training.data), data_train_size_mb))
            print("%d documents - %0.3fMB (test set)" % (
                len(testing.data), data_test_size_mb))

            print("%d documents - %0.3fMB (test set)" % (
                len(validation.data), data_test_size_mb))
                
            print("%d categories" % len(categories))
            print()        
            
            training=[training.data,training.target]
            testing=[testing.data,testing.target]
            validation=[validation.data,validation.target]
            
            return [training,testing,validation];
Author: BLiuBLiu, Project: OpenVision, Lines: 26, Source: LoadDataSets.py


Example 11: test_20news

def test_20news():
    try:
        data = datasets.fetch_20newsgroups(
            subset='all', download_if_missing=False, shuffle=False)
    except IOError:
        raise SkipTest("Download 20 newsgroups to run this test")

    # Extract a reduced dataset
    data2cats = datasets.fetch_20newsgroups(
        subset='all', categories=data.target_names[-1:-3:-1], shuffle=False)
    # Check that the ordering of the target_names is the same
    # as the ordering in the full dataset
    assert_equal(data2cats.target_names,
                 data.target_names[-2:])
    # Assert that we have only 0 and 1 as labels
    assert_equal(np.unique(data2cats.target).tolist(), [0, 1])

    # Check that the number of filenames is consistent with data/target
    assert_equal(len(data2cats.filenames), len(data2cats.target))
    assert_equal(len(data2cats.filenames), len(data2cats.data))

    # Check that the first entry of the reduced dataset corresponds to
    # the first entry of the corresponding category in the full dataset
    entry1 = data2cats.data[0]
    category = data2cats.target_names[data2cats.target[0]]
    label = data.target_names.index(category)
    entry2 = data.data[np.where(data.target == label)[0][0]]
    assert_equal(entry1, entry2)
Author: 0664j35t3r, Project: scikit-learn, Lines: 28, Source: test_20news.py


Example 12: loadData

    def loadData(self, opts):
        if opts.all_categories:
            categories = None
        else:
            categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics',
                          'sci.space']

        if opts.filtered:
            remove = ('headers', 'footers', 'quotes')
        else:
            remove = ()

        print('Loading 20 newsgroups dataset for categories:')
        print((categories if categories else 'all'))

        data_train = fetch_20newsgroups(subset='train', categories=categories,
                                        shuffle=True, random_state=42,
                                        remove=remove)

        data_test = fetch_20newsgroups(subset='test', categories=categories,
                                       shuffle=True, random_state=42,
                                       remove=remove)
        
        categories = data_train.target_names  # for case categories == None
        # print(len(data_train))
        print('data loaded')
        
        return data_train, data_test, categories
Author: gaolu, Project: ImprovedSearchResultClassificationAndClustering, Lines: 28, Source: dataLoader.py


Example 13: uai

def uai(params):#, **kwargs):
    print 'Params: ', params, '\n'
    #y = benchmark_functions.save_svm_on_grid(params, opt_time=ret_time, **kwargs)
    logreg = linear_model.LogisticRegression(penalty=params['penalty'],tol=float(params['tol']),C=float(params['strength']))
    if params['n_min'] > params['n_max']:
      z=params['n_min']
      params['n_min']=params['n_max']
      params['n_max']=z
    if params['stop_words']==True:
      st='english'
    else:
      st=None 
    vectorizer = TfidfVectorizer(ngram_range=(int(params['n_min']),int(params['n_max'])),binary=params['binary'],use_idf=params['idf'],smooth_idf=True,stop_words=st)
    if params['cats'] == 'all':
        cats = None
    elif params['cats'] == 'science':
        cats = ['sci.med','sci.space','sci.crypt','sci.electronics']
    elif params['cats'] == 'religion':
        cats = ['alt.atheism', 'talk.religion.misc']
    elif params['cats'] == 'graphics':
        cats = ['comp.windows.x','comp.graphics']
    #cats = ['sci.med','sci.space']
    #cats = ['comp.sys.ibm.pc.hardware','comp.sys.mac.hardware']
    print 'preprocess data'
    #newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), categories=cats)
    #vectors = vectorizer.fit_transform(newsgroups_train.data)
    #print vectors.shape
    #newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), categories=cats)   
    #print 'preprocess test data'
    #vectors_test = vectorizer.fit_transform(newsgroups_test.data)
    if params['rm_footers']:
        to_remove = ('headers', 'footers')
    else:
        to_remove = ('headers',)
        
    print_20n(to_remove, cats, params)
    newsgroups_all = fetch_20newsgroups(subset='all', remove=to_remove, categories=cats)#,'footers'))#,'footers','quotes'), categories=cats)   
    vectors_all = vectorizer.fit_transform(newsgroups_all.data)
    #nrow=round(7.0/10.0*vectors_all.shape[0])
    newsgroups_train = fetch_20newsgroups(subset='train',remove=to_remove, categories=cats)
    nrow=newsgroups_train.target.shape[0]
    #print nrow
    #print vectors_all.shape
    vectors=vectors_all[0:nrow,:]
    vectors_test=vectors_all[nrow:,:]
    #print vectors.shape
    #print vectors_test.shape
    print 'fit model'
    logreg.fit(vectors,newsgroups_all.target[0:nrow])
    print 'predict model'
    pred=logreg.predict(vectors_test)
    print 'evaluate'
    y=metrics.accuracy_score(newsgroups_all.target[nrow:], pred)
    print 'Result: ', y
    print('idf: ', params['idf'], 'rm_footers: ', params['rm_footers'], 'cats: ', params['cats'])
    return -y
Author: Noahs-ARK, Project: ARKcat, Lines: 56, Source: 20n_example.py


Example 14: load_data

def load_data():
    twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
    twenty_test = fetch_20newsgroups(subset='test',shuffle=True, random_state=42)
     
    x_train = twenty_train.data
    y_train = twenty_train.target
    x_test = twenty_test.data
    y_test = twenty_test.target
    print 'data loaded!'
    return (x_train, y_train, x_test, y_test)
Author: dcrankshaw, Project: clipper-plots, Lines: 10, Source: newsgroups_class.py


Example 15: exercise

def exercise():
    groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 
        'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
    train_data = fetch_20newsgroups(subset='train', categories=groups)
    clusterizer = DocumentClusterizer()
    clusterizer.train(train_data.data)
    test_data = fetch_20newsgroups(subset='test', categories=groups)
    for i in range(10):
        sample = test_data.data[np.random.randint(len(test_data.data))]
        clusterizer.find_most_similar(sample)
Author: jichao06, Project: datascience, Lines: 10, Source: clustering.py


Example 16: get_login_pages

def get_login_pages(keywords):
    from sklearn.datasets import fetch_20newsgroups
    import gensim
    import re
    """
    newsgroups_train = fetch_20newsgroups(subset='train')
    for  news in newsgroups_train.target_names:
        print news

    alt.atheism
    comp.graphics
    comp.os.ms-windows.misc
    comp.sys.ibm.pc.hardware
    comp.sys.mac.hardware
    comp.windows.x
    misc.forsale
    rec.autos
    rec.motorcycles
    rec.sport.baseball
    rec.sport.hockey
    sci.crypt
    sci.electronics
    sci.med
    sci.space
    soc.religion.christian
    talk.politics.guns
    talk.politics.mideast
    talk.politics.misc
    talk.religion.misc
    """
    #cats = ['sci.crypt']
    #newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')

    newsgroups=[]
    newsgroups.append(newsgroups_train.data)
    newsgroups.append(newsgroups_test.data)
    #newsgroups_train = fetch_20newsgroups()
    #print len(newsgroups_train.data)
    print newsgroups_train.data
    sentences=[re.findall("[a-z\-]+",s.lower()) for s in newsgroups_train.data]
    #sentences = [s.lower().split() for s in newsgroups_train.data]
    #print sentences

    model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=1, workers=4,iter=20)

    #print len(sentences)

    for key in keywords:
        print "[%s] most_similar:" % key
        results=model.most_similar(positive=[key], topn=10)
        for i in results:
            print i
Author: Emersonxuelinux, Project: 2book, Lines: 54, Source: scanner-poc.py


Example 17: testNaiveBayesSK2

 def testNaiveBayesSK2(self):
     categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
     newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
     newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
     vectorizer = TfidfVectorizer()
     # Both vectors and vectors_test are SciPy CSR matrix
     vectors = vectorizer.fit_transform(newsgroups_train.data)
     vectors_test = vectorizer.transform(newsgroups_test.data)
     nb = NaiveBayes(sqlCtx)
     nb.fit(vectors, newsgroups_train.target)
     pred = nb.predict(vectors_test)
     score = metrics.f1_score(newsgroups_test.target, pred, average='weighted')
     self.failUnless(score > 0.8)
Author: d-behi, Project: incubator-systemml, Lines: 13, Source: test_mllearn.py


Example 18: test_naive_bayes1

 def test_naive_bayes1(self):
     categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
     newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
     newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)
     vectorizer = TfidfVectorizer()
     # Both vectors and vectors_test are SciPy CSR matrix
     vectors = vectorizer.fit_transform(newsgroups_train.data)
     vectors_test = vectorizer.transform(newsgroups_test.data)
     nb = NaiveBayes(sparkSession)
     mllearn_predicted = nb.fit(vectors, newsgroups_train.target).predict(vectors_test)
     from sklearn.naive_bayes import MultinomialNB
     clf = MultinomialNB()
     sklearn_predicted = clf.fit(vectors, newsgroups_train.target).predict(vectors_test)
     self.failUnless(accuracy_score(sklearn_predicted, mllearn_predicted) > 0.95 )
Author: frreiss, Project: fred-systemml, Lines: 14, Source: test_mllearn_numpy.py


Example 19: __init__

  def __init__(self):
    data_train = fetch_20newsgroups(subset='train', categories=None,
                                    shuffle=True, random_state=42)
    data_test = fetch_20newsgroups(subset='test', categories=None,
                                   shuffle=True, random_state=42)
    self.train_data = data_train.data
    self.train_target = data_train.target
    self.alltest_data = data_test.data
    self.alltest_target = data_test.target

    self.categories = data_train.target_names
    self.num_classes = 20

    DataGatherer.__init__(self)
Author: Web5design, Project: big-data, Lines: 14, Source: loader.py


Example 20: load_dataset

def load_dataset(category_list):
    """
    :return: Load the 20_newsgroup dataset depending on category_list.
             If [] provided return everything
    """

    if category_list == []:  # read all categories from news20 dataset
        train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
        test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
    else:            # read only computer technology & recreational activity categories
        train = fetch_20newsgroups(subset='train',  shuffle=True, random_state=42, categories=category_list)
        test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42,  categories=category_list)

    return train, test
Author: RonakSumbaly, Project: EE239AS-Signal-and-Systems, Lines: 14, Source: utility.py



Note: The sklearn.datasets.fetch_20newsgroups examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers, and copyright remains with the original authors. Consult each project's license before using or redistributing the code; do not republish without permission.

