
Python reuters.categories Function Code Examples


This article collects typical usage examples of the Python function nltk.corpus.reuters.categories. If you are wondering what exactly the categories function does, how to call it, or want to see it used in real code, the hand-picked examples below should help.



Twenty code examples of the categories function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
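Before the examples, here is a minimal orientation sketch of the two typical call patterns: with no arguments, categories() returns every topic label in the corpus; given one or more file IDs, it returns the labels assigned to those documents. The file ID used below also appears in Example 2, and the corpus must be downloaded once via nltk.download.

import nltk
from nltk.corpus import reuters

nltk.download('reuters')  # one-time corpus download; a no-op if already present

print(reuters.categories())                  # every topic label in the corpus
print(reuters.categories('training/9865'))   # the labels assigned to one document
print(reuters.fileids('barley'))             # every document carrying a given label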

Example 1: load_data

from nltk.corpus import reuters, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer


def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()

    # The train/test split is encoded in the corpus file IDs.
    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]

    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    # Fit TF-IDF on the training documents only, then reuse it for the test set.
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    # Multi-hot encode the (possibly multiple) topic labels of each document.
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            'labels': globals()["labels"]}  # expects a module-level `labels`
    return data
Developer: MartinThoma | Project: algorithms | Lines: 32 | Source: reuters.py
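For orientation, a hypothetical call site for this loader might look as follows; note that load_data expects a module-level labels variable to exist (see the globals() lookup above), an assumption carried over from the original module.

labels = []  # placeholder for the module-level list the loader expects
data = load_data()
print(data['x_train'].shape)  # (number of training docs, TF-IDF vocabulary size)
print(data['y_train'].shape)  # (number of training docs, number of distinct labels)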


Example 2: print_reuters

def print_reuters():
    from nltk.corpus import reuters
    # print(reuters.fileids())
    # print(reuters.categories())
    print(reuters.categories('training/9865'))
    print(reuters.categories(['training/9865', 'training/9880']))
    print(reuters.fileids('barley'))
    print(reuters.fileids(['barley', 'corn']))
Developer: Paul-Lin | Project: misc | Lines: 8 | Source: toturial.py


Example 3: __init__

    def __init__(self):
        # print the Reuters categories
        print("reuters categories")
        print(reuters.categories())
        # TODO this is probably bad
        print("getting nodes")
        self.nodes = database.get_all_nodes()
        print("training classifier")
        self.classifier = DocumentClassifier()
Developer: nathanjordan | Project: bernstein | Lines: 9 | Source: classifier.py


Example 4: explore_categories

def explore_categories(max_len=5000, min_len=100, percentage=0.3):
    # Examine every unordered pair of distinct categories.
    for cat in reuters.categories():
        for cat2 in reuters.categories():
            if cat2 > cat:
                # Only keep pairs whose document sets do not overlap.
                if len(set(reuters.fileids(cat)) & set(reuters.fileids(cat2))) == 0:
                    l1 = len(reuters.fileids(cat))
                    l2 = len(reuters.fileids(cat2))
                    # Keep pairs of a workable total size that are not too unbalanced.
                    if min_len < (l1 + l2) < max_len and min(l1, l2) / float(l1 + l2) > percentage:
                        print(cat, cat2, l1 + l2, min(l1, l2) / float(l1 + l2))
Developer: verasazonova | Project: textsim | Lines: 9 | Source: reuters.py


Example 5: get_test_set

def get_test_set():
    # `re` here is the reuters corpus module under an alias, not the regex module.
    single_categories = [(id, re.categories(id)[0])
                         for id in re.fileids()
                         if len(re.categories(id)) == 1]

    single_cat_list = distribution(single_categories, itemgetter(1))
    used_categories = [x[0]
                       for x in single_cat_list
                       if 200 < x[1] < 600]

    return [pair for pair in single_categories if pair[1] in used_categories]
Developer: simone-trubian | Project: blog-posts | Lines: 11 | Source: clustering.py


Example 6: get_target

    def get_target(self):

        # cat1 vs. cat2
        if len(self.categories) > 1:
            target = [ [cat for cat in reuters.categories(fileid) if cat in self.categories][0]
                       for fileid in self.fileids]
        # cat1 vs. not cat1
        else:
            target = [ 1 if self.categories[0] in reuters.categories(fileid) else 0
                       for fileid in self.fileids]
        self.classes, target = np.unique(target, return_inverse=True)
        return target
Developer: verasazonova | Project: textsim | Lines: 12 | Source: reuters.py


Example 7: create_tfidf_data

def create_tfidf_data(docs, categories, n=None):
    """
    Build parallel label and corpus lists from the Reuters documents.
    :param docs: list of Reuters document IDs
    :param categories: names of the categories to consider
    :param n: number of documents to use
    :return: (y, corpus) lists
    """
    if n:
        docs = docs[:n]

    # Map each category name to a numeric ID, starting from 1.
    cat_num = {c: i for i, c in enumerate(categories, start=1)}

    y = []
    corpus = []
    for d in docs:
        c = reuters.categories(d)[0]  # the document's first (primary) category
        if c in categories:
            y.append(getSVMCategory(cat_num[c]))
            corpus.append(reuters.raw(d).lower())

    return y, corpus
Developer: BugliL | Project: SVNexercise | Lines: 25 | Source: mainSGD.py


Example 8: reuters_high_info_words

def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
	labeled_words = []
	
	for label in reuters.categories():
		labeled_words.append((label, reuters.words(categories=[label])))
	
	return high_information_words(labeled_words, score_fn=score_fn)
Developer: RomanZacharia | Project: python_text_processing_w_nltk2_cookbook | Lines: 7 | Source: featx.py


Example 9: get_testset_trainset_nltk_reuters

def get_testset_trainset_nltk_reuters():
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    # Keep only documents that belong to exactly one category.
    clean_files = [f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1]
    testset = [f for f in clean_files if f.startswith('test/')]
    trainset = [f for f in clean_files if f.startswith('training/')]
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in trainset]
        li_te = [f for f in reuters.fileids(categories=cat) if f in testset]
        # Keep categories with more than 20 documents on each side of the split.
        if len(li) > 20 and len(li_te) > 20:
            cat_num_docs[cat] = len(li)
            li.extend(li_te)
            categories_file_name_dict[cat] = li
    return [[f for f in trainset if f2c('reuters', f) in categories_file_name_dict],
            [f for f in testset if f2c('reuters', f) in categories_file_name_dict]]
Developer: genf | Project: Naive-Bayes-Document-Classifier | Lines: 16 | Source: Preprocessor.py


Example 10: collection_stats

def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")

    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")

    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")

    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")

    # Documents in a category
    category_docs = reuters.fileids("acq")

    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(document_id)
    print(document_words)

    # Raw document
    print(reuters.raw(document_id))
Developer: BugliL | Project: SVNexercise | Lines: 25 | Source: test2.py


Example 11: f2c

def f2c(corpus, fileName):
    # Return the first category of a document in the chosen corpus.
    if corpus == 'mr':
        from nltk.corpus import movie_reviews as mr
        return mr.categories(fileids=fileName)[0]
    else:
        from nltk.corpus import reuters
        return reuters.categories(fileids=fileName)[0]
Developer: genf | Project: Naive-Bayes-Document-Classifier | Lines: 7 | Source: Filename_To_Cat.py


Example 12: import_reuters_files

import sys

from nltk.corpus import reuters


def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus into `ds`. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> import_reuters_files(ds, silent=True)
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        # Tag each file with its categories, e.g. "@grain".
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if (counter % 10 == 0):
                print("importing %s of %s files..." % (counter, total),
                      file=log)
Developer: tdiggelm | Project: nltk-playground | Lines: 27 | Source: train.py


Example 13: format_data

def format_data(docs, all_categories):
    y = []
    corpus = []
    for d in docs:
        # Keep only this document's categories that appear in all_categories.
        current_categories = [c for c in reuters.categories(d) if c in all_categories]
        if current_categories:
            y.append(current_categories[0])
            corpus.append(reuters.raw(d).lower())
    return y, corpus
Developer: BugliL | Project: SVNexercise | Lines: 8 | Source: main.py


Example 14: makeWordSet

def makeWordSet(args=None):
    '''Collect the set of distinct words used in the Brown and Reuters corpora.'''
    word_set = set()
    for cat in brown.categories():
        word_set = word_set.union(set(brown.words(categories=cat)))
    for cat in reuters.categories():
        word_set = word_set.union(set(reuters.words(categories=cat)))
    return word_set
Developer: divanshugarg | Project: Kaggle-Projects-Stuff | Lines: 8 | Source: wordCheck.py


Example 15: __iter__

    def __iter__(self):
        """ Generator of docs while collecting ordered structured info. """
        for n, reutersid in enumerate(reuters.fileids()):   # 'training|test/xxxx'
            dataset, _ = reutersid.split('/')               # extract dataset
            if self.dataset in dataset:                     # yield only the filtered dataset
                if self.categories is not None:
                    top_category = reuters.categories(reutersid)[0]           # grab first category only
                    self.category_mask.append(self.categories[top_category])  # n-th doc -> class id
                yield reuters.raw(reutersid)                # return raw document
Developer: lum4chi | Project: IR | Lines: 9 | Source: reuterscorpus.py


Example 16: __init__

    def __init__(self, dataset=''):
        """
        Docs in the Reuters corpus are identified by IDs like "training|test/xxxx".
        :param dataset: filter for ids
        """
        self.dataset = dataset  # filter docs
        self.categories = {c: n for n, c in enumerate(reuters.categories())}  # map each class to an int
        self.docs = {d: n for n, d in enumerate(reuters.fileids())}           # map each doc to an int
        self.category_mask = []  # mask nth doc with its ith class
Developer: lum4chi | Project: IR | Lines: 9 | Source: reuterscorpus.py


Example 17: reuters_train_test_feats

def reuters_train_test_feats(feature_detector=bag_of_words):
	train_feats = []
	test_feats = []
	for fileid in reuters.fileids():
		if fileid.startswith('training'):
			featlist = train_feats
		else:   # fileid.startswith('test')
			featlist = test_feats
		feats = feature_detector(reuters.words(fileid))
		labels = reuters.categories(fileid)
		featlist.append((feats, labels))
	return train_feats, test_feats
Developer: sophist114 | Project: Python | Lines: 12 | Source: EmotionAnalysis.py


Example 18: create_tfidf_data

def create_tfidf_data(docs, n=None):
    """
    Build parallel label and corpus lists from the Reuters documents,
    taking the first category of each document as its label.
    :param docs: list of Reuters document IDs
    :param n: number of documents to use
    :return: (y, corpus) lists
    """
    if n:
        docs = docs[:n]
    y = [reuters.categories(d)[0] for d in docs]
    corpus = [reuters.raw(d).lower() for d in docs]
    return y, corpus
Developer: BugliL | Project: SVNexercise | Lines: 13 | Source: main.py


Example 19: computeStats

    def computeStats(self, categories):
        files = batchReadReuters('training', categories)
        for file_name in files:
            raw_txt = readFromFile('/home/dales3d/nltk_data/corpora/reuters/' + file_name)
            fileCategories = reuters.categories(file_name)
            #for cat in categories:
            #	if cat not in self.activeCategories:
            #		self.activeCategories.append(cat)
            self.activeCategories = categories

            words = extractWords(raw_txt)
            keywords = meter(words)
            for word in keywords:
                if word not in self.wordsStatDict:
                    self.wordsStatDict[word] = WordStats()
                w_stat = self.wordsStatDict[word]
                w_stat.word = word
                w_stat.addText(file_name, keywords[word], fileCategories)
Developer: maxim-popkov | Project: graph-term | Lines: 18 | Source: Reader.py


Example 20: makeData

def makeData(path, docs):
    # Write the all-caps title words of each document to `path`,
    # and each document's first category to "labels<path>".
    labels = []
    with open(path, "w") as f:
        for doc in docs:
            title = []
            label = reuters.categories(doc)[0]
            labels.append(label)
            for word in reuters.words(doc):
                if not word.isupper():
                    break  # titles are all upper-case; stop at the first body word
                title.append(word)
            f.write(' '.join(title) + "\n")

    with open("labels" + path, "w") as f:
        f.write("\n".join(labels))
Developer: noelano | Project: Thesis | Lines: 18 | Source: ReutersCorpus.py



Note: the nltk.corpus.reuters.categories examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other source-code and documentation platforms; the snippets were selected from open-source projects contributed by many developers. Copyright of the code remains with the original authors, and distribution or use should follow the corresponding project's license; do not reproduce without permission.

