This article collects typical usage examples of the Python function nltk.corpus.reuters.categories. If you are wondering what exactly categories does, how to call it, or where to find working examples, the curated code samples below may help.
Shown below are 20 code examples of the categories function, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
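Before running any of the examples, the corpora they rely on must be downloaded once. A minimal setup sketch (the nltk.download calls are standard NLTK; everything else on this page assumes the corpora are already installed):

import nltk

nltk.download('reuters')    # the Reuters-21578 corpus used throughout
nltk.download('stopwords')  # stop-word lists used by Example 1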
Example 1: load_data

from nltk.corpus import reuters, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

def load_data(config={}):
    """
    Load the Reuters dataset.

    Returns
    -------
    data : dict
        with keys 'x_train', 'x_test', 'y_train', 'y_test', 'labels'
    """
    stop_words = stopwords.words("english")
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    mlb = MultiLabelBinarizer()
    documents = reuters.fileids()
    test = [d for d in documents if d.startswith('test/')]
    train = [d for d in documents if d.startswith('training/')]
    docs = {}
    docs['train'] = [reuters.raw(doc_id) for doc_id in train]
    docs['test'] = [reuters.raw(doc_id) for doc_id in test]
    xs = {'train': [], 'test': []}
    xs['train'] = vectorizer.fit_transform(docs['train']).toarray()
    xs['test'] = vectorizer.transform(docs['test']).toarray()
    ys = {'train': [], 'test': []}
    ys['train'] = mlb.fit_transform([reuters.categories(doc_id)
                                     for doc_id in train])
    ys['test'] = mlb.transform([reuters.categories(doc_id)
                                for doc_id in test])
    data = {'x_train': xs['train'], 'y_train': ys['train'],
            'x_test': xs['test'], 'y_test': ys['test'],
            # 'labels' is a module-level variable in the source project
            'labels': globals()["labels"]}
    return data

Author: MartinThoma | Project: algorithms | Lines: 32 | Source: reuters.py
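A hypothetical driver for load_data; the module-level labels variable it reads is defined elsewhere in the source project, so it is stubbed here:

labels = None  # stand-in for the source project's module-level label list
data = load_data()
# Note: .toarray() above materializes dense matrices; on the full corpus this
# can need several GB of RAM.
print(data['x_train'].shape)  # (n_train_docs, n_tfidf_features)
print(data['y_train'].shape)  # (n_train_docs, n_categories)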
Example 2: print_reuters

def print_reuters():
    from nltk.corpus import reuters
    # print(reuters.fileids())
    # print(reuters.categories())
    print(reuters.categories('training/9865'))   # e.g. ['barley', 'corn', 'grain', 'wheat']
    print(reuters.categories(['training/9865', 'training/9880']))
    print(reuters.fileids('barley'))
    print(reuters.fileids(['barley', 'corn']))   # fixed typo: was 'barely'

Author: Paul-Lin | Project: misc | Lines: 8 | Source: toturial.py
Example 3: __init__

def __init__(self):
    # print reuters categories
    print("reuters categories")
    print(reuters.categories())
    # TODO this is probably bad
    print("getting nodes")
    self.nodes = database.get_all_nodes()   # 'database' is a module of the source project
    print("training classifier")
    self.classifier = DocumentClassifier()  # defined elsewhere in the source project

Author: nathanjordan | Project: bernstein | Lines: 9 | Source: classifier.py
Example 4: explore_categories

def explore_categories(max_len=5000, min_len=100, percentage=0.3):
    # Print pairs of disjoint categories whose combined document count lies
    # within (min_len, max_len) and whose class balance exceeds `percentage`.
    for cat in reuters.categories():
        for cat2 in reuters.categories():
            if cat2 > cat:
                if len(set(reuters.fileids(cat)) & set(reuters.fileids(cat2))) == 0:
                    l1 = len(reuters.fileids(cat))
                    l2 = len(reuters.fileids(cat2))
                    balance = min(l1, l2) / float(l1 + l2)
                    if min_len < (l1 + l2) < max_len and balance > percentage:
                        print(cat, cat2, l1 + l2, balance)

Author: verasazonova | Project: textsim | Lines: 9 | Source: reuters.py
Example 5: get_test_set

from operator import itemgetter
from nltk.corpus import reuters as re  # the source project aliases reuters as 're'

def get_test_set():
    # Keep only documents that belong to exactly one category.
    single_categories = [(id, re.categories(id)[0])
                         for id in re.fileids()
                         if len(re.categories(id)) == 1]
    # 'distribution' is a helper defined elsewhere in the source project;
    # it tallies documents per category.
    single_cat_list = distribution(single_categories, itemgetter(1))
    used_categories = [x[0]
                       for x in single_cat_list
                       if 200 < x[1] < 600]
    return [pair for pair in single_categories if pair[1] in used_categories]

Author: simone-trubian | Project: blog-posts | Lines: 11 | Source: clustering.py
Example 6: get_target
def get_target(self):
    # cat1 vs. cat2
    if len(self.categories) > 1:
        target = [[cat for cat in reuters.categories(fileid) if cat in self.categories][0]
                  for fileid in self.fileids]
    # cat1 vs. not cat1
    else:
        target = [1 if self.categories[0] in reuters.categories(fileid) else 0
                  for fileid in self.fileids]
    self.classes, target = np.unique(target, return_inverse=True)
    return target
Author: verasazonova | Project: textsim | Lines: 12 | Source: reuters.py
Example 7: create_tfidf_data

def create_tfidf_data(docs, categories, n=None):
    """
    Build (labels, corpus) pairs by parsing the documents.

    :param docs: list of Reuters documents
    :param categories: names of the categories to consider
    :param n: number of documents to use
    :return: (y, corpus) tuple
    """
    if n:
        docs = docs[:n]
    cat_num = {}
    i = 1
    for c in categories:
        cat_num[c] = i
        i += 1
    y = []
    corpus = []
    for d in docs:
        c = reuters.categories(d)[0]
        if c in categories:
            # getSVMCategory is a helper defined elsewhere in the source project
            y.append(getSVMCategory(cat_num[c]))
            corpus.append(reuters.raw(d).lower())
    return y, corpus

Author: BugliL | Project: SVNexercise | Lines: 25 | Source: mainSGD.py
Example 8: reuters_high_info_words

from nltk.metrics import BigramAssocMeasures

def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
    # high_information_words is defined in the same featx module (cookbook code).
    labeled_words = []
    for label in reuters.categories():
        labeled_words.append((label, reuters.words(categories=[label])))
    return high_information_words(labeled_words, score_fn=score_fn)

Author: RomanZacharia | Project: python_text_processing_w_nltk2_cookbook | Lines: 7 | Source: featx.py
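A hypothetical usage sketch, assuming high_information_words returns a set of informative words as in the cookbook's featx module:

hi_words = reuters_high_info_words()

def bag_of_high_info_words(words):
    # Keep only the informative words as features.
    return {word: True for word in words if word in hi_words}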
Example 9: get_testset_trainset_nltk_reuters

def get_testset_trainset_nltk_reuters():
    from nltk.corpus import reuters
    global categories_file_name_dict
    global cat_num_docs
    # Keep only documents that carry exactly one category.
    clean_files = [f for f in reuters.fileids() if len(reuters.categories(fileids=f)) == 1]
    testset = [f for f in clean_files if f[:5] == 'test/']
    trainset = [f for f in clean_files if f[:9] == 'training/']
    for cat in reuters.categories():
        li = [f for f in reuters.fileids(categories=cat) if f in trainset]
        li_te = [f for f in reuters.fileids(categories=cat) if f in testset]
        # Keep categories with more than 20 documents in both splits.
        if len(li) > 20 and len(li_te) > 20:
            cat_num_docs[cat] = len(li)
            li.extend(li_te)
            categories_file_name_dict[cat] = li
    # f2c is shown in Example 11 below.
    return [[f for f in trainset if f2c('reuters', f) in categories_file_name_dict],
            [f for f in testset if f2c('reuters', f) in categories_file_name_dict]]

Author: genf | Project: Naive-Bayes-Document-Classifier | Lines: 16 | Source: Preprocessor.py
Example 10: collection_stats
def collection_stats():
    # List of documents
    documents = reuters.fileids()
    print(str(len(documents)) + " documents")
    train_docs = list(filter(lambda doc: doc.startswith("train"), documents))
    print(str(len(train_docs)) + " total train documents")
    test_docs = list(filter(lambda doc: doc.startswith("test"), documents))
    print(str(len(test_docs)) + " total test documents")
    # List of categories
    categories = reuters.categories()
    print(str(len(categories)) + " categories")
    # Documents in a category
    category_docs = reuters.fileids("acq")
    # Words for a document
    document_id = category_docs[0]
    document_words = reuters.words(category_docs[0])
    print(document_words)
    # Raw document
    print(reuters.raw(document_id))
Author: BugliL | Project: SVNexercise | Lines: 25 | Source: test2.py
Example 11: f2c
def f2c(corpus, fileName):
    if corpus == 'mr':
        from nltk.corpus import movie_reviews as mr
        return mr.categories(fileids=fileName)[0]
    else:
        from nltk.corpus import reuters
        return reuters.categories(fileids=fileName)[0]
Author: genf | Project: Naive-Bayes-Document-Classifier | Lines: 7 | Source: Filename_To_Cat.py
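A quick hypothetical check of f2c against the fileids used in Example 2; the results in the comments are illustrative:

print(f2c('reuters', 'training/9865'))    # first of the doc's categories, e.g. 'barley'
print(f2c('mr', 'neg/cv000_29416.txt'))   # movie_reviews ids look like 'neg/...' or 'pos/...'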
Example 12: import_reuters_files

import sys
from nltk.corpus import reuters

def import_reuters_files(ds, silent=False, log=sys.stdout):
    """
    Import the Reuters corpus into `ds`. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> import_reuters_files(ds, silent=True)
    """
    if not silent:
        total = len(reuters.fileids())
        counter = 0
    root_handle = ds.insert("#reuters")
    for fileid in reuters.fileids():
        tags = ["@%s" % category for category in reuters.categories(fileid)]
        file_handle = ds.insert(["#%s" % fileid] + tags)
        ds.link(root_handle, file_handle)
        for sent in reuters.sents(fileid):
            norm = [word.lower() for word in sent]
            sen_handle = ds.insert(norm)
            ds.link(file_handle, sen_handle)
        if not silent:
            counter += 1
            if counter % 10 == 0:
                print("importing %s of %s files..." % (counter, total),
                      file=log)

Author: tdiggelm | Project: nltk-playground | Lines: 27 | Source: train.py
Example 13: format_data

def format_data(docs, all_categories):
    y = []
    corpus = []
    for d in docs:
        # A list comprehension keeps this correct on Python 3, where filter()
        # returns a lazy iterator that is always truthy.
        current_categories = [c for c in reuters.categories(d) if c in all_categories]
        if current_categories:
            y.append(current_categories[0])
            corpus.append(reuters.raw(d).lower())
    return y, corpus

Author: BugliL | Project: SVNexercise | Lines: 8 | Source: main.py
Example 14: makeWordSet

from nltk.corpus import brown, reuters

def makeWordSet(args=None):
    '''Use the Brown and Reuters corpora to see how many words are used.'''
    word_set = set()
    for cat in brown.categories():
        word_set = word_set.union(set(brown.words(categories=cat)))
    for cat in reuters.categories():
        word_set = word_set.union(set(reuters.words(categories=cat)))
    return word_set

Author: divanshugarg | Project: Kaggle-Projects-Stuff | Lines: 8 | Source: wordCheck.py
Example 15: __iter__
def __iter__(self):
    """ Generator of docs while collecting ordered structured info. """
    for n, reutersid in enumerate(reuters.fileids()):  # 'training|test/xxxx'
        dataset, _ = reutersid.split('/')  # extract dataset
        if self.dataset in dataset:  # yield only filtered dataset
            if self.categories is not None:
                top_category = reuters.categories(reutersid)[0]  # grab first category only
                self.category_mask.append(self.categories[top_category])  # n-th doc -> classid
            yield reuters.raw(reutersid)  # return raw document
Author: lum4chi | Project: IR | Lines: 9 | Source: reuterscorpus.py
Example 16: __init__

def __init__(self, dataset=''):
    """
    Docs in the Reuters corpus are identified by ids like "training|test/xxxx".

    :param dataset: filter for ids
    """
    self.dataset = dataset  # filter docs
    self.categories = {c: n for n, c in enumerate(reuters.categories())}  # map class to int
    self.docs = {d: n for n, d in enumerate(reuters.fileids())}  # map docs to int
    self.category_mask = []  # mask nth doc with its ith class

Author: lum4chi | Project: IR | Lines: 9 | Source: reuterscorpus.py
Example 17: reuters_train_test_feats
def reuters_train_test_feats(feature_detector=bag_of_words):
    train_feats = []
    test_feats = []
    for fileid in reuters.fileids():
        if fileid.startswith('training'):
            featlist = train_feats
        else:  # fileid.startswith('test')
            featlist = test_feats
        feats = feature_detector(reuters.words(fileid))
        labels = reuters.categories(fileid)
        featlist.append((feats, labels))
    return train_feats, test_feats
Author: sophist114 | Project: Python | Lines: 12 | Source: EmotionAnalysis.py
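Because reuters.categories(fileid) returns a list, each feature set above is paired with a list of labels. A hypothetical post-processing step, assuming the bag_of_words default detector from the same cookbook chapter is importable:

train_feats, test_feats = reuters_train_test_feats()
# Unroll each multi-label pair into one (feats, label) pair per label,
# the shape most single-label NLTK classifiers expect.
single_label_train = [(feats, label)
                      for feats, labels in train_feats
                      for label in labels]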
Example 18: create_tfidf_data

def create_tfidf_data(docs, n=None):
    """
    Build (labels, corpus) pairs by parsing the documents.
    (Despite the original docstring, no stop-word removal happens here.)

    :param docs: list of Reuters documents
    :param n: number of documents to use
    :return: (y, corpus) tuple
    """
    if n:
        docs = docs[:n]
    y = [reuters.categories(d)[0] for d in docs]
    corpus = [reuters.raw(d).lower() for d in docs]
    return y, corpus

Author: BugliL | Project: SVNexercise | Lines: 13 | Source: main.py
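The (labels, corpus) pair returned above is ready for scikit-learn's vectorizers, as in Example 1. A hypothetical follow-up:

from sklearn.feature_extraction.text import TfidfVectorizer

y, corpus = create_tfidf_data(reuters.fileids(), n=100)
X = TfidfVectorizer(stop_words='english').fit_transform(corpus)
print(X.shape)  # (100, vocabulary size)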
Example 19: computeStats

def computeStats(self, categories):
    # batchReadReuters, readFromFile, extractWords and meter are helpers
    # defined elsewhere in the source project.
    files = batchReadReuters('training', categories)
    for file_name in files:
        # Note the hardcoded local corpus path from the source project;
        # reuters.raw(file_name) would be the portable alternative.
        raw_txt = readFromFile('/home/dales3d/nltk_data/corpora/reuters/' + file_name)
        fileCategories = reuters.categories(file_name)
        #for cat in categories:
        #    if cat not in self.activeCategories:
        #        self.activeCategories.append(cat)
        self.activeCategories = categories
        words = extractWords(raw_txt)
        keywords = meter(words)
        for word in keywords:
            if word not in self.wordsStatDict:
                self.wordsStatDict[word] = WordStats()
            w_stat = self.wordsStatDict[word]
            w_stat.word = word
            w_stat.addText(file_name, keywords[word], fileCategories)

Author: maxim-popkov | Project: graph-term | Lines: 18 | Source: Reader.py
Example 20: makeData

def makeData(file, set):
    # Note: 'file' and 'set' shadow Python built-ins; kept as in the source.
    labels = []
    f = open(file, "w")
    for doc in set:
        title = []
        label = reuters.categories(doc)[0]
        labels.append(label)
        # Collect the leading run of all-uppercase tokens, i.e. the headline.
        for i in reuters.words(doc):
            if not i.isupper():
                break
            else:
                title.append(i)
        f.write(' '.join(title) + "\n")
    f.close()
    f = open("labels" + file, "w")
    f.write("\n".join(labels))
    f.close()

Author: noelano | Project: Thesis | Lines: 18 | Source: ReutersCorpus.py
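A hypothetical driver for makeData, splitting fileids the same way Example 10 does; the output filenames are illustrative:

docs = reuters.fileids()
makeData("train.txt", [d for d in docs if d.startswith("training/")])
makeData("test.txt", [d for d in docs if d.startswith("test/")])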
Note: the nltk.corpus.reuters.categories examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright remains with the original authors. Consult each project's license before redistributing or reusing the code. Do not reproduce without permission.