本文整理汇总了Python中nltk.corpus.brown.categories函数的典型用法代码示例。如果您正苦于以下问题:Python categories函数的具体用法?Python categories怎么用?Python categories使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了categories函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: print_brown
def print_brown():
from nltk.corpus import brown
print brown.categories()
print brown.words(categories='news')
print brown.words(fileids=['cg22'])
print brown.sents(categories=['news','reviews'])
news_text=brown.words(categories='news')
fdist=nltk.FreqDist([w.lower() for w in news_text])
modals=['can','could','may','might','must','will']
for m in modals:
print m+':',fdist[m]
开发者ID:Paul-Lin,项目名称:misc,代码行数:11,代码来源:toturial.py
示例2: get_training_test_sentences
def get_training_test_sentences(self):
    """Load the Brown corpus (all categories) and split its tagged
    sentences 90/10 into self.train_sents / self.test_sents.

    Side effects: also stores the corpus words on self.news_text and the
    full tagged-sentence list on self.news_tagged_sentences.

    Fixes: removed the unused local `brown_cats` and the two redundant
    extra calls to brown.categories().
    """
    cats = brown.categories()  # fetch the category list once
    self.news_text = brown.words(categories=cats)
    self.news_tagged_sentences = brown.tagged_sents(categories=cats)
    # First 90% of sentences train the tagger; the final 10% is held out.
    size = int(len(self.news_tagged_sentences) * .9)
    self.train_sents = self.news_tagged_sentences[:size]
    self.test_sents = self.news_tagged_sentences[size:]
开发者ID:TheFourMonkeysProject,项目名称:Alfred,代码行数:11,代码来源:trainers.py
示例3: build_all_brown
def build_all_brown(subset_size=None):
    """Collect Brown-corpus documents and their (first) category labels.

    Each document is one Brown file, lower-cased and joined into a single
    space-separated string.  Returns (documents, categories), two parallel
    lists.  If `subset_size` is given, iteration stops once more than that
    many distinct categories have been collected.  Falls back to a local
    pickle ("brown_docs_cats.pickle") when NLTK cannot locate the corpus.
    """
    documents = []
    categories = []
    all_categories = set()
    try:
        fileids = brown.fileids()
        for fileid in fileids:
            if subset_size:
                # Stop once the distinct-category count exceeds the cap.
                if len(all_categories) > subset_size:
                    break
            # A Brown file can belong to several categories; keep the first.
            category = brown.categories(fileid)[0]
            words = [x.lower() for x in brown.words(fileid)]
            documents.append(words)
            categories.append(category)
            all_categories.add(category)
        # If we stopped early, the last appended file belongs to a category
        # beyond the requested subset, so drop it.
        if subset_size != len(brown.categories()):
            # exclude the final item, since it's the sole member of the next group
            documents = documents[:-1]
            categories = categories[:-1]
        documents = [" ".join(d) for d in documents]
    except LookupError:
        """ we don't have the Brown corpus via nltk on this machine """
        try:
            with open("brown_docs_cats.pickle") as f:
                documents, categories = pickle.load(f)
        except IOError:
            raise Exception("can't load Brown Corpus via NLTK or file")
        # documents = [' '.join(d) for d in documents]
    """
    # let's NOT get tempted to hide away the encoding
    # we'll probably need to access, e.g., the vectorizer, to do reverse
    # transformations once we want to interpret/evaluate the model
    doc_vectorizer = CountVectorizer()
    doc_vec = doc_vectorizer.fit_transform(documents)
    """
    return documents, categories
开发者ID:kinguistics,项目名称:naivebayes,代码行数:48,代码来源:brown_testing.py
示例4: import_brown_pos
def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the brown corpus into `ds`. E.g.
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        # Progress bookkeeping is only needed when we actually report it.
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        # One handle per category, marked with a '#' prefix.
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            if simplify_tags:
                norm = (simplify_tag(t) for t in sent)
                norm = [nltk.tuple2str(t) for t in norm]
            # NOTE(review): `norm` is only bound when simplify_tags is
            # True; with the default simplify_tags=False this raises
            # NameError on the first sentence -- confirm against the
            # original (pre-scrape) indentation of this function.
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if (counter % 100 == 0):
                    print("importing %s of %s sentences..." % (counter, total),
                        file=log)
开发者ID:tdiggelm,项目名称:nltk-playground,代码行数:26,代码来源:train.py
示例5: ex11
def ex11():
    """Plot a conditional frequency distribution of modal verbs by genre."""
    from nltk.corpus import brown
    modals = set(["can", "could", "may", "might", "shall", "should", "will", "would", "must", "ought"])
    observations = []
    for genre in brown.categories():
        for token in brown.words(categories=genre):
            lowered = token.lower()
            if lowered in modals:
                observations.append((genre, lowered))
    cfd = nltk.ConditionalFreqDist(observations)
    cfd.plot()
开发者ID:447327642,项目名称:nltk-examples,代码行数:8,代码来源:ch02_ex.py
示例6: brown_diversity
def brown_diversity():
"""calculate and display lexical diversity score (token/token_type) for each brown corpus category"""
cfd = nltk.ConditionalFreqDist((category, word)
for category in brown.categories()
for word in brown.words(categories=category))
print "{0:15s} {1:10s}".format("CATEGORY", "DIVERSITY")
for category in cfd.conditions():
print "{0:15s} {1:10f}".format(category, (cfd[category].N() * 1.0 / cfd[category].B()))
开发者ID:jyzhang,项目名称:py-nlp,代码行数:8,代码来源:ch2.py
示例7: fun08
def fun08():
    """Tabulate six modal verbs across six Brown genres."""
    genre_word_pairs = ((genre, word)
                        for genre in brown.categories()
                        for word in brown.words(categories=genre))
    cfd = nltk.ConditionalFreqDist(genre_word_pairs)
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)
开发者ID:gree2,项目名称:hobby,代码行数:8,代码来源:ch02.py
示例8: makeWordSet
def makeWordSet(args=None):
    '''Collect the combined vocabulary of the Brown and Reuters corpora.

    Returns a set of every distinct token in both corpora.  `args` is
    unused and kept only for backward compatibility with existing callers.

    Fix: the original rebuilt the accumulator with set.union() on every
    category, copying all previously gathered words each time (quadratic);
    set.update() extends the same set in place.
    '''
    word_set = set()
    for cat in brown.categories():
        word_set.update(brown.words(categories=cat))
    for cat in reuters.categories():
        word_set.update(reuters.words(categories=cat))
    return word_set
开发者ID:divanshugarg,项目名称:Kaggle-Projects-Stuff,代码行数:8,代码来源:wordCheck.py
示例9: exercise_brown2
def exercise_brown2():
    """Conditional frequency distribution of words keyed by Brown genre."""
    genre_word = []
    for genre in brown.categories():
        for word in brown.words(categories=genre):
            genre_word.append((genre, word))
    cfd = nltk.ConditionalFreqDist(genre_word)
    cfd.tabulate(
        conditions=["news", "religion", "hobbies", "science_fiction", "romance", "humor"],
        samples=["can", "could", "may", "might", "must", "will"])
开发者ID:BurnellLiu,项目名称:LiuProject,代码行数:9,代码来源:chapter_02.py
示例10: ex16
def ex16():
from nltk.corpus import brown
lex_div = {}
for category in brown.categories():
words = brown.words(categories=category)
ld = len(words) / len(set(words))
print category, ld
lex_div[category] = ld
print sorted(lex_div.iteritems(), key=operator.itemgetter(1))
开发者ID:447327642,项目名称:nltk-examples,代码行数:9,代码来源:ch02_ex.py
示例11: exercise_brown
def exercise_brown():
    """Walk through basic Brown-corpus access, then count modal verbs in news."""
    # Print the categories of the Brown corpus.
    print brown.categories()
    # Print the words of the 'news' category.
    print brown.words(categories="news")
    # Print the words of file 'cg22'.
    print brown.words(fileids=["cg22"])
    # Print sentences.
    print brown.sents(categories=["news", "reviews"])
    """Compare modal-verb usage across different genres."""
    # Fetch the news text.
    news_text = brown.words(categories="news")
    # Frequency distribution over lower-cased tokens.
    fdist = nltk.FreqDist([w.lower() for w in news_text])
    # The modal verbs of interest.
    modals = ["can", "could", "may", "might", "must", "will"]
    for m in modals:
        print m + ":", fdist[m]
开发者ID:BurnellLiu,项目名称:LiuProject,代码行数:19,代码来源:chapter_02.py
示例12: print_modals
def print_modals():
    """Tabulate modal-verb counts for six Brown genres."""
    from nltk.corpus import brown
    pairs = []
    for genre in brown.categories():
        for word in brown.words(categories=genre):
            pairs.append((genre, word))
    cfd = nltk.ConditionalFreqDist(pairs)
    cfd.tabulate(conditions=['news','religion','hobbies','science_fiction','romance','humor'],
                 samples=['can','could','may','might','must','will'])
开发者ID:Paul-Lin,项目名称:misc,代码行数:10,代码来源:toturial.py
示例13: test_sentences
def test_sentences(categories=None):
    """returns a test sentence set: [[(word, tag), ..], [(word, tag), ..], ..]

    For each category, roughly the last TEST_PROPORTION of its tagged
    sentences is taken.  An omitted/empty `categories` means all Brown
    categories.  (Fix: the default was a shared mutable `[]`; it is now
    None with identical call semantics.)
    """
    if not categories:
        categories = brown.categories()  # use all of the brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        start = int(TEST_PROPORTION * total)  # use the last k sentences for test
        # NOTE(review): [-start:-1] silently drops the final sentence of each
        # category, and when start == 0 the slice [-0:-1] takes nearly the
        # whole corpus.  Preserved as-is; confirm whether [-start:] was meant.
        sents += brown.tagged_sents(categories=category, simplify_tags=True)[-start:-1]
    return sents
开发者ID:jyzhang,项目名称:py-nlp,代码行数:10,代码来源:pos.py
示例14: training_sentences
def training_sentences(use=1.0, categories=None):
    """returns a training sentence set: [[(word, tag), ..], [(word, tag), ..], ..]

    `use` scales how much of each category's training portion (the first
    (1 - TEST_PROPORTION) of its sentences) is returned.  An omitted/empty
    `categories` means all Brown categories.  (Fixes: the default was a
    shared mutable `[]`, now None with identical call semantics; the local
    previously named `max` shadowed the builtin.)
    """
    if not categories:
        categories = brown.categories()  # use all of the brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        cutoff = int((1 - TEST_PROPORTION) * use * total) - 1  # use the first n sentences for training
        sents += brown.tagged_sents(categories=category, simplify_tags=True)[0:cutoff]
    return sents
开发者ID:jyzhang,项目名称:py-nlp,代码行数:10,代码来源:pos.py
示例15: init_corpus
def init_corpus():
print 'init corpus.. ',
global categories, category_sentences
categories = brown.categories()
half_cat = int(len(categories) * 0.5)
categories = categories[:half_cat]
for category in categories:
sents = brown.tagged_sents(categories = category)
category_sentences[category] = sents
print 'done'
开发者ID:haje01,项目名称:enser,代码行数:10,代码来源:application.py
示例16: ch03_29_reading_difficulty
def ch03_29_reading_difficulty():
sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
from nltk.corpus import brown
for category in brown.categories():
raw = brown.raw(categories=category)
words = len(brown.words(categories=category))
sentences = len(sent_tokenizer.tokenize(raw))
letters_per_word = (len(raw) - words) / words # raw chars - words space chars
words_per_sentence = words / sentences
reading_level = (4.71 * letters_per_word) + (0.5 * words_per_sentence) + 21.43
print category, reading_level
开发者ID:447327642,项目名称:nltk-examples,代码行数:11,代码来源:ch03_ex.py
示例17: main
def main():
    """Tabulate four target words over five genres, then report extremes."""
    # Assumption carried over from the original (German) comment:
    # the word "heart" stands in for "fun".
    words = [u'money', u'duty', u'love', u'heart']
    categories = [u'science_fiction', u'romance', u'government', u'humor', u'religion']
    genre_word = ((genre, word)
                  for genre in brown.categories()
                  for word in brown.words(categories=genre))
    cfd = nltk.ConditionalFreqDist(genre_word)
    cfd.tabulate(conditions=categories, samples=words)
    print_min_max_for_all(cfd, words, categories)
开发者ID:billbos,项目名称:CL,代码行数:12,代码来源:Aufgabe6.py
示例18: brown
def brown():
brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])
brown.sents(categories=['news', 'editorial', 'reviews'])
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
print m + ':', fdist[m],
cfd = nltk.ConditionalFreqDist(
(genre, word)
for genre in brown.categories()
for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
开发者ID:AkiraKane,项目名称:Python,代码行数:21,代码来源:c02_text_corpora.py
示例19: ex10
def ex10():
from nltk.corpus import brown
from nltk.corpus import stopwords
stopwords = stopwords.words("english")
for genre in brown.categories():
print genre
words = map(lambda x : x.lower(), brown.words(categories=genre))
fd = nltk.FreqDist([w for w in words if w.isalpha() and not(w in stopwords)])
vocab_size = len(set(words))
sum = 0
for word in fd.keys():
freq = fd[word]
print "... %s (%f)" % (word, (freq * 100 / vocab_size))
sum = sum + freq
if (sum > (vocab_size / 3)):
break
开发者ID:447327642,项目名称:nltk-examples,代码行数:16,代码来源:ch02_ex.py
示例20: exercise11
def exercise11():
print
print "Exercise 11"
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modal_verb = ['shall', 'should', 'may', 'had', 'have']
print "Tabulation data for closed class of words in english"
print "For modal verbs:"
cfd.tabulate(conditions = genres, samples = modal_verb)
print
print "For Prepositions:"
prepositions = ['for', 'from', 'onto', 'to', 'with']
cfd.tabulate(conditions = genres, samples = prepositions)
print
print "For Pronoun:"
pronoun = ['me', 'she', 'her', 'I', 'we']
cfd.tabulate(conditions = genres, samples = pronoun)
print
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:18,代码来源:Girish_Srinivas_Ch2.py
注:本文中的nltk.corpus.brown.categories函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论