This article collects typical usage examples of the nltk.corpus.gutenberg.words function in Python. If you have been wondering what exactly the words function does, how to call it, or how others use it, the curated code examples below may help.
The article presents 20 code examples of the words function, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help our system recommend better Python code examples.
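Before the examples, a minimal sketch of the basic call (it assumes the Gutenberg corpus data has been fetched once with nltk.download):

import nltk
from nltk.corpus import gutenberg

# One-time download of the corpus data (a no-op if already present).
nltk.download('gutenberg')

# words() returns the whole corpus as a flat list of tokens;
# pass a fileid to restrict it to a single text.
all_tokens = gutenberg.words()
emma_tokens = gutenberg.words('austen-emma.txt')
print(len(emma_tokens))  # "Emma" alone is roughly 192,000 tokens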
Example 1: exercise_gutenberg

def exercise_gutenberg():
    # Assumes module-level imports: import nltk; from nltk.corpus import gutenberg
    # Print the file list of the Project Gutenberg corpus
    print(gutenberg.fileids())
    # Pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")
    # Check the length of the book
    print(len(emma))
    # Wrap the tokens in an nltk.Text
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)
        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct words in the file
        num_vocab = len(set(w.lower() for w in words_list))
        # Print average word length, average sentence length,
        # average occurrences per word, and the file name
        print(num_chars // num_words, num_words // num_sents,
              num_words // num_vocab, file_id)

Author: BurnellLiu | Project: LiuProject | Lines: 29 | Source: chapter_02.py
Example 2: gutenberg

def gutenberg():
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set(w.lower() for w in gutenberg.words(t)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), t)

Author: kwdhd | Project: nlp | Lines: 8 | Source: main.py
Example 3: page57

def page57():
    """Statistics from the Gutenberg corpus"""
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)

Author: andreoliwa | Project: nlp-book | Lines: 11 | Source: book_examples.py
Example 4: fun02

def fun02():
    """fun02"""
    # Assumes a module-level import: from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        # average word length, average sentence length, and the number
        # of times each vocabulary item appears in the text
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)

Author: gree2 | Project: hobby | Lines: 11 | Source: ch02.py
Example 5: for_print

def for_print():
    '''Display three statistics for each text.'''
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)

Author: Paul-Lin | Project: misc | Lines: 11 | Source: toturial.py
Example 6: ex2

def ex2():
    from nltk.corpus import gutenberg
    ap = gutenberg.words("austen-persuasion.txt")
    word_tokens = len(ap)
    word_types = len(set(w.lower() for w in ap))
    print("#-word tokens=", word_tokens)
    print("#-word types=", word_types)

Author: 447327642 | Project: nltk-examples | Lines: 7 | Source: ch02_ex.py
Example 7: ex17

def ex17():
    from nltk.corpus import gutenberg
    macbeth = gutenberg.words("shakespeare-macbeth.txt")
    stopwords = set(nltk.corpus.stopwords.words())
    fd = nltk.FreqDist(w for w in macbeth if w.lower() not in stopwords
                       and len(w) > 3 and w.isalpha())
    # The 50 most frequent content words (FreqDist.keys() is no longer
    # frequency-sorted in NLTK 3, so use most_common instead)
    print([w for w, _ in fd.most_common(50)])

Author: 447327642 | Project: nltk-examples | Lines: 7 | Source: ch02_ex.py
Example 8: main

def main():
    # exclude_stopwords() and get_frequency_distribution() are helpers
    # defined elsewhere in the source file; a sketch follows below.
    # gutenberg
    gu_words = gutenberg.words()
    gu_words_exclude_stops = exclude_stopwords(gu_words)
    gu_fd1 = get_frequency_distribution(gu_words)
    gu_fd2 = get_frequency_distribution(gu_words_exclude_stops)
    pylab.plot(gu_fd1, color='red')
    pylab.plot(gu_fd2, color='orange')
    # inaugural
    in_words = inaugural.words()
    in_words_exclude_stops = exclude_stopwords(in_words)
    in_fd1 = get_frequency_distribution(in_words)
    in_fd2 = get_frequency_distribution(in_words_exclude_stops)
    pylab.plot(in_fd1, color='black')
    pylab.plot(in_fd2, color='gray')
    # reuters
    yen_words = reuters.words(categories='yen')
    yen_words_exclude_stops = exclude_stopwords(yen_words)
    yen_fd1 = get_frequency_distribution(yen_words)
    yen_fd2 = get_frequency_distribution(yen_words_exclude_stops)
    pylab.plot(yen_fd1, color='blue')
    pylab.plot(yen_fd2, color='green')
    pylab.xscale('log')
    pylab.yscale('log')
    pylab.show()

Author: t2y | Project: learnnlp | Lines: 31 | Source: practice23_a.py
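The two helpers used in Example 8 are not shown on this page. A minimal sketch of plausible implementations, inferred from how they are called (the bodies below are assumptions, not the original project's code):

from nltk import FreqDist
from nltk.corpus import stopwords

def exclude_stopwords(words):
    # Hypothetical helper: drop stopwords and non-alphabetic tokens.
    stops = set(stopwords.words('english'))
    return [w for w in words if w.isalpha() and w.lower() not in stops]

def get_frequency_distribution(words):
    # Hypothetical helper: token frequencies in descending order,
    # the shape expected by a Zipf-style log-log plot.
    fd = FreqDist(w.lower() for w in words)
    return [freq for _, freq in fd.most_common()]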
Example 9: generateSentence

def generateSentence():
    # Pick one of four corpora at random
    corpus = random.randint(0, 3)
    if corpus == 0:
        text = brown.words()
    elif corpus == 1:
        text = gutenberg.words()
    elif corpus == 2:
        text = webtext.words()
    else:
        text = movie_reviews.words()
    tweetString = ''
    lengthOfTweet = random.randint(0, 20)
    blank = ' '
    startOfWord = ''
    startOfWordIndex = 0
    startingWord = random.randint(0, len(text) - 40)
    punctuation = [".", ",", '"', ";", ":", "?", "!", ")", "(", "*", "[", "]", "‘", "“", "#"]
    # Scan forward from the random starting point to the next sentence
    # boundary (capped at len(text) so indexing stays in range)
    for x in range(startingWord, len(text)):
        startOfWord = text[x]
        if startOfWord == ".":
            startOfWordIndex = x
            break
    # Append the following words, gluing punctuation to the previous token
    for x in range(startOfWordIndex + 1, startOfWordIndex + lengthOfTweet):
        if text[x] in punctuation:
            tweetString = tweetString + text[x]
        else:
            tweetString = tweetString + blank + text[x]
    return tweetString

Author: mathieuhendey | Project: Twitter-bot | Lines: 33 | Source: Twertbot.py
Example 10: exercise2

def exercise2():
    print()
    print("Exercise 2")
    words = gutenberg.words('austen-persuasion.txt')
    print("Number of word tokens in the text austen-persuasion.txt: %d" % len(words))
    print("Number of word-types in the text austen-persuasion.txt: %d" % len(set(words)))
    print(set(words))
    print()

Author: GirishSrinivas | Project: PythonPrograms | Lines: 8 | Source: Girish_Srinivas_Ch2.py
Example 11: fun01

def fun01():
    """fun01"""
    print(gutenberg.fileids())
    # Emma by Jane Austen
    emma = gutenberg.words('austen-emma.txt')
    # how many words it contains
    print(len(emma))
    # concordance() prints its matches itself (and returns None)
    Text(emma).concordance("surprize")

Author: gree2 | Project: hobby | Lines: 8 | Source: ch02.py
Example 12: find_word_probability

def find_word_probability(CORPUS):
    '''Build word-transition (bigram) frequencies from the given corpus.'''
    # Assumes: from nltk import ConditionalFreqDist; from nltk.corpus import gutenberg
    cfd = ConditionalFreqDist()
    prev_word = None
    for word in gutenberg.words(CORPUS):
        # Condition on the previous word, count the word that follows it
        cfd[prev_word][word] += 1
        prev_word = word
    return cfd

Author: sreejithc321 | Project: natural_language_processing | Lines: 8 | Source: sentence_maker.py
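Judging by the source file name (sentence_maker.py), the returned ConditionalFreqDist is meant to drive sentence generation. A hedged usage sketch (the greedy chaining loop is an assumption, not the original project's code):

cfd = find_word_probability('austen-emma.txt')

# Greedily follow the most frequent successor of each word.
word = 'The'
sentence = [word]
for _ in range(10):
    word = cfd[word].max()  # most frequent word observed after `word`
    sentence.append(word)
print(' '.join(sentence))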
Example 13: main

def main():
    # WordLoader and print_anagrams are defined elsewhere in the project
    loader = WordLoader()
    loader.load_valid_words_from_aspell("en_GB")
    loader.load_valid_words_from_aspell("en_US")
    all_words = brown.words() + gutenberg.words()
    sorted_words_filename = 'sorted_words.txt'
    loader.write_sorted_words(all_words, sorted_words_filename)
    sorted_words = loader.sorted_words
    print_anagrams(sorted_words, all_words)

Author: donkirkby | Project: vograbulary | Lines: 9 | Source: pinagrams.py
Example 14: gutenberg

def gutenberg():
    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print(len(emma))
    print(gutenberg.fileids())
    emma = gutenberg.words('austen-emma.txt')
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    print(macbeth_sentences[1037])
    longest_len = max(len(s) for s in macbeth_sentences)
    print([s for s in macbeth_sentences if len(s) == longest_len])
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)

Author: AkiraKane | Project: Python | Lines: 19 | Source: c02_text_corpora.py
Example 15: structure

def structure():
    # The bare slices were REPL-style; print them so the function shows output
    raw = gutenberg.raw("burgess-busterbrown.txt")
    print(raw[1:20])
    words = gutenberg.words("burgess-busterbrown.txt")
    print(words[1:20])
    sents = gutenberg.sents("burgess-busterbrown.txt")
    print(sents[1:20])

Author: AkiraKane | Project: Python | Lines: 10 | Source: c02_text_corpora.py
Example 16: searchText

def searchText():
    # Assumes: import nltk; from nltk.corpus import gutenberg, nps_chat, brown
    # Text.findall() prints its matches itself
    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    moby.findall(r"<a> (<.*>) <man>")
    chat = nltk.Text(nps_chat.words())
    chat.findall(r"<.*> <.*> <bro>")
    chat.findall(r"<l.*>{3,}")
    hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
    hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

Author: AkiraKane | Project: Python | Lines: 10 | Source: c03_re.py
Example 17: main

def main():
    # udhr_rankings() and predict_language() are defined elsewhere in the project
    sample_rankings = FreqDist(gutenberg.words('austen-persuasion.txt'))
    training_set_rankings = udhr_rankings(debug=True)
    predictions = predict_language(sample_rankings, training_set_rankings, debug=True)
    print()
    for language, value in predictions:
        if value != 0:
            # print('%.-32s\t%.-10s' % (language, value))
            print('{:.<32}{}'.format(language, value))

Author: mikeholler | Project: CSC499-NLP | Lines: 11 | Source: language_guessing.py
Example 18: ex18

def ex18():
    from nltk.corpus import gutenberg
    macbeth = gutenberg.words("shakespeare-macbeth.txt")
    stopwords = set(nltk.corpus.stopwords.words())
    # nltk.bigrams() returns a generator, so materialize it first
    bigrams = list(nltk.bigrams(macbeth))
    print(bigrams[:10])
    bigrams_wo_stopwords = [(k, v) for k, v in bigrams
                            if k not in stopwords and v not in stopwords
                            and k.isalpha() and v.isalpha()]
    fd = nltk.FreqDist(k + ":" + v for k, v in bigrams_wo_stopwords)
    # The 50 most frequent bigrams, split back into word pairs
    print([tuple(key.split(":")) for key, _ in fd.most_common(50)])

Author: 447327642 | Project: nltk-examples | Lines: 12 | Source: ch02_ex.py
Example 19: exercise_unusual_words

def exercise_unusual_words():
    # Assumes: from nltk.corpus import gutenberg, words
    text = gutenberg.words("austen-sense.txt")
    # Vocabulary of the text: drop non-alphabetic tokens, lowercase everything
    text_vocab = set(w.lower() for w in text if w.isalpha())
    # Vocabulary of the English word list
    english_vocab = set(w.lower() for w in words.words())
    # Words in the text but not in the dictionary (rare or misspelled)
    unusual_vocab = text_vocab.difference(english_vocab)
    print(sorted(unusual_vocab))

Author: BurnellLiu | Project: LiuProject | Lines: 13 | Source: chapter_02.py
Example 20: searchTokenText

def searchTokenText():
    from nltk.corpus import gutenberg, nps_chat
    # Text.findall() prints its matches itself and returns None
    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    moby.findall(r"<a> (<.*>) <man>")
    chat = nltk.Text(nps_chat.words())
    chat.findall(r"<.*> <.*> <bro>")
    chat.findall(r"<l.*>{3,}")
    from nltk.corpus import brown
    hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
    hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

Author: hbdhj | Project: python | Lines: 13 | Source: chapter3.py
Note: The nltk.corpus.gutenberg.words examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are taken from open-source projects contributed by their original authors, who retain copyright; consult each project's license before reusing or redistributing the code. Do not reproduce this article without permission.