• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python gutenberg.words函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中nltk.corpus.gutenberg.words函数的典型用法代码示例。如果您正苦于以下问题:Python words函数的具体用法?Python words怎么用?Python words使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了words函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: exercise_gutenberg

def exercise_gutenberg():
    # 打印古腾堡项目的文件列表
    print gutenberg.fileids()

    # 挑选一个文本: 简-奥斯丁的《爱玛》
    emma = gutenberg.words("austen-emma.txt")

    # 查看书的长度
    print len(emma)

    # 导入文本
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)

        # 统计文件的总字符数
        num_chars = len(chars_list)
        # 统计文件的总单词数
        num_words = len(words_list)
        # 统计文件的总句子数
        num_sents = len(sents_list)
        # 统计文件的非重复单词数
        num_vocab = len(set([w.lower() for w in words_list]))
        # 打印词的平均字符数, 句子的平均单词数, 每个单词出现的平均次数, 文件名
        print num_chars / num_words, num_words / num_sents, num_words / num_vocab, file_id
开发者ID:BurnellLiu,项目名称:LiuProject,代码行数:29,代码来源:chapter_02.py


示例2: gutenberg

def gutenberg():
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set([w.lower() for w in gutenberg.words(t)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), t
开发者ID:kwdhd,项目名称:nlp,代码行数:8,代码来源:main.py


示例3: page57

def page57():
    """Statistics from the Gutenberg corpora"""
    from nltk.corpus import gutenberg

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars / num_words), int(num_words / num_sents),
        print int(num_words / num_vocab), fileid
开发者ID:andreoliwa,项目名称:nlp-book,代码行数:11,代码来源:book_examples.py


示例4: fun02

def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        # average word length average sentence length
        print int(num_chars/num_words), int(num_words/num_sents),
        # number of times each vocabulary item appers in the text
        print int(num_words/num_vocab), fileid
开发者ID:gree2,项目名称:hobby,代码行数:11,代码来源:ch02.py


示例5: for_print

def for_print():
    '''
    显示每个文本的三个统计量
    :return:
    '''
    for fileid in gutenberg.fileids():
        num_chars=len(gutenberg.raw(fileid))
        num_words=len(gutenberg.words(fileid))
        num_sents=len(gutenberg.sents(fileid))
        num_vocab=len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid
开发者ID:Paul-Lin,项目名称:misc,代码行数:11,代码来源:toturial.py


示例6: ex2

def ex2():
  from nltk.corpus import gutenberg
  ap = gutenberg.words("austen-persuasion.txt")
  word_tokens = len(ap)
  word_types = len(set([w.lower() for w in ap]))
  print "#-word tokens=", word_tokens
  print "#-word types=", word_types
开发者ID:447327642,项目名称:nltk-examples,代码行数:7,代码来源:ch02_ex.py


示例7: ex17

def ex17():
  from nltk.corpus import gutenberg
  macbeth = gutenberg.words("shakespeare-macbeth.txt")
  stopwords = set(nltk.corpus.stopwords.words())
  fd = nltk.FreqDist([w for w in macbeth if w.lower() not in stopwords
      and len(w) > 3 and w.isalpha()])
  print fd.keys()[0:50]
开发者ID:447327642,项目名称:nltk-examples,代码行数:7,代码来源:ch02_ex.py


示例8: main

def main():
    """Plot log-log frequency curves for three corpora, raw and stopword-filtered."""
    # (token source, color for raw curve, color for stopword-filtered curve)
    corpora = [
        (gutenberg.words(), 'red', 'orange'),
        (inaugural.words(), 'black', 'gray'),
        (reuters.words(categories='yen'), 'blue', 'green'),
    ]

    for tokens, raw_color, filtered_color in corpora:
        pylab.plot(get_frequency_distribution(tokens), color=raw_color)
        filtered = exclude_stopwords(tokens)
        pylab.plot(get_frequency_distribution(filtered), color=filtered_color)

    pylab.xscale('log')
    pylab.yscale('log')
    pylab.show()
开发者ID:t2y,项目名称:learnnlp,代码行数:31,代码来源:practice23_a.py


示例9: generateSentence

def generateSentence():
    """Build a pseudo-random "tweet" from a randomly chosen corpus.

    Picks one of four NLTK corpora at random, scans forward from a random
    offset for the first sentence boundary (a "." token), then concatenates
    the following tokens (up to ``lengthOfTweet - 1`` of them), gluing
    punctuation tokens on without a leading space.

    Returns:
        str: the assembled tweet (may be empty when the random length is 0).
    """
    corpus = random.randint(0, 3)
    if corpus == 0:
        text = brown.words()
    elif corpus == 1:
        text = gutenberg.words()
    elif corpus == 2:
        text = webtext.words()
    elif corpus == 3:
        text = movie_reviews.words()
    tweetString = ''
    lengthOfTweet = random.randint(0, 20)
    blank = ' '
    startOfWordIndex = 0
    startingWord = random.randint(0, (len(text) - 40))
    punctuation = [".", ",", '"', ";", ":", "?", "!", ")", "(", "*", "[", "]", "‘", "“", "#"]

    # BUGFIX: the original scanned up to startingWord + len(text), indexing far
    # past the end of the corpus and raising IndexError whenever no "." occurs
    # after the random offset. Stop at the last valid index instead.
    for x in xrange(startingWord, len(text)):
        if text[x] == ".":
            startOfWordIndex = x
            break

    # BUGFIX: cap the end of the tweet at the corpus length — the original
    # could index past the end when the found "." was near the corpus tail.
    for x in xrange(startOfWordIndex + 1,
                    min(startOfWordIndex + lengthOfTweet, len(text))):
        if text[x] in punctuation:
            # Punctuation attaches directly to the preceding text.
            tweetString = tweetString + text[x]
        else:
            # Ordinary words are separated by a single space.
            tweetString = tweetString + blank + text[x]
    return tweetString
开发者ID:mathieuhendey,项目名称:Twitter-bot,代码行数:33,代码来源:Twertbot.py


示例10: exercise2

def exercise2():
    print
    print "Exercise 2"
    words = gutenberg.words('austen-persuasion.txt')
    print "Number of word tokens in the text austen-persuasion.txt: %d" %len(words)
    print "Number of word-types in the text austen-persuasion.txt: %d" %len(set(words))
    print set(words)
    print
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:8,代码来源:Girish_Srinivas_Ch2.py


示例11: fun01

def fun01():
    """fun01"""
    print gutenberg.fileids()
    # emma by jane austen
    emma = gutenberg.words('austen-emma.txt')
    # how many words it contains
    print len(emma)
    print Text(emma).concordance("surprize")
开发者ID:gree2,项目名称:hobby,代码行数:8,代码来源:ch02.py


示例12: find_word_probability

def find_word_probability(CORPUS):
    """Build a bigram ConditionalFreqDist from the given Gutenberg corpus.

    ``cfd[previous][current]`` counts how often ``current`` follows
    ``previous``; the very first token is conditioned on ``None``.
    """
    cfd = ConditionalFreqDist()
    previous = None
    for current in gutenberg.words(CORPUS):
        cfd[previous][current] += 1
        previous = current
    return cfd
开发者ID:sreejithc321,项目名称:natural_language_processing,代码行数:8,代码来源:sentence_maker.py


示例13: main

def main():
    """Write a sorted word list from Brown + Gutenberg and print anagrams."""
    loader = WordLoader()
    # Accept both British and American aspell dictionaries as valid words.
    for dialect in ("en_GB", "en_US"):
        loader.load_valid_words_from_aspell(dialect)
    vocabulary = brown.words() + gutenberg.words()
    loader.write_sorted_words(vocabulary, 'sorted_words.txt')
    print_anagrams(loader.sorted_words, vocabulary)
开发者ID:donkirkby,项目名称:vograbulary,代码行数:9,代码来源:pinagrams.py


示例14: gutenberg

def gutenberg():

    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print len(emma)

    print gutenberg.fileids()
    emma = gutenberg.words('austen-emma.txt')

    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_sentences[1037]
    longest_len = max([len(s) for s in macbeth_sentences])
    [s for s in macbeth_sentences if len(s) == longest_len]

    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
        print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid
开发者ID:AkiraKane,项目名称:Python,代码行数:19,代码来源:c02_text_corpora.py


示例15: structure

def structure():
    """Access one Gutenberg text at three levels: raw chars, words, sentences."""
    text_id = "burgess-busterbrown.txt"
    # Each accessor returns a sliceable sequence; the slices are evaluated
    # but not printed, exactly as in the original demo.
    for accessor in (gutenberg.raw, gutenberg.words, gutenberg.sents):
        accessor(text_id)[1:20]
开发者ID:AkiraKane,项目名称:Python,代码行数:10,代码来源:c02_text_corpora.py


示例16: searchText

def searchText():
    """Token-pattern searches with Text.findall over three corpora."""
    moby_text = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    moby_text.findall(r"<a> (<.*>) <man>")

    chat_text = nltk.Text(nps_chat.words())
    chat_text.findall(r"<.*> <.*> <bro>")
    chat_text.findall(r"<l.*>{3,}")

    brown_text = nltk.Text(brown.words(categories=['hobbies', 'learned']))
    brown_text.findall(r"<\w*> <and> <other> <\w*s>")
开发者ID:AkiraKane,项目名称:Python,代码行数:10,代码来源:c03_re.py


示例17: main

def main():
    sample_rankings = FreqDist(gutenberg.words('austen-persuasion.txt'))
    training_set_rankings = udhr_rankings(debug=True)

    predictions = predict_language(sample_rankings, training_set_rankings, debug=True)

    print
    for language, value in predictions:
        if value != 0:
            # print '%.-32s\t%.-10s' % (language, value)
            print '{:.<32}{}'.format(language, value)
开发者ID:mikeholler,项目名称:CSC499-NLP,代码行数:11,代码来源:language_guessing.py


示例18: ex18

def ex18():
  from nltk.corpus import gutenberg
  macbeth = gutenberg.words("shakespeare-macbeth.txt")
  stopwords = set(nltk.corpus.stopwords.words())
  bigrams = nltk.bigrams(macbeth)
  print bigrams
  bigrams_wo_stopwords = filter(lambda (k, v) : k not in stopwords
    and v not in stopwords
    and k.isalpha()
    and v.isalpha(), bigrams)
  fd = nltk.FreqDist(map(lambda (k,v) : k+":"+v, bigrams_wo_stopwords))
  print map(lambda k : (k.split(":")[0], k.split(":")[1]), fd.keys())[0:50]
开发者ID:447327642,项目名称:nltk-examples,代码行数:12,代码来源:ch02_ex.py


示例19: exercise_unusual_words

def exercise_unusual_words():
    text = gutenberg.words("austen-sense.txt")

    # 取出文本中的词汇, 去除数字, 转换为小写
    text_vocab = set(w.lower() for w in text if w.isalpha())

    # 取出词典中的词汇
    english_vocab = set(w.lower() for w in words.words())

    # 找出文本中的非常用词汇(错误词汇)
    unusual_vocab = text_vocab.difference(english_vocab)

    print sorted(unusual_vocab)
开发者ID:BurnellLiu,项目名称:LiuProject,代码行数:13,代码来源:chapter_02.py


示例20: searchTokenText

def searchTokenText():
    from nltk.corpus import gutenberg, nps_chat
    moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
    print moby.findall(r"<a> (<.*>) <man>")

    chat = nltk.Text(nps_chat.words())
    print chat.findall(r"<.*> <.*> <bro>")

    print chat.findall(r"<l.*>{3,}")

    from nltk.corpus import brown
    hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
    hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")
开发者ID:hbdhj,项目名称:python,代码行数:13,代码来源:chapter3.py



注:本文中的nltk.corpus.gutenberg.words函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python movie_reviews.categories函数代码示例发布时间:2022-05-27
下一篇:
Python gutenberg.sents函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2023 极客世界.|Sitemap