Python brown.categories Function Code Examples


This article collects typical usage examples of the Python function nltk.corpus.brown.categories. If you are wondering what categories does, how to call it, or what real-world usage looks like, the curated examples below should help.



Twenty code examples of the categories function are shown below, sorted by popularity by default.
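Before the examples, here is a minimal sketch of what the function returns (assuming NLTK is installed and the Brown corpus data has been fetched once via nltk.download; the output shown is the standard fifteen-category Brown inventory, truncated):

import nltk
from nltk.corpus import brown

nltk.download('brown')  # one-time download of the corpus data

print(brown.categories())
# ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', ...]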

Example 1: print_brown

def print_brown():
    import nltk
    from nltk.corpus import brown
    # Basic corpus inspection.
    print(brown.categories())
    print(brown.words(categories='news'))
    print(brown.words(fileids=['cg22']))
    print(brown.sents(categories=['news', 'reviews']))
    # Count modal verbs in the 'news' category.
    news_text = brown.words(categories='news')
    fdist = nltk.FreqDist(w.lower() for w in news_text)
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    for m in modals:
        print(m + ':', fdist[m])

Author: Paul-Lin, Project: misc, Source: toturial.py


Example 2: get_training_test_sentences

    def get_training_test_sentences(self):
        # Load words and tagged sentences from every Brown category.
        # (Assumes `from nltk.corpus import brown` at module level.)
        self.news_text = brown.words(categories=brown.categories())
        self.news_tagged_sentences = brown.tagged_sents(categories=brown.categories())

        # Hold out the last 10% of the tagged sentences for testing.
        size = int(len(self.news_tagged_sentences) * 0.9)
        self.train_sents = self.news_tagged_sentences[:size]
        self.test_sents = self.news_tagged_sentences[size:]

Author: TheFourMonkeysProject, Project: Alfred, Source: trainers.py


Example 3: build_all_brown

def build_all_brown(subset_size=None):
    import pickle
    from nltk.corpus import brown

    documents = []
    categories = []

    all_categories = set()

    try:
        fileids = brown.fileids()

        for fileid in fileids:
            if subset_size and len(all_categories) > subset_size:
                break
            # Each Brown file belongs to exactly one category.
            category = brown.categories(fileid)[0]
            words = [x.lower() for x in brown.words(fileid)]

            documents.append(words)
            categories.append(category)

            all_categories.add(category)

        if subset_size != len(brown.categories()):
            # exclude the final item, since it's the sole member of the next group
            documents = documents[:-1]
            categories = categories[:-1]

        documents = [" ".join(d) for d in documents]

    except LookupError:
        # We don't have the Brown corpus via NLTK on this machine;
        # fall back to a pickled copy.
        try:
            with open("brown_docs_cats.pickle", "rb") as f:
                documents, categories = pickle.load(f)
        except IOError:
            raise Exception("can't load Brown Corpus via NLTK or file")

    """
    # let's NOT get tempted to hide away the encoding
    # we'll probably need to access, e.g., the vectorizer, to do reverse
    # transformations once we want to interpret/evaluate the model

    doc_vectorizer = CountVectorizer()
    doc_vec = doc_vectorizer.fit_transform(documents)
    """

    return documents, categories

Author: kinguistics, Project: naivebayes, Source: brown_testing.py


Example 4: import_brown_pos

def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the Brown corpus into `ds`. E.g.

    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> import_brown_pos(ds, silent=True)  # took ~12.5 min in the original run
    """
    # Assumes module-level `import sys`, `import nltk`,
    # `from nltk.corpus import brown`, and a project-defined simplify_tag().
    if not silent:
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            if simplify_tags:
                norm = (simplify_tag(t) for t in sent)
            else:
                norm = sent  # fix: `norm` was undefined when simplify_tags=False
            norm = [nltk.tuple2str(t) for t in norm]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if counter % 100 == 0:
                    print("importing %s of %s sentences..." % (counter, total),
                          file=log)

Author: tdiggelm, Project: nltk-playground, Source: train.py


Example 5: ex11

def ex11():
  import nltk
  from nltk.corpus import brown
  # Count only modal verbs, then plot genre-by-modal frequencies.
  modals = set(["can", "could", "may", "might", "shall", "should", "will", "would", "must", "ought"])
  cfd = nltk.ConditionalFreqDist(
    (genre, modal)
    for genre in brown.categories()
    for modal in [w.lower() for w in brown.words(categories=genre) if w.lower() in modals])
  cfd.plot()

Author: 447327642, Project: nltk-examples, Source: ch02_ex.py


Example 6: brown_diversity

def brown_diversity():
    """calculate and display lexical diversity score (tokens / token types) for each Brown corpus category"""
    import nltk
    from nltk.corpus import brown
    cfd = nltk.ConditionalFreqDist((category, word)
        for category in brown.categories()
        for word in brown.words(categories=category))
    print("{0:15s} {1:10s}".format("CATEGORY", "DIVERSITY"))
    for category in cfd.conditions():
        # N() is the total token count, B() the number of distinct tokens.
        print("{0:15s} {1:10f}".format(category, cfd[category].N() * 1.0 / cfd[category].B()))

Author: jyzhang, Project: py-nlp, Source: ch2.py


Example 7: fun08

def fun08():
    """Tabulate modal-verb frequencies across six Brown genres."""
    import nltk
    from nltk.corpus import brown
    cfd = nltk.ConditionalFreqDist((genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)

Author: gree2, Project: hobby, Source: ch02.py


Example 8: makeWordSet

def makeWordSet(args=None):
    '''Use the Brown and Reuters corpora to collect the set of distinct words used'''
    from nltk.corpus import brown, reuters
    word_set = set()
    for cat in brown.categories():
        word_set = word_set.union(set(brown.words(categories=cat)))
    for cat in reuters.categories():
        word_set = word_set.union(set(reuters.words(categories=cat)))
    return word_set

Author: divanshugarg, Project: Kaggle-Projects-Stuff, Source: wordCheck.py


Example 9: exercise_brown2

def exercise_brown2():
    """Conditional frequency distribution of words across Brown genres."""
    import nltk
    from nltk.corpus import brown
    cfd = nltk.ConditionalFreqDist(
        (genre, word) for genre in brown.categories() for word in brown.words(categories=genre)
    )

    genres = ["news", "religion", "hobbies", "science_fiction", "romance", "humor"]
    modals = ["can", "could", "may", "might", "must", "will"]
    cfd.tabulate(conditions=genres, samples=modals)

Author: BurnellLiu, Project: LiuProject, Source: chapter_02.py


Example 10: ex16

def ex16():
  import operator
  from nltk.corpus import brown
  lex_div = {}
  for category in brown.categories():
    words = brown.words(categories=category)
    # Lexical diversity: average number of uses per distinct word.
    ld = len(words) / len(set(words))
    print(category, ld)
    lex_div[category] = ld
  # iteritems() is Python 2 only; items() works in Python 3.
  print(sorted(lex_div.items(), key=operator.itemgetter(1)))

Author: 447327642, Project: nltk-examples, Source: ch02_ex.py


Example 11: exercise_brown

def exercise_brown():
    import nltk
    from nltk.corpus import brown
    # Print the categories of the Brown corpus.
    print(brown.categories())
    # Print the words of the 'news' category.
    print(brown.words(categories="news"))
    # Print the words of file 'cg22'.
    print(brown.words(fileids=["cg22"]))
    # Print sentences from the 'news' and 'reviews' categories.
    print(brown.sents(categories=["news", "reviews"]))

    # Compare the usage of modal verbs across genres.
    news_text = brown.words(categories="news")
    # Frequency distribution over lower-cased words.
    fdist = nltk.FreqDist(w.lower() for w in news_text)
    # The modal verbs of interest.
    modals = ["can", "could", "may", "might", "must", "will"]
    for m in modals:
        print(m + ":", fdist[m])

Author: BurnellLiu, Project: LiuProject, Source: chapter_02.py


Example 12: print_modals

def print_modals():
    import nltk
    from nltk.corpus import brown
    cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre)
    )
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)

Author: Paul-Lin, Project: misc, Source: toturial.py


Example 13: test_sentences

def test_sentences(categories=None):
    """returns a test sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
    # Assumes `from nltk.corpus import brown` and a module-level TEST_PROPORTION.
    if not categories:
        categories = brown.categories()  # use all of the Brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        start = int(TEST_PROPORTION * total)  # use the last k sentences for test
        # NLTK 3 replaced simplify_tags=True with tagset='universal'; the
        # original slice [-start:-1] also dropped the final sentence.
        sents += brown.tagged_sents(categories=category, tagset='universal')[-start:]
    return sents

Author: jyzhang, Project: py-nlp, Source: pos.py


Example 14: training_sentences

def training_sentences(use=1.0, categories=None):
    """returns a training sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
    # Assumes `from nltk.corpus import brown` and a module-level TEST_PROPORTION.
    if not categories:
        categories = brown.categories()  # use all of the Brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        # use the first n sentences for training ('max' renamed to avoid shadowing the builtin)
        n_train = int((1 - TEST_PROPORTION) * use * total) - 1
        sents += brown.tagged_sents(categories=category, tagset='universal')[0:n_train]
    return sents

Author: jyzhang, Project: py-nlp, Source: pos.py
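A quick usage sketch for the two helpers above (hypothetical: TEST_PROPORTION is a module-level constant in the source project, assumed here to be 0.1, and `from nltk.corpus import brown` is in scope):

TEST_PROPORTION = 0.1  # assumed value; defined at module level in the original project
train = training_sentences(use=0.5, categories=['news'])
test = test_sentences(categories=['news'])
print(len(train), len(test))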


Example 15: init_corpus

def init_corpus():
    # Assumes `from nltk.corpus import brown` and module-level
    # `categories` and `category_sentences` globals.
    print('init corpus..', end=' ')
    global categories, category_sentences
    categories = brown.categories()
    # Keep only the first half of the category list.
    half_cat = int(len(categories) * 0.5)
    categories = categories[:half_cat]
    for category in categories:
        sents = brown.tagged_sents(categories=category)
        category_sentences[category] = sents
    print('done')

Author: haje01, Project: enser, Source: application.py


Example 16: ch03_29_reading_difficulty

def ch03_29_reading_difficulty():
  import nltk
  from nltk.corpus import brown
  sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
  for category in brown.categories():
    raw = brown.raw(categories=category)
    words = len(brown.words(categories=category))
    sentences = len(sent_tokenizer.tokenize(raw))
    letters_per_word = (len(raw) - words) / words  # raw chars minus one space per word
    words_per_sentence = words / sentences
    # Automated Readability Index style score (note: the standard ARI
    # subtracts 21.43 rather than adding it).
    reading_level = (4.71 * letters_per_word) + (0.5 * words_per_sentence) + 21.43
    print(category, reading_level)

Author: 447327642, Project: nltk-examples, Source: ch03_ex.py


Example 17: main

def main():
    # Assumption: the word "heart" instead of "fun".
    # (Assumes `import nltk` and `from nltk.corpus import brown`;
    # print_min_max_for_all is defined elsewhere in the project.)
    words = [u'money', u'duty', u'love', u'heart']
    categories = [u'science_fiction', u'romance', u'government', u'humor', u'religion']
    cfd = nltk.ConditionalFreqDist((genre, word)
            for genre in brown.categories()
            for word in brown.words(categories=genre))

    cfd.tabulate(conditions=categories, samples=words)

    print_min_max_for_all(cfd, words, categories)

Author: billbos, Project: CL, Source: Aufgabe6.py


Example 18: brown

def brown():
    # This function shares its name with the corpus, so import the corpus
    # locally to keep `brown` referring to the corpus inside this body.
    import nltk
    from nltk.corpus import brown

    brown.categories()
    brown.words(categories='news')
    brown.words(fileids=['cg22'])
    brown.sents(categories=['news', 'editorial', 'reviews'])

    news_text = brown.words(categories='news')
    fdist = nltk.FreqDist(w.lower() for w in news_text)
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    for m in modals:
        print(m + ':', fdist[m], end=' ')

    cfd = nltk.ConditionalFreqDist(
            (genre, word)
            for genre in brown.categories()
            for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)

Author: AkiraKane, Project: Python, Source: c02_text_corpora.py


Example 19: ex10

def ex10():
  import nltk
  from nltk.corpus import brown
  from nltk.corpus import stopwords
  stopwords = stopwords.words("english")
  for genre in brown.categories():
    print(genre)
    words = [w.lower() for w in brown.words(categories=genre)]
    fd = nltk.FreqDist(w for w in words if w.isalpha() and w not in stopwords)
    vocab_size = len(set(words))
    total = 0  # renamed from 'sum' to avoid shadowing the builtin
    # most_common() iterates in decreasing-frequency order; in NLTK 3,
    # fd.keys() is no longer frequency-sorted as it was in NLTK 2.
    for word, freq in fd.most_common():
      print("... %s (%f)" % (word, freq * 100 / vocab_size))
      total += freq
      if total > vocab_size / 3:
        break

Author: 447327642, Project: nltk-examples, Source: ch02_ex.py


Example 20: exercise11

def exercise11():
    import nltk
    from nltk.corpus import brown
    print()
    print("Exercise 11")
    cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modal_verbs = ['shall', 'should', 'may', 'had', 'have']
    print("Tabulation data for closed classes of words in English")
    print("For modal verbs:")
    cfd.tabulate(conditions=genres, samples=modal_verbs)
    print()
    print("For prepositions:")
    prepositions = ['for', 'from', 'onto', 'to', 'with']
    cfd.tabulate(conditions=genres, samples=prepositions)
    print()
    print("For pronouns:")
    pronouns = ['me', 'she', 'her', 'I', 'we']
    cfd.tabulate(conditions=genres, samples=pronouns)
    print()

Author: GirishSrinivas, Project: PythonPrograms, Source: Girish_Srinivas_Ch2.py



Note: the nltk.corpus.brown.categories examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by their respective authors; copyright remains with the original authors, and any redistribution or use should follow the corresponding project's license. Please do not republish without permission.

