• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python brown.tagged_sents函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中nltk.corpus.brown.tagged_sents函数的典型用法代码示例。如果您正苦于以下问题:Python tagged_sents函数的具体用法?Python tagged_sents怎么用?Python tagged_sents使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了tagged_sents函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: main

def main():
    """Compare several taggers on the Brown 'news' category and print accuracies.

    Part 1 holds out the first 100 sentences for testing, trains on the rest,
    and reports the accuracy of ``SimpleUnigramTagger`` and of
    ``nltk.UnigramTagger`` (both backing off to a default 'NN' tagger).

    Part 2 re-splits the same sentences 80/10/10 (train/development/test),
    trains one ``nltk.AffixTagger`` with ``cutoff=2``, then replaces
    ``AffixTagger._train`` and ``AffixTagger.H`` and trains a second affix
    tagger with the cutoff returned by ``optimize_parameter()``.

    ``SimpleUnigramTagger``, ``_train``, ``_H`` and ``optimize_parameter`` are
    not defined in this function; presumably they live elsewhere in the module.
    """
    brown_news_tagged = brown.tagged_sents(categories='news')

    # run Simple unigram tagger
    brown_train = brown_news_tagged[100:]
    brown_test = brown_news_tagged[:100]

    nn_tagger = nltk.DefaultTagger('NN')
    ut2 = nltk.UnigramTagger(brown_train, backoff=nn_tagger)
    simpleUnigramTagger = SimpleUnigramTagger(brown_train, backoff=nn_tagger)
    print('Simple Unigram tagger accuracy: %4.1f%%' % (100.0 * simpleUnigramTagger.evaluate(brown_test)))
    print('Unigram tagger accuracy: %4.1f%%' % (100.0 * ut2.evaluate(brown_test)))

    # run affix tagger with entropy
    # The same corpus view is reused; only the split changes (80/10/10).
    train_end = int(0.8 * len(brown_news_tagged))
    brown_train = brown_news_tagged[:train_end]
    rest = brown_news_tagged[train_end:]
    dev_end = int(0.5 * len(rest))
    # The development slice is not read again in this function.
    brown_development = rest[:dev_end]
    brown_test = rest[dev_end:]

    # Baseline affix tagger: built BEFORE the class is patched below, so it
    # is trained with the stock AffixTagger._train.
    affix_tagger = nltk.AffixTagger(brown_train, backoff=nltk.DefaultTagger('NN'), cutoff=2)

    # Monkey-patch the class; every AffixTagger created from here on uses
    # the replacement training routine.
    nltk.AffixTagger._train = _train
    nltk.AffixTagger.H = _H
    optcutoff = optimize_parameter()
    print("the optimal cutoff param is: %d " % optcutoff)
    affix_tagger2 = nltk.AffixTagger(brown_train, backoff=nltk.DefaultTagger('NN'), cutoff=optcutoff)

    # These two lines report the affix taggers; the original labelled them
    # "Unigram tagger", which misattributed the numbers.
    print('Affix tagger accuracy: %4.1f%%' % (100.0 * affix_tagger.evaluate(brown_test)))
    print('Affix tagger accuracy with entropy: %4.1f%%' % (100.0 * affix_tagger2.evaluate(brown_test)))
开发者ID:atiassa,项目名称:recommend-2011,代码行数:28,代码来源:q2.py


示例2: __init__

   def __init__(self):
      '''Load a previously saved classifier, or train and save a new one.

      If the path held in the module-level name ``file`` exists, the pickled
      classifier is loaded from it and nothing else is initialised (in
      particular ``self.bayes`` is not set on that path).

      Otherwise a Brill tagger is trained on the first 80% of the Brown
      corpus (simplified tags) on top of a naive Bayes tagger, stored in
      ``self.classifier`` and written back to ``file``.
      '''

      #TODO: Fix bug where it loads tagger from calling module dir
      if exists(file):
         # NOTE(review): `load` looks like pickle.load; unpickling can run
         # arbitrary code, so this file must come from a trusted source.
         # The `with` block closes the handle even when loading fails.
         with open(file, 'rb') as saved:
            self.classifier = load(saved)
         print('Successfully loaded saved classifier')
         return

      self.bayes = NaiveBayesTagger()
      boundary = int(len(brown.tagged_sents())*0.8)
      train = brown.tagged_sents(simplify_tags=True)[:boundary]

      brill_trainer = FastBrillTaggerTrainer(initial_tagger = self.bayes,
                                             templates = templates,
                                             trace = 3,
                                             deterministic = True)

      self.classifier = brill_trainer.train(train, max_rules=10)

      print('Saving Taggers to file: "pos_tagger.pickle"')
      # Third argument 1 is passed through to `dump` unchanged.
      with open(file, 'wb') as output:
         dump(self.classifier, output, 1)
开发者ID:okoye,项目名称:sentimentanalysis,代码行数:26,代码来源:speechtagger.py


示例3: demo

def demo(train_size=100, test_size=100, java_home=None, mallet_home=None):
    """Train a Mallet CRF tagger on Brown data, report accuracy, then clean up.

    :param train_size: number of 'news' sentences used for training.
    :param test_size: number of 'editorial' sentences used for testing.
    :param java_home: passed to ``nltk.internals.config_java``.
    :param mallet_home: passed to ``nltk.classify.mallet.config_mallet``.
    :return: the trained CRF object (its model file has already been removed).
    """
    from nltk.corpus import brown
    import textwrap

    def fd(sentence, index):
        # Feature detector: the token itself, its last two characters,
        # and its length.
        token = sentence[index]
        return dict(word=token, suffix=token[-2:], len=len(token))

    def strip(corpus):
        # Simplify the tagset a little: keep only the first two characters
        # of every tag.
        shortened = []
        for sent in corpus:
            shortened.append([(w, t[:2]) for (w, t) in sent])
        return shortened

    # Tell nltk where java and mallet live.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    # Training and test corpora come from different Brown categories.
    brown_train = strip(brown.tagged_sents(categories='news')[:train_size])
    brown_test = strip(brown.tagged_sents(categories='editorial')[:test_size])

    # An explicit model path such as '/tmp/crf-model' could be given here.
    crf = MalletCRF.train(fd, brown_train, transduction_type='VITERBI')

    # Tag the words of the sixth test sentence as a sample.
    sample_words = [pair[0] for pair in brown_test[5]]
    sample_output = crf.tag(sample_words)
    acc = nltk.tag.accuracy(crf, brown_test)

    print('\nAccuracy: %.1f%%' % (acc*100))
    print('Sample output:')
    joined = ' '.join('%s/%s' % w for w in sample_output)
    wrapped = textwrap.fill(joined, initial_indent='  ',
                            subsequent_indent='  ')
    print(wrapped+'\n')

    # Clean up
    print('Clean-up: deleting', crf.filename)
    os.remove(crf.filename)

    return crf
开发者ID:BohanHsu,项目名称:developer,代码行数:34,代码来源:crf.py


示例4: demo

def demo(train_size=100, test_size=100, java_home="/usr/local/jdk1.5.0/", mallet_home="/usr/local/mallet-0.4"):
    """Train a Mallet CRF tagger on Brown data, print its accuracy, clean up.

    :param train_size: number of 'news' sentences used for training.
    :param test_size: number of 'editorial' sentences used for testing.
    :param java_home: passed to ``nltk.internals.config_java``.
    :param mallet_home: passed to ``nltk.classify.mallet.config_mallet``.
    :return: the trained CRF object (its model file has already been removed).
    """
    from nltk.corpus import brown
    import textwrap

    def fd(sentence, index):
        # Very simple feature detector: token, two-character suffix, length.
        token = sentence[index]
        return dict(word=token, suffix=token[-2:], len=len(token))

    def strip(corpus):
        # Simplify the tagset a little: just the first 2 chars of each tag.
        stripped = []
        for sent in corpus:
            stripped.append([(w, t[:2]) for (w, t) in sent])
        return stripped

    # Let nltk know where java & mallet are.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    # Get the training & test corpus.
    brown_train = strip(brown.tagged_sents(categories="news")[:train_size])
    brown_test = strip(brown.tagged_sents(categories="editorial")[:test_size])

    # A model path such as '/tmp/crf-model' could be supplied here.
    crf = MalletCRF.train(fd, brown_train, transduction_type="VITERBI")

    # The sixth test sentence serves as the sample.
    sample_words = [pair[0] for pair in brown_test[5]]
    sample_output = crf.tag(sample_words)
    acc = nltk.tag.accuracy(crf, brown_test)

    print("\nAccuracy: %.1f%%" % (acc * 100))
    print("Sample output:")
    joined = " ".join("%s/%s" % w for w in sample_output)
    wrapped = textwrap.fill(joined, initial_indent="  ", subsequent_indent="  ")
    print(wrapped + "\n")

    # Clean up
    print("%s %s" % ("Clean-up: deleting", crf.filename))
    os.remove(crf.filename)

    return crf
开发者ID:sneilan,项目名称:EverythingIveDoneOverTheYears,代码行数:35,代码来源:crf.py


示例5: training_sentences

def training_sentences(use=1.0, categories=[]):
	"""Return a training sentence set: [[(word, tag), ..], [(word, tag), ..], ..]

	For every category, the first ``(1 - TEST_PROPORTION) * use`` fraction of
	its tagged sentences (simplified tags) is appended to the result.

	:param use: fraction of the training portion to actually use.
	:param categories: Brown categories to draw from; when empty, all
		categories are used. The default list is never mutated.
	:return: list of tagged sentences.
	"""
	if not categories:
		categories = brown.categories() # use all of the brown categories
	sents = []
	for category in categories:
		total = len(brown.tagged_sents(categories=category))
		# Number of leading sentences to take. The original subtracted 1 and
		# sliced [0:n-1], which dropped one sentence and, for n == 0, turned
		# into [0:-1] (almost the whole category). Clamp at zero instead.
		count = max(0, int((1-TEST_PROPORTION) * use * total))
		sents += brown.tagged_sents(categories=category, simplify_tags=True)[:count]
	return sents
开发者ID:jyzhang,项目名称:py-nlp,代码行数:10,代码来源:pos.py


示例6: test_sentences

def test_sentences(categories=[]):
	"""Return a test sentence set: [[(word, tag), ..], [(word, tag), ..], ..]

	For every category, the last ``TEST_PROPORTION`` fraction of its tagged
	sentences (simplified tags) is appended to the result.

	:param categories: Brown categories to draw from; when empty, all
		categories are used. The default list is never mutated.
	:return: list of tagged sentences.
	"""
	if not categories:
		categories = brown.categories() # use all of the brown categories
	sents = []
	for category in categories:
		total = len(brown.tagged_sents(categories=category))
		count = int(TEST_PROPORTION * total) # use the last k sentences for test
		# Slice from an explicit non-negative index. The original [-k:-1]
		# always lost the final sentence and, for k == 0, became [0:-1]
		# (almost the whole category) instead of an empty selection.
		first = max(0, total - count)
		sents += brown.tagged_sents(categories=category, simplify_tags=True)[first:]
	return sents
开发者ID:jyzhang,项目名称:py-nlp,代码行数:10,代码来源:pos.py


示例7: exercise2

def exercise2():
    """Train a trigram tagger on Brown 'news' and print two evaluations.

    The tagger is scored on the 'news' sentences it was trained on and on
    the 'lore' sentences; both scores are printed between blank lines.
    """
    print('')
    print("Exercise 2:")

    news_sents = bn.tagged_sents(categories = 'news')
    lore_sents = bn.tagged_sents(categories = 'lore')

    tagger = nltk.TrigramTagger(news_sents)
    news_score = tagger.evaluate(news_sents)
    lore_score = tagger.evaluate(lore_sents)

    print("Evaluation of the trigram tagger on 'News': %f " % news_score)
    print("Evaluation of the trigram tagger on 'Lore': %f " % lore_score)
    print('')
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:11,代码来源:Girish_Srinivas_ch5b.py


示例8: precisionRecall

def precisionRecall():
    """Print a confusion matrix of tagger ``t2`` on Brown 'editorial'.

    Gold tags come straight from the corpus; test tags come from re-tagging
    the untagged sentences with ``t2`` (a name defined outside this function).
    """

    def tag_list(tagged_sents):
        # Flatten a list of tagged sentences into a flat list of tags.
        tags = []
        for sent in tagged_sents:
            for (word, tag) in sent:
                tags.append(tag)
        return tags

    def apply_tagger(tagger, corpus):
        # Strip the existing tags from each sentence and tag it afresh.
        retagged = []
        for sent in corpus:
            retagged.append(tagger.tag(nltk.tag.untag(sent)))
        return retagged

    gold = tag_list(brown.tagged_sents(categories='editorial'))
    predicted_sents = apply_tagger(t2, brown.tagged_sents(categories='editorial'))
    test = tag_list(predicted_sents)

    cm = nltk.ConfusionMatrix(gold, test)
    print(cm.pp(sort_by_count=True, show_percents=True, truncate=9))
开发者ID:AkiraKane,项目名称:Python,代码行数:12,代码来源:c06_evaluation.py


示例9: evaluate

   def evaluate(self):
      '''Score self.classifier on treebank, conll2000 and brown data.

      Uses the first 100 sentences of treebank and of conll2000, and the
      last 20% of brown. Returns the three scores, each multiplied by 100,
      as a tuple (treebank, conll2000, brown).
      '''
      treebank_sample = treebank.tagged_sents()[:100]
      treebank_result = (100*self.classifier.evaluate(treebank_sample))

      conll2000_sample = conll2000.tagged_sents()[:100]
      conll2000_result = (100*self.classifier.evaluate(conll2000_sample))

      # Everything from the 80% mark onwards.
      brown_sents = brown.tagged_sents()
      held_out_start = int(len(brown.tagged_sents())*0.8)
      brown_result = (100*self.classifier.evaluate(brown_sents[held_out_start:]))

      return (treebank_result, conll2000_result, brown_result)
开发者ID:okoye,项目名称:sentimentanalysis,代码行数:13,代码来源:speechtagger.py


示例10: testSet

def testSet():
    """Walk through three ways of splitting Brown data into train/test sets.

    Each split overwrites the previous one and nothing is returned; the
    function only demonstrates the constructions.
    """
    # 1) Shuffle the 'news' sentences; the first 10% become the test set.
    shuffled = list(brown.tagged_sents(categories='news'))
    random.shuffle(shuffled)
    cut = int(len(shuffled) * 0.1)
    train_set, test_set = shuffled[cut:], shuffled[:cut]

    # 2) Split by file: the first 10% of the 'news' files are for testing.
    news_files = brown.fileids(categories='news')
    cut = int(len(news_files) * 0.1)
    train_set = brown.tagged_sents(news_files[cut:])
    test_set = brown.tagged_sents(news_files[:cut])

    # 3) Split by category: train on 'news', test on 'fiction'.
    train_set = brown.tagged_sents(categories='news')
    test_set = brown.tagged_sents(categories='fiction')
开发者ID:AkiraKane,项目名称:Python,代码行数:14,代码来源:c06_evaluation.py


示例11: get_tagged_tokens

 def get_tagged_tokens(self, corpus=TAGGED, testing=False):
     """Tokenize the given corpus and tag the resulting token spans.

     A tagger is built from Brown tagged sentences via ``build_trainer``;
     ``self.tokenize_corpus(corpus)`` supplies the tokens and spans, and
     ``tag_token_spans`` combines the two. The combined result is returned.
     """
     # When testing, train on the 'news' category only so that building the
     # tagger does not take as long as with the full corpus.
     if testing:
         training_sents = brown.tagged_sents(categories='news')
     else:
         training_sents = brown.tagged_sents()
     tagger = build_trainer(training_sents)

     tokens_and_spans = self.tokenize_corpus(corpus)
     return tag_token_spans(tokens_and_spans, tagger)
开发者ID:bmw9t,项目名称:woolf,代码行数:14,代码来源:fset_manager.py


示例12: exercise1

def exercise1():
    """Train a unigram tagger on Brown 'news' and print its evaluations.

    The tagger is scored on the 'news' sentences it was trained on and on
    the 'lore' sentences, then used to tag the 'lore' sentence at index 200.
    """
    print('')
    print("Exercise 1:")

    news_sents = bn.tagged_sents(categories = 'news')
    lore_sents = bn.tagged_sents(categories = 'lore')

    tagger = nltk.UnigramTagger(news_sents)
    news_score = tagger.evaluate(news_sents)
    lore_score = tagger.evaluate(lore_sents)
    print("Evaluation of the unigram tagger on 'News': %f " % news_score)
    print("Evaluation of the unigram tagger on 'Lore': %f " % lore_score)

    # Tag one untagged 'lore' sentence as an illustration.
    lore_plain = bn.sents(categories = 'lore')
    tagged_sample = tagger.tag(lore_plain[200])
    print("Tagged words for 200th sentence of 'Brown' corpus of category 'Lore' is: ")
    print(tagged_sample)
    print('')
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:15,代码来源:Girish_Srinivas_ch5a.py


示例13: create_tagger

def create_tagger():
    """Build a backoff chain of taggers trained on the whole Brown corpus.

    The chain is trigram -> bigram -> unigram -> regexp. Progress messages
    are printed after each stage, and the trigram tagger is returned. This
    is meant to be called rarely, only when no pickled tagger was found.
    """
    print("Building tagger...")
    train_sents = brown.tagged_sents()

    # These regexes were lifted from the NLTK book tagger chapter.
    # NOTE: the '.' in the first pattern is unescaped, so it matches any
    # single character between the digit groups, not only a decimal point.
    fallback_patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),               # adjectives
        (r'.*ness$', 'NN'),               # nouns formed from adjectives
        (r'.*ly$', 'RB'),                 # adverbs
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # past tense verbs
        (r'.*', 'NN'),                    # nouns (default)
    ]
    t0 = nltk.RegexpTagger(fallback_patterns)
    print("got t0")

    # Each n-gram tagger backs off to the next simpler one.
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    print("got t1")

    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    print("got t2")

    t3 = nltk.TrigramTagger(train_sents, backoff=t2)
    print("Built tagger!")
    return t3
开发者ID:Jacob33123,项目名称:narorumo,代码行数:29,代码来源:postagger.py


示例14: read_datas

 def read_datas(self):
     """Split the tagged Brown corpus into training and test portions.

     The first 90% of the tagged sentences form the training set and the
     remaining sentences form the test set.

     :return: tuple ``(train_set, test_set)``.
     """
     brown_tagged_sentence = brown.tagged_sents()
     # The original also fetched brown.sents() into a local that was never
     # read; that unused lookup has been dropped.
     size = int(len(brown_tagged_sentence) * 0.9)
     train_set = brown_tagged_sentence[:size]
     test_set = brown_tagged_sentence[size:]
     return (train_set, test_set)
开发者ID:Nicolas99-9,项目名称:TERApprentissage,代码行数:7,代码来源:tagger.py


示例15: ch05_11_train_test_affix_tagger

def ch05_11_train_test_affix_tagger():
  """Build an AffixTagger from a word -> most frequent tag table and score it.

  The table is derived from the Brown 'news' category; the tagger is then
  evaluated on the 'editorial' category and the score is printed.
  """
  from nltk.corpus import brown
  word_freq = nltk.FreqDist(brown.words(categories="news"))
  tags_by_word = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))

  # For every word seen in 'news', remember its most frequent tag.
  most_freq_pos = {}
  for word in word_freq.keys():
    most_freq_pos[word] = tags_by_word[word].max()

  affix_tagger = nltk.AffixTagger(model=most_freq_pos)
  score = affix_tagger.evaluate(brown.tagged_sents(categories="editorial"))
  print(score)
开发者ID:447327642,项目名称:nltk-examples,代码行数:7,代码来源:ch05_ex.py


示例16: exploreTaggedCorpora

def exploreTaggedCorpora():
    """Run a few exploratory queries over the tagged Brown corpus.

    1. Collect the words that follow "often" in the 'learned' category
       (the sorted result is computed but not kept).
    2. Tabulate the tags of the words that follow "often".
    3. Print every verb / "TO" / verb word triple found in the corpus.
    4. Print each lower-cased 'news' word that has more than three tags.
    """
    # 1) Words following "often".
    brown_learned_text = brown.words(categories="learned")
    followers = set()
    for (a, b) in nltk.ibigrams(brown_learned_text):
        if a == "often":
            followers.add(b)
    sorted(followers)

    # 2) Tags of the words following "often".
    brown_lrnd_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    follower_tags = []
    for (a, b) in nltk.ibigrams(brown_lrnd_tagged):
        if a[0] == "often":
            follower_tags.append(b[1])
    fd = nltk.FreqDist(follower_tags)
    fd.tabulate()

    # 3) Verb-TO-verb triples.
    def process(sentence):
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
            if not t1.startswith("V"):
                continue
            if t2 != "TO":
                continue
            if not t3.startswith("V"):
                continue
            print("%s %s %s" % (w1, w2, w3))

    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)

    # 4) Words with more than three distinct tags.
    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)

    for word in data.conditions():
        if len(data[word]) <= 3:
            continue
        word_tags = data[word].keys()
        print("%s %s" % (word, " ".join(word_tags)))
开发者ID:AkiraKane,项目名称:Python,代码行数:25,代码来源:c05_tagger.py


示例17: __init__

 def __init__(self):
     """Load (or rebuild and save) the tagger and chunker, then read indexes.

     The tagger and the chunker are each loaded from a pickle file in the
     current directory; when that raises IOError they are retrained and
     the pickle is written again. Afterwards the actor, actress and title
     index files and ``numbers.txt`` are read into attributes.

     Every file is opened in a ``with`` block so its handle is closed even
     if reading or unpickling fails.
     """

     def read_index(path):
         # Each line is stripped and split once on the first space.
         with open(path) as index_file:
             return [line.strip().split(" ", 1) for line in index_file]

     # NOTE(review): unpickling can execute arbitrary code; the .pkl files
     # must come from a trusted source.
     try:
         with open("nerdb_tagger.pkl") as pickled:
             tagger = cPickle.load(pickled)
     except IOError:
         print("failed to load nerdb_tagger, recreating...")
         train_sents = conll2000.tagged_sents() + brown.tagged_sents()
         # Backoff chain: trigram -> bigram -> unigram -> default 'NN'.
         tagger = nltk.DefaultTagger("NN")
         tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
         tagger = nltk.BigramTagger(train_sents, backoff=tagger)
         tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
         with open("nerdb_tagger.pkl", "w") as pickled:
             cPickle.dump(tagger, pickled)
         print("done")

     try:
         with open("nerdb_chunker.pkl") as pickled:
             chunker = cPickle.load(pickled)
     except IOError:
         print("failed to load nerdb_chunker, recreating...")
         train_sents = conll2000.chunked_sents()
         chunker = ConsecutiveNPChunker(tagger, train_sents)
         with open("nerdb_chunker.pkl", "w") as pickled:
             cPickle.dump(chunker, pickled)
         print("done")

     self.chunker = chunker
     self.people = read_index("actors_index.txt")
     self.people += read_index("actresses_index.txt")
     self.movies = read_index("title_index.txt")
     self.entity_types = {"PERSON": self.people, "MOVIE": self.movies}

     # NOTE(review): eval() runs whatever numbers.txt contains as Python
     # code. Kept as in the original; if the file holds only literals,
     # ast.literal_eval would be a safer replacement - confirm first.
     with open("numbers.txt") as numbers_file:
         self.numbers = eval(numbers_file.read())
开发者ID:gabsl,项目名称:IMDBot,代码行数:26,代码来源:NERDb.py


示例18: auto_tag

def auto_tag(company):
    """
    tag a given text using brown corpus and unigram tagger
    :param company: company whose reviews are tagged
    :return: a list of tagged words; an empty list when no review file
        exists for the company (a message is printed in that case)
    """
    # first deal with unique cases such as General Motors => GM
    if company == 'General Motors':
        company = 'GM'
    elif company == 'Ford Motor Company':
        company = 'Ford'

    # NOTE(review): str.capitalize() lower-cases everything after the first
    # character, so 'GM' becomes 'Gm' here - confirm the file naming.
    review_path = ('/Users/vickyzhang/Documents/Python/chart/comp/review/'
                   + company.capitalize() + '_review.txt')

    # Read the review first: if it is missing there is nothing to tag, and
    # the (expensive) corpus and tagger work below can be skipped entirely.
    try:
        with open(review_path) as review_file:
            text = review_file.read()
    except FileNotFoundError:
        print('The system doesn\'t have a review for the company you entered. Please enter another company.')
        # The original fell through and then failed on the unbound `text`.
        return []

    # normalize (tokenize and lowercase-ize) each word in the string
    text_token = nltk.word_tokenize(text)
    text_normal = [w.lower() for w in text_token]

    # build unigram tagger based on brown corpus, and use it to tag the normalized text
    brown_tagged_sents = brown.tagged_sents(categories='news', tagset='universal')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    text_tagged = unigram_tagger.tag(text_normal)
    return text_tagged
开发者ID:vicher37,项目名称:jobchart,代码行数:28,代码来源:review_summary.py


示例19: get_pos_tagger

    def get_pos_tagger(self):
        """Build and return the part-of-speech tagger.

        The result is a regexp tagger that assigns 'ex_quant' and
        'univ_quant' to a few determiners and otherwise backs off through
        trigram, bigram and unigram taggers trained on Brown 'news', down
        to a general regexp tagger.
        """
        from nltk.corpus import brown

        # Last-resort patterns, tried in order.
        fallback_patterns = [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),  # adjectives
            (r'.*ness$', 'NN'),  # nouns formed from adjectives
            (r'.*ly$', 'RB'),  # adverbs
            (r'.*s$', 'NNS'),  # plural nouns
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # past tense verbs
            (r'.*', 'NN'),  # nouns (default)
        ]
        regexp_tagger = RegexpTagger(fallback_patterns)

        # N-gram taggers, each backing off to the next simpler one.
        brown_train = brown.tagged_sents(categories='news')
        unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
        bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
        trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

        # Override particular words
        override_patterns = [
            (r'(A|a|An|an)$', 'ex_quant'),
            (r'(Every|every|All|all)$', 'univ_quant'),
        ]
        return RegexpTagger(override_patterns, backoff=trigram_tagger)
开发者ID:prz3m,项目名称:kind2anki,代码行数:28,代码来源:glue.py


示例20: getTaggerAndTestSetInSimplifiedMode

def getTaggerAndTestSetInSimplifiedMode(taggerName):
    """Return ``(tagger, test_sentences)`` for the requested tagger name.

    All taggers are trained on Brown 'news' with simplified tags, minus the
    first 100 sentences, which form the test set. Recognised names are
    "DefaultTagger", "RegExpTagger", "AffixTagger", "UnigramTagger" and
    "BigramTagger"; any other name yields None.
    """
    news_sents = brown.tagged_sents(categories='news', simplify_tags=True)
    brown_trainS = news_sents[100:]
    brown_testS = news_sents[:100]

    # Backoff chain, simplest first: default -> regexp -> affix -> unigram
    # -> bigram. Every tagger is built regardless of the name requested.
    nn_taggerS = nltk.DefaultTagger('NN')
    patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),   # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),   # articles
        (r'.*able$', 'JJ'),                # adjectives
        (r'.*ness$', 'NN'),                # nouns formed from adjectives
        (r'.*ly$', 'RB'),                  # adverbs
        (r'.*s$', 'NNS'),                  # plural nouns
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # past tense verbs
        (r'.*', 'NN'),                     # nouns (default)
    ]
    regexp_taggerS = nltk.RegexpTagger(patterns, backoff=nn_taggerS)
    at2S = nltk.AffixTagger(brown_trainS, backoff=regexp_taggerS)
    ut3S = nltk.UnigramTagger(brown_trainS, backoff=at2S)
    ct2S = nltk.NgramTagger(2, brown_trainS, backoff=ut3S)

    # Pick the tagger by name; the checks run in the original order.
    if taggerName == "DefaultTagger":
        return nn_taggerS, brown_testS
    if taggerName == "RegExpTagger":
        return regexp_taggerS, brown_testS
    if taggerName == "AffixTagger":
        return at2S, brown_testS
    if taggerName == "UnigramTagger":
        return ut3S, brown_testS
    if taggerName == "BigramTagger":
        return ct2S, brown_testS
    return None
开发者ID:atiassa,项目名称:recommend-2011,代码行数:33,代码来源:q3_2.py



注:本文中的nltk.corpus.brown.tagged_sents函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python brown.tagged_words函数代码示例发布时间:2022-05-27
下一篇:
Python brown.sents函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap