本文整理汇总了Python中nltk.corpus.brown.tagged_sents函数的典型用法代码示例。如果您正苦于以下问题:Python tagged_sents函数的具体用法?Python tagged_sents怎么用?Python tagged_sents使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了tagged_sents函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: main
def main():
# run Simple unigram tagger
brown_news_tagged = brown.tagged_sents(categories='news')
brown_train = brown_news_tagged[100:]
brown_test = brown_news_tagged[:100]
nn_tagger = nltk.DefaultTagger('NN')
ut2 = nltk.UnigramTagger(brown_train, backoff=nn_tagger)
simpleUnigramTagger = SimpleUnigramTagger(brown_train, backoff=nn_tagger)
print 'Simple Unigram tagger accuracy: %4.1f%%' % ( 100.0 * simpleUnigramTagger.evaluate(brown_test))
print 'Unigram tagger accuracy: %4.1f%%' % ( 100.0 * ut2.evaluate(brown_test))
# run affix tagger with entropy
brown_news_tagged = brown.tagged_sents(categories='news')
brown_train = brown_news_tagged[:int(0.8*len(brown_news_tagged))]
rest = brown_news_tagged[int(0.8*len(brown_news_tagged)):]
brown_development = rest[:int(0.5*len(rest))]
brown_test = rest[int(0.5*len(rest)):]
affix_tagger = nltk.AffixTagger(brown_train, backoff= nltk.DefaultTagger('NN') , cutoff=2)
nltk.AffixTagger._train = _train
nltk.AffixTagger.H = _H
optcutoff = optimize_parameter()
print "the optimal cutoff param is: %d " % optcutoff
affix_tagger2 = nltk.AffixTagger(brown_train, backoff= nltk.DefaultTagger('NN') , cutoff=optcutoff)
print 'Unigram tagger accuracy: %4.1f%%' % ( 100.0 * affix_tagger.evaluate(brown_test))
print 'Unigram tagger accuracy with entropy: %4.1f%%' % ( 100.0 * affix_tagger2.evaluate(brown_test))
开发者ID:atiassa,项目名称:recommend-2011,代码行数:28,代码来源:q2.py
示例2: __init__
def __init__(self):
'''initialize and train brill and naive bayes classifiers'''
#TODO: Fix bug where it loads tagger from calling module dir
if exists(file):
input = open(file, 'rb')
self.classifier = load(input)
input.close()
print 'Successfully loaded saved classifier'
return
self.bayes = NaiveBayesTagger()
boundary = int(len(brown.tagged_sents())*0.8)
train = brown.tagged_sents(simplify_tags=True)[:boundary]
brill_trainer = FastBrillTaggerTrainer(initial_tagger = self.bayes,
templates = templates,
trace = 3,
deterministic = True)
self.classifier = brill_trainer.train(train, max_rules=10)
print 'Saving Taggers to file: "pos_tagger.pickle"'
output = open(file, 'wb')
dump(self.classifier, output, 1)
output.close()
开发者ID:okoye,项目名称:sentimentanalysis,代码行数:26,代码来源:speechtagger.py
示例3: demo
def demo(train_size=100, test_size=100, java_home=None, mallet_home=None):
    """Train a MalletCRF tagger on Brown 'news' and score it on 'editorial'."""
    from nltk.corpus import brown
    import textwrap

    def fd(sentence, index):
        # Minimal feature detector: the word itself, its two-character
        # suffix, and its length.
        token = sentence[index]
        return dict(word=token, suffix=token[-2:], len=len(token))

    # Point nltk at the local java & mallet installations.
    nltk.internals.config_java(java_home)
    nltk.classify.mallet.config_mallet(mallet_home)

    def strip(corpus):
        # Simplify the tagset: keep just the first 2 chars of each tag.
        return [[(w, t[:2]) for (w, t) in sent] for sent in corpus]

    brown_train = strip(brown.tagged_sents(categories='news')[:train_size])
    brown_test = strip(brown.tagged_sents(categories='editorial')[:test_size])
    crf = MalletCRF.train(fd, brown_train,  # '/tmp/crf-model',
                          transduction_type='VITERBI')
    sample_output = crf.tag([w for (w, t) in brown_test[5]])
    acc = nltk.tag.accuracy(crf, brown_test)
    print('\nAccuracy: %.1f%%' % (acc * 100))
    print('Sample output:')
    print(textwrap.fill(' '.join('%s/%s' % w for w in sample_output),
                        initial_indent=' ', subsequent_indent=' ') + '\n')
    # Clean up the model file Mallet left behind.
    print('Clean-up: deleting', crf.filename)
    os.remove(crf.filename)
    return crf
开发者ID:BohanHsu,项目名称:developer,代码行数:34,代码来源:crf.py
示例4: demo
def demo(train_size=100, test_size=100, java_home="/usr/local/jdk1.5.0/", mallet_home="/usr/local/mallet-0.4"):
from nltk.corpus import brown
import textwrap
# Define a very simple feature detector
def fd(sentence, index):
word = sentence[index]
return dict(word=word, suffix=word[-2:], len=len(word))
# Let nltk know where java & mallet are.
nltk.internals.config_java(java_home)
nltk.classify.mallet.config_mallet(mallet_home)
# Get the training & test corpus. We simplify the tagset a little:
# just the first 2 chars.
def strip(corpus):
return [[(w, t[:2]) for (w, t) in sent] for sent in corpus]
brown_train = strip(brown.tagged_sents(categories="news")[:train_size])
brown_test = strip(brown.tagged_sents(categories="editorial")[:test_size])
crf = MalletCRF.train(fd, brown_train, transduction_type="VITERBI") #'/tmp/crf-model',
sample_output = crf.tag([w for (w, t) in brown_test[5]])
acc = nltk.tag.accuracy(crf, brown_test)
print "\nAccuracy: %.1f%%" % (acc * 100)
print "Sample output:"
print textwrap.fill(
" ".join("%s/%s" % w for w in sample_output), initial_indent=" ", subsequent_indent=" "
) + "\n"
# Clean up
print "Clean-up: deleting", crf.filename
os.remove(crf.filename)
return crf
开发者ID:sneilan,项目名称:EverythingIveDoneOverTheYears,代码行数:35,代码来源:crf.py
示例5: training_sentences
def training_sentences(use=1.0, categories=None):
    """returns a training sentence set: [[(word, tag), ..], [(word, tag), ..], ..]

    use        -- fraction of each category's training portion to include (0.0-1.0)
    categories -- Brown categories to draw from; all categories when empty/None
    """
    # FIX: the original signature used the mutable default `categories=[]`.
    if not categories:
        categories = brown.categories()  # use all of the brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        # Use the first n sentences for training. Renamed from `max`,
        # which shadowed the builtin.
        limit = int((1 - TEST_PROPORTION) * use * total) - 1
        sents += brown.tagged_sents(categories=category, simplify_tags=True)[0:limit]
    return sents
开发者ID:jyzhang,项目名称:py-nlp,代码行数:10,代码来源:pos.py
示例6: test_sentences
def test_sentences(categories=None):
    """returns a test sentence set: [[(word, tag), ..], [(word, tag), ..], ..]

    categories -- Brown categories to draw from; all categories when empty/None
    """
    # FIX: the original signature used the mutable default `categories=[]`.
    if not categories:
        categories = brown.categories()  # use all of the brown categories
    sents = []
    for category in categories:
        total = len(brown.tagged_sents(categories=category))
        start = int(TEST_PROPORTION * total)  # use the last k sentences for test
        # BUG FIX: the original slice was [-start:-1], which silently dropped
        # the final sentence of every category; [-start:] keeps it.
        sents += brown.tagged_sents(categories=category, simplify_tags=True)[-start:]
    return sents
开发者ID:jyzhang,项目名称:py-nlp,代码行数:10,代码来源:pos.py
示例7: exercise2
def exercise2():
print
print "Exercise 2:"
brown_news_tagged_sents = bn.tagged_sents(categories = 'news')
brown_lore_tagged_sents = bn.tagged_sents(categories = 'lore')
trigram_tagger = nltk.TrigramTagger(brown_news_tagged_sents)
brown_news_eval = trigram_tagger.evaluate(brown_news_tagged_sents)
brown_lore_eval = trigram_tagger.evaluate(brown_lore_tagged_sents)
print "Evaluation of the trigram tagger on 'News': %f " % brown_news_eval
print "Evaluation of the trigram tagger on 'Lore': %f " % brown_lore_eval
print
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:11,代码来源:Girish_Srinivas_ch5b.py
示例8: precisionRecall
def precisionRecall():
def tag_list(tagged_sents):
return [tag for sent in tagged_sents for (word, tag) in sent]
def apply_tagger(tagger, corpus):
return [tagger.tag(nltk.tag.untag(sent)) for sent in corpus]
gold = tag_list(brown.tagged_sents(categories='editorial'))
test = tag_list(apply_tagger(t2, brown.tagged_sents(categories='editorial')))
cm = nltk.ConfusionMatrix(gold, test)
print cm.pp(sort_by_count=True, show_percents=True, truncate=9)
开发者ID:AkiraKane,项目名称:Python,代码行数:12,代码来源:c06_evaluation.py
示例9: evaluate
def evaluate(self):
    '''run tests on conll2000 and treebank data'''
    # Each result is an accuracy percentage (0-100) of self.classifier
    # over the given corpus slice.
    treebank_result = 100 * self.classifier.evaluate(treebank.tagged_sents()[:100])
    conll2000_result = 100 * self.classifier.evaluate(conll2000.tagged_sents()[:100])
    # Brown: score on the final 20% (the portion not used for training).
    boundary = int(len(brown.tagged_sents()) * 0.8)
    brown_result = 100 * self.classifier.evaluate(brown.tagged_sents()[boundary:])
    return (treebank_result, conll2000_result, brown_result)
开发者ID:okoye,项目名称:sentimentanalysis,代码行数:13,代码来源:speechtagger.py
示例10: testSet
def testSet():
    # Demonstrates three alternative ways of building a train/test split;
    # note that each assignment below OVERWRITES the previous
    # train_set/test_set, so only the last pair survives.
    # 1) Random 90/10 split of shuffled 'news' sentences.
    tagged_sents = list(brown.tagged_sents(categories='news'))
    random.shuffle(tagged_sents)
    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]
    # 2) Split by file ids, so test sentences come from unseen documents.
    file_ids = brown.fileids(categories='news')
    size = int(len(file_ids) * 0.1)
    train_set = brown.tagged_sents(file_ids[size:])
    test_set = brown.tagged_sents(file_ids[:size])
    # 3) Train and test on entirely different genres.
    train_set = brown.tagged_sents(categories='news')
    test_set = brown.tagged_sents(categories='fiction')
开发者ID:AkiraKane,项目名称:Python,代码行数:14,代码来源:c06_evaluation.py
示例11: get_tagged_tokens
def get_tagged_tokens(self, corpus=TAGGED, testing=False):
    """This tokenizes, segments, and tags all the files in a directory."""
    # In testing mode, train against only the 'news' category so runs
    # don't take years; otherwise train on the full Brown corpus.
    training_sents = (brown.tagged_sents(categories='news') if testing
                      else brown.tagged_sents())
    tagger = build_trainer(training_sents)
    tokens_and_spans = self.tokenize_corpus(corpus)
    return tag_token_spans(tokens_and_spans, tagger)
开发者ID:bmw9t,项目名称:woolf,代码行数:14,代码来源:fset_manager.py
示例12: exercise1
def exercise1():
print
print "Exercise 1:"
brown_news_tagged_sents = bn.tagged_sents(categories = 'news')
brown_lore_tagged_sents = bn.tagged_sents(categories = 'lore')
unigram_tagger = nltk.UnigramTagger(brown_news_tagged_sents)
brown_news_eval = unigram_tagger.evaluate(brown_news_tagged_sents)
brown_lore_eval = unigram_tagger.evaluate(brown_lore_tagged_sents)
print "Evaluation of the unigram tagger on 'News': %f " % brown_news_eval
print "Evaluation of the unigram tagger on 'Lore': %f " % brown_lore_eval
brown_lore = bn.sents(categories = 'lore')
b_lore = unigram_tagger.tag(brown_lore[200])
print "Tagged words for 200th sentence of 'Brown' corpus of category 'Lore' is: "
print b_lore
print
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:15,代码来源:Girish_Srinivas_ch5a.py
示例13: create_tagger
def create_tagger():
"""Train a tagger from the Brown Corpus. This should not be called very
often; only in the event that the tagger pickle wasn't found."""
print "Building tagger..."
train_sents = brown.tagged_sents()
# These regexes were lifted from the NLTK book tagger chapter.
t0 = nltk.RegexpTagger(
[(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
(r'(The|the|A|a|An|an)$', 'AT'), # articles
(r'.*able$', 'JJ'), # adjectives
(r'.*ness$', 'NN'), # nouns formed from adjectives
(r'.*ly$', 'RB'), # adverbs
(r'.*s$', 'NNS'), # plural nouns
(r'.*ing$', 'VBG'), # gerunds
(r'.*ed$', 'VBD'), # past tense verbs
(r'.*', 'NN') # nouns (default)
])
print "got t0"
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
print "got t1"
t2 = nltk.BigramTagger(train_sents, backoff=t1)
print "got t2"
t3 = nltk.TrigramTagger(train_sents, backoff=t2)
print "Built tagger!"
return t3
开发者ID:Jacob33123,项目名称:narorumo,代码行数:29,代码来源:postagger.py
示例14: read_datas
def read_datas(self):
    """Return a (train_set, test_set) pair of Brown tagged sentences,
    split 90% train / 10% test."""
    brown_tagged_sentence = brown.tagged_sents()
    # FIX: removed an unused local (`brown.sents()`) that the original
    # fetched and never read.
    size = int(len(brown_tagged_sentence) * 0.9)
    train_set = brown_tagged_sentence[:size]
    test_set = brown_tagged_sentence[size:]
    return (train_set, test_set)
开发者ID:Nicolas99-9,项目名称:TERApprentissage,代码行数:7,代码来源:tagger.py
示例15: ch05_11_train_test_affix_tagger
def ch05_11_train_test_affix_tagger():
from nltk.corpus import brown
fd = nltk.FreqDist(brown.words(categories="news"))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))
most_freq_pos = dict((word, cfd[word].max()) for word in fd.keys())
affix_tagger = nltk.AffixTagger(model=most_freq_pos)
print affix_tagger.evaluate(brown.tagged_sents(categories="editorial"))
开发者ID:447327642,项目名称:nltk-examples,代码行数:7,代码来源:ch05_ex.py
示例16: exploreTaggedCorpora
def exploreTaggedCorpora():
    # Survey which parts of speech follow the word "often" in the
    # 'learned' category.
    brown_learned_text = brown.words(categories="learned")
    # NOTE(review): the sorted(...) result is discarded — this looks like
    # a leftover from an interactive session; confirm before removing.
    sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == "often"))
    brown_lrnd_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == "often"]
    fd = nltk.FreqDist(tags)
    fd.tabulate()
    def process(sentence):
        # Print every VERB + "TO" + VERB trigram in the sentence.
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
            if t1.startswith("V") and t2 == "TO" and t3.startswith("V"):
                print w1, w2, w3
    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)
    # List lowercased 'news' words that appear with more than three
    # distinct tags, together with those tags.
    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)
    for word in data.conditions():
        if len(data[word]) > 3:
            tags = data[word].keys()
            print word, " ".join(tags)
开发者ID:AkiraKane,项目名称:Python,代码行数:25,代码来源:c05_tagger.py
示例17: __init__
def __init__(self):
try:
tagger = cPickle.load(open("nerdb_tagger.pkl"))
except IOError:
print "failed to load nerdb_tagger, recreating..."
train_sents = conll2000.tagged_sents() + brown.tagged_sents()
tagger = nltk.DefaultTagger("NN")
tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
tagger = nltk.BigramTagger(train_sents, backoff=tagger)
tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
cPickle.dump(tagger, open("nerdb_tagger.pkl", "w"))
print "done"
try:
chunker = cPickle.load(open("nerdb_chunker.pkl"))
except IOError:
print "failed to load nerdb_chunker, recreating..."
train_sents = conll2000.chunked_sents()
chunker = ConsecutiveNPChunker(tagger, train_sents)
cPickle.dump(chunker, open("nerdb_chunker.pkl", "w"))
print "done"
self.chunker = chunker
self.people = [line.strip().split(" ", 1) for line in open("actors_index.txt").readlines()]
self.people += [line.strip().split(" ", 1) for line in open("actresses_index.txt").readlines()]
self.movies = [line.strip().split(" ", 1) for line in open("title_index.txt").readlines()]
self.entity_types = {"PERSON": self.people, "MOVIE": self.movies}
self.numbers = eval(open("numbers.txt").read())
开发者ID:gabsl,项目名称:IMDBot,代码行数:26,代码来源:NERDb.py
示例18: auto_tag
def auto_tag(company):
    """
    tag a given text using brown corpus and unigram tagger
    :param company: company whose reviews are tagged
    :return: a list of tagged (word, tag) pairs, or None when no review
             file exists for the given company
    """
    brown_tagged_sents = brown.tagged_sents(categories='news', tagset='universal')
    # FIX: removed the unused `brown.sents()` local the original fetched.
    # open the review of a company, and print error message if company review doesn't exist
    # first deal with unique cases such as General Motors => GM
    if company == 'General Motors':
        company = 'GM'
    elif company == 'Ford Motor Company':
        company = 'Ford'
    # NOTE(review): hard-coded absolute path; consider making configurable.
    try:
        text = open('/Users/vickyzhang/Documents/Python/chart/comp/review/' + company.capitalize() + '_review.txt').read()
    except FileNotFoundError:
        print('The system doesn\'t have a review for the company you entered. Please enter another company.')
        # BUG FIX: the original fell through here and crashed with a
        # NameError on `text`; bail out instead.
        return None
    # normalize (tokenize and lowercase-ize) each word in the string
    text_token = nltk.word_tokenize(text)
    text_normal = [w.lower() for w in text_token]
    # build unigram tagger based on brown corpus, and use it to tag the normalized text
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    text_tagged = unigram_tagger.tag(text_normal)
    return text_tagged
开发者ID:vicher37,项目名称:jobchart,代码行数:28,代码来源:review_summary.py
示例19: get_pos_tagger
def get_pos_tagger(self):
    """Build a POS tagger: regexp baseline -> unigram -> bigram -> trigram,
    topped by a regexp layer that re-tags quantifier words for glue semantics."""
    from nltk.corpus import brown
    fallback_patterns = [
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),               # adjectives
        (r'.*ness$', 'NN'),               # nouns formed from adjectives
        (r'.*ly$', 'RB'),                 # adverbs
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # past tense verbs
        (r'.*', 'NN'),                    # nouns (default)
    ]
    brown_train = brown.tagged_sents(categories='news')
    # Each n-gram tagger backs off to the level below it.
    tagger = RegexpTagger(fallback_patterns)
    tagger = UnigramTagger(brown_train, backoff=tagger)
    tagger = BigramTagger(brown_train, backoff=tagger)
    tagger = TrigramTagger(brown_train, backoff=tagger)
    # Override particular words
    return RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'),
         (r'(Every|every|All|all)$', 'univ_quant')],
        backoff=tagger,
    )
开发者ID:prz3m,项目名称:kind2anki,代码行数:28,代码来源:glue.py
示例20: getTaggerAndTestSetInSimplifiedMode
def getTaggerAndTestSetInSimplifiedMode(taggerName):
    """Return a (tagger, test_set) pair for the requested tagger name,
    trained on Brown 'news' with simplified tags. The first 100 sentences
    are the test set. Unrecognized names return None (implicitly)."""
    tagged = brown.tagged_sents(categories='news', simplify_tags=True)
    train_sents = tagged[100:]
    test_sents = tagged[:100]
    default_tagger = nltk.DefaultTagger('NN')
    regexp_tagger = nltk.RegexpTagger([
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'(The|the|A|a|An|an)$', 'AT'),  # articles
        (r'.*able$', 'JJ'),               # adjectives
        (r'.*ness$', 'NN'),               # nouns formed from adjectives
        (r'.*ly$', 'RB'),                 # adverbs
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'.*ing$', 'VBG'),               # gerunds
        (r'.*ed$', 'VBD'),                # past tense verbs
        (r'.*', 'NN'),                    # nouns (default)
    ], backoff=default_tagger)
    # Backoff chain: bigram -> unigram -> affix -> regexp -> default.
    affix_tagger = nltk.AffixTagger(train_sents, backoff=regexp_tagger)
    unigram_tagger = nltk.UnigramTagger(train_sents, backoff=affix_tagger)
    bigram_tagger = nltk.NgramTagger(2, train_sents, backoff=unigram_tagger)
    # Flat dispatch instead of the original nested if/else ladder.
    if taggerName == "DefaultTagger":
        return default_tagger, test_sents
    elif taggerName == "RegExpTagger":
        return regexp_tagger, test_sents
    elif taggerName == "AffixTagger":
        return affix_tagger, test_sents
    elif taggerName == "UnigramTagger":
        return unigram_tagger, test_sents
    elif taggerName == "BigramTagger":
        return bigram_tagger, test_sents
开发者ID:atiassa,项目名称:recommend-2011,代码行数:33,代码来源:q3_2.py
注:本文中的nltk.corpus.brown.tagged_sents函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论