本文整理汇总了Python中nltk.corpus.brown.tagged_words函数的典型用法代码示例。如果您正苦于以下问题：Python tagged_words函数的具体用法？Python tagged_words怎么用？Python tagged_words使用的例子？那么恭喜您，这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了tagged_words函数的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: exercise3
def exercise3():
    """Print two word/tag ambiguity statistics for the Brown corpus.

    Part 1 reports how many distinct word forms occur with exactly five
    different tags.  Part 2 lists the word form(s) that occur with the
    largest number of distinct tags, followed by those tags.

    Uses the module-level names ``nltk`` and ``bn`` (the corpus reader whose
    ``tagged_words()`` yields ``(word, tag)`` pairs).  Output is written with
    single-argument ``print(...)`` calls, which behave the same under
    Python 2 and Python 3.
    """
    print("")
    print("Exercise 3")
    print("Part 1")
    # word -> distribution of the tags seen with it; built once and shared by
    # both parts so the corpus is only scanned a single time.
    tags_by_word = nltk.ConditionalFreqDist(bn.tagged_words())
    distinct_tag_counts = dict(
        (word, len(tags_by_word[word])) for word in tags_by_word.conditions()
    )
    count = sum(1 for n in distinct_tag_counts.values() if n == 5)
    print("Number of words which have exactly 5 different tags: %d" % count)
    print("")
    print("Part 2")
    print("Words which have the most distinct tags are: ")
    # Report the words whose number of distinct tags equals the corpus-wide
    # maximum, which is what the heading above promises.
    if distinct_tag_counts:
        most = max(distinct_tag_counts.values())
        winners = sorted(w for (w, n) in distinct_tag_counts.items() if n == most)
        for word in winners:
            print("Word: " + word)
            print(sorted(tags_by_word[word].keys()))
            print("")
    print("")
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:29,代码来源:Girish_Srinivas_ch5a.py
示例2: __init__
def __init__(self):
    """Create the count tables, train on Brown, and build the tag bigram model."""
    from nltk.corpus import brown

    # Tagged Brown tokens are the training data.  (An earlier, commented-out
    # variant read nltk.corpus.treebank.tagged_words() instead.)
    corpus = brown.tagged_words()
    # Sequence of tags only, in corpus order; input for the tag n-gram model.
    tag_corpus = [tag for (word, tag) in brown.tagged_words()]

    # Count tables start empty and are handed to self.train() via the corpus.
    self.wordCounts = collections.defaultdict(int)
    self.tagCounts = collections.defaultdict(int)
    self.wordTagCounts = collections.defaultdict(int)
    self.wordTagList = {}
    self.totalTag = 0
    self.train(corpus)

    # _estimator is the probability estimator in use; commented-out
    # alternatives were LidstoneProbDist(fdist, 0.2) and
    # WittenBellProbDist(fdist, 0.2).
    self.tagLM = NgramModel(2, tag_corpus, _estimator)
开发者ID:tonyqtian,项目名称:sentence_checker,代码行数:25,代码来源:WordTagModel.py
示例3: verb_stem
# Holder for the lazily-built set of (word, tag) pairs in the Brown corpus.
# It is filled on first use so the corpus is scanned once, not on every call.
_BROWN_WORD_TAG_PAIRS = []


def _brown_word_tag_pairs():
    """Return the set of all (word, tag) pairs in Brown, building it on first use."""
    if not _BROWN_WORD_TAG_PAIRS:
        _BROWN_WORD_TAG_PAIRS.append(frozenset(brown.tagged_words()))
    return _BROWN_WORD_TAG_PAIRS[0]


def verb_stem(s):
    """Extract the stem from the 3sg form of a verb, or return the empty string.

    The spelling rules are tried in order and every rule is anchored at the
    end of the word, so only words that really end in the suffix are stemmed:

    * ``has``                                  -> ``have``
    * vowel + ``ys``                           -> drop ``s``      (pays)
    * ``ies``                                  -> drop ``s`` for four-letter
      words (dies), otherwise replace with ``y``                  (flies)
    * ``oes/xes/ches/shes/sses/zzes``          -> drop ``es``     (goes)
    * any other word ending in ``s``           -> drop ``s``      (likes)

    The candidate stem is accepted only if ``s`` occurs in Brown tagged
    ``VBZ`` or the stem occurs tagged ``VB``.

    :param s: the word to analyse.
    :return: the stem, or ``''`` when no rule applies or the corpus check fails.
    """
    if s == "has":
        stem = "have"
    elif re.match(r".*[aeiou]ys$", s):
        stem = s[:-1]
    elif s.endswith("ies"):
        stem = s[:-1] if len(s) == 4 else s[:-3] + "y"
    elif re.match(r".*(oes|xes|ches|shes|sses|zzes)$", s):
        stem = s[:-2]
    elif s.endswith("s"):
        # '!' is a literal character in a regular expression, not a negation,
        # so alternatives such as '!sses' or '!ys' can only match words that
        # contain an exclamation mark.  What remains of those patterns is
        # "drop the final s", which is what this branch does.
        stem = s[:-1]
    else:
        # No rule applies: not a 3sg form, so no corpus lookup is needed.
        return ""

    if not stem:
        # s was just "s"; an empty stem is never a valid answer.
        return ""

    pairs = _brown_word_tag_pairs()
    if (s, "VBZ") in pairs or (stem, "VB") in pairs:
        return stem
    return ""
开发者ID:mtkent,项目名称:PFaNL-Assignment-2---Fall-2015,代码行数:34,代码来源:statements.py
示例4: exercise2
def exercise2(category):
    """Print four tag-based statistics for one Brown corpus category.

    Part 1: number of distinct (word, 'JJ') pairs.
    Part 2: the first ten tokens tagged VBZ, NNPS or NNS.
    Part 3: the three most frequent lower-cased trigrams whose tags start
            with IN, AT and NN respectively.
    Part 4: counts of tokens matching the masculine and feminine pronoun
            patterns.

    Uses the module-level names ``nltk``, ``re`` and ``bn`` (the corpus
    reader).

    :param category: corpus category name, passed as ``categories=``.
    """
    print("")
    print("For Category: " + category)
    print("Part 1")
    print("Words with the tag 'JJ':")
    words = bn.tagged_words(categories=category)
    wordlist = bn.words(categories=category)
    # A set is unordered, so sorting before building it would be wasted work.
    words_JJ = set((word, tag) for (word, tag) in words if tag == 'JJ')
    print(len(words_JJ))
    print("")

    print("Part 2")
    print("Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:")
    words_VBP_NNPS_NNS = [(word, tag) for (word, tag) in words
                          if tag in ('VBZ', 'NNPS', 'NNS')]
    print(words_VBP_NNPS_NNS[:10])
    print("")

    print("Part 3")
    print("The 3 most frequent 3-word prepositional phrases are:")
    # Count each phrase directly.  Joining all phrases with "." and splitting
    # again would break phrases containing a token with a period in it and
    # would also count one spurious empty string.
    phrase_counts = nltk.FreqDist(
        " ".join((w1.lower(), w2.lower(), w3.lower()))
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(words)
        if t1.startswith('IN') and t2.startswith('AT') and t3.startswith('NN')
    )
    print(phrase_counts.most_common(3))
    print("")

    print("Part 4")
    print("Ratio of Masculine to Feminine is:")
    # Compile once; the patterns are applied to every token of the category.
    male_pattern = re.compile(r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b')
    female_pattern = re.compile(r'\bshe\b|\bher\b|\bhers\b|\bherself\b')
    male_pronouns = 0
    female_pronouns = 0
    for w in wordlist:
        lowered = w.lower()
        if male_pattern.search(lowered):
            male_pronouns += 1
        if female_pattern.search(lowered):
            female_pronouns += 1
    print("Male : Female is -> %d : %d" % (male_pronouns, female_pronouns))
    print("")
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:35,代码来源:Girish_Srinivas_ch5a.py
示例5: exploreTaggedCorpora
def exploreTaggedCorpora():
    """Print several tag-based explorations of the Brown corpus.

    In order: the distinct words that follow "often" in the "learned"
    category; a table of the (simplified) tags that follow "often"; every
    verb-TO-verb word triple in the tagged sentences; and every lower-cased
    news word that occurs with more than three distinct simplified tags.

    Uses the module-level names ``nltk`` and ``brown``.
    """
    brown_learned_text = brown.words(categories="learned")
    # Words seen immediately after "often"; printed so the result is visible.
    print(sorted(set(b for (a, b) in nltk.ibigrams(brown_learned_text) if a == "often")))

    brown_lrnd_tagged = brown.tagged_words(categories="learned", simplify_tags=True)
    # Tag of the token right after each occurrence of "often".
    tags = [b[1] for (a, b) in nltk.ibigrams(brown_lrnd_tagged) if a[0] == "often"]
    fd = nltk.FreqDist(tags)
    fd.tabulate()

    def process(sentence):
        """Print each word triple tagged V*, TO, V* in one tagged sentence."""
        for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
            if t1.startswith("V") and t2 == "TO" and t3.startswith("V"):
                print("%s %s %s" % (w1, w2, w3))

    for tagged_sent in brown.tagged_sents():
        process(tagged_sent)

    brown_news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    data = nltk.ConditionalFreqDist((word.lower(), tag) for (word, tag) in brown_news_tagged)
    for word in data.conditions():
        if len(data[word]) > 3:
            tags = data[word].keys()
            print("%s %s" % (word, " ".join(tags)))
开发者ID:AkiraKane,项目名称:Python,代码行数:25,代码来源:c05_tagger.py
示例6: tagged_token_representation
def tagged_token_representation():
    """Show a tagged token, then tag statistics for the Brown news category."""
    print(nltk.tag.str2tuple("fly/NN"))

    from nltk.corpus import brown
    print(brown.tagged_words())

    # Distribution of the simplified tags in the news category.
    news_tagged = brown.tagged_words(categories="news", simplify_tags=True)
    tag_fd = nltk.FreqDist(pair[1] for pair in news_tagged)
    print(tag_fd)
    tag_fd.plot(cumulative=True)

    # Distribution of the tags found directly before a token tagged "N".
    before_noun = nltk.FreqDist(
        first[1]
        for (first, second) in nltk.bigrams(news_tagged)
        if second[1] == "N"
    )
    print(before_noun)
开发者ID:navjord,项目名称:TDT4501,代码行数:12,代码来源:svm-pipeline-step3.py
示例7: automaticTagging
def automaticTagging():
    """Demonstrate the default, regular-expression and lookup taggers.

    Each tagger is built, applied to a sample and evaluated against the
    tagged sentences of the Brown "news" category, printing the results.
    Finally a plot of lookup-tagger accuracy against model size is shown.

    Uses the module-level name ``nltk``; ``pylab`` is imported when the plot
    is drawn.
    """
    from nltk.corpus import brown

    print("=============== The Default Tagger ===============")
    brown_tagged_sents = brown.tagged_sents(categories='news')
    print(brown_tagged_sents[0:3])
    brown_sents = brown.sents(categories='news')
    print(brown_sents[0:3])
    tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
    print(nltk.FreqDist(tags).max())
    raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
    tokens = nltk.word_tokenize(raw)
    default_tagger = nltk.DefaultTagger('NN')
    print(default_tagger.tag(tokens))
    print(default_tagger.evaluate(brown_tagged_sents))

    print("=============== The Regular Expression Tagger ===============")
    patterns = [
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*es$', 'VBZ'),
        (r'.*ould$', 'MD'),
        (r'.*\'s$', 'NN$'),
        (r'.*s$', 'NNS'),
        # The decimal point is escaped: a bare '.' matches any character and
        # would tag strings such as "1a5" as cardinal numbers.
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
        (r'.*', 'NN'),  # catch-all: matches every token
    ]
    regexp_tagger = nltk.RegexpTagger(patterns)
    print(regexp_tagger.tag(brown_sents[3]))
    print(regexp_tagger.evaluate(brown_tagged_sents))

    print("=============== The Lookup Tagger ===============")
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    # NOTE(review): slicing keys() presumably relies on a FreqDist whose
    # keys() is a list ordered by decreasing frequency -- confirm for the
    # NLTK version in use.
    most_freq_words = fd.keys()[:100]
    print(most_freq_words)
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    print(baseline_tagger)
    print(baseline_tagger.evaluate(brown_tagged_sents))

    def performance(cfd, wordlist):
        """Return the news-category score of a lookup tagger over *wordlist*.

        Each word is mapped to its most frequent tag in *cfd*; words outside
        the model fall back to the default tag 'NN'.
        """
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    def display():
        """Plot lookup-tagger performance for model sizes 1, 2, 4, ... 2**14."""
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()

    display()
开发者ID:hbdhj,项目名称:python,代码行数:52,代码来源:chapter5.py
示例8: ch05_11_train_test_affix_tagger
def ch05_11_train_test_affix_tagger():
    """Build an AffixTagger model from Brown news and print its editorial score."""
    from nltk.corpus import brown

    news_words = nltk.FreqDist(brown.words(categories="news"))
    news_tags = nltk.ConditionalFreqDist(brown.tagged_words(categories="news"))

    # Map every news word to the tag it occurs with most often.
    most_freq_pos = {}
    for word in news_words.keys():
        most_freq_pos[word] = news_tags[word].max()

    tagger = nltk.AffixTagger(model=most_freq_pos)
    score = tagger.evaluate(brown.tagged_sents(categories="editorial"))
    print(score)
开发者ID:447327642,项目名称:nltk-examples,代码行数:7,代码来源:ch05_ex.py
示例9: ch05_34_num_words_with_1to10_distinct_tags
def ch05_34_num_words_with_1to10_distinct_tags():
    """Print tag-ambiguity counts for Brown news and a concordance sample.

    First prints, for each i from 1 to 10 inclusive, how many distinct words
    occur with exactly i distinct tags.  Then, for the word with the greatest
    number of distinct tags, prints one context line per tag: the tag
    followed by the words from ten tokens before the first occurrence with
    that tag up to nine tokens after it.
    """
    from collections import defaultdict
    from nltk.corpus import brown

    tagged_words = brown.tagged_words(categories="news")

    # word -> set of distinct tags seen with it
    tags_by_word = defaultdict(set)
    for w, t in tagged_words:
        tags_by_word[w].add(t)

    # number of distinct tags -> number of words having that many
    words_per_tag_count = defaultdict(int)
    for tag_set in tags_by_word.values():
        words_per_tag_count[len(tag_set)] += 1
    for i in range(1, 11):  # 1..10 inclusive, as the function name states
        print("%d %d" % (i, words_per_tag_count[i]))

    if not tags_by_word:
        return

    # The word with the greatest number of tags, found by taking the maximum
    # rather than comparing against a fixed threshold.
    word = max(tags_by_word, key=lambda w: len(tags_by_word[w]))
    # Work on a copy so the tag table itself is left intact.
    remaining = set(tags_by_word[word])

    # First corpus position at which the word appears with each of its tags.
    poss = []
    for pos, (w, t) in enumerate(tagged_words):
        if w == word and t in remaining:
            poss.append((t, pos))
            remaining.remove(t)
            if not remaining:
                break

    for t, pos in poss:
        # Clamp the start: a negative index would wrap around to the end of
        # the corpus and yield an empty or wrong context.
        context = tagged_words[max(0, pos - 10):pos + 10]
        print("%s %s" % (t, " ".join(w for (w, _) in context)))
开发者ID:447327642,项目名称:nltk-examples,代码行数:28,代码来源:ch05_ex.py
示例10: ch05_21_qualifiers_before_adore_love_like_prefer
def ch05_21_qualifiers_before_adore_love_like_prefer():
    """Print the qualifiers that directly precede adore/love/like/prefer.

    Scans consecutive token pairs of the Brown "news" category and collects
    the first word of each pair whose first tag is ``QL`` and whose second
    word, lower-cased, is one of the four verbs.
    """
    from nltk.corpus import brown

    tagged_words = brown.tagged_words(categories="news")
    allp = set(["adore", "love", "like", "prefer"])
    # Collect w1, the qualifier itself; each bigram unpacks into
    # (w1, t1), (w2, t2) and no other name is bound by the comprehension.
    qualifiers = set(
        w1
        for (w1, t1), (w2, t2) in nltk.bigrams(tagged_words)
        if t1 == "QL" and w2.lower() in allp
    )
    print(qualifiers)
开发者ID:447327642,项目名称:nltk-examples,代码行数:7,代码来源:ch05_ex.py
示例11: ch05_20_brown_corpus_words_phrases_by_tag
def ch05_20_brown_corpus_words_phrases_by_tag():
    """Print four tag-based word and phrase listings for Brown news.

    1. Alphabetically sorted distinct lower-cased words tagged ``MD``.
    2. Words ending in "s" whose tag marks a plural noun (NRS, NPS*, NNS*)
       or a third person singular verb (BEDZ*, BEZ*, DOZ*, *BEZ).
    3. The first ten token trigrams, then the distinct three-word phrases
       tagged preposition + determiner + noun.
    4. The ratio of "he" tokens to "she" tokens.
    """
    from nltk.corpus import brown

    tagged_words = brown.tagged_words(categories="news")

    # alphabetically sorted list of distinct words tagged MD
    print(sorted(set(w.lower() for (w, t) in tagged_words if t == "MD")))

    # plural nouns or third person singular verbs, restricted to words
    # ending in "s"
    print(set(
        w for (w, t) in tagged_words
        if w.lower().endswith("s") and (
            t == "NRS"
            or t.startswith(("NPS", "NNS", "BEDZ", "BEZ", "DOZ"))
            or t.endswith("BEZ")
        )
    ))

    # three-word prepositional phrases: preposition + determiner + noun.
    # list() keeps the slice below valid when trigrams() returns an iterator.
    tagged_word_trigrams = list(nltk.trigrams(tagged_words))
    print(tagged_word_trigrams[:10])
    # The unsimplified Brown tagset has no "DET" tag; articles are tagged
    # "AT" and determiners such as this/that "DT".
    print(set(
        " ".join([w1, w2, w3])
        for (w1, t1), (w2, t2), (w3, t3) in tagged_word_trigrams
        if t1 == "IN" and t2 in ("AT", "DT") and t3 == "NN"
    ))

    # ratio of masculine to feminine pronouns
    num_masc_pn = len([w for (w, t) in tagged_words if w.lower() == "he"])
    num_fem_pn = len([w for (w, t) in tagged_words if w.lower() == "she"])
    if num_fem_pn:
        # float() forces true division; two ints would truncate in Python 2.
        print("masc/fem =  %s" % (float(num_masc_pn) / num_fem_pn))
    else:
        print("masc/fem =  undefined (no feminine pronouns found)")
开发者ID:447327642,项目名称:nltk-examples,代码行数:25,代码来源:ch05_ex.py
示例12: category_by_pos
def category_by_pos():
    """Train a Naive Bayes tag classifier on suffix features and print its accuracy."""
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    # Tally the final one, two and three characters of every Brown token.
    suffix_fdist = FreqDist()
    for token in brown.words():
        lowered = token.lower()
        for width in (1, 2, 3):
            suffix_fdist.inc(lowered[-width:])
    common_suffixes = suffix_fdist.keys()[:100]

    def pos_features(word):
        """Return one boolean 'endswith(<suffix>)' feature per common suffix."""
        lowered = word.lower()
        return dict(
            ('endswith(%s)' % suffix, lowered.endswith(suffix))
            for suffix in common_suffixes
        )

    news_tagged = brown.tagged_words(categories='news')
    featuresets = [(pos_features(token), tag) for (token, tag) in news_tagged]

    # The first tenth of the feature sets is held out for testing.
    size = int(len(featuresets) * 0.1)
    test_set = featuresets[:size]
    train_set = featuresets[size:]

    # A DecisionTreeClassifier variant of the next two lines exists only as
    # commented-out code; Naive Bayes is the classifier actually trained.
    classifier = NaiveBayesClassifier.train(train_set)
    print('NaiveBay %f' % classify.accuracy(classifier, test_set))
开发者ID:brenden17,项目名称:infinity,代码行数:32,代码来源:category_nltk.py
示例13: lookupTagger
def lookupTagger():
    """Build a lookup tagger from Brown news and print its results.

    The model maps each of the first 100 keys of the news word frequency
    distribution to that word's most frequent tag.  The tagger's evaluation
    score and its tagging of the fourth news sentence are printed so the
    computed results are visible.

    Uses the module-level names ``nltk`` and ``brown``.
    """
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    # NOTE(review): slicing keys() presumably relies on a FreqDist whose
    # keys() is a list ordered by decreasing frequency -- confirm for the
    # NLTK version in use.
    most_freq_words = fd.keys()[:100]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    # NOTE(review): brown_tagged_sents is not defined inside this function;
    # presumably it is a module-level name -- confirm.
    print(baseline_tagger.evaluate(brown_tagged_sents))
    sent = brown.sents(categories='news')[3]
    print(baseline_tagger.tag(sent))

    # Same model, with 'NN' as the tag for words the model does not cover.
    # Constructed here but not used further in this function.
    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                         backoff=nltk.DefaultTagger('NN'))

    def performance(cfd, wordlist):
        """Return the news-category score of a lookup tagger over *wordlist*."""
        lt = dict((word, cfd[word].max()) for word in wordlist)
        baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
        return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

    # NOTE(review): display() is defined but never called in this function.
    def display():
        """Plot lookup-tagger performance for model sizes 1, 2, 4, ... 2**14."""
        import pylab
        words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
        cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
        sizes = 2 ** pylab.arange(15)
        perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
        pylab.plot(sizes, perfs, '-bo')
        pylab.title('Lookup Tagger Performance with Varying Model Size')
        pylab.xlabel('Model Size')
        pylab.ylabel('Performance')
        pylab.show()
开发者ID:AkiraKane,项目名称:Python,代码行数:31,代码来源:c05_auto_tagging.py
示例14: demo
def demo():
    """Show a Tk table of definitions for nouns among the first 500 Brown tokens.

    Each row holds a word, then the ``definition`` attribute of one of its
    WordNet synsets, of that synset's first hypernym and of its first
    hyponym ('*none*' where there is no such attribute).  Control-q closes
    the window.
    """
    root = Tk()
    root.bind('<Control-q>', lambda e: root.destroy())

    headings = 'Word Synset Hypernym Hyponym'.split()
    table = Table(root, headings,
                  column_weights=[0, 1, 1, 1],
                  reprfunc=(lambda i,j,s: ' %s' % s))
    table.pack(expand=True, fill='both')

    from nltk.corpus import wordnet
    from nltk.corpus import brown

    def definition_of(obj):
        """Return obj.definition, or '*none*' when the attribute is missing."""
        return getattr(obj, 'definition', '*none*')

    for word, pos in sorted(set(brown.tagged_words()[:500])):
        # Only tags starting with 'N' are kept.
        if pos[0] == 'N':
            word = word.lower()
            for synset in wordnet.synsets(word):
                # Appending '' guarantees a first element even when the
                # synset has no hypernyms or hyponyms.
                hyper = (synset.hypernyms() + [''])[0]
                hypo = (synset.hyponyms() + [''])[0]
                table.append([word,
                              definition_of(synset),
                              definition_of(hyper),
                              definition_of(hypo)])

    column_backgrounds = (
        ('Word', '#afa'),
        ('Synset', '#efe'),
        ('Hypernym', '#fee'),
        ('Hyponym', '#ffe'),
    )
    for heading, colour in column_backgrounds:
        table.columnconfig(heading, background=colour)

    # Grey out the placeholder cells.
    for row in range(len(table)):
        for column in ('Hypernym', 'Hyponym'):
            if table[row, column] == '*none*':
                table.itemconfig(row, column, foreground='#666',
                                 selectforeground='#666')

    root.mainloop()
开发者ID:ciju,项目名称:yql_hash,代码行数:32,代码来源:table.py
示例15: question2
def question2(category):
    """Print adjective, plural/3sg and pronoun statistics for a Brown category.

    Prints the number of distinct (word, 'JJ') pairs, the first ten tokens
    tagged VBZ, NNPS or NNS, the masculine : feminine pronoun token counts,
    and finally a heading for three-word prepositional phrases.

    :param category: corpus category name, passed as ``categories=``.
    """
    tagged = bn.tagged_words(categories=category)
    wordlist = bn.words(categories=category)

    adjective_pairs = set((word, tag) for (word, tag) in tagged if tag == 'JJ')
    print(len(adjective_pairs))
    print("")
    print("")

    print("Words with tags 'VBZ' -> 3rd Person Singular Verbs or ('NNPS' or 'NNS') -> plural nouns:")
    print("")
    plural_or_3sg = [pair for pair in tagged if pair[1] in ('VBZ', 'NNPS', 'NNS')]
    print(plural_or_3sg[:10])
    print("")
    print("")

    print("Ratio")
    print("")
    male_pattern = r'\bhe\b|\bhis\b|\bhim\b|\bhimself\b'
    female_pattern = r'\bshe\b|\bher\b|\bhers\b|\bherself\b'
    male_pronouns = sum(1 for w in wordlist if re.search(male_pattern, w.lower()))
    female_pronouns = sum(1 for w in wordlist if re.search(female_pattern, w.lower()))
    print("Male : Female is -> %d : %d" % (male_pronouns, female_pronouns))
    print("")
    print("")

    # NOTE(review): 'sent' is not used in the visible code; presumably the
    # phrase-collecting code that follows this heading was cut off -- confirm.
    sent = ""
    print("3 word prepositional phrases are:")
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:28,代码来源:ch5aq2.py
示例16: partOfSpeechTagging
def partOfSpeechTagging():
    """Train a decision-tree tag classifier on suffix features and report on it.

    The features are boolean 'endswith(<suffix>)' flags for the first 100
    keys of a frequency distribution over the final one, two and three
    characters of every Brown token.  The classifier is trained on Brown
    "news" with the first tenth held out.  The held-out accuracy, the tag
    predicted for 'cats' and the tree's pseudocode (depth 4) are printed.

    Uses the module-level name ``nltk``.
    """
    from nltk.corpus import brown

    suffix_fdist = nltk.FreqDist()
    for word in brown.words():
        word = word.lower()
        for width in (1, 2, 3):
            suffix_fdist.inc(word[-width:])
    # NOTE(review): slicing keys() presumably relies on a FreqDist whose
    # keys() is a list ordered by decreasing frequency -- confirm for the
    # NLTK version in use.
    common_suffixes = suffix_fdist.keys()[:100]
    print(common_suffixes)

    def pos_features(word):
        """Return one boolean 'endswith(<suffix>)' feature per common suffix."""
        lowered = word.lower()
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = lowered.endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = nltk.DecisionTreeClassifier.train(train_set)
    # Print the accuracy and the sample prediction so they are visible.
    print(nltk.classify.accuracy(classifier, test_set))
    print(classifier.classify(pos_features('cats')))
    print(classifier.pseudocode(depth=4))
开发者ID:AkiraKane,项目名称:Python,代码行数:33,代码来源:c06_supervised_classification.py
示例17: main
def main():
    """Wire up the word-cluster and cluster-tag HMM pipeline (skeleton).

    Trains Word2Vec on the Brown words, clusters the word vectors with
    KMeans, then builds two HMM objects and runs viterbi through both.  All
    probability tables are still placeholders (None or empty).
    """
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    # Word vectors for every token of the corpus.
    word2vec = Word2Vec()
    word2vec.train(words_corpus)
    word_vecs = [word2vec.word2vec(token) for token in words_corpus]

    n_clusters = 10  # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # --- word-cluster HMM: placeholders with their planned computation ---
    p_word = {}
    p_cluster = {}
    p_cluster_given_word = None  # softmax
    p_word_given_cluster = None  # joint probability formula
    p_transition_cluster = None  # count
    p_initial_cluster = None  # count

    # --- cluster-tag HMM: placeholders with their planned computation ---
    p_cluster_given_tag = None  # softmax
    p_transition_tag = None  # count from tagged data
    p_initial_tag = None  # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    # Decode an (currently empty) word sequence: words -> clusters -> tags.
    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
开发者ID:Sowmith-iiit,项目名称:nlp-ssp,代码行数:34,代码来源:main.py
示例18: verb_stem
# Holder for the lazily-built set of Brown word forms tagged VB or VBZ.
# It is filled on first use so the corpus is scanned once, not on every call.
_BROWN_VERB_FORMS = []


def _brown_verb_forms():
    """Return the set of Brown words tagged VB or VBZ, building it on first use."""
    if not _BROWN_VERB_FORMS:
        _BROWN_VERB_FORMS.append(frozenset(
            word for (word, tag) in brown.tagged_words() if tag in ('VB', 'VBZ')
        ))
    return _BROWN_VERB_FORMS[0]


def verb_stem(s):
    """Extract the stem from the 3sg form of a verb, or return the empty string.

    The spelling rules below are tried in order; the first one that matches
    proposes a stem.  The stem is returned only if it occurs in the Brown
    corpus tagged ``VB`` or ``VBZ``.

    :param s: the word to analyse.
    :return: the stem, or ``""`` when no rule matches or the stem is not
        attested in the corpus.
    """
    if re.match(r"\w*([^aeiousxyzh]|[^cs]h)s$", s):
        stem = s[:-1]                 # eats -> eat
    elif re.match(r"(\w*)[aeiou]ys$", s):
        stem = s[:-1]                 # pays -> pay
    elif re.match(r"\w+[^aeiou]ies$", s):
        stem = s[:-3] + 'y'           # flies -> fly
    elif re.match(r"[^aeiou]ies$", s):
        stem = s[:-1]                 # dies -> die
    elif re.match(r"\w*([ox]|ch|sh|ss|zz)es$", s):
        stem = s[:-2]                 # goes -> go
    elif re.match(r"\w*(([^s]se)|([^z]ze))s$", s):
        stem = s[:-1]                 # loses -> lose
    elif s == "has":
        # Exact comparison: a prefix match would also turn words such as
        # "hash" or "hastes" into "have".
        stem = "have"
    elif re.match(r"\w*([^iosxzh]|[^cs]h)es$", s):
        stem = s[:-1]                 # likes -> like
    else:
        # No rule applies, so there is nothing to look up in the corpus.
        return ""

    if stem in _brown_verb_forms():
        return stem
    return ""
开发者ID:zachbpd,项目名称:A-Natural-Language-Query-System-in-Python-NLTK,代码行数:32,代码来源:statements.py
示例19: exercise3c
def exercise3c(category):
    """Print the ten most frequent tags of one Brown corpus category.

    Uses the module-level names ``nltk`` and ``bn`` (the corpus reader).

    :param category: corpus category name, passed as ``categories=``.
    """
    print("")
    print("For category: " + category)
    brown_tag_words = bn.tagged_words(categories=category)
    tag_fd = nltk.FreqDist(t for (w, t) in brown_tag_words)
    # most_common() orders by decreasing count; slicing keys() only does so
    # on FreqDist versions whose keys() happens to be frequency-sorted.
    print([tag for (tag, _) in tag_fd.most_common(10)])
    print("")
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:7,代码来源:ch5bq3c.py
示例20: partb
def partb():
    """Print every Brown tag that occurs exactly once, with the word(s) bearing it.

    NOTE(review): the file name suggests this answers "which words have the
    most distinct tags"; the code lists single-occurrence tags instead --
    confirm the intent.
    """
    print("")
    print("")

    # How often each tag occurs in the whole corpus.
    tag_fd = nltk.FreqDist([tag for (word, tag) in bn.tagged_words()])
    # tag -> distribution of the words carrying that tag
    words_by_tag = nltk.ConditionalFreqDist(
        (tag, word) for (word, tag) in bn.tagged_words()
    )

    once_only = [tag for tag in tag_fd.keys() if tag_fd[tag] == 1]
    for tag in once_only:
        print("For POS: " + tag)
        print(words_by_tag[tag].keys())
        print("")

    print("")
    print("")
开发者ID:GirishSrinivas,项目名称:PythonPrograms,代码行数:16,代码来源:ch5aq3.py
注：本文中的nltk.corpus.brown.tagged_words函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论