This article collects typical usage examples of the Python function nltk.tag.pos_tag. If you have been wondering how pos_tag is actually used, or what it looks like in real code, the curated examples below should help.
Twenty pos_tag code examples are shown, sorted by popularity by default. You can upvote the ones you like or find useful; your feedback helps the system recommend better Python code samples.
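Before the project examples, here is a minimal stand-alone sketch (not taken from any of the projects below) showing what pos_tag expects and returns. It assumes nltk is installed together with its punkt and averaged_perceptron_tagger data packages (package names can differ slightly between NLTK versions).

# Minimal pos_tag usage sketch (assumption: the required NLTK data packages have been downloaded).
from nltk import word_tokenize
from nltk.tag import pos_tag

tokens = word_tokenize("NLTK assigns a Penn Treebank tag to every token.")
print(pos_tag(tokens))
# Output is a list of (token, tag) pairs, e.g. [('NLTK', 'NNP'), ('assigns', 'VBZ'), ('a', 'DT'), ...]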
Example 1: make_pos
def make_pos(target_tag, edit_rev):
    # del_tag and add_tag are module-level constants defined elsewhere in the source file.
    tags, srcs, dsts = edit_rev
    # target_tag is expected to be present in the sentence.
    # Before POS-tagging, strip the positions where deletion/addition tags occur.
    if target_tag == del_tag:
        sentence = dsts
    elif target_tag == add_tag:
        sentence = srcs
    if target_tag in tags:
        tag_indexes = [i for i, x in enumerate(tags) if x == target_tag]
        trimed = sentence
        for tag_index in tag_indexes:
            trimed = trimed[:tag_index] + trimed[tag_index+1:]
        posed = pos_tag(trimed)
        pos = [w[1] for w in posed]
        for tag_index in tag_indexes:
            pos.insert(tag_index, u'')
        # debug
        None_indexes = [i for i, x in enumerate(pos) if x == u'']
        if tag_indexes != None_indexes:
            print >>sys.stderr, tag_indexes
            print >>sys.stderr, None_indexes
            print >>sys.stderr, tags
            print >>sys.stderr, pos
    else:
        posed = pos_tag(u' '.join(sentence).split())
        pos = [w[1] for w in posed]
    return pos
Author: keisks, Project: epair, Lines: 33, Source: englishword_edit_distance.py
Example 2: number_of_exact_word_match
def number_of_exact_word_match(a, b, word_tokenizer, lemmatizer, stop_words):
    pos_a = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(a)))
    pos_b = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_a
                if token.lower().strip(punctuation) not in stop_words]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_b
                if token.lower().strip(punctuation) not in stop_words]
    matched_words = set(lemmae_a).intersection(lemmae_b)
    return [len(matched_words), matched_words, b]
Author: Majestic12, Project: supervised_news_summarization, Lines: 9, Source: word_analysis.py
Example 3: number_of_noun_match
def number_of_noun_match(a, b, word_tokenizer, lemmatizer, stop_words):
    pos_a = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(a)))
    pos_b = map(get_tagged_words, pos_tag(word_tokenizer.tokenize(b)))
    lemmae_a = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_a
                if pos == NOUN and token.lower().strip(punctuation) not in stop_words]
    lemmae_b = [lemmatizer.lemmatize(token.lower().strip(punctuation), pos) for token, pos in pos_b
                if pos == NOUN and token.lower().strip(punctuation) not in stop_words]
    # Calculate Jaccard similarity
    # ratio = len(set(lemmae_a).intersection(lemmae_b)) / float(len(set(lemmae_a).union(lemmae_b)))
    # return (ratio > 0.66)
    matched_words = set(lemmae_a).intersection(lemmae_b)
    return [len(matched_words), matched_words, b]
Author: Majestic12, Project: supervised_news_summarization, Lines: 12, Source: word_analysis.py
Example 4: keep_nouns
def keep_nouns(tf):
    # Keep only the term-frequency entries whose POS tag starts with 'N' (nouns).
    n_tf = {}
    for k in tf:
        if pos_tag([k])[0][1].find('N') == 0:
            n_tf[k] = tf[k]
    return n_tf
Author: catsdogone, Project: video-indexing, Lines: 7, Source: utils.py
Example 5: title_permutations
def title_permutations(title_expanded):
    # POS-tag the title, drop prepositions ('IN'), stem the remaining words, and join every permutation with '*'.
    title_tagged = pos_tag(title_expanded.split())
    st = PorterStemmer()
    title_pos = [st.stem(word) for word, pos in title_tagged if pos != 'IN']
    title_perms = list(map("*".join, permutations(title_pos)))
    return title_perms
Author: DivyaKarthikeyan, Project: NLPCareerTrajectory, Lines: 7, Source: JobTitleNormalization.py
Example 6: extract
def extract(query):
    sentence = query
    tagged_sent = pos_tag(sentence.split())
    # Note: 'NN' is the Penn Treebank tag for common nouns; proper nouns would be 'NNP'.
    propernouns = [word for word, pos in tagged_sent if pos == 'NN']
    return propernouns
# extract("I want to buy a car and a dog and plane")
Author: ljsou, Project: QueryAnalyzer, Lines: 7, Source: chunker_noun.py
Example 7: test_run
def test_run():
    results = {}
    nouns = []
    product_list = {}
    for p in Post.query.all():
        tagged_sent = pos_tag(p.story.split())
        propernouns = [word for word, pos in tagged_sent if pos == 'NNP']
        for n in propernouns:
            if n == "I’m" or n == "It’s" or n == "Can’t":
                continue
            results[n.replace('.', '')] = True
    for r in results.keys():
        nouns.append(r)
    for i in range(10):
        noun = random.choice(nouns)
        # print('Using "%s"', (noun,))
        for k in test_keywords:
            try:
                products = amazon.search(Keywords=noun, SearchIndex=k)
                for product in products:
                    product_list[product.title] = True
            except:
                continue
    for p in product_list.keys():
        print(" Found title: %s" % (p,))
Author: davidmaignan, Project: giftsmarts, Lines: 28, Source: test_nltk.py
Example 8: analiseSentimento
def analiseSentimento(resposta):
    texto = resposta['corpo']
    frases = sentencesTokenizer.tokenize(texto)
    palavras = []
    for frase in frases:
        palavras.extend(wordsTokenizer.tokenize(frase))
    posTags = pos_tag(palavras)
    positivo = 0
    negativo = 0
    for palavra, tag in posTags:
        synsets = None
        # Map the Penn Treebank tag onto the matching WordNet part of speech before the SentiWordNet lookup.
        if tag.startswith('J'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
        elif tag.startswith('V'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
        elif tag.startswith('N'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
        elif tag.startswith('R'):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
        else:
            synsets = sentiwordnet.senti_synsets(palavra, '')
        if synsets != None:
            synsets = list(synsets)
            if len(synsets) > 0:
                synset = synsets[0]
                positivo = positivo + synset.pos_score()
                negativo = negativo + synset.neg_score()
    if positivo > negativo:
        return (resposta, 'positivo')
    elif negativo > positivo:
        return (resposta, 'negativo')
    else:
        return (resposta, 'neutro')
Author: vbozelli, Project: Sentiment-Analysis, Lines: 33, Source: analise_sentimento_sentiwordnet_com_stopwords.py
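Example 8 maps Penn Treebank tags onto WordNet POS constants before querying SentiWordNet. Here is a stand-alone sketch of that lookup, assuming the wordnet and sentiwordnet corpora have been downloaded and using the word 'happy' purely as an illustration.

from nltk.corpus import sentiwordnet, wordnet

# Look up the first adjective sense of a word and read its polarity scores.
synsets = list(sentiwordnet.senti_synsets('happy', wordnet.ADJ))
if synsets:
    print(synsets[0].pos_score(), synsets[0].neg_score())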
Example 9: _process_simpleHash
def _process_simpleHash(self, simpleHash):
    # Extract entities from keys resulting from SimpleExtractor process_*
    entityHash = {}
    for data in simpleHash:
        occs = simpleHash[data]['occurences']
        proxLoc = simpleHash[data]['proxLoc']
        # Tokenize sentences
        for sent in tokenize_sentences(data):
            # Tokenize words
            tokens = tokenize_words(sent)
            # Tag words with Parts of Speech
            tagged = pos_tag(tokens)
            # Identify named entities
            entities = ne_chunk(tagged)
            for ent in entities:
                if isinstance(ent, NLTKParseTree):
                    # Is it a wanted type?
                    if ent.node in self.types:
                        # Should we keep the PoS tag?
                        if self.keepPos:
                            txts = ['/'.join(token) for token in ent.leaves()]
                        else:
                            txts = [token[0] for token in ent.leaves()]
                        txt = ' '.join(txts)
                        new = {txt: {'text': txt,
                                     'occurences': occs,
                                     'proxLoc': proxLoc[:]}}
                        entityHash = self._mergeHash(entityHash, new)
    return entityHash
Author: digging-into-data-berkeley, Project: cheshire3, Lines: 29, Source: extractor.py
Example 10: process_raw_text
def process_raw_text(text):
    """
    First some code to standardize the formatting, then basic NLP.
    """
    # Remove breaks and tabs
    for char in ["\t", "\n"]:
        text = text.replace(char, " ")
    text = text.replace('."', '".')
    text = text.replace(".'", "'.")
    # Split special characters from words
    for char in ["'", '"', ",", ".", "?", "!", ";", ":"]:
        text = text.replace(char, " " + char + " ")
    # Collapse runs of whitespace into single spaces
    text = ' '.join(text.split())
    # Get the words, sentences, POS tags, and chunks.
    # parsetree, simplify_tag and flatten are imported or defined elsewhere in the source module.
    chunks = [tuple([c.type for c in t.chunks]) for t in parsetree(text)]
    sentences = sent_tokenize(text)
    sentences = [word_tokenize(s) for s in sentences]
    sentences_tags = [tuple([(w, simplify_tag(t)) for w, t in pos_tag(s)]) for s in sentences]
    sentences = [tuple([w for w, _ in s]) for s in sentences_tags]
    tags = [tuple([t for _, t in s]) for s in sentences_tags]
    words = flatten(sentences)
    return tuple(words), tuple(sentences), tuple(tags), tuple(chunks)
Author: RemideZ, Project: Stylometry, Lines: 26, Source: create_Datasets.py
Example 11: GetContractPage
def GetContractPage(x):
    url = 'http://www.defense.gov/contracts/contract.aspx?contractid=%d' % x
    html = urllib.urlopen(url).read()
    if re.search("The Official Home of the Department of Defense", html):
        return
    soup = BeautifulSoup(html)
    p_tags = soup.findAll("p")
    p_tags_text_list = [tag.text for tag in p_tags]
    tokenized_list = []
    for text in p_tags_text_list:
        tokenized_list = tokenize.word_tokenize(text)
    # POS-tag the last tokenized paragraph and keep the tagged list as the final element.
    tokenized_list.append(nltk_tag.pos_tag(tokenized_list))
    tagged_list = tokenized_list[-1]
    data = {
        "url": url}
    # The first proper noun becomes the contract entity, the first cardinal number the amount.
    for token in tagged_list[1:]:
        if token[1] == "NNP":
            data['entity'] = token[0]
            break
    for token in tagged_list[1:]:
        if token[1] == "CD":
            data['Amount'] = token[0]
            break
    print data
Author: carriercomm, Project: scraperwiki-scraper-vault, Lines: 31, Source: dod_with_nltk.py
Example 12: processoFeatures
def processoFeatures(resposta):
    frases = tokenizerFrases.tokenize(resposta["corpo"])
    palavras = []
    palavrasTexto = {}
    for frase in frases:
        palavrasTemp = tokenizerPalavras.tokenize(frase)
        for palavra in palavrasTemp:
            palavrasTexto[palavra] = True
            palavras.append(palavra)  # collect the tokens so they can be POS-tagged below
    posTags = pos_tag(palavras)
    positivo = 0
    negativo = 0
    for palavra, tag in posTags:
        synsets = None
        if tag.startswith("J"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADJ)
        elif tag.startswith("V"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.VERB)
        elif tag.startswith("N"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.NOUN)
        elif tag.startswith("R"):
            synsets = sentiwordnet.senti_synsets(palavra, wordnet.ADV)
        else:
            synsets = sentiwordnet.senti_synsets(palavra, "")
        if synsets != None:
            synsets = list(synsets)
            if len(synsets) > 0:
                synset = synsets[0]
                positivo = positivo + synset.pos_score()
                negativo = negativo + synset.neg_score()
    if positivo > negativo:
        return (palavrasTexto, "positivo")
    elif negativo > positivo:
        return (palavrasTexto, "negativo")
    else:
        return (palavrasTexto, "neutro")
Author: vbozelli, Project: Sentiment-Analysis, Lines: 35, Source: criar_classificador_com_stopwords.py
Example 13: count_words_unigram_pos
def count_words_unigram_pos(input_filename, output_path=''):
    txt = get_file_text(input_filename)
    word_regex = '[a-zA-Z]+'
    word_frequency = {}
    total_words = 0.
    matches = re.findall(word_regex, txt, re.M + re.S + re.U)
    for m in matches:
        word_frequency[m] = word_frequency.get(m, 0.) + 1.
        total_words += 1.
    sorted_words = sorted(word_frequency.iteritems(), key=operator.itemgetter(1))
    word_analysis = []
    for word in sorted_words:
        # Tag each word in isolation; pos_tag expects a list of tokens.
        pos = pos_tag([word[0]])
        word_analysis.append([word[0], word[1], pos[0][1]])
    o_file = make_output_file(input_filename, output_path=output_path, prefix='', suffix='-words_unigram_pos')
    o_file.write('word\tcount\tpos\n')
    for w in word_analysis:
        o_file.write('%s\t%d\t%s\n' % (w[0], w[1], w[2]))
    o_file.close()
Author: bchoatejr, Project: religion, Lines: 26, Source: nlp_word_tools.py
Example 14: extract_pos
def extract_pos(tokens, simple=True):
    """
    Simple parts of speech are:
        VERB - verbs (all tenses and modes)
        NOUN - nouns (common and proper)
        PRON - pronouns
        ADJ - adjectives
        ADV - adverbs
        ADP - adpositions (prepositions and postpositions)
        CONJ - conjunctions
        DET - determiners
        NUM - cardinal numbers
        PRT - particles or other function words
        X - other: foreign words, typos, abbreviations
        . - punctuation
    :param tokens:
    :return:
    """
    tokens_pos = pos_tag(tokens)
    pos = [p for t, p in tokens_pos]
    if simple:
        # translate the larger set of part-of-speech tags into the small, simpler universal set
        pos_dict = nltk.tagset_mapping('en-ptb', 'universal')
        pos = [pos_dict[p] for p in pos]
    return pos
Author: robert-giaquinto, Project: sentence_boundary_detection, Lines: 25, Source: extract_features.py
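Example 14's simple=True branch relies on NLTK's built-in tagset mapping. Below is a small illustration of what that dictionary contains; the two tags printed are arbitrary choices.

import nltk

# Map Penn Treebank tags onto the 12-tag universal tagset.
ptb_to_universal = nltk.tagset_mapping('en-ptb', 'universal')
print(ptb_to_universal['NN'], ptb_to_universal['VBZ'])  # expected: NOUN VERB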
Example 15: lda_train
def lda_train(raw):
    stop = set(stopwords.words('english'))
    p_stemmer = PorterStemmer()
    text_array = []
    for i in range(len(raw)):
        text = raw[i].lower()
        text = text.replace('\r\n', ' ')
        text = re.sub("[^a-z0-9]", " ", text)
        # Tokenization segments a document into its atomic elements.
        words = text.split()
        # Stop words: certain parts of English speech, such as "for", "or" and "the",
        # are meaningless to a topic model and need to be removed from the token list.
        words = [j for j in words if j not in stop]
        tokenized = nltk.word_tokenize(text)
        # Keep only common nouns ('NN') as topic-model vocabulary.
        tagged_sent = pos_tag(words)
        words = [word for word, pos in tagged_sent if pos == 'NN']
        # Stemming reduces topically similar words to their root; otherwise the model would treat
        # those terms as separate entities and dilute their importance.
        # words = [p_stemmer.stem(s) for s in words]
        text_array.append(words)
    dictionary = corpora.Dictionary(text_array)
    dictionary.save('dictionary.dic')
    corpus = [dictionary.doc2bow(text) for text in text_array]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=15, id2word=dictionary, passes=20)
    filename = 'finalized_model_15.sav'
    joblib.dump(ldamodel, filename)
    print(ldamodel.print_topics(num_topics=15, num_words=6))
    return ldamodel, dictionary
Author: ZhenqiWangC, Project: models, Lines: 31, Source: lda_train.py
Example 16: extract_tags
def extract_tags(comment_file):
    result = {}
    fd = open(comment_file, "r")
    for s in fd:
        # Split each line on punctuation and conjunctions to get short clauses.
        m = s.replace(",", ".").replace("and", ".").replace("or", ".").replace(":", ".").split(".")
        for f in m:
            d = wordpunct_tokenize(f)
            for index, t in enumerate(d):
                pos_str = ""
                if t in cellphone_attribute:
                    # POS-tag a ten-token window around the attribute word.
                    before = index - 10
                    if before < 0:
                        before = 0
                    end = index + 10
                    if end > len(d) - 1:
                        end = len(d) - 1
                    pos_result = pos_tag(d[before:end])
                    for pos_index, pos_sent in enumerate(pos_result):
                        seg_for_word = ""
                        adjust_word = ""
                        if pos_sent[1].find("JJ") != -1:
                            seg_for_word = ' '.join(d[index:pos_index + before + 1])
                            adjust_word = pos_sent[0]
                            if pos_index + before < index:
                                seg_for_word = ' '.join(d[pos_index + before:index + 1])
                            add_into_dict(result, t, adjust_word, seg_for_word)
    return result
Author: ccljing, Project: tag_extractionnew, Lines: 28, Source: tag_extraction.py
Example 17: extract_onlynouns
def extract_onlynouns(tokens):
    out = list()
    for token in tokens:
        # Tag each token individually and keep common ('NN') and proper ('NNP') nouns.
        pos = pos_tag(nltk.word_tokenize(token.lower()))[0][1]
        if (pos == "NN") or (pos == "NNP"):
            out.append(token)
    return out
Author: davestanley, Project: compnet-email-classifier, Lines: 7, Source: supporting_funcs.py
Example 18: extract_entities
def extract_entities(words):
    entities = []
    for chunk in ne_chunk(pos_tag(words)):
        # Named-entity chunks are tree nodes; plain tokens are not.
        if hasattr(chunk, 'node'):
            performer = ' '.join(c[0] for c in chunk.leaves())
            entities.append(performer.lower())
    return entities
Author: DirkBrand, Project: Comment-Classification, Lines: 7, Source: mainExtractor.py
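The hasattr(chunk, 'node') test in Example 18 targets the pre-3.0 NLTK tree API; in NLTK 3.x named-entity chunks are Tree objects that expose label() instead. Below is a minimal variant under that assumption (the function name extract_entities_v3 is illustrative, and it additionally needs the maxent_ne_chunker and words data packages).

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def extract_entities_v3(text):
    # Named-entity chunks come back as Tree subtrees; plain tokens stay (word, tag) tuples.
    entities = []
    for chunk in ne_chunk(pos_tag(word_tokenize(text))):
        if isinstance(chunk, Tree):
            entities.append(' '.join(token for token, tag in chunk.leaves()).lower())
    return entities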
Example 19: filter_by_pos
def filter_by_pos(sentence, pos):
    """Return the sentence keeping only the words whose part of speech is in pos."""
    words_with_pos = pos_tag(word_tokenize(sentence))
    words_with_pos = filter(lambda word: word[1] in pos, words_with_pos)
    return ' '.join(map(lambda word_with_pos: word_with_pos[0], words_with_pos))
Author: aehuynh, Project: textrankpy, Lines: 7, Source: text_process.py
Example 20: parse
def parse(body):
    contents = []
    if isinstance(body, basestring):
        contents.append(body)
    else:
        contents = body
    sentences = []
    for content in contents:
        sentences.extend([sentence for sentence in sent_tokenize(content) if not str_helper.hasHTMLTag(sentence)])
    stop = stopword.get_stopwords()
    tokens = {}
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word not in stop and not str_helper.hasNumbers(word) and not str_helper.hasPunctuation(word):
                word = stem.stemming(word)
                tokens.setdefault(word, 0)
                tokens[word] += 1
    # POS-tag the distinct stemmed tokens and return parallel lists of words and tags.
    wp = pos_tag(tokens.keys())
    words = [row[0] for row in wp]
    tags = [row[1] for row in wp]
    return words, tags
Author: ZwEin27, Project: digoie-filter, Lines: 27, Source: doc_body.py
Note: The nltk.tag.pos_tag examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors. Please consult the corresponding project's license before distributing or reusing the code, and do not republish without permission.