本文整理汇总了Python中nltk.tag.stanford.POSTagger类的典型用法代码示例。如果您正苦于以下问题：Python POSTagger类的具体用法？Python POSTagger怎么用？Python POSTagger使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了POSTagger类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: nltk_stanfordpos
def nltk_stanfordpos(inpath, outfolder):
    """POS-tag French text files with the Stanford POS tagger via NLTK.

    Every file matching the glob pattern ``inpath`` is read as UTF-8, split
    on whitespace and tagged. The result is written to ``outfolder`` under
    the input file's base name, one ``token<TAB>tag`` pair per line.

    Args:
        inpath: Glob pattern selecting the plain-text input files.
        outfolder: Directory for the tagged files; created if missing.
    """
    print("\nLaunched nltk_stanfordpos.")
    import os
    import glob
    from nltk.tag.stanford import POSTagger

    paths = glob.glob(inpath)
    if paths:
        # Build the tagger once instead of once per file; it is only needed
        # when there is at least one file to process.
        st = POSTagger('/home/christof/Programs/stanfordpos/models/french.tagger', '/home/christof/Programs/stanfordpos/stanford-postagger.jar', encoding="utf8")
    for path in paths:
        with open(path, "r", encoding="utf-8") as infile:
            untagged = infile.read()
        tagged = st.tag(untagged.split())
        taggedstring = "".join("\t".join(item) + "\n" for item in tagged)
        os.makedirs(outfolder, exist_ok=True)
        # Write with the same encoding the input was read with; the platform
        # default may not be able to represent accented French characters.
        with open(os.path.join(outfolder, os.path.basename(path)), "w", encoding="utf-8") as output:
            output.write(taggedstring)
    print("Done.")
开发者ID:daschloer,项目名称:tmw,代码行数:27,代码来源:tmw.py
示例2: main
def main():
    """Tag each sentence of the input file and print its stripped pieces.

    Reads the file named by ``sys.argv[1]``, splits its content on newlines,
    periods and question marks, tags every non-empty piece with the German
    Stanford model and prints the output of ``stripPieces`` joined by spaces.
    """
    model = "/home/shaun/stanford-postagger-full-2013-11-12/models/german-dewac.tagger"
    jar = "/home/shaun/stanford-postagger-full-2013-11-12/stanford-postagger.jar"
    st = POSTagger(model, jar)
    with open(sys.argv[1], "r") as f:
        content = f.read()
    for s in re.split("\n|\.|\?", content):
        if s:
            print(" ".join(stripPieces(st.tag(s.split()))))
开发者ID:spattersongt,项目名称:lingq,代码行数:27,代码来源:case_trainer.py
示例3: cleanTokens
def cleanTokens(tokens):
    """Return the words whose tag starts with NE or NN and that exceed three characters.

    ``tokens`` is tagged with the German "fast" Stanford model; only the word
    part of each matching (word, tag) pair is returned, in tagging order.
    """
    tagger = POSTagger('/models/german-fast.tagger')
    tagged = tagger.tag(tokens)
    return [
        pair[0]
        for pair in tagged
        if re.match("NE|NN", pair[1]) and len(pair[0]) > 3
    ]
开发者ID:jbrissier,项目名称:gccheck,代码行数:29,代码来源:extract_text.py
示例4: stanford_corenlp_filter
def stanford_corenlp_filter(sent):
    """Reduce a two-block sentence to its prefixed, stemmed, filtered tokens.

    ``sent`` is split on ``blockSeparator`` into exactly two blocks. Each
    block is lower-cased, tokenized and POS-tagged; tokens whose tag is in
    ``filterList`` are stemmed and prefixed with ``'1'`` (first block) or
    ``'2'`` (second block) so the two blocks stay distinguishable.

    Args:
        sent: String containing exactly one ``blockSeparator``.

    Returns:
        A string starting with a space, followed by the space-terminated
        prefixed stems of the first block and then of the second block.

    Raises:
        ValueError: If ``sent`` does not split into exactly two blocks.
    """
    from nltk.tag.stanford import POSTagger
    posTagger = POSTagger('/Users/gt/Downloads/'
                          'stanford-postagger-2013-06-20/models/'
                          'wsj-0-18-bidirectional-nodistsim.tagger',
                          '/Users/gt/Downloads/stanford-postagger-2013-06-20'
                          '/stanford-postagger-3.2.0.jar',encoding=encoding)
    b1, b2 = sent.split(blockSeparator)
    b2 = b2.rstrip()

    def _prefixed_stems(block, prefix):
        # Tag one block and keep only the stems of tokens with a wanted tag.
        # note: prefix concat stemmer(word) == stemmer(prefix concat word)
        pos_tags = posTagger.tag(word_tokenize(block.lower()))
        return ''.join(
            prefix + stemmer.stem(pos_t[0]) + ' '
            for pos_t in pos_tags
            if pos_t[1] in filterList
        )

    # The original reset the accumulator before the second block, which threw
    # away every '1'-prefixed token; both blocks now contribute to the result.
    return ' ' + _prefixed_stems(b1, '1') + _prefixed_stems(b2, '2')
开发者ID:gthandavam,项目名称:Recipes,代码行数:32,代码来源:builder.py
示例5: vectorizer
def vectorizer(tokens, w2v_db):
    """Weight the noun-like keywords of ``tokens`` and look up their vectors.

    Args:
        tokens: Token sequence to POS-tag.
        w2v_db: Argument passed to ``SQLCon`` to open the vector store.

    Returns:
        A ``(unsorted_kw, token_vecs)`` pair of ``OrderedDict`` objects.
        ``unsorted_kw`` maps each lower-cased keyword to its accumulated
        weight (1.5 per NNP/NNPS/FW occurrence, 1 per NN/NNS occurrence).
        ``token_vecs`` maps each keyword for which ``conn.read`` returned a
        value to that value as a list. Both keep document order.
    """
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    weights = {'NNP': 1.5, 'NNPS': 1.5, 'FW': 1.5, 'NN': 1, 'NNS': 1}
    unsorted_kw = OrderedDict()
    for w, t in tagger.tag(tokens):
        label = weights.get(t)
        if label is None:
            continue
        w = w.lower()
        unsorted_kw[w] = unsorted_kw.get(w, 0) + label

    # Get the vectors of words. Maintain order as in document.
    token_vecs = OrderedDict()
    conn = SQLCon(w2v_db)
    try:
        for word in unsorted_kw:
            word = word.lower()
            if word in token_vecs:
                continue
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
        # Output for debugging; total vs unique words.
        print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    finally:
        # Release the connection even when a lookup fails.
        conn.close()
    return unsorted_kw, token_vecs
开发者ID:suraj813,项目名称:SOMClassifier,代码行数:33,代码来源:pp_v4.py
示例6: postext_st
def postext_st(filename):
    """Sentence-split, tokenize and POS-tag one raw text file.

    Args:
        filename: Base name (without ``.txt``) of a file below the hard-coded
            raw-text directory.

    Returns:
        A list with one entry per detected sentence; each entry is the
        tagger's output for that sentence's tokens.

    Raises:
        IOError: If ``filename`` is not a string.
    """
    # Opening of File
    path_to_raw = '/home/cyneo/Work/Scans/Text Version/'
    if not isinstance(filename, str):
        raise IOError('Filename must be a string')

    # Preparing to Tokenize
    sents = []
    with open(osp.abspath(path_to_raw + filename + '.txt'),
              'r', encoding='utf8') as raw:
        # Initialize the punkt module
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        for line in raw:
            sents.extend(sent_detector.tokenize(line.strip()))

    # Tokenizing
    from nltk.tokenize.stanford import StanfordTokenizer
    tokenedsents = []
    if sents:
        # One tokenizer for all sentences instead of a new one per sentence.
        tokenizer = StanfordTokenizer()
        tokenedsents = [tokenizer.tokenize(line) for line in sents]

    # Parts of Speech Tagging
    from nltk.tag.stanford import POSTagger
    st = POSTagger('/mnt/sda2/stanford-packages/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',
                   encoding='utf8')
    # Returns a list of a list of tuples
    return [st.tag(line) for line in tokenedsents]
开发者ID:cyneo,项目名称:feminism,代码行数:34,代码来源:adjective+extract.py
示例7: createModel
def createModel():
    """Train five maxent classifiers on the Brown corpus and pickle them.

    Every Brown sentence is POS-tagged and passed to the five
    ``pos_*features`` extractors; non-empty results extend the matching
    training set. One ``MaxentClassifier`` per set is then trained, stored in
    its module-level global and dumped to ``classifier<name>.pickle`` in the
    current directory.
    """
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir
    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet = []
    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')
    for line in brown.sents():
        print(line)
        tagSent = st.tag(line)
        print(tagSent)
        # Pair each extractor with the set it feeds; order matches the original calls.
        for extract, training_set in (
            (pos_itfeatures, trainingitSet),
            (pos_youfeatures, trainingyouSet),
            (pos_theirfeatures, trainingtheirSet),
            (pos_loosefeatures, traininglooseSet),
            (pos_tofeatures, trainingtoSet),
        ):
            features = extract(tagSent)
            if features:
                training_set.extend(features)

    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]

    def _dump(classifier, pickle_name):
        # Context manager closes the file even if pickling fails.
        with open(pickle_name, 'wb') as f:
            pickle.dump(classifier, f)

    classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm)
    _dump(classifierit, 'classifierit.pickle')
    classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm)
    _dump(classifierloose, 'classifierloose.pickle')
    classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm)
    _dump(classifieryou, 'classifieryou.pickle')
    classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm)
    _dump(classifierto, 'classifierto.pickle')
    classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm)
    _dump(classifiertheir, 'classifiertheir.pickle')
开发者ID:siddharthasandhu,项目名称:NLPProjects,代码行数:59,代码来源:stanLearn.py
示例8: stanford_tag
def stanford_tag(sentence):
    """Tag a single tokenized sentence with the Stanford tagger.

    The model and jar locations come from ``src.experiment.path``; the JVM is
    started with the heap and PermGen options given below.
    """
    import src.experiment.path as path
    model_path = path.stanford_tagger_model_path()
    jar_path = path.stanford_tagger_path()
    jvm_flags = '-Xmx16g -XX:MaxPermSize=256m'
    stanford = POSTagger(model_path, jar_path, java_options=jvm_flags)
    tagged_sentence = stanford.tag(sentence)
    return tagged_sentence
开发者ID:fashandge,项目名称:deja,代码行数:8,代码来源:utilities.py
示例9: tag
def tag(segments):
    """Return each segment as a space-joined string of tagged tokens.

    Every segment is word-tokenized and tagged, each (word, tag) pair is
    rendered with ``nltk.tag.tuple2str``, and the joined string is decoded
    from UTF-8.
    """
    model = os.path.join(stanford_path, 'models/english-left3words-distsim.tagger')
    jar = os.path.join(stanford_path, 'stanford-postagger-3.3.1.jar')
    st = POSTagger(model, jar)
    return [
        ' '.join(nltk.tag.tuple2str(pair) for pair in st.tag(word_tokenize(segment))).decode('utf-8')
        for segment in segments
    ]
开发者ID:bwallace,项目名称:irony-redux,代码行数:9,代码来源:extract_and_tag.py
示例10: spanish_pos
def spanish_pos(text):
    """Tag whitespace-separated Spanish text; returns the tagger's output."""
    model = '/Users/Lena/src/context/stanford-postagger/models/spanish-distsim.tagger'
    jar = '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar'
    encoded = text.encode('utf8')
    tagger = POSTagger(model, jar, 'utf8')
    return tagger.tag(encoded.split())
开发者ID:lenazun,项目名称:context,代码行数:11,代码来源:spanish_processing.py
示例11: german_pos
def german_pos(text):
    """Tag whitespace-separated German text; returns the tagger's output."""
    model = '/Users/Lena/src/context/stanford-postagger/models/german-fast.tagger'
    jar = '/Users/Lena/src/context/stanford-postagger/stanford-postagger.jar'
    encoded = text.encode('utf8')
    tagger = POSTagger(model, jar, 'utf8')
    return tagger.tag(encoded.split())
开发者ID:lenazun,项目名称:context,代码行数:11,代码来源:german_processing.py
示例12: stanford_batch_tag
def stanford_batch_tag(sentences):
    """Batch-tag a list of tokenized sentences with the Stanford tagger.

    The model and jar paths are supplied by two functions in
    ``src.experiment.path``; adapt them (or hard-code the paths) for your
    own machine.
    """
    import src.experiment.path as path
    model_path = path.stanford_tagger_model_path()
    jar_path = path.stanford_tagger_path()
    stanford = POSTagger(model_path, jar_path)
    return stanford.batch_tag(sentences)
开发者ID:fashandge,项目名称:deja,代码行数:11,代码来源:utilities.py
示例13: pos_tag
def pos_tag(texts):
    """Tag a batch of tokenized sentences with the model for ``language``.

    Args:
        texts: Tokenized sentences, passed unchanged to ``tag_sents``.

    Returns:
        Whatever ``POSTagger.tag_sents`` returns for ``texts``.

    Raises:
        ValueError: If the module-level ``language`` is neither ``"german"``
            nor ``"english"`` (this used to surface as an UnboundLocalError
            on ``model``).
    """
    from nltk.tag.stanford import POSTagger
    jar = config.mainpath+"analyze/SPOS/stanford-postagger.jar"
    models = {
        "german": "analyze/SPOS/models/german-fast.tagger",
        "english": "analyze/SPOS/models/english-bidirectional-distsim.tagger",
    }
    if language not in models:
        raise ValueError("Unsupported language for POS tagging: %r" % (language,))
    model = config.mainpath + models[language]
    tagger = POSTagger(model, path_to_jar = jar, encoding="UTF-8")
    return tagger.tag_sents(texts)
开发者ID:chreman,项目名称:output_BA,代码行数:12,代码来源:parallel_preprocessing.py
示例14: main
def main():
    """Tag every tweet in the input CSV and write the enriched rows out.

    Reads ``tweets_a_procesar_v2.csv``, builds a ``Tweet`` per row, tags each
    tweet's ``spanish_text`` with the Spanish Stanford model, and collects
    the words whose tag starts with ``a``, ``r``, ``n`` or ``v``. Each tweet
    is written to ``output_tagged_v2.csv`` via ``tweet.to_CSV()``.
    """
    print("Inicio...")
    with open("tweets_a_procesar_v2.csv", 'rb') as csvfile:
        lines = csv.reader(csvfile, delimiter=DELIMITER, quotechar="'")
        # All tweets are kept in this list.
        tweets = [Tweet(line) for line in lines]

    # Tag initials that mark a word as important.
    important_initials = ('a', 'r', 'n', 'v')

    # Output file; the context manager closes it even if tagging fails midway.
    with open("output_tagged_v2.csv", 'wb') as output:
        filewriter = csv.writer(output, delimiter=DELIMITER, quotechar="'")
        # Load the Spanish tagger from Stanford NLP.
        from nltk.tag.stanford import POSTagger
        st = POSTagger('/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/models/spanish-distsim.tagger','/Applications/XAMPP/htdocs/Proyectos/Stanford/stanford-postagger-full-2014-08-27/stanford-postagger-3.4.1.jar',encoding='utf-8')
        for n, tweet in enumerate(tweets, 1):
            print(tweet.spanish_text)
            # Example: st.tag('What is the airspeed of an unladen swallow ?'.split())
            # gives [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ...]
            tweet_tagged = st.tag((tweet.spanish_text).split())
            # tag[1][:1] is at most one character, so a tuple membership test
            # is equivalent to the four separate substring checks it replaces.
            important_words = [
                tag[0] for tag in tweet_tagged
                if tag[1][:1] in important_initials
            ]
            tweet.tweet_tagged = tweet_tagged
            tweet.important_words = important_words
            filewriter.writerow(tweet.to_CSV())
            if n % 100 == 0:
                print(n)
        print("Done")
开发者ID:wilchess26,项目名称:WebMining,代码行数:51,代码来源:unParseStream.py
示例15: pos_tag_stanford
def pos_tag_stanford(toked_sentence):
    """Tag a tokenized sentence with the Stanford POS tagger.

    Args:
        toked_sentence: List of token strings.

    Returns:
        List of ``(token, POS)`` tuples, where POS is the part of speech
        assigned to the token.
    """
    from nltk.tag.stanford import POSTagger
    model = '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/models/english-bidirectional-distsim.tagger'
    jar = '/Users/jeff/Zipfian/opinion-mining/references/resources/stanford-pos/stanford-postagger-2014-06-16/stanford-postagger.jar'
    tagger = POSTagger(model, jar)
    tagged = tagger.tag(toked_sentence)
    return tagged
开发者ID:Jewelryland,项目名称:Opinion-Mining-Project,代码行数:15,代码来源:extract_aspects.py
示例16: processor
def processor(name, url, tokens, db_path,json_dir, USE_TITLE_WORDS = False):
    """Extract weighted keywords for a page, cluster their vectors, save JSON.

    Args:
        name: Base name of the JSON file to write (``<name>.json``).
        url: Stored verbatim under the ``'url'`` key.
        tokens: Token sequence to POS-tag.
        db_path: Argument passed to ``SQLCon`` to open the vector store.
        json_dir: Output directory; created if missing.
        USE_TITLE_WORDS: Accepted for compatibility; not used by this function.

    The JSON object holds the URL, the keyword vectors, the keyword weights
    (1.5 per NNP/NNPS/FW occurrence, 1 per NN/NNS occurrence) and the
    k-means centroids of the vectors.
    """
    # POS TAGGING
    tagger = POSTagger('tagger/english-left3words-distsim.tagger', 'tagger/stanford-postagger.jar')
    weights = {'NNP': 1.5, 'NNPS': 1.5, 'FW': 1.5, 'NN': 1, 'NNS': 1}
    unsorted_kw = OrderedDict()
    for w, t in tagger.tag(tokens):
        label = weights.get(t)
        if label is None:
            continue
        w = w.lower()
        unsorted_kw[w] = unsorted_kw.get(w, 0) + label

    # Get the vectors list
    token_vecs = OrderedDict()
    conn = SQLCon(db_path)
    try:
        for word in unsorted_kw:
            word = word.lower()
            if word in token_vecs:
                continue
            v = conn.read(word)
            if v is not None:
                token_vecs[word] = list(v)
        print("kw_len: {0} vec_len: {1}".format(len(unsorted_kw), len(token_vecs)))
    finally:
        # Release the connection even when a lookup fails.
        conn.close()

    # Compute cluster centers:
    # NOTE(review): nk is 0 when fewer than a handful of vectors were found;
    # confirm how kmeans2 and downstream consumers should treat that case.
    nk = round(len(token_vecs)/4)
    data = numpy.array(list(token_vecs.values()))
    cent, _ = kmeans2(data,nk,iter=20,minit='points')
    centroids = cent.tolist()

    # Create the JSON object for this webpage.
    if not os.path.exists(json_dir):
        os.makedirs(json_dir)
    json_path = os.path.join(json_dir,name+'.json')
    # Context manager guarantees the file is flushed and closed on error too.
    with open(json_path, 'w') as file_dest:
        json.dump({'url': url, 'vectors' : token_vecs, 'keyword_frequency': unsorted_kw, 'centroids' : centroids}, file_dest)
开发者ID:suraj813,项目名称:SOMClassifier,代码行数:47,代码来源:pp_v3.py
示例17: stan_pos
def stan_pos(input_sent):
    """Tag a whitespace-separated sentence and return it in ``word_TAG`` form.

    The Stanford POS tagger directory must sit in the current directory. The
    "wsj left 3 words" model is used; to use another model, change the first
    argument of the ``POSTagger`` call below. The ``word_TAG`` list is passed
    through ``reg_form`` before being returned.
    """
    model = "./stanford-postagger-2012-11-11/models/wsj-0-18-left3words.tagger"
    jar = "./stanford-postagger-2012-11-11/stanford-postagger.jar"
    st = POSTagger(model, jar)
    tagged_pairs = st.tag(input_sent.split())
    joined = [pair[0] + "_" + pair[1] for pair in tagged_pairs]
    return reg_form(joined)
开发者ID:Kensuke-Mitsuzawa,项目名称:practice_code,代码行数:17,代码来源:make_feat_predict.py
示例18: add_POS
def add_POS(self,row_file,target):
    """Tag the tokens of ``target`` and write them to ``pos_<target>``.

    Tokens come from ``self.get_token(target)``. Before tagging, any token
    longer than one character that starts with an ASCII capital is
    lower-cased; the output file still contains the original token. Each
    output line is ``<token> <tag>``, and every token row is followed by an
    empty line.

    Args:
        row_file: Not used by the current implementation.
        target: Name handed to ``self.get_token`` and used as the suffix of
            the output file name.
    """
    all_token = self.get_token(target)
    stanford_tagger = POSTagger('../stanford-postagger-full-2015-01-30/models/english-bidirectional-distsim.tagger','../stanford-postagger-full-2015-01-30/stanford-postagger.jar')
    tag_list = [
        [
            word.lower() if len(word) > 1 and re.match(r'^[A-Z]+', word) else word
            for word in row
        ]
        for row in all_token
    ]
    tagged_result = stanford_tagger.tag_sents(tag_list)
    # The original never closed this handle; the context manager makes sure
    # buffered output reaches disk even if a write fails.
    with open('pos_%s'%target,'wb') as w:
        for num1, row in enumerate(tagged_result):
            for num2, item in enumerate(row):
                # Emit the original (non-lower-cased) token next to its tag.
                w.write(all_token[num1][num2]+' '+item[1]+'\n')
            w.write('\n')
    return
开发者ID:victormm88,项目名称:SemEval,代码行数:46,代码来源:Feature_Tool.py
示例19: main
def main():
    """Translate the sentences, then print the result after each rule stage.

    The base translation is printed first. The translated sentences are then
    POS-tagged (empty results dropped) and run through the fix-up stages in
    order, printing the words of every sentence after each stage.
    """
    def _show(banner, tagged_sentences):
        # Print a stage banner followed by each sentence's words.
        print(banner)
        for tagged_sentence in tagged_sentences:
            print(' '.join(map(getWord, tagged_sentence)))

    dict2 = readDict("dict2.txt")
    sentences2 = readSentences("sentences2.txt")
    translated2 = translate(sentences2, dict2)
    print("======================================BASE TRANSLATION==========================================")
    for sentence in translated2:
        print(sentence)
    print("================================================================================================")

    st = POSTagger('stanford-postagger/models/english-left3words-distsim.tagger',
                   'stanford-postagger/stanford-postagger.jar')
    POS = []
    for sentence in translated2:
        tagged = st.tag(sentence.split())
        if (len(tagged)>0):
            POS.append(tagged)

    POS = stupidFixes(POS)
    _show("==================================STUPID FIXES TRANSLATION======================================", POS)
    POS = rulesOneThree(POS)
    _show("=====================================RULE1+3 TRANSLATION========================================", POS)
    POS = rulesFourFiveSeven(POS)
    _show("=====================================RULE4+5+7 TRANSLATION========================================", POS)
    POS = ruleTwoNine(ruleTwoNine(POS))  # apply twice
    _show("=====================================RULE2+9 TRANSLATION========================================", POS)
    POS = ruleSixEight(POS)
    _show("=====================================RULE6+8 TRANSLATION========================================", POS)
开发者ID:j-squared,项目名称:cs124-pa7,代码行数:45,代码来源:MT.py
示例20: get_transactions
def get_transactions(self, product_reviews):
'''
Generates a set of transactions ready for frequent itemset mining
from the crawled product reviews.

Each element of ``product_reviews`` is split into sentences with
``sent_tokenize``; each sentence is tokenized with ``word_tokenize`` and
tagged by a ``POSTagger`` built from ``PATHS['POS_MODEL']`` and
``PATHS['POS_TAGGER']``. Words tagged NN, NNS or NP that consist only of
ASCII letters, digits and hyphens are lower-cased and collected as the
sentence's transaction. The collected rows are handed to ``write_csv``
for ``PATHS['POS']`` and ``PATHS['TRANSACTIONS']``.

NOTE(review): this copy of the code has lost its indentation, so the exact
nesting of the counters and of the progress print cannot be confirmed here.
'''
pos_tagger = POSTagger(PATHS['POS_MODEL'], PATHS['POS_TAGGER'])
# Rows of [sentence, 'word:pos', ...] destined for PATHS['POS'].
pos_output = []
# Rows of [sentence, word, ...] destined for PATHS['TRANSACTIONS'].
transactions_output = []
print 'Generating transactions...'
product_count = 0
sentence_count = 0
for product in product_reviews:
sentences = sent_tokenize(product)
for sentence in sentences:
try:
sent_pos = pos_tagger.tag(word_tokenize(sentence))
# A sentence that raises UnicodeEncodeError while tagging is skipped.
except UnicodeEncodeError:
continue
trans = []
pos_tags = []
for word, pos in sent_pos:
pos_tags.append(':'.join([word, pos]))
# NOTE(review): 'NP' is not a Penn Treebank word tag ('NNP' is) - confirm
# that proper nouns are not meant to be matched here.
if ((pos == 'NN' or pos == 'NNS' or pos == 'NP') and
re.match('^[A-Za-z0-9-]+$', word)):
trans.append(word.lower())
# Sentences without any qualifying word produce no transaction row.
if trans:
pos_output.append([sentence] + pos_tags)
transactions_output.append([sentence] + trans)
sentence_count += 1
product_count += 1
print '---%s Reviews and %s Transactions Parsed---' % (
product_count,
sentence_count
)
write_csv(PATHS['POS'], pos_output)
write_csv(PATHS['TRANSACTIONS'], transactions_output)
print 'Finished generating transactions...'
开发者ID:arpangarg,项目名称:productreviews,代码行数:42,代码来源:features.py
注：本文中的nltk.tag.stanford.POSTagger类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论