本文整理汇总了Python中nltk.tag.StanfordNERTagger类的典型用法代码示例。如果您正苦于以下问题：Python StanfordNERTagger类的具体用法？Python StanfordNERTagger怎么用？Python StanfordNERTagger使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了StanfordNERTagger类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: pretag
def pretag(self):
    """Collect candidate entity tokens from ``self.text``.

    Every element of ``self.text`` is converted with ``str``, the pieces are
    joined with single spaces, split on whitespace and tagged with the
    Stanford 3-class NER model (hard-coded local paths).

    Side effects: sets ``self.symlist``, ``self.badlist``,
    ``self.badlist_stem`` and ``self.pretag``.  Note that assigning
    ``self.pretag`` shadows this method on the instance.
    """
    st = StanfordNERTagger(
        "/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz",
        "/Users/victorstorchan/Downloads/stanford-ner-2014-06-16/stanford-ner.jar")
    paragraphs_string = ' '.join(str(x) for x in self.text)
    tagging = st.tag(paragraphs_string.split())

    self.symlist = ['company', 'corporation', 'multinational', 'Corporation', 'open-source', 'social', 'network', 'software', 'system']
    self.badlist = ['integrated', 'first', 'check', 'computer', 'linear', 'solution', 'services', 'limited', 'tech', 'solutions', 'technology', 'open', 'model', 'on', 'applied', 'network', 'pricing', 'customers', 'social', 'big', 'subscribe', 'social', 'sign', 'monitor', 'software', 'machine', 'learning', 'compute', 'management', 'up']
    self.badlist_stem = [stemmer.stem(word) for word in self.badlist]

    # Built once instead of once per token inside the comprehension.
    entity_labels = frozenset(("ORGANIZATION", "PERSON"))

    # Tokens labelled as an organisation/person, or short tokens (< 11 chars)
    # for which count_upper() reports at least 2.
    pretag1 = [token for (token, label) in tagging
               if label in entity_labels or (count_upper(token) >= 2 and len(token) < 11)]
    # Tokens found in the module-level lookup tables.
    pretag2 = [token for (token, label) in tagging
               if token.lower() in dict_1m or token in dict_apps]
    pretag3 = [token for (token, label) in tagging if token.lower() in dict_tech]

    self.pretag = pretag1 + pretag2 + pretag3
开发者ID:victorstorchan,项目名称:NER,代码行数:25,代码来源:extract_named_entities.py
示例2: test_model_in_mem
def test_model_in_mem(stanford_ner_path, model_name, sent_obj, type):
    """Tag ``sent_obj.sentence`` and store (token, label, start, end) tuples.

    :param stanford_ner_path: path to the Stanford NER jar
    :param model_name: path to the CRF model file
    :param sent_obj: object exposing ``sentence`` (text) and the dict
        ``tok_sent_with_crf_predicted_attribs``
    :param type: key under which the result is stored on ``sent_obj``
    :return: ``sent_obj`` itself, after being updated in place
    """
    stanford_tagger = StanfordNERTagger(
        model_name,
        stanford_ner_path,
        encoding='utf-8')
    text = sent_obj.sentence
    tokenized_text = []
    spans = []
    # Whitespace-delimited tokens; the span covers the full match, while the
    # token handed to the tagger has trailing ",.;:" removed.
    for match in re.finditer(r"\S+", text):
        tokenized_text.append(match.group(0).rstrip(",.;:"))
        spans.append((match.start(), match.end()))
    tokenized_text = strip_sec_headers_tokenized_text(tokenized_text)
    classified_text = stanford_tagger.tag(tokenized_text)
    # Headers may have been stripped in the previous step, so the tagged
    # tokens can be fewer than the spans; account for that offset.
    len_diff = len(spans) - len(classified_text)
    final_class_and_span = []
    for idx, (token, label) in enumerate(classified_text):
        start, end = spans[idx + len_diff]
        final_class_and_span.append((token, label, start, end))
    sent_obj.tok_sent_with_crf_predicted_attribs[type] = final_class_and_span
    return sent_obj
开发者ID:abbottLane,项目名称:substance_abuse_extractor,代码行数:29,代码来源:EntityExtractor.py
示例3: extract_named_entities
def extract_named_entities(threadName, output_collection, fetchedTweets):
    """Annotate tweets with named entities and write them out in batches.

    For each tweet, ``tweet['cleaned_text']`` is split on whitespace and
    tagged; the (token, label) pairs labelled PERSON, ORGANIZATION or
    LOCATION are stored under ``tweet['named_entities']``.  Tweets are handed
    to ``write_mongo`` every 100 items, plus once for any remainder.

    :param threadName: label used in log messages and passed to ``write_mongo``
    :param output_collection: passed through to ``write_mongo``
    :param fetchedTweets: iterable of dict-like tweets
    """
    st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    wanted_labels = ('PERSON', 'ORGANIZATION', 'LOCATION')
    try:
        mongo_list = []
        for counter, fetchedTweet in enumerate(fetchedTweets, 1):
            tagged = st.tag(fetchedTweet['cleaned_text'].split())
            fetchedTweet['named_entities'] = [
                (ne[0], ne[1]) for ne in tagged if ne[1] in wanted_labels]
            mongo_list.append(fetchedTweet)
            if counter % 100 == 0:
                logging.info("{}: Tweets processed: {} tweets".format(threadName, counter))
                write_mongo(threadName, output_collection, mongo_list)
                mongo_list = []
        if mongo_list:
            write_mongo(threadName, output_collection, mongo_list)
    except Exception as e:  # "as" form is valid on Python 2.6+ and Python 3
        # Any failure is printed and then ends the caller via sys.exit().
        print(e)
        sys.exit()
开发者ID:anammari,项目名称:optimum_repo,代码行数:26,代码来源:NerWMTweetsMongoIntraDaysMTv3.py
示例4: get_location
def get_location(loc):
    """Return the location mentioned in ``loc``, or ``None`` if there is none.

    ``loc`` is passed straight to ``StanfordNERTagger.tag`` (presumably a
    list of tokens -- confirm against the caller).  All tokens labelled
    'LOCATION' are glued back together with single spaces, so a multi-word
    name such as 'New York' comes back as one string.

    The model and jar paths are relative to the current working directory.
    English model: english.muc.7class.distsim.crf.ser.gz
    German models: german.dewac_175m_600.crf.ser.gz,
                   german.hgc_175m_600.crf.ser.gz
    """
    # Named Entity Recognizer: assigns types such as location, person or
    # organization to each token.
    st = StanfordNERTagger('stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz',
                           'stanford-ner-2015-12-09/stanford-ner-3.6.0.jar')
    loc_ner = st.tag(loc)
    # The English models label locations 'LOCATION'; the German ones use
    # 'I-LOC'.  Compare against the label only, so that a token that is
    # literally spelled "LOCATION" is not mistaken for a location.
    location_tokens = [token for token, label in loc_ner if label == 'LOCATION']
    if not location_tokens:
        # No location was specified.
        return None
    return ' '.join(location_tokens)
开发者ID:phucdev,项目名称:weatherbot,代码行数:29,代码来源:extractor.py
示例5: ner
def ner():
    """Print the NER tags for every entry of the module-level ``content``.

    The Stanford-related environment variables are pointed at a hard-coded
    local installation before the tagger is created.
    """
    ner_jar = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/stanford-ner.jar'
    os.environ['STANFORD_NER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer'
    os.environ['STANFORD_POSTAGGER_PATH'] = '/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanford-postagger-full-2014-08-27'
    os.environ['CLASSPATH'] = ner_jar
    # The POS-tagger variable deliberately mirrors CLASSPATH.
    os.environ['STANFORD_POSTAGGER'] = ner_jar

    tagger = StanfordNERTagger('/Users/CHOON/Desktop/choon94.github.io/week5/nlp2/stanfordNer/classifiers/english.all.3class.distsim.crf.ser.gz')
    for entry in content:
        tokens = entry.split()
        print(tagger.tag(tokens))
开发者ID:choon94,项目名称:choon94.github.io,代码行数:9,代码来源:newsTest.py
示例6: getEntityCount
def getEntityCount(tweet):
    """Return how many tokens of ``tweet`` carry a label containing "PERSON".

    The tweet is tokenized with ``word_tokenize`` and tagged with the
    Stanford 3-class model.
    """
    tagger = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
    tagged_tokens = tagger.tag(word_tokenize(tweet))
    return sum(1 for tagged in tagged_tokens if "PERSON" in tagged[1])
开发者ID:RohithEngu,项目名称:Opinion-Summarizer,代码行数:11,代码来源:Attributes.py
示例7: NERTagging
def NERTagging(text):
    """Tag ``text`` with the Stanford 3-class model, then log and print it.

    The tagged tokens are appended to ``Dump/log/Main_output.txt`` and
    printed to stdout.  Returns ``None``.
    """
    # The context manager closes the log even when tagging raises.
    with open("Dump/log/Main_output.txt", "a") as log_file:
        st = StanfordNERTagger('resources/ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                               'resources/ner/stanford-ner.jar',
                               encoding='utf-8')
        classified_text = st.tag(word_tokenize(text))
        log_file.write('NER \n %s \n' % classified_text)
        print(classified_text)
开发者ID:MoizRauf,项目名称:OQuant_Wiki_Clustering,代码行数:11,代码来源:NLPHelper.py
示例8: nltk_ner
def nltk_ner(remainders):
# Looks for PERSON tokens in the strings of `remainders` using the Stanford
# 3-class model and returns a tuple (found, name, item):
#   found - True when `name` is non-empty, otherwise False
#   name  - the title-cased PERSON tokens, each followed by one space
#   item  - the string that was examined
# NOTE(review): indentation was lost in this excerpt, so it cannot be told
# whether the final if/else sits inside the `for item` loop (returning after
# the first item) or after it (returning the result for the last item) --
# confirm against the upstream repository before restructuring.
# Model and jar paths are relative to the current working directory.
st = StanfordNERTagger('../stanford-ner/english.all.3class.distsim.crf.ser.gz', '../stanford-ner/stanford-ner.jar')
for item in remainders:
# Accumulates the PERSON tokens found in this item.
name = ""
tagged = st.tag(item.split())
for entity in tagged:
# entity is indexed as (token, label).
if entity[1] == u'PERSON':
name += (entity[0].title() + ' ')
if name:
return True, name, item
else:
return False, name, item
开发者ID:mwcurry,项目名称:tracker,代码行数:12,代码来源:parser.py
示例9: trial1
def trial1():
    """Sanity check: tag one hard-coded sentence and print the result.

    Uses the model and jar at the hard-coded local paths below.

    :return: None
    """
    tagger = StanfordNERTagger(
        '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/annotated-cities-model.ser.gz',
        '/Users/mayankkejriwal/ubuntu-vm-stuff/home/mayankkejriwal/tmp/stanford-ner-2015-12-09/stanford-ner.jar',
        encoding='utf-8')
    sample = 'While in France, Mrs. Christine Lagarde discussed short-term stimulus efforts in a recent interview with the Wall Street Journal.'
    tokens = word_tokenize(sample)
    print(tagger.tag(tokens))
开发者ID:mayankkejriwal,项目名称:pycharm-projects-ubuntu,代码行数:15,代码来源:StanfordNER.py
示例10: get_namedentities
def get_namedentities(text):
    """Return the named-entity tokens of ``text`` found by StanfordNERTagger.

    The text is lower-cased and split on whitespace before tagging (the
    model file used is the caseless one).  Every token whose label is not
    'O' is passed through ``remove_punctuation``; results that come back
    empty are dropped.
    """
    tagger = StanfordNERTagger('utils/english.conll.4class.caseless.distsim.crf.ser.gz', 'utils/stanford-ner.jar')
    tagged = tagger.tag(text.lower().split())
    cleaned = [remove_punctuation(pair[0]) for pair in tagged if pair[1] != 'O']
    return [entity for entity in cleaned if entity]
开发者ID:veryluckyxyz,项目名称:keywordfinder,代码行数:15,代码来源:features.py
示例11: classify_text
def classify_text(text):
    """Tag every token of ``text`` with the 3-class Stanford NER model.

    Each word is classified as PERSON, LOCATION, ORGANIZATION or O (other).
    Returns the tagger's output for the ``word_tokenize``-d text.
    """
    base_dir = "C:/Users/liabbott/Documents/Projects/CBP OIT/stanford_ner/"
    model_path = os.path.normpath(
        os.path.join(base_dir, "classifiers/english.all.3class.distsim.crf.ser.gz"))
    jar_path = os.path.normpath(os.path.join(base_dir, "stanford-ner.jar"))
    tagger = StanfordNERTagger(model_path, jar_path, encoding='utf-8')
    return tagger.tag(word_tokenize(text))
开发者ID:liameabbott,项目名称:named_entity_recognition,代码行数:16,代码来源:namedEntityRecognition.py
示例12: __init__
def __init__(self, use_stanford=False, NER_model=None, NER_tagger=None, POS_model=None, POS_tagger=None):
    """Choose the POS and NER tagging callables.

    :param use_stanford: boolean, whether to use Stanford NER and POS tagging
    :param NER_model: NER model path
    :param NER_tagger: NER tagger path
    :param POS_model: POS model path
    :param POS_tagger: POS tagger path

    With ``use_stanford`` false, ``nltk.pos_tag`` and ``nltk.ne_chunk`` are
    used.  Otherwise all four paths are required and the process exits via
    ``sys.exit`` when any of them is ``None``.
    """
    self.NER_model = NER_model
    self.NER_tagger = NER_tagger
    self.POS_model = POS_model
    self.POS_tagger = POS_tagger
    self.use_stanford = use_stanford

    if not use_stanford:
        self.post = nltk.pos_tag
        self.nert = nltk.ne_chunk
        return

    required_paths = (NER_model, NER_tagger, POS_model, POS_tagger)
    if any(path is None for path in required_paths):
        sys.exit("tagging initialization: Stanford models and taggers" " have to be provided!")

    self.post = StanfordPOSTagger(self.POS_model, self.POS_tagger).tag
    self.nert = StanfordNERTagger(self.NER_model, self.NER_tagger).tag
开发者ID:CDIPSDataScience2016,项目名称:AmazonTrend,代码行数:25,代码来源:NLP_tagging.py
示例13: stanford_entities
def stanford_entities(model, jar, fileids=None, corpus=kddcorpus, section = None):
    """Extract entities per file using the Stanford NER tagger.

    :param model: path to the tagging model, as downloaded from the
        Stanford Core NLP website
    :param jar: path to the Stanford NER jar
    :param fileids: file ids to process; defaults to ``corpus.fileids()``
    :param corpus: corpus supplying ``fileids()`` and ``words(fileid)``
    :param section: when given, only that section (via ``sectpull``) is
        tokenized and tagged instead of the whole file
    :return: ``results[fileid][key]`` -> list of entity strings, where key
        is 'locations' for LOCATION entities and 'other' for everything else

    Consecutive tokens sharing the same non-'O' tag form one entity.  An
    entity is closed when the tag changes, on an 'O' token, or at the end of
    the text, so a trailing entity is no longer dropped and adjacent entities
    of different types are no longer merged.
    """
    results = defaultdict(lambda: defaultdict(list))
    fileids = fileids or corpus.fileids()
    tagger = StanfordNERTagger(model, jar)

    def flush(fileid, chunk):
        """Store one finished entity chunk under its result key."""
        if not chunk:
            return
        etext = " ".join(token for token, _ in chunk)
        key = 'locations' if chunk[0][1] == 'LOCATION' else 'other'
        results[fileid][key].append(etext)

    for fileid in fileids:
        if section is not None:
            text = nltk.word_tokenize(list(sectpull([fileid], section=section))[0][1])
        else:
            text = corpus.words(fileid)

        chunk = []
        for token, tag in tagger.tag(text):
            # A chunk only ever holds non-'O' tokens, so this also fires on 'O'.
            if chunk and tag != chunk[-1][1]:
                flush(fileid, chunk)
                chunk = []
            if tag != 'O':
                chunk.append((token, tag))
        # Flush an entity that runs up to the end of the text.
        flush(fileid, chunk)
    return results
开发者ID:goldin2008,项目名称:Research_in_NLP,代码行数:47,代码来源:extract_NER.py
示例14: main
def main():
    """Demo: NER-tag and parse one hard-coded sentence, printing both results.

    The function returns right after printing; the statements after the
    ``return`` are unfinished work in progress and never run.
    """
    parser = StanfordParser(path_to_jar=script_wrapper.stanford_parser_jar, path_to_models_jar=script_wrapper.stanford_model_jar)
    st = StanfordNERTagger(model_filename='../lib/stanford-ner-2015-12-09/classifiers/english.all.3class.distsim.crf.ser.gz', path_to_jar="../lib/stanford-ner-2015-12-09/stanford-ner-3.6.0.jar")
    raw_sent = "Dempsey was drafted by Major League Soccer club New England Revolution."
    sent = word_tokenize(raw_sent)
    # An interface for already-tokenized sentences is still needed, see
    # http://nlp.stanford.edu/software/crf-faq.shtml#tokenized
    # NOTE(review): cur_tag does not look like a standard StanfordNERTagger
    # method -- presumably a local patch; confirm.
    ne_tuple = st.cur_tag(sent)
    # print() and next() work on both Python 2 and Python 3.
    print(ne_tuple)
    print(next(parser.raw_parse(raw_sent)))
    return

    # NOTE: everything below is unreachable because of the return above.
    # find name entity: collect the first run of consecutive PERSON tokens
    f = 0
    ne_list = []
    for (ne, label) in ne_tuple:
        if label == 'PERSON':
            f = 1
        if f and label != 'PERSON':
            break
        if f:
            ne_list.append(ne)
    # print ne_list
    init_file(main_tree)
    # TODO: 1. don't know how to get the NP. 2. is there a quicker way to
    # find PERSON?
    # try head to ask who/what
    pattern = "S < NP=np"
    head = check_output(['bash',  # tregex has to be launched through bash
                         tregex_path,
                         '-s',
                         pattern,
                         init_tree_file])
    print(head)
def get_main_verbs(tree):
    """Return the main verbs matched by tregex as a list of ``Tree`` objects.

    tregex is run through bash on ``init_tree_file`` with a pattern that
    selects VB* nodes below a VP chain under the root sentence; its output is
    printed and every output line is parsed with ``Tree.fromstring``.

    NOTE(review): the ``tree`` argument is not used in this function; the
    input is read from ``init_tree_file`` -- confirm that this is intended.
    """
    pattern = '/(VB.?)/=main >+ (VP) (S > ROOT)'
    # universal_newlines=True makes check_output return text rather than
    # bytes on Python 3, so the split('\n') below keeps working there.
    main_verbs = check_output(['bash',  # tregex has to be launched through bash
                               tregex_path,
                               '-s',
                               pattern,
                               init_tree_file],
                              universal_newlines=True)
    print(main_verbs)
    # Drop the last piece of the split (empty when the output ends in a newline).
    main_verbs = main_verbs.split('\n')[:-1]
    return [Tree.fromstring(main_verb) for main_verb in main_verbs]
开发者ID:DerrickZhu1,项目名称:11611teamproject-YenYuan-,代码行数:45,代码来源:2.py
示例15: html_ner
def html_ner(content):
    """Extract named entities from the visible text of an HTML document.

    ``<script>``, ``<style>`` and ``<sup>`` elements are removed, every
    remaining stripped string is tokenized with ``wordpunct_tokenize`` and
    tagged, and consecutive tokens sharing a non-'O' tag are joined.

    :param content: HTML markup
    :return: list of ``(tag, entity)`` pairs; ``entity`` is the UTF-8
        encoded, stripped text of the joined tokens
    """
    st = StanfordNERTagger(
        './lib/classifiers/english.all.3class.distsim.crf.ser.gz',
        './lib/stanford-ner-3.5.2.jar')
    soup = BeautifulSoup(content, "html.parser")
    for element in soup(["script", "style", "sup"]):
        element.extract()
    tokenised_words = [wordpunct_tokenize(text) for text in soup.stripped_strings]
    # Tag all strings in one tag_sents() call: tag() starts a separate Java
    # process per call, which is very slow with one call per text node.
    tagged_sents = st.tag_sents(tokenised_words) if tokenised_words else []
    result = []
    for sent in tagged_sents:
        for tag, chunk in groupby(sent, lambda x: x[1]):
            if tag != 'O':
                result.append((tag, ' '.join(w for w, t in chunk).encode('utf-8').strip()))
    return result
开发者ID:Sinderella,项目名称:OSINT,代码行数:18,代码来源:ners.py
示例16: __init__
def __init__(self, language="en"):
    """Create the Stanford NER tagger and the named-entities finder.

    The jar and the English 3-class classifier are looked up under
    ``<current directory attribute>/dist``.

    :param language: forwarded to ``NERFinder``
    """
    from nltk.tag import StanfordNERTagger

    base_dir = self.__currentDirectory
    self.__stanfordJar = "%s/dist/stanford-ner.jar" % base_dir
    self.__classifier = "%s/dist/classifiers/english.all.3class.distsim.crf.ser.gz" % (base_dir,)
    self.__tagger = StanfordNERTagger(
        self.__classifier,
        self.__stanfordJar,
        encoding="utf-8",
    )
    self.__namedEntitiesFinder = NERFinder(language=language)
开发者ID:domenicosolazzo,项目名称:jroc,代码行数:9,代码来源:StanfordTagger.py
示例17: __init__
def __init__(self, model_num):
    """Create ``self.tagger`` for the 3-, 4- or 7-class Stanford model.

    :param model_num: 3, 4 or 7, selecting the matching ``config`` path
    :raises Exception: when ``model_num`` is none of the supported values
    """
    known_models = (
        (3, 'STANFORD_3CLASS'),
        (4, 'STANFORD_4CLASS'),
        (7, 'STANFORD_7CLASS'),
    )
    for class_count, setting_name in known_models:
        if model_num == class_count:
            # Only the selected setting is read from config.
            pathname = getattr(config, setting_name)
            break
    else:
        raise Exception('No model for:', model_num)
    self.tagger = StanfordNERTagger(pathname, config.STANFORD_NER_JAR)
开发者ID:aadah,项目名称:nlp_proj,代码行数:11,代码来源:stanford.py
示例18: sanitize_result
def sanitize_result(self, text):
# Tags `text` (after self.capitalize_first_letter and word_tokenize) with the
# Stanford 3-class model and returns the joined text of a PERSON entity.
# NOTE(review): indentation was lost in this excerpt, so it cannot be told
# whether `return name` sits inside the `if` (first PERSON group wins) or at
# function level (last PERSON group wins, and `name` would be unbound when
# there is no PERSON at all) -- confirm against the upstream repository.
# NOTE(review): the Windows paths are non-raw string literals containing
# backslashes; none of them forms a recognized escape, so the values are as
# written, but raw strings (r'...') would avoid invalid-escape warnings.
st = StanfordNERTagger('C:\Python27\stanford_ner\classifiers\english.all.3class.distsim.crf.ser.gz',
'C:\Python27\stanford_ner\stanford-ner.jar',
encoding='utf-8')
tokenized_text = word_tokenize(self.capitalize_first_letter(text))
classified_text = st.tag(tokenized_text)
# Presumably groups consecutive tokens that belong to one entity -- verify
# against get_continuous_chunks.
named_entities = self.get_continuous_chunks(classified_text)
# Entity texts only; not used again in the lines visible here.
named_entities_str = [" ".join([token for token, tag in ne]) for ne in named_entities]
# (entity text, tag of the entity's first token) pairs.
named_entities_str_tag = [(" ".join([token for token, tag in ne]), ne[0][1]) for ne in named_entities]
# Group consecutive entities by their tag.
for tag, chunk in groupby(named_entities_str_tag, lambda x:x[1]):
if tag == "PERSON":
#print "%-12s"%tag, " ".join(w for w, t in chunk)
name = " ".join(w for w, t in chunk)
return name
开发者ID:tseg,项目名称:online_img_search,代码行数:20,代码来源:stanford.py
示例19: init_ner_mapper
def init_ner_mapper(self):
    """Load the German Stanford NER tagger into ``self.tagger``."""
    # Paths as used on earkdev.  An alternative layout, kept for reference:
    #   /opt/Projects/nlp/stanford-ner-2015-04-20/classifiers/german/german.hgc_175m_600.crf.ser.gz
    #   /opt/Projects/nlp/stanford-ner-2015-04-20/stanford-ner.jar
    tagger_options = {
        "encoding": "utf-8",
        "java_options": '-mx4096m',
    }
    self.tagger = StanfordNERTagger(
        "/home/janrn/ner/german.hgc_175m_600.crf.ser.gz",
        "/home/janrn/ner/stanford-ner.jar",
        **tagger_options
    )
开发者ID:eark-project,项目名称:dm-nlp,代码行数:13,代码来源:mr_ner.py
示例20: main
def main():
    """Tag a resume text file and print the named entities found in it.

    Steps: read the hard-coded input file, clean it with ``cleanse_data``,
    tag the whitespace-split tokens, dump every tagged item to
    ``ner_temp.txt`` (one per line), build a tree with ``stanfordNE2tree``
    and print the tree followed by the list of (entity text, label) pairs.
    """
    # Load the Stanford NER tagger (CoNLL 4-class model).
    st = StanfordNERTagger(
        "/home/viswanath/Downloads/stanford-ner-2014-08-27/classifiers/english.conll.4class.distsim.crf.ser.gz",
        "/home/viswanath/Downloads/stanford-ner-2014-08-27/stanford-ner.jar",
        encoding="utf-8",
    )
    fname = "/home/viswanath/data/resume/test_data/01.txt"
    # Context managers make sure both files are closed, also on errors.
    with open(fname, "r") as fp:
        text = fp.read()
    lstemp = cleanse_data(text)
    list_ner_out = st.tag(lstemp.split())

    with open("ner_temp.txt", "w") as fp:
        for item in list_ner_out:
            fp.write("{0}\n".format(item))

    ne_tree = stanfordNE2tree(list_ner_out)
    print(ne_tree)

    ne_in_sent = []
    for subtree in ne_tree:
        # Only subtrees are entity chunks, i.e. NE != "O".
        if isinstance(subtree, Tree):
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    print(ne_in_sent)
开发者ID:sagar3LOQ,项目名称:text_extractor,代码行数:39,代码来源:standfor_NER_tagger.py
注:本文中的nltk.tag.StanfordNERTagger类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论