This article collects typical usage examples of the nltk.data.load function in Python. If you have been wondering exactly how the load function works, how to call it, or what real-world usage looks like, the curated code examples below should help.
Twenty code examples of the load function are shown below, ordered roughly by popularity.
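Before the examples, here is a minimal sketch of the most common call pattern. It assumes NLTK is installed and the 'punkt' model has been downloaded (for example via nltk.download('punkt')); the sample sentence is purely illustrative.

from nltk import data

# Load a pickled Punkt sentence tokenizer from the NLTK data package.
# The default "nltk:" protocol searches the local nltk_data directories.
sent_tokenizer = data.load('tokenizers/punkt/english.pickle')
print(sent_tokenizer.tokenize("NLTK ships pickled models. data.load unpickles them."))

# With format='raw', load returns the resource's raw bytes instead of unpickling it.
raw_bytes = data.load('tokenizers/punkt/english.pickle', format='raw')
print(type(raw_bytes))  # <class 'bytes'>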
Example 1: parse_tweets_set

def parse_tweets_set(filename, label, word_tokenizer=None, sent_tokenizer=None,
                     skip_header=True):
    """
    Parse a csv file containing tweets and output a list of (text, label) tuples.

    :param filename: the input csv filename.
    :param label: the label to be appended to each tweet contained in the csv file.
    :param word_tokenizer: the tokenizer instance that will be used to tokenize
        each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()).
        If no word_tokenizer is specified, tweets will not be tokenized.
    :param sent_tokenizer: the tokenizer that will be used to split each tweet into
        sentences.
    :param skip_header: if True, skip the first line of the csv file (which usually
        contains headers).

    :return: a list of (text, label) tuples.
    """
    tweets = []
    if not sent_tokenizer:
        sent_tokenizer = load('tokenizers/punkt/english.pickle')

    # If we use Python 3.x we can proceed using the 'rt' flag
    if sys.version_info[0] == 3:
        with codecs.open(filename, 'rt') as csvfile:
            reader = csv.reader(csvfile)
            if skip_header == True:
                next(reader, None)  # skip the header
            i = 0
            for tweet_id, text in reader:
                # text = text[1]
                i += 1
                sys.stdout.write('Loaded {0} tweets\r'.format(i))
                # Apply sentence and word tokenizer to text
                if word_tokenizer:
                    tweet = [w for sent in sent_tokenizer.tokenize(text)
                             for w in word_tokenizer.tokenize(sent)]
                else:
                    tweet = text
                tweets.append((tweet, label))
    # If we use Python 2.x we need to handle encoding problems
    elif sys.version_info[0] < 3:
        with codecs.open(filename) as csvfile:
            reader = csv.reader(csvfile)
            if skip_header == True:
                next(reader, None)  # skip the header
            i = 0
            for row in reader:
                unicode_row = [x.decode('utf8') for x in row]
                text = unicode_row[1]
                i += 1
                sys.stdout.write('Loaded {0} tweets\r'.format(i))
                # Apply sentence and word tokenizer to text
                if word_tokenizer:
                    tweet = [w.encode('utf8') for sent in sent_tokenizer.tokenize(text)
                             for w in word_tokenizer.tokenize(sent)]
                else:
                    tweet = text
                tweets.append((tweet, label))

    print("Loaded {0} tweets".format(i))
    return tweets

Author: DrDub, Project: nltk, Lines of code: 60, Source file: util.py
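A hypothetical usage sketch follows. The csv filename and label are made up, and the call assumes the module-level imports the function relies on (sys, codecs, csv and nltk.data.load) plus a csv whose rows are (tweet_id, text) pairs, as the loop above expects.

from nltk.tokenize import WordPunctTokenizer

# 'positive_tweets.csv' and the 'pos' label are illustrative placeholders.
tweets = parse_tweets_set('positive_tweets.csv', label='pos',
                          word_tokenizer=WordPunctTokenizer())
print(tweets[0])  # e.g. (['some', 'tokenized', 'tweet'], 'pos')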
Example 2: tag

def tag(text):
    """Tags the input text.

    Arguments:
        text (str): The text to tag.

    Returns:
        ([[(str, str)]]): List of sentences containing lists of word/tag pairs.
    """
    # Separate the input text into sentences
    sentences = nltk.sent_tokenize(str(text))

    # Separate each sentence into words
    nested = []
    for sentence in sentences:
        nested.append(nltk.word_tokenize(sentence))

    # Prepare default tagger
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    tagger = load(_POS_TAGGER)  # Same tagger as using nltk.pos_tag

    # Prepare regex tagger for custom tags
    regexp_tagger = nltk.tag.RegexpTagger([(r'\(', '('),
                                           (r'\)', ')'),
                                           (r'\[', '['),
                                           (r'\]', ']'),
                                           (r'_+', 'None')],
                                          backoff=tagger)

    # Add a part of speech tag to each word
    nested_tagged = []
    for sentence in nested:
        nested_tagged.append([TaggedToken(*x) for x in regexp_tagger.tag(sentence)])

    return nested_tagged

Author: skyschermer, Project: uweclang, Lines of code: 35, Source file: tag.py
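A rough usage sketch: the input string is invented, and the printed items are TaggedToken objects defined elsewhere in the uweclang project, so the exact output representation depends on that class.

# Brackets and runs of underscores receive the custom regexp tags; everything
# else falls back to the Treebank tagger loaded above.
for sentence in tag("Fill in the blank ____ (if you can)."):
    print(sentence)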
Example 3: __init__

def __init__(self):
    # Initializing TreeBank tokenizer from NLTK
    from nltk.tokenize import TreebankWordTokenizer
    self._tb_tokenizer = TreebankWordTokenizer().tokenize
    # Initializing Punkt Sentence Tokenizer from NLTK
    from nltk import data
    self._sent_detector = data.load('tokenizers/punkt/english.pickle')

Author: asuiu, Project: pyNLP, Lines of code: 7, Source file: tokenizers.py
Example 4: batch_pos_tag

def batch_pos_tag(sentences):
    """
    Use NLTK's currently recommended part of speech tagger to tag the
    given list of sentences, each consisting of a list of tokens.
    """
    tagger = load(_POS_TAGGER)
    return tagger.batch_tag(sentences)

Author: 0day1day, Project: golismero, Lines of code: 7, Source file: __init__.py
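This snippet targets an older NLTK: newer releases dropped the _POS_TAGGER constant and renamed batch_tag to tag_sents. A roughly equivalent call on a current install (an assumption about your NLTK version; it requires the averaged perceptron tagger model to be downloaded) would be:

from nltk.tag import pos_tag_sents

# Tag a batch of pre-tokenized sentences with the currently recommended tagger.
print(pos_tag_sents([['This', 'is', 'a', 'test', '.']]))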
Example 5: read_rule

def read_rule(self, filename):
    rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8")
    lines = rules.split("\n")
    lines = [line for line in lines if line != ""]      # remove blank lines
    lines = [line for line in lines if line[0] != "#"]  # remove comments

    # NOTE: a simple but ugly hack to make this parser happy with double '\t's
    lines = [line.replace("\t\t", "\t") for line in lines]

    # parse rules
    rules = []
    for line in lines:
        rule = []
        tokens = line.split("\t")

        # text to be searched for at the end of the string
        rule.append(tokens[0][1:-1])  # remove quotes

        # minimum stem size to perform the replacement
        rule.append(int(tokens[1]))

        # text to be replaced into
        rule.append(tokens[2][1:-1])  # remove quotes

        # exceptions to this rule
        rule.append([token[1:-1] for token in tokens[3].split(",")])

        # append to the results
        rules.append(rule)

    return rules

Author: DrDub, Project: nltk, Lines of code: 32, Source file: rslp.py
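For illustration only (the suffix, stem size and exception words below are invented, not copied from the real RSLP data files), here is a tab-separated rule line in the shape this parser expects, and how it gets decomposed:

# "<suffix>"<TAB><min stem size><TAB>"<replacement>"<TAB>"<exc1>","<exc2>"
sample = '"inho"\t3\t""\t"caminho","carinho"'
tokens = sample.split("\t")
rule = [tokens[0][1:-1], int(tokens[1]), tokens[2][1:-1],
        [token[1:-1] for token in tokens[3].split(",")]]
print(rule)  # ['inho', 3, '', ['caminho', 'carinho']]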
Example 6: run

def run(train, test, language, answer):
    results = {}

    if language == 'English':
        _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
        tagger = load(_POS_TAGGER)
    elif language == 'Spanish':
        tagger = ut(cess_esp.tagged_sents())
    elif language == 'Catalan':
        tagger = ut(cess_cat.tagged_sents())

    for lexelt in train:
        train_features, y_train = extract_features(train[lexelt], language, tagger)
        test_features, _ = extract_features(test[lexelt], language, tagger)

        X_train, X_test = vectorize(train_features, test_features)
        X_train_new, X_test_new = feature_selection(X_train, X_test, y_train)
        results[lexelt] = classify(X_train_new, X_test_new, y_train)

    """
    B1.c
    for lexelt in train:
        features = getBestWords(train[lexelt], 30)
        train_features = countFeature(features, train[lexelt])
        _, y_train = extract_features(train[lexelt], language)
        test_features = countFeature(features, test[lexelt])
        X_train, X_test = vectorize(train_features, test_features)
        results[lexelt] = classify(X_train, X_test, y_train)
    B1.c
    """

    A.print_results(results, answer)

Author: Xochitlxie, Project: EECS595-NLP, Lines of code: 31, Source file: B.py
Example 7: generate_instances

def generate_instances(self, sentences, child_conn):
    # Each process has its own NLTK PoS-tagger
    tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    instances = list()
    while True:
        try:
            s = sentences.get_nowait()
            if sentences.qsize() % 500 == 0:
                print(multiprocessing.current_process(),
                      "Instances to process", sentences.qsize())

            sentence = Sentence(s,
                                self.config.e1_type,
                                self.config.e2_type,
                                self.config.max_tokens_away,
                                self.config.min_tokens_away,
                                self.config.context_window_size,
                                tagger,
                                self.config)

            for rel in sentence.relationships:
                t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                          rel.between, rel.after, self.config)
                instances.append(t)

        except queue.Empty:
            print(multiprocessing.current_process(), "Queue is Empty")
            pid = multiprocessing.current_process().pid
            child_conn.send((pid, instances))
            break

Author: davidsbatista, Project: BREDS, Lines of code: 30, Source file: breds-parallel.py
Example 8: get_tagger

def get_tagger(lang):
    if lang == "English":
        global eng_tagger
        if eng_tagger:
            return eng_tagger
        else:
            _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
            eng_tagger = load(_POS_TAGGER)
            return eng_tagger
    elif lang == "Spanish":
        global spa_tagger
        if spa_tagger:
            return spa_tagger
        else:
            print 111
            training = cess_esp.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            spa_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            print 555
            return spa_tagger
    else:
        global cat_tagger
        if cat_tagger:
            return cat_tagger
        else:
            training = cess_cat.tagged_sents()
            default_tagger = nltk.DefaultTagger('NN')
            unigram_tagger = nltk.UnigramTagger(training, backoff=default_tagger)
            bigram_tagger = nltk.BigramTagger(training, backoff=unigram_tagger)
            cat_tagger = nltk.TrigramTagger(training, backoff=bigram_tagger)
            return cat_tagger

Author: qychen, Project: NLP_Projects_COMS4705_Columbia, Lines of code: 33, Source file: B_best.py
Example 9: test_austen

def test_austen():
    from nltk.data import load
    from nltk.corpus import gutenberg as g
    stok = load('tokenizers/punkt/english.pickle')
    train = [[w for w in tokenize(preprocess(sent))] for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    test1 = [[w for w in tokenize(preprocess(sent))] for sent in stok.tokenize(g.raw('austen-sense.txt'))]
    test2 = [[w for w in tokenize(preprocess(sent))] for sent in stok.tokenize(g.raw('austen-persuasion.txt'))]
    model1 = AdditiveSmoothing(n=2)
    model1.generate_model(train)
    print 'cross entropy additive smoothing:'
    print 'emma to sense&sensibility: %f0.8' % cross_entropy(model1, test1)
    print 'emma to persuasion: %f0.8' % cross_entropy(model1, test2)
    model2 = KnesserNey(n=2)
    model2.generate_model(train)
    print 'cross entropy knesser-ney smoothing:'
    print 'emma to sense&sensibility: %f0.8' % cross_entropy(model2, test1)
    print 'emma to persuasion: %f0.8' % cross_entropy(model2, test2)
    model3 = SimpleGoodTuring(n=2)
    model3.generate_model(train)
    print 'cross entropy simple good-turing smoothing:'
    print 'emma to sense&sensibility: %f0.8' % cross_entropy(model3, test1)
    print 'emma to persuasion: %f0.8' % cross_entropy(model3, test2)
    model4 = KatzSmoothing(n=2)
    model4.generate_model(train)
    print 'cross entropy katz smoothing:'
    print 'emma to sense&sensibility: %f0.8' % cross_entropy(model4, test1)
    print 'emma to persuasion: %f0.8' % cross_entropy(model4, test2)

Author: JoeDumoulin, Project: nlp_working, Lines of code: 29, Source file: calc_score.py
Example 10: meaning_words

def meaning_words(self, text):
    # meaning tags: nouns and adjectives only
    meaning_tags = ['NN', 'NNP', 'NNPS', 'JJ']
    default_tagger = data.load(tag._POS_TAGGER)
    ''' sometimes the nltk tagger misclassifies some parts of speech,
    such as "The", which should be a determiner. The duty tagger also helps
    to eliminate common words that are not so important
    '''
    duty = dict()
    [duty.update({w: 'x'}) for w in self.common_words]
    enchaned_tagger = tag.UnigramTagger(model=duty, backoff=default_tagger)

    meaning_words = ' '.join([w for w, c in enchaned_tagger.tag(
                             word_tokenize(text)) if c in
                             meaning_tags and (len(w) > 2)])

    '''if no meaning words are found using this approach, then
    return the whole text
    '''
    if not meaning_words:
        return None
    else:
        return meaning_words

Author: wagnersouz4, Project: miningDTH, Lines of code: 28, Source file: webpage_content.py
Example 11: digest

def digest(self):
    if self.sentences is not None:
        return

    # Digest the problem into sentences
    tokenizer = data.load("tokenizers/punkt/english.pickle")
    self.sentences = tokenizer.tokenize(self.text.strip())

    # Digest each sentence into words and part-of-speech tags
    if self.sentence_tags is None:
        sentence_tags = []
        all_tags = []
        all_words = []
        for s in self.sentences:
            all_words.append(s)
            tags = pos_tag(word_tokenize(s))
            sentence_tags.append(tags)
            for t in tags:
                l = len(t[0])
                if not self.longest_word or self.longest_word < l:
                    self.longest_word = l
                all_tags.append(t[1])
        self.sentence_tags = sentence_tags
        self.all_tags = uniq(all_tags)
        self.all_words = uniq(all_words)

Author: eric011, Project: Zoidberg, Lines of code: 25, Source file: problem.py
Example 12: _split_sentence

def _split_sentence(self, s):
    '''
    sentence splitter
    '''
    # use the French sentence tokenizer from nltk
    pst = data.load("tokenizers/punkt/french.pickle")
    return pst.tokenize(s)

Author: qiuwei, Project: datagger, Lines of code: 7, Source file: CorpusBuilder.py
Example 13: __init__

def __init__(self):
    """
    :param train_percent_size: 0-1
    :return:
    """
    _POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
    self._tagger = load(_POS_TAGGER)

Author: williamFalcon, Project: NLP_HW3, Lines of code: 8, Source file: Universal_tagger.py
Example 14: sent_tokenize

def sent_tokenize(text):
    """
    Return a sentence-tokenized copy of *text*,
    using NLTK's recommended sentence tokenizer
    (currently :class:`.PunktSentenceTokenizer`).
    """
    tokenizer = load("tokenizers/punkt/english.pickle")
    return tokenizer.tokenize(text)

Author: smartseung, Project: nltk, Lines of code: 8, Source file: __init__.py
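A simple call looks like the following; the input string is illustrative. Repeated calls stay cheap even though load appears in the function body, because data.load caches loaded resources by default.

print(sent_tokenize("NLTK is a toolkit. It ships with Punkt."))
# ['NLTK is a toolkit.', 'It ships with Punkt.']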
Example 15: load

def load(self, loc):
    '''
    :param loc: Load a pickled model at location.
    :type loc: str
    '''
    self.model.weights, self.tagdict, self.classes = load(loc)
    self.model.classes = self.classes

Author: GINK03, Project: KindleReferencedIndexScore, Lines of code: 8, Source file: perceptron.py
Example 16: __init__

def __init__(self, encoding):
    """Constructor.
    """
    super(FrenchBonsaiTokenizer, self).__init__()
    self._sentence_tokenizer = data.load('tokenizers/punkt/french.pickle')
    self._encoding = encoding

Author: Archer-W, Project: KeyBench, Lines of code: 8, Source file: french_bonsai_tokenizer.py
Example 17: solve_problem

def solve_problem(problem):
    tokenizer = load("tokenizers/punkt/english.pickle")
    sentences = tokenizer.tokenize(problem.strip())

    print "Problem input: {0}".format(problem)
    for s in get_statements(sentences):
        print "Statement: {0}".format(str(s))
        print "Solution: {0}".format(s.solve())

Author: Knewton, Project: Zoidberg, Lines of code: 9, Source file: zoidberg.py
Example 18: load_parser

def load_parser(grammar_url, trace=0,
                parser=None, chart_class=None,
                beam_size=0, **load_args):
    """
    Load a grammar from a file, and build a parser based on that grammar.
    The parser depends on the grammar format, and might also depend
    on properties of the grammar itself.

    The following grammar formats are currently supported:
      - ``'cfg'``  (CFGs: ``CFG``)
      - ``'pcfg'`` (probabilistic CFGs: ``PCFG``)
      - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``)

    :type grammar_url: str
    :param grammar_url: A URL specifying where the grammar is located.
        The default protocol is ``"nltk:"``, which searches for the file
        in the NLTK data package.
    :type trace: int
    :param trace: The level of tracing that should be used when
        parsing a text.  ``0`` will generate no tracing output;
        and higher numbers will produce more verbose tracing output.
    :param parser: The class used for parsing; should be ``ChartParser``
        or a subclass.
        If None, the class depends on the grammar format.
    :param chart_class: The class used for storing the chart;
        should be ``Chart`` or a subclass.
        Only used for CFGs and feature CFGs.
        If None, the chart class depends on the grammar format.
    :type beam_size: int
    :param beam_size: The maximum length for the parser's edge queue.
        Only used for probabilistic CFGs.
    :param load_args: Keyword parameters used when loading the grammar.
        See ``data.load`` for more information.
    """
    grammar = load(grammar_url, **load_args)
    if not isinstance(grammar, CFG):
        raise ValueError("The grammar must be a CFG, "
                         "or a subclass thereof.")
    if isinstance(grammar, PCFG):
        if parser is None:
            parser = InsideChartParser
        return parser(grammar, trace=trace, beam_size=beam_size)

    elif isinstance(grammar, FeatureGrammar):
        if parser is None:
            parser = FeatureChartParser
        if chart_class is None:
            chart_class = FeatureChart
        return parser(grammar, trace=trace, chart_class=chart_class)

    else:  # Plain CFG.
        if parser is None:
            parser = ChartParser
        if chart_class is None:
            chart_class = Chart
        return parser(grammar, trace=trace, chart_class=chart_class)

Author: DrDub, Project: nltk, Lines of code: 56, Source file: util.py
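A usage sketch with a grammar that ships in the NLTK data package follows; it assumes the book_grammars resource has been downloaded, and that your NLTK is recent enough for parse() to return an iterator over trees (older releases exposed nbest_parse() instead).

# Build a feature chart parser from an .fcfg grammar and parse one sentence.
cp = load_parser('grammars/book_grammars/feat0.fcfg', trace=1)
for tree in cp.parse('Kim likes children'.split()):
    print(tree)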
Example 19: split_sentences

def split_sentences(corpus='rbc.txt', newfile='rbc_se.txt'):
    t = load('tokenizers/punkt/russian.pickle')
    text = open('.\\crawler\\' + corpus, 'r', encoding='utf-8')
    new = open(newfile, 'w', encoding='utf-8')
    for line in text:
        s = t.tokenize(line.strip('\n'))
        for sent in s:
            new.write(sent + '\n')
    text.close()
    new.close()

Author: mannefedov, Project: Relext, Lines of code: 10, Source file: sentences.py
Example 20: update_attributes

def update_attributes(self, settingfile_input):
    searchURL = self.http + "/search"
    feature_service = "Feature Service"
    query_dict = {'f': 'json',
                  'token': self.token,
                  'q': "tags:\"" + self.utag + "\" AND owner:\"" + self.username + "\" AND type:\"" + feature_service + "\""}

    jsonResponse = sendAGOLReq(searchURL, query_dict)
    if jsonResponse['total'] == 0:
        # feature_id = jsonResponse['results'][0]['id']
        DirMGMT().lgr.error("\n.Couldn't find the service.\n")
        sys.exit()
    else:
        # jsonResponse = sendAGOLReq(searchURL, query_dict)
        feature_id = jsonResponse['results'][0]['id']

    # Update
    updateURL = agol.http + '/content/users/{}/items/{}/update'.format(agol.username, feature_id)
    sentence_break = data.load('tokenizers/punkt/english.pickle')
    temp_desc = ReadSF(settingfile_input).description
    utagloc = temp_desc.find('uTag')
    cut = temp_desc[utagloc:utagloc+42]
    temp_desc = temp_desc.replace(cut, '')
    # TODO remove tags from
    temp_tags = ReadSF(settingfile_input).tags
    # utag = temp_tags.split()[-1]
    # lutag = temp_tags.rfind(utag)-2
    # temp_tags = temp_tags[0:lutag]

    url = updateURL + "?f=json&token=" + agol.token + \
          "&type=Feature Service" \
          "&title=" + agol.serviceName.replace('_', ' ') + \
          "&tags=" + temp_tags + \
          "&snippet=" + sentence_break.tokenize(ReadSF(settingfile_input).description.strip())[0] + \
          "&description=" + temp_desc
    # "&description=" + ReadSF(settingfile_input).description.replace("\n\nuTag: "+ReadSF(settingfile_input).tags[-1], '')

    response = requests.post(url)
    itemPartJSON = json.loads(response.text)

    if "success" in itemPartJSON:
        # itemPartID = itemPartJSON['id']
        itemPartTitle = itemPartJSON['id']
        DirMGMT().lgr.info("updated Feature Layer: {}".format(itemPartTitle))
        return True
    else:
        DirMGMT().lgr.error("\n.sd file not uploaded. Check the errors and try again.\n")
        DirMGMT().lgr.error(itemPartJSON)
        sys.exit()

Author: USHUDEGIS, Project: Storefront_Publish, Lines of code: 54, Source file: update_tools.py
Note: The nltk.data.load function examples in this article were compiled by 纯净天空 from GitHub, MSDocs and other source-code and documentation hosting platforms. The snippets were selected from open-source projects contributed by many developers, and copyright of the source code remains with its original authors. Please consult each project's License before distributing or using the code; do not reproduce this compilation without permission.