This article collects typical usage examples of the nltk.data.find function in Python. If you have been wondering how exactly to use find, how to call it, or what working examples of it look like, the curated code samples below should help.
The following shows 20 code examples of the find function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code samples.
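Before diving into the examples, a quick reminder of the contract most of them rely on: nltk.data.find() searches the local NLTK data directories for a resource and returns a path pointer to it, raising LookupError when the resource has not been installed. A minimal sketch of that look-up-or-download pattern (the 'punkt' tokenizer is only an illustrative choice of resource):

from nltk.data import find
from nltk import download

try:
    # find() returns a path pointer when the resource is already installed ...
    punkt = find('tokenizers/punkt')
except LookupError:
    # ... and raises LookupError otherwise, so fetch the package and look it up again.
    download('punkt')
    punkt = find('tokenizers/punkt')

print(punkt)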
Example 1: test_corpus_bleu

def test_corpus_bleu(self):
    ref_file = find('models/wmt15_eval/ref.ru')
    hyp_file = find('models/wmt15_eval/google.ru')
    mteval_output_file = find('models/wmt15_eval/mteval-13a.output')

    # Reads the BLEU scores from the `mteval-13a.output` file.
    # The order of the list corresponds to the order of the ngrams.
    with open(mteval_output_file, 'r') as mteval_fin:
        # The numbers are located in the second-to-last line of the file.
        # The first and 2nd items in the list are the score and system names.
        mteval_bleu_scores = map(float, mteval_fin.readlines()[-2].split()[1:-1])

    with io.open(ref_file, 'r', encoding='utf8') as ref_fin:
        with io.open(hyp_file, 'r', encoding='utf8') as hyp_fin:
            # Whitespace tokenize the file.
            # Note: split() automatically strips.
            hypothesis = list(map(lambda x: x.split(), hyp_fin))
            # Note that the corpus_bleu input is a list of lists of references.
            references = list(map(lambda x: [x.split()], ref_fin))
            # Without smoothing.
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(references, hypothesis, weights=(1.0/i,)*i)
                # Check that the BLEU score difference is less than 0.005.
                # Note: This is an approximate comparison; as much as
                # +/- 0.01 BLEU might be "statistically significant",
                # the actual translation quality might not be.
                assert abs(mteval_bleu - nltk_bleu) < 0.005
            # With the same smoothing method used in mteval-v13a.pl
            chencherry = SmoothingFunction()
            for i, mteval_bleu in zip(range(1, 10), mteval_bleu_scores):
                nltk_bleu = corpus_bleu(references, hypothesis,
                                        weights=(1.0/i,)*i,
                                        smoothing_function=chencherry.method3)
                assert abs(mteval_bleu - nltk_bleu) < 0.005

Developer: DrDub, Project: nltk, Lines of code: 35, Source file: test_bleu.py
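For readers unfamiliar with the corpus_bleu call exercised by the test above, here is a minimal, self-contained sketch of its input shape; the toy sentences are made up for illustration. Each hypothesis is a list of tokens, and each entry of the first argument is a list of alternative reference token lists for the corresponding hypothesis:

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

hypotheses = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
references = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]  # one list of reference token lists per hypothesis

score = corpus_bleu(references, hypotheses,
                    weights=(0.25, 0.25, 0.25, 0.25),
                    smoothing_function=SmoothingFunction().method3)
print(score)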
Example 2: demo

def demo():
    from itertools import islice

    # zip_path = find('corpora/toolbox.zip')
    # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    file_path = find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()

    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)

    print('\nfields in sequential order:')
    for field in islice(lexicon.find('record'), 10):
        print(field.tag, field.text)

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    settings = ToolboxSettings()
    file_path = find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
    # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8'))

Developer: esabelhaus, Project: secret-octo-dubstep, Lines of code: 27, Source file: toolbox.py
Example 3: build_model

def build_model(fmt="binary"):
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]

    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data

    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print("Evaluating...")
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = "/tmp/ne_chunker_%s.pickle" % fmt
    print("Saving chunker to %s..." % outfilename)
    with open(outfilename, "wb") as out:
        pickle.dump(cp, out, -1)

    return cp

Developer: huderlem, Project: nltk, Lines of code: 35, Source file: named_entity.py
Example 4: nltk_download_corpus

def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split

    # Download the wordnet data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    ## From http://www.nltk.org/api/nltk.html ##
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    ####
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith('/'):
        resource_path = resource_path + '/'

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True

    return downloaded

Developer: jianjun66, Project: ChatterBot, Lines of code: 32, Source file: utils.py
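A hypothetical call to the helper defined above (assuming it has been imported from the project's utils module); the resource name mirrors the 'sentiment/vader_lexicon' case mentioned in the function's comments:

# Returns True only if the corpus had to be downloaded, False if it was already present.
needed_download = nltk_download_corpus('sentiment/vader_lexicon')
print(needed_download)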
Example 5: build_model

def build_model(fmt='binary'):
    print('Loading training data...')
    train_paths = [find('corpora/ace_data/ace.dev'),
                   find('corpora/ace_data/ace.heldout'),
                   find('corpora/ace_data/bbn.dev'),
                   find('corpora/ace_data/muc.dev')]
    train_trees = load_ace_data(train_paths, fmt)
    train_data = [postag_tree(t) for t in train_trees]

    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data

    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    eval_trees = load_ace_data(eval_paths, fmt)
    eval_data = [postag_tree(t) for t in eval_trees]

    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        if i < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)

    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)
    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)

    return cp

Developer: chatbotimporved, Project: chatbot, Lines of code: 32, Source file: informationextraction.py
Example 6: __init__

def __init__(self):
    from nltk.data import find
    from nltk import download

    try:
        find('wordnet.zip')
    except LookupError:
        download('wordnet')

Developer: fmoliveira, Project: ChatterBot, Lines of code: 8, Source file: word_net.py
Example 7: namedEntityRecognizer

def namedEntityRecognizer():
    echo2("Performing NER on incoming stream")
    content = request.stream.read()
    # print content
    if Verbose:
        echo2("Incoming content is " + content)

    PICKLE = "averaged_perceptron_tagger.pickle"
    AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
    tagger = PerceptronTagger(load=False)
    tagger.load(AP_MODEL_LOC)
    pos_tag = tagger.tag

    start = time.time()
    # date_time = timex.tag(content)
    tokenized = nltk.word_tokenize(content)
    tagged = pos_tag(tokenized)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    names = extract_entity_names(namedEnt, 'NE')
    # names.extend(date_time)
    result = {"result": "success", "names": names}

    if Units:
        grammar = '''unit: {<CD><NNS>?<NN.*>?},
                     unit: {<CD><JJ>?<NN.*>}
                  '''
        parser = nltk.RegexpParser(grammar)
        units = extract_entity_names(parser.parse(tagged), 'unit')
        result['units'] = units

    jsonDoc = json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
    end = time.time()
    print "NER took " + str(end - start) + " seconds"
    return jsonDoc

Developer: anirbanmishra, Project: Content_Evaluation, Lines of code: 31, Source file: NLTKRestServer.py
Example 8: _vocabulary

def _vocabulary(self):
    return (
        data.find('stemmers/porter_test/porter_vocabulary.txt')
        .open(encoding='utf-8')
        .read()
        .splitlines()
    )

Developer: DrDub, Project: nltk, Lines of code: 7, Source file: test_stem.py
Example 9: _vocabulary

def _vocabulary(self):
    with closing(
        data.find('stemmers/porter_test/porter_vocabulary.txt').open(
            encoding='utf-8'
        )
    ) as fp:
        return fp.read().splitlines()

Developer: rmalouf, Project: nltk, Lines of code: 7, Source file: test_stem.py
Example 10: demo

def demo():
    from nltk.data import find
    corpus_root = find('corpora/childes/data-xml/Eng-USA/')
    childes = CHILDESCorpusReader(corpus_root, u'.*.xml')

    # describe all corpus
    for file in childes.fileids()[:5]:
        corpus = ''
        corpus_id = ''
        for (key, value) in childes.corpus(file)[0].items():
            if key == "Corpus": corpus = value
            if key == "Id": corpus_id = value
        print 'Reading', corpus, corpus_id, ' .....'
        print "words:", childes.words(file)[:7], "..."
        print "words with replaced words:", childes.words(file, replace=True)[:7], " ..."
        print "words with pos tags:", childes.words(file, pos=True)[:7], " ..."
        print "words (only MOT):", childes.words(file, speaker='MOT')[:7], "..."
        print "words (only CHI):", childes.words(file, speaker='CHI')[:7], "..."
        print "stemmed words:", childes.words(file, stem=True)[:7], " ..."
        print "words with relations and pos-tag:", childes.words(file, relation=True)[:5], " ..."
        print "sentence:", childes.sents(file)[:2], " ..."
        for (participant, values) in childes.participants(file)[0].items():
            for (key, value) in values.items():
                print "\tparticipant", participant, key, ":", value
        print "num of sent:", len(childes.sents(file))
        print "num of morphemes:", len(childes.words(file, stem=True))
        print "age:", childes.age(file)
        print "age in month:", childes.age(file, month=True)
        print "MLU:", childes.MLU(file)
        print '\r'

Developer: johndpope, Project: jazzparser, Lines of code: 30, Source file: childes.py
Example 11: test_vocabulary_nltk_mode

def test_vocabulary_nltk_mode(self):
    self._test_against_expected_output(
        PorterStemmer.NLTK_EXTENSIONS,
        data.find('stemmers/porter_test/porter_nltk_output.txt')
        .open(encoding='utf-8')
        .read()
        .splitlines()
    )

Developer: DrDub, Project: nltk, Lines of code: 8, Source file: test_stem.py
Example 12: _get_tagger

def _get_tagger(lang=None):
    if lang == 'rus':
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = 'file:' + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    else:
        tagger = PerceptronTagger()
    return tagger

Developer: Weiming-Hu, Project: text-based-six-degree, Lines of code: 8, Source file: __init__.py
Example 13: __init__

def __init__(self):
    from nltk.data import find
    from nltk import download
    import os

    # Download the wordnet data only if it is not already downloaded
    wordnet_path = None
    if os.name == 'nt':
        wordnet_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                    'corpora', 'wordnet.zip')
    else:
        wordnet_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                    'corpora', 'wordnet.zip')

    try:
        if not os.path.isfile(wordnet_path):
            find('wordnet.zip')
    except LookupError:
        download('wordnet')

Developer: AugustoQueiroz, Project: ChatterBot, Lines of code: 18, Source file: wordnet.py
Example 14: demo

def demo(corpus_root=None):
    """
    The CHILDES corpus should be manually downloaded and saved
    to ``[NLTK_Data_Dir]/corpora/childes/``
    """
    if not corpus_root:
        from nltk.data import find
        corpus_root = find('corpora/childes/data-xml/Eng-USA/')

    try:
        childes = CHILDESCorpusReader(corpus_root, '.*.xml')
        # describe all corpus
        for file in childes.fileids()[:5]:
            corpus = ''
            corpus_id = ''
            for (key, value) in childes.corpus(file)[0].items():
                if key == "Corpus":
                    corpus = value
                if key == "Id":
                    corpus_id = value
            print('Reading', corpus, corpus_id, ' .....')
            print("words:", childes.words(file)[:7], "...")
            print(
                "words with replaced words:",
                childes.words(file, replace=True)[:7],
                " ...",
            )
            print("words with pos tags:", childes.tagged_words(file)[:7], " ...")
            print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...")
            print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...")
            print("stemmed words:", childes.words(file, stem=True)[:7], " ...")
            print(
                "words with relations and pos-tag:",
                childes.words(file, relation=True)[:5],
                " ...",
            )
            print("sentence:", childes.sents(file)[:2], " ...")
            for (participant, values) in childes.participants(file)[0].items():
                for (key, value) in values.items():
                    print("\tparticipant", participant, key, ":", value)
            print("num of sent:", len(childes.sents(file)))
            print("num of morphemes:", len(childes.words(file, stem=True)))
            print("age:", childes.age(file))
            print("age in month:", childes.age(file, month=True))
            print("MLU:", childes.MLU(file))
            print()
    except LookupError as e:
        print(
            """The CHILDES corpus, or the parts you need, should be manually
            downloaded from https://childes.talkbank.org/data-xml/ and saved at
            [NLTK_Data_Dir]/corpora/childes/
            Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.:
            demo('/path/to/childes/data-xml/Eng-USA/')
            """
        )

Developer: rmalouf, Project: nltk, Lines of code: 57, Source file: childes.py
Example 15: __init__

def __init__(self):
    from nltk.data import find
    from nltk import download
    import os

    # Download the punkt data only if it is not already downloaded
    punkt_path = None
    if os.name == 'nt':
        punkt_path = os.path.join(os.getenv('APPDATA'), 'nltk_data',
                                  'tokenizers', 'punkt.zip')
    else:
        punkt_path = os.path.join(os.path.expanduser('~'), 'nltk_data',
                                  'tokenizers', 'punkt.zip')

    try:
        if not os.path.isfile(punkt_path):
            find('punkt.zip')
    except LookupError:
        download('punkt')

Developer: AugustoQueiroz, Project: ChatterBot, Lines of code: 18, Source file: tokenizer.py
Example 16: __init__

def __init__(self, load=True):
    '''
    :param load: Load the pickled model upon instantiation.
    '''
    self.model = AveragedPerceptron()
    self.tagdict = {}
    self.classes = set()
    if load:
        AP_MODEL_LOC = 'file:' + str(find('taggers/averaged_perceptron_tagger/' + PICKLE))
        self.load(AP_MODEL_LOC)

Developer: GINK03, Project: KindleReferencedIndexScore, Lines of code: 10, Source file: perceptron.py
Example 17: _get_tagger

def _get_tagger(lang=None):
    if lang == "rus":
        tagger = PerceptronTagger(False)
        ap_russian_model_loc = "file:" + str(find(RUS_PICKLE))
        tagger.load(ap_russian_model_loc)
    elif lang == "eng":
        tagger = PerceptronTagger()
    else:
        tagger = PerceptronTagger()
    return tagger

Developer: DrDub, Project: nltk, Lines of code: 10, Source file: __init__.py
Example 18: test_vocabulary_original_mode

def test_vocabulary_original_mode(self):
    # The list of stems for this test was generated by taking the
    # Martin-blessed stemmer from
    # http://tartarus.org/martin/PorterStemmer/c.txt
    # and removing all the --DEPARTURE-- sections from it and
    # running it against Martin's test vocabulary.
    with closing(data.find('stemmers/porter_test/porter_original_output.txt').open(encoding='utf-8')) as fp:
        self._test_against_expected_output(
            PorterStemmer.ORIGINAL_ALGORITHM,
            fp.read().splitlines()
        )

    self._test_against_expected_output(
        PorterStemmer.ORIGINAL_ALGORITHM,
        data.find('stemmers/porter_test/porter_original_output.txt')
        .open(encoding='utf-8')
        .read()
        .splitlines()
    )

Developer: alpaco42, Project: ML_Spring_2018, Lines of code: 20, Source file: test_stem.py
Example 19: nltk_download_corpus

def nltk_download_corpus(resource_path):
    """
    Download the specified NLTK corpus file
    unless it has already been downloaded.

    Returns True if the corpus needed to be downloaded.
    """
    from nltk.data import find
    from nltk import download
    from os.path import split, sep
    from zipfile import BadZipfile

    # Download the NLTK data only if it is not already downloaded
    _, corpus_name = split(resource_path)

    # From http://www.nltk.org/api/nltk.html
    # When using find() to locate a directory contained in a zipfile,
    # the resource name must end with the forward slash character.
    # Otherwise, find() will not locate the directory.
    #
    # Helps when resource_path == 'sentiment/vader_lexicon'
    if not resource_path.endswith(sep):
        resource_path = resource_path + sep

    downloaded = False

    try:
        find(resource_path)
    except LookupError:
        download(corpus_name)
        downloaded = True
    except BadZipfile:
        raise BadZipfile(
            'The NLTK corpus file being opened is not a zipfile, '
            'or it has been corrupted and needs to be manually deleted.'
        )

    return downloaded

Developer: dawnpower, Project: ChatterBot, Lines of code: 38, Source file: utils.py
Example 20: _str2records

def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    path = find("corpora/chat80/%s" % filename)
    for line in path.open():
        if line.startswith(rel):
            line = re.sub(rel + r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            line = line[:-1]
            record = line.split(',')
            recs.append(record)
    return recs

Developer: approximatelylinear, Project: nltk, Lines of code: 14, Source file: chat80.py
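The object returned by find() in the example above supports open(), which is what the line-by-line loop relies on. A small sketch of that pattern, assuming the chat80 corpus is installed and using 'cities.pl' purely as an illustrative file name:

from nltk.data import find

path = find('corpora/chat80/cities.pl')  # assumed file name; adjust to an existing chat80 file
for line in path.open():
    print(line.rstrip())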
Note: The nltk.data.find examples in this article were compiled by 纯净天空 from source-code and documentation hosting platforms such as GitHub and MSDocs. The snippets are selected from open-source projects contributed by various developers; copyright in the source code remains with the original authors, and distribution and use should follow each project's License. Do not reproduce without permission.