This article collects typical usage examples of the Python function nltk.corpus.gutenberg.fileids. If you have been struggling with questions such as: what exactly does fileids do, and how is it used in practice? Then congratulations — the hand-picked code examples here may be just what you need.
Below are 20 code examples of the fileids function, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help our system recommend better Python code samples.
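As a quick orientation before the examples, here is a minimal sketch of what fileids gives you (assuming the Gutenberg corpus data has already been downloaded, e.g. via nltk.download('gutenberg')):

from nltk.corpus import gutenberg

# fileids() lists the plain-text files bundled with the Gutenberg corpus reader.
print(gutenberg.fileids())
# e.g. ['austen-emma.txt', 'austen-persuasion.txt', ..., 'whitman-leaves.txt']

# Most reader methods (raw, words, sents) take a fileid to select a single text.
print(len(gutenberg.words('austen-emma.txt')))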
Example 1: exercise_gutenberg

import nltk
from nltk.corpus import gutenberg

def exercise_gutenberg():
    # Print the list of files in the Gutenberg corpus
    print(gutenberg.fileids())
    # Pick one text: Jane Austen's "Emma"
    emma = gutenberg.words("austen-emma.txt")
    # Check the length of the book
    print(len(emma))
    # Wrap the tokens in an nltk.Text to get convenience methods
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)
        # Total number of characters in the file
        num_chars = len(chars_list)
        # Total number of words in the file
        num_words = len(words_list)
        # Total number of sentences in the file
        num_sents = len(sents_list)
        # Number of distinct words in the file
        num_vocab = len(set(w.lower() for w in words_list))
        # Print average word length, average sentence length,
        # average occurrences per word, and the file name
        print(num_chars // num_words, num_words // num_sents,
              num_words // num_vocab, file_id)

Author: BurnellLiu | Project: LiuProject | Lines: 29 | Source: chapter_02.py
Example 2: fun01

from nltk.corpus import gutenberg
from nltk.text import Text

def fun01():
    """fun01"""
    print(gutenberg.fileids())
    # "Emma" by Jane Austen
    emma = gutenberg.words('austen-emma.txt')
    # How many words it contains
    print(len(emma))
    # concordance() prints its matches itself and returns None,
    # so it should not be wrapped in print()
    Text(emma).concordance("surprize")

Author: gree2 | Project: hobby | Lines: 8 | Source: ch02.py
Example 3: handle

import os
from nltk.corpus import gutenberg

def handle(self, *args, **options):
    # CORPUS_DIR is a constant defined elsewhere in the project
    for fileid in gutenberg.fileids():
        out_dir = os.path.join(CORPUS_DIR, fileid.replace(".txt", ""))
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        # Dump the raw text of each Gutenberg file into its own directory
        with open(os.path.join(out_dir, "sentences.txt"), 'w') as f:
            f.write(gutenberg.raw(fileid))

Author: hashx101 | Project: wordseerbackend_python | Lines: 8 | Source: create_collection.py
Example 4: gutenberg

def gutenberg():
    # Importing inside the function avoids a clash with the function's own name
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set(w.lower() for w in gutenberg.words(t)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), t)

Author: kwdhd | Project: nlp | Lines: 8 | Source: main.py
Example 5: gutenberg

def gutenberg():
    # Importing inside the function avoids a clash with the function's own name
    from nltk.corpus import gutenberg
    emma = gutenberg.words('austen-emma.txt')
    print(len(emma))
    print(gutenberg.fileids())
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    print(macbeth_sentences[1037])
    # Find the longest sentence(s) in Macbeth
    longest_len = max(len(s) for s in macbeth_sentences)
    print([s for s in macbeth_sentences if len(s) == longest_len])
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)

Author: AkiraKane | Project: Python | Lines: 19 | Source: c02_text_corpora.py
Example 6: similarity_gutenberg

from nltk.corpus import gutenberg

def similarity_gutenberg():
    # Jaccard and n_window are helper functions defined elsewhere in the source
    for x in range(2, 6):
        a = []
        b = 0
        c = 0
        d = 1
        # Pairwise Jaccard similarity over character n-grams of all texts
        for fid in gutenberg.fileids():
            a.append([])
            for ffid in gutenberg.fileids():
                a[b].append(Jaccard(n_window(gutenberg.raw(fid), x),
                                    n_window(gutenberg.raw(ffid), x)))
            b += 1
        # Accumulate the mean and the minimum of the similarity matrix
        for i in range(len(a)):
            for j in range(len(a)):
                c += a[i][j] / (len(a) * len(a))
                d = min(d, a[i][j])
        print("Media: " + str(c))    # mean
        print("Minimo: " + str(d))   # minimum

Author: gabrielsqsf | Project: nltkfun | Lines: 19 | Source: mineracao.py
Example 7: page57

def page57():
    """Statistics from the Gutenberg corpus."""
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)

Author: andreoliwa | Project: nlp-book | Lines: 11 | Source: book_examples.py
Example 8: for_print

from nltk.corpus import gutenberg

def for_print():
    """Print three statistics for each text."""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)

Author: Paul-Lin | Project: misc | Lines: 11 | Source: toturial.py
Example 9: create_model_from_NLTK

from os.path import isfile

def create_model_from_NLTK():
    # create_model is defined elsewhere in the source
    filepath = "nltkcorpus.txt"
    if isfile(filepath):
        return create_model(filepath=filepath, save=False)
    else:
        from nltk.corpus import reuters, brown, gutenberg
        # Combine sentences from the Reuters, Brown and Gutenberg corpora
        sents = reuters.sents() + brown.sents()
        for gsents in [gutenberg.sents(fid) for fid in gutenberg.fileids()]:
            sents += gsents
        return create_model(sentences=sents, savename=filepath)

Author: ieaalto | Project: CCProject | Lines: 11 | Source: semantics.py
Example 10: fun02

from nltk.corpus import gutenberg

def fun02():
    """fun02"""
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        # Average word length, average sentence length,
        # and number of times each vocabulary item appears in the text
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)

Author: gree2 | Project: hobby | Lines: 11 | Source: ch02.py
Example 11: solve_p2_greedy

from nltk.corpus import gutenberg

def solve_p2_greedy(file):
    # slice, NgramLetterCorpus, unshred3 and linearize are helpers defined
    # elsewhere in the source (note that slice shadows the Python builtin)
    lines = [l.lower().split("|")[1:-1] for l in open(file)]
    slices = slice(lines)
    n = 3
    corpus = NgramLetterCorpus(n)
    # Train the letter n-gram model on the first three Gutenberg texts only
    for fileid in gutenberg.fileids()[:3]:
        corpus.update(gutenberg.raw(fileid))
    slices = unshred3(slices, corpus)
    print("FINAL: ")
    for l in linearize(slices):
        print("".join(l))

Author: indraastra | Project: puzzles | Lines: 13 | Source: solve.py
Example 12: train

from nltk import FreqDist, ConditionalFreqDist, bigrams
from nltk.corpus import gutenberg

def train(self):
    # wordRE is a word-matching regex defined elsewhere in the source
    self.vocabulary = set()
    this_bigrams = []
    self.unigrams = FreqDist([])
    for fileid in gutenberg.fileids():
        for sentence in gutenberg.sents(fileid):
            # Keep word-like tokens and pad each sentence with boundary markers
            words = ["<s>"] + [x.lower() for x in sentence if wordRE.search(x)] + ["</s>"]
            this_bigrams += bigrams(words)
            self.vocabulary.update(words)
            self.unigrams.update(words)
    self.bigrams = ConditionalFreqDist(this_bigrams)
    self.V = len(self.vocabulary)

Author: slee17 | Project: NLP | Lines: 14 | Source: LanguageModel.py
Example 13: benchmark_sbd

import re
import numpy as np
from nltk.corpus import gutenberg

def benchmark_sbd():
    # split_text and get_prf are defined elsewhere in the source
    ps = []
    rs = []
    f1s = []
    c = 0
    for fileid in gutenberg.fileids():
        c += 1
        copy_sents_gold = gutenberg.sents(fileid)
        sents_gold = [s for s in copy_sents_gold]
        # Keep only alphabetic tokens in the gold sentences
        for sent_i in range(len(sents_gold)):
            new_sent = [w for w in sents_gold[sent_i] if w.isalpha()]
            sents_gold[sent_i] = new_sent
        text = gutenberg.raw(fileid)
        sents_obtained = split_text(text)
        copy_sents_obtained = sents_obtained.copy()
        # Tokenize the obtained sentences, again keeping alphabetic tokens only
        for sent_i in range(len(sents_obtained)):
            new_sent = [w.group()
                        for w in re.finditer(r'\w+', sents_obtained[sent_i])
                        if w.group().isalpha()]
            sents_obtained[sent_i] = new_sent
        # Count sentences that match the gold segmentation exactly
        c_common = 0
        for sent in sents_obtained:
            if sent in sents_gold:
                c_common += 1
        p, r, f1 = get_prf(c_common, len(sents_obtained), len(sents_gold))
        print('\n\n', fileid)
        print('Precision: {:0.2f}, Recall: {:0.2f}, F1: {:0.2f}'.format(p, r, f1))
        ps.append(p)
        rs.append(r)
        f1s.append(f1)
    print('\n\nPrecision stats: {:0.3f} +- {:0.4f}'.format(np.mean(ps), np.std(ps)))
    print('Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(rs), np.std(rs)))
    print('F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(f1s), np.std(f1s)))
    print(len(f1s))
    # Restrict the statistics to texts where each score is at least 0.8
    good_ps = [p for p in ps if p >= 0.8]
    good_rs = [r for r in rs if r >= 0.8]
    good_f1s = [f1 for f1 in f1s if f1 >= 0.8]
    print('\nGood precision stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_ps), np.std(good_ps)))
    print('Good recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_rs), np.std(good_rs)))
    print('Good F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_f1s), np.std(good_f1s)))
    print(len(good_f1s))

Author: artreven | Project: assessment_tools | Lines: 49 | Source: readability.py
Example 14: __init__

from nltk.corpus import gutenberg

def __init__(self):
    self.num_passages = 10
    self.passagesize = 1000
    self.maxpeople = 10
    self.maxnouns = 5
    self.total_passages = 10 * len(gutenberg.fileids())
    self.skeletons = []
    self.index_dicts = []
    # Load all of the skeleton and index files into memory
    for fileid in gutenberg.fileids():
        for k in range(self.num_passages):
            filename = fileid + '_' + str(k) + '_skeleton.txt'
            with open(filename, 'r') as f:
                self.skeletons.append(f.read().split(" "))
            filename = fileid + '_' + str(k) + '_indices.txt'
            with open(filename, 'r') as f:
                self.index_dicts.append({})
                for line in f.readlines():
                    splitted = line.split()
                    self.index_dicts[-1][splitted[0]] = splitted[1:]

Author: jacquelinekay | Project: facelibs | Lines: 24 | Source: server.py
Example 15: find_phrases

import nltk
from nltk.corpus import gutenberg

def find_phrases(regexp):
    # wrong_vbs is a list of unwanted verbs defined elsewhere in the source
    fids = gutenberg.fileids()
    rs = []
    for fid in fids:
        txt = nltk.Text(gutenberg.words(fid))
        ts = nltk.text.TokenSearcher(txt)
        r = ts.findall(regexp)
        # Patch up matches that start or end with an unwanted verb
        for x in r:
            if x[0].lower() in wrong_vbs:
                x[0] = 'looking at'
            if x[-1].lower() in wrong_vbs:
                x[-1] = 'me'
        rs.extend(r)
    return rs

Author: mobeets | Project: imperatives | Lines: 15 | Source: imperatives_gutenberg.py
Example 16: load_data

from nltk import FreqDist
from nltk.corpus import gutenberg as corpus  # "corpus" is the Gutenberg corpus here

def load_data():
    # ENGLISH_STOP_WORDS, punctuation and volumize come from elsewhere in the source
    global N, words
    freqs = [FreqDist(corpus.words(fileid)) for fileid in corpus.fileids()]
    # Vocabulary: all non-stopword, non-punctuation words across all texts
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and
                     word not in punctuation))
    data = []
    N = len(words)
    for dist in freqs:
        x = volumize(dist)
        data.append((x, x.w))
    return data

Author: Aaronduino | Project: ConvNetPy | Lines: 17 | Source: similarity.py
Example 17: mean_len

import re
from nltk.corpus import gutenberg

def mean_len():
    for fid in gutenberg.fileids():
        b = 0
        c = 0
        st = gutenberg.raw(fid)
        # Rough splits into sentences and words, respectively
        stl = re.split(r"\n|\.|\!|\?", st)
        stw = re.split(r"\n|\.|\!|\?| |,| - ", st)
        for el in stl:
            b += len(el) * 1.0 / len(stl)
        for el in stw:
            c += len(el) * 1.0 / len(stw)
        print(fid)
        print("Media Frases: " + str(b))     # mean sentence length
        print("Media Palavras: " + str(c))   # mean word length

Author: gabrielsqsf | Project: nltkfun | Lines: 17 | Source: mineracao.py
Example 18: load_data

from nltk import FreqDist
from nltk.corpus import gutenberg as corpus  # "corpus" is the Gutenberg corpus here

def load_data():
    # ENGLISH_STOP_WORDS, punctuation and Vol (ConvNetPy's volume class)
    # come from elsewhere in the source
    global N, words
    freqs = [FreqDist(corpus.words(fileid)) for fileid in corpus.fileids()]
    # Vocabulary: all non-stopword, non-punctuation words across all texts
    words = list(set(word
                     for dist in freqs
                     for word in dist.keys()
                     if word not in ENGLISH_STOP_WORDS and
                     word not in punctuation))
    data = []
    N = len(words)
    for dist in freqs:
        # Encode each text as a 1x1xN volume of relative word frequencies
        V = Vol(1, 1, N, 0.0)
        for i, word in enumerate(words):
            V.w[i] = dist.freq(word)
        data.append((V, V.w))
    return data

Author: Aaronduino | Project: ConvNetPy | Lines: 19 | Source: topics.py
Example 19: nltk_test_2

from nltk import FreqDist
from nltk.corpus import gutenberg

def nltk_test_2():
    # Count each token in each text of the Gutenberg collection
    fd = FreqDist()
    for text in gutenberg.fileids():
        for word in gutenberg.words(text):
            fd[word.lower()] += 1
    # Initialize two empty lists which will hold our ranks and frequencies
    ranks = []
    freqs = []
    # Generate a (rank, frequency) point for each counted token
    # and append to the respective lists
    for rank, word in enumerate(fd):
        ranks.append(rank + 1)
        freqs.append(fd[word])
    freqs.sort(reverse=True)
    # Plot rank vs frequency on a log-log scale. The snippet is truncated at
    # this point in the original page; a matplotlib call along these lines
    # presumably completed it:
    # import matplotlib.pyplot as plt
    # plt.loglog(ranks, freqs)
    # plt.xlabel('rank'); plt.ylabel('frequency')
    # plt.show()