本文整理汇总了Python中nltk.corpus.gutenberg.raw函数的典型用法代码示例。如果您正苦于以下问题:Python raw函数的具体用法?Python raw怎么用?Python raw使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了raw函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_austen
def test_austen():
    """Train a bigram model on *Emma* and report cross entropy against
    *Sense and Sensibility* and *Persuasion* for four smoothing methods.

    Fixes vs. original: the format spec was ``%f0.8`` (prints the full
    float followed by a literal "0.8"); the intended spec is ``%0.8f``.
    The four copy-pasted model sections are collapsed into one loop, and
    Python 2 print statements are converted to Python 3 calls.
    """
    from nltk.data import load
    from nltk.corpus import gutenberg as g

    stok = load('tokenizers/punkt/english.pickle')
    # Sentence-split each novel, then tokenize each sentence.
    train = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-emma.txt'))]
    test1 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-sense.txt'))]
    test2 = [[w for w in tokenize(preprocess(sent))]
             for sent in stok.tokenize(g.raw('austen-persuasion.txt'))]

    smoothers = [
        ('additive smoothing', AdditiveSmoothing(n=2)),
        ('knesser-ney smoothing', KnesserNey(n=2)),
        ('simple good-turing smoothing', SimpleGoodTuring(n=2)),
        ('katz smoothing', KatzSmoothing(n=2)),
    ]
    for label, model in smoothers:
        model.generate_model(train)
        print('cross entropy %s:' % label)
        print('emma to sense&sensibility: %0.8f' % cross_entropy(model, test1))
        print('emma to persuasion: %0.8f' % cross_entropy(model, test2))
开发者ID:JoeDumoulin,项目名称:nlp_working,代码行数:29,代码来源:calc_score.py
示例2: test
def test():
    """Smoke test: build verb-gap exercises from the raw text of *Emma*.

    Prints the character count of the novel and the number of exercises
    produced by ``createexercise`` (defined elsewhere in the project).
    Python 2 print statements converted to Python 3 calls.
    """
    from nltk.corpus import gutenberg
    emma = gutenberg.raw('austen-emma.txt')
    print(len(emma))
    ex = createexercise(emma, pos='v', last_index=False, fast=True)
    print(len(ex))
开发者ID:SuzanaK,项目名称:wordgap,代码行数:7,代码来源:wordex.py
示例3: load_moby_dick_analysis
def load_moby_dick_analysis():
    """Run the information-value analysis of Moby Dick for every
    configured sum-threshold and persist one result record per run.

    The Document row is created on first use; on later runs the
    DuplicateKeyError path loads the stored copy instead.
    """
    tokens = get_moby_dick_tokens()
    text = gutenberg.raw('melville-moby_dick.txt')

    try:
        doc = Document(
            url='gutenberg',
            name='moby dick',
            text=text,
            month='Jan',
            year='1851',
        )
        odm_session.flush()
    except DuplicateKeyError:
        doc = Document.query.get(name='moby dick')

    for threshold in sum_thresholds:
        log.info("Trying analysis for threshold = %s" % threshold)
        # get_optimal_window_size returns a pair; index 1 is the analysis.
        analysis = get_optimal_window_size(tokens, window_sizes, 20,
                                           sum_threshold=threshold)[1]
        encoded = analysis.encode()
        best_window = encoded['window_size']
        log.debug("Best result = %s" % best_window)
        # Constructing the result registers it with the ODM session;
        # flush() persists it.
        InformationValueResult(
            window_size=best_window,
            threshold=threshold,
            document=doc,
            iv_words=encoded['top_words'],
            max_iv=encoded['max_iv'],
            sum_iv=encoded['sum_iv'],
        )
        odm_session.flush()
开发者ID:finiteautomata,项目名称:leninanalysis,代码行数:32,代码来源:moby_dick.py
示例4: exercise_gutenberg
def exercise_gutenberg():
    """Tour of the Gutenberg corpus: concordance demo plus per-file stats.

    Prints, for every file: average word length (chars/word), average
    sentence length (words/sentence), and average occurrences per
    vocabulary item (words/vocab).  Python 2 print statements converted;
    ``//`` keeps the original integer-division semantics.
    """
    # List the available Gutenberg file ids.
    print(gutenberg.fileids())
    # Pick one text: Jane Austen's "Emma".
    emma = gutenberg.words("austen-emma.txt")
    # Length of the book in word tokens.
    print(len(emma))
    # Wrap as an nltk.Text to use concordance search.
    emma_text = nltk.Text(emma)
    emma_text.concordance("surprize")

    for file_id in gutenberg.fileids():
        chars_list = gutenberg.raw(file_id)
        words_list = gutenberg.words(file_id)
        sents_list = gutenberg.sents(file_id)
        num_chars = len(chars_list)          # total characters
        num_words = len(words_list)          # total word tokens
        num_sents = len(sents_list)          # total sentences
        num_vocab = len(set(w.lower() for w in words_list))  # distinct words
        # Avg word length, avg sentence length, avg uses per vocab item, file.
        print(num_chars // num_words, num_words // num_sents,
              num_words // num_vocab, file_id)
开发者ID:BurnellLiu,项目名称:LiuProject,代码行数:29,代码来源:chapter_02.py
示例5: gutenberg
def gutenberg():
    """Print avg word length, avg sentence length, and avg occurrences
    per vocabulary item for every Gutenberg text (one line per file).

    Python 2 print statement converted to a Python 3 call; ``int(...)``
    truncation preserves the original integer output.
    """
    from nltk.corpus import gutenberg
    for t in gutenberg.fileids():
        num_chars = len(gutenberg.raw(t))
        num_words = len(gutenberg.words(t))
        num_sents = len(gutenberg.sents(t))
        num_vocab = len(set(w.lower() for w in gutenberg.words(t)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), t)
开发者ID:kwdhd,项目名称:nlp,代码行数:8,代码来源:main.py
示例6: handle
def handle(self, *args, **options):
    """Django management command: dump the raw text of every Gutenberg
    corpus file to ``CORPUS_DIR/<fileid>/sentences.txt``.

    Fixes vs. original: the output file handle was opened without being
    closed on error — replaced with a ``with`` block; path concatenation
    via ``+ os.sep +`` replaced with ``os.path.join``.
    """
    for fileid in gutenberg.fileids():
        out_dir = os.path.join(CORPUS_DIR, fileid.replace(".txt", ""))
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        with open(os.path.join(out_dir, "sentences.txt"), 'w') as f:
            f.write(gutenberg.raw(fileid))
开发者ID:hashx101,项目名称:wordseerbackend_python,代码行数:8,代码来源:create_collection.py
示例7: similarity_gutenberg
def similarity_gutenberg():
    """For n-window sizes 2..5, print the mean and minimum pairwise
    Jaccard similarity across all Gutenberg texts.

    Fix vs. original: ``n_window(gutenberg.raw(fid), x)`` was recomputed
    for every (fid, ffid) pair — O(k^2) corpus reads and window builds.
    The windows are now computed once per text per size, with identical
    output.
    """
    for x in range(2, 6):
        fileids = gutenberg.fileids()
        # One window set per text, computed once.
        windows = [n_window(gutenberg.raw(fid), x) for fid in fileids]
        sim = [[Jaccard(w1, w2) for w2 in windows] for w1 in windows]

        k = len(sim)
        mean_sim = 0
        min_sim = 1  # Jaccard is in [0, 1], so 1 is a safe upper start.
        for i in range(k):
            for j in range(k):
                mean_sim += sim[i][j] / (k * k)
                min_sim = min(min_sim, sim[i][j])
        print("Media: " + str(mean_sim))
        print("Minimo: " + str(min_sim))
开发者ID:gabrielsqsf,项目名称:nltkfun,代码行数:19,代码来源:mineracao.py
示例8: structure
def structure():
    """Book-style demo of the three views onto a Gutenberg text:
    raw characters, word tokens, and sentences.

    The slice expressions are evaluated and discarded on purpose — this
    snippet exists to be stepped through interactively.
    """
    book = "burgess-busterbrown.txt"
    chars = gutenberg.raw(book)
    chars[1:20]
    word_tokens = gutenberg.words(book)
    word_tokens[1:20]
    sentences = gutenberg.sents(book)
    sentences[1:20]
开发者ID:AkiraKane,项目名称:Python,代码行数:10,代码来源:c02_text_corpora.py
示例9: for_print
def for_print():
    """Print three statistics for every Gutenberg text.

    Per file: average word length (chars/word), average sentence length
    (words/sentence), and average occurrences per vocabulary item
    (words/vocab).  Docstring translated from Chinese; Python 2 print
    statement converted to a Python 3 call with ``int()`` truncation
    preserving the original integer output.
    """
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
开发者ID:Paul-Lin,项目名称:misc,代码行数:11,代码来源:toturial.py
示例10: fun02
def fun02():
    """Per-file Gutenberg statistics, one output line per file.

    Columns: average word length, average sentence length, number of
    times each vocabulary item appears, file id.  The original's two
    Python 2 trailing-comma print statements (which shared one output
    line) are merged into a single Python 3 print call.
    """
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        # average word length, average sentence length,
        # number of times each vocabulary item appears in the text
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
开发者ID:gree2,项目名称:hobby,代码行数:11,代码来源:ch02.py
示例11: page57
def page57():
    """Statistics from the Gutenberg corpora (NLTK book, page 57).

    Per file: average word length, average sentence length, and average
    occurrences per vocabulary item.  The original's two Python 2
    trailing-comma print statements are merged into one Python 3 call,
    keeping everything on a single output line per file.
    """
    from nltk.corpus import gutenberg
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
开发者ID:andreoliwa,项目名称:nlp-book,代码行数:11,代码来源:book_examples.py
示例12: solve_p2_greedy
def solve_p2_greedy(file):
    """Greedily reassemble a shredded text read from *file*.

    Each input line is lower-cased and split on '|', dropping the empty
    leading/trailing fields.  A trigram letter corpus trained on the
    first three Gutenberg texts scores candidate joins.

    Fixes vs. original: the input file handle was never closed (now a
    ``with`` block); Python 2 print statements converted.  The parameter
    name ``file`` shadows the builtin but is kept for interface
    compatibility.
    """
    with open(file) as fh:
        lines = [l.lower().split("|")[1:-1] for l in fh]
    # NOTE(review): `slice` is presumably a project helper that shadows
    # the builtin — confirm it is imported elsewhere in this module.
    slices = slice(lines)
    n = 3
    corpus = NgramLetterCorpus(n)
    # Train on only the first three texts to keep runtime reasonable.
    for fileid in gutenberg.fileids()[:3]:
        corpus.update(gutenberg.raw(fileid))
    slices = unshred3(slices, corpus)
    print("FINAL: ")
    for l in linearize(slices):
        print("".join(l))
开发者ID:indraastra,项目名称:puzzles,代码行数:13,代码来源:solve.py
示例13: test_moby_dick_window
def test_moby_dick_window(self):
    """Iterating every window of the tokenized Moby Dick text must visit
    each token exactly once, for a range of window sizes.

    Fixes vs. original: ``xrange`` (Python 2 only) replaced with
    ``range``; deprecated ``assertEquals`` replaced with ``assertEqual``.
    """
    window_sizes = range(100, 6000, 100)
    text = gutenberg.raw('melville-moby_dick.txt')
    tokens = tokenize(text, only_alphanum=True, clean_punctuation=True)
    total_number_of_tokens = len(tokens)
    for window_size in window_sizes:
        count = 0
        # +1 covers a trailing partial window; any empty extra window
        # simply contributes zero to the count.
        number_of_windows = int(math.ceil(total_number_of_tokens / window_size))
        for current_window in range(0, number_of_windows + 1):
            word_window = Window(tokens, window_size, current_window)
            for word in word_window:
                count += 1
        self.assertEqual(count, total_number_of_tokens)
开发者ID:finiteautomata,项目名称:leninanalysis,代码行数:14,代码来源:test_window.py
示例14: benchmark_sbd
def benchmark_sbd():
    """Benchmark ``split_text`` sentence-boundary detection against the
    gold sentence segmentation of the NLTK Gutenberg corpus.

    For each file, both gold and obtained sentences are reduced to their
    alphabetic word tokens; an obtained sentence counts as correct when
    it appears verbatim in the gold list.  Prints per-file and aggregate
    precision/recall/F1, plus aggregates restricted to files scoring
    >= 0.8.

    Fixes vs. original: unused locals removed (``c`` file counter and
    ``copy_sents_obtained``, a copy that was never read).
    """
    ps = []
    rs = []
    f1s = []
    for fileid in gutenberg.fileids():
        # Gold sentences, keeping alphabetic tokens only.
        sents_gold = [[w for w in sent if w.isalpha()]
                      for sent in gutenberg.sents(fileid)]
        text = gutenberg.raw(fileid)
        sents_obtained = split_text(text)
        # Reduce each obtained sentence to its alphabetic word tokens.
        for sent_i in range(len(sents_obtained)):
            sents_obtained[sent_i] = [m.group()
                                      for m in re.finditer(r'\w+', sents_obtained[sent_i])
                                      if m.group().isalpha()]
        c_common = sum(1 for sent in sents_obtained if sent in sents_gold)
        p, r, f1 = get_prf(c_common, len(sents_obtained), len(sents_gold))
        print('\n\n', fileid)
        print('Precision: {:0.2f}, Recall: {:0.2f}, F1: {:0.2f}'.format(p, r, f1))
        ps.append(p)
        rs.append(r)
        f1s.append(f1)

    print('\n\nPrecision stats: {:0.3f} +- {:0.4f}'.format(np.mean(ps),
                                                           np.std(ps)))
    print('Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(rs),
                                                    np.std(rs)))
    print('F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(f1s),
                                                np.std(f1s)))
    print(len(f1s))

    # Aggregates over "good" files only (score >= 0.8).
    good_ps = [p for p in ps if p >= 0.8]
    good_rs = [r for r in rs if r >= 0.8]
    good_f1s = [f1 for f1 in f1s if f1 >= 0.8]
    print('\n Good precision stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_ps),
                                                               np.std(good_ps)))
    print('Good Recall stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_rs),
                                                         np.std(good_rs)))
    print('Good F1 stats: {:0.3f} +- {:0.4f}'.format(np.mean(good_f1s),
                                                     np.std(good_f1s)))
    print(len(good_f1s))
开发者ID:artreven,项目名称:assessment_tools,代码行数:49,代码来源:readability.py
示例15: access
def access():
    """Book-style demo of indexing and character iteration.

    Indexes the module-level ``monty`` string (values discarded), prints
    a sentence character by character, then builds a character frequency
    distribution over Moby Dick.  Python 2 print statements converted;
    ``print(char, end=' ')`` mirrors the old trailing-comma form.
    """
    monty[0]
    monty[3]
    monty[5]
    monty[-1]
    sent = 'colorless green ideas sleep furiously'
    for char in sent:
        print(char, end=' ')
    from nltk.corpus import gutenberg
    raw = gutenberg.raw('melville-moby_dick.txt')
    fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
    fdist.keys()
开发者ID:AkiraKane,项目名称:Python,代码行数:15,代码来源:c03_strings.py
示例16: load_hamlet
def load_hamlet():
    """Load the contents of the play Hamlet into a string.

    Returns
    -------
    str
        The one big, raw, unprocessed string.

    Example
    -------
    >>> document = load_hamlet()
    >>> document[:80]
    '[The Tragedie of Hamlet by William Shakespeare 1599]\\n\\n\\nActus Primus. Scoena Prim'
    """
    hamlet_text = gutenberg.raw("shakespeare-hamlet.txt")
    return hamlet_text
开发者ID:efrenaguilar95,项目名称:Yelp_Analyzer,代码行数:16,代码来源:test.py
示例17: mean_len
def mean_len():
    """Print, for every Gutenberg text, the mean sentence length and the
    mean word length in characters (crude regex-based splitting).

    Fixes vs. original: unused locals ``a`` and ``d`` removed; the two
    split regexes are compiled once outside the loop instead of being
    re-parsed for every file.
    """
    sent_sep = re.compile(r"\n|\.|\!|\?")
    word_sep = re.compile(r"\n|\.|\!|\?| |,| - ")
    for fid in gutenberg.fileids():
        text = gutenberg.raw(fid)
        sents = sent_sep.split(text)
        words = word_sep.split(text)
        mean_sent_len = sum(len(s) for s in sents) * 1.0 / len(sents)
        mean_word_len = sum(w and len(w) or len(w) for w in words) * 1.0 / len(words) if words else 0.0
        print(fid)
        print("Media Frases: " + str(mean_sent_len))
        print("Media Palavras: " + str(mean_word_len))
开发者ID:gabrielsqsf,项目名称:nltkfun,代码行数:17,代码来源:mineracao.py
示例18: get_moby_dick_document
def get_moby_dick_document():
    """Build, flush, and return a Document wrapping the raw text of
    Moby Dick from the NLTK Gutenberg corpus.

    Returns the Document instance with a custom tokenizer attached.
    NOTE(review): ``Document``, ``odm_session`` and ``tokenize`` are
    project-level names defined elsewhere; flushing the ODM session is
    assumed to persist the new document — confirm against the ODM setup.
    """
    moby_dick = gutenberg.raw('melville-moby_dick.txt')
    document = Document(
        url = 'melville-moby_dick.txt',
        name = 'Moby dick',
        text = moby_dick,
        month = 'Oct',
        year = 1851
    )
    # Document uses a tokenizer function to create its tokens; since we
    # need to enforce only_alphanum and clean_punctuation we wrap tokenize.
    def tokenizer_wrapper(raw_text):
        # Stringify then lower-case every token.  Under Python 2, map()
        # returns a list — presumably what Document expects; on Python 3
        # this would yield a lazy map object instead (TODO confirm).
        return map(str.lower, map(str, tokenize(raw_text, only_alphanum=True, clean_punctuation=True)))
    document.tokenizer = tokenizer_wrapper
    odm_session.flush()
    return document
开发者ID:finiteautomata,项目名称:leninanalysis,代码行数:18,代码来源:moby_dick_tests.py
示例19: gutenberg
def gutenberg():
    """Gutenberg corpus tour: Emma word count, file list, the longest
    Macbeth sentence, then per-file statistics (avg word length, avg
    sentence length, avg occurrences per vocab item).

    Python 2 print statements converted; the intermediate list inside
    ``max`` replaced by a generator expression with identical result.
    """
    emma = nltk.corpus.gutenberg.words('austen-emma.txt')
    print(len(emma))
    print(gutenberg.fileids())
    emma = gutenberg.words('austen-emma.txt')
    macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
    macbeth_sentences[1037]
    longest_len = max(len(s) for s in macbeth_sentences)
    # Sentences of maximal length (result discarded — book-style demo).
    [s for s in macbeth_sentences if len(s) == longest_len]
    for fileid in gutenberg.fileids():
        num_chars = len(gutenberg.raw(fileid))
        num_words = len(gutenberg.words(fileid))
        num_sents = len(gutenberg.sents(fileid))
        num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
        print(int(num_chars / num_words), int(num_words / num_sents),
              int(num_words / num_vocab), fileid)
开发者ID:AkiraKane,项目名称:Python,代码行数:19,代码来源:c02_text_corpora.py
示例20: sentenceTokenization
def sentenceTokenization():
    """Demonstrate nltk.sent_tokenize on three inputs: an inline English
    sample, 'Alice in Wonderland' from the Gutenberg corpus, and a
    German Europarl session transcript (with language='german')."""
    tokenizer = nltk.sent_tokenize

    # --- inline English sample --------------------------------------
    sample_text = 'We will discuss briefly about the basic syntax, structure and design philosophies. There is a defined hierarchical syntax for Python code which you should remember when writing code! Python is a really powerful programming language!'
    sample_sents = tokenizer(text=sample_text)
    print(f'\nTotal number of sentences in sample_text: {len(sample_sents)}')
    print('\nSample sentences:')
    print(sample_sents)

    # --- Alice in Wonderland ----------------------------------------
    alice = gutenberg.raw(fileids='carroll-alice.txt')
    print(f"\n### len(alice), total number of characters: {len(alice)}")
    print("\n### First 1000 characters of carroll-alice.txt:\n")
    print(alice[0:1000])
    alice_sents = tokenizer(text=alice)
    print(f'\nTotal number of sentences in Alice: {len(alice_sents)}')
    print('\nFirst 5 sentences in Alice:')
    for sentence in alice_sents[0:5]:
        print("\n### ~~~~~~~~~~ ###\n" + sentence)
    print("\n### ~~~~~~~~~~ ###")

    # --- German Europarl transcript ---------------------------------
    text_german = europarl_raw.german.raw(fileids="ep-00-01-17.de")
    print(f"\n### len(German text), total number of characters: {len(text_german)}")
    print("\n### First 1000 characters of ep-00-01-17.de (German text):\n")
    print(text_german[0:1000])
    german_sents = tokenizer(text=text_german, language="german")
    print(f'\nTotal number of sentences in German text: {len(german_sents)}')
    print('\nFirst 5 sentences in German text:')
    for sentence in german_sents[0:5]:
        print("\n### ~~~~~~~~~~ ###\n" + sentence)
    print("\n### ~~~~~~~~~~ ###")

    return None
开发者ID:paradisepilot,项目名称:statistics,代码行数:42,代码来源:TextTokenization.py
注:本文中的nltk.corpus.gutenberg.raw函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论