This article collects and summarizes typical usage examples of the Python function nltk.corpus.brown.sents. If you have been wondering what the Python sents function does in practice, how to call it, or what real-world usage looks like, the hand-picked code examples below should help.
The following 20 code examples of the sents function are shown, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
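Before the project examples, here is a minimal usage sketch of brown.sents() itself. It assumes NLTK is installed and the Brown corpus data has been downloaded (for example via nltk.download('brown')); the variable names are only illustrative.

from nltk.corpus import brown

# brown.sents() returns a corpus view: a lazy sequence of sentences,
# where each sentence is a list of word strings.
sentences = brown.sents()
print(len(sentences))    # total number of sentences in the corpus
print(sentences[0])      # the first sentence, as a list of tokens

# Restrict to a category and iterate over a few sentences.
for sent in brown.sents(categories='news')[:3]:
    print(' '.join(sent))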
Example 1: load_sentences_brown

def load_sentences_brown(nb_sentences=None):
    """
    :param nb_sentences: Use if all brown sentences are too many
    :return: (ids, index2word) where ids is a list of (list of word-id)
             and index2word is a list of strings
    """
    from nltk.corpus import brown
    import gensim

    print 'building vocab ...'

    if nb_sentences is None:
        sents = brown.sents()
    else:
        sents = brown.sents()[:nb_sentences]

    # I use gensim model only for building vocab
    model = gensim.models.Word2Vec()
    model.build_vocab(sents)
    vocab = model.vocab

    # ids: list of (list of word-id)
    ids = [[vocab[w].index for w in sent
            if w in vocab and vocab[w].sample_int > model.random.rand() * 2**32]
           for sent in sents]

    return ids, model.index2word

Author: perrier1034, Project: skipgram-word2vec-keras, Lines: 26, Source: utils.py
Example 2: clean

def clean():
    '''
    1. Removes any individual special character.
    2. Lowercases all the words.
    :return: list of clean sentences
    '''
    sents = list(brown.sents())
    sents_copy = list(brown.sents())
    n = len(sents)

    print 'Removing special chars...'
    for i in range(0, n):
        for word in sents[i]:
            # drop tokens that contain no letters or digits (pure punctuation)
            if not bool(re.search('[A-Za-z0-9]', word)):
                sents_copy[i].remove(word)
    print 'Removed special chars.'

    sents = None

    print 'Lowercasing all the words...'
    for i in range(0, n):
        m = len(sents_copy[i])
        for j in range(0, m):
            sents_copy[i][j] = sents_copy[i][j].lower()
    print 'Lowered all the words.'

    return sents_copy

Author: CRUZEAAKASH, Project: ArticleWriter, Lines: 26, Source: BrownDataCleaner.py
Example 3: print_brown

def print_brown():
    from nltk.corpus import brown
    print brown.categories()
    print brown.words(categories='news')
    print brown.words(fileids=['cg22'])
    print brown.sents(categories=['news', 'reviews'])

    news_text = brown.words(categories='news')
    fdist = nltk.FreqDist([w.lower() for w in news_text])
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    for m in modals:
        print m + ':', fdist[m]

Author: Paul-Lin, Project: misc, Lines: 11, Source: toturial.py
Example 4: load_movie_corpus_each_sentence

def load_movie_corpus_each_sentence(range):
    # `range` is a string of the form "start:end", e.g. "0:10"
    m = re.match(r'(\d+):(\d+)$', range)
    if m:
        start = int(m.group(1))
        end = int(m.group(2))
        from nltk.corpus import movie_reviews as corpus
        return [corpus.sents(fileid) for fileid in corpus.fileids()[start:end]]

Author: zjusuyong, Project: multi_grain_lda, Lines: 7, Source: vocabulary_for_mglda.py
Example 5: find_ngrams

def find_ngrams(self, n):
    """ Input: the 'n' of 'n-grams'.
    Find all the n-grams in the Brown corpus and store them in a frequency dictionary.
    Optionally, more corpora could be used in order to have more data.
    Note: these are n-grams built by going through each sentence from left to right.
    If we want to give corrections based on the dependency tree, we would need to
    parse the Brown corpus (or any other data set) with the dependency parser first,
    so that we can use that data.
    """
    total_ngram_count = 0
    ngram_freq_dict = {}
    sents = brown.sents()
    for sent in sents:
        # pad the sentence so the first words also appear in full n-grams
        sent = ['-START-'] * (n - 1) + sent
        ngrams_brown = ngrams(sent, n)
        for i in ngrams_brown:
            total_ngram_count += 1
            old = ngram_freq_dict.get(i, 0)
            old += 1
            ngram_freq_dict[i] = old
            # print i, old
    return ngram_freq_dict, total_ngram_count

Author: Tomaat, Project: grammarCorrector, Lines: 29, Source: correction.py
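The counting idea in Example 5 can be reproduced standalone with nltk.ngrams and collections.Counter. The sketch below is illustrative rather than the project's code; the 1,000-sentence limit and the '-START-' padding token are arbitrary choices that mirror the example above.

from collections import Counter
from nltk import ngrams
from nltk.corpus import brown

n = 2
counts = Counter()
for sent in brown.sents()[:1000]:                # limit the corpus slice for speed
    padded = ['-START-'] * (n - 1) + list(sent)  # pad so early words appear in full n-grams
    counts.update(ngrams(padded, n))

print(counts.most_common(5))                     # the most frequent bigrams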
Example 6: data_api

def data_api(spilt_rate):
    raw_sent = brown.sents()
    # use only the first 10% of the corpus
    partial_data = raw_sent[:int(0.1 * len(raw_sent))]
    data_x, data_y = prepare_0(partial_data, word2intdict)
    print 'len data_x', len(data_x), len(data_y)

    train_inds = npr.choice(range(len(data_x)), size=int((1 - spilt_rate) * len(data_x)), replace=False)
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    print 'len train_inds', len(train_inds), len(data_x)

    for i in range(len(data_x)):
        if i in train_inds:
            # print 'trn', i
            X_train.append(data_x[i])
            Y_train.append(data_y[i])
        else:
            # print 'tst', i
            X_test.append(data_x[i])
            Y_test.append(data_y[i])

    print 'len X_train', len(X_train), len(X_test)
    return (X_train, Y_train), (X_test, Y_test)

Author: taineleau, Project: Neural-Learner-for-English-Language-Test, Lines: 25, Source: get_data.py
Example 7: lookupTagger

def lookupTagger():
    fd = nltk.FreqDist(brown.words(categories='news'))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    most_freq_words = fd.keys()[:100]
    likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
    baseline_tagger = nltk.UnigramTagger(model=likely_tags)
    baseline_tagger.evaluate(brown_tagged_sents)
    sent = brown.sents(categories='news')[3]
    baseline_tagger.tag(sent)
    # fall back to tagging unknown words as 'NN'
    baseline_tagger = nltk.UnigramTagger(model=likely_tags,
                                         backoff=nltk.DefaultTagger('NN'))

def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

def display():
    import pylab
    words_by_freq = list(nltk.FreqDist(brown.words(categories='news')))
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()

Author: AkiraKane, Project: Python, Lines: 31, Source: c05_auto_tagging.py
Example 8: read_datas
def read_datas(self):
    brown_tagged_sentence = brown.tagged_sents()
    brown_sent = brown.sents()
    size = int(len(brown_tagged_sentence) * 0.9)
    train_set = brown_tagged_sentence[:size]
    test_set = brown_tagged_sentence[size:]
    return (train_set, test_set)
Author: Nicolas99-9, Project: TERApprentissage, Lines: 7, Source: tagger.py
Example 9: build_index

def build_index(out_filename, in_filename=None):
    '''Builds data files for word lookup. Can take an optional input file
    to add to the data pool which is processed (not working).
    Data is then dumped to a pickle file.'''
    sents_data = []
    try:
        in_file = open(in_filename)
        sents_data += sent_tokenize(in_file.read())
        in_file.close()
    except:
        print("Warning: Failed to load external file for building.")

    sents_data += brown.sents() + treebank.sents()

    # get sentences, chop off their ambiguous heads, and look at their words
    mysents = [sent[1:] for sent in sents_data]
    # flatten sublists of words into a single list of words
    mywords = [word for sent in mysents for word in sent]
    cfd = ConditionalFreqDist((word.lower(), word) for word in mywords)
    # look up the most frequent form of a lowercase word with cfd['word'].max(),
    # but check for the word's existence in cfd first

    # made pickle file too large and slow
    # wordlist = set(words.words())
    # wordlist.update(brown.words())
    # wordlist.update(treebank.words())
    # common_words_lower = set([w for w in wordlist if w.islower()])
    # common_words_titlecase = set([w.lower() for w in wordlist if (w.istitle() and w not in common_words_lower)])

    out_file = open(out_filename, 'wb')
    pickle.dump(cfd, out_file, 2)
    # pickle.dump(common_words_lower, out_file, 2)
    # pickle.dump(common_words_titlecase, out_file, 2)
    out_file.close()

Author: lberezy, Project: LangComp, Lines: 35, Source: main.py
Example 10: cal_idf

def cal_idf():
    # brown.sents()
    total_wordlists = []
    doc_sents = []
    for f in brown.fileids():
        print f
        doc_wordlist = []
        doc_sentlist = brown.sents(fileids=[f])
        d_sents = ''
        for sent in doc_sentlist:
            s = ''
            # sent = stem_tokens(sent)
            for w in sent:
                w = w.lower()
                s += w + ' '
            d_sents += s + '\n'
            doc_wordlist.extend(sent)
        total_wordlists.append(doc_wordlist)
        doc_sents.append(d_sents)
    print 'start caling tfidf'

    from sklearn.feature_extraction.text import TfidfVectorizer
    corpus = doc_sents
    vectorizer = TfidfVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    idf = vectorizer.idf_
    # print dict(zip(vectorizer.get_feature_names(), idf))
    pickle.dump(vectorizer, open('idf_vectorizer', 'w'))

    dictionary = corpora.Dictionary(total_wordlists)
    dic, corps = get_corpus_by_lists(total_wordlists)
    tfidf = models.TfidfModel(corps, id2word=dic)
    pickle.dump(tfidf, open('brown_tfidf', 'w'))

Author: JayveeHe, Project: OpinionRankProject, Lines: 32, Source: corpus_utils.py
Example 11: auto_tag

def auto_tag(company):
    """
    Tag a given text using the Brown corpus and a unigram tagger.
    :param company: company whose reviews are tagged
    :return: a list of tagged words
    """
    brown_tagged_sents = brown.tagged_sents(categories='news', tagset='universal')
    brown_sents = brown.sents(categories='news')

    # open the company's review, and print an error message if it doesn't exist
    # first deal with unique cases such as General Motors => GM
    if company == 'General Motors':
        company = 'GM'
    elif company == 'Ford Motor Company':
        company = 'Ford'
    try:
        text = open('/Users/vickyzhang/Documents/Python/chart/comp/review/' + company.capitalize() + '_review.txt').read()
    except FileNotFoundError:
        print('The system doesn\'t have a review for the company you entered. Please enter another company.')

    # normalize (tokenize and lowercase) each word in the string
    text_token = nltk.word_tokenize(text)
    text_normal = [w.lower() for w in text_token]

    # build a unigram tagger based on the Brown corpus, and use it to tag the normalized text
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    text_tagged = unigram_tagger.tag(text_normal)
    return text_tagged

Author: vicher37, Project: jobchart, Lines: 28, Source: review_summary.py
Example 12: update_category_by_pos

def update_category_by_pos():
    from nltk.corpus import brown
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.tag import untag
    from nltk import DecisionTreeClassifier

    def pos_features(sentence, i):
        features = {'suffix(1)': sentence[i][-1:],
                    'suffix(2)': sentence[i][-2:],
                    'suffix(3)': sentence[i][-3:]
                    }
        features['prev-word'] = '<start>' if i == 0 else sentence[i-1]
        return features

    print pos_features(brown.sents()[0], 8)

    tagged_sents = brown.tagged_sents(categories='news')
    featuresets = []
    for tagged_sent in tagged_sents:
        untagged_sent = untag(tagged_sent)
        for i, (word, tag) in enumerate(tagged_sent):
            featuresets.append((pos_features(untagged_sent, i), tag))

    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    # classifier = NaiveBayesClassifier.train(train_set)
    classifier = DecisionTreeClassifier.train(train_set)
    print 'DecisionTree %f' % classify.accuracy(classifier, test_set)

Author: brenden17, Project: infinity, Lines: 30, Source: category_nltk.py
Example 13: import_brown_pos

def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the brown corpus into `ds`. E.g.
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            if simplify_tags:
                norm = (simplify_tag(t) for t in sent)
            else:
                # fall back to the raw tagged sentence when tags are not simplified
                norm = sent
            norm = [nltk.tuple2str(t) for t in norm]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if (counter % 100 == 0):
                    print("importing %s of %s sentences..." % (counter, total),
                          file=log)

Author: tdiggelm, Project: nltk-playground, Lines: 26, Source: train.py
Example 14: createModel

def createModel():
    global classifierit
    global classifierloose
    global classifieryou
    global classifierto
    global classifiertheir

    trainingitSet = []
    traininglooseSet = []
    trainingyouSet = []
    trainingtoSet = []
    trainingtheirSet = []

    st = POSTagger('/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/models/english-bidirectional-distsim.tagger', '/home/siddhartha/Downloads/stanford-postagger-full-2014-01-04/stanford-postagger.jar')

    for line in brown.sents():
        print line
        tagSent = st.tag(line)
        print tagSent
        arrayOfitFeature = pos_itfeatures(tagSent)
        arrayOfyouFeature = pos_youfeatures(tagSent)
        arrayOftheirFeature = pos_theirfeatures(tagSent)
        arrayOflooseFeature = pos_loosefeatures(tagSent)
        arrayOftoFeature = pos_tofeatures(tagSent)
        if arrayOfitFeature:
            trainingitSet.extend(arrayOfitFeature)
        if arrayOftheirFeature:
            trainingtheirSet.extend(arrayOftheirFeature)
        if arrayOflooseFeature:
            traininglooseSet.extend(arrayOflooseFeature)
        if arrayOftoFeature:
            trainingtoSet.extend(arrayOftoFeature)
        if arrayOfyouFeature:
            trainingyouSet.extend(arrayOfyouFeature)

    algorithm = nltk.classify.MaxentClassifier.ALGORITHMS[1]

    #encodingit = maxent.TypedMaxentFeatureEncoding.train(trainingitSet, count_cutoff=3, alwayson_features=True)
    classifierit = maxent.MaxentClassifier.train(trainingitSet, algorithm)
    f = open('classifierit.pickle', 'wb')
    pickle.dump(classifierit, f)
    f.close()

    #encodingloose = maxent.TypedMaxentFeatureEncoding.train(traininglooseSet, count_cutoff=3, alwayson_features=True)
    classifierloose = maxent.MaxentClassifier.train(traininglooseSet, algorithm)
    f = open('classifierloose.pickle', 'wb')
    pickle.dump(classifierloose, f)
    f.close()

    #encodingyou = maxent.TypedMaxentFeatureEncoding.train(trainingyouSet, count_cutoff=3, alwayson_features=True)
    classifieryou = maxent.MaxentClassifier.train(trainingyouSet, algorithm)
    f = open('classifieryou.pickle', 'wb')
    pickle.dump(classifieryou, f)
    f.close()

    #encodingto = maxent.TypedMaxentFeatureEncoding.train(trainingtoSet, count_cutoff=3, alwayson_features=True)
    classifierto = maxent.MaxentClassifier.train(trainingtoSet, algorithm)
    f = open('classifierto.pickle', 'wb')
    pickle.dump(classifierto, f)
    f.close()

    #encodingtheir = maxent.TypedMaxentFeatureEncoding.train(trainingtheirSet, count_cutoff=3, alwayson_features=True)
    classifiertheir = maxent.MaxentClassifier.train(trainingtheirSet, algorithm)
    f = open('classifiertheir.pickle', 'wb')
    pickle.dump(classifiertheir, f)
    f.close()

Author: siddharthasandhu, Project: NLPProjects, Lines: 59, Source: stanLearn.py
Example 15: get_valid_brown_corpus
def get_valid_brown_corpus():
    global DIR
    DIR = BROWN_DIR
    genre = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
    sentences = brown.sents(categories=genre)
    sents = remove_bad_sents(sentences)
    sents = [[w.lower() for w in s] for s in sents]
    return sents
Author: eugenet12, Project: PoemGenerator, Lines: 8, Source: process_corpus.py
Example 16: brown_tagged_sents
def brown_tagged_sents():
    from nltk.corpus import brown
    brown_tagged_sents = brown.tagged_sents(categories='news')
    brown_sents = brown.sents(categories='news')
    unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
    size = int(len(brown_tagged_sents) * 0.9)
    train_sents = brown_tagged_sents[:size]
    return (train_sents, brown_tagged_sents[size:])
Author: atokop, Project: compling, Lines: 8, Source: u3.py
Example 17: ic
def ic(w):
    total = 0
    for sentence in b.sents():
        for word in sentence:
            total = total + 1
            brown_freqs[word.lower()] += 1
    print w.lower(), ":", brown_freqs[w.lower()], 1.0 - (math.log(brown_freqs[w.lower()]) / math.log(total + 1))
Author: dxd132630, Project: NeoPythonic, Lines: 8, Source: NLPProject.py
Example 18: uG

def uG():
    global uniCounter  # counts repeats of unigrams
    global uniGram     # list of distinct unigrams
    global uniGrams    # number of distinct unigrams
    uniCounter = {}
    uniGram = []
    uniGrams = 0
    news = brown.sents(categories='editorial')
    for x in range(1, MAX, 1):
        sent = news[x]
        sent.append('</s>')      # ending sentences with '</s>'
        sent.insert(0, '<s>')    # beginning sentences with '<s>'
        for x in range(0, sent.count('.') + 1, 1):
            try:
                sent.remove('.')   # removing .'s
            except:
                pass
        for x in range(0, sent.count(',') + 1, 1):
            try:
                sent.remove(',')   # removing ,'s
            except:
                pass
        for x in range(0, sent.count("'") + 1, 1):
            try:
                sent.remove("'")   # removing ''s
            except:
                pass
        for x in range(0, sent.count('"') + 1, 1):
            try:
                sent.remove('"')   # removing "'s
            except:
                pass
        x = 0
        for word in sent:
            word = word.lower()  # making all letters lowercase
            sent[x] = word       # so differences don't occur when
            x = x + 1            # they shouldn't
        value = '1'
        for x in range(0, len(sent), 1):
            try:
                word = sent[x]
                if (word not in uniGram):
                    uniGram.append(word)
                    uniGrams = uniGrams + 1
                if (word in uniCounter):
                    value = uniCounter[word]
                    value = value + 1
                    uniCounter[word] = value
                else:
                    uniCounter[word] = 1
            except:
                pass

Author: cglennk, Project: nGrams, Lines: 58, Source: nGram.py
Example 19: learn

def learn(self, listofsentences=[], n=2000):
    self.learned = defaultdict(mydict)
    if listofsentences == []:
        listofsentences = brown.sents()
    for i, sent in enumerate(listofsentences):
        if i >= n:  # limit to the first n sentences of the corpus
            break
        for word in sent:
            self.learned[self.specialhash(word)][word.lower()] += 1

Author: aminorex, Project: icsisumm, Lines: 9, Source: didyoumean.py
Example 20: collect_data_from_ptb_brow_duc2004

def collect_data_from_ptb_brow_duc2004():
    start_collect = time.time()
    samples = []

    # Penn Treebank
    treebank_sents = treebank.sents()
    for i in range(len(treebank_sents)):
        senttmp = " ".join(treebank_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)
    sys.stdout.write("Finish collecting training data from Penn Tree Bank")
    sys.stdout.flush()

    # Brown
    brown_sents = brown.sents()
    for i in range(len(brown_sents)):
        senttmp = " ".join(brown_sents[i])
        words = nltk.word_tokenize(senttmp)
        samples.append(words)
    sys.stdout.write("Finish collecting training data from Brown")
    sys.stdout.flush()

    # DUC data
    folder_path = "/Users/HyNguyen/Documents/Research/Data/duc2004/DUC2004_Summarization_Documents/duc2004_testdata/tasks1and2/duc2004_tasks1and2_docs/docs"
    clusters_name = os.listdir(folder_path)
    for cluster_name in clusters_name:
        if cluster_name[0] == ".":
            # skip hidden files such as .DS_Store on macOS
            continue
        files_name = os.listdir(folder_path + "/" + cluster_name)
        for file_name in files_name:
            if file_name[0] == ".":
                # skip hidden files such as .DS_Store on macOS
                continue
            file_path = folder_path + "/" + cluster_name + "/" + file_name
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()
                text_tag = root._children[3]
                if text_tag.tag == "TEXT":
                    text = text_tag.text.replace("\n", "")
                    sentences = nltk.tokenize.sent_tokenize(text)
                    for sentence in sentences:
                        words = nltk.word_tokenize(sentence)
                        samples.append(words)
            except:
                print "exception parse XML: ", file_name
                continue
    sys.stdout.write("Finish collecting training data from DUC2004")
    sys.stdout.flush()
    sys.stdout.write("length of samples" + str(len(samples)))
    sys.stdout.flush()
    end_collect = time.time()
    sys.stdout.write("Total time for collecting training data: " + str(end_collect - start_collect))
    sys.stdout.flush()
    return samples

Author: giahy2507, Project: convae, Lines: 57, Source: preparedata4convaewmpi.py
Note: the nltk.corpus.brown.sents examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation hosting platforms. The code snippets were selected from open-source projects contributed by various developers, and copyright of the source code remains with the original authors. Please consult each project's license before distributing or using the code; do not reproduce without permission.