This article collects typical usage examples of the Python function nltk.ngrams. If you have been wondering what exactly nltk.ngrams does, how to call it, or what real-world usage looks like, the curated code samples below should help.
The article presents 20 code examples of the ngrams function, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
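Before the examples, a minimal sketch of the function itself: nltk.ngrams takes any sequence and an n, and yields tuples of n consecutive items. Note that in recent NLTK versions it returns a generator, so wrap it in list() to materialize the tuples:

from nltk import ngrams

tokens = "the quick brown fox".split()
print(list(ngrams(tokens, 2)))
# [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]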
Example 1: ngrams

def ngrams(self, ns=[2, 3, 5]):
    # "suffix/POS" composite tokens, e.g. "ing/VBG"
    _p = ["/".join(t) for t in zip(self.SUF, self.POS)]
    for n in ns:
        # binary indicator features keyed by the joined n-gram
        ngf = {"Ngram(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(self.SUF, n)}
        ngfp = {"NgramP(N={})_{}".format(n, "_".join(t)): 1 for t in ngrams(_p, n)}
        self.features.update(ngf)
        self.features.update(ngfp)

Author: tuxedocat | Project: precure | Lines: 7 | Source: feature_extractor.py
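Example 1 turns each n-gram into a binary feature key. A standalone sketch of the same idea, with a hypothetical suffix list standing in for self.SUF:

from nltk import ngrams

suffixes = ["play", "ing", "foot", "ball"]  # hypothetical token suffixes
features = {"Ngram(N=2)_{}".format("_".join(t)): 1 for t in ngrams(suffixes, 2)}
print(features)
# {'Ngram(N=2)_play_ing': 1, 'Ngram(N=2)_ing_foot': 1, 'Ngram(N=2)_foot_ball': 1}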
Example 2: update_freqs

def update_freqs(self, doc_text, id_str):
    for bigram in ngrams(doc_text, 2):
        k = bigram[0] + u"_" + bigram[1]
        self.bicount.update([k])
        self.bigram_to_ids[k] = self.bigram_to_ids.get(k, []) + [id_str]
    for trigram in ngrams(doc_text, 3):
        k = trigram[0] + u"_" + trigram[1] + u"_" + trigram[2]
        self.tricount.update([k])
        self.trigram_to_ids[k] = self.trigram_to_ids.get(k, []) + [id_str]

Author: jtmurphy89 | Project: twitter_challenge | Lines: 9 | Source: part1.py
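Example 2 maintains frequency counters plus inverted indexes from n-gram to document ids. The counting half can be reproduced with collections.Counter alone:

from collections import Counter
from nltk import ngrams

doc = "to be or not to be".split()
bigram_counts = Counter("_".join(bg) for bg in ngrams(doc, 2))
print(bigram_counts.most_common(2))
# [('to_be', 2), ('be_or', 1)]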
Example 3: get_gram_ratio

def get_gram_ratio(w2v, text1, text2, n_grams_1=1, n_grams_2=1, n_jobs=1):
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    # iter_product and similarity are project helpers (iter_product is presumably itertools.product)
    pairs = list(iter_product(t1, t2, repeat=1))
    res = list(map(lambda x: similarity(w2v, x), pairs))
    if len(res) == 0:
        return 0
    else:
        return np.mean(res)

Author: KhaoticMind | Project: kaggle-homedepot | Lines: 9 | Source: helper_processing.py
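Example 3 averages word2vec similarities over all n-gram pairs of two texts. A self-contained approximation using gensim 4.x; the toy corpus and model here are assumptions for illustration only:

from gensim.models import Word2Vec
from nltk import ngrams
import numpy as np

sentences = [["deck", "screws"], ["wood", "screws"], ["deck", "boards"]]
model = Word2Vec(sentences, vector_size=16, min_count=1, seed=1)

t1 = list(ngrams("deck screws".split(), 1))
t2 = list(ngrams("wood screws".split(), 1))
# mean pairwise similarity between the two texts' n-grams
print(np.mean([model.wv.n_similarity(list(a), list(b)) for a in t1 for b in t2]))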
Example 4: ngrams_extract

def ngrams_extract(string):
    if random.random() < SAMPLE_RATE:
        print('[*]', string)  # Python 2 print statement in the original
    l = list
    grams = l(ngrams(string, 2)) + l(ngrams(string, 3)) + l(ngrams(string, 4)) + l(ngrams(string, 5))
    SIZE = 1024
    vec = zeros((SIZE,))  # zeros/log come from numpy in the source module
    for t in grams:
        vec[hash(t) % SIZE] += 1
    return log(vec + 1.0)

Author: joshsaxe | Project: eXposeDeepNeuralNetwork | Lines: 10 | Source: features.py
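Example 4 is the hashing trick applied to character 2- to 5-grams. A hedged, self-contained version of the same technique (note that Python 3 salts hash() per process, so a stable hash such as mmh3 would be preferable in production):

import numpy as np
from nltk import ngrams

def char_ngram_vector(s, size=1024):
    grams = []
    for n in range(2, 6):
        grams.extend(ngrams(s, n))  # character n-grams, since s is a string
    vec = np.zeros(size)
    for g in grams:
        vec[hash(g) % size] += 1    # hash() is per-process salted in Python 3
    return np.log(vec + 1.0)

print(char_ngram_vector("example.com").sum())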
Example 5: build_ngram
def build_ngram(source):
    ngram_set = {}
    for key, value in source.items():
        ngram = []
        for line in value:
            if IS_PAD:
                ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL, pad_left=True, pad_right=True, pad_symbol='SSS'))
            else:
                ngram.extend(nltk.ngrams(line.strip(), NGRAM_LEVEL))
        ngram_set[key] = ngram
    return ngram_set
Author: Tiotao | Project: CS3245HW1 | Lines: 11 | Source: build_test_LM.py
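Example 5 pads line boundaries before extracting n-grams. Note that the keyword has changed: recent NLTK splits the single pad_symbol argument used above into left_pad_symbol and right_pad_symbol:

import nltk

print(list(nltk.ngrams("ab", 2, pad_left=True, pad_right=True,
                       left_pad_symbol='S', right_pad_symbol='S')))
# [('S', 'a'), ('a', 'b'), ('b', 'S')]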
Example 6: read_data

def read_data(type):  # NB: the parameter name `type` shadows the built-in
    datapath = '../data/' + type + '/'
    data = {}
    maxindex = 500
    count = 0
    unigrams = []
    bigrams = []
    dependecies = []
    for c in string.ascii_uppercase:
        data[c] = {}
        for i in range(1, maxindex):
            filename = datapath + c + str(i)
            txtpath = filename + '.data'
            metapath = filename + '.meta'
            text = read_file(txtpath)
            meta = read_file(metapath)
            if text is not None:
                count += 1
                # print (count)
                data[c][i] = {'text': text[0], 'meta': parse_meta(meta)}
                tokens = nltk.word_tokenize(text[0])
                data[c][i]['tokens'] = tokens
                data[c][i]['length'] = len(tokens)
                s = remove_punct(text[0])
                tokens = nltk.word_tokenize(remove_punct(s.lower()))
                data[c][i]['unigrams'] = list(nltk.ngrams(tokens, 1))
                data[c][i]['bigrams'] = list(nltk.ngrams(tokens, 2))
                # data[c][i]['dependencies'] = dependency_parse(text[0])
                # deppath = filename + '.dep'
                # with open(deppath, 'w') as f:
                #     json.dump(data[c][i]['dependencies'], f)
                # with open(deppath, 'r') as f:
                #     data[c][i]['dependencies'] = json.load(f)
                unigrams.extend(data[c][i]['unigrams'])
                bigrams.extend(data[c][i]['bigrams'])
                # dependecies.extend(data[c][i]['dependencies'])
        data[c]['sequences'] = gen_sequences(data[c])
    data['unigram_model'] = create_model(unigrams, maxfeat=5000, minfreq=3)
    data['bigram_model'] = create_model(bigrams, maxfeat=5000, minfreq=3)
    # data['dependencies'] = create_model(dependecies, maxfeat=5000, minfreq=3)
    # pprint.pprint(data['unigram_model'])
    # pprint.pprint(data['bigram_model'])
    # pprint.pprint(data['dependencies'])
    # print(type, count)
    return data

Author: patwaria | Project: stance_classification | Lines: 54 | Source: stance_classification.py
Example 7: lookup_phrases

def lookup_phrases(sentence, noun_types, ignore_case=False):
    # NB: list concatenation here assumes an ngrams() that returns lists;
    # nltk.ngrams in recent versions returns generators
    phrases = ngrams(sentence, 3) + ngrams(sentence, 2) + ngrams(sentence, 1)
    matches = []
    for phrase in phrases:
        if contains_noun(phrase):
            phrase_str = u' '.join(w.form for w in phrase)
            if ignore_case:
                phrase_str = phrase_str.lower()
            types = noun_types.get(phrase_str)
            if types:
                matches.append((phrase, types))
    return sorted(matches)

Author: Noahs-ARK | Project: semafor | Lines: 12 | Source: markup_sentence.py
Example 8: extract_ngrams

def extract_ngrams(self, memes):
    for meme_type in memes:
        for meme in memes[meme_type]:
            top_unigrams = meme[0]
            bottom_unigrams = meme[1]
            all_unigrams = top_unigrams + bottom_unigrams
            top_bigrams = ngrams(meme[0], 2)
            bottom_bigrams = ngrams(meme[1], 2)
            all_bigrams = top_bigrams + bottom_bigrams  # NB: requires a list-returning ngrams
            # NB: `key` is undefined in the original source; `meme_type` was likely intended
            self.add_ngrams(key, top_unigrams, bottom_unigrams, all_unigrams, top_bigrams, bottom_bigrams, all_bigrams)

Author: AlexeyMK | Project: DATASS | Lines: 12 | Source: NgramsManager.py
Example 9: get_gram_ratio
def get_gram_ratio(text1, text2, w2v, n_grams_1=1, n_grams_2=1, w=30, h=2000):
    arr = np.ndarray((w, h), np.float32)
    arr.fill(0)
    t1 = list(ngrams(text1.split(), n_grams_1))
    t2 = list(ngrams(text2.split(), n_grams_2))
    for i in range(len(t1)):
        for j in range(len(t2)):
            try:
                arr[i, j] = w2v.n_similarity(t1[i], t2[j])
            except:
                pass
    return arr
Author: KhaoticMind | Project: kaggle-homedepot | Lines: 12 | Source: neural_test.py
Example 10: generate_location_vector

def generate_location_vector(self, branch, index):
    if branch.text is not None:
        branch.text = branch.text.encode('ascii', 'ignore')
        if not branch.getchildren():
            sentences = branch.text.split('. ')
            for sentence in range(0, len(sentences)):
                # sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                words = sentences[sentence].split()
                for doc_word in range(0, len(words)):
                    word_location = (("{0}[{1}][{2}]".format(index, sentence, doc_word)), words[doc_word])
                    # any change in line below should be replicated in corpus.py also
                    # NB: the symbols string was mangled by the page scraper ("[email protected]");
                    # the punctuation set below is an approximate reconstruction
                    symbols = ".,[]();:<>+=&+%$@#~?{}|"
                    whitespace = " " * len(symbols)  # maketrans needs equal-length strings
                    replace = maketrans(symbols, whitespace)
                    doc_word = word_location[1].translate(replace)
                    doc_word = doc_word.lstrip()
                    doc_word = doc_word.rstrip()
                    if len(doc_word) > 1 and not len(doc_word) > 16:
                        self.doc_words.append(doc_word)
                doc_bigrams = bigrams(words)
                if not len(doc_bigrams) < 1:
                    doc_bigrams = self.n_gram_cleaner(doc_bigrams)
                    for bi_gram in doc_bigrams:
                        bi_gram = ' '.join(bi_gram)
                        self.bi_grams.append(bi_gram)
                doc_trigrams = trigrams(words)
                if not len(doc_trigrams) < 1:
                    doc_trigrams = self.n_gram_cleaner(doc_trigrams)
                    for tri_gram in doc_trigrams:
                        tri_gram = ' '.join(tri_gram)
                        self.tri_grams.append(tri_gram)
                doc_fourgrams = ngrams(words, 4)
                if not len(doc_fourgrams) < 1:
                    doc_fourgrams = self.n_gram_cleaner(doc_fourgrams)
                    for four_gram in doc_fourgrams:
                        four_gram = ' '.join(four_gram)
                        self.four_grams.append(four_gram)
                doc_fivegrams = ngrams(words, 5)
                if not len(doc_fivegrams) < 1:
                    doc_fivegrams = self.n_gram_cleaner(doc_fivegrams)
                    for five_gram in doc_fivegrams:
                        five_gram = ' '.join(five_gram)
                        self.five_grams.append(five_gram)
        else:
            for subtree in range(0, len(branch)):
                LocationVector.generate_location_vector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))

Author: arunenigma | Project: deva_algo | Lines: 53 | Source: doc_analyzer.py
Example 11: get_top_ngrams_tfidf

def get_top_ngrams_tfidf(text, collection, NGRAM=2, cutoff=100, docs=None):
    bigs = list(nltk.ngrams(text, NGRAM))  # materialized so len() works on recent NLTK
    print('totally', len(bigs), 'bigrams')
    bigs = remove_website_stopwords(bigs)
    freqdist = nltk.FreqDist(bigs)
    # NB: Py2-era NLTK returned keys() sorted by frequency; use freqdist.most_common(cutoff) today
    topwords = freqdist.keys()[:cutoff]
    # print len(topwords),'topwords:',topwords[:30],freqdist[topwords[0]],freqdist[topwords[1]]
    from math import log
    if True:  # do_tfidf
        df = {}
        df_les = {}
        df_time = {}
        tfidf = {}
        for doc_id, text in docs.items():
            words = [w for w in nltk.ngrams(text, NGRAM)]
            les_id, time_id = doc_id.split(':')
            time_id = time_id.replace('.csv', '')
            time_id = time_id[0:8]
            for w in words:
                df.setdefault(w, set())
                df[w].add(doc_id)
                df_les.setdefault(w, set())
                df_les[w].add(les_id)
                df_time.setdefault(w, set())
                df_time[w].add(time_id)
        _cutoff = 10000
        _topwords = freqdist.keys()[:_cutoff]
        df0, df1, df2 = {}, {}, {}
        for w in _topwords:
            # print w
            try: df0[w] = len(df[w])
            except: df0[w] = 0
            try: df1[w] = len(df_les[w])
            except: df1[w] = 0
            try: df2[w] = len(df_time[w])
            except: df2[w] = 0
            tfidf[w] = freqdist[w] / (1 + df0[w])
        # print df0
        # get sorted words in decreasing order of tfidf values
        sortedwords = sorted(tfidf.items(), key=itemgetter(1), reverse=True)
        sortedwords = sortedwords[:cutoff]
        topwords = [w for w, s in sortedwords]
        sortedwords0 = sorted(df0.items(), key=itemgetter(1), reverse=True)
        sortedwords1 = sorted(df1.items(), key=itemgetter(1), reverse=True)
        sortedwords2 = sorted(df2.items(), key=itemgetter(1), reverse=True)
        print('TF-IDF topwords:')
        print(len(topwords), 'topwords:', sortedwords[:50], freqdist[topwords[0]], freqdist[topwords[1]])
        print(sortedwords0[:30])
        print(sortedwords1[:30])
        print(sortedwords2[:30])
        return topwords, freqdist, df0, df1, df2
    return topwords, freqdist

Author: iamhighman | Project: GoogleNewsAnalysis | Lines: 52 | Source: nltk_utils.py
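The frequency-ranking idiom in Example 11 carries over to modern NLTK via most_common(), which replaces the old sliceable keys():

import nltk

text = "the cat sat on the mat the cat".split()
fd = nltk.FreqDist(nltk.ngrams(text, 2))
print(fd.most_common(3))
# [(('the', 'cat'), 2), (('cat', 'sat'), 1), (('sat', 'on'), 1)]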
Example 12: __call__

def __call__(self, words):
    grams = list(ngrams(words, 2)) + list(ngrams(words, 3))
    positives = [
        (i, len(gram), gram) for i, gram in enumerate(grams)
        if self.colls[len(gram)][gram]
    ]
    if not positives:
        return words
    positives.sort(key=lambda x: (x[1], len(words) - x[0]), reverse=True)
    matches, covered = self.__non_overlapping(positives)
    unigrams = [(i, w) for i, w in enumerate(words) if i not in covered]
    catted = sorted(matches + unigrams)
    return zip(*catted)[1]  # Python 2: zip() returns a list; use list(zip(*catted))[1] in Python 3

Author: JordiCarreraVentura | Project: wlp | Lines: 13 | Source: Collocations.py
Example 13: generateLocationVector

def generateLocationVector(self, branch, index):
    if branch.text is not None:
        branch.text = branch.text.encode('ascii', 'ignore')
        if not branch.getchildren():
            sentences = branch.text.split('. ')
            for sentence in range(0, len(sentences)):
                # sentence_location = (("{0}[{1}]".format(index, sentence)), sentences[sentence])
                words = sentences[sentence].split()
                for word in range(0, len(words)):
                    word_location = (("{0}[{1}][{2}]".format(index, sentence, word)), words[word])
                    # NB: the symbols string was mangled by the page scraper ("[email protected]");
                    # the punctuation set below is an approximate reconstruction
                    symbols = ",[]();:<>+=&+%$@#~?{}|"
                    whitespace = " " * len(symbols)  # maketrans needs equal-length strings
                    replace = maketrans(symbols, whitespace)
                    spec_word = word_location[1].translate(replace)
                    spec_word = spec_word.lstrip()
                    spec_word = spec_word.rstrip()
                    if len(spec_word) > 1 and not len(spec_word) > 16:
                        self.spec_words.append(spec_word)
                bi_grams = bigrams(words)
                if not len(bi_grams) < 1:
                    for bi_gram in bi_grams:
                        bi_gram = ' '.join(bi_gram)
                        self.bi_grams.append(bi_gram)
                tri_grams = trigrams(words)
                if not len(tri_grams) < 1:
                    for tri_gram in tri_grams:
                        tri_gram = ' '.join(tri_gram)
                        self.tri_grams.append(tri_gram)
                four_grams = ngrams(words, 4)
                if not len(four_grams) < 1:
                    for four_gram in four_grams:
                        four_gram = ' '.join(four_gram)
                        self.four_grams.append(four_gram)
                five_grams = ngrams(words, 5)
                if not len(five_grams) < 1:
                    for five_gram in five_grams:
                        five_gram = ' '.join(five_gram)
                        self.five_grams.append(five_gram)
        else:
            for subtree in range(0, len(branch)):
                Corpus.generateLocationVector(self, branch[subtree], ("{0}[{1}]".format(index, subtree)))

Author: arunenigma | Project: Scenario-Mining | Lines: 50 | Source: corpus.py
Example 14: __init__
def __init__(self, text, random_seed=5, shingle_length=5, minhash_size=200):
    split_text = text.split()
    if len(split_text) < shingle_length:
        raise ValueError(u'input text is too short for specified shingle length of {}'.format(shingle_length))
    self.minhash = []
    self.shingles = ngrams(split_text, shingle_length)
    for hash_seed in generate_random_seeds(minhash_size, random_seed):
        min_value = float('inf')
        for shingle in ngrams(split_text, shingle_length):
            value = mmh3.hash(' '.join(shingle), hash_seed)
            min_value = min(min_value, value)
        self.minhash.append(min_value)
Author: steven-s | Project: text-shingles | Lines: 14 | Source: shingles.py
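Example 14 computes a MinHash signature over word shingles: one minimum hash value per seed, across all shingles. A compact sketch of the same idea; generate_random_seeds is the project's own helper, replaced here by an assumed random.Random seed list, while mmh3.hash(key, seed) is the real murmurhash binding used above:

import random
import mmh3
from nltk import ngrams

def minhash_signature(text, shingle_length=3, minhash_size=8, random_seed=5):
    rng = random.Random(random_seed)
    seeds = [rng.randint(0, 2**31 - 1) for _ in range(minhash_size)]
    shingles = [' '.join(s) for s in ngrams(text.split(), shingle_length)]
    # one minimum per hash seed over all shingles
    return [min(mmh3.hash(s, seed) for s in shingles) for seed in seeds]

print(minhash_signature("a b c d e f g"))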
Example 15: train

def train(self, words, tagged=False):
    if tagged is True:
        tags = []
        for i in range(len(words)):
            tags.append(words[i][1])
        self.ngrams = list(nltk.ngrams(tags, self.n))
    else:
        # text = nltk.word_tokenize(words)
        tagged_words = nltk.pos_tag(words)
        universal_tags = [nltk.map_tag('en-ptb', 'universal', tag) for word, tag in tagged_words]
        self.ngrams = list(nltk.ngrams(universal_tags, self.n))
    self.frequencies = nltk.FreqDist(self.ngrams)
    self.probs_ng = nltk.MLEProbDist(self.frequencies)
    print(self.probs_ng)  # Python 2 print statement in the original

Author: sofiabroome | Project: wordpredictor | Lines: 14 | Source: GrammarModel.py
Example 16: jacquard_fivegram

def jacquard_fivegram(query):
    final = []
    n = 4
    for a in open('enwiktionary.a.list'):  # the original used the Python 2 file() builtin
        a = a.rstrip()
        fivegram = set(nltk.ngrams(a, 5))
        q_fivegram = set(nltk.ngrams(query, 5))
        intersect = q_fivegram.intersection(fivegram)
        union = q_fivegram.union(fivegram)
        sim = float(len(intersect)) / len(union)
        final.append([a, sim])
    final_sorted = sorted(final, key=lambda sim: sim[1], reverse=True)
    print(final_sorted[:10])

Author: jubimishra | Project: Data-Mining | Lines: 14 | Source: jacquard_vs_levenshtein.py
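Example 16's Jaccard ("jacquard") similarity over character five-grams, reduced to a reusable helper that also guards against empty sets:

import nltk

def jaccard_ngrams(a, b, n=5):
    A, B = set(nltk.ngrams(a, n)), set(nltk.ngrams(b, n))
    return len(A & B) / len(A | B) if (A or B) else 0.0

print(jaccard_ngrams("apples", "apple"))  # 0.5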
Example 17: count_alliteration
def count_alliteration(tokens):
    allit_instances = []
    # ignore stopwords
    tokens = [token for token in tokens if not (is_punctuation(token) or is_stopword(token))]
    bigrams = nltk.ngrams(tokens, 2)
    for one, two in bigrams:
        if has_alliteration(one, two):
            allit_instances.append((one, two))
    trigrams = nltk.ngrams(tokens, 3)
    for one, two, three in trigrams:
        # the not avoids double counting
        if has_alliteration(one, three) and not has_alliteration(one, two):
            allit_instances.append((one, two, three))
    return len(allit_instances)
Author: BAH-DSST | Project: QuantifyingRhetoric_ODSCEast2016 | Lines: 15 | Source: rhetoric.py
Example 18: calc_precision

def calc_precision(n, translation, reference):
    total = 0
    correct = 0
    for i in range(min(len(translation), len(reference))):
        # materialized: recent nltk.ngrams returns generators, which have no len()
        tra_ngrams = list(nltk.ngrams(translation[i].split(), n))
        ref_ngrams = list(nltk.ngrams(reference[i].split(), n))
        total += min(len(ref_ngrams), len(tra_ngrams))
        for ng in tra_ngrams:
            if ng in ref_ngrams:
                correct += 1
    print("total: " + str(total) + ", correct: " + str(correct))
    if total == 0:
        return 0
    precision = float(correct) / total
    return precision

Author: jvalansi | Project: Machine_Translation | Lines: 15 | Source: bleu.py
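Example 18 counts every candidate n-gram that appears anywhere in the reference, which can over-count repeated n-grams; BLEU-style clipped counts fix that. A hedged single-pair sketch:

from collections import Counter
import nltk

def modified_precision(candidate, reference, n):
    cand = Counter(nltk.ngrams(candidate.split(), n))
    ref = Counter(nltk.ngrams(reference.split(), n))
    # clip each candidate n-gram count by its count in the reference
    overlap = sum(min(count, ref[gram]) for gram, count in cand.items())
    total = sum(cand.values())
    return overlap / total if total else 0.0

print(modified_precision("the cat the cat", "the cat sat", 2))  # 1/3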
Example 19: get_date_from_utterance
def get_date_from_utterance(tokenized_utterance: List[Token],
                            year: int = 1993) -> List[datetime]:
    """
    When the year is not explicitly mentioned in the utterance, the query assumes that
    it is 1993 so we do the same here. If there is no mention of the month or day then
    we do not return any dates from the utterance.
    """
    dates = []
    utterance = ' '.join([token.text for token in tokenized_utterance])
    year_result = re.findall(r'199[0-4]', utterance)
    if year_result:
        year = int(year_result[0])
    trigrams = ngrams([token.text for token in tokenized_utterance], 3)
    for month, tens, digit in trigrams:
        # This will match something like ``september twenty first``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')
    bigrams = ngrams([token.text for token in tokenized_utterance], 2)
    for month, day in bigrams:
        if month in MONTH_NUMBERS and day in DAY_NUMBERS:
            # This will match something like ``september first``.
            try:
                dates.append(datetime(year, MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')
    fivegrams = ngrams([token.text for token in tokenized_utterance], 5)
    for tens, digit, _, year_match, month in fivegrams:
        # This will match something like ``twenty first of 1993 july``.
        day = ' '.join([tens, digit])
        if month in MONTH_NUMBERS and day in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[day]))
            except ValueError:
                print('invalid month day')
        if month in MONTH_NUMBERS and digit in DAY_NUMBERS and year_match.isdigit():
            try:
                dates.append(datetime(int(year_match), MONTH_NUMBERS[month], DAY_NUMBERS[digit]))
            except ValueError:
                print('invalid month day')
    return dates
Author: apmoore1 | Project: allennlp | Lines: 48 | Source: atis_tables.py
Example 20: __fromcursor__

def __fromcursor__(self):
    self.data = []
    for document in c['Body'][self.source].find({
        'term': self.term,
        'date': {'$gt': self.start_date, '$lt': self.stop_date},
        'str_type': self.str_type.__name__,
        'n': self.n
    }, {
        'documents': 1
    }, no_cursor_timeout=True):
        for _id in document['documents']:
            comment = get_comment(_id, self.source)
            gram_list = []
            for ngram in ngrams(comment[self.str_type.__name__], self.n):
                gram_list.append(Gram(ngram).term)
            if self.position:
                loc = gram_list.index(self.term) + position  # NB: bare `position` is undefined; likely self.position
                self[gram_list[loc]] + 1  # NB: probably intended `+= 1`
            else:
                gram_list.remove(self.term)
                for gram in gram_list:
                    self[gram] += 1
    try:
        self * (sum(self) ** -1)  # NB: the result of this normalization is discarded in the original
    except ZeroDivisionError:
        raise ValueError("No comments with term {} found".format(self.term))
    self.__tocollection__()

Author: deniederhut | Project: redicorpus | Lines: 27 | Source: objects.py
Note: The nltk.ngrams examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by various developers, and copyright remains with the original authors. Please consult each project's license before redistributing or reusing the code; do not repost without permission.