本文整理汇总了Python中nltk.collocations.TrigramCollocationFinder类的典型用法代码示例。如果您正苦于以下问题：Python TrigramCollocationFinder类的具体用法？Python TrigramCollocationFinder怎么用？Python TrigramCollocationFinder使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了TrigramCollocationFinder类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: ngram_collocation
def ngram_collocation(words, sents, n, support=10, topK=200):
    """Return the ``topK`` PMI-ranked collocations of order ``n``.

    For ``n`` equal to 2 or 3 the bigrams/trigrams of ``words`` are scored
    directly.  For ``n >= 4`` the best trigrams are computed first and then
    passed, together with ``sents``, to ``NgramCollocationExtender``.

    Args:
        words: Sequence of tokens to search for collocations.
        sents: Sentences handed to ``NgramCollocationExtender``; only used
            when ``n >= 4``.
        n: Order of the requested n-grams; must be 2, 3 or at least 4.
        support: Minimum frequency passed to ``apply_freq_filter``.
        topK: Number of best-scoring n-grams to retrieve.

    Returns:
        The n-grams that were also passed to ``print_ngrams``.

    Raises:
        ValueError: If ``n`` is not 2, 3 or >= 4.  Such values used to fall
            through every branch and crash on an unbound ``finder``.
    """
    if n == 2:
        finder = BigramCollocationFinder.from_words(words)
        ngram_measures = BigramAssocMeasures()
    elif n == 3 or n >= 4:
        # Trigrams are the final answer for n == 3 and the starting point
        # of the extension step for n >= 4.
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
    else:
        raise ValueError("n must be 2, 3 or >= 4, got %r" % (n,))

    finder.apply_freq_filter(support)
    # The collocation measure currently used is PMI.
    pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)

    if n < 4:
        print_ngrams(pmi_ngrams)
        return pmi_ngrams

    ext_ngrams = NgramCollocationExtender(pmi_ngrams, sents, support/3, 0.3)
    print_ngrams(ext_ngrams)
    return ext_ngrams
开发者ID:chqsark,项目名称:hightext,代码行数:25,代码来源:nlp_module.py
示例2: collocations
def collocations(stream, top_n=10000, min_bigram_freq=50, min_trigram_freq=20):
    """Extract text collocations (bigrams and trigrams), from a stream of words.

    Parameters
    ----------
    stream: iterable object
        An iterable of words
    top_n: int
        Maximum number of bigrams, and of trigrams, to retrieve. Trigrams are
        ranked by chi-square score and bigrams by PMI. Default is 10000
    min_bigram_freq: int
        Minimum frequency of a bigram in order to retrieve it. Default is 50.
    min_trigram_freq: int
        Minimum frequency of a trigram in order to retrieve it. Default is 20.

    Returns
    -------
    tuple
        ``(bigrams_patterns, trigrams_patterns)``: two compiled regular
        expressions, each an alternation of the space-joined n-grams found.
    """
    tcf = TrigramCollocationFinder.from_words(stream)
    tcf.apply_freq_filter(min_trigram_freq)
    trigram_tuples = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    trigrams = [' '.join(w) for w in trigram_tuples]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_bigram_freq)
    bigram_tuples = bcf.nbest(BigramAssocMeasures.pmi, top_n)
    bigrams = [' '.join(w) for w in bigram_tuples]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    # Escape each token before building the alternation so that words
    # containing regex metacharacters (e.g. "c++") are matched literally
    # instead of corrupting the pattern.  Tokens are escaped one by one so
    # the separating space stays a plain space.
    bigrams_patterns = re.compile(
        '(%s)' % '|'.join(' '.join(re.escape(t) for t in w) for w in bigram_tuples),
        re.UNICODE)
    trigrams_patterns = re.compile(
        '(%s)' % '|'.join(' '.join(re.escape(t) for t in w) for w in trigram_tuples),
        re.UNICODE)
    return bigrams_patterns, trigrams_patterns
开发者ID:lewismc,项目名称:topik,代码行数:33,代码来源:tokenizers.py
示例3: trigramFeats
def trigramFeats(thesewords, n=100):
    """Build a ``{feature: True}`` dict from word triples and their trigrams.

    The input is consumed three tokens at a time; every group is joined with
    single spaces into one string (a short final group is padded with empty
    strings).  Those strings are then fed to ``TrigramCollocationFinder``.

    Args:
        thesewords: Iterable of string tokens.
        n: Used both as the frequency filter threshold and as the number of
            best trigrams (by likelihood ratio) to keep.

    Returns:
        dict mapping every joined group and every selected trigram to True.
    """
    token_iter = iter(thesewords)
    chunks = []
    for first in token_iter:
        # The next two tokens come from the same iterator, so the groups do
        # not overlap.
        second = next(token_iter, '')
        third = next(token_iter, '')
        chunks.append(first + " " + second + " " + third)

    finder = TrigramCollocationFinder.from_words(chunks)
    finder.apply_freq_filter(n)
    best = finder.nbest(TrigramAssocMeasures.likelihood_ratio, n)

    features = {}
    for ngram in itertools.chain(chunks, best):
        features[ngram] = True
    return features
开发者ID:levidehaan,项目名称:securitynowbigdataproject,代码行数:7,代码来源:frameparser.py
示例4: get_frequencies
def get_frequencies(self, desc):
    """Print and return the most frequent words, bigrams and trigrams of a text.

    Args:
        desc: Text to analyse; it is tokenised with ``word_tokenize``.

    Returns:
        list: ``[single, bigrm, trigrm]`` where ``single`` holds the 20 most
        common non-stop-word tokens with their counts, ``bigrm`` the 15 best
        bigrams and ``trigrm`` the 10 best trigrams (both ranked by
        likelihood ratio).
    """
    stopset = set(stopwords.words('english'))

    def filter_stops(w):
        # True for tokens to discard: very short words and English stop words.
        return len(w) < 3 or w in stopset

    words = word_tokenize(desc)

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('------gram--------')
    words_to_count = [word for word in words if not filter_stops(word)]
    single = Counter(words_to_count).most_common(20)
    print(single)

    print('------bigram--------')
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_word_filter(filter_stops)
    bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
    print(bigrm)

    print('------trigram--------')
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_word_filter(filter_stops)
    tcf.apply_freq_filter(3)  # only keep trigrams that appear at least 3 times
    trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
    print(trigrm)

    return [single, bigrm, trigrm]
开发者ID:amac441,项目名称:Metten,代码行数:28,代码来源:grapher.py
示例5: trigram
def trigram(words, score_fn=TrigramAssocMeasures.likelihood_ratio, n=1500, freq=1):
    """Return the best trigrams of ``words`` as a ``{joined_trigram: count}`` dict.

    Args:
        words: Sized, re-iterable collection of hashable string tokens.
        score_fn: Association measure used to rank the trigrams.
        n: Number of best-scoring trigrams to keep.
        freq: Minimum frequency passed to ``apply_freq_filter``.

    Returns:
        dict: Keys are the three words of a trigram concatenated without a
        separator; values count how many selected trigrams produced that key.
        An empty dict is returned when ``words`` is empty or contains fewer
        than three distinct tokens.
    """
    if len(words) <= 0:
        return {}
    # Fewer than three distinct tokens: nothing useful to score.
    if len(set(words)) < 3:
        return {}

    trigram_finder = TrigramCollocationFinder.from_words(words)  # collect trigram candidates
    trigram_finder.apply_freq_filter(freq)
    trigrams = trigram_finder.nbest(score_fn, n)  # keep the n best by score_fn

    res = {}
    for s in trigrams:
        key = s[0] + s[1] + s[2]
        # dict.has_key() no longer exists on Python 3; .get() works on both.
        res[key] = res.get(key, 0) + 1
    return res
开发者ID:cysjtu,项目名称:SentimentAnalysis,代码行数:34,代码来源:nlp_machine2222.py
示例6: best_ngrams
def best_ngrams(words, top_n=10, min_freq=5):
    """
    Extract `top_n` most salient collocations (bigrams and trigrams),
    from a stream of words. Ignore collocations with frequency
    lower than `min_freq`.

    This fnc uses NLTK for the collocation detection itself -- not very scalable!

    Return the detected ngrams as compiled regular expressions, for their faster
    detection later on: a ``(bigram_pattern, trigram_pattern)`` tuple. The
    bigram pattern is also printed.
    """
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigram_tuples = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    trigrams = [' '.join(w) for w in trigram_tuples]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigram_tuples = bcf.nbest(BigramAssocMeasures.pmi, top_n)
    bigrams = [' '.join(w) for w in bigram_tuples]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    # Escape every token so that regex metacharacters inside a word are
    # matched literally; the joining space is left untouched.
    pat_gram2 = re.compile(
        '(%s)' % '|'.join(' '.join(re.escape(t) for t in w) for w in bigram_tuples),
        re.UNICODE)
    pat_gram3 = re.compile(
        '(%s)' % '|'.join(' '.join(re.escape(t) for t in w) for w in trigram_tuples),
        re.UNICODE)
    # Single-argument print(...) works on both Python 2 and Python 3.
    print(pat_gram2)
    return pat_gram2, pat_gram3
开发者ID:ninamiriamjnana,项目名称:topic,代码行数:28,代码来源:get_data.py
示例7: create_tri_collocations
def create_tri_collocations(features_words,document_preprocess):
    """Return preprocessed trigram collocations of the ``movie_reviews`` words.

    The 1000 best trigrams by PMI (after ``apply_freq_filter(3)``) are run
    through every callable in ``document_preprocess`` in turn.  A trigram is
    dropped as soon as a callable returns a falsy value for one of its words.

    Args:
        features_words: Not used by this function.
        document_preprocess: Iterable of one-argument callables applied to
            each word.

    Returns:
        list of 3-tuples holding the transformed words.
    """
    finder = TrigramCollocationFinder.from_words(movie_reviews.words())
    finder.apply_freq_filter(3)
    collocations = finder.nbest(trigram_measures.pmi,1000)

    for transform in document_preprocess:
        kept = []
        for (a, b, c) in collocations:
            if transform(a) and transform(b) and transform(c):
                kept.append((transform(a), transform(b), transform(c)))
        collocations = kept
    return collocations
开发者ID:katomaso,项目名称:LiU-TextMining,代码行数:7,代码来源:Lab5-2.py
示例8: __init__
def __init__(self, words, sentences, language):
    """Compute summary statistics and collocations for a text.

    Args:
        words: Sequence of tokens of the whole text.
        sentences: Sequence of sentences.
        language: Language name passed to ``stopwords.words``.
    """
    self.num_words = len(words)
    self.unique_words = len(set(words))
    self.num_sentences = len(sentences)
    # Guard both ratios so an empty text no longer raises ZeroDivisionError.
    self.average_sentence_length = (
        round(self.num_words / self.num_sentences) if self.num_sentences else 0)
    self.lexical_diversity = (
        round(self.num_words / self.unique_words) if self.unique_words else 0)

    fdist = FreqDist(words)
    # A set gives O(1) membership tests instead of scanning the list for
    # every word.
    stop_words = set(stopwords.words(language))
    not_stopwords = [w for w in words if w not in stop_words]
    fdist2 = FreqDist(not_stopwords)
    self.fifty_first_words = fdist.most_common(50)
    # Despite the attribute name, 300 entries are kept.
    self.hundreds_nsw = fdist2.most_common(300)

    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(words)
    finder.apply_freq_filter(10)
    self.fifty_collocations = finder.nbest(bigram_measures.pmi, 50)

    trigram_measures = TrigramAssocMeasures()
    finder3 = TrigramCollocationFinder.from_words(words)
    finder3.apply_freq_filter(10)
    self.fifty_collocations3 = finder3.nbest(trigram_measures.pmi, 50)

    # NOTE(review): `sent.lower()` needs `sent` to be a string, whereas
    # `' '.join(sent)` suggests a list of tokens -- confirm what callers
    # pass; behaviour is left unchanged here.
    self.stcs_width_words = [' '.join(sent) for sent in sentences
                             if "malheureusement" in sent.lower()]
开发者ID:Raveline,项目名称:journal-imaginaire,代码行数:26,代码来源:analyst.py
示例9: extract_trigrams
def extract_trigrams(self, sent):
    """Return the known trigrams found in ``sent`` as a ``{trigram: True}`` dict.

    ``sent`` is first run through ``self._preprocess_sent``.  Up to 10000
    trigrams ranked by PMI are joined with single spaces, and only those also
    present in ``self._trigrams_set`` are kept.
    """
    tokens = self._preprocess_sent(sent)
    measures = TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(tokens)

    candidates = set()
    for gram in finder.nbest(measures.pmi, 10000):
        candidates.add(' '.join(gram))

    known = candidates & self._trigrams_set
    return dict.fromkeys(known, True)
开发者ID:aginiewicz,项目名称:EmoClassifier,代码行数:8,代码来源:feature_extraction.py
示例10: set_trigramas
def set_trigramas(self,freq=2,best=20):
    """Store the best PMI-ranked trigrams of ``self.palavras`` in ``self.trigramas``.

    Words shorter than three characters and Portuguese stop words are
    filtered out before scoring.

    Args:
        freq: Minimum frequency passed to ``apply_freq_filter``.
        best: Number of trigrams to keep.
    """
    finder = TrigramCollocationFinder.from_words(self.palavras)
    portuguese_stops = set(stopwords.words('portuguese'))

    def is_noise(token):
        # Same test as "too short or a stop word", written via De Morgan.
        return not (len(token) >= 3 and token not in portuguese_stops)

    finder.apply_word_filter(is_noise)
    finder.apply_freq_filter(freq)
    self.trigramas = finder.nbest(TrigramAssocMeasures.pmi, best)
开发者ID:eric011,项目名称:SemantikaCrawler,代码行数:8,代码来源:lingprocessador.py
示例11: calc_trigrams
def calc_trigrams(text, min_freq=50):
    """Append the trigram frequencies of ``text`` to ``trigram_list`` and return it.

    Every element of ``text`` is lower-cased before the trigrams are
    collected, and ``min_freq`` is passed to ``apply_freq_filter``.

    NOTE(review): ``trigram_list`` is not defined in this function; it is
    presumably a module-level list that accumulates results across calls --
    confirm against the rest of the module.
    """
    lowered = [token.lower() for token in text]
    finder = TrigramCollocationFinder.from_words(lowered)
    finder.apply_freq_filter(min_freq)
    trigram_list.append(finder.ngram_fd.items())
    return trigram_list
开发者ID:djkn0x,项目名称:GA_homework,代码行数:8,代码来源:trigrams.py
示例12: trigram_word_feats
def trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=50):
    """Return a ``{feature: True}`` dict of the words plus their best trigrams.

    Args:
        words: Re-iterable collection of tokens.
        score_fn: Association measure used to rank the trigrams.
        n: Number of best trigrams to add as features.

    Returns:
        dict mapping every word and every selected trigram to True.  If
        scoring fails, a message is printed and only the words are used.
    """
    trigram_finder = TrigramCollocationFinder.from_words(words)
    try:
        trigrams = trigram_finder.nbest(score_fn, n)
    except Exception:
        # Deliberate best-effort fallback.  `except Exception` (instead of a
        # bare `except:`) keeps KeyboardInterrupt/SystemExit propagating.
        print("lost trigrams %s" % (words,))
        return {word: True for word in words}
    return {ngram: True for ngram in itertools.chain(words, trigrams)}
开发者ID:zhouxiaofan,项目名称:Projects,代码行数:9,代码来源:sentimentMapper.py
示例13: getTrigrams
def getTrigrams(self):
    """Return the 6 best trigrams of ``self.text`` ranked by likelihood ratio.

    The text is tokenised and lower-cased; words shorter than three
    characters and English stop words are filtered out before scoring.
    """
    lowered = []
    for token in nltk.word_tokenize(self.text):
        lowered.append(token.lower())
    finder = TrigramCollocationFinder.from_words(lowered)
    english_stops = set(stopwords.words('english'))

    def is_noise(token):
        # Equivalent to "too short or a stop word".
        return not (len(token) >= 3 and token not in english_stops)

    finder.apply_word_filter(is_noise)
    finder.apply_freq_filter(1)
    best = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 6)
    return best
开发者ID:DevilDante88,项目名称:MyCogs,代码行数:9,代码来源:postagger.py
示例14: getTrigram
def getTrigram(haystack):
    """Return the 4 best trigrams of ``haystack`` ranked by likelihood ratio.

    The text is split with ``WordPunctTokenizer``; words shorter than three
    characters and English stop words are filtered out before scoring.
    """
    tokens = WordPunctTokenizer().tokenize(haystack)
    finder = TrigramCollocationFinder.from_words(tokens)
    english_stops = set(stopwords.words('english'))

    def is_noise(token):
        # Equivalent to "too short or a stop word".
        return not (len(token) >= 3 and token not in english_stops)

    finder.apply_word_filter(is_noise)
    best = finder.nbest(TrigramAssocMeasures.likelihood_ratio, 4)
    return best
开发者ID:blorenz,项目名称:cms,代码行数:9,代码来源:seo.py
示例15: best_n_trigrams
def best_n_trigrams(self, n, method="pmi"):
    """Return the ``n`` best trigrams of ``self.get_word_lst()``.

    Args:
        n: Number of trigrams to return.
        method: Name of the ranking measure, either ``"pmi"`` or
            ``"raw_freq"``.

    Returns:
        list of the ``n`` best trigrams for the chosen measure.

    Raises:
        ValueError: If ``method`` is not a supported measure.  Such values
            used to fall off the end of the function and return None.
    """
    supported = ("pmi", "raw_freq")
    if method not in supported:
        raise ValueError(
            "unknown method %r; expected one of %s" % (method, ", ".join(supported)))

    trigram_measures = TrigramAssocMeasures()
    tokens = self.get_word_lst()
    finder = TrigramCollocationFinder.from_words(tokens)
    return finder.nbest(getattr(trigram_measures, method), n)
开发者ID:WilliamHammond,项目名称:fbcanalyzer,代码行数:9,代码来源:ChatStream.py
示例16: setup_mwes
def setup_mwes(self, trigram_nbest=100, bigram_nbest=2000):
    """Create multi-word expressions by learning a corpus located in a corpus directory.

    Testing setting up mwes with custom path and setting it up twice (correct when no exception):

    >>> corpus_dir = os.path.join(base_path, 'test', 'corpus')
    >>> clusterer = DumbClusterer(corpus_dir=corpus_dir, mwes=['custom mwe'])
    >>> mwes = clusterer.setup_mwes(trigram_nbest=1000, bigram_nbest=15000)
    >>> 'custom mwe' not in mwes
    True
    >>> 'custom mwe' in clusterer.mwes
    True

    Args:
        trigram_nbest(int): Number of highest ranked trigrams to acquire.
        bigram_nbest(int): Number of highest ranked bigrams to acquire.

    Returns:
        list: The newly learned multi-word expressions (trigrams followed by
        bigrams). ``self.mwes`` is updated to the union of its previous
        content and this list.

    Raises:
        RuntimeError: If ``self.corpus`` is None.
    """
    if self.corpus is None:
        # RuntimeError is a subclass of Exception, so callers that caught the
        # former bare Exception keep working.
        raise RuntimeError("Corpus not found. Run method `setup_corpus` with given corpus directory first.")

    bigram_measures = BigramAssocMeasures()
    trigram_measures = TrigramAssocMeasures()

    # A variant based on nltk.ne_chunk over POS-tagged sentences was tried
    # here but is not used since ne chunk takes too much time.

    # Text processing before bigrams and trigrams calculated:
    # - removal of words containing numbers or punctuations
    # - lowercasing all words
    punctuation = set(string.punctuation)
    words = [
        w.lower()
        for w in self.corpus.words()
        if not any((ch.isdigit() or ch in punctuation) for ch in w)
    ]

    bigram_finder = BigramCollocationFinder.from_words(words)
    trigram_finder = TrigramCollocationFinder.from_words(words)
    mwes = (trigram_finder.nbest(trigram_measures.pmi, trigram_nbest)
            + bigram_finder.nbest(bigram_measures.pmi, bigram_nbest))

    # Merge with the existing expressions through a set union.  Existing
    # entries may be strings or lists; lists are unhashable and are turned
    # into tuples first.
    set1 = {(tuple(mwe) if isinstance(mwe, list) else mwe) for mwe in self.mwes}
    set2 = set(mwes)
    self.mwes = list(set1 | set2)
    return mwes
开发者ID:jaycode,项目名称:-archive-Arthur,代码行数:56,代码来源:dumb_clusterer.py
示例17: trigrams
def trigrams(words, max_trigrams=100):
    """Yield the most frequent trigrams of ``words`` that are not in ``tg``.

    The trigrams are scored by raw frequency and only the first
    ``max_trigrams`` are considered.  Each one is lemmatised with ``lmtzr``;
    if the lemmatised form is contained in ``tg`` the trigram is reported
    and skipped.

    Args:
        words: Sequence of tokens.
        max_trigrams: Number of top-scored trigrams to examine.

    Yields:
        The original (non-lemmatised) trigram tuples that were not skipped.
    """
    # print(...) with one pre-formatted string prints the same text on
    # Python 2 and Python 3.
    print("Extracting trigrams")
    trigram_finder = TrigramCollocationFinder.from_words(words)
    scored = trigram_finder.score_ngrams(trigram_measures.raw_freq)
    for trigram, _score in scored[:max_trigrams]:
        l_trigram = [lmtzr.lemmatize(p) for p in trigram]
        if l_trigram in tg:
            print("Common trigram %s" % (trigram,))
            continue
        yield trigram
开发者ID:cratejoy,项目名称:nladwords,代码行数:12,代码来源:nl_util.py
示例18: find_collocations
def find_collocations(text_series):
    """Score all bigrams and trigrams of ``text_series`` and pickle the results.

    The token lists in ``text_series`` are flattened into one sequence, and
    both n-gram orders are scored by likelihood ratio.  The scored bigrams
    are written to ``bigrams.pkl`` and the scored trigrams to
    ``trigrams.pkl`` in the current working directory.

    Args:
        text_series: Iterable of token lists.
    """
    tokens = []
    for token_list in text_series:
        tokens.extend(token_list)

    scored_bigrams = BigramCollocationFinder.from_words(tokens).score_ngrams(
        BigramAssocMeasures().likelihood_ratio)
    scored_trigrams = TrigramCollocationFinder.from_words(tokens).score_ngrams(
        TrigramAssocMeasures().likelihood_ratio)

    outputs = (('bigrams.pkl', scored_bigrams), ('trigrams.pkl', scored_trigrams))
    for filename, payload in outputs:
        with open(filename, 'wb') as fid:
            cPickle.dump(payload, fid)
开发者ID:annamarie-g,项目名称:capstone_project,代码行数:12,代码来源:text_preprocessing.py
示例19: _collect_bigrams_and_trigrams
def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None, stopwords=None):
    """collects bigrams and trigrams from collection of documents.  Input to collocation tokenizer.

    bigrams are pairs of words that recur in the collection; trigrams are triplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    top_n : int
        limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_freqs : None or sequence of int
        Minimum frequency for an n-gram to be recognized, starting with
        bigrams: ``min_freqs[0]`` applies to bigrams, ``min_freqs[1]`` to
        trigrams.  None falls back to ``[50, 20]``.
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Returns
    -------
    tuple
        ``(bigrams_patterns, trigrams_patterns)``: two compiled regular
        expressions, each an alternation of the space-joined n-grams found.

    Examples
    --------
    >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """
    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

    if min_freqs is None:
        # The signature advertises None as the default, but it used to be
        # subscripted directly and raised TypeError.
        min_freqs = [50, 20]

    # generator of documents, turn each element to its list of words
    doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords)
                 for doc_id, doc_text in raw_corpus)
    # generator, concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(doc_texts)
    tcf = TrigramCollocationFinder.from_words(iter(words))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freqs[0])
    bigram_tuples = bcf.nbest(BigramAssocMeasures.pmi, top_n)

    tcf.apply_freq_filter(min_freqs[1])
    trigram_tuples = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)

    # Escape each token individually so regex metacharacters inside a word
    # are matched literally; the joining space stays a plain space, so
    # patterns built from ordinary words are unchanged.
    bigrams_patterns = re.compile(
        '(%s)' % '|'.join(' '.join(re.escape(t) for t in w) for w in bigram_tuples),
        re.UNICODE)
    trigrams_patterns = re.compile(
        '(%s)' % '|'.join(' '.join(re.escape(t) for t in w) for w in trigram_tuples),
        re.UNICODE)
    return bigrams_patterns, trigrams_patterns
开发者ID:gpfreitas,项目名称:topik,代码行数:49,代码来源:ngrams.py
示例20: best_ngrams
def best_ngrams(words, top_n=1000, min_freq=100):
    """Return regex patterns matching the best bigrams and trigrams of ``words``.

    Args:
        words: Iterable of tokens.
        top_n: Maximum number of bigrams, and of trigrams, to keep.  Trigrams
            are ranked by chi-square score and bigrams by PMI.
        min_freq: Minimum frequency passed to ``apply_freq_filter`` for both
            n-gram orders.

    Returns:
        tuple: ``(pat_gram2, pat_gram3)``, compiled regular expressions that
        are alternations of the space-joined bigrams and trigrams.
    """
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    trigram_tuples = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    trigrams = [' '.join(w) for w in trigram_tuples]
    logging.info('%i trigrams found: %s...', len(trigrams), trigrams[:10])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    bigram_tuples = bcf.nbest(BigramAssocMeasures.pmi, top_n)
    bigrams = [' '.join(w) for w in bigram_tuples]
    logging.info('%i bigrams found: %s...', len(bigrams), bigrams[:10])

    # Escape every token so that regex metacharacters inside a word are
    # matched literally; the joining space is left untouched.
    pat_gram2 = re.compile(
        '(%s)' % '|'.join(' '.join(re.escape(t) for t in w) for w in bigram_tuples),
        re.UNICODE)
    pat_gram3 = re.compile(
        '(%s)' % '|'.join(' '.join(re.escape(t) for t in w) for w in trigram_tuples),
        re.UNICODE)
    return pat_gram2, pat_gram3
开发者ID:codekansas,项目名称:icm-2016-topic-modeling,代码行数:15,代码来源:topic_model_news.py
注：本文中的nltk.collocations.TrigramCollocationFinder类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论