• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python collocations.TrigramCollocationFinder类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中nltk.collocations.TrigramCollocationFinder的典型用法代码示例。如果您正苦于以下问题：Python TrigramCollocationFinder类的具体用法？Python TrigramCollocationFinder怎么用？Python TrigramCollocationFinder使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了TrigramCollocationFinder类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: ngram_collocation

def ngram_collocation(words, sents, n, support=10, topK=200):
    """Find the top PMI-ranked n-gram collocations in ``words``.

    For ``n`` equal to 2 or 3 the bigram/trigram collocations are returned
    directly. For ``n >= 4`` trigram collocations are computed first and then
    handed to ``NgramCollocationExtender`` together with ``sents``.

    Args:
        words: Token sequence passed to the collocation finder.
        sents: Sentences forwarded to ``NgramCollocationExtender``; only
            used when ``n >= 4``.
        n: Requested n-gram size; must be 2, 3, or at least 4.
        support: Minimum frequency passed to ``apply_freq_filter``.
        topK: Number of n-grams requested from ``nbest``.

    Returns:
        The ``nbest`` result for ``n`` of 2 or 3, or the value produced by
        ``NgramCollocationExtender`` for ``n >= 4``. The result is also
        passed to ``print_ngrams`` before being returned.

    Raises:
        ValueError: If ``n`` is not 2, 3, or >= 4 (previously this surfaced
            as an UnboundLocalError because no finder was ever created).
    """
    if n >= 4:
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
        finder.apply_freq_filter(support)
        # The current collocation measure is PMI.
        pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)
        # NOTE(review): `support/3` is integer division on Python 2 and true
        # division on Python 3 -- confirm which one the extender expects.
        ext_ngrams = NgramCollocationExtender(pmi_ngrams, sents, support/3, 0.3)
        print_ngrams(ext_ngrams)
        return ext_ngrams

    if n == 2:
        finder = BigramCollocationFinder.from_words(words)
        ngram_measures = BigramAssocMeasures()
    elif n == 3:
        finder = TrigramCollocationFinder.from_words(words)
        ngram_measures = TrigramAssocMeasures()
    else:
        raise ValueError("n must be 2, 3, or >= 4, got %r" % (n,))

    finder.apply_freq_filter(support)
    pmi_ngrams = finder.nbest(ngram_measures.pmi, topK)

    print_ngrams(pmi_ngrams)
    return pmi_ngrams
开发者ID:chqsark,项目名称:hightext,代码行数:25,代码来源:nlp_module.py


示例2: collocations

def collocations(stream, top_n=10000, min_bigram_freq=50, min_trigram_freq=20):
    """Extract text collocations (bigrams and trigrams) from a stream of words.

    Parameters
    ----------
    stream: iterable object
        An iterable of words

    top_n: int
        Number of collocations of each kind to request, ranked by the
        association measure (chi-squared for trigrams, PMI for bigrams).
        Default is 10000

    min_bigram_freq: int
        Minimum frequency of a bigram in order to retrieve it. Default is 50.

    min_trigram_freq: int
        Minimum frequency of a trigram in order to retrieve it. Default is 20.

    Returns
    -------
    tuple
        ``(bigrams_patterns, trigrams_patterns)``: two compiled regular
        expressions, each a single group alternating over the space-joined
        n-grams.

    Notes
    -----
    When no n-gram survives the filters the corresponding pattern is ``()``,
    which matches the empty string.
    """
    tcf = TrigramCollocationFinder.from_words(stream)

    tcf.apply_freq_filter(min_trigram_freq)
    best_trigrams = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    trigrams = [' '.join(w) for w in best_trigrams]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_bigram_freq)
    best_bigrams = bcf.nbest(BigramAssocMeasures.pmi, top_n)
    bigrams = [' '.join(w) for w in best_bigrams]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    # Escape each token so regex metacharacters inside a word (e.g. "c++")
    # are matched literally instead of corrupting the alternation. The
    # separating space is left unescaped so ordinary words yield the same
    # pattern text as before.
    bigram_alternatives = [' '.join(re.escape(tok) for tok in w) for w in best_bigrams]
    trigram_alternatives = [' '.join(re.escape(tok) for tok in w) for w in best_trigrams]

    bigrams_patterns = re.compile('(%s)' % '|'.join(bigram_alternatives), re.UNICODE)
    trigrams_patterns = re.compile('(%s)' % '|'.join(trigram_alternatives), re.UNICODE)

    return bigrams_patterns, trigrams_patterns
开发者ID:lewismc,项目名称:topik,代码行数:33,代码来源:tokenizers.py


示例3: trigramFeats

def trigramFeats(thesewords, n=100):
    """Build a feature dict from three-token chunks and their collocations.

    ``thesewords`` is consumed in consecutive, non-overlapping groups of three
    tokens; each group is joined with single spaces into one string (missing
    tokens at the tail are replaced by empty strings). A trigram collocation
    finder is then run over those chunk strings.

    Args:
        thesewords: Iterable of string tokens.
        n: Used both as the minimum frequency for ``apply_freq_filter`` and
            as the number of trigrams requested from ``nbest``.

    Returns:
        dict mapping every chunk string and every selected trigram to True.
    """
    token_iter = iter(thesewords)
    chunks = []
    for first in token_iter:
        # Pull the next two tokens from the same iterator; '' pads the tail.
        second = next(token_iter, '')
        third = next(token_iter, '')
        chunks.append(first + " " + second + " " + third)

    finder = TrigramCollocationFinder.from_words(chunks)
    finder.apply_freq_filter(n)
    best = finder.nbest(TrigramAssocMeasures.likelihood_ratio, n)
    return dict.fromkeys(itertools.chain(chunks, best), True)
开发者ID:levidehaan,项目名称:securitynowbigdataproject,代码行数:7,代码来源:frameparser.py


示例4: get_frequencies

    def get_frequencies(self, desc):
        """Print and return the most frequent words, bigrams and trigrams.

        Tokens shorter than three characters or present in the English
        stop-word list are ignored for all three rankings.

        Args:
            desc: Text to analyse; tokenized with ``word_tokenize``.

        Returns:
            list: ``[single, bigrm, trigrm]`` where ``single`` holds the 20
            most common ``(word, count)`` pairs, ``bigrm`` the 15 best
            bigrams and ``trigrm`` the 10 best trigrams, both ranked by
            likelihood ratio.
        """
        stopset = set(stopwords.words('english'))

        def filter_stops(w):
            # True for tokens that should be discarded.
            return len(w) < 3 or w in stopset

        words = word_tokenize(desc)

        # print(...) with a single argument behaves the same on Python 2 and 3.
        print('------gram--------')
        words_to_count = [word for word in words if not filter_stops(word)]
        single = Counter(words_to_count).most_common(20)
        print(single)

        print('------bigram--------')
        bcf = BigramCollocationFinder.from_words(words)
        bcf.apply_word_filter(filter_stops)
        bigrm = bcf.nbest(BigramAssocMeasures.likelihood_ratio, 15)
        print(bigrm)

        print('------trigram--------')
        tcf = TrigramCollocationFinder.from_words(words)
        tcf.apply_word_filter(filter_stops)
        tcf.apply_freq_filter(3)  # drop trigrams that occur fewer than 3 times
        trigrm = tcf.nbest(TrigramAssocMeasures.likelihood_ratio, 10)
        print(trigrm)

        return [single, bigrm, trigrm]
开发者ID:amac441,项目名称:Metten,代码行数:28,代码来源:grapher.py


示例5: trigram

def trigram(words, score_fn=TrigramAssocMeasures.likelihood_ratio, n=1500, freq=1):
    """Count the top-scoring trigram collocations found in ``words``.

    Args:
        words: Sized iterable of hashable string tokens (``len`` is called
            on it and its tokens are concatenated).
        score_fn: Association measure used to rank trigrams.
        n: Number of trigrams requested from ``nbest``.
        freq: Minimum frequency passed to ``apply_freq_filter``.

    Returns:
        dict: Maps the three tokens of each selected trigram, concatenated
        without a separator, to the number of selected trigrams that produce
        that key. Empty when ``words`` is empty or contains fewer than three
        distinct tokens.
    """
    if len(words) <= 0:
        return {}

    # Fewer than three distinct tokens: nothing useful to score.
    if len(set(words)) < 3:
        return {}

    # Collect trigram collocations, drop the rare ones, keep the n best
    # according to score_fn.
    trigram_finder = TrigramCollocationFinder.from_words(words)
    trigram_finder.apply_freq_filter(freq)
    trigrams = trigram_finder.nbest(score_fn, n)

    res = {}
    for s in trigrams:
        key = s[0] + s[1] + s[2]
        # dict.get replaces dict.has_key, which no longer exists on Python 3.
        res[key] = res.get(key, 0) + 1

    return res
开发者ID:cysjtu,项目名称:SentimentAnalysis,代码行数:34,代码来源:nlp_machine2222.py


示例6: best_ngrams

def best_ngrams(words, top_n=10, min_freq=5):
    """
    Extract `top_n` most salient collocations (bigrams and trigrams),
    from a stream of words. Ignore collocations with frequency
    lower than `min_freq`.

    This fnc uses NLTK for the collocation detection itself -- not very scalable!

    Return the detected ngrams as compiled regular expressions, for their faster
    detection later on. The result is the tuple `(pat_gram2, pat_gram3)`:
    the bigram pattern first, the trigram pattern second. The bigram pattern
    is also printed.

    """
    tcf = TrigramCollocationFinder.from_words(words)
    tcf.apply_freq_filter(min_freq)
    best_trigrams = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    trigrams = [' '.join(w) for w in best_trigrams]
    logging.info("%i trigrams found: %s...", len(trigrams), trigrams[:20])

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freq)
    best_bigrams = bcf.nbest(BigramAssocMeasures.pmi, top_n)
    bigrams = [' '.join(w) for w in best_bigrams]
    logging.info("%i bigrams found: %s...", len(bigrams), bigrams[:20])

    # Escape every token so regex metacharacters inside a word are matched
    # literally; the joining space stays unescaped so plain words produce
    # the same pattern text as before.
    escaped_bigrams = [' '.join(re.escape(tok) for tok in w) for w in best_bigrams]
    escaped_trigrams = [' '.join(re.escape(tok) for tok in w) for w in best_trigrams]

    pat_gram2 = re.compile('(%s)' % '|'.join(escaped_bigrams), re.UNICODE)
    pat_gram3 = re.compile('(%s)' % '|'.join(escaped_trigrams), re.UNICODE)

    # Single-argument print(...) is valid on both Python 2 and Python 3.
    print(pat_gram2)

    return pat_gram2, pat_gram3
开发者ID:ninamiriamjnana,项目名称:topic,代码行数:28,代码来源:get_data.py


示例7: create_tri_collocations

def create_tri_collocations(features_words,document_preprocess):
    """Return preprocessed trigram collocations of the movie review corpus.

    The 1000 best trigrams (PMI, minimum frequency 3) are taken from
    ``movie_reviews.words()`` using the externally defined
    ``trigram_measures``. Every callable in ``document_preprocess`` is then
    applied in turn to each token; a trigram is dropped as soon as one of its
    tokens maps to a falsy value.

    Args:
        features_words: Not used by this function.
        document_preprocess: Iterable of one-argument callables applied to
            each token.

    Returns:
        list of 3-tuples with the transformed tokens.
    """
    finder = TrigramCollocationFinder.from_words(movie_reviews.words())
    finder.apply_freq_filter(3)
    collocations = finder.nbest(trigram_measures.pmi,1000)

    for transform in document_preprocess:
        surviving = []
        for (a, b, c) in collocations:
            if transform(a) and transform(b) and transform(c):
                surviving.append((transform(a), transform(b), transform(c)))
        collocations = surviving

    return collocations
开发者ID:katomaso,项目名称:LiU-TextMining,代码行数:7,代码来源:Lab5-2.py


示例8: __init__

    def __init__(self, words, sentences, language):
        """Compute word, sentence and collocation statistics for a text.

        Args:
            words: Sized iterable of tokens.
            sentences: Sized iterable of sentences.
            language: Language name passed to ``stopwords.words``.

        Raises:
            ZeroDivisionError: If ``sentences`` or ``words`` is empty.
        """
        self.num_words = len(words)
        self.unique_words = len(set(words))
        self.num_sentences = len(sentences)
        self.average_sentence_length = round(self.num_words / self.num_sentences)
        self.lexical_diversity = round(self.num_words / self.unique_words)

        fdist = FreqDist(words)
        # A set gives constant-time membership tests; the stop-word list was
        # previously scanned linearly once per word.
        stop_words = set(stopwords.words(language))
        not_stopwords = [w for w in words if w not in stop_words]
        fdist2 = FreqDist(not_stopwords)
        self.fifty_first_words = fdist.most_common(50)
        # Despite the attribute name, the 300 most common non-stop-words are kept.
        self.hundreds_nsw = fdist2.most_common(300)

        # 50 best bigrams by PMI among those seen at least 10 times.
        bigram_measures = BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(words)
        finder.apply_freq_filter(10)
        self.fifty_collocations = finder.nbest(bigram_measures.pmi, 50)

        # Same selection for trigrams.
        trigram_measures = TrigramAssocMeasures()
        finder3 = TrigramCollocationFinder.from_words(words)
        finder3.apply_freq_filter(10)
        self.fifty_collocations3 = finder3.nbest(trigram_measures.pmi, 50)

        # NOTE(review): `sent` is lower-cased like a string but joined with
        # spaces like a token list -- confirm the element type of `sentences`.
        self.stcs_width_words = [' '.join(sent) for sent in sentences
                                 if "malheureusement" in sent.lower()]
开发者ID:Raveline,项目名称:journal-imaginaire,代码行数:26,代码来源:analyst.py


示例9: extract_trigrams

 def extract_trigrams(self, sent):
    """Return the known trigrams of ``sent`` as a feature dict.

    ``sent`` is first run through ``self._preprocess_sent``. Up to 10000
    trigrams ranked by PMI are joined with spaces, and only those also
    present in ``self._trigrams_set`` are kept.

    Returns:
        dict mapping each retained trigram string to True.
    """
    tokens = self._preprocess_sent(sent)
    measures = TrigramAssocMeasures()
    finder = TrigramCollocationFinder.from_words(tokens)
    candidates = {' '.join(gram) for gram in finder.nbest(measures.pmi, 10000)}
    known = candidates & self._trigrams_set
    return dict.fromkeys(known, True)
开发者ID:aginiewicz,项目名称:EmoClassifier,代码行数:8,代码来源:feature_extraction.py


示例10: set_trigramas

	def set_trigramas(self, freq=2, best=20):
		"""Store the best PMI-ranked trigrams of ``self.palavras``.

		Trigrams containing a Portuguese stop word or a token shorter than
		three characters are discarded, as are trigrams occurring fewer than
		``freq`` times. The ``best`` top-ranked trigrams are saved in
		``self.trigramas``.
		"""
		finder = TrigramCollocationFinder.from_words(self.palavras)
		portuguese_stops = set(stopwords.words('portuguese'))

		def is_noise(token):
			return len(token) < 3 or token in portuguese_stops

		finder.apply_word_filter(is_noise)
		finder.apply_freq_filter(freq)
		self.trigramas = finder.nbest(TrigramAssocMeasures.pmi, best)
开发者ID:eric011,项目名称:SemantikaCrawler,代码行数:8,代码来源:lingprocessador.py


示例11: calc_trigrams

def calc_trigrams(text, min_freq=50):
	"""Record the trigram frequencies of ``text`` and return all records.

	Each element of ``text`` is lower-cased, trigrams occurring fewer than
	``min_freq`` times are filtered out, and the remaining
	``(trigram, count)`` items are appended to the externally defined
	``trigram_list``, which is then returned.
	"""
	lowered = [token.lower() for token in text]
	finder = TrigramCollocationFinder.from_words(lowered)
	finder.apply_freq_filter(min_freq)
	# trigram_list is not local: results accumulate across calls.
	trigram_list.append(finder.ngram_fd.items())
	return trigram_list
开发者ID:djkn0x,项目名称:GA_homework,代码行数:8,代码来源:trigrams.py


示例12: trigram_word_feats

def trigram_word_feats(words, score_fn=TrigramAssocMeasures.chi_sq, n=50):
    """Build a feature dict from ``words`` and their best trigram collocations.

    Args:
        words: Sequence of tokens.
        score_fn: Association measure used to rank trigrams.
        n: Number of trigrams requested from ``nbest``.

    Returns:
        dict mapping every word and every selected trigram to True. If
        scoring fails, a message is printed and only the words are used.
    """
    trigram_finder = TrigramCollocationFinder.from_words(words)
    try:
        trigrams = trigram_finder.nbest(score_fn, n)
    except Exception:
        # Best-effort fallback. `Exception` (instead of a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        # %-formatting keeps the output identical on Python 2 and Python 3.
        print("lost trigrams %s" % (words,))
        return dict.fromkeys(words, True)

    return dict.fromkeys(itertools.chain(words, trigrams), True)
开发者ID:zhouxiaofan,项目名称:Projects,代码行数:9,代码来源:sentimentMapper.py


示例13: getTrigrams

    def getTrigrams(self):
        """Return the six best trigrams of ``self.text`` by likelihood ratio.

        The text is tokenized with ``nltk.word_tokenize`` and lower-cased.
        Trigrams containing an English stop word or a token shorter than
        three characters are excluded.
        """
        tokens = [tok.lower() for tok in nltk.word_tokenize(self.text)]
        finder = TrigramCollocationFinder.from_words(tokens)
        english_stops = set(stopwords.words('english'))

        def is_noise(token):
            return len(token) < 3 or token in english_stops

        finder.apply_word_filter(is_noise)
        finder.apply_freq_filter(1)
        return finder.nbest(TrigramAssocMeasures.likelihood_ratio, 6)
开发者ID:DevilDante88,项目名称:MyCogs,代码行数:9,代码来源:postagger.py


示例14: getTrigram

def getTrigram(haystack):
    """Return the four best trigrams of ``haystack`` by likelihood ratio.

    The text is split with ``WordPunctTokenizer``. Trigrams containing an
    English stop word or a token shorter than three characters are excluded.
    """
    tokens = WordPunctTokenizer().tokenize(haystack)
    finder = TrigramCollocationFinder.from_words(tokens)
    english_stops = set(stopwords.words('english'))

    def is_noise(token):
        return len(token) < 3 or token in english_stops

    finder.apply_word_filter(is_noise)
    return finder.nbest(TrigramAssocMeasures.likelihood_ratio, 4)
开发者ID:blorenz,项目名称:cms,代码行数:9,代码来源:seo.py


示例15: best_n_trigrams

    def best_n_trigrams(self, n, method="pmi"):
        """Return the ``n`` best trigrams of the word list.

        Args:
            n: Number of trigrams requested from ``nbest``.
            method: ``"pmi"`` or ``"raw_freq"``, selecting the association
                measure.

        Returns:
            The ``nbest`` result, or None when ``method`` is neither of the
            two supported names.
        """
        measures = TrigramAssocMeasures()
        finder = TrigramCollocationFinder.from_words(self.get_word_lst())

        if method == "pmi":
            scorer = measures.pmi
        elif method == "raw_freq":
            scorer = measures.raw_freq
        else:
            # Unsupported measure names yield None.
            return None
        return finder.nbest(scorer, n)
开发者ID:WilliamHammond,项目名称:fbcanalyzer,代码行数:9,代码来源:ChatStream.py


示例16: setup_mwes

    def setup_mwes(self, trigram_nbest=100, bigram_nbest=2000):
        """Learn multi-word expressions from the corpus and merge them into ``self.mwes``.

        Testing setting up mwes with custom path and setting it up twice (correct when no exception):
        >>> corpus_dir = os.path.join(base_path, 'test', 'corpus')
        >>> clusterer = DumbClusterer(corpus_dir=corpus_dir, mwes=['custom mwe'])
        >>> mwes = clusterer.setup_mwes(trigram_nbest=1000, bigram_nbest=15000)
        >>> 'custom mwe' not in mwes
        True

        >>> 'custom mwe' in clusterer.mwes
        True

        Args:
            trigram_nbest(int): Number of highest ranked trigrams to acquire.
            bigram_nbest(int): Number of highest ranked bigrams to acquire.
        Returns:
            list: The newly learned trigrams followed by the bigrams. Entries
                that were already in ``self.mwes`` are only part of the
                updated ``self.mwes`` attribute, not of the returned list.
        Raises:
            Exception: If ``self.corpus`` is None.
        """
        if self.corpus is None:
            raise Exception("Corpus not found. Run method `setup_corpus` with given corpus directory first.")

        bigram_measures = BigramAssocMeasures()
        trigram_measures = TrigramAssocMeasures()

        # An NE-chunk based preprocessing variant is not used because
        # ne_chunk takes too much time.

        # Text processing before bigrams and trigrams are calculated:
        # drop words containing digits or punctuation, lowercase the rest.
        words = [
            token.lower()
            for token in self.corpus.words()
            if not any((ch.isdigit() or ch in string.punctuation) for ch in token)
        ]

        bigram_finder = BigramCollocationFinder.from_words(words)
        trigram_finder = TrigramCollocationFinder.from_words(words)
        best_trigrams = trigram_finder.nbest(trigram_measures.pmi, trigram_nbest)
        best_bigrams = bigram_finder.nbest(bigram_measures.pmi, bigram_nbest)
        mwes = best_trigrams + best_bigrams

        # Existing entries may be strings or lists; lists are turned into
        # tuples so they can be placed in a set for the union.
        known = set()
        for mwe in self.mwes:
            known.add(tuple(mwe) if isinstance(mwe,list) else mwe)
        learned = set(mwes)
        self.mwes = list(known | learned)
        return mwes
开发者ID:jaycode,项目名称:-archive-Arthur,代码行数:56,代码来源:dumb_clusterer.py


示例17: trigrams

def trigrams(words, max_trigrams=100):
    """Yield frequent trigrams of ``words`` that are not already known.

    The first ``max_trigrams`` entries of the raw-frequency ranking are
    examined. A trigram whose lemmatized tokens (as a list) are found in the
    externally defined ``tg`` is reported and skipped; every other trigram is
    yielded unchanged. Relies on the externally defined ``trigram_measures``
    and ``lmtzr``.

    Args:
        words: Token sequence passed to the collocation finder.
        max_trigrams: Number of top-ranked trigrams to examine.

    Yields:
        The trigram tuples that were not skipped.
    """
    # %-formatted, single-argument print(...) produces the same output on
    # Python 2 and Python 3.
    print("Extracting trigrams")
    trigram_finder = TrigramCollocationFinder.from_words(words)

    for trigram, score in trigram_finder.score_ngrams(trigram_measures.raw_freq)[:max_trigrams]:
        l_trigram = [lmtzr.lemmatize(p) for p in trigram]
        if l_trigram in tg:
            print("Common trigram %s" % (trigram,))
            continue

        yield trigram
开发者ID:cratejoy,项目名称:nladwords,代码行数:12,代码来源:nl_util.py


示例18: find_collocations

def find_collocations(text_series):
    """Score all bigrams and trigrams of ``text_series`` and pickle them.

    ``text_series`` is an iterable of token lists; the lists are flattened
    into one token sequence. Bigrams and trigrams are scored by likelihood
    ratio and written to ``bigrams.pkl`` and ``trigrams.pkl`` in the current
    working directory. Nothing is returned.
    """
    bigram_measures = BigramAssocMeasures()
    trigram_measures = TrigramAssocMeasures()

    tokens = []
    for token_list in text_series:
        tokens.extend(token_list)

    bigram_finder = BigramCollocationFinder.from_words(tokens)
    trigram_finder = TrigramCollocationFinder.from_words(tokens)
    scored_bigrams = bigram_finder.score_ngrams(bigram_measures.likelihood_ratio)
    scored_trigrams = trigram_finder.score_ngrams(trigram_measures.likelihood_ratio)

    outputs = (('bigrams.pkl', scored_bigrams), ('trigrams.pkl', scored_trigrams))
    for filename, scored in outputs:
        with open(filename, 'wb') as fid:
            cPickle.dump(scored, fid)
开发者ID:annamarie-g,项目名称:capstone_project,代码行数:12,代码来源:text_preprocessing.py


示例19: _collect_bigrams_and_trigrams

def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None, stopwords=None):
    """collects bigrams and trigrams from collection of documents.  Input to collocation tokenizer.

    bigrams are pairs of words that recur in the collection; trigrams are triplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    top_n : int
        limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_freqs : None or sequence of int
        threshold of when to consider a group of words as a recognized
        n-gram: ``min_freqs[0]`` for bigrams, ``min_freqs[1]`` for trigrams.
        ``None`` (the default) falls back to ``(50, 20)``; previously the
        default raised a TypeError.
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Returns
    -------
    tuple
        ``(bigrams_patterns, trigrams_patterns)``, two compiled regular
        expressions alternating over the space-joined n-grams.

    Examples
    --------
    >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """

    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

    if min_freqs is None:
        min_freqs = (50, 20)
    min_bigram_freq, min_trigram_freq = min_freqs[0], min_freqs[1]

    # generator of documents, turn each element to its list of words
    doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords)
                 for doc_id, doc_text in raw_corpus)
    # generator, concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(doc_texts)
    tcf = TrigramCollocationFinder.from_words(words)

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_bigram_freq)
    best_bigrams = bcf.nbest(BigramAssocMeasures.pmi, top_n)

    tcf.apply_freq_filter(min_trigram_freq)
    best_trigrams = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)

    # Escape each token so regex metacharacters inside a word are matched
    # literally; the joining space is left as-is so ordinary words produce
    # the same pattern text as before.
    bigrams = [' '.join(re.escape(tok) for tok in w) for w in best_bigrams]
    trigrams = [' '.join(re.escape(tok) for tok in w) for w in best_trigrams]

    bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)

    return bigrams_patterns, trigrams_patterns
开发者ID:gpfreitas,项目名称:topik,代码行数:49,代码来源:ngrams.py


示例20: best_ngrams

    def best_ngrams(words, top_n=1000, min_freq=100):
        """Return compiled regex patterns for the best bigrams and trigrams.

        Args:
            words: Iterable of tokens passed to the collocation finder.
            top_n: Number of n-grams of each kind requested from ``nbest``.
            min_freq: Minimum frequency an n-gram needs to be kept.

        Returns:
            tuple: ``(pat_gram2, pat_gram3)``, the bigram pattern (ranked by
            PMI) and the trigram pattern (ranked by chi-squared). Each is a
            single group alternating over the space-joined n-grams.
        """
        tcf = TrigramCollocationFinder.from_words(words)
        tcf.apply_freq_filter(min_freq)
        best_trigrams = tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
        trigrams = [' '.join(w) for w in best_trigrams]
        logging.info('%i trigrams found: %s...', len(trigrams), trigrams[:10])

        bcf = tcf.bigram_finder()
        bcf.apply_freq_filter(min_freq)
        best_bigrams = bcf.nbest(BigramAssocMeasures.pmi, top_n)
        bigrams = [' '.join(w) for w in best_bigrams]
        logging.info('%i bigrams found: %s...', len(bigrams), bigrams[:10])

        # Escape every token so regex metacharacters inside a word are
        # matched literally; the joining space stays unescaped so plain
        # words give the same pattern text as before.
        escaped_bigrams = [' '.join(re.escape(tok) for tok in w) for w in best_bigrams]
        escaped_trigrams = [' '.join(re.escape(tok) for tok in w) for w in best_trigrams]

        pat_gram2 = re.compile('(%s)' % '|'.join(escaped_bigrams), re.UNICODE)
        pat_gram3 = re.compile('(%s)' % '|'.join(escaped_trigrams), re.UNICODE)

        return pat_gram2, pat_gram3
开发者ID:codekansas,项目名称:icm-2016-topic-modeling,代码行数:15,代码来源:topic_model_news.py



注:本文中的nltk.collocations.TrigramCollocationFinder类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python corpus.PlaintextCorpusReader类代码示例发布时间:2022-05-27
下一篇:
Python collocations.BigramCollocationFinder类代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap