• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python metrics.BigramAssocMeasures类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了 Python 中 nltk.metrics.BigramAssocMeasures 的典型用法代码示例。如果您正苦于以下问题:Python BigramAssocMeasures 类的具体用法?Python BigramAssocMeasures 怎么用?Python BigramAssocMeasures 使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了 BigramAssocMeasures 类的 20 个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的 Python 代码示例。

示例1: create_word_bigram_scores

def create_word_bigram_scores():
    """Score every word and bigram by its chi-square association with the classes.

    Both corpora are loaded through ``tp.seg_fil_senti_excel``, flattened one
    level into token lists and extended with the 5000 best chi-square bigrams
    of the respective corpus.  Each resulting feature (a word or a bigram
    tuple) is scored as the sum of its chi-square statistic against the
    ``'pos'`` and against the ``'neg'`` class.

    Returns:
        dict: feature -> ``pos_score + neg_score``.
    """
    # NOTE(review): both loads use the same placeholder arguments, so as
    # written the two corpora are identical -- confirm the real sources.
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    # One finder per corpus.  The previous code reused a single variable, so
    # the "positive" bigrams were silently taken from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    labelled_features = (
        ('pos', posWords + posBigrams),
        ('neg', negWords + negBigrams),
    )

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    for label, features in labelled_features:
        for feature in features:
            # ``fd[x] += 1`` works on NLTK 2 and 3; ``FreqDist.inc`` was
            # removed in NLTK 3.
            word_fd[feature] += 1
            last_word[label][feature] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(
            last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score

    return word_scores
开发者ID:TianyiM,项目名称:Final-Project,代码行数:35,代码来源:score.py


示例2: high_words

def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the (at most) 10000 tokens that best separate the two review sets.

    Args:
        posids: iterable of positive reviews; each is indexed with ``['text']``.
        negids: iterable of negative reviews, same shape.
        cutoff: 1-based position of the review that is skipped in each set.
        score_fn: association measure called as
            ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.  Defaults to chi-square.
            It was previously accepted but ignored.
        min_score: accepted for interface compatibility; not used.

    Returns:
        set: the tokens with the highest ``pos_score + neg_score``.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, reviews in (('pos', posids), ('neg', negids)):
        for position, review in enumerate(reviews, 1):
            # NOTE(review): only the review at position == cutoff is skipped;
            # if cutoff was meant as a train/test split this should probably
            # be an inequality -- confirm with the caller.
            if position == cutoff:
                continue
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd[label].update(token_helpers.tokenize_simple(word))

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_fn(label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_fn(label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    return {w for w, _ in best}
开发者ID:efrenaguilar95,项目名称:Yelp_Analyzer,代码行数:34,代码来源:classifiers.py


示例3: create_word_scores

	def create_word_scores(self):
		"""Return a chi-square informativeness score for every word.

		``self.getAllWords()`` supplies the positive and the negative data;
		each is flattened one level into a single token list.  A word's score
		is its chi-square statistic against the 'pos' class plus the one
		against the 'neg' class.

		Returns:
			dict: word -> ``pos_score + neg_score``.
		"""
		posWords, negWords = self.getAllWords()

		labelled_words = (
			('pos', itertools.chain.from_iterable(posWords)),
			('neg', itertools.chain.from_iterable(negWords)),
		)

		word_fd = FreqDist()
		cond_word_fd = ConditionalFreqDist()
		for label, words in labelled_words:
			for word in words:
				# ``fd[x] += 1`` works on NLTK 2 and 3; ``FreqDist.inc``
				# was removed in NLTK 3.
				word_fd[word] += 1
				cond_word_fd[label][word] += 1

		pos_word_count = cond_word_fd['pos'].N()
		neg_word_count = cond_word_fd['neg'].N()
		total_word_count = pos_word_count + neg_word_count

		log("Total number of words: %d" % total_word_count)

		word_scores = {}
		# ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
		for word, freq in word_fd.items():
			pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
			neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
			word_scores[word] = pos_score + neg_score

		return word_scores
开发者ID:zlmoment,项目名称:Tweet-Sentiment-Classification,代码行数:28,代码来源:getFeatureList_ChiSquare.py


示例4: create_word_bigram_scores

def create_word_bigram_scores(posWords, negWords):
    """Score words and bigrams by chi-square association with 'pos' / 'neg'.

    Args:
        posWords: list of tokens from the positive corpus.
        negWords: list of tokens from the negative corpus.

    Returns:
        dict: ``str(feature)`` -> ``pos_score + neg_score``.  Features are the
        tokens plus the 2000 best chi-square bigrams of each corpus; bigram
        tuples are keyed by their ``str()`` form.
    """
    # One finder per corpus.  The previous code reused a single variable, so
    # the "positive" bigrams were silently taken from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    # Words plus bigram collocations for each class.
    labelled_features = (
        ('pos', posWords + posBigrams),
        ('neg', negWords + negBigrams),
    )

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, features in labelled_features:
        for feature in features:
            key = str(feature)
            word_fd[key] += 1
            cond_word_fd[label][key] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
开发者ID:delili,项目名称:NLP_Comments_Sentiment_Analysis,代码行数:29,代码来源:process.py


示例5: setup

def setup():
    """Select the 10000 most informative words and train on them.

    Counts every token of the ``movie_reviews`` 'pos' and 'neg' categories,
    scores each word by the sum of its chi-square statistics against both
    classes, publishes the top 10000 as the module-level ``bestwords`` set and
    returns ``train(best_bigram_word_features)``.
    """
    global bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            # Use the SAME normalised key in both distributions.  The old code
            # stripped punctuation only for ``word_fd``, so the per-label
            # lookup below missed every token the strip had changed.
            token = word.strip('\'"?,.').lower()
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-unpacking lambdas (``lambda (w, s): s``) are a syntax error on
    # Python 3; indexing the pair works everywhere.
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
    bestwords = {w for w, _ in best}
    return train(best_bigram_word_features)
开发者ID:seanfreiburg,项目名称:chicago_tweet_grabber,代码行数:30,代码来源:analyze_tweets.py


示例6: create_word_bigram_scores

def create_word_bigram_scores():
    """Score words and bigrams of the pickled review corpora by chi-square.

    Loads the positive and negative review pickles, flattens each one level
    into a token list, adds the 5000 best chi-square bigrams of each corpus
    and scores every feature as the sum of its chi-square statistic against
    the 'pos' and the 'neg' class.

    Returns:
        dict: feature (word or bigram tuple) -> ``pos_score + neg_score``.
    """
    # Binary mode is required by pickle on Python 3, and ``with`` closes the
    # handles that were previously leaked.
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
        posdata = pickle.load(pos_file)
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
        negdata = pickle.load(neg_file)

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    # One finder per corpus.  The previous code reused a single variable, so
    # the "positive" bigrams were silently taken from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Words plus bigram collocations for each class.
    labelled_features = (
        ('pos', posWords + posBigrams),
        ('neg', negWords + negBigrams),
    )

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, features in labelled_features:
        for feature in features:
            word_fd[feature] += 1
            cond_word_fd[label][feature] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
开发者ID:coolspiderghy,项目名称:sina_weibo_crawler,代码行数:35,代码来源:extractFeatures_org.py


示例7: create_word_scores

def create_word_scores(posWords,negWords,posTag,negTag):
    """Return a chi-square informativeness score for every word.

    Args:
        posWords: iterable of token sequences from the positive texts.
        negWords: iterable of token sequences from the negative texts.
        posTag: condition key under which positive counts are stored.
        negTag: condition key under which negative counts are stored.

    Returns:
        dict: word -> chi-square vs. ``posTag`` + chi-square vs. ``negTag``.
    """
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools

    # Flatten the nested sequences one level into plain token streams.
    labelled_words = (
        (posTag, itertools.chain.from_iterable(posWords)),
        (negTag, itertools.chain.from_iterable(negWords)),
    )

    word_fd = FreqDist()  # frequency of every word over both classes
    cond_word_fd = ConditionalFreqDist()  # per-class word frequencies
    for tag, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            cond_word_fd[tag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()  # number of positive tokens
    neg_word_count = cond_word_fd[negTag].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
    for word, freq in word_fd.items():
        # Chi-square is used here; another association measure (e.g. mutual
        # information) could be substituted.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        # A word's informativeness is the sum of both statistics.
        word_scores[word] = pos_score + neg_score

    return word_scores  # every word mapped to its informativeness
开发者ID:coolspiderghy,项目名称:sina_weibo_crawler,代码行数:27,代码来源:extractFeatures.py


示例8: create_word_scores

def create_word_scores():
    """Return a chi-square informativeness score for every word.

    Reads the module-level ``datap`` (positive) and ``datan`` (negative)
    collections, flattens each one level into a token stream and scores every
    word as the sum of its chi-square statistics against the 'pos' and the
    'neg' class.

    Returns:
        dict: word -> ``pos_score + neg_score``.
    """
    # Flatten the nested sequences one level into plain token streams.
    labelled_words = (
        ('pos', itertools.chain.from_iterable(datap)),
        ('neg', itertools.chain.from_iterable(datan)),
    )

    word_fd = nltk.FreqDist()
    cond_word_fd = ConditionalFreqDist()  # per-class word frequencies
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            cond_word_fd[label][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
    for word, freq in word_fd.items():
        # Chi-square is used here; another association measure (e.g. mutual
        # information) could be substituted.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        # A word's informativeness is the sum of both statistics.
        word_scores[word] = pos_score + neg_score

    return word_scores  # every word mapped to its informativeness
开发者ID:xiutx,项目名称:review_emotion,代码行数:25,代码来源:findword.py


示例9: create_bigram_scores

def create_bigram_scores():
    """Score the best bigrams of each review corpus by chi-square.

    Loads both review spreadsheets through ``tp.seg_fil_senti_excel``,
    flattens each one level into a token list, keeps the 8000 best chi-square
    bigrams of each corpus and scores every bigram as the sum of its
    chi-square statistics against the 'pos' and the 'neg' class.

    Returns:
        dict: bigram tuple -> ``pos_score + neg_score``.
    """
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    # One finder per corpus.  The previous code reused a single variable, so
    # the "positive" bigrams were silently taken from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, bigrams in (('pos', posBigrams), ('neg', negBigrams)):
        for bigram in bigrams:
            # ``fd[x] += 1`` works on NLTK 2 and 3; ``FreqDist.inc`` was
            # removed in NLTK 3.
            word_fd[bigram] += 1
            cond_word_fd[label][bigram] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
开发者ID:EricChanBD,项目名称:Review-Helpfulness-Prediction,代码行数:35,代码来源:store+sentiment+classifier.py


示例10: getWordScores

def getWordScores():
    """Return a chi-square informativeness score for every lower-cased token.

    Tokenises each line of ``RT_POLARITY_POS_FILE`` and
    ``RT_POLARITY_NEG_FILE`` into words (letters, digits, underscore,
    apostrophe) and the punctuation marks ``. , ! ? ;``.  Each token is scored
    as the sum of its chi-square statistics against the 'pos' and the 'neg'
    class.

    Returns:
        dict: lower-cased token -> ``pos_score + neg_score``.
    """
    token_pattern = re.compile(r"[\w']+|[.,!?;]")

    def read_tokens(path):
        # All lower-cased tokens of the file, in reading order.
        with open(path, 'r') as sentences:
            return [token.lower()
                    for line in sentences
                    for token in token_pattern.findall(line.rstrip())]

    labelled_tokens = (
        ('pos', read_tokens(RT_POLARITY_POS_FILE)),
        ('neg', read_tokens(RT_POLARITY_NEG_FILE)),
    )

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for label, tokens in labelled_tokens:
        for token in tokens:
            word_fd[token] += 1
            cond_word_fd[label][token] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    return word_scores
开发者ID:Sapphirine,项目名称:MyTravelAgent,代码行数:34,代码来源:Sentiment.py


示例11: getBestWords

def getBestWords(posWords, negWords):
    """Return the set of all lower-cased words seen in either input.

    Every word is scored with the sum of its chi-square statistics against
    the "pos" and the "neg" class and the words are ranked by that score.
    The ranking is not truncated, so the returned set holds every scored word.

    Args:
        posWords: iterable of words from positive texts.
        negWords: iterable of words from negative texts.

    Returns:
        set: the lower-cased words.
    """
    overall_counts = FreqDist()
    per_label_counts = ConditionalFreqDist()

    for label, tokens in (("pos", posWords), ("neg", negWords)):
        for token in tokens:
            overall_counts[token.lower()] += 1
            per_label_counts[label][token.lower()] += 1

    n_pos = per_label_counts["pos"].N()
    n_neg = per_label_counts["neg"].N()
    n_total = n_pos + n_neg

    scores = {}
    for key, count in overall_counts.items():
        against_pos = BigramAssocMeasures.chi_sq(
            per_label_counts["pos"][key], (count, n_pos), n_total)
        against_neg = BigramAssocMeasures.chi_sq(
            per_label_counts["neg"][key], (count, n_neg), n_total)
        scores[key] = against_pos + against_neg

    # Highest score first; no cut-off is applied to the ranking.
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return {key for key, _ in ranked}
开发者ID:dakshvar22,项目名称:DishingOut,代码行数:28,代码来源:sentimentTrainer.py


示例12: get_bestwords

def get_bestwords(contents, labels, limit=10000, n=None, cache=True):
    """Return the ``limit`` words that best separate label 1 from label 0.

    Args:
        contents: array of text documents supporting boolean-mask indexing.
        labels: array aligned with ``contents``; 1 marks positive documents,
            0 marks negative ones.
        limit: maximum number of words to return.
        n: cache discriminator; the cache is only used when ``n`` is truthy.
        cache: whether to read/write ``cache/<limit>_<n>.pkl``.

    Returns:
        set: the lower-cased words with the highest chi-square score sum.
    """
    cache_path = 'cache/%s_%s.pkl' % (limit, n) if cache and n else None

    if cache_path and os.path.exists(cache_path):
        # Binary mode is required by pickle on Python 3; ``with`` closes the
        # handle that was previously leaked.
        with open(cache_path, 'rb') as cache_file:
            bestwords = pickle.load(cache_file)
        print('Loaded from cache')
        print('bestwords count = %d' % (len(bestwords)))
        return bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    pos_contents = contents[labels == 1]
    # Was ``labels != 0``, which selects the positive documents a second time.
    neg_contents = contents[labels == 0]

    # Vocabulary per class: each distinct token is counted once per class.
    pos_words = set()
    for pos_content in pos_contents:
        pos_words.update(word_tokenize(pos_content))

    neg_words = set()
    for neg_content in neg_contents:
        neg_words.update(word_tokenize(neg_content))

    for label, words in (('pos', pos_words), ('neg', neg_words)):
        for word in words:
            # ``fd[x] += 1`` works on NLTK 2 and 3; ``FreqDist.inc`` was
            # removed in NLTK 3.
            word_fd[word.lower()] += 1
            label_word_fd[label][word.lower()] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
            (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
            (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-unpacking lambdas are a syntax error on Python 3.
    best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:limit]
    bestwords = {w for w, _ in best}

    print('all words count = %d' % (len(word_scores)))
    print('bestwords count = %d' % (len(bestwords)))

    if cache_path:
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(bestwords, cache_file)
        print('Dumped to cache')

    return bestwords
开发者ID:colinsongf,项目名称:stumbleupon_evergreen_classification_challenge,代码行数:60,代码来源:submission.py


示例13: best_word_feats

 def best_word_feats(self, words):
     """Build a feature dict containing only the most informative words.

     Scores every lower-cased token of the ``movie_reviews`` 'pos' and 'neg'
     categories by the sum of its chi-square statistics against both classes
     and keeps the 10000 highest-scoring words.

     Args:
         words: iterable of words to turn into features.

     Returns:
         dict: ``{word: True}`` for each input word found in the top 10000.
     """
     # NOTE(review): the whole corpus is re-counted and re-scored on every
     # call; consider computing the word set once and reusing it.
     word_fd = FreqDist()
     label_word_fd = ConditionalFreqDist()

     for label in ('pos', 'neg'):
         for word in movie_reviews.words(categories=[label]):
             # ``fd[x] += 1`` works on NLTK 2 and 3; ``FreqDist.inc`` was
             # removed in NLTK 3.
             word_fd[word.lower()] += 1
             label_word_fd[label][word.lower()] += 1

     # chi_sq arguments, for reference:
     # n_ii = label_word_fd[label][word]
     # n_ix = word_fd[word]
     # n_xi = label_word_fd[label].N()
     # n_xx = label_word_fd.N()

     pos_word_count = label_word_fd['pos'].N()
     neg_word_count = label_word_fd['neg'].N()
     total_word_count = pos_word_count + neg_word_count

     word_scores = {}
     for word, freq in word_fd.items():
         pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
             (freq, pos_word_count), total_word_count)
         neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
             (freq, neg_word_count), total_word_count)
         word_scores[word] = pos_score + neg_score

     # Tuple-unpacking lambdas are a syntax error on Python 3.
     best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:10000]
     bestwords = {w for w, _ in best}
     return {word: True for word in words if word in bestwords}
开发者ID:dkaliyev,项目名称:TwitterAnalyser,代码行数:33,代码来源:NBClass.py


示例14: computeFreqDistribution

def computeFreqDistribution():
	"""Return the words whose summed chi-square score exceeds the threshold.

	Reads the module-level ``word_fd`` and ``label_word_fd`` distributions.
	A word's score is the sum of its chi-square statistics against the
	'positive', 'negative' and 'neutral' conditions; words scoring above 2
	are returned.  With ``DEBUG`` set, the intermediate values are printed.

	Returns:
		list: the selected words.
	"""
	# Single-argument ``print(...)`` calls behave the same on Python 2 and 3;
	# the bare ``print x`` statement only parses on Python 2.
	if DEBUG:
		print(word_fd)

	pos_word_count = label_word_fd['positive'].N()
	neg_word_count = label_word_fd['negative'].N()
	neu_word_count = label_word_fd['neutral'].N()
	total_word_count = pos_word_count + neg_word_count + neu_word_count

	word_scores = {}

	# ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
	for word, freq in word_fd.items():
		pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][word], (freq, pos_word_count), total_word_count)
		neg_score = BigramAssocMeasures.chi_sq(label_word_fd['negative'][word], (freq, neg_word_count), total_word_count)
		neu_score = BigramAssocMeasures.chi_sq(label_word_fd['neutral'][word], (freq, neu_word_count), total_word_count)
		word_scores[word] = pos_score + neg_score + neu_score

	if DEBUG:
		print(json.dumps(word_scores, indent = 4))

	threshold = 2

	temp = [item for item in word_scores if word_scores[item] > threshold]

	if DEBUG:
		print(temp)

	return temp
开发者ID:chawlaaditya8,项目名称:Sentiment-Analysis,代码行数:34,代码来源:app.py


示例15: get_best_words

def get_best_words(words_list, num_best_words):
	"""Return the ``num_best_words`` most informative lower-cased words.

	Args:
		words_list: iterable of ``(line, sent)`` pairs; ``line`` is tokenised
			with ``nltk.word_tokenize`` and ``sent`` is the label the tokens
			are counted under.
		num_best_words: maximum number of words to return.

	Returns:
		set: the words with the highest sum of chi-square statistics against
		the 'pos' and the 'neg' label.
	"""
	from nltk.probability import FreqDist, ConditionalFreqDist
	from nltk.metrics import BigramAssocMeasures

	word_fd = FreqDist()
	label_word_fd = ConditionalFreqDist()

	for line, sent in words_list:
		for word in nltk.word_tokenize(line):
			# ``fd[x] += 1`` works on NLTK 2 and 3; ``FreqDist.inc`` was
			# removed in NLTK 3.
			word_fd[word.lower()] += 1
			label_word_fd[sent][word.lower()] += 1

	pos_word_count = label_word_fd['pos'].N()
	neg_word_count = label_word_fd['neg'].N()
	total_word_count = pos_word_count + neg_word_count

	word_scores = {}
	# ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
	for word, freq in word_fd.items():
		pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],(freq, pos_word_count),total_word_count)
		neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],(freq, neg_word_count),total_word_count)
		word_scores[word] = pos_score + neg_score

	# Tuple-unpacking lambdas are a syntax error on Python 3.
	best = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:num_best_words]
	bestwords = {w for w, _ in best}

	return bestwords
开发者ID:dsedra,项目名称:yproject,代码行数:29,代码来源:sent_master.py


示例16: store_word_scores

    def store_word_scores(self):
        """
        Stores 'word scores' into Redis.

        Unpickles the 'word_fd' and 'label_fd' values fetched through
        ``self.r``, scores every word as the sum of its chi-square statistics
        against the 'pos' and the 'neg' condition, and writes the resulting
        dict back under the 'word_scores' key.

        If unpickling raises ``TypeError`` (presumably because a key has not
        been stored yet), a message is printed and nothing is written.
        """

        try:
            word_freqdist = pickle.loads(self.r.get('word_fd'))
            label_word_freqdist = pickle.loads(self.r.get('label_fd'))
        except TypeError:
            print('Requires frequency distributions to be built.')
            # Without this return the code below failed with an
            # UnboundLocalError right after printing the message.
            return

        word_scores = {}

        pos_word_count = label_word_freqdist['pos'].N()
        neg_word_count = label_word_freqdist['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
        for word, freq in word_freqdist.items():
            pos_score = BigramAssocMeasures.chi_sq(label_word_freqdist['pos'][word], (freq, pos_word_count), total_word_count)

            neg_score = BigramAssocMeasures.chi_sq(label_word_freqdist['neg'][word], (freq, neg_word_count), total_word_count)

            word_scores[word] = pos_score + neg_score

        # NOTE(review): the dict is stored as-is while the inputs are read as
        # pickles -- confirm the reader of 'word_scores' expects this format.
        self.r.set('word_scores', word_scores)
开发者ID:daniel-cloudspace,项目名称:synt,代码行数:25,代码来源:redis_manager.py


示例17: create_word_scores

def create_word_scores():
    """Return a chi-square informativeness score for every word.

    Loads the pickled positive and negative reviews, flattens each one level
    into a token stream and scores every word as the sum of its chi-square
    statistics against the 'pos' and the 'neg' class.

    Returns:
        dict: word -> ``pos_score + neg_score``.
    """
    # Binary mode is required by pickle on Python 3, and ``with`` closes the
    # handles that were previously leaked.
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
        posWords = pickle.load(pos_file)
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
        negWords = pickle.load(neg_file)

    # Flatten the nested sequences one level into plain token streams.
    labelled_words = (
        ('pos', itertools.chain.from_iterable(posWords)),
        ('neg', itertools.chain.from_iterable(negWords)),
    )

    word_fd = FreqDist()  # frequency of every word over both classes
    cond_word_fd = ConditionalFreqDist()  # per-class word frequencies
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            cond_word_fd[label][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
    for word, freq in word_fd.items():
        # Chi-square is used here; another association measure (e.g. mutual
        # information) could be substituted.
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        # A word's informativeness is the sum of both statistics.
        word_scores[word] = pos_score + neg_score

    return word_scores  # every word mapped to its informativeness
开发者ID:coolspiderghy,项目名称:sina_weibo_crawler,代码行数:28,代码来源:extractFeatures_org.py


示例18: __setTermsCHISQUARE__

    def __setTermsCHISQUARE__(self,size):
        """Set ``self.terms`` to the ``size`` highest-scoring words.

        Counts the lower-cased words that ``self.reader`` yields for the
        'pos' and 'neg' categories and scores each word as the sum of its
        chi-square statistics against both classes.

        Args:
            size: maximum number of terms to keep.
        """
        word_fd = FreqDist()
        label_word_fd = ConditionalFreqDist()

        for label in ('pos', 'neg'):
            for word in self.reader.words(categories=[label]):
                # ``fd[x] += 1`` works on NLTK 2 and 3; ``FreqDist.inc`` was
                # removed in NLTK 3.
                word_fd[word.lower()] += 1
                label_word_fd[label][word.lower()] += 1

        pos_word_count = label_word_fd['pos'].N()
        neg_word_count = label_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        wordScores = {}

        # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                                   (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                                   (freq, neg_word_count), total_word_count)
            wordScores[word] = pos_score + neg_score

        # Tuple-unpacking lambdas are a syntax error on Python 3.
        termScore = sorted(wordScores.items(), key=lambda item: item[1], reverse=True)[:size]
        self.terms = [w for (w, s) in termScore]
开发者ID:bharadwaj221,项目名称:SentimentAnalysis,代码行数:27,代码来源:corpusReader.py


示例19: store_feature_scores

    def store_feature_scores(self):
        """
        Determine the scores of words based on chi-sq and store word:score.

        Loads 'word_fd' and 'label_fd' through ``self.pickle_load``, scores
        every word as the sum of its chi-square statistics against the
        'positive' and the 'negative' condition, and persists the resulting
        dict through ``self.pickle_store`` under 'word_scores'.

        If loading raises ``TypeError`` (presumably because the distributions
        have not been built yet), a message is printed and nothing is stored.
        """

        try:
            word_fd = self.pickle_load('word_fd')
            label_word_freqdist = self.pickle_load('label_fd')
        except TypeError:
            print('Requires frequency distributions to be built.')
            # Without this return the code below failed with an
            # UnboundLocalError right after printing the message.
            return

        word_scores = {}

        pos_word_count = label_word_freqdist['positive'].N()
        neg_word_count = label_word_freqdist['negative'].N()
        total_word_count = pos_word_count + neg_word_count

        # The scores do not depend on the condition being iterated, so the
        # former outer loop over ``conditions()`` only repeated the same
        # computation once per label.
        # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
        for word, freq in word_fd.items():

            pos_score = BigramAssocMeasures.chi_sq(label_word_freqdist['positive'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(label_word_freqdist['negative'][word], (freq, neg_word_count), total_word_count)

            word_scores[word] = pos_score + neg_score

        self.pickle_store('word_scores', word_scores)
开发者ID:chrisblythe812,项目名称:synt,代码行数:27,代码来源:db.py


示例20: _get_bigram_scores

    def _get_bigram_scores(self, posdata, negdata):
        """Score words and bigrams by chi-square association with each class.

        Args:
            posdata: iterable of token sequences from the positive corpus.
            negdata: iterable of token sequences from the negative corpus.

        Returns:
            dict: feature (word or bigram tuple) -> ``pos_score + neg_score``.
            Features are all tokens plus the 5000 best chi-square bigrams of
            each corpus.
        """
        pos_words = list(itertools.chain.from_iterable(posdata))
        neg_words = list(itertools.chain.from_iterable(negdata))

        pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
        neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
        pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
        neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

        labelled_features = (
            ('pos', pos_words + pos_bigrams),
            ('neg', neg_words + neg_bigrams),
        )

        word_fd = FreqDist()
        cond_word_fd = ConditionalFreqDist()
        for label, features in labelled_features:
            for feature in features:
                word_fd[feature] += 1
                cond_word_fd[label][feature] += 1

        pos_word_count = cond_word_fd['pos'].N()
        neg_word_count = cond_word_fd['neg'].N()
        total_word_count = pos_word_count + neg_word_count

        word_scores = {}
        # ``items()`` exists on both Python 2 and 3 (``iteritems`` does not).
        for word, freq in word_fd.items():
            pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
            neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
            word_scores[word] = pos_score + neg_score

        return word_scores
开发者ID:Palazor,项目名称:sentiment,代码行数:32,代码来源:Extractor.py



注:本文中的nltk.metrics.BigramAssocMeasures类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python distance.edit_distance函数代码示例发布时间:2022-05-27
下一篇:
Python metrics.recall函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap