本文整理汇总了Python中nltk.metrics.BigramAssocMeasures类的典型用法代码示例。如果您正苦于以下问题:Python BigramAssocMeasures类的具体用法?Python BigramAssocMeasures怎么用?Python BigramAssocMeasures使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了BigramAssocMeasures类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: create_word_bigram_scores
def create_word_bigram_scores():
    """Score words and top bigrams by chi-square association with pos/neg.

    Loads the positive and negative corpora through
    ``tp.seg_fil_senti_excel``, flattens them into word lists, appends the
    5000 best chi-square bigrams of each corpus, and scores every resulting
    feature.

    Returns:
        dict: feature (word or bigram tuple) -> sum of its chi-square score
        against the 'pos' label and against the 'neg' label.
    """
    # NOTE(review): both corpora are loaded with identical arguments, as in
    # the original; confirm the source paths are really meant to be the same.
    posdata = tp.seg_fil_senti_excel("~", 1, 1)
    negdata = tp.seg_fil_senti_excel("~", 1, 1)

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    # One finder per corpus. Previously a single variable was overwritten, so
    # the "positive" bigrams were actually mined from the negative words.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    last_word = ConditionalFreqDist()
    # Item assignment works on both old and Counter-based FreqDist versions,
    # unlike the removed ``inc`` method.
    for word in pos:
        word_fd[word] += 1
        last_word['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        last_word['neg'][word] += 1

    pos_word_count = last_word['pos'].N()
    neg_word_count = last_word['neg'].N()
    totalnumber = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            last_word['pos'][word], (freq, pos_word_count), totalnumber)
        neg_score = BigramAssocMeasures.chi_sq(
            last_word['neg'][word], (freq, neg_word_count), totalnumber)
        word_scores[word] = pos_score + neg_score
    return word_scores
开发者ID:TianyiM,项目名称:Final-Project,代码行数:35,代码来源:score.py
示例2: high_words
def high_words(posids, negids, cutoff, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the 10000 highest-scoring words across pos and neg reviews.

    Args:
        posids: iterable of review mappings with a ``'text'`` entry, counted
            under the 'pos' label.
        negids: iterable of review mappings with a ``'text'`` entry, counted
            under the 'neg' label.
        cutoff: 1-based position of the single review that is skipped in
            each iterable.
        score_fn: association measure called as
            ``score_fn(n_ii, (n_ix, n_xi), n_xx)``. It was previously
            accepted but ignored; it is now honoured (default unchanged).
        min_score: currently unused; kept for interface compatibility.

    Returns:
        set: the words whose summed pos + neg score ranks in the top 10000.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, reviews in (('pos', posids), ('neg', negids)):
        for position, review in enumerate(reviews, 1):
            # NOTE(review): only the review at exactly ``cutoff`` is skipped;
            # if a "first N reviews" limit was intended this should be a
            # ``<`` / ``>`` comparison -- confirm with the caller.
            if position == cutoff:
                continue
            for word in review['text'].split(' '):
                word_fd.update(token_helpers.tokenize_simple(word))
                label_word_fd[label].update(token_helpers.tokenize_simple(word))

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = score_fn(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = score_fn(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=itemgetter(1), reverse=True)[:10000]
    bestwords = {w for w, _ in best}
    return bestwords
开发者ID:efrenaguilar95,项目名称:Yelp_Analyzer,代码行数:34,代码来源:classifiers.py
示例3: create_word_scores
def create_word_scores(self):
    """Compute a chi-square informativeness score for every word.

    Words come from ``self.getAllWords()``, which is unpacked into a
    positive and a negative collection of word sequences.

    Returns:
        dict: word -> chi-square score against 'pos' plus chi-square score
        against 'neg'.
    """
    [posWords, negWords] = self.getAllWords()

    posWords = list(itertools.chain.from_iterable(posWords))
    negWords = list(itertools.chain.from_iterable(negWords))

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Item assignment replaces the removed ``FreqDist.inc`` and behaves the
    # same on older NLTK releases.
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    log("Total number of words: %d" % total_word_count)

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
开发者ID:zlmoment,项目名称:Tweet-Sentiment-Classification,代码行数:28,代码来源:getFeatureList_ChiSquare.py
示例4: create_word_bigram_scores
def create_word_bigram_scores(posWords, negWords):
    """Score words and top bigrams by chi-square association with pos/neg.

    Args:
        posWords: list of tokens from positive texts.
        negWords: list of tokens from negative texts.

    Returns:
        dict: ``str(feature)`` -> summed chi-square score, where a feature is
        either a word or one of the 2000 best bigram tuples of its corpus.
    """
    # Separate finders: the original reused one variable, so the positive
    # bigrams were computed from the negative words.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 2000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 2000)

    # Words plus bigram collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Keys are stringified so words and bigram tuples share one key type.
    for word in pos:
        key = str(word)
        word_fd[key] += 1
        cond_word_fd['pos'][key] += 1
    for word in neg:
        key = str(word)
        word_fd[key] += 1
        cond_word_fd['neg'][key] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
开发者ID:delili,项目名称:NLP_Comments_Sentiment_Analysis,代码行数:29,代码来源:process.py
示例5: setup
def setup():
    """Select the 10000 most informative words and train the classifier.

    Counts every token of the ``movie_reviews`` corpus per label, scores each
    word with chi-square, stores the top 10000 in the module-level
    ``bestwords`` set, and returns ``train(best_bigram_word_features)``.
    """
    global bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            # Normalise once and use the same key in both distributions.
            # Previously only word_fd stripped punctuation, so the per-label
            # lookups below did not match the overall counts.
            token = word.strip('\'"?,.').lower()
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-unpacking lambdas are Python-2-only; index the pair instead.
    best = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)[:10000]
    bestwords = {w for w, _ in best}

    return train(best_bigram_word_features)
开发者ID:seanfreiburg,项目名称:chicago_tweet_grabber,代码行数:30,代码来源:analyze_tweets.py
示例6: create_word_bigram_scores
def create_word_bigram_scores():
    """Score words and top bigrams from the pickled pos/neg review corpora.

    Returns:
        dict: feature (word or bigram tuple) -> sum of its chi-square score
        against 'pos' and against 'neg'.
    """
    # pickle requires binary mode; ``with`` closes the handles that the
    # original left open.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use with trusted data.
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
        posdata = pickle.load(pos_file)
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
        negdata = pickle.load(neg_file)

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    # Separate finders: the original overwrote a single variable, so the
    # positive bigrams were mined from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    posBigrams = pos_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    negBigrams = neg_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    # Words plus bigram collocations.
    pos = posWords + posBigrams
    neg = negWords + negBigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
开发者ID:coolspiderghy,项目名称:sina_weibo_crawler,代码行数:35,代码来源:extractFeatures_org.py
示例7: create_word_scores
def create_word_scores(posWords,negWords,posTag,negTag):
    """Compute a chi-square informativeness score for every word.

    Args:
        posWords: iterable of word sequences from positive texts.
        negWords: iterable of word sequences from negative texts.
        posTag: condition key under which positive counts are stored.
        negTag: condition key under which negative counts are stored.

    Returns:
        dict: word -> chi-square score against ``posTag`` plus chi-square
        score against ``negTag``.
    """
    from nltk.probability import FreqDist, ConditionalFreqDist
    import itertools

    # Flatten the nested sequences into flat word lists.
    posWords = list(itertools.chain.from_iterable(posWords))
    negWords = list(itertools.chain.from_iterable(negWords))

    word_fd = FreqDist()  # overall frequency of every word
    cond_word_fd = ConditionalFreqDist()  # per-label word frequencies
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd[posTag][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd[negTag][word] += 1

    pos_word_count = cond_word_fd[posTag].N()  # number of positive tokens
    neg_word_count = cond_word_fd[negTag].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items`` instead of the Python-2-only ``iteritems``.
    for word, freq in word_fd.items():
        # Chi-square against each label; another measure such as mutual
        # information could be substituted here.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[posTag][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd[negTag][word], (freq, neg_word_count), total_word_count)
        # A word's informativeness is the sum of both statistics.
        word_scores[word] = pos_score + neg_score
    return word_scores
开发者ID:coolspiderghy,项目名称:sina_weibo_crawler,代码行数:27,代码来源:extractFeatures.py
示例8: create_word_scores
def create_word_scores():
    """Compute a chi-square informativeness score for every word.

    Reads the module-level ``datap`` (positive) and ``datan`` (negative)
    collections of word sequences.

    Returns:
        dict: word -> chi-square score against 'pos' plus chi-square score
        against 'neg'.
    """
    # Flatten the nested sequences into flat word lists.
    posWords = list(itertools.chain.from_iterable(datap))
    negWords = list(itertools.chain.from_iterable(datan))

    word_fd = nltk.FreqDist()
    cond_word_fd = ConditionalFreqDist()  # per-label word frequencies
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items`` instead of the Python-2-only ``iteritems``.
    for word, freq in word_fd.items():
        # Chi-square against each label; another measure such as mutual
        # information could be substituted here.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        # A word's informativeness is the sum of both statistics.
        word_scores[word] = pos_score + neg_score
    return word_scores
开发者ID:xiutx,项目名称:review_emotion,代码行数:25,代码来源:findword.py
示例9: create_bigram_scores
def create_bigram_scores():
    """Score the top bigrams of each corpus by chi-square association.

    Loads the positive and negative review spreadsheets through
    ``tp.seg_fil_senti_excel``, takes the 8000 best chi-square bigrams of
    each corpus, and scores every bigram against both labels.

    Returns:
        dict: bigram tuple -> sum of its chi-square score against 'pos' and
        against 'neg'.
    """
    posdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/pos_review.xlsx", "1", "1")
    negdata = tp.seg_fil_senti_excel("D:/code/sentiment_test/neg_review.xlsx", "1", "1")

    posWords = list(itertools.chain.from_iterable(posdata))
    negWords = list(itertools.chain.from_iterable(negdata))

    # Separate finders: the original overwrote a single variable, so both
    # bigram lists were mined from the negative corpus.
    pos_finder = BigramCollocationFinder.from_words(posWords)
    neg_finder = BigramCollocationFinder.from_words(negWords)
    pos = pos_finder.nbest(BigramAssocMeasures.chi_sq, 8000)
    neg = neg_finder.nbest(BigramAssocMeasures.chi_sq, 8000)

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    # Item assignment replaces the removed ``FreqDist.inc``.
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
开发者ID:EricChanBD,项目名称:Review-Helpfulness-Prediction,代码行数:35,代码来源:store+sentiment+classifier.py
示例10: getWordScores
def getWordScores():
    """Compute a chi-square informativeness score for every word.

    Tokenises each line of ``RT_POLARITY_POS_FILE`` and
    ``RT_POLARITY_NEG_FILE`` into word-like runs and single punctuation
    marks, lower-cases the tokens, and scores each against both labels.

    Returns:
        dict: lower-cased token -> chi-square score against 'pos' plus
        chi-square score against 'neg'.
    """
    # Compiled once instead of being looked up for every line.
    token_pattern = re.compile(r"[\w']+|[.,!?;]")

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()

    sources = (('pos', RT_POLARITY_POS_FILE), ('neg', RT_POLARITY_NEG_FILE))
    for label, path in sources:
        with open(path, 'r') as sentences:
            for line in sentences:
                for word in token_pattern.findall(line.rstrip()):
                    token = word.lower()
                    word_fd[token] += 1
                    cond_word_fd[label][token] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items`` instead of the Python-2-only ``iteritems``.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
开发者ID:Sapphirine,项目名称:MyTravelAgent,代码行数:34,代码来源:Sentiment.py
示例11: getBestWords
def getBestWords(posWords, negWords):
    """Return the set of all lower-cased words seen in either word list.

    Every word is scored with chi-square against the "pos" and "neg" labels
    and the words are ranked by the summed score, but no cut-off is applied,
    so the returned set contains every scored word.

    Args:
        posWords: iterable of tokens from positive texts.
        negWords: iterable of tokens from negative texts.

    Returns:
        set: the scored (lower-cased) words.
    """
    overall_counts = FreqDist()
    per_label_counts = ConditionalFreqDist()

    for label, tokens in (("pos", posWords), ("neg", negWords)):
        for raw in tokens:
            token = raw.lower()
            overall_counts[token] += 1
            per_label_counts[label][token] += 1

    n_pos = per_label_counts["pos"].N()
    n_neg = per_label_counts["neg"].N()
    n_total = n_pos + n_neg

    scores = {}
    for token, count in overall_counts.items():
        score_for_pos = BigramAssocMeasures.chi_sq(
            per_label_counts["pos"][token], (count, n_pos), n_total)
        score_for_neg = BigramAssocMeasures.chi_sq(
            per_label_counts["neg"][token], (count, n_neg), n_total)
        scores[token] = score_for_pos + score_for_neg

    # Ranked by descending score; the whole ranking is kept.
    ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    return {token for token, _ in ranked}
开发者ID:dakshvar22,项目名称:DishingOut,代码行数:28,代码来源:sentimentTrainer.py
示例12: get_bestwords
def get_bestwords(contents, labels, limit=10000, n=None, cache=True):
    """Return the ``limit`` words most associated with the pos/neg labels.

    Args:
        contents: collection of texts that supports boolean-mask indexing
            with ``labels == value`` (presumably a NumPy array -- confirm).
        labels: per-text labels; 1 marks positive texts and 0 negative ones.
        limit: maximum number of words to return.
        n: cache discriminator; caching is only active when it is truthy.
        cache: whether to read and write ``cache/<limit>_<n>.pkl``.

    Returns:
        set: the best-scoring lower-cased words.
    """
    use_cache = bool(cache and n)
    cache_path = 'cache/%s_%s.pkl' % (limit, n)

    if use_cache and os.path.exists(cache_path):
        # pickle needs binary mode; ``with`` closes the handle.
        with open(cache_path, 'rb') as cache_file:
            bestwords = pickle.load(cache_file)
        print('Loaded from cache')
        print('bestwords count = %d' % len(bestwords))
        return bestwords

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    pos_contents = contents[labels == 1]
    # Negative texts are the ones labelled 0. The original mask
    # ``labels != 0`` selected the positive texts a second time.
    neg_contents = contents[labels == 0]

    # Distinct surface forms per class; in-place update avoids rebuilding
    # the set for every document.
    pos_words = set()
    neg_words = set()
    for pos_content in pos_contents:
        pos_words.update(word_tokenize(pos_content))
    for neg_content in neg_contents:
        neg_words.update(word_tokenize(neg_content))

    for word in pos_words:
        token = word.lower()
        word_fd[token] += 1
        label_word_fd['pos'][token] += 1
    for word in neg_words:
        token = word.lower()
        word_fd[token] += 1
        label_word_fd['neg'][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    best = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)[:limit]
    bestwords = {w for w, _ in best}

    print('all words count = %d' % len(word_scores))
    print('bestwords count = %d' % len(bestwords))

    if use_cache:
        # Closing the file guarantees the pickle is flushed to disk.
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(bestwords, cache_file)
        print('Dumped to cache')

    return bestwords
开发者ID:colinsongf,项目名称:stumbleupon_evergreen_classification_challenge,代码行数:60,代码来源:submission.py
示例13: best_word_feats
def best_word_feats(self, words):
    """Build a feature dict restricted to the 10000 most informative words.

    The informative-word set is recomputed from the ``movie_reviews`` corpus
    on every call.

    Args:
        words: iterable of tokens to turn into features.

    Returns:
        dict: ``{word: True}`` for each word in ``words`` that is among the
        10000 best chi-square-scored corpus words.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label in ('pos', 'neg'):
        for word in movie_reviews.words(categories=[label]):
            token = word.lower()
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    # chi_sq(n_ii, (n_ix, n_xi), n_xx) contingency values:
    #   n_ii = label_word_fd[label][word]
    #   n_ix = word_fd[word]
    #   n_xi = label_word_fd[label].N()
    #   n_xx = label_word_fd.N()
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-unpacking lambdas are Python-2-only; index the pair instead.
    best = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)[:10000]
    bestwords = {w for w, _ in best}

    return {word: True for word in words if word in bestwords}
开发者ID:dkaliyev,项目名称:TwitterAnalyser,代码行数:33,代码来源:NBClass.py
示例14: computeFreqDistribution
def computeFreqDistribution():
    """Return the words whose summed chi-square score exceeds 2.

    Reads the module-level ``word_fd`` and ``label_word_fd`` distributions
    and scores each word against the 'positive', 'negative' and 'neutral'
    labels. When ``DEBUG`` is truthy the intermediate values are printed.

    Returns:
        list: words whose positive + negative + neutral score is above the
        threshold.
    """
    if DEBUG:
        print(word_fd)

    pos_word_count = label_word_fd['positive'].N()
    neg_word_count = label_word_fd['negative'].N()
    neu_word_count = label_word_fd['neutral'].N()
    total_word_count = pos_word_count + neg_word_count + neu_word_count

    word_scores = {}
    # ``items`` instead of the Python-2-only ``iteritems``.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['positive'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['negative'][word], (freq, neg_word_count), total_word_count)
        neu_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neutral'][word], (freq, neu_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score + neu_score

    if DEBUG:
        print(json.dumps(word_scores, indent = 4))

    threshold = 2
    temp = [item for item in word_scores if word_scores[item] > threshold]

    if DEBUG:
        print(temp)
    return temp
开发者ID:chawlaaditya8,项目名称:Sentiment-Analysis,代码行数:34,代码来源:app.py
示例15: get_best_words
def get_best_words(words_list, num_best_words):
    """Return the ``num_best_words`` words most associated with pos/neg.

    Args:
        words_list: iterable of ``(line, sent)`` pairs, where ``line`` is a
            text and ``sent`` the label it is counted under.
        num_best_words: maximum number of words to return.

    Returns:
        set: the best-scoring lower-cased words.
    """
    from nltk.probability import FreqDist, ConditionalFreqDist
    from nltk.metrics import BigramAssocMeasures

    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for line, sent in words_list:
        for word in nltk.word_tokenize(line):
            token = word.lower()
            # Item assignment replaces the removed ``FreqDist.inc``.
            word_fd[token] += 1
            label_word_fd[sent][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    # Tuple-unpacking lambdas are Python-2-only; index the pair instead.
    best = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)[:num_best_words]
    bestwords = {w for w, _ in best}
    return bestwords
开发者ID:dsedra,项目名称:yproject,代码行数:29,代码来源:sent_master.py
示例16: store_word_scores
def store_word_scores(self):
    """
    Stores 'word scores' into Redis.

    Loads the pickled 'word_fd' and 'label_fd' distributions from
    ``self.r``, computes each word's chi-square score against 'pos' plus
    'neg', and writes the resulting dict under the 'word_scores' key.
    Returns without storing anything when the distributions cannot be
    loaded.
    """
    try:
        word_freqdist = pickle.loads(self.r.get('word_fd'))
        label_word_freqdist = pickle.loads(self.r.get('label_fd'))
    except TypeError:
        # Presumably raised when a key is missing and get() yields None.
        print('Requires frequency distributions to be built.')
        # Stop here: falling through used to hit a NameError on the
        # distributions that were never assigned.
        return

    word_scores = {}

    pos_word_count = label_word_freqdist['pos'].N()
    neg_word_count = label_word_freqdist['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    # ``items`` instead of the Python-2-only ``iteritems``.
    for word, freq in word_freqdist.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_freqdist['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_freqdist['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    self.r.set('word_scores', word_scores)
开发者ID:daniel-cloudspace,项目名称:synt,代码行数:25,代码来源:redis_manager.py
示例17: create_word_scores
def create_word_scores():
    """Compute chi-square word scores from the pickled pos/neg reviews.

    Returns:
        dict: word -> chi-square score against 'pos' plus chi-square score
        against 'neg'.
    """
    # pickle requires binary mode; ``with`` closes the handles that the
    # original left open.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use with trusted data.
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/pos_review.pkl', 'rb') as pos_file:
        posWords = pickle.load(pos_file)
    with open('/Users/genghaiyang/ghy_works/projects/weibo_crawler/textmining/sentiML/pos_neg_review/neg_review.pkl', 'rb') as neg_file:
        negWords = pickle.load(neg_file)

    # Flatten the nested sequences into flat word lists.
    posWords = list(itertools.chain.from_iterable(posWords))
    negWords = list(itertools.chain.from_iterable(negWords))

    word_fd = FreqDist()  # overall frequency of every word
    cond_word_fd = ConditionalFreqDist()  # per-label word frequencies
    for word in posWords:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in negWords:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()  # number of positive tokens
    neg_word_count = cond_word_fd['neg'].N()  # number of negative tokens
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items`` instead of the Python-2-only ``iteritems``.
    for word, freq in word_fd.items():
        # Chi-square against each label; another measure such as mutual
        # information could be substituted here.
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        # A word's informativeness is the sum of both statistics.
        word_scores[word] = pos_score + neg_score
    return word_scores
开发者ID:coolspiderghy,项目名称:sina_weibo_crawler,代码行数:28,代码来源:extractFeatures_org.py
示例18: __setTermsCHISQUARE__
def __setTermsCHISQUARE__(self,size):
    """Set ``self.terms`` to the ``size`` best chi-square-scored words.

    Words are read from ``self.reader.words`` for the 'pos' and 'neg'
    categories, lower-cased, and ranked by the sum of their chi-square
    scores against both labels.

    Args:
        size: maximum number of terms to keep.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label in ('pos', 'neg'):
        for word in self.reader.words(categories=[label]):
            token = word.lower()
            # Item assignment replaces the removed ``FreqDist.inc``.
            word_fd[token] += 1
            label_word_fd[label][token] += 1

    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    wordScores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        wordScores[word] = pos_score + neg_score

    # Tuple-unpacking lambdas are Python-2-only; index the pair instead.
    termScore = sorted(wordScores.items(), key=lambda pair: pair[1], reverse=True)[:size]
    self.terms = [w for (w, _) in termScore]
开发者ID:bharadwaj221,项目名称:SentimentAnalysis,代码行数:27,代码来源:corpusReader.py
示例19: store_feature_scores
def store_feature_scores(self):
    """
    Determine the scores of words based on chi-sq and stores word:score to Redis.

    Loads 'word_fd' and 'label_fd' through ``self.pickle_load``, scores each
    word against the 'positive' and 'negative' labels, and saves the dict
    with ``self.pickle_store('word_scores', ...)``. Returns without storing
    anything when the distributions cannot be loaded.
    """
    try:
        word_fd = self.pickle_load('word_fd')
        label_word_freqdist = self.pickle_load('label_fd')
    except TypeError:
        print('Requires frequency distributions to be built.')
        # Stop here: falling through used to hit a NameError on the
        # distributions that were never assigned.
        return

    word_scores = {}

    pos_word_count = label_word_freqdist['positive'].N()
    neg_word_count = label_word_freqdist['negative'].N()
    total_word_count = pos_word_count + neg_word_count

    # The scores do not depend on the label being iterated, so the former
    # outer loop over conditions() only repeated identical work.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            label_word_freqdist['positive'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            label_word_freqdist['negative'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score

    self.pickle_store('word_scores', word_scores)
开发者ID:chrisblythe812,项目名称:synt,代码行数:27,代码来源:db.py
示例20: _get_bigram_scores
def _get_bigram_scores(self, posdata, negdata):
    """Score words and top bigrams by chi-square association with pos/neg.

    Args:
        posdata: iterable of word sequences from positive texts.
        negdata: iterable of word sequences from negative texts.

    Returns:
        dict: feature (word or bigram tuple) -> sum of its chi-square score
        against 'pos' and against 'neg'. Bigrams are the 5000 best
        chi-square collocations of each corpus.
    """
    pos_words = list(itertools.chain.from_iterable(posdata))
    neg_words = list(itertools.chain.from_iterable(negdata))

    pos_bigram_finder = BigramCollocationFinder.from_words(pos_words)
    neg_bigram_finder = BigramCollocationFinder.from_words(neg_words)
    pos_bigrams = pos_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)
    neg_bigrams = neg_bigram_finder.nbest(BigramAssocMeasures.chi_sq, 5000)

    pos = pos_words + pos_bigrams
    neg = neg_words + neg_bigrams

    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in pos:
        word_fd[word] += 1
        cond_word_fd['pos'][word] += 1
    for word in neg:
        word_fd[word] += 1
        cond_word_fd['neg'][word] += 1

    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count

    word_scores = {}
    # ``items`` instead of the Python-2-only ``iteritems``.
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(
            cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
开发者ID:Palazor,项目名称:sentiment,代码行数:32,代码来源:Extractor.py
注:本文中的nltk.metrics.BigramAssocMeasures类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论