本文整理汇总了Python中nltk.corpus.movie_reviews.words函数的典型用法代码示例。如果您正苦于以下问题:Python words函数的具体用法?Python words怎么用?Python words使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了words函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: train_with_movie_db
def train_with_movie_db(self):
    """Train the sentiment classifier on the NLTK movie_reviews corpus.

    Builds (features, label) pairs for every negative and positive review,
    splits each polarity 3:1 into train/test, trains a NaiveBayesClassifier
    and logs the held-out accuracy and most informative features.

    - this does not yield particularly good results
    """
    self.use_movie_reviews = True

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                 "negative") for f in negids]
    posfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                 "positive") for f in posids]

    # int() keeps the slice indices integral: under Python 3's true division
    # `len(...) * 3 / 4` is a float, which breaks the slicing below.
    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

    self.classifier = NaiveBayesClassifier.train(trainfeats)

    DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
    DLOG(self.classifier.show_most_informative_features())
开发者ID:maagaard,项目名称:dmup,代码行数:27,代码来源:sentimentanalyzer.py
示例2: documentClassification
def documentClassification():
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]
def document_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
print document_features(movie_reviews.words('pos/cv957_8737.txt'))
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print nltk.classify.accuracy(classifier, test_set)
classifier.show_most_informative_features(5)
开发者ID:AkiraKane,项目名称:Python,代码行数:28,代码来源:c06_supervised_classification.py
示例3: category_by_movie
def category_by_movie():
from nltk.corpus import movie_reviews as mr
from nltk import FreqDist
from nltk import NaiveBayesClassifier
from nltk import classify
from nltk.corpus import names
from nltk.classify import apply_features
import random
documents = [(list(mr.words(f)), c) for c in mr.categories() for f in
mr.fileids(c)]
random.shuffle(documents)
all_words = FreqDist(w.lower() for w in mr.words())
word_features = all_words.keys()[:2000]
def document_features(document):
document_words = set(document)
features = {}
for word in word_features:
features['contains(%s)' % word] = (word in document_words)
return features
#print document_features(mr.words('pos/cv957_8737.txt'))
#print documents[0]
features = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = features[100:], features[:100]
classifier = NaiveBayesClassifier.train(train_set)
print classify.accuracy(classifier, train_set)
开发者ID:brenden17,项目名称:infinity,代码行数:30,代码来源:category_nltk.py
示例4: load_data
def load_data():
    """Populate the module-level negfeats/posfeats lists from the NLTK
    movie_reviews corpus, one (features, label) pair per review file."""
    global posfeats, negfeats

    def labelled(polarity):
        # Build (word_feats(...), polarity) for every file of that polarity.
        return [(word_feats(movie_reviews.words(fileids=[fid])), polarity)
                for fid in movie_reviews.fileids(polarity)]

    negfeats = labelled('neg')
    posfeats = labelled('pos')
    return
开发者ID:sjayakum,项目名称:sentiment-analysis,代码行数:7,代码来源:NaiveBayesArticle.py
示例5: evaluate_classifier
def evaluate_classifier(featx):
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
classifier = NaiveBayesClassifier.train(trainfeats)
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)
for i, (feats, label) in enumerate(testfeats):
refsets[label].add(i)
observed = classifier.classify(feats)
testsets[observed].add(i)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
classifier.show_most_informative_features()
开发者ID:zhougr1993,项目名称:Bayes_kick_momo_spam,代码行数:28,代码来源:test_sentiment.py
示例6: main
def main():
    """Train a Naive Bayes sentiment classifier on movie_reviews, then
    attach pos/neg probabilities and a label to every review found in
    output.json, writing the result to out_with_sentiment.json."""
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Train on the first 3/4 of each polarity (the held-out quarter was
    # computed but never used in the original, so it is dropped here).
    negcutoff = int(len(negfeats) * 3 / 4)
    poscutoff = int(len(posfeats) * 3 / 4)
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # (The unused SentimentIntensityAnalyzer instance was removed.)
    with open("output.json") as fin:
        data = json.load(fin)

    for key in data:
        for review in data[key]["reviews"]:
            # Extract features once; the original rebuilt them twice
            # (once for prob_classify and again for classify).
            feats = word_feats(review["review"].split(" "))
            prob = classifier.prob_classify(feats)
            review["sentiment"] = {
                'positive_probability': prob.prob('pos'),
                'negative_probability': prob.prob('neg'),
                # prob.max() is exactly the label classify() would return.
                'label': prob.max(),
            }

    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
开发者ID:bifft2,项目名称:cs410FinalProject,代码行数:28,代码来源:sentiment.py
示例7: best_word_feats
def best_word_feats(self, words):
    """Return {word: True} for each word of *words* that is among the
    10 000 movie_reviews words with the highest chi-squared association
    to either sentiment label.

    NOTE(review): Python 2 / pre-NLTK-3 API throughout (FreqDist.inc,
    dict.iteritems, tuple-parameter lambda) — will not run on Python 3.
    """
    word_fd = FreqDist()                   # overall word frequencies
    label_word_fd = ConditionalFreqDist()  # per-sentiment word frequencies
    for word in movie_reviews.words(categories=['pos']):
        word_fd.inc(word.lower())
        label_word_fd['pos'].inc(word.lower())
    for word in movie_reviews.words(categories=['neg']):
        word_fd.inc(word.lower())
        label_word_fd['neg'].inc(word.lower())
    # chi_sq contingency inputs, per BigramAssocMeasures conventions:
    # n_ii = label_word_fd[label][word]
    # n_ix = word_fd[word]
    # n_xi = label_word_fd[label].N()
    # n_xx = label_word_fd.N()
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        # Score each word by how strongly it associates with either label.
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    # Keep the 10 000 most label-discriminating words.
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
    bestwords = set([w for w, s in best])
    return dict([(word, True) for word in words if word in bestwords])
开发者ID:dkaliyev,项目名称:TwitterAnalyser,代码行数:33,代码来源:NBClass.py
示例8: train
def train(test=False):
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
if(test):
negcutoff = len(negfeats)*3/4
poscutoff = len(posfeats)*3/4
trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
print 'train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats))
classifier = NaiveBayesClassifier.train(trainfeats)
print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
classifier.show_most_informative_features()
else:
return NaiveBayesClassifier.train(negfeats+posfeats)
开发者ID:jnu,项目名称:texecutions,代码行数:26,代码来源:sentiment.py
示例9: maketrainset
def maketrainset(movie_reviews, tokenizer, stemmer):
    """Tokenize/stem every review of the given corpus and pair it with its
    polarity; returns all 'neg' examples followed by all 'pos' ones."""
    trainfeats = []
    for label in ('neg', 'pos'):
        for fid in movie_reviews.fileids(label):
            trainfeats.append(
                (tokenizer(movie_reviews.words(fileids=[fid]), stemmer), label))
    return trainfeats
开发者ID:askerry,项目名称:FGE_MISC,代码行数:7,代码来源:stimanalysisfuncs.py
示例10: GetHighInformationWordsChi
def GetHighInformationWordsChi(num_bestwords):
    """Return the num_bestwords movie_reviews words with the highest
    chi-squared association to either sentiment label.

    NOTE(review): mixes API generations — FreqDist item assignment is
    NLTK 3 style, but iteritems() and the tuple-parameter lambda are
    Python 2 only, so this needs Python 2 with a recent NLTK.
    """
    word_fd = FreqDist()                   # overall word frequencies
    label_word_fd = ConditionalFreqDist()  # per-sentiment word frequencies
    for word in movie_reviews.words(categories=['pos']):
        word_fd[word.lower()] += 1
        label_word_fd['pos'][word.lower()] += 1
    for word in movie_reviews.words(categories=['neg']):
        word_fd[word.lower()] += 1
        label_word_fd['neg'][word.lower()] += 1
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        # chi_sq(n_ii, (n_ix, n_xi), n_xx): word-in-label count,
        # (overall word count, label word count), total word count.
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    # Keep the num_bestwords most label-discriminating words.
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:num_bestwords]
    bestwords = set([w for w, s in best])
    return bestwords
开发者ID:ai2010,项目名称:machine_learning_for_the_web,代码行数:28,代码来源:views.py
示例11: setup
def setup():
global bestwords
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()
for word in movie_reviews.words(categories=['pos']):
word_fd.inc(word.strip('\'"?,.').lower())
label_word_fd['pos'].inc(word.lower())
for word in movie_reviews.words(categories=['neg']):
word_fd.inc(word.strip('\'"?,.').lower())
label_word_fd['neg'].inc(word.lower())
pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count
word_scores = {}
for word, freq in word_fd.iteritems():
pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
(freq, pos_word_count), total_word_count)
neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
(freq, neg_word_count), total_word_count)
word_scores[word] = pos_score + neg_score
best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
bestwords = set([w for w, s in best])
return train(best_bigram_word_features)
开发者ID:seanfreiburg,项目名称:chicago_tweet_grabber,代码行数:30,代码来源:analyze_tweets.py
示例12: prepareSentimentClassifier
def prepareSentimentClassifier():
    """Train and return a Naive Bayes sentiment classifier on the
    movie_reviews corpus.

    Also sets the global word_featuresSent vocabulary consumed by
    findFeaturesSentiment(), and prints the accuracy on the last 100
    shuffled documents held out for testing.
    """
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)

    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

    global word_featuresSent
    # most_common(3000) gives the 3000 *most frequent* words; the original
    # list(all_words.keys())[:3000] took the first 3000 keys in dict order
    # (first-encounter order, not frequency), weakening the feature set.
    word_featuresSent = [w for w, _ in all_words.most_common(3000)]

    featuresets = [(findFeaturesSentiment(rev), category) for (rev, category) in documents]
    training_set = featuresets[:1900]
    testing_set = featuresets[1900:]

    sentimentClassifier = nltk.NaiveBayesClassifier.train(training_set)
    print("Classifier accuracy percent:",(nltk.classify.accuracy(sentimentClassifier, testing_set))*100)
    return sentimentClassifier
开发者ID:koskinap,项目名称:Popularity_StyleOfPlay_DS2015_Group3_Soton,代码行数:27,代码来源:realTimeMatchAnalyzer.py
示例13: __init__
def __init__(self):
    """Select the 10 000 movie_reviews words with the highest chi-squared
    label association into self.bestwords, then train the classifier.

    NOTE(review): Python 2 / pre-NLTK-3 API (FreqDist.inc, iteritems,
    tuple-parameter lambda) — will not run on Python 3.
    """
    ## Best words feature extraction
    word_fd = FreqDist()                   # overall word frequencies
    label_word_fd = ConditionalFreqDist()  # per-sentiment word frequencies
    for word in movie_reviews.words(categories=['pos']):
        word_fd.inc(word.lower())
        label_word_fd['pos'].inc(word.lower())
    for word in movie_reviews.words(categories=['neg']):
        word_fd.inc(word.lower())
        label_word_fd['neg'].inc(word.lower())
    pos_word_count = label_word_fd['pos'].N()
    neg_word_count = label_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    word_scores = {}
    for word, freq in word_fd.iteritems():
        # chi_sq(n_ii, (n_ix, n_xi), n_xx) scores word/label association.
        pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
                                               (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
                                               (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    # Keep the 10 000 most label-discriminating words.
    best = sorted(word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000]
    self.bestwords = set([w for w, s in best])
    self.train_classifier()
开发者ID:nginz,项目名称:blazor,代码行数:29,代码来源:sentiment_analyze.py
示例14: main
def main(argv):
negids = movie_reviews.fileids('neg')
posids = movie_reviews.fileids('pos')
#print negids
negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative') for f in negids]
posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive') for f in posids]
trainfeats = posfeats+negfeats
#print trainfeats
# break
classifier = NaiveBayesClassifier.train(trainfeats)
#classifier = pickle.load(open("classifier.p", "rb"))
topicList = ["media", "sports", "news", "fashion", "finance", "politics"]
for line in sys.stdin:
try:
tolk_posset = word_tokenize(line.rstrip())
d = word_feats(tolk_posset)
for topic in topicList:
subjectFull = subj(line, topic)
if not subjectFull == "No match":
#print d
print "LongValueSum:" + "" + str(line.split(":")[0])+","+subjectFull + "," + classifier.classify(d) + "\t" + "1"
except:
#print "Error"
continue
开发者ID:BhavdeepSethi,项目名称:cloudBigData,代码行数:28,代码来源:sentiment.py
示例15: __init__
def __init__(self):
    """Load every labelled movie_reviews document, shuffled."""
    # (document word list, category) pairs for every review in the corpus.
    self.documents = [(list(movie_reviews.words(fileid)), category)
                      for category in movie_reviews.categories()
                      for fileid in movie_reviews.fileids(category)]
    random.shuffle(self.documents)
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
    # NOTE(review): word_features is a local that is never stored or
    # returned, so this FreqDist work is currently discarded — confirm
    # whether it was meant to be self.word_features. Also, .keys()[:2000]
    # requires the pre-NLTK-3 frequency-sorted FreqDist and Python 2
    # (keys() is not sliceable on Python 3).
    word_features = all_words.keys()[:2000]
开发者ID:julius-jsr,项目名称:text_sum,代码行数:8,代码来源:docs.py
示例16: __init__
def __init__(self, train1=True, train2=True, train3=True, train4=True):
    """Assemble self.trainfeats from up to four sources, then train a
    Naive Bayes classifier on the combined set.

    train1: NLTK movie_reviews corpus ('neg'/'pos').
    train2: local "out.txt" — each line is "<0|1> <text>" (label char,
            space, text).
    train3: AFINN-111 wordlist — "word<TAB>score"; negative score -> 'neg',
            otherwise 'pos'.
    train4: local labelled file — "<label> .:. <text>" lines with labels
            pos/neutral/neg.

    NOTE(review): Python 2 (print statements) with hard-coded absolute
    Windows paths; the opened files are never closed.
    """
    self.trainfeats = []
    if train1:
        # Source 1: movie review corpus, one example per review file.
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        neg_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        pos_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        self.trainfeats = neg_movies + pos_movies
    if train2:
        # Source 2: pre-labelled text lines; char 0 is the label,
        # everything from column 2 on is the text.
        f = open("out.txt", "r")
        negfeats = []
        posfeats = []
        for line in f:
            status = line[0]
            texto = line[2:]
            if status == '0':
                negfeats.append((self.word_feats(texto.split(" ")), 'neg'))
            elif status == '1':
                posfeats.append((self.word_feats(texto.split(" ")), 'pos'))
        self.trainfeats += negfeats + posfeats
    if train3:
        # Source 3: AFINN sentiment lexicon; the score's sign is the label.
        # NOTE(review): word_feats receives the raw word *string* here, not
        # a token list as in the other sources — confirm that is intended.
        f = open("E:\\Workspace\\WS_TG\\analisador1\\AFINN\\AFINN-111.txt", 'r')
        for l in f:
            data = l.strip().split('\t')
            self.trainfeats.append( (self.word_feats(data[0]), 'neg' if int(data[1]) < 0 else 'pos'))
    if train4:
        # Source 4: labelled tweets; lines are bucketed by their label
        # prefix, and only the first 200 neutral lines are kept.
        f = open("E:\\Workspace\\WS_TG\\api\\trainning set.txt", 'r')
        pos = []
        neutral = []
        neg = []
        for line in f:
            if line.startswith("pos"):
                pos.append(line)
            elif line.startswith("neutral"):
                neutral.append(line)
            elif line.startswith("neg"):
                neg.append(line)
        print len(pos), len(neutral), len(neg)
        total = pos + neutral[:200] + neg
        for line in total:
            # " .:. " separates "<label>" from "<text>".
            data = line.split(' .:. ')
            self.trainfeats.append( (self.word_feats(data[1].split()), data[0]) )
    self.classifier = NaiveBayesClassifier.train(self.trainfeats)
    print self.classifier.show_most_informative_features(20)
开发者ID:phslfo,项目名称:TGSAT,代码行数:58,代码来源:analisador.py
示例17: __init__
def __init__(self, load = False, loadFile = ""):
    """Either load a previously saved classifier from loadFile (load=True)
    or train a fresh NaiveBayesClassifier on the full movie_reviews corpus."""
    if(load):
        self.loadClassifier(loadFile)
    else:
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        # Use the fileid lists computed above; the original ignored them and
        # redundantly called movie_reviews.fileids() again inside each
        # comprehension, leaving negids/posids as dead locals.
        negfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        trainfeats = negfeats + posfeats
        self.classifier = NaiveBayesClassifier.train(trainfeats)
开发者ID:rzsun,项目名称:Enquire,代码行数:10,代码来源:sentclassifier.py
示例18: build_classifier
def build_classifier(self):
    """Train and return a Naive Bayes classifier over all movie reviews,
    storing the 2000-word feature vocabulary on self.word_features for
    self.document_features() to use."""
    documents = [(' '.join(movie_reviews.words(fileid)), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
    random.shuffle(documents)
    # Words shorter than 3 characters are excluded from the vocabulary.
    all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words() if len(w) > 2)
    # most_common(2000) keeps the intended "2000 most frequent words"; the
    # original .keys()[:2000] relied on the pre-NLTK-3 frequency-sorted
    # FreqDist and is not sliceable on Python 3 at all.
    self.word_features = [w for w, _ in all_words.most_common(2000)]
    featuresets = [(self.document_features(d), c) for (d,c) in documents]
    classifier = nltk.NaiveBayesClassifier.train(featuresets)
    return classifier
开发者ID:hmason,项目名称:Color-Commentary-Teletype,代码行数:10,代码来源:teletype.py
示例19: train_classifiers
def train_classifiers(self):
    """Fit self.classifier — a Naive Bayes model — on the complete
    movie_reviews corpus (all 'neg' examples followed by all 'pos')."""
    trainfeats = []
    for label in ('neg', 'pos'):
        for fid in movie_reviews.fileids(label):
            trainfeats.append(
                (word_feats(movie_reviews.words(fileids=[fid])), label))
    # train naive bayes
    self.classifier = NaiveBayesClassifier.train(trainfeats)
开发者ID:jlburgos,项目名称:DemiseAnalyzer,代码行数:11,代码来源:DemiseAnalyzer.py
示例20: train
def train(feature):
    """Return a NaiveBayesClassifier trained on every movie review, with
    *feature* mapping a review's word list to its feature dict."""
    labelled = [(feature(movie_reviews.words(fileids=[fid])), label)
                for label in ('neg', 'pos')
                for fid in movie_reviews.fileids(label)]
    return NaiveBayesClassifier.train(labelled)
开发者ID:seanfreiburg,项目名称:chicago_tweet_grabber,代码行数:11,代码来源:analyze_tweets.py
注:本文中的nltk.corpus.movie_reviews.words函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论