本文整理汇总了Python中nltk.corpus.movie_reviews.fileids函数的典型用法代码示例。如果您正苦于以下问题:Python fileids函数的具体用法?Python fileids怎么用?Python fileids使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了fileids函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: load_data
def load_data():
    """Build the labelled movie-review feature lists.

    Fills the module-level ``negfeats`` and ``posfeats`` globals with one
    ``(word_feats(words), label)`` pair per corpus file, labelled ``'neg'``
    and ``'pos'`` respectively. Returns ``None``.
    """
    global posfeats, negfeats

    neg_files = movie_reviews.fileids('neg')
    pos_files = movie_reviews.fileids('pos')

    neg_pairs = []
    for file_id in neg_files:
        words = movie_reviews.words(fileids=[file_id])
        neg_pairs.append((word_feats(words), 'neg'))
    negfeats = neg_pairs

    pos_pairs = []
    for file_id in pos_files:
        words = movie_reviews.words(fileids=[file_id])
        pos_pairs.append((word_feats(words), 'pos'))
    posfeats = pos_pairs
开发者ID:sjayakum,项目名称:sentiment-analysis,代码行数:7,代码来源:NaiveBayesArticle.py
示例2: prep_reviews_data
def prep_reviews_data(self):  # messy code to test classifier with movie reviews
    """Load, shuffle, split and vectorize the NLTK movie-review corpus.

    Does nothing when ``self.movie_review_data`` is already truthy. Otherwise
    it stores the split labels in ``self.train_labs`` / ``self.test_labs``,
    the vectorized documents in ``self.train_vecs`` / ``self.test_vecs``, and
    flips the ``movie_review_data`` / ``news_market_data`` flags.
    """
    if not self.movie_review_data:
        print('Preparing movie reviews...\n')
        from nltk.corpus import movie_reviews

        # Raw text and its +1 / -1 label are collected in lockstep so the
        # two lists stay aligned.
        texts = []
        polarity = []
        for category in movie_reviews.categories():
            sign = 1 if category == 'pos' else -1
            for fileid in movie_reviews.fileids(category):
                texts.append(movie_reviews.raw(fileid))
                polarity.append(sign)

        texts, polarity = double_shuffle(texts, polarity)
        training, testing = divide_list_by_ratio(texts)
        self.train_labs, self.test_labs = divide_list_by_ratio(polarity)

        fitted = self.vectorizer.fit_transform(training)
        transformed = self.vectorizer.transform(testing)
        if isinstance(self.model, naive_bayes.GaussianNB):
            # For GaussianNB the vectorizer output is converted via toarray().
            fitted = fitted.toarray()
            transformed = transformed.toarray()

        self.train_vecs = fitted
        self.test_vecs = transformed
        self.movie_review_data = True
        self.news_market_data = False
开发者ID:willpots,项目名称:stockrockanddropit,代码行数:29,代码来源:ham.py
示例3: category_by_movie
def category_by_movie():
    """Train a Naive Bayes classifier on movie reviews and print its accuracy.

    Every review becomes a ``contains(<word>)`` boolean feature dict over the
    2000 most frequent lower-cased corpus words. The first 100 shuffled
    documents are held out as the test set; the rest are used for training.
    The accuracy on the held-out documents is printed. Returns ``None``.
    """
    import random

    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import movie_reviews as mr

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # Rank explicitly by count: slicing keys() relies on a frequency ordering
    # that not every FreqDist implementation provides, and fails on Python 3.
    word_features = sorted(all_words, key=all_words.get, reverse=True)[:2000]

    def document_features(document):
        """Map a word list to its ``contains(word)`` presence features."""
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Score the held-out split; scoring train_set would report accuracy on
    # data the model has already seen.
    print(classify.accuracy(classifier, test_set))
开发者ID:brenden17,项目名称:infinity,代码行数:30,代码来源:category_nltk.py
示例4: main
def main():
    """Annotate the reviews in ``output.json`` with Naive Bayes sentiment.

    Trains a classifier on the first three quarters of the negative and
    positive movie reviews, then reads ``output.json``. Each item in every
    entry's ``"reviews"`` list gets a ``"sentiment"`` dict holding the
    positive probability, negative probability and predicted label. The
    result is written to ``out_with_sentiment.json``.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division keeps the cutoffs usable as slice indices.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    classifier = NaiveBayesClassifier.train(trainfeats)

    with open("output.json") as fin:
        data = json.load(fin)

    for key in data:
        for review in data[key]["reviews"]:
            # Extract the features once and reuse them for both queries.
            feats = word_feats(review["review"].split(" "))
            prob = classifier.prob_classify(feats)
            review["sentiment"] = {
                'positive_probability': prob.prob('pos'),
                'label': classifier.classify(feats),
                'negative_probability': prob.prob('neg'),
            }

    with open('out_with_sentiment.json', 'w') as outfile:
        json.dump(data, outfile)
开发者ID:bifft2,项目名称:cs410FinalProject,代码行数:28,代码来源:sentiment.py
示例5: train_with_movie_db
def train_with_movie_db(self):
    """
    Training possible with movie reviews
    - this does not yield particularly good results

    Sets ``self.use_movie_reviews`` to True, trains ``self.classifier`` on
    the first three quarters of each polarity (labelled "negative" and
    "positive") and logs the accuracy on the remaining quarter via ``DLOG``.
    """
    self.use_movie_reviews = True

    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    negfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                 "negative") for f in negids]
    posfeats = [(self.feature_extraction_movie_reviews(movie_reviews.words(fileids=[f])),
                 "positive") for f in posids]

    # Floor division: a float cutoff cannot be used as a slice index on
    # Python 3.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    DLOG("train on %d instances, test on %d instances" % (len(trainfeats), len(testfeats)))

    self.classifier = NaiveBayesClassifier.train(trainfeats)

    DLOG("accuracy: " + str(util.accuracy(self.classifier, testfeats)))
    # NOTE(review): in NLTK show_most_informative_features() prints its table
    # and returns None, so this presumably logs None - confirm intent.
    DLOG(self.classifier.show_most_informative_features())
开发者ID:maagaard,项目名称:dmup,代码行数:27,代码来源:sentimentanalyzer.py
示例6: evaluate_classifier
def evaluate_classifier(featx):
    """Train and evaluate a Naive Bayes classifier using extractor *featx*.

    *featx* is called with each review's word sequence and must return a
    feature set. The first three quarters of each polarity are used for
    training and the remainder for testing. Accuracy and per-label precision
    and recall are printed, followed by the most informative features.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Floor division: a float cutoff cannot be used as a slice index on
    # Python 3.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets: test indices grouped by true label.
    # testsets: the same indices grouped by predicted label.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    # Single-argument print calls produce the same output on Python 2 and 3.
    print('accuracy: %s' % (nltk.classify.util.accuracy(classifier, testfeats),))
    print('pos precision: %s' % (nltk.metrics.precision(refsets['pos'], testsets['pos']),))
    print('pos recall: %s' % (nltk.metrics.recall(refsets['pos'], testsets['pos']),))
    print('neg precision: %s' % (nltk.metrics.precision(refsets['neg'], testsets['neg']),))
    print('neg recall: %s' % (nltk.metrics.recall(refsets['neg'], testsets['neg']),))
    classifier.show_most_informative_features()
开发者ID:zhougr1993,项目名称:Bayes_kick_momo_spam,代码行数:28,代码来源:test_sentiment.py
示例7: train
def train(test=False):
    """Train a Naive Bayes sentiment classifier on the movie-review corpus.

    With ``test`` falsy, the classifier is trained on every review and
    returned. With ``test`` truthy, it is trained on the first three quarters
    of each polarity, its accuracy on the remainder and its most informative
    features are printed, and ``None`` is returned.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    if not test:
        return NaiveBayesClassifier.train(negfeats + posfeats)

    # Floor division: a float cutoff cannot be used as a slice index on
    # Python 3.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]
    print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

    classifier = NaiveBayesClassifier.train(trainfeats)
    print('accuracy: %s' % (nltk.classify.util.accuracy(classifier, testfeats),))
    classifier.show_most_informative_features()
开发者ID:jnu,项目名称:texecutions,代码行数:26,代码来源:sentiment.py
示例8: median_approach
def median_approach(llimit, ulimit, isphrase, pathname):
    """Compute per-position medians over a slice of reviews and save them.

    For the positive and then the negative reviews in
    ``fileids[llimit:ulimit]``, ``proximity_tagger.medianlist`` is called
    once per file with the file's absolute path, ``isphrase``, the file's
    index within the slice, a polarity flag (0 for positive, 1 for negative)
    and ``pathname``. The element-wise medians of the returned lists are
    written as JSON (``[positive_medians, negative_medians]``) to
    ``train_result\\proximity_median_train_result_<isphrase>``.
    """
    totalcount = ulimit - llimit

    def collect(category, sign, polarity_flag):
        """Run medianlist over one category's slice, printing progress."""
        print('\nNo of %sve reviews trained : ' % sign)
        gathered = []
        fileids = movie_reviews.fileids(categories=[category])[llimit:ulimit]
        for index, fid in enumerate(fileids):
            gathered.append(proximity_tagger.medianlist(
                movie_reviews.abspath(fid), isphrase, index, polarity_flag, pathname))
            done = index + 1
            print('Training %sve review  %s %s %s %%' % (
                sign, done, '.' * 10, float(done) * 100 / float(totalcount)))
        return gathered

    posmedlist = collect('pos', '+', 0)
    negmedlist = collect('neg', '-', 1)

    # zip(*rows) transposes the per-review lists so each median is taken
    # across reviews; unlike itertools.izip it exists on Python 2 and 3.
    medians = [
        [numpy.median(x) for x in zip(*posmedlist)],
        [numpy.median(x) for x in zip(*negmedlist)],
    ]

    # Same runtime path as before (a literal backslash), written with a valid
    # escape. The with-block closes the file even if json.dump raises.
    with open('train_result\\proximity_median_train_result_' + str(isphrase), 'w') as f:
        json.dump(medians, f)
开发者ID:nidhinbalakrishnan,项目名称:academic-project,代码行数:33,代码来源:review_train.py
示例9: maketrainset
def maketrainset(movie_reviews, tokenizer, stemmer):
    """Return labelled training features for every review in *movie_reviews*.

    Each file id of the ``'neg'`` and then the ``'pos'`` category contributes
    one ``(tokenizer(words, stemmer), label)`` pair, where ``words`` comes
    from ``movie_reviews.words(fileids=[file_id])``. Negative pairs come
    first.
    """
    # Fetch both id lists before extracting any features.
    ids_by_label = [(label, movie_reviews.fileids(label)) for label in ('neg', 'pos')]

    trainfeats = []
    for label, file_ids in ids_by_label:
        for file_id in file_ids:
            words = movie_reviews.words(fileids=[file_id])
            trainfeats.append((tokenizer(words, stemmer), label))
    return trainfeats
开发者ID:askerry,项目名称:FGE_MISC,代码行数:7,代码来源:stimanalysisfuncs.py
示例10: evaluate_features
def evaluate_features(self, feature_extractor, N):
    """Train on a stratified split, print evaluation metrics, return the model.

    The negative and positive file ids are split with
    ``self.stratifiedSplit(..., N)``. Features are extracted for every
    training document with *feature_extractor*, which must return a dict,
    and ``classifier.train`` is called on the result. Accuracy and per-label
    precision / recall / F-measure on the test part are printed, followed by
    the most informative features. Returns the trained classifier.
    """
    self.negative = movie_reviews.fileids('neg')  # list of all names of the documents under neg folder
    self.positive = movie_reviews.fileids('pos')  # list of all names of the documents under pos folder
    self.maintrain, self.maintest = self.stratifiedSplit(self.negative, self.positive, N)

    lst = []
    # Accumulate into a set directly: repeated list concatenation is
    # quadratic, and list + dict.keys() fails on Python 3.
    trainvocabulary = set()
    for doc, lbl in self.maintrain:
        feats = feature_extractor(movie_reviews.words(fileids=[doc]))
        lst.append((feats, lbl))
        trainvocabulary.update(feats.keys())

    # The vocabulary size is recorded on q2_1 only while it is still 0.
    if q2_1.W == 0:
        q2_1.W = len(trainvocabulary)
    print("no. of features in train: %s" % (self.W,))

    nb = classifier.train(lst)
    self.testClassify = self.classifyTest(self.maintest, nb, feature_extractor)

    print("accuracy =  %s" % (accuracy(self.maintest, self.testClassify),))
    print("Negative:")
    print(" precision =  %s" % (self.calcPrec('neg', self.maintest, self.testClassify),))
    print(" recall =  %s" % (self.calcRecall('neg', self.maintest, self.testClassify),))
    print(" f measure =  %s" % (self.calcFMeasur('neg', self.maintest, self.testClassify),))
    print("Positive:")
    print(" precision =  %s" % (self.calcPrec('pos', self.maintest, self.testClassify),))
    print(" recall =  %s" % (self.calcRecall('pos', self.maintest, self.testClassify),))
    print(" f measure =  %s" % (self.calcFMeasur('pos', self.maintest, self.testClassify),))
    nb.show_most_informative_features()
    return nb
开发者ID:atiassa,项目名称:recommend-2011,代码行数:27,代码来源:q2_1.py
示例11: main
def main(argv):
    """Classify stdin lines by sentiment and emit one record per matched topic.

    A Naive Bayes classifier is trained on the whole movie-review corpus
    with the labels ``'negative'`` and ``'positive'``. For each stdin line
    and each topic in the fixed topic list, ``subj(line, topic)`` is called.
    Unless it returns ``"No match"``, a ``LongValueSum:`` record is printed,
    made of the text before the first ``:`` of the line, the matched
    subject, the predicted label and a tab-separated count of 1.

    ``argv`` is accepted but not used by this function.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'negative') for f in negids]
    posfeats = [(word_feats(movie_reviews.words(fileids=[f])), 'positive') for f in posids]
    trainfeats = posfeats + negfeats

    classifier = NaiveBayesClassifier.train(trainfeats)

    topicList = ["media", "sports", "news", "fashion", "finance", "politics"]

    for line in sys.stdin:
        try:
            tolk_posset = word_tokenize(line.rstrip())
            d = word_feats(tolk_posset)
            for topic in topicList:
                subjectFull = subj(line, topic)
                if subjectFull != "No match":
                    print("LongValueSum:" + "" + str(line.split(":")[0]) + "," + subjectFull
                          + "," + classifier.classify(d) + "\t" + "1")
        except Exception:
            # Best effort: skip lines that cannot be processed. A bare
            # "except:" would also swallow KeyboardInterrupt and SystemExit.
            continue
开发者ID:BhavdeepSethi,项目名称:cloudBigData,代码行数:28,代码来源:sentiment.py
示例12: main
def main():
    """Score three sample reviews against several feature-extraction variants.

    Four labelled feature sets are built from the movie-review corpus: plain,
    punctuation-filtered, stop-word-filtered and lemmatized. For each sample
    review and each variant, ``classification`` is called with five fixed
    pairs of trailing arguments, and the result is passed to the variant's
    ``calculateScore*`` function together with the review's words.

    Changes from the previous revision:

    * Each variant's own corpus features are now passed to
      ``classification``. Previously they were computed and discarded, so
      every section trained on the plain features.
    * All five stop-word runs now receive the stop-word-filtered review
      words. Previously only the first one did.
    * Corpus feature extraction and the stop-word lookup are done once
      instead of once per review (or per word).
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')
    to_review1 = "A man with a magnanimous spirit helps a mute girl from Pakistan return home."
    to_review2 = "Forced out of his own company by former Darren Cross, Dr. Hank Pym (Michael Douglas) recruits the talents of Scott Lang (Paul Rudd), a master thief just released from prison. Lang becomes Ant-Man, trained by Pym and armed with a suit that allows him to shrink in size, possess superhuman strength and control an army of ants. The miniature hero must use his new skills to prevent Cross, also known as Yellowjacket, from perfecting the same technology and using it as a weapon for evil."
    to_review3 = '''Parents need to know that kids may clamor to see this fast-paced, action-packed comic book-based adventure. But it's definitely more age-appropriate for teens than younger children. Although much of the violence is clearly meant to be based in the realm of sci-fi and fantasy -- and/or is shown at a distance -- there's plenty of it, from massive explosions to children held at gunpoint to super-powered fistfights. Some of the violence is war themed, and some characters get hurt and/or die. While much is made of lead character Tony Stark's devil-may-care lifestyle of fun and frolic, viewers also see him turn away from the more irresponsible aspects of playboyhood. Language is minimal, and sexual content is more suggested than shown overall -- though there are a few eyebrow-raising moments.'''
    reviews = [to_review1, to_review2, to_review3]

    def corpus_feats(extract):
        """Return (negative, positive) labelled feature lists built with *extract*."""
        neg = [(extract(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        pos = [(extract(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        return neg, pos

    # The corpus features do not depend on the review, so build them once.
    negfeats, posfeats = corpus_feats(word_feats)
    negfeats_punct, posfeats_punct = corpus_feats(word_feats_punctuations)
    negfeats_stop, posfeats_stop = corpus_feats(word_feats_stopwords)
    negfeats_lemma, posfeats_lemma = corpus_feats(word_feats_lemmatize)

    english_stopwords = set(stopwords.words('english'))

    # Trailing argument pairs handed to classification(), in the original order.
    settings = ((1, 1), (1, 0.95), (0.95, 1), (0.9, 1), (1, 0.9))

    def score_all(score, neg, pos, words):
        """Call *score* once per setting with a model built from neg/pos."""
        for third, fourth in settings:
            score(classification(neg, pos, third, fourth), words)

    for to_review in reviews:
        to_review_words = to_review.split(" ")
        print('%s %s %s' % ("Reviewing", to_review, "\n\n\n"))

        print('%s %s' % (''' Normal classification ''', "\n\n"))
        score_all(calculateScore, negfeats, posfeats, to_review_words)

        print('%s %s' % (''' Without Punctuations ''', "\n\n"))
        score_all(calculateScore_punctuations, negfeats_punct, posfeats_punct, to_review_words)

        print('%s %s' % (''' Without Stop Words ''', "\n\n"))
        wordstoreview = [each for each in to_review_words if each not in english_stopwords]
        score_all(calculateScore_stopwords, negfeats_stop, posfeats_stop, wordstoreview)

        print('%s %s' % (''' With Lemmatizer ''', "\n\n"))
        score_all(calculateScore_lemmatizer, negfeats_lemma, posfeats_lemma, to_review_words)
开发者ID:saransh2405,项目名称:sentiment-Analysis-using-Maximum-Entropy-Classification,代码行数:59,代码来源:maxent.py
示例13: __init__
def __init__(self, train1=True, train2=True, train3=True, train4=True):
    """Build ``self.trainfeats`` from up to four sources and train a classifier.

    Each flag enables one training source:

    * ``train1`` - the NLTK movie-review corpus (labels ``'neg'`` / ``'pos'``).
    * ``train2`` - ``out.txt``; the first character of each line is ``'0'``
      (negative) or ``'1'`` (positive) and the text starts at index 2.
    * ``train3`` - the AFINN-111 tab-separated file; entries whose integer
      score is below zero are labelled ``'neg'``, all others ``'pos'``.
    * ``train4`` - a file of ``<label> .:. <text>`` lines. All ``pos`` and
      ``neg`` lines are used, but only the first 200 ``neutral`` lines.

    The combined features train ``self.classifier``, and the 20 most
    informative features are printed.
    """
    self.trainfeats = []

    if train1:
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')
        neg_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        pos_movies = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]
        self.trainfeats = neg_movies + pos_movies

    if train2:
        negfeats = []
        posfeats = []
        # The with-blocks below close each input file; they were leaked before.
        with open("out.txt", "r") as f:
            for line in f:
                status = line[0]
                texto = line[2:]
                if status == '0':
                    negfeats.append((self.word_feats(texto.split(" ")), 'neg'))
                elif status == '1':
                    posfeats.append((self.word_feats(texto.split(" ")), 'pos'))
        self.trainfeats += negfeats + posfeats

    if train3:
        with open("E:\\Workspace\\WS_TG\\analisador1\\AFINN\\AFINN-111.txt", 'r') as f:
            for l in f:
                data = l.strip().split('\t')
                # Pass a list of words, as every other call site does. A bare
                # string would be iterated character by character.
                self.trainfeats.append(
                    (self.word_feats(data[0].split()), 'neg' if int(data[1]) < 0 else 'pos'))

    if train4:
        pos = []
        neutral = []
        neg = []
        with open("E:\\Workspace\\WS_TG\\api\\trainning set.txt", 'r') as f:
            for line in f:
                if line.startswith("pos"):
                    pos.append(line)
                elif line.startswith("neutral"):
                    neutral.append(line)
                elif line.startswith("neg"):
                    neg.append(line)
        print('%s %s %s' % (len(pos), len(neutral), len(neg)))
        total = pos + neutral[:200] + neg
        for line in total:
            data = line.split(' .:. ')
            self.trainfeats.append((self.word_feats(data[1].split()), data[0]))

    self.classifier = NaiveBayesClassifier.train(self.trainfeats)
    # show_most_informative_features() prints the table itself; printing its
    # return value only added a stray "None" line.
    self.classifier.show_most_informative_features(20)
开发者ID:phslfo,项目名称:TGSAT,代码行数:58,代码来源:analisador.py
示例14: setup_demo
def setup_demo(lower):
    """Build word and bigram corpora for the movie-review demo.

    Every corpus file id is prefixed with ``nltk_movie_reviews_data_root``
    and passed to ``create_corpus_from_file_list``.

    Returns a 4-tuple in this order: negative corpus, positive corpus,
    negative bigram corpus, positive bigram corpus.

    NOTE(review): the previous local names (``pos``, ``neg``, ...) suggested
    the opposite order. The order itself is unchanged here; confirm that
    callers expect negative first.
    """
    print('running movie reviews demo. data dir:  %s' % (nltk_movie_reviews_data_root,))

    # Real lists, not map() iterators: each list is consumed twice below, and
    # a Python 3 map object would be exhausted after the first use.
    negative_reviews = [nltk_movie_reviews_data_root + x for x in movie_reviews.fileids('neg')]
    positive_reviews = [nltk_movie_reviews_data_root + x for x in movie_reviews.fileids('pos')]

    negative_corpus = create_corpus_from_file_list(negative_reviews, "negative", None, None, lower)
    positive_corpus = create_corpus_from_file_list(positive_reviews, "positive", None, None, lower)
    negative_bigrams = create_corpus_from_file_list(
        negative_reviews, "negative", None, None, lower, wordlist_to_bigrams_dict)
    positive_bigrams = create_corpus_from_file_list(
        positive_reviews, "positive", None, None, lower, wordlist_to_bigrams_dict)

    return (negative_corpus, positive_corpus, negative_bigrams, positive_bigrams)
开发者ID:gleicon,项目名称:sentiment_analysis,代码行数:9,代码来源:demo_movie_reviews.py
示例15: __init__
def __init__(self, load = False, loadFile = ""):
    """Create the sentiment classifier.

    When ``load`` is truthy the classifier is restored via
    ``self.loadClassifier(loadFile)``. Otherwise a Naive Bayes classifier is
    trained on every negative and positive movie review and stored in
    ``self.classifier``.
    """
    if load:
        self.loadClassifier(loadFile)
    else:
        # Look the ids up once and reuse them. The lists were fetched before
        # but never used, and the corpus was queried a second time.
        negids = movie_reviews.fileids('neg')
        posids = movie_reviews.fileids('pos')

        negfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
        posfeats = [(self.word_feats(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

        trainfeats = negfeats + posfeats
        self.classifier = NaiveBayesClassifier.train(trainfeats)
开发者ID:rzsun,项目名称:Enquire,代码行数:10,代码来源:sentclassifier.py
示例16: bins_svm_approach
def bins_svm_approach(llimit, ulimit, isphrase, pathname):
    """Collect proximity bin vectors for a slice of reviews and save them.

    For the positive and then the negative reviews in
    ``fileids[llimit:ulimit]``, ``proximity_tagger.bin_list`` is called once
    per file with the file's absolute path, ``isphrase``, the file's index
    within the slice, a polarity flag (0 for positive, 1 for negative) and
    ``pathname``. The collected vectors, paired with class labels (1 for
    positive, 0 for negative), are written as JSON
    (``[trainingdata, trainingclass]``) to
    ``train_result\\proximity_bin_train_result_<isphrase>``.
    """
    trainingdata = []
    trainingclass = []
    span = ulimit - llimit

    def scan(category, sign, polarity_flag):
        """Run bin_list over one category's slice, printing progress."""
        print('\nNo of %sve reviews scanned for training : ' % sign)
        bins = []
        fileids = movie_reviews.fileids(categories=[category])[llimit:ulimit]
        for index, fid in enumerate(fileids):
            bins.append(proximity_tagger.bin_list(
                movie_reviews.abspath(fid), isphrase, index, polarity_flag, pathname))
            done = index + 1
            print('Scanning %sve review  %s %s %s %%' % (
                sign, done, '.' * 10, float(done) * 100 / float(span)))
        return bins

    def register(bins, sign, class_value):
        """Append *bins* and one *class_value* per bin to the training lists."""
        total = len(bins)
        print('\nNo of %sve reviews trained : ' % sign)
        trainingdata.extend(bins)
        for done in range(1, total + 1):
            trainingclass.append(class_value)
            print('Training %sve review  %s %s %s %%' % (
                sign, done, '.' * 10, float(done) * 100 / float(total)))

    posbinlist = scan('pos', '+', 0)
    negbinlist = scan('neg', '-', 1)
    register(posbinlist, '+', 1)
    register(negbinlist, '-', 0)

    bin_train_set = [trainingdata, trainingclass]

    # Same runtime path as before (a literal backslash), written with a valid
    # escape. The with-block closes the file even if json.dump raises.
    with open('train_result\\proximity_bin_train_result_' + str(isphrase), 'w') as f:
        json.dump(bin_train_set, f)
开发者ID:nidhinbalakrishnan,项目名称:academic-project,代码行数:55,代码来源:review_train.py
示例17: train_classifiers
def train_classifiers(self):
    """Train ``self.classifier`` on every review of the movie-review corpus.

    Each file's words are turned into features with ``word_feats`` and
    labelled ``'neg'`` or ``'pos'`` according to its category.
    """
    neg_files = movie_reviews.fileids('neg')
    pos_files = movie_reviews.fileids('pos')

    labelled = []
    for file_id in neg_files:
        labelled.append((word_feats(movie_reviews.words(fileids=[file_id])), 'neg'))
    for file_id in pos_files:
        labelled.append((word_feats(movie_reviews.words(fileids=[file_id])), 'pos'))

    # train naive bayes
    self.classifier = NaiveBayesClassifier.train(labelled)
开发者ID:jlburgos,项目名称:DemiseAnalyzer,代码行数:11,代码来源:DemiseAnalyzer.py
示例18: sort_files
def sort_files():
    """
    Interleave the negative and positive review file ids so that samples of
    the two categories alternate (negative first).
    :return:
        files_list
    """
    negatives = movie_reviews.fileids('neg')
    positives = movie_reviews.fileids('pos')

    files_list = []
    # zip stops at the shorter list.
    for neg_id, pos_id in zip(negatives, positives):
        files_list.append(neg_id)
        files_list.append(pos_id)
    return files_list
开发者ID:absorbguo,项目名称:Paddle,代码行数:11,代码来源:sentiment.py
示例19: train
def train(feature):
    """Return a Naive Bayes classifier trained on all movie reviews.

    *feature* is called with each review's word sequence and supplies the
    feature set. Negative reviews are labelled ``'neg'`` and positive ones
    ``'pos'``.
    """
    neg_ids = movie_reviews.fileids('neg')
    pos_ids = movie_reviews.fileids('pos')

    samples = []
    for label, ids in (('neg', neg_ids), ('pos', pos_ids)):
        for file_id in ids:
            samples.append((feature(movie_reviews.words(fileids=[file_id])), label))

    return NaiveBayesClassifier.train(samples)
开发者ID:seanfreiburg,项目名称:chicago_tweet_grabber,代码行数:11,代码来源:analyze_tweets.py
示例20: train
def train(self, feats):
    """Train and evaluate a Naive Bayes classifier on the movie reviews.

    *feats* is the feature extractor applied to each review's words. The
    first three quarters of each polarity are used for training and the
    remainder for testing. Intermediate results (ids, features, cutoffs,
    splits, classifier, reference/predicted index sets) are stored on
    ``self``. Progress timestamps, accuracy and per-label precision and
    recall are printed, and ``self.trained`` is set to True.
    """
    print("Starting to train the data")
    start = datetime.datetime.now()

    print("setting the ids %s" % (datetime.datetime.now(),))
    self.negids = movie_reviews.fileids('neg')
    self.posids = movie_reviews.fileids('pos')

    print("setting the feats %s" % (datetime.datetime.now(),))
    self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg') for f in self.negids]
    self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos') for f in self.posids]

    # Floor division: a float cutoff cannot be used as a slice index on
    # Python 3.
    self.negcutoff = len(self.negfeats) * 3 // 4
    self.poscutoff = len(self.posfeats) * 3 // 4

    print("setting the train/test %s" % (datetime.datetime.now(),))
    self.trainfeats = self.negfeats[:self.negcutoff] + self.posfeats[:self.poscutoff]
    self.testfeats = self.negfeats[self.negcutoff:] + self.posfeats[self.poscutoff:]

    print("training %s" % (datetime.datetime.now(),))
    self.classifier = NaiveBayesClassifier.train(self.trainfeats)

    # refsets: test indices grouped by true label.
    # testsets: the same indices grouped by predicted label.
    self.refsets = defaultdict(set)
    self.testsets = defaultdict(set)

    print("accuracy stuff %s" % (datetime.datetime.now(),))
    # The loop variable is named featureset so it does not shadow the feats
    # extractor parameter.
    for i, (featureset, label) in enumerate(self.testfeats):
        self.refsets[label].add(i)
        observed = self.classifier.classify(featureset)
        self.testsets[observed].add(i)

    end = datetime.datetime.now()
    print("Training lasted for  %s" % (end - start,))

    print('accuracy: %s' % (nltk.classify.util.accuracy(self.classifier, self.testfeats),))
    print('pos precision: %s' % (nltk.metrics.precision(self.refsets['pos'], self.testsets['pos']),))
    print('pos recall: %s' % (nltk.metrics.recall(self.refsets['pos'], self.testsets['pos']),))
    print('neg precision: %s' % (nltk.metrics.precision(self.refsets['neg'], self.testsets['neg']),))
    print('neg recall: %s' % (nltk.metrics.recall(self.refsets['neg'], self.testsets['neg']),))
    self.classifier.show_most_informative_features()
    self.trained = True
开发者ID:crimsonknave,项目名称:moodilate,代码行数:52,代码来源:data_server.py
注:本文中的nltk.corpus.movie_reviews.fileids函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论