本文整理汇总了Python中sklearn.naive_bayes.MultinomialNB类的典型用法代码示例。如果您正苦于以下问题:Python MultinomialNB类的具体用法?Python MultinomialNB怎么用?Python MultinomialNB使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了MultinomialNB类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: train
def train(self):
    """Train a MultinomialNB comment-type classifier and persist it to
    ``self.path`` with joblib.

    Gathers the newest ``self.comment_limit`` comment bodies for each of
    the three entity types (tid 1, 2, 3), featurizes them via
    ``self.featureMatrix`` and fits the classifier.

    How to predict with a saved model::

        query = "blah blah"
        q = list2Vec(hashit(query))   # fixed: old example hashed the
        clf2 = joblib.load('nb')      # undefined name ``q`` via list2vec
        print(clf2.predict(q))        # returns the type id
    """
    limit = self.comment_limit
    # Newest `limit` comment bodies per entity type (tid 1..3).
    sqls = ["SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=1 ORDER BY time DESC LIMIT " + str(limit),
            "SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=2 ORDER BY time DESC LIMIT " + str(limit),
            "SELECT body FROM comment JOIN entity ON comment.eid = entity.eid WHERE entity.tid=3 ORDER BY time DESC LIMIT " + str(limit)]
    print "training model"
    comments = self.sql2list(sqls)
    x, y = self.featureMatrix(comments)
    X = list2Vec(x)
    Y = list2Vec(y)
    q = "Let's talk about food."
    q_vec = list2Vec(hashit(q))  # NOTE(review): q_vec is never used below — dead code?
    ## Predicting
    print "Classifying"
    # Defaults spelled out: Laplace smoothing (alpha=1.0), class priors learned from data.
    clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
    clf.fit(X, Y)
    joblib.dump(clf, self.path, compress=9)
开发者ID:WangWenjun559,项目名称:Weiss,代码行数:28,代码来源:typeTrain.py
示例2: MultinomialNBClassify_Proba
def MultinomialNBClassify_Proba(enrollment_id, trainData, trainLabel, testData):
    """Fit a multinomial Naive Bayes model and save/return P(class == 1).

    :param enrollment_id: ids written alongside the probabilities.
    :param trainData/trainLabel: training matrix and labels.
    :param testData: matrix to score.
    :return: 1-D array of P(class == 1) for each test row.

    BUG FIX: the result filename advertises alpha=0.1 and the sibling
    MultinomialNBClassify uses alpha=0.1, but this function silently fell
    back to the default alpha=1.0; pass alpha=0.1 explicitly so the saved
    artifact name matches the model that produced it.
    """
    nbClf = MultinomialNB(alpha=0.1)  # alpha < 1 is Lidstone smoothing
    nbClf.fit(trainData, ravel(trainLabel))
    # Column 1 of predict_proba is the probability of the positive class.
    testLabel = nbClf.predict_proba(testData)[:, 1]
    saveResult(enrollment_id, testLabel, 'Proba_sklearn_MultinomialNB_alpha=0.1_Result.csv')
    return testLabel
开发者ID:ElvisKwok,项目名称:code,代码行数:7,代码来源:test.py
示例3: crossValidate
def crossValidate(X_dataset,y):
#cross validate model
num_folds = 5
kfold = cross_validation.StratifiedKFold(y, n_folds=num_folds, shuffle=True)
# kfold=KFold(X.shape[0],n_folds=10, shuffle=True)
avg_accuracy=0
avg_precision=0
avg_recall=0
print "----------- cross_validation k=5"
for train,test in kfold:
Xtrain,Xtest,ytrain,ytest=X_dataset[train],X_dataset[test],y[train],y[test]
# clf=LinearSVC()
clf=MultinomialNB(alpha=0.1)
# clf=LDA()
clf.fit(Xtrain.toarray(),ytrain)
ypred=clf.predict(Xtest.toarray())
accuracy=metrics.accuracy_score(ytest,ypred)
# print "accuracy = ", accuracy
avg_accuracy+=accuracy
precision = metrics.precision_score(ytest,ypred)
# print("precision: %0.3f" % precision)
avg_precision+=precision
recall = metrics.recall_score(ytest,ypred)
# print("recall: %0.3f" % recall)
avg_recall+=recall
print "Average accuracy : " , (avg_accuracy/num_folds)
print "Average precision : " , (avg_precision/num_folds)
print "Average recall : " , (avg_recall/num_folds)
开发者ID:ananya11,项目名称:CS5614_projects,代码行数:31,代码来源:CrossValidation.py
示例4: naive_bayes
def naive_bayes():
    """Fit MultinomialNB on the module-level train split and evaluate it.

    Returns (precision, recall, accuracy-as-string) computed on the
    module-level test split.
    """
    model = MultinomialNB().fit(X_train, train_data.danger)
    predictions = model.predict(X_test)
    accuracy = model.score(X_test, y_test)
    precision, recall = precision_recall_fscore_support(y_test, predictions)[:2]
    return precision, recall, str(accuracy)
开发者ID:ilyaaltshteyn,项目名称:danger_tweets,代码行数:7,代码来源:classify4.py
示例5: classify_reviews
def classify_reviews():
import featurizer
import gen_training_data
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
data = gen_training_data.gen_data();
stemmed_data = featurizer.stem(data);
tfidf= featurizer.tfidf(data);
clf = MultinomialNB().fit(tfidf['train_tfidf'], data['training_labels']);
predicted = clf.predict(tfidf['test_tfidf']);
num_wrong = 0;
tot = 0;
for expected, guessed in zip(data['testing_labels'], predicted):
if(expected-guessed != 0):
num_wrong += 1;
print("num_wrong: %d",num_wrong)
sgd_clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42);
_ = sgd_clf.fit(tfidf['train_tfidf'], data['training_labels']);
sgd_pred = sgd_clf.predict(tfidf['test_tfidf']);
print np.mean(sgd_pred == data['testing_labels']);
stem_tfidf = featurizer.tfidf(stemmed_data);
_ = sgd_clf.fit(stem_tfidf['train_tfidf'], data['training_labels']);
sgd_stem_prd = sgd_clf.predict(stem_tfidf['test_tfidf']);
print np.mean(sgd_stem_prd==data['testing_labels']);
开发者ID:JT17,项目名称:445Project,代码行数:29,代码来源:classifier.py
示例6: run_naivebayes_evaluation
def run_naivebayes_evaluation(self, inputdata, outputdata, k):
""" Fit Naive Bayes Classification on train set with cross validation.
Run Naive Bayes Classificaiton on test set. Return results
"""
###print "** Fitting Naive Bayes classifier.."
# Cross validation
cv = cross_validation.KFold(inputdata.shape[0], n_folds=k, indices=True)
cv_naivebayes = []
f1_scores = []
for traincv, testcv in cv:
clf_cv = MultinomialNB()
clf_cv.fit(inputdata[traincv], outputdata[traincv])
y_pred_cv = clf_cv.predict(inputdata[testcv])
f1 = metrics.f1_score(outputdata[testcv], y_pred_cv, pos_label=0)
f1_scores.append(f1)
#TODO: NEEDED? self.classifier = clf_cv
print "score average: %s" + str(np.mean(f1_scores))
average_score =np.mean(f1_scores)
tuples = (average_score, f1_scores)
return (tuples, 'N.A.', 'N.A.')
开发者ID:sagieske,项目名称:scriptie,代码行数:29,代码来源:start_nb.py
示例7: train
def train(good_sources, bad_sources, method, naive_bayes=None, keywords=list()):
    """Train (or incrementally extend) a 'good' vs 'bad' keyword model.

    Builds a 2-row count matrix (row 0 = good, row 1 = bad) over the union
    of keywords mined from both source sets and fits a fresh MultinomialNB.

    :param good_sources/bad_sources: iterables of dict-like entries; the
        text mined is ``entry[method]``.
    :param method: key selecting which field of each entry to use.
    :param naive_bayes: optional previously trained model whose per-class
        feature counts are carried over as the starting matrix.
    :param keywords: keyword list aligned with the old model's columns.
        NOTE(review): mutable default argument — only read here, but
        callers should not rely on it.
    :return: (fitted MultinomialNB, keyword list aligned with its columns)
    """
    good_samples = find_keywords(' '.join([entry[method] for entry in good_sources]))
    bad_samples = find_keywords(' '.join([entry[method] for entry in bad_sources]))
    # If we have an existing knowledge base to append this new information to, do so.
    if naive_bayes:
        new_kws = set(good_samples+bad_samples)
        print('Using old keywords as well')
        print("# old keywords = {}\n # new keywords = {}".format(len(keywords),len(new_kws)))
        # Keep only keywords not already known to the old model.
        new_kws = set(good_samples+bad_samples).difference(keywords)
        print("# fresh keywords = {}\n".format(len(new_kws)))
        # TODO: make some call to naive_bayes.partial_fit in here
        # Start from the old model's per-class feature counts and widen with
        # zero columns for the newly discovered keywords.
        X = np.concatenate((naive_bayes.feature_count_, np.zeros((naive_bayes.feature_count_.shape[0],len(new_kws)))),1)
        all_kw = keywords + list(new_kws)
    else:
        print('Only using keywords from this content set')
        all_kw = list(set(good_samples+bad_samples))
        X = np.zeros((2,len(all_kw)))
    # Add this batch's keyword occurrence counts on top of any carried-over counts.
    for j,kw in enumerate(all_kw):
        X[0,j] += good_samples.count(kw)
        X[1,j] += bad_samples.count(kw)
    y = ['good','bad']
    # NOTE(review): always a fresh estimator — the old model object is
    # discarded; only its accumulated counts survive via X.
    naive_bayes = MultinomialNB()
    naive_bayes.fit(X,y)
    return naive_bayes, all_kw
开发者ID:pfdamasceno,项目名称:shakespeare,代码行数:33,代码来源:shakespeare.py
示例8: main
def main():
    """End-to-end build script: train a tweet sentiment classifier plus a
    word2vec model and a 2-D PCA projection, then persist all artifacts
    under the module-level ``path``.
    """
    print('Reading in data file...')
    data = pd.read_csv(path + 'Sentiment Analysis Dataset.csv',
                       usecols=['Sentiment', 'SentimentText'], error_bad_lines=False)
    print('Pre-processing tweet text...')
    corpus = data['SentimentText']
    # tf-idf over unicode-normalized, English-stopword-filtered tokens.
    vectorizer = TfidfVectorizer(decode_error='replace', strip_accents='unicode',
                                 stop_words='english', tokenizer=tokenize)
    X = vectorizer.fit_transform(corpus.values)
    y = data['Sentiment'].values
    print('Training sentiment classification model...')
    classifier = MultinomialNB()
    classifier.fit(X, y)
    print('Training word2vec model...')
    corpus = corpus.map(lambda x: tokenize(x))
    word2vec = Word2Vec(corpus.tolist(), size=100, window=4, min_count=10, workers=4)
    # Normalize vectors in place; per gensim docs this makes the model read-only.
    word2vec.init_sims(replace=True)
    print('Fitting PCA transform...')
    word_vectors = [word2vec[word] for word in word2vec.vocab]
    pca = PCA(n_components=2)  # 2-D projection, presumably for visualization
    pca.fit(word_vectors)
    print('Saving artifacts to disk...')
    joblib.dump(vectorizer, path + 'vectorizer.pkl')
    joblib.dump(classifier, path + 'classifier.pkl')
    joblib.dump(pca, path + 'pca.pkl')
    word2vec.save(path + 'word2vec.pkl')
    print('Process complete.')
开发者ID:jdwittenauer,项目名称:twitter-viz-demo,代码行数:33,代码来源:build_models.py
示例9: text_classifly_twang
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    """Train/test a MultinomialNB text classifier with feature selection.

    Loads a load_files-style corpus, keeps the ``fs_num`` best terms under
    ``fs_method``, builds binary count vectors restricted to those terms,
    and returns accuracy on a 20% held-out split.
    """
    print 'Loading dataset, 80% for training, 20% for testing...'
    movie_reviews = load_files(dataset_dir_name)
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
    print 'Feature selection...'
    print 'fs method:' + fs_method, 'fs num:' + str(fs_num)
    vectorizer = CountVectorizer(binary = True)
    word_tokenizer = vectorizer.build_tokenizer()
    doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in doc_str_list_train]
    # Keep only the fs_num highest-ranked terms under the chosen selection method.
    term_set_fs = feature_selection.feature_selection(doc_terms_list_train, doc_class_list_train, fs_method)[:fs_num]
    print 'Building VSM model...'
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    # NOTE(review): forcing the vocabulary by poking fixed_vocabulary /
    # vocabulary_ relies on old scikit-learn internals; newer versions take
    # CountVectorizer(vocabulary=term_dict) — confirm the pinned version.
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary_ = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec= vectorizer.transform(doc_str_list_test)
    clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit the MultinomialNB classifier
    doc_test_predicted = clf.predict(doc_test_vec)
    acc = np.mean(doc_test_predicted == doc_class_list_test)
    print 'Accuracy: ', acc
    return acc
开发者ID:ZHAOTING,项目名称:WebDataMining_Kaggle,代码行数:26,代码来源:feature_selection_test.py
示例10: naive_classify_unknown
def naive_classify_unknown(X_train, y_train, vectorizer):
    """Train MultinomialNB on (X_train, y_train) and classify every
    distinct tweet author found in the local Mongo 'tweets' database.
    """
    db = pymongo.MongoClient("localhost", 27017).tweets
    model = MultinomialNB()
    model.fit(X_train, y_train)
    screen_names = db.tweets.distinct('user.screen_name')
    classify_users(model, vectorizer, screen_names, load_users(db, screen_names))
开发者ID:vojnovski,项目名称:mktweets,代码行数:7,代码来源:train.py
示例11: __init__
class NaiveBayes:
def __init__(self):
self.clf = MultinomialNB()
self.pattern ='(?u)\\b[A-Za-z]{3,}'
self.tfidf = TfidfVectorizer(sublinear_tf=False, use_idf=True, smooth_idf=True, stop_words='english', token_pattern=self.pattern, ngram_range=(2,2))
def train(self,fileName):
print "Naive Bayes classifier is being trained"
table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
X_train = self.tfidf.fit_transform(table.message)
Y_train = []
for item in table.cat:
Y_train.append(int(item))
self.clf.fit(X_train, Y_train)
self.clf.fit(X_train, Y_train)
print "Naive Bayes classifier has been trained"
def classify(self,cFileName, rFileName):
table = pandas.read_table(cFileName, names=["message"])
X_test = self.tfidf.transform(table.message)
print "Data have been classified"
with open(rFileName,'w') as f:
for item in self.clf.predict(X_test).astype(str):
f.write(item+'\n')
def validate(self,fileName):
table = pandas.read_table(fileName, sep="\t", names=["cat", "message"])
X_validate = self.tfidf.transform(table.message)
Y_validated = self.clf.predict(X_validate).astype(str)
totalNum = len(table.cat)
errorCount = 0
for i in range(0,totalNum):
if int(table.cat[i])!=int(Y_validated[i]):
errorCount += 1
print "Data have been validated! Precision={}".format((totalNum-errorCount)/float(totalNum))
开发者ID:richelite,项目名称:classify,代码行数:35,代码来源:lib.py
示例12: bcluster
def bcluster(corpus_path, cluster_fn):
    """10-fold CV accuracy of MultinomialNB on Brown-cluster features.

    Returns the list of per-fold accuracy scores.
    """
    folds = KFold(article_count, n_folds=10, shuffle=True)
    scores = []
    for fold_no, (train_idx, test_idx) in enumerate(folds):
        logging.info("Running fold %d" % fold_no)
        vect = BrownClusterVectorizer(cluster_fn)
        encoder = LabelEncoder()
        x_train = vect.fit_transform(ArticleSequence(corpus_path, indices=train_idx))
        y_train = encoder.fit_transform(GroupSequence(corpus_path, indices=train_idx))
        x_test = vect.transform(ArticleSequence(corpus_path, indices=test_idx))
        y_test = encoder.transform(GroupSequence(corpus_path, indices=test_idx))
        model = MultinomialNB().fit(x_train, y_train)
        fold_score = accuracy_score(y_test, model.predict(x_test))
        logging.info("Completed fold %d with score %.04f" % (fold_no, fold_score))
        scores.append(fold_score)
    return scores
开发者ID:andrely,项目名称:sublexical-features,代码行数:25,代码来源:newsgroups.py
示例13: plain_word_counts
def plain_word_counts(corpus_path):
    """10-fold CV accuracy of MultinomialNB on plain 1000-term count vectors.

    Returns the list of per-fold accuracy scores.
    """
    folds = KFold(article_count, n_folds=10, shuffle=True)
    scores = []
    for fold_no, (train_idx, test_idx) in enumerate(folds):
        logging.info("Running fold %d" % fold_no)
        vect = CountVectorizer(max_features=1000, decode_error='ignore', strip_accents='unicode')
        encoder = LabelEncoder()
        x_train = vect.fit_transform(ArticleSequence(corpus_path, indices=train_idx))
        y_train = encoder.fit_transform(GroupSequence(corpus_path, indices=train_idx))
        x_test = vect.transform(ArticleSequence(corpus_path, indices=test_idx))
        y_test = encoder.transform(GroupSequence(corpus_path, indices=test_idx))
        model = MultinomialNB().fit(x_train, y_train)
        fold_score = accuracy_score(y_test, model.predict(x_test))
        logging.info("Completed fold %d with score %.04f" % (fold_no, fold_score))
        scores.append(fold_score)
    return scores
开发者ID:andrely,项目名称:sublexical-features,代码行数:25,代码来源:newsgroups.py
示例14: find_best_vectorizor
def find_best_vectorizor(vectorizer, grid):
    """Grid-search vectorizer parameters, scoring each with MultinomialNB.

    Prints one CSV line per parameter combination and finally the best
    combination and score found.
    """
    dg = DataGatherer()
    y_train = dg.labeled_target
    y_test = dg.validate_target
    nb = MultinomialNB()
    best_params = None
    best_score = -1
    header_printed = False
    for param in IterGrid(grid):
        # Emit the CSV header once, using the first combination's keys.
        if not header_printed:
            print(str(",".join(param.keys())) + ",Score")
            header_printed = True
        vectorizer.set_params(**param)
        X_train = vectorizer.fit_transform(dg.labeled_data)
        X_test = vectorizer.transform(dg.validate_data)
        nb.fit(X_train, y_train)
        score = nb.score(X_test, y_test)
        if score > best_score:
            best_score, best_params = score, param
        print(str(",".join(map(str, param.values()))) + "," + str(score))
    print("")
    print("Best params: " + str(best_params))
    print("Best score: " + str(best_score))
开发者ID:Web5design,项目名称:big-data,代码行数:25,代码来源:naive_bayes_optimizer.py
示例15: __init__
class Sentiment:
def __init__(self):
self.stop_words = stopwords.words() + list(string.punctuation)
self.tfid = TfidfVectorizer()
self.clf = MultinomialNB()
# score: 0.7225
# self.clf = SVC()
# create pipelines
# clean the input
def fit(self, X, Y):
self.X = X
self.Y = Y
# give the subset of dataset to be trained
l = 0
h = 4000
words = [word_tokenize(x.decode("utf-8").lower()) for x in X[l:h]]
processed_words = [" ".join(w for w in s if w not in self.stop_words) for s in words]
X_train = self.tfid.fit_transform(processed_words)
Y_train = Y[l:h]
self.clf.fit(X_train, Y_train)
print "Classes: ", self.clf.classes_
print "Score: ", self.clf.score(X_train, Y_train)
def predict(self, X_inp):
word_list = " ".join(w for w in word_tokenize(X_inp.decode("utf-8").lower()) if w not in self.stop_words)
X_test = self.tfid.transform([word_list])
return self.clf.predict(X_test)
开发者ID:abijith-kp,项目名称:DataMining_NLP_AI,代码行数:29,代码来源:sentiment.py
示例16: MultinomialNBClassify
def MultinomialNBClassify(trainData, trainLabel, testData):
    """Fit MultinomialNB with Lidstone smoothing and classify testData.

    Predictions are saved to a CSV via saveResult and also returned.
    """
    # Default is alpha=1.0 (Laplace); setting alpha < 1 is Lidstone smoothing.
    model = MultinomialNB(alpha=0.1)
    model.fit(trainData, ravel(trainLabel))
    predicted = model.predict(testData)
    saveResult(predicted, 'sklearn_MultinomialNB_alpha=0.1_Result.csv')
    return predicted
开发者ID:ElvisKwok,项目名称:code,代码行数:7,代码来源:digit_recognizer.py
示例17: string_selection
def string_selection():
    """Chi-squared feature selection over app string data, then report
    MultinomialNB accuracy on a 20% held-out split.
    """
    vectorizer = CountVectorizer(decode_error='ignore')
    selector = SelectKBest(chi2, k=100)

    # Load and split the data.
    train_data, permission_list = db_tool.get_new_train_data()
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        train_data['string-data'], train_data['target'], test_size=0.2,
        random_state=1)

    # Vectorize, then keep the 100 best features by chi^2.
    x_train = vectorizer.fit_transform(x_train)
    feature_names = vectorizer.get_feature_names()
    x_train = selector.fit_transform(x_train, y_train)
    feature_names = [feature_names[i] for i in selector.get_support(indices=True)]
    print(selector.scores_)
    print(selector.get_support(indices=True))
    print(feature_names)

    # Apply the same vectorizer + selection to the test split.
    x_test = selector.transform(vectorizer.transform(x_test))

    # Build and validate the model.
    model = MultinomialNB().fit(x_train, y_train)
    predicted = model.predict(x_test)
    print (metrics.accuracy_score(y_test, predicted))
开发者ID:psuedoelastic,项目名称:android_malware_detection,代码行数:29,代码来源:mine_apk_category.py
示例18: train_chunk
def train_chunk(X, Y, Xe, Ye):
    """Fit MultinomialNB on (X, Y), predict Xe and return stats() vs Ye.

    (KNeighbors and GaussianNB were tried here previously and abandoned.)
    """
    model = MultinomialNB()
    model.fit(X, Y)
    predictions = model.predict(Xe)
    return stats(Ye, predictions)
开发者ID:BigBull90,项目名称:anon,代码行数:7,代码来源:wordLen.py
示例19: __init__
class TrainNaiveBayes:
def __init__(self, all_features, neu_labels):
"""
Trains a classifier using Naive Bayes
"""
self._num_features = len(all_features.values()[0])
self._X = numpy.zeros((1, self._num_features)) # Feature matrix
self._Y = numpy.array([0]) # Label vector
for user_id in neu_labels.keys():
self._X = numpy.append(self._X, [all_features[user_id]], axis=0)
self._Y = numpy.append(self._Y, [neu_labels[user_id]])
self._X = numpy.delete(self._X, 0, 0) # Delete the first row (contains all 0s)
self._Y = numpy.delete(self._Y, 0)
print "Using MultinomialNB"
self._model = MultinomialNB()
print cross_validation.cross_val_score(self._model, self._X, self._Y, cv=10, scoring='f1')
self._model.fit(self._X, self._Y)
def predict(self, features):
A = numpy.zeros((1, self._num_features))
for user_id in features.keys():
A = numpy.append(A, [features[user_id]], axis=0)
A = numpy.delete(A, 0, 0)
return self._model.predict(A)
开发者ID:artir,项目名称:cl2_project,代码行数:28,代码来源:train_naive_bayes.py
示例20: train
def train(self, data):
    """Train two app-launch predictors (previous and previous-previous app)
    and tune the interpolation weight mu on the training sequence.

    BUG FIX: the original created ONE MultinomialNB and assigned
    ``nb.fit(...)`` to both ``self.lu1_predictor`` and
    ``self.lu2_predictor``. Since fit() returns self, both attributes
    aliased the same estimator, which ended up fit only on the lu2
    features. Use two independent estimators.

    NOTE(review): self.vectorizer is also refit for the lu2 features, so
    the lu1 predictor's feature mapping depends on self.predict handling
    that — confirm against the predict implementation.
    """
    launches = map(lambda x: x['application'], data)

    # Predictor 1: next app given the immediately previous app.
    instances = map(lambda i: {'lu1': launches[i-1]}, xrange(1, len(launches)))
    X = self.vectorizer.fit_transform(instances).toarray()
    self.lu1_predictor = MultinomialNB().fit(X, launches[1:])

    # Predictor 2: next app given the app before the previous one.
    instances = map(lambda i: {'lu2': launches[i-2]}, xrange(2, len(launches)))
    X = self.vectorizer.fit_transform(instances).toarray()
    self.lu2_predictor = MultinomialNB().fit(X, launches[2:])

    # Tune mu (interpolation weight) by hit rate on the training sequence.
    max_hr = 0
    best_mu = 0
    for mu in map(lambda x: x / 10.0, xrange(11)):
        self.mu = mu
        predictions = map(lambda i: self.predict({'lu1': launches[i-1], 'lu2': launches[i-2]}),
                          xrange(2, len(launches)))
        hr, mrr = self.test(launches[2:], predictions)
        if hr > max_hr:
            max_hr = hr
            best_mu = mu
    self.mu = best_mu
开发者ID:nodestory,项目名称:ApeicServer,代码行数:26,代码来源:lu_predictor.py
注:本文中的sklearn.naive_bayes.MultinomialNB类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论