本文整理汇总了Python中nltk.NaiveBayesClassifier类的典型用法代码示例。如果您正苦于以下问题:Python NaiveBayesClassifier类的具体用法?Python NaiveBayesClassifier怎么用?Python NaiveBayesClassifier使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了NaiveBayesClassifier类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: category_by_movie
def category_by_movie():
    """Train a naive Bayes classifier on the NLTK movie-review corpus.

    Every review is turned into a ``{'contains(word)': bool}`` feature dict
    over the 2000 most frequent (lower-cased) corpus words.  The reviews are
    shuffled, the first 100 are held out, the classifier is trained on the
    rest, and the accuracy on the held-out reviews is printed.
    """
    import random

    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import movie_reviews as mr

    documents = [(list(mr.words(f)), c)
                 for c in mr.categories()
                 for f in mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    # most_common() returns (word, count) pairs in descending frequency;
    # slicing keys() is not possible on Python 3 dict views.
    word_features = [word for word, _count in all_words.most_common(2000)]

    def document_features(document):
        """Return the presence/absence feature dict for one token list."""
        document_words = set(document)
        return {'contains(%s)' % word: (word in document_words)
                for word in word_features}

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    # Evaluate on the held-out split; scoring the training data itself would
    # overstate the accuracy.
    print(classify.accuracy(classifier, test_set))
开发者ID:brenden17,项目名称:infinity,代码行数:30,代码来源:category_nltk.py
示例2: train_nltk
def train_nltk(data, labels):
    '''
    Returns a trained nltk.NaiveBayesClassifier

    Runs shuffled N_FOLDS-fold cross-validation, prints the held-out accuracy
    of every fold, shows the 30 most informative features of the best model
    and returns that model.

    Inputs
    ---------
    data -- np.array of tuples
    labels -- labels aligned with data; must support indexing with the fold
              index arrays (presumably an np.array -- confirm against caller)
    '''
    # Folds are shuffled because, for now, only the post text itself is
    # assumed to matter for the offensiveness measure (not who is talking to
    # whom).
    kf = cv.KFold(n=len(data), n_folds=N_FOLDS, shuffle=True)
    best_model = None
    max_acc = float('-inf')
    for train_index, test_index in kf:
        X_train, Y_train = data[train_index], labels[train_index]
        X_test, Y_test = data[test_index], labels[test_index]
        features_train = bulk_extract_features(X_train)
        features_test = bulk_extract_features(X_test)
        # Materialise the pairs: on Python 3 zip() is a one-shot iterator and
        # the accuracy helper iterates over its gold data more than once.
        train_set = list(zip(features_train, Y_train))
        test_set = list(zip(features_test, Y_test))
        model = nbc.train(train_set)
        acc = nltk.classify.accuracy(model, test_set)
        print(str(acc))
        if acc > max_acc:
            max_acc = acc
            best_model = model
    # best_model stays None only if the splitter produced no folds.
    if best_model is not None:
        best_model.show_most_informative_features(30)
    return best_model
开发者ID:arizonat,项目名称:data-science,代码行数:32,代码来源:insulter.py
示例3: nltk_model
def nltk_model():
    """Fits the (non-parametric) naive Bayes classifier from nltk on the names
    dataset.

    Reads one name per line from MALE_FILE and FEMALE_FILE, shuffles the
    labelled names, trains on the first TRAIN_PCT fraction, then prints the
    accuracy on the remainder and the 10 most informative features.
    """
    # each elt of all_names will be a (name, gender) tuple
    all_names = []
    for path, gender in ((MALE_FILE, "male"), (FEMALE_FILE, "female")):
        with open(path, "r") as handle:
            # rstrip removes the trailing newline / whitespace
            all_names.extend((line.rstrip(), gender) for line in handle)
    # sanity check on the expected size of the names dataset
    assert len(all_names) == 7944
    # shuffle all_names in place
    random.shuffle(all_names)
    # features are ({'feature_type': feature_value}, gender) tuples
    features = [(nltk_featurize(name), gender) for name, gender in all_names]
    split_pt = int(TRAIN_PCT * len(features))
    train_set, test_set = features[:split_pt], features[split_pt:]
    nb = NaiveBayesClassifier.train(train_set)
    accuracy = nltk.classify.accuracy(nb, test_set)
    print("accuracy = {0} %".format(int(100 * accuracy)))
    nb.show_most_informative_features(10)
开发者ID:CBaader,项目名称:science,代码行数:29,代码来源:nbayes.py
示例4: test_raw_mail
def test_raw_mail(org_email):
    """Return the feature dict for one e-mail text.

    The text is tokenized with word_tokenize, each token is lower-cased and
    passed through word_limit.lemmatize, and every resulting token that is not
    in stpwords becomes a key mapped to True.
    """
    lemmas = [word_limit.lemmatize(token.lower())
              for token in word_tokenize(org_email)]
    return {lemma: True for lemma in lemmas if lemma not in stpwords}
# Build (features, label) pairs for every e-mail in the shuffled corpus
feature_sets = [(raw_mail(text), label) for (text, label) in mail_shuffle]

# Hold out the first 10% of the feature sets for testing, train on the rest
size_feature = int(len(feature_sets) * 0.10)
test_set = feature_sets[:size_feature]
train_set = feature_sets[size_feature:]
classifier = NaiveBayesClassifier.train(train_set)

# Report the accuracy on the held-out set as a percentage
print('accuracy of the machine: ', classify.accuracy(classifier, test_set) * 100)

# Show the 50 most informative features
classifier.show_most_informative_features(50)

# Show the labels the classifier knows about
print('labels:', classifier.labels())

# Classify text typed in by the user, forever
while True:
    featset = raw_mail(input("Enter text to classify: "))
    print(classifier.classify(featset))
开发者ID:Pooshan,项目名称:Project__spam-and-ham-detection-using-natural-language-processing,代码行数:32,代码来源:NLP-spam-ham.py
示例5: train
def train(self, training_corpus):
    """Train self.classifier from a corpus of labelled entries.

    training_corpus must be a list or tuple whose first element is a dict.
    Only entries whose "denied" value equals 0 are used; each contributes the
    pair (twit_features(entry["text"]), entry["polarity"]).
    """
    assert isinstance(training_corpus, (list, tuple))
    assert isinstance(training_corpus[0], dict)
    featureset = []
    for entry in training_corpus:
        if entry["denied"] == 0:
            featureset.append(
                (twit_features(entry["text"]), entry["polarity"]))
    self.classifier = NaiveBayesClassifier.train(featureset)
开发者ID:yastrov,项目名称:py-tips,代码行数:7,代码来源:SentimentAnalyzerViaNaiveBayes.py
示例6: train
def train(self):
    """Train self.classifier from the catalog's noun terms and subjects.

    For every catalog object that has both noun terms and subjects, one
    (noun-presence dict, tag) training pair is produced per subject tag.  The
    noun-presence dict maps every known noun to 0, or 1 when the object has
    that noun.  The classifier is only (re)built when there is training data.
    """
    catalog = getToolByName(self, "portal_catalog")

    # Baseline features: every noun known to the catalog, marked absent
    noun_baseline = dict.fromkeys(catalog.uniqueValuesFor("noun_terms"), 0)

    subject_index = catalog._catalog.getIndex("Subject")
    noun_index = catalog._catalog.getIndex("noun_terms")

    # Internal catalog ids of the objects that have noun terms / subjects
    ids_with_nouns = IISet(noun_index._unindex.keys())
    ids_with_subjects = IISet(subject_index._unindex.keys())

    samples = []
    for cid in intersection(ids_with_subjects, ids_with_nouns):
        presence = dict(noun_baseline)
        object_nouns = noun_index._unindex[cid]
        object_tags = subject_index._unindex[cid]
        for noun in object_nouns:
            presence[noun] = 1
        # The same feature dict is paired with each of the object's tags
        samples.extend((presence, tag) for tag in object_tags)

    if samples:
        self.classifier = NaiveBayesClassifier.train(samples)
开发者ID:ggozad,项目名称:collective.classification,代码行数:32,代码来源:nounbayesclassifier.py
示例7: category_by_pos
def category_by_pos():
    """Train a suffix-based naive Bayes part-of-speech tagger on Brown news.

    The 100 most frequent 1-, 2- and 3-character word endings of the Brown
    corpus become boolean ``endswith(suffix)`` features.  The first 10% of the
    tagged news words are held out and the accuracy on them is printed.
    """
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import brown

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        # Count the last one, two and three characters of every word.
        # Item assignment replaces FreqDist.inc(), which NLTK 3 removed.
        for length in (1, 2, 3):
            suffix_fdist[word[-length:]] += 1
    # most_common() yields (suffix, count) pairs in descending frequency
    common_suffixes = [suffix for suffix, _count in suffix_fdist.most_common(100)]

    def pos_features(word):
        """Return the ``endswith(...)`` feature dict for one word."""
        lowered = word.lower()
        return {'endswith(%s)' % suffix: lowered.endswith(suffix)
                for suffix in common_suffixes}

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
    classifier = NaiveBayesClassifier.train(train_set)
    print('NaiveBay %f' % classify.accuracy(classifier, test_set))
开发者ID:brenden17,项目名称:infinity,代码行数:32,代码来源:category_nltk.py
示例8: get_sentiment_data
def get_sentiment_data(query, training_set):
    """Classify tweets matching query and summarise them per timestamp.

    The classifier is trained from ``training/<training_set>/training.txt``;
    each line is split on tabs, field 0 is used as the label and field 1 is
    passed to get_features.  Tweets returned by grab_tweets(query) are then
    classified and grouped by their ``created_at`` value.

    Returns a dict mapping each ``created_at`` value to the fraction of its
    tweets that were classified as '1'.
    """
    train = []
    with open('training/' + training_set + '/training.txt') as f:
        for line in f:
            temp = line.split('\t')
            train.append((get_features(temp[1]), temp[0]))
    clf = NaiveBayesClassifier.train(train)
    tweets = grab_tweets(query)
    print("HERE")
    classified = {}
    for tweet in tweets:
        label = clf.classify(get_features(tweet.text))
        # Group predicted labels by timestamp without rebuilding the list
        classified.setdefault(tweet.created_at, []).append(label)
    print(classified)
    returndata = {}
    for key, labels in classified.items():
        # share of tweets in this group labelled '1'
        positives = sum(1 for v in labels if v == '1')
        returndata[key] = float(positives) / len(labels)
    print(returndata)
    return returndata
开发者ID:2ricecrackerfolder,项目名称:twittermood,代码行数:28,代码来源:tweet_analyzer.py
示例9: get_matrix
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 for a zero denominator."""
    if not denominator:
        return 0.0
    return numerator / float(denominator)


def get_matrix(spam_set, ham_set, num_folds):
    '''
    Compute evaluation metrics averaged over the K folds produced by
    utils.get_kfold_data.

    In every fold a classifier is trained on the fold's training set.  A spam
    test item predicted as label 0 counts as a true positive, a ham test item
    predicted as label 1 counts as a true negative.  Ratios whose denominator
    is zero (e.g. no positive predictions in a fold) contribute 0.0 instead
    of raising ZeroDivisionError.

    Returns the tuple (precision, recall, F1, spam accuracy in percent,
    ham accuracy in percent), each averaged over num_folds.
    '''
    total_precision = total_recall = F1 = spam_accuracy = ham_accuracy = 0
    for train_set, test_spam_set, test_ham_set in utils.get_kfold_data(spam_set, ham_set, num_folds):
        classifier = NaiveBayesClassifier.train(train_set)

        # Each test item carries its feature dict at index 0
        spam_predictions = [classifier.classify(test[0]) for test in test_spam_set]
        true_positive = sum(1 for label in spam_predictions if label == 0)
        false_negative = len(spam_predictions) - true_positive

        ham_predictions = [classifier.classify(test[0]) for test in test_ham_set]
        true_negative = sum(1 for label in ham_predictions if label == 1)
        false_positive = len(ham_predictions) - true_negative

        precision = _safe_ratio(true_positive, true_positive + false_positive)
        recall = _safe_ratio(true_positive, true_positive + false_negative)
        F1 += _safe_ratio(2 * precision * recall, precision + recall)
        # Accuracy on the spam test items is the same ratio as recall
        spam_accuracy += recall
        ham_accuracy += _safe_ratio(true_negative, true_negative + false_positive)
        total_precision += precision
        total_recall += recall
    return total_precision/num_folds, total_recall/num_folds, F1/num_folds, spam_accuracy*100/num_folds, ham_accuracy*100/num_folds
开发者ID:shwetgarg,项目名称:spam_filter,代码行数:35,代码来源:spam_filter.py
示例10: check_classifier
def check_classifier(feature_extractor, **kwargs):
    '''
    Train the classifier on the training spam and ham, then check its accuracy
    on the test data, and show the classifier's most informative features.

    feature_extractor and **kwargs are forwarded unchanged to
    make_train_test_sets, which supplies the training set and the spam / ham
    test sets of (features, label) data.
    '''
    # Make training and testing sets of (features, label) data
    train_set, test_spam, test_ham = \
        make_train_test_sets(feature_extractor, **kwargs)

    # Train on the combined training set, then score spam and ham separately
    classifier = NaiveBayesClassifier.train(train_set)
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)

    # How accurate is the classifier on the test sets?
    print('Test Spam accuracy: {0:.2f}%'
          .format(100 * spam_accuracy))
    print('Test Ham accuracy: {0:.2f}%'
          .format(100 * ham_accuracy))

    # Show the top 20 informative features.  The method prints its own report
    # and returns None, so wrapping it in print would emit a stray "None".
    classifier.show_most_informative_features(20)
开发者ID:leafsherry,项目名称:UM_F14_EECS445_courseWork,代码行数:31,代码来源:q5solution.py
示例11: __init_naive_bayes
def __init_naive_bayes( self ):
    """
    Create and train the NaiveBayes classifier.

    Every file under corpora/corpus2/spam and corpora/corpus2/ham is turned
    into a labelled feature set with __make_featured_set and the combined
    set is used to train self.classifier.

    Raises Exception with the original error's type name, file name, line
    number and message when anything in that process fails.
    """
    try:
        corpus = 'corpus2'
        path = os.path.join('corpora/', corpus)
        spam_path = os.path.join(path, 'spam')
        ham_path = os.path.join(path, 'ham')
        train_spam_filelist = [os.path.join(spam_path, f)
                               for f in os.listdir(spam_path)]
        train_ham_filelist = [os.path.join(ham_path, f)
                              for f in os.listdir(ham_path)]
        train_spam_set = self.__make_featured_set(train_spam_filelist, 'spam')
        train_ham_set = self.__make_featured_set(train_ham_filelist, 'ham')
        train_set = train_spam_set + train_ham_set
        self.classifier = NaiveBayesClassifier.train( train_set )
    except Exception as err:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate untouched.
        trace = sys.exc_info()[2]
        # str(err) replaces the .message attribute, which Python 3
        # exceptions do not have.
        raise Exception( "Unexpected error in SpamFilter: __spamFilter:",
                         type(err).__name__,
                         os.path.basename( trace.tb_frame.f_code.co_filename ),
                         trace.tb_lineno,
                         str(err) )
开发者ID:leo-pard,项目名称:HealthCare_Twitter_Analysis,代码行数:35,代码来源:spam_filter.py
示例12: train_classifiers
def train_classifiers(self):
    """Train one naive Bayes classifier per word in self.senses.

    For each word, every LSA vector stored under a sense id becomes one
    [feature dict, sense id] training example; the trained classifier is
    stored in self.classifiers under that word.
    """
    for word in self.senses:
        vectors_by_sense = self.senses[word]
        examples = [
            [dict(vector), sense_id]
            for sense_id in vectors_by_sense
            for vector in vectors_by_sense[sense_id]
        ]
        self.classifiers[word] = NaiveBayesClassifier.train(examples)
开发者ID:phdowling,项目名称:CompLingApplications,代码行数:7,代码来源:TMWSD.py
示例13: buildclassifiers
def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
    """Train and evaluate three classifier types, n runs each.

    For every model name, featureslist is shuffled in place and re-split with
    buildsets(featureslist, SAMPLE_PROPORTION) n times; the performance
    measures returned by evaluate() are summed element-wise over the runs and
    handed to printperformance together with n.

    Returns a list holding the last classifier trained for each model, in the
    order Naive Bayes, Logistic Regression, Linear SVC.
    """
    classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SCV']
    allclassifiers = []
    for name in classnames:
        for i in range(n):
            random.shuffle(featureslist)
            train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)
            if name == 'Naive Bayes':
                spamclassifier = NaiveBayesClassifier.train(train_set)
            elif name == 'Logistic Regression':
                spamclassifier = SklearnClassifier(LogisticRegression())
                spamclassifier.train(train_set)
            elif name == 'Linear SCV':
                spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
                spamclassifier.train(train_set)
            perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
            if i == 0:
                perfmeasures_n = perfmeasures_i
            else:
                # list() keeps the running totals concrete; on Python 3 a bare
                # map() would build a chain of lazy iterators instead.
                perfmeasures_n = list(map(add, perfmeasures_n, perfmeasures_i))
        # Store last classifier built per model
        allclassifiers.append(spamclassifier)
        # Print performance measures per classifier
        printperformance(name, perfmeasures_n, n)
    return allclassifiers
开发者ID:Vermeij,项目名称:Spamfilter,代码行数:29,代码来源:classifyspam.py
示例14: __init__
def __init__(self, **kwargs):
    """Set up the adapter and train its time-question classifier.

    The classifier is trained on a small built-in set of sentences: those in
    self.positive are labelled 1, those in self.negative are labelled 0.
    Features come from self.time_question_features.
    """
    super(TimeLogicAdapter, self).__init__(**kwargs)
    from nltk import NaiveBayesClassifier

    # Sentences that ask for the current time
    self.positive = [
        'what time is it',
        'do you know the time',
        'do you know what time it is',
        'what is the time'
    ]

    # Sentences that do not ask for the current time
    self.negative = [
        'it is time to go to sleep',
        'what is your favorite color',
        'i had a great time',
        'what is'
    ]

    labeled_data = [(sentence, 0) for sentence in self.negative]
    labeled_data.extend((sentence, 1) for sentence in self.positive)

    train_set = []
    for sentence, label in labeled_data:
        train_set.append((self.time_question_features(sentence), label))

    self.classifier = NaiveBayesClassifier.train(train_set)
开发者ID:Gustavo6046,项目名称:ChatterBot,代码行数:27,代码来源:time_adapter.py
示例15: __init__
def __init__(self, chatbot, **kwargs):
    """Set up the adapter and train its time-question classifier.

    Keyword arguments:
    positive -- sentences labelled 1 (asking for the time); defaults to a
                built-in list
    negative -- sentences labelled 0 (not asking for the time); defaults to
                a built-in list

    Features for each sentence come from self.time_question_features.
    """
    super().__init__(chatbot, **kwargs)
    from nltk import NaiveBayesClassifier

    self.positive = kwargs.get('positive', [
        'what time is it',
        'hey what time is it',
        'do you have the time',
        'do you know the time',
        'do you know what time it is',
        'what is the time'
    ])

    self.negative = kwargs.get('negative', [
        'it is time to go to sleep',
        'what is your favorite color',
        'i had a great time',
        'thyme is my favorite herb',
        'do you have time to look at my essay',
        # The comma after the next entry matters: without it Python joins
        # the two adjacent string literals into a single training sentence.
        'how do you have the time to do all this',
        'what is it'
    ])

    labeled_data = (
        [(name, 0) for name in self.negative] +
        [(name, 1) for name in self.positive]
    )

    train_set = [
        (self.time_question_features(text), n) for (text, n) in labeled_data
    ]

    self.classifier = NaiveBayesClassifier.train(train_set)
开发者ID:hundredrab,项目名称:ChatterBot,代码行数:33,代码来源:time_adapter.py
示例16: training
def training(features, method, proportion_training):
    """Split features into training/testing parts and train a classifier.

    features -- sequence of labelled feature sets; it is sliced, not shuffled
    method -- name of the classifier to train; only 'NaiveBayes' is supported
    proportion_training -- fraction of features (from the start) used for
                           training; the remainder becomes the testing set

    Returns the tuple (training_set, testing_set, classifier).
    Raises ValueError for an unsupported method.
    """
    # Fail early with a clear message; previously an unknown method reached
    # the return statement with classifier never assigned.
    if method != 'NaiveBayes':
        raise ValueError("unsupported training method: %r" % (method,))
    split_at = int(proportion_training * len(features))
    training_set = features[:split_at]
    testing_set = features[split_at:]
    classifier = NaiveBayesClassifier.train(training_set)
    return training_set, testing_set, classifier
开发者ID:mroussel1,项目名称:SwissRe,代码行数:8,代码来源:news_classification.py
示例17: train
def train(features, samples_proportion):
    """Split features and train a naive Bayes classifier on the first part.

    The first int(len(features) * samples_proportion) items form the training
    set and the rest the test set; both sizes are printed.  The classifier is
    trained on a tuple copy of the training set.

    Returns the tuple (train_set, test_set, classifier).
    """
    cut = int(len(features) * samples_proportion)
    train_set = features[:cut]
    test_set = features[cut:]
    train_count = str(len(train_set))
    test_count = str(len(test_set))
    print('Training set size = ' + train_count + ' emails')
    print('Test set size = ' + test_count + ' emails')
    classifier = NaiveBayesClassifier.train(tuple(train_set))
    return train_set, test_set, classifier
开发者ID:amitrai1095,项目名称:Spam-Filter,代码行数:8,代码来源:filter.py
示例18: textClass
def textClass():
    """Train two naive Bayes classifiers from the review records in all.txt.

    Up to 150000 records are read with readRec.  Each record's words are
    stripped of punctuation and turned into a feature dict by word_feats; that
    dict is paired with the record's score (as a string) for the rating
    classifier, and with the class computed by parse4ftrs (as a string) for
    the usefulness classifier.  Progress, average length and maximum length
    are printed along the way.

    Returns the tuple (rating classifier, usefulness classifier).
    """
    ratings = []     # (word features, rating given)
    usefulness = []  # (word features, review classification)
    tot_recs = 0
    len_tot = 0
    mlen = 0
    # The with-block closes the file even when parsing a record fails
    with open("all.txt") as dbFile:
        # parse the file and create the lists to be passed to the classifiers
        while tot_recs < 150000:
            if tot_recs % 1000 == 0:
                print("num records: %s" % (tot_recs,))
            raw_rec = readRec(dbFile)
            if len(raw_rec) == 0:
                break
            review_text = [word.strip(punctuation) for word in raw_rec["text"]]
            rate_val = str(raw_rec["score"][0])
            prs_rec = parse4ftrs(raw_rec)
            len_tot += prs_rec["length"]
            if prs_rec["length"] > mlen:
                mlen = prs_rec["length"]
            use_val = str(prs_rec["class"])
            # word feature dictionary
            wfd = word_feats(review_text)
            ratings.append((wfd, rate_val))
            usefulness.append((wfd, use_val))
            # Count a record only after it has been read and parsed, so the
            # empty read that ends the file does not inflate the total.
            tot_recs += 1
    avg_len = len_tot / tot_recs if tot_recs else 0
    print("avg length: %s" % (avg_len,))
    print("max len: %s" % (mlen,))
    rate_cl = NaiveBayesClassifier.train(ratings)
    use_cl = NaiveBayesClassifier.train(usefulness)
    return rate_cl, use_cl
开发者ID:adams-n-d,项目名称:Miners,代码行数:45,代码来源:txtCls.py
示例19: evaluate_classifier
def evaluate_classifier(train_set, test_spam, test_ham):
    """Train a NaiveBayesClassifier on train_set (spam + ham) and report on it.

    Prints the classifier's accuracy on test_spam and on test_ham as
    percentages, then shows its 20 most informative features.
    """
    classifier = NaiveBayesClassifier.train(train_set)
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)
    print("Test Spam accuracy: {0:.2f} %".format(100 * spam_accuracy))
    print("Test Ham accuracy: {0:.2f} %".format(100 * ham_accuracy))
    # The method prints its own report and returns None, so it is called
    # directly; wrapping it in print would emit a stray "None".
    classifier.show_most_informative_features(20)
开发者ID:bayramiaa,项目名称:creating-enron-spam-corpus-from-raw-data,代码行数:9,代码来源:bayesFilter_ngrams.py
示例20: train
def train(features, samples_proportion):
    """Split features and train a naive Bayes classifier on the first part.

    The first int(len(features) * samples_proportion) items form the training
    set and the rest the test set; both sizes are printed.

    Returns the tuple (train_set, test_set, classifier).
    """
    boundary = int(len(features) * samples_proportion)
    # everything before the boundary trains, everything after tests
    train_set = features[:boundary]
    test_set = features[boundary:]
    train_count = str(len(train_set))
    test_count = str(len(test_set))
    print('Training set size = ' + train_count + ' emails')
    print('Test set size = ' + test_count + ' emails')
    # fit the classifier on the training portion only
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier
开发者ID:bharatkashyap,项目名称:clickbait-repel,代码行数:9,代码来源:try.py
注:本文中的nltk.NaiveBayesClassifier类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论