• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python nltk.NaiveBayesClassifier类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中nltk.NaiveBayesClassifier的典型用法代码示例。如果您正苦于以下问题:Python NaiveBayesClassifier类的具体用法?Python NaiveBayesClassifier怎么用?Python NaiveBayesClassifier使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了NaiveBayesClassifier类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: category_by_movie

def category_by_movie():
    from nltk.corpus import movie_reviews as mr
    from nltk import FreqDist
    from nltk import NaiveBayesClassifier
    from nltk import classify
    from nltk.corpus import names
    from nltk.classify import apply_features
    import random

    documents = [(list(mr.words(f)), c) for c in mr.categories() for f in
mr.fileids(c)]
    random.shuffle(documents)

    all_words = FreqDist(w.lower() for w in mr.words())
    word_features = all_words.keys()[:2000]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains(%s)' % word] = (word in document_words)
        return features

    #print document_features(mr.words('pos/cv957_8737.txt'))
    #print documents[0]

    features = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = features[100:], features[:100]
    classifier = NaiveBayesClassifier.train(train_set)
    print classify.accuracy(classifier, train_set)
开发者ID:brenden17,项目名称:infinity,代码行数:30,代码来源:category_nltk.py


示例2: train_nltk

def train_nltk(data, labels):
    '''
    Returns a trained nltk.NaiveBayesClassifier
    
    Inputs
    ---------
    data -- np.array of tuples
    '''
    # For now, shuffle, since for now assuming that only the post language itself is all that's needed for offensive measure, though in the future, 2 anti-something users may actually not be offended by one another if they are both negative about something
    kf = cv.KFold(n=len(data), n_folds=N_FOLDS, shuffle=True)

    best_model = None
    max_acc = float('-inf')
    for k, (train_index, test_index) in enumerate(kf):
        X_train, Y_train = data[train_index], labels[train_index]
        X_test, Y_test = data[test_index], labels[test_index]

        features_train = bulk_extract_features(X_train)
        features_test = bulk_extract_features(X_test)

        train_set = zip(features_train, Y_train)
        test_set = zip(features_test, Y_test)
        
        model = nbc.train(train_set)

        acc = nltk.classify.accuracy(model, test_set)
        print str(acc)
        if acc > max_acc:
            max_acc = acc
            best_model = model
    best_model.show_most_informative_features(30)
    return best_model
开发者ID:arizonat,项目名称:data-science,代码行数:32,代码来源:insulter.py


示例3: nltk_model

def nltk_model():
    """Fits the (non-parametric) naive Bayes classifier from nltk on the names
    dataset."""
    # each elt of all_names will be a (name, gender) tuple
    all_names = list()

    with open(MALE_FILE, "r") as f:
        for line in f:
            all_names.append((line.rstrip(), "male"))  # rstrip removes trailing whitespace

    with open(FEMALE_FILE, "r") as g:
        for line in g:
            all_names.append((line.rstrip(), "female"))

    # assert stmts can be useful for debugging etc
    assert len(all_names) == 7944

    # shuffle all_names in place
    random.shuffle(all_names)

    # features are ({'feature_type': feature_value}, gender) tuples
    features = [(nltk_featurize(name), gender) for name, gender in all_names]
    split_pt = int(TRAIN_PCT * len(features))

    train_set, test_set = features[:split_pt], features[split_pt:]
    nb = NaiveBayesClassifier.train(train_set)

    print "accuracy = {0} %".format(int(100 * nltk.classify.accuracy(nb, test_set)))
    nb.show_most_informative_features(10)
开发者ID:CBaader,项目名称:science,代码行数:29,代码来源:nbayes.py


示例4: test_raw_mail

def test_raw_mail(org_email):

	features_test = {}
	wordtokens_test = [word_limit.lemmatize(key.lower()) for key in
	word_tokenize(org_email)]
	for key in wordtokens_test:
		if key not in stpwords:
			features_test[key] = True
	return features_test

	#Extracting the features(Tonenized, stemmed and non-stopwords emails) from all the emails
	feature_sets = [(raw_mail(n), g) for (n,g) in mail_shuffle]

	#Splitting the test and training data sets from the whole email set features
	size_feature = int(len(feature_sets) * 0.10)
	train_set, test_set = feature_sets[size_feature:], feature_sets[:size_feature]
	classifier = NaiveBayesClassifier.train(train_set)
	#print (test_set[1:5])

	#Printing the accuracy of the machine
	print ('accuracy of the machine: ', (classify.accuracy(classifier,test_set))*100) 
	
	#Printing the top 50 features
	classifier.show_most_informative_features(50) 

	#Printing the spam and ham labels
	print ('labels:',classifier.labels())

	#Classification of user entered email
	while(True):
		featset = raw_mail(input("Enter text to classify: "))
		print (classifier.classify(featset))
开发者ID:Pooshan,项目名称:Project__spam-and-ham-detection-using-natural-language-processing,代码行数:32,代码来源:NLP-spam-ham.py


示例5: train

 def train(self, training_corpus):
     assert isinstance(training_corpus, (list, tuple))
     assert isinstance(training_corpus[0], dict)
     featureset = [(twit_features(i["text"]), i["polarity"])
                     for i in training_corpus
                     if i["denied"] == 0]
     self.classifier = NaiveBayesClassifier.train(featureset)
开发者ID:yastrov,项目名称:py-tips,代码行数:7,代码来源:SentimentAnalyzerViaNaiveBayes.py


示例6: train

    def train(self):
        """
        """
        catalog = getToolByName(self, "portal_catalog")
        presentNouns = dict()
        trainingData = []
        allNouns = catalog.uniqueValuesFor("noun_terms")
        for item in allNouns:
            presentNouns.setdefault(item, 0)

        subjectIndex = catalog._catalog.getIndex("Subject")
        nounTermsIndex = catalog._catalog.getIndex("noun_terms")

        # The internal catalog ids of the objects
        # that have noun terms in the catalog
        nounTermIndexIds = IISet(nounTermsIndex._unindex.keys())

        # The internal catalog ids of the objects
        # that have subjects in the catalog
        subjectIndexIds = IISet(subjectIndex._unindex.keys())
        commonIds = intersection(subjectIndexIds, nounTermIndexIds)

        for cid in commonIds:
            nounPresence = presentNouns.copy()
            nouns = nounTermsIndex._unindex[cid]
            tags = subjectIndex._unindex[cid]
            for noun in nouns:
                nounPresence[noun] = 1
            for tag in tags:
                trainingData.append((nounPresence, tag))
        if trainingData:
            self.classifier = NaiveBayesClassifier.train(trainingData)
开发者ID:ggozad,项目名称:collective.classification,代码行数:32,代码来源:nounbayesclassifier.py


示例7: category_by_pos

def category_by_pos():
    from nltk.corpus import brown
    from nltk import FreqDist
    from nltk import DecisionTreeClassifier
    from nltk import NaiveBayesClassifier
    from nltk import classify

    suffix_fdist = FreqDist()
    for word in brown.words():
        word = word.lower()
        suffix_fdist.inc(word[-1:])
        suffix_fdist.inc(word[-2:])
        suffix_fdist.inc(word[-3:])

    common_suffixes = suffix_fdist.keys()[:100]
#    print common_suffixes

    def pos_features(word):
        features = {}
        for suffix in common_suffixes:
            features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
        return features

    tagged_words = brown.tagged_words(categories='news')
    featuresets = [(pos_features(n), g) for (n, g) in tagged_words]
    size = int(len(featuresets) * 0.1)
    train_set, test_set = featuresets[size:], featuresets[:size]
#    classifier = DecisionTreeClassifier.train(train_set)
#    print 'Decision Tree %f' % classify.accuracy(classifier, test_set)

    classifier = NaiveBayesClassifier.train(train_set)
    print 'NaiveBay %f' % classify.accuracy(classifier, test_set)
开发者ID:brenden17,项目名称:infinity,代码行数:32,代码来源:category_nltk.py


示例8: get_sentiment_data

def get_sentiment_data(query, training_set):
	train = []
	with open('training/' + training_set + '/training.txt') as f:
		for line in f:
			temp = line.split('\t')
			#print temp
			train.append((get_features(temp[1]), temp[0]))
	clf = NaiveBayesClassifier.train(train)

	tweets = grab_tweets(query)
	print "HERE"
	classified = {}
	for tweet in tweets:
		if tweet.created_at in classified.keys():
			classified[tweet.created_at] = classified[tweet.created_at] + [clf.classify(get_features(tweet.text))]
		else:
			classified[tweet.created_at] = [clf.classify(get_features(tweet.text))]
	print classified

	returndata = {}
	for key in classified:
		#numpos = sum([1 if v=='pos' else 0 for v in classified[key]])
		#returndata[key] = (numpos, len(classified[key]) - numpos) #tuple of positive, negative
		# percent:
		returndata[key] = float(sum([1 if v == '1' else 0 for v in classified[key]]))/len(classified[key])
		#returndata[key] = math.ceil(float(sum([1 if v == '1' else 0 for v in classified[key]]))/len(classified[key])*100)/100.0
	print returndata
	return returndata
开发者ID:2ricecrackerfolder,项目名称:twittermood,代码行数:28,代码来源:tweet_analyzer.py


示例9: get_matrix

def get_matrix(spam_set, ham_set, num_folds):
	'''
	Generate different matrix by taking the average of K Fold data
	'''
	total_precision = total_recall = F1 = spam_accuracy = ham_accuracy = 0

	for train_set, test_spam_set, test_ham_set in utils.get_kfold_data(spam_set, ham_set, num_folds):
		classifier = NaiveBayesClassifier.train(train_set)
		spam_len = len(test_spam_set)
		ham_len = len(test_ham_set)
		true_positive = false_positive = true_negative = false_negative = 0
		for test in test_spam_set:
			features = test[0]
			predicted_label = classifier.classify(features)
			if predicted_label == 0:
				true_positive += 1
			else:
				false_negative += 1
		for test in test_ham_set:
			features = test[0]
			predicted_label = classifier.classify(features)
			if predicted_label == 1:
				true_negative += 1
			else:
				false_positive += 1
												
		precision = true_positive / float(true_positive + false_positive)
		recall = true_positive / float(true_positive + false_negative)
		F1 += (2 * precision * recall) / (precision + recall)
		spam_accuracy += true_positive / float(true_positive + false_negative)
		ham_accuracy += true_negative / float(true_negative + false_positive)
		total_precision += precision
		total_recall += recall

	return total_precision/num_folds, total_recall/num_folds, F1/num_folds, spam_accuracy*100/num_folds, ham_accuracy*100/num_folds
开发者ID:shwetgarg,项目名称:spam_filter,代码行数:35,代码来源:spam_filter.py


示例10: check_classifier

def check_classifier(feature_extractor, **kwargs):
    '''
    Train the classifier on the training spam and ham, then check its accuracy
    on the test data, and show the classifier's most informative features.
    '''
    
    # Make training and testing sets of (features, label) data
    train_set, test_spam, test_ham = \
        make_train_test_sets(feature_extractor, **kwargs)
    
    #===============================================
    # ADD YOUR CODE HERE
    # Train the classifier on the training set (train_set)
    # classifier = /your code/
    # Test accuracy on test spam emails (test_spam) and test ham emails(test_ham)
    # spam_accuracy = /your code/
    # Test accuracy on test ham emails (test_spam) and test ham emails(test_ham)
    # ham_accuracy = /your code/
    #===============================================
    classifier = NaiveBayesClassifier.train(train_set)
    spam_accuracy = nltk.classify.accuracy(classifier, test_spam)
    ham_accuracy = nltk.classify.accuracy(classifier, test_ham)
    
    # How accurate is the classifier on the test sets?
    print ('Test Spam accuracy: {0:.2f}%'
       .format(100 * spam_accuracy))
    print ('Test Ham accuracy: {0:.2f}%'
       .format(100 * ham_accuracy))

    # Show the top 20 informative features
    print classifier.show_most_informative_features(20)
开发者ID:leafsherry,项目名称:UM_F14_EECS445_courseWork,代码行数:31,代码来源:q5solution.py


示例11: __init_naive_bayes

    def __init_naive_bayes( self ):
        """
    	    Create and trains the NaiveBayes Classifier
        """
	try:
#		corpus_no = abs(int(raw_input('Enter the number (1-3) of corpus: ')))
#		while corpus_no == 0 or corpus_no > 3:
#		    corpus_no = abs(int(raw_input('Please the number of corpus from 1 to 2:' )))
		corpus = 'corpus2'#+str(corpus_no)
		
		path = os.path.join('corpora/',corpus)
		spam_path = os.path.join(path,'spam')
		ham_path = os.path.join(path,'ham')
		
		
		spam_dir = os.listdir(spam_path)
		ham_dir = os.listdir(ham_path)
		
		train_spam_filelist = [os.path.join(spam_path, f) for f in spam_dir]
		train_ham_filelist = [os.path.join(ham_path, f) for f in ham_dir]

		spam_size = len(train_spam_filelist)
		ham_size = len(train_ham_filelist)
		
		train_spam_set = self.__make_featured_set(train_spam_filelist,'spam')
		train_ham_set = self.__make_featured_set(train_ham_filelist,'ham')
		train_set = train_spam_set + train_ham_set
		
		self.classifier = NaiveBayesClassifier.train( train_set )

	except:
		    raise Exception( "Unexpected error in SpamFilter: __spamFilter:",sys.exc_info()[0].__name__,\
			os.path.basename( sys.exc_info()[2].tb_frame.f_code.co_filename ),\
			sys.exc_info()[2].tb_lineno, \
			sys.exc_info()[1].message )
开发者ID:leo-pard,项目名称:HealthCare_Twitter_Analysis,代码行数:35,代码来源:spam_filter.py


示例12: train_classifiers

 def train_classifiers(self):
     for word in self.senses:
         train_set = []
         for senseId in self.senses[word]:
             for lsa_vector in self.senses[word][senseId]:
                 train_set.append([dict(lsa_vector), senseId])
         self.classifiers[word] = NaiveBayesClassifier.train(train_set)
开发者ID:phdowling,项目名称:CompLingApplications,代码行数:7,代码来源:TMWSD.py


示例13: buildclassifiers

def buildclassifiers(featureslist, SAMPLE_PROPORTION, n):
	classnames = ['Naive Bayes', 'Logistic Regression', 'Linear SCV']
	allclassifiers = []
	for name in classnames:
		for i in range(n):
			random.shuffle(featureslist)
			train_set, test_set = buildsets(featureslist, SAMPLE_PROPORTION)

			if name == 'Naive Bayes':
				spamclassifier = NaiveBayesClassifier.train(train_set)
			if name == 'Logistic Regression':
				spamclassifier = SklearnClassifier(LogisticRegression())
				spamclassifier.train(train_set)
			if name == 'Linear SCV':
				spamclassifier = SklearnClassifier(LinearSVC(C=0.01))
				spamclassifier.train(train_set)
			perfmeasures_i = evaluate(train_set, test_set, spamclassifier, name)
			if i == 0:
				perfmeasures_n = perfmeasures_i
			else:
				perfmeasures_n = map(add, perfmeasures_n, perfmeasures_i)
	
		# Store last classifier built per model
		allclassifiers.append(spamclassifier)
		
		# Print performance measures per classifier
		printperformance(name, perfmeasures_n, n)	
		
	return allclassifiers
开发者ID:Vermeij,项目名称:Spamfilter,代码行数:29,代码来源:classifyspam.py


示例14: __init__

    def __init__(self, **kwargs):
        super(TimeLogicAdapter, self).__init__(**kwargs)
        from nltk import NaiveBayesClassifier

        self.positive = [
            'what time is it',
            'do you know the time',
            'do you know what time it is',
            'what is the time'
        ]

        self.negative = [
            'it is time to go to sleep',
            'what is your favorite color',
            'i had a great time',
            'what is'
        ]

        labeled_data = (
            [(name, 0) for name in self.negative] +
            [(name, 1) for name in self.positive]
        )

        # train_set = apply_features(self.time_question_features, training_data)
        train_set = [(self.time_question_features(n), text) for (n, text) in labeled_data]

        self.classifier = NaiveBayesClassifier.train(train_set)
开发者ID:Gustavo6046,项目名称:ChatterBot,代码行数:27,代码来源:time_adapter.py


示例15: __init__

    def __init__(self, chatbot, **kwargs):
        super().__init__(chatbot, **kwargs)
        from nltk import NaiveBayesClassifier

        self.positive = kwargs.get('positive', [
            'what time is it',
            'hey what time is it',
            'do you have the time',
            'do you know the time',
            'do you know what time it is',
            'what is the time'
        ])

        self.negative = kwargs.get('negative', [
            'it is time to go to sleep',
            'what is your favorite color',
            'i had a great time',
            'thyme is my favorite herb',
            'do you have time to look at my essay',
            'how do you have the time to do all this'
            'what is it'
        ])

        labeled_data = (
            [(name, 0) for name in self.negative] +
            [(name, 1) for name in self.positive]
        )

        train_set = [
            (self.time_question_features(text), n) for (text, n) in labeled_data
        ]

        self.classifier = NaiveBayesClassifier.train(train_set)
开发者ID:hundredrab,项目名称:ChatterBot,代码行数:33,代码来源:time_adapter.py


示例16: training

def training(features, method, proportion_training):
	training_set = features[:int(proportion_training*len(features))] # we take 2/3 for training and 1/3 for testing
	testing_set = features[int(proportion_training*len(features)):]
				
	if method == 'NaiveBayes':
		classifier = NaiveBayesClassifier.train(training_set)
				
	return training_set, testing_set, classifier
开发者ID:mroussel1,项目名称:SwissRe,代码行数:8,代码来源:news_classification.py


示例17: train

def train(features, samples_proportion):
    train_size = int(len(features) * samples_proportion)
    train_set, test_set = features[:train_size], features[train_size:]
    print ('Training set size = ' + str(len(train_set)) + ' emails')
    print ('Test set size = ' + str(len(test_set)) + ' emails')
    train_set_tuple = tuple(train_set)
    classifier = NaiveBayesClassifier.train(train_set_tuple)
    return train_set, test_set, classifier
开发者ID:amitrai1095,项目名称:Spam-Filter,代码行数:8,代码来源:filter.py


示例18: textClass

def textClass():
    #dbFile = open("samp.txt")
    dbFile = open("all.txt")

    reviews = list() #each list element is a list of words in the review
    ratings = list() #ratings given
    usefulness = list() #review classification

    tot_recs = 0
    len_tot = 0
    mlen = 0

    #parse the file and create the list to be passed to the NBClassifiers
    while tot_recs < 150000:#True:
        if tot_recs % 1000 == 0:
            print "num records:", tot_recs
        tot_recs += 1
        raw_rec = readRec(dbFile)
        if len(raw_rec) == 0:
            break
        review_text = [word.strip(punctuation) for word in raw_rec["text"]]
        rate_val = str( raw_rec["score"][0] )
        
        prs_rec = parse4ftrs(raw_rec)
        len_tot += prs_rec["length"]
        if prs_rec["length"] > mlen:
            mlen = prs_rec["length"]
        use_val = str( prs_rec["class"] )

        #print use_val, rate_val
        #word feature dictionary
        wfd = word_feats(review_text)

        ratings.append( ( wfd  , rate_val)  )
        usefulness.append( ( wfd, use_val)  )

    dbFile.close()
    print "avg length:", len_tot/tot_recs
    print "max len:", mlen
    #select a cutoff for test v training
    #nrecs = len(ratings)
    nrecs = tot_recs
    rate_cl = NaiveBayesClassifier.train(ratings)
    use_cl = NaiveBayesClassifier.train(usefulness)
    return rate_cl, use_cl
开发者ID:adams-n-d,项目名称:Miners,代码行数:45,代码来源:txtCls.py


示例19: evaluate_classifier

def evaluate_classifier(train_set, test_spam, test_ham):
    """ Using NaiveBayesClassifier.train() method from NLTK to train the train_set (spam + ham),
	then classifier is used to evaluate the accuracy of test Spam, Ham. Finally, the most informative 
	features are showed.
	"""
    classifier = NaiveBayesClassifier.train(train_set)
    print ("Test Spam accuracy: {0:.2f} %".format(100 * nltk.classify.accuracy(classifier, test_spam)))
    print ("Test Ham accuracy: {0:.2f} %".format(100 * nltk.classify.accuracy(classifier, test_ham)))
    print classifier.show_most_informative_features(20)
开发者ID:bayramiaa,项目名称:creating-enron-spam-corpus-from-raw-data,代码行数:9,代码来源:bayesFilter_ngrams.py


示例20: train

def train(features, samples_proportion):
    train_size = int(len(features) * samples_proportion)
    # initialise the training and test sets
    train_set, test_set = features[:train_size], features[train_size:]
    print ('Training set size = ' + str(len(train_set)) + ' emails')
    print ('Test set size = ' + str(len(test_set)) + ' emails')
    # train the classifier
    classifier = NaiveBayesClassifier.train(train_set)
    return train_set, test_set, classifier
开发者ID:bharatkashyap,项目名称:clickbait-repel,代码行数:9,代码来源:try.py



注:本文中的nltk.NaiveBayesClassifier类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python nltk.PorterStemmer类代码示例发布时间:2022-05-27
下一篇:
Python nltk.FreqDist类代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap