本文整理汇总了Python中pyspark.mllib.classification.SVMWithSGD类的典型用法代码示例。如果您正苦于以下问题:Python SVMWithSGD类的具体用法?Python SVMWithSGD怎么用?Python SVMWithSGD使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了SVMWithSGD类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_classification
def test_classification(self):
    """Smoke-test three MLlib classifiers on a tiny sparse (SciPy) dataset.

    Every model must separate the two classes: points labelled 0.0 must
    score <= 0 and points labelled 1.0 must score > 0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    # Two points per class; features are 2-dim SciPy sparse vectors where
    # the active dimension encodes the class.
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]
    # Train each classifier and verify the predicted sign matches the label.
    models = [
        LogisticRegressionWithSGD.train(rdd),
        SVMWithSGD.train(rdd),
        NaiveBayes.train(rdd),
    ]
    for model in models:
        for point, feature in zip(data, features):
            if point.label > 0:
                self.assertTrue(model.predict(feature) > 0)
            else:
                self.assertTrue(model.predict(feature) <= 0)
开发者ID:EronWright,项目名称:spark,代码行数:28,代码来源:tests.py
示例2: test_classification
def test_classification(self):
    """Check that three MLlib classifiers separate a 4-point dense dataset.

    Rows 0 and 2 carry label 0.0 and must predict <= 0; rows 1 and 3 carry
    label 1.0 and must predict > 0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def check(model):
        # Label-0 rows must score <= 0, label-1 rows must score > 0.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    check(LogisticRegressionWithSGD.train(rdd))
    check(SVMWithSGD.train(rdd))
    check(NaiveBayes.train(rdd))
开发者ID:Altiscale,项目名称:OBSOLETE-spark,代码行数:28,代码来源:tests.py
示例3: modelWithSVM
def modelWithSVM(trainingData, validationData):
    """Grid-search the SVM regularization parameter and keep the best model.

    For each regularization value an SVMWithSGD model is fit on trainingData
    and scored by plain accuracy on validationData.

    :param trainingData: RDD of LabeledPoint used for fitting
    :param validationData: RDD of LabeledPoint used for model selection
    :return: (best SVM model, list of (regParam, accuracy) pairs for plotting)
    """
    # eta = [0.1, 0.3, 0.5, 1.0, 5.0]
    reg_grid = [.0000001, 1., 5000., 10000., 200000.]
    num_iterations = 100
    best_model = None
    best_accuracy = 0
    history = []
    for reg in reg_grid:
        candidate = SVMWithSGD.train(trainingData, num_iterations, 1.0, regParam=reg)
        pairs = validationData.map(lambda ad: (ad.label, candidate.predict(ad.features)))
        total = validationData.count()
        correct = pairs.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correct) / total
        history.append((reg, accuracy))
        # Track the model with the best validation accuracy so far.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = candidate
    return best_model, history
开发者ID:Abhishek-Arora,项目名称:Ad-Click-Prediction,代码行数:30,代码来源:predictClick.py
示例4: main
def main():
    """Train an SVM on stock/Twitter CSV data for next-day prediction.

    Command line: argv[1] = input CSV, argv[2] = output path for predictions.
    Five label parsers are prepared but only the next-day-opening target is
    actually trained and printed here.
    """
    stock_file = sys.argv[1]
    output_predict_file = sys.argv[2]  # NOTE(review): unused in this function
    conf = SparkConf().setAppName('Stock Prediction Machine Learning with Twitter')
    sc = SparkContext(conf=conf)
    # NOTE(review): string comparison -- misorders versions like '1.10' vs '1.5.1'
    assert sc.version >= '1.5.1'
    ''' extracting the header of CSV file'''
    file_data_all = sc.textFile(stock_file)
    file_header = file_data_all.first()
    # Drop the header row and cache since the data is mapped five times below.
    file_data = file_data_all.filter(lambda line: line != file_header).cache()
    ''' for five different predictions getting data '''
    parsedFileData_NextDayActualOpening = file_data.map(parseNextDayActualOpening)
    parsedFileData_NextDayActualHigh = file_data.map(parseNextDayActualHigh)
    parsedFileData_NextDayActualLow = file_data.map(parseNextDayActualLow)
    parsedFileData_NextDayActualClose = file_data.map(parseNextDayActualClose)
    parsedFileData_NextDayActualVolume = file_data.map(parseNextDayActualVolume)
    print(parsedFileData_NextDayActualOpening.collect())
    ''' calling SVM with Stochastic Gradient Descent and
    training using our data set '''
    svm_model_nxtdayactopn = SVMWithSGD.train(parsedFileData_NextDayActualOpening, iterations=10)
    # Pair each true label with the model's prediction for inspection.
    lpreds = parsedFileData_NextDayActualOpening.map(lambda line: (line.label, svm_model_nxtdayactopn.predict(line.features)))
    print(lpreds.collect())
开发者ID:gitofsid,项目名称:StocksPrediction-ML,代码行数:29,代码来源:svm_predicts_stocks.py
示例5: trainSVMModel
def trainSVMModel(data):
    """Fit and return an SVMWithSGD classification model (100 iterations).

    :param data: RDD[LabeledPoint]
    :return: trained SVM classification model
    """
    from pyspark.mllib.classification import SVMWithSGD, SVMModel
    return SVMWithSGD.train(data, iterations=100)
开发者ID:theseusyang,项目名称:GEQE,代码行数:9,代码来源:createROC.py
示例6: main
def main():
    """Train an SVM on yearly flight-data CSVs (2007 enabled below) and print
    the misclassification rate measured on the 2008 file.

    Relies on module-level `sc`, `split`, `parseTrain` and `parseTest`
    defined elsewhere in this file. Python 2 only (print statement at end).
    """
    # prepare training data
    # RDDTrainData = sc.textFile('2007_100.csv')
    # sc.textFile accepts a comma-separated list of paths; uncomment years to
    # enlarge the training set.
    RDDTrainData = sc.textFile(','.join([
        # '1987.csv',
        # '1988.csv',
        # '1989.csv',
        # '1990.csv',
        # '1991.csv',
        # '1992.csv',
        # '1993.csv',
        # '1994.csv',
        # '1995.csv',
        # '1996.csv',
        # '1997.csv',
        # '1998.csv',
        # '1999.csv',
        # '2000.csv',
        # '2001.csv',
        # '2002.csv',
        # '2003.csv',
        # '2004.csv',
        # '2005.csv',
        # '2006.csv',
        '2007.csv',
    ]))
    # Drop the CSV header row, then split and parse every record.
    RDDTrainHeader = RDDTrainData.take(1)[0]
    trainData = RDDTrainData.filter(lambda line: line != RDDTrainHeader)\
        .map(split)\
        .map(parseTrain)
    # prepare testing data
    RDDTestData = sc.textFile('2008.csv')
    RDDTestHeader = RDDTestData.take(1)[0]
    testData = RDDTestData.filter(lambda line: line != RDDTestHeader)\
        .map(split)\
        .map(parseTest)
    # do prediction
    # SVM
    model = SVMWithSGD.train(trainData, iterations=100)
    # Logistic Regression
    # model = LogisticRegressionWithLBFGS.train(trainData)
    # Pair true label with prediction for every test record.
    predictionData = testData.map(lambda d:
        (int(d.label), model.predict(d.features))
    )
    # evaluate error rate
    errorCount = predictionData.filter(lambda d: int(d[0]) != int(d[1])).count()
    totalCount = predictionData.count()
    print 'error rate =', errorCount, '/', totalCount, '=', float(errorCount) / float(totalCount)
开发者ID:gocreating,项目名称:big-data-hw4,代码行数:54,代码来源:main.py
示例7: train
def train(sc, file_positive, files_negative, file_output):
    """
    Fit a binary SVM classifier from positive and negative tweet files and
    persist the model with pickle.
    :param sc: The spark context
    :type sc: SparkContext
    :param file_positive: The file with positive tweets (relevant ones)
    :type file_positive: str
    :param files_negative: The files with negative tweets (non-relevant ones)
    :type files_negative: list[str]
    :param file_output: The output where to store the trained model
    :type file_output: str
    """
    # Parse and validate the raw tweets, then attach the class labels.
    positive = (sc.textFile(file_positive)
                .map(parse_json)
                .filter(is_valid)
                .map(parse_positive))
    negatives = [sc.textFile(name).map(parse_json).filter(is_valid).map(parse_negative)
                 for name in files_negative]
    # Merge every negative RDD into the positive one to form the training set.
    data = positive
    for neg in negatives:
        data = data.union(neg)
    try:
        print("Training classification model")
        model = SVMWithSGD.train(data, iterations=150, step=1000.0, regType='l1', regParam=1e-7)
        print("Saving classification model to file")
        pickle.dump(model, open(file_output, 'wb'))
        print("Done!")
    except Exception as e:
        print("Error:")
        print(e)
开发者ID:alialavia,项目名称:TwitterNews,代码行数:31,代码来源:train.py
示例8: gen_predictors
def gen_predictors(training_data):
    """Train one one-vs-rest SVM per entry of the module-level label_map.

    :param training_data: RDD of examples consumed by transform_label
    :return: dict mapping label id -> trained SVMWithSGD model

    Python 2 only (dict.iteritems, print statement).
    """
    classifiers = dict()
    for item in label_map.iteritems():
        print "Gen predictor for label '{0}' ...".format(item[0])
        # NOTE(review): transform_label presumably reads this module-level
        # global to pick the positive class -- confirm. This only works
        # because SVMWithSGD.train evaluates the mapped RDD eagerly before
        # the global is reassigned on the next loop iteration.
        global processed_label
        processed_label = item[1]
        svm = SVMWithSGD.train(training_data.map(transform_label))
        classifiers[item[1]] = svm
    return classifiers
开发者ID:hwaohung,项目名称:spark_train,代码行数:11,代码来源:main.py
示例9: SVM_module
def SVM_module(training):
    """Fit an SVMWithSGD model (300 iterations) on the given training data.

    :param training: (REQUIRED) - the training data
    :return: SVM model

    Use it as (Be sure to call split_data() to get the training data):
    >>> model = SVM_module(trainingData)
    """
    # Train a SVM model with the fixed iteration budget used elsewhere.
    model = SVMWithSGD.train(training, iterations=300)
    return model
开发者ID:Lab41,项目名称:pythia,代码行数:12,代码来源:spark.py
示例10: run_iterations
def run_iterations(parsedData, iter, seed):
    """Run 10 random 70/30 splits, train an SVM on each, print error metrics,
    and plot a curve from the collected per-split rates.

    :param parsedData: RDD[LabeledPoint] of the full dataset
    :param iter: NOTE(review): only used in the log message -- the model is
        always trained with iterations=100
    :param seed: seed passed to randomSplit every loop iteration, so all 10
        splits are identical -- presumably unintended; confirm

    Python 2 only (tuple-unpacking lambdas, print statements).
    """
    fp_rates = []
    tp_rates = []
    # thld_arr = []
    for i in range(0, 10):  # NOTE(review): loop index i is unused (and shadowed below)
        # randomSplit normalizes weights, so [70, 30] behaves like [0.7, 0.3].
        trainingData, testingData = parsedData.randomSplit([70, 30], seed)
        print("For " + str(iter) + " iterations:")
        # Build the model
        model = SVMWithSGD.train(trainingData, iterations=100)
        # Evaluating the model on training data
        labelsAndPreds = trainingData.map(lambda p: (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(trainingData.count())
        MSE = labelsAndPreds.map(lambda(v,p): (v-p)**2).reduce(lambda x, y: x + y)/labelsAndPreds.count()
        print("Training Error = " + str(trainErr))
        print("MSE = " + str(MSE))
        # Re-evaluate on the held-out split (labelsAndPreds is rebound).
        labelsAndPreds = testingData.map(lambda p: (p.label, model.predict(p.features)))
        testErr = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(testingData.count())
        MSE = labelsAndPreds.map(lambda(v,p): (v-p)**2).reduce(lambda x, y: x + y)/labelsAndPreds.count()
        print("Testing Error = " + str(testErr))
        print("MSE = " + str(MSE))
        info = labelsAndPreds.collect()
        actual = [int(i[0]) for i in info]
        predictions = [i[1] for i in info]
        # NOTE(review): if label 1 is the positive class, these formulas are
        # actually the false-negative and true-negative rates (they read as
        # if 0 were "positive") -- confirm the intended label convention.
        false_positive_rate = labelsAndPreds.filter(lambda (v, p): v == 1 and p == 0).count() / float(labelsAndPreds.filter(lambda (v, p): v == 1).count())
        true_positive_rate = labelsAndPreds.filter(lambda (v, p): v == 0 and p == 0).count() / float(labelsAndPreds.filter(lambda (v, p): v == 0).count())
        fpr, tpr, thresholds = roc_curve(actual, predictions)
        # roc_auc = auc(false_positive_rate, true_positive_rate)
        print false_positive_rate
        print true_positive_rate
        fp_rates.append(false_positive_rate)
        tp_rates.append(true_positive_rate)
        print fp_rates
        print tp_rates
    # After the loop: fpr/tpr hold values from the LAST split only.
    roc_auc = auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fp_rates, tp_rates, 'b',
             label='AUC = %0.2f'% roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([-0.1,1.2])
    plt.ylim([-0.1,1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    plt.savefig('fig.png')
开发者ID:snavien,项目名称:Startups,代码行数:53,代码来源:svm.py
示例11: test_classification
def test_classification(self):
    """Sanity-check MLlib classifiers and tree ensembles on a 4-point dataset.

    Every trained model must map the two label-0.0 rows to scores <= 0 and
    the two label-1.0 rows to scores > 0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def assert_separates(model):
        # Even-indexed rows carry label 0.0, odd-indexed rows label 1.0.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    assert_separates(LogisticRegressionWithSGD.train(rdd))
    assert_separates(SVMWithSGD.train(rdd))
    assert_separates(NaiveBayes.train(rdd))

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    assert_separates(DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo))
    assert_separates(RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    assert_separates(GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
开发者ID:greatyan,项目名称:spark,代码行数:51,代码来源:tests.py
示例12: main
def main(sc):
    """Train (or load a cached) SVM on the sample data and dump predictions.

    Reuses a model saved under './model' when present; writes tab-separated
    (label, prediction) pairs to the 'labelsAndPreds' output directory.
    """
    train_data = '/usr/local/spark/data/mllib/sample_svm_data.txt'
    data = sc.textFile(train_data).map(parse)
    # Reuse a previously saved model when one exists; otherwise train and cache.
    if not os.path.exists('model'):
        model = SVMWithSGD.train(data, iterations=100)
        model.save(sc, 'model')
    else:
        model = SVMModel.load(sc, 'model')
    labelsAndPreds = data.map(lambda p: (p.label, model.predict(p.features)))
    # trainErr=labelsAndPreds.filter(lambda (v, p): v != p).count() / float(data.count())
    # print('Training Error =' + str(trainErr))
    labelsAndPreds.map(lambda x: str(x[0]) + '\t' + str(x[1])).saveAsTextFile('labelsAndPreds')
开发者ID:feng1008,项目名称:spark,代码行数:16,代码来源:svm_test.py
示例13: training
def training(path):
    """Build three classifiers (LR, naive Bayes, SVM) from a labelled text file.

    The file is parsed into (label, bag-of-words) records, split 60/40 into
    train/test with a fixed seed, and the training split alone drives
    feature selection.

    :param path: input text file, one labelled document per line
    :return: (lrModel, nbModel, svmModel, labelled test points, features)
    """
    # Load the dataset into an RDD and parse each raw line.
    corpus = sc.textFile(path).map(lambda line: parse_line(line))
    # Fixed-seed 60/40 split keeps runs reproducible.
    train_part, test_part = corpus.randomSplit([0.6, 0.4], 17)
    # Derive the feature vocabulary from the training split only.
    features = feature_extraction(train_part)
    train_points = train_part.map(lambda line: construct_labeled_point(line, features))
    test_points = test_part.map(lambda line: construct_labeled_point(line, features))
    # Fit the three models on the same labelled points.
    lrModel = LogisticRegressionWithLBFGS.train(train_points)
    nbModel = NaiveBayes.train(train_points)
    svmModel = SVMWithSGD.train(train_points)
    return lrModel, nbModel, svmModel, test_points, features
开发者ID:JiayingYu,项目名称:twitter_event_monitor_Spark,代码行数:17,代码来源:app.py
示例14: main
def main(sc):
    """Train an SVM on an input file, report training error, and save the model.

    Command-line args: sys.argv[1] = input data file, sys.argv[2] = output
    model path. Stops the SparkContext when done.

    :param sc: an active SparkContext
    """
    inputFile = sys.argv[1]
    modelPath = sys.argv[2]
    data = sc.textFile(inputFile)
    parsedData = data.map(parsePoint)
    # Build the model
    model = SVMWithSGD.train(parsedData, iterations=100)
    # Evaluating the model on training data.
    # FIX: the tuple-unpacking lambda `lambda (v, p): ...` is Python-2-only
    # syntax; indexing behaves identically and also runs on Python 3.
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))
    # Save and load model
    model.save(sc, modelPath)
    # sameModel = SVMModel.load(sc, "svm_model")
    sc.stop()
开发者ID:feng1008,项目名称:spark,代码行数:19,代码来源:qianka_SVM.py
示例15: train_trend_model
def train_trend_model(self, model, data, i):
    """Train the up/down direction classifier selected by
    self.trend_prediction_method.

    :param model: previous model; its weights warm-start the SGD-based
        methods, and it is returned unchanged when no method matches
    :param data: training examples, parallelized into an RDD here
    :param i: unused in this method (kept for interface compatibility)
    :return: the trained classification model
    """
    self.logger.info('Start to train the direction model')
    rdd_data = self.sc.parallelize(data)
    method = self.trend_prediction_method
    # Guard-clause dispatch: each branch returns its freshly trained model.
    if method == self.RANDOM_FOREST:
        return RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={},
                                            numTrees=40, featureSubsetStrategy="auto",
                                            impurity='gini', maxDepth=20, maxBins=32)
    if method == self.NAIVE_BAYES:
        return NaiveBayes.train(rdd_data)
    if method == self.LOGISTIC_REGRESSION:
        return LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                               initialWeights=None if model is None else model.weights)
    if method == self.SVM:
        return SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                initialWeights=None if model is None else model.weights)
    # Unknown method: hand back the caller's model untouched.
    return model
开发者ID:WarnWang,项目名称:Dissertation,代码行数:19,代码来源:composition_prediction_system.py
示例16: svm_train
def svm_train(sc, top_path, stopwords_dict=None):
    """Train a binary SVM text classifier from two folders of documents.

    top_path must contain exactly two sub-folders, one per class; the folder
    listed second by os.listdir() becomes the positive class (label 1.0).
    The selected features, the IDF table and the trained classifier are
    pickled to 'svm_<pos>_<neg>.pkl' next to this module.

    :param sc: SparkContext used to parallelize the training set
    :param top_path: directory holding exactly two class sub-folders
    :param stopwords_dict: optional flag; when not None, 'stopwords_dict.txt'
        is read instead of the default 'stopwords.txt' (the value itself is
        never used -- only whether it is None)
    :raises OSError: if top_path does not hold exactly two sub-folders
    """
    # Stop-word dictionary hook: drop a custom dictionary file into this
    # module's directory to override the default list.
    curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    if stopwords_dict is None:
        stopwords = set(read_file(os.path.join(curpath, u"stopwords.txt")).split())
    else:
        stopwords = set(read_file(os.path.join(curpath, u"stopwords_dict.txt")).split())
    # Tokenize every document of each class folder, remove stop words and
    # build per-class word-frequency results: {'pos': [counter, ...], 'neg': [...]}
    sub_folder = os.listdir(top_path)
    if len(sub_folder) != 2:
        raise OSError("need and only need two folder")
    top_folder_dict = {}
    for name in sub_folder:
        top_folder_dict[name] = pre_process(os.path.join(top_path, name), stopwords)
    # Keep the topk words that best discriminate the two classes.
    topk = 500
    features = feature_selection(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[0]], topk)
    # Compute the IDF over BOTH classes.
    # BUG FIX: the second argument used to be top_folder_dict[sub_folder[1]]
    # again, so the negative class never contributed to the document
    # frequencies; it now passes the other class as intended.
    IDF = idf(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[0]], features)
    # TF-IDF vector for every document of each class: label -> [vector, ...]
    vector1 = {'1.0': feature_vector(tf(top_folder_dict[sub_folder[1]], features), IDF)}
    vector0 = {'0.0': feature_vector(tf(top_folder_dict[sub_folder[0]], features), IDF)}
    # Convert to Spark's input format: [LabeledPoint(label, vector), ...]
    # (loop variable renamed from `list`, which shadowed the builtin)
    labpoint1 = [LabeledPoint(1.0, vec) for vec in vector1['1.0']]
    labpoint0 = [LabeledPoint(0.0, vec) for vec in vector0['0.0']]
    train_data = labpoint1 + labpoint0
    classifier = SVMWithSGD.train(sc.parallelize(train_data))
    # Persist (features, IDF, classifier) so prediction code can reuse them.
    path = os.path.join(curpath, 'svm_' + sub_folder[1] + '_' + sub_folder[0] + '.pkl')
    if os.path.isfile(path):
        os.remove(path)
    with open(path, 'wb') as output:
        pickle.dump((features, IDF, classifier), output)
开发者ID:hanwei2008,项目名称:Virtual_Environment,代码行数:41,代码来源:svmtrain.py
示例17: train_model
def train_model(training_data, iterations, model_file_path, calculate_error=True):
    """
    Train an SVM model from a text file of data points and save it.

    :param training_data: path of the text file, one data point per line
    :param iterations: number of SGD iterations for SVMWithSGD
    :param model_file_path: directory where the trained model is saved
    :param calculate_error: when True, also compute and print training error
    :return: None
    """
    parsed_data = sc.textFile(training_data).map(parse_point)
    # Build the model
    model = SVMWithSGD.train(parsed_data, iterations=iterations)
    # Save the model
    model.save(sc, model_file_path)
    # FIX: print statements and the tuple-unpacking lambda were
    # Python-2-only syntax; rewritten in a form that also runs on Python 3.
    print("Model saved in: %s" % model_file_path)
    if calculate_error:
        # Compare each true label against the model's prediction.
        labelsAndPreds = parsed_data.map(lambda p: (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsed_data.count())
        print("============Training Error = " + str(trainErr))
开发者ID:cmantas,项目名称:asap.cslab,代码行数:22,代码来源:svm_spark.py
示例18: test_classification
def test_classification(self):
    """Verify that three classifiers and a decision tree separate a sparse
    toy dataset: label-0 points must score <= 0, label-1 points > 0."""
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree
    # 2-dim SciPy sparse features; the active dimension encodes the class.
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]
    expected_positive = [point.label > 0 for point in data]

    def verify(model):
        for feature, positive in zip(features, expected_positive):
            if positive:
                self.assertTrue(model.predict(feature) > 0)
            else:
                self.assertTrue(model.predict(feature) <= 0)

    verify(LogisticRegressionWithSGD.train(rdd))
    verify(SVMWithSGD.train(rdd))
    verify(NaiveBayes.train(rdd))

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    verify(DecisionTree.trainClassifier(rdd, numClasses=2,
                                        categoricalFeaturesInfo=categoricalFeaturesInfo))
开发者ID:drewrobb,项目名称:spark,代码行数:37,代码来源:test_linalg.py
示例19: parsePoint
# Script fragment: flatten Row objects collected from Spark DataFrames into
# plain dicts, train an SVM and print error rates plus ROC/PR areas.
# NOTE(review): `feats_train`, `test` and `sc` are defined earlier in the
# file, outside this excerpt. Python 2 only (print statements and
# tuple-unpacking lambdas below).
train_dict = [i.asDict() for i in feats_train]
feats_test = test.collect()
test_dict = [i.asDict() for i in feats_test]

def parsePoint(d):
    # Convert one feature dict into a LabeledPoint: 'success_class' is the
    # label; every remaining value becomes a float feature.
    d_copy = deepcopy(d)  # I hate using deepcopy so much
    pred = d_copy['success_class']
    d.pop('success_class', None)  # NOTE(review): mutates the caller's dict
    values = [float(x) for x in d.values()]
    return LabeledPoint(pred, map(float, values))

trainParsed = sc.parallelize(map(parsePoint, train_dict))
testParsed = sc.parallelize(map(parsePoint, test_dict))
model = SVMWithSGD.train(trainParsed, iterations=100)
# Training Error
trainLabelsAndPreds = trainParsed.map(lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda (v, p): v != p).count()/float(trainParsed.count())
print trainErr
# Test Error
testLabelsAndPreds = testParsed.map(lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda (v, p): v != p).count()/float(testParsed.count())
print testErr
# Threshold-free quality metrics on the held-out predictions.
metrics = BinaryClassificationMetrics(testLabelsAndPreds)
print metrics.areaUnderROC
print metrics.areaUnderPR
开发者ID:arifyali,项目名称:Yelp,代码行数:31,代码来源:jordan_hive_spark.py
示例20: test_classification
def test_classification(self):
    """End-to-end check of MLlib classifiers: every model must separate the
    toy dataset, and each tree-based model must survive a save/load round
    trip with an identical debug representation."""
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]
    temp_dir = tempfile.mkdtemp()

    def check_predictions(model):
        # Rows 0 and 2 are labelled 0.0; rows 1 and 3 are labelled 1.0.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    def check_roundtrip(model, model_cls, subdir):
        # A persisted model must reload with an identical debug string.
        model_dir = os.path.join(temp_dir, subdir)
        model.save(self.sc, model_dir)
        reloaded = model_cls.load(self.sc, model_dir)
        self.assertEqual(reloaded.toDebugString(), model.toDebugString())

    check_predictions(LogisticRegressionWithSGD.train(rdd, iterations=10))
    check_predictions(SVMWithSGD.train(rdd, iterations=10))
    check_predictions(NaiveBayes.train(rdd))

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
    check_predictions(dt_model)
    check_roundtrip(dt_model, DecisionTreeModel, "dt")

    rf_model = RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
        maxBins=4, seed=1)
    check_predictions(rf_model)
    check_roundtrip(rf_model, RandomForestModel, "rf")

    gbt_model = GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
    check_predictions(gbt_model)
    check_roundtrip(gbt_model, GradientBoostedTreesModel, "gbt")

    try:
        rmtree(temp_dir)
    except OSError:
        pass
开发者ID:HodaAlemi,项目名称:spark,代码行数:75,代码来源:tests.py
注:本文中的pyspark.mllib.classification.SVMWithSGD类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。 |
请发表评论