本文整理汇总了Python中pyspark.mllib.tree.DecisionTreeModel类的典型用法代码示例。如果您正苦于以下问题：Python DecisionTreeModel类的具体用法？Python DecisionTreeModel怎么用？Python DecisionTreeModel使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了DecisionTreeModel类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: evaluate_model
def evaluate_model(type):
    """Load the persisted model selected by ``type``.

    Args:
        type: ``'logistic'``, ``'tree'`` or ``'rf'``. Any other value loads
            nothing. (The name shadows the builtin, but it is the public
            parameter name and is kept for callers.)

    The model is loaded through the free name ``sc``, which must be defined
    by the enclosing module.

    NOTE(review): in the visible excerpt the loaded ``model`` is neither used
    nor returned -- confirm against the full source file.
    """
    if type == 'logistic':
        model = LogisticRegressionModel.load(sc, "logit_model.model")
    elif type == 'tree':
        model = DecisionTreeModel.load(sc, "dt_model.model")
    elif type == 'rf':
        model = RandomForestModel.load(sc, "rf_model.model")
开发者ID:ayushsagar,项目名称:big-data-analytics,代码行数:7,代码来源:score.py
示例2: loadModel
def loadModel():
    """Load the persisted clustering and classification models.

    Both paths are read from ``pv`` and both models are loaded through the
    free name ``sc``.

    Returns:
        A ``(clusterModel, classificationModel)`` tuple.
    """
    clusterModel = KMeansModel.load(sc, pv.clusterModelPath)
    classificationModel = DecisionTreeModel.load(sc, pv.classificationModelPath)

    if pv.outputDebugMsg:
        Utils.logMessage("\nLoad cluster & classification model finished")

    # The models are returned whether or not debug output is enabled.
    return clusterModel, classificationModel
开发者ID:yfliu87,项目名称:VestAccountDetection,代码行数:7,代码来源:vestAccountMain.py
示例3: predict_proba
def predict_proba(rf_model, testRDD, num_classes=10):
    """Count, per record, how many trees of the forest vote for each class.

    Args:
        rf_model: forest model exposing ``_java_model.trees()``.
        testRDD: RDD handed to each tree's ``predict``.
        num_classes: number of class labels pre-seeded with a zero count.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        An RDD with one dict per record, mapping class label -> vote count.
    """
    def add_vote(pair):
        # ``pair`` is (vote-count dict, prediction of the current tree).
        votes, prediction = pair
        votes[int(prediction)] += 1
        return votes

    empty_votes = dict.fromkeys(range(num_classes), 0)
    # Every record starts with its own zeroed copy of the vote table.
    scoresRDD = testRDD.map(lambda _: empty_votes.copy())

    for tree in rf_model._java_model.trees():
        tree_predictions = DecisionTreeModel(tree).predict(testRDD)
        scoresRDD = scoresRDD.zip(tree_predictions).map(add_vote)
    return scoresRDD
开发者ID:Erin-Boehmer,项目名称:MIDS_tinytags,代码行数:18,代码来源:MLProcessing.py
示例4: saveModel
def saveModel(self, path="trained"):
    """Persist the trained tree, reload it from storage and re-evaluate.

    Args:
        path: location used for both the save and the reload. Defaults to
            ``"trained"``, the value that was previously hard-coded twice.
    """
    # Save the model to the given path.
    self.tree_model.save(self.sc, path)
    # Re-load the saved model so evaluation runs on the persisted copy.
    self.tree_model = DecisionTreeModel.load(self.sc, path)
    # Re-evaluate.
    self.evaluate()
开发者ID:cjzamora,项目名称:machine-learning,代码行数:9,代码来源:DecisionTree.py
示例5: main
def _print_accuracy(model_name, paired_rdd, total):
    """Print the boxed prediction accuracy of ``paired_rdd`` for ``model_name``.

    ``paired_rdd`` holds two-element records; a record counts as correct when
    both elements are equal. ``total`` is the number of scored records.
    """
    matches = paired_rdd.filter(lambda pair: pair[0] == pair[1]).count()
    # Guard the empty-input case instead of dividing by zero.
    accuracy = matches / float(total) if total else 0.0
    print_box()
    print("Prediction Accuracy ({}): {}".format(model_name, round(accuracy, 4)))
    print_box()


def main(sc, filename):
    '''
    The driver for the spark scoring application, it generates predictions for
    a given file of features and target variables
    '''
    rawDataRdd = sc.textFile(filename)
    print("Data Size: {}".format(rawDataRdd.count()))
    labeledPointsRdd = rawDataRdd.map(parse_lines)
    # Counted once and reused as the denominator of every accuracy figure.
    total = labeledPointsRdd.count()
    featuresRdd = labeledPointsRdd.map(lambda p: p.features)
    labelsRdd = labeledPointsRdd.map(lambda p: p.label)

    # load models
    logit_model = LogisticRegressionModel.load(sc, "logit_model.model")
    dt_model = DecisionTreeModel.load(sc, "dt_model.model")
    rf_model = RandomForestModel.load(sc, "rf_model.model")

    # logistic predictions: records are (prediction, label)
    labels_and_preds = labeledPointsRdd.map(
        lambda p: (float(logit_model.predict(p.features)), p.label))
    print("\n")
    print("Predictions: Logistic Regression")
    _print_accuracy("Logistic", labels_and_preds, total)
    print("\n")

    # decision tree predictions: records are (label, prediction)
    labels_and_preds_dt = labelsRdd.zip(dt_model.predict(featuresRdd))
    _print_accuracy("Decision Tree", labels_and_preds_dt, total)
    print("\n")

    # random forest predictions: records are (label, prediction)
    labels_and_preds_rf = labelsRdd.zip(rf_model.predict(featuresRdd))
    _print_accuracy("Random Forest", labels_and_preds_rf, total)
开发者ID:ayushsagar,项目名称:big-data-analytics,代码行数:56,代码来源:score.py
示例6: test
def test(sc):
    """Classify three sample recordings with the saved tree and forest models.

    Loads both models through ``sc``, prints the decision tree's debug string,
    then for every file prints its feature string followed by the prediction
    of each model and the matching entry of ``classes``.
    """
    files = ["sounds/flushing/20150227_193109-flushing-04.wav",
             "sounds/bike/20150227_193806-bici-14.wav",
             "sounds/blender/20150227_193606-licuadora-14.wav"
             ]

    rfmodel = RandomForestModel.load(sc, RF_PATH)
    dtmodel = DecisionTreeModel.load(sc, DT_PATH)
    print(dtmodel.toDebugString())

    for f in files:
        vec = audio.showFeatures(f)
        # split() without an argument also tolerates runs of whitespace.
        testfeatures = Vectors.dense([float(x) for x in vec.split()])
        print(vec)
        for label, model in (("DT", dtmodel), ("RF", rfmodel)):
            pred = model.predict(testfeatures)
            # One formatted string prints the same text on Python 2 and 3.
            print("{} Prediction is {} {}".format(label, str(pred), classes[int(pred)]))
开发者ID:LoadedCoders,项目名称:iHear,代码行数:18,代码来源:main.py
示例7: get_probs_classify
def get_probs_classify(model, data):
    """Average the predictions of every tree in ``model`` over ``data``.

    Args:
        model: forest model exposing ``_java_model.trees()`` and ``numTrees()``.
        data: RDD handed to each tree's ``predict``.

    Returns:
        An RDD holding, per record, the sum of all tree predictions divided
        by the number of trees.
    """
    # Collect the individual decision trees as JavaArray objects.
    trees = model._java_model.trees()
    ntrees = model.numTrees()
    scores = DecisionTreeModel(trees[0]).predict(data)

    # For each remaining tree, predict on the entire dataset and add the
    # result to the running per-record sum.
    for i in range(1, ntrees):
        tree_scores = DecisionTreeModel(trees[i]).predict(data)
        scores = scores.zip(tree_scores).map(lambda x: x[0] + x[1])

    # Divide the accumulated scores by the number of trees; float() keeps the
    # division fractional regardless of the prediction type.
    return scores.map(lambda x: x / float(ntrees))
开发者ID:beatriceliang,项目名称:POPREU,代码行数:14,代码来源:stargalaxy.py
示例8: init_spark_context
def init_spark_context():
    """Create the SparkContext, load the prediction model and add data files.

    Sets the module-level global ``predictionModel`` as a side effect.

    Returns:
        The newly created ``SparkContext``.
    """
    global predictionModel

    # load spark context
    conf = SparkConf().setAppName("movie_recommendation-server")
    # IMPORTANT: pass additional Python modules to each worker
    sc = SparkContext(conf=conf, pyFiles=['webapp.py', 'service_func.py'])

    # absolute path in hdfs
    # to run locally, remove first slash '/' i.e my_model1, not /my_model1
    predictionModel = DecisionTreeModel.load(sc, '/my_model1')

    # Register every conv/*.p file with the context, in the original order.
    for file_name in ('6.p', '7.p', '8.p', '10.p', '12.p', '36.p'):
        sc.addFile('conv/' + file_name)
    return sc
开发者ID:IcedNecro,项目名称:AWO-61-backend,代码行数:22,代码来源:server.py
示例9: predict_proba
def predict_proba(rf_model, data):
    '''
    This wrapper overcomes the "binary" nature of predictions in the native
    RandomForestModel by averaging the predictions of the individual trees.

    Args:
        rf_model: forest model exposing ``_java_model.trees()`` and ``numTrees()``.
        data: RDD of rows with ``SearchID``, ``AdID``, ``Position``,
            ``ObjectType`` and ``HistCTR`` attributes.

    Returns:
        An RDD holding, per row, the sum of all tree predictions divided by
        the number of trees.
    '''
    # Collect the individual decision tree models by calling the underlying
    # Java model. These are returned as JavaArray defined by py4j.
    trees = rf_model._java_model.trees()
    ntrees = rf_model.numTrees()

    # Build the feature vectors once instead of repeating the lambda per tree.
    features = data.map(
        lambda row: [float(row.SearchID), float(row.AdID), float(row.Position),
                     float(row.ObjectType), float(row.HistCTR)])

    scores = DecisionTreeModel(trees[0]).predict(features)

    # For each remaining decision tree, apply its prediction to the entire
    # dataset and accumulate the results using 'zip'.
    for i in range(1, ntrees):
        tree_scores = DecisionTreeModel(trees[i]).predict(features)
        scores = scores.zip(tree_scores).map(lambda x: x[0] + x[1])

    # Divide the accumulated scores over the number of trees; float() keeps
    # the division fractional regardless of the prediction type.
    return scores.map(lambda x: x / float(ntrees))
开发者ID:abhishek-ch,项目名称:evolveML,代码行数:21,代码来源:Predict.py
示例10: LabeledPoint
nonLable = clean_line_split[1:]
return LabeledPoint (label, nonLable)
parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(training, categoricalFeaturesInfo={},
                                    impurity='variance', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# The held-out split is named ``test`` here; the previous ``testData`` was an
# undefined name. Pairs are indexed because tuple-unpacking lambda parameters
# do not exist in Python 3.
testMSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() /\
    float(test.count())

print('Time consumed = {}'.format(datetime.now() - startTime))
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DTR-Wide-2008")
sameModel = DecisionTreeModel.load(sc, "DTR-Wide-2008")
sc.stop()
开发者ID:bsangee,项目名称:spark_vs_r,代码行数:30,代码来源:decision_tree_regression.py
示例11: SparkContext
# $example off$
if __name__ == "__main__":

    sc = SparkContext(appName="PythonDecisionTreeRegressionExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                        impurity='variance', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index the (label, prediction) pair: tuple-unpacking lambda parameters
    # do not exist in Python 3.
    testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
        float(testData.count())
    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
    # $example off$
开发者ID:0xqq,项目名称:spark,代码行数:30,代码来源:decision_tree_regression_example.py
示例12: LabeledPoint
#Cancelled becomes the 6th column now, and total columns in the data = 6
label = clean_line_split[5]
nonLable = clean_line_split[0:5]
return LabeledPoint (label, nonLable)
parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair: tuple-unpacking lambda parameters do
# not exist in Python 3.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())

print('Time consumed = {}'.format(datetime.now() - startTime))
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DT-Class-N-95-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-N-95-08")
sc.stop()
开发者ID:bmewing,项目名称:spark_vs_r,代码行数:30,代码来源:decision_tree_classification-narrow.py
示例13: getModel
def getModel(self, path):
    """Load the saved model that matches ``self.type`` from ``path``.

    Args:
        path: location of the persisted model.

    Returns:
        The loaded ``NaiveBayesModel`` or ``DecisionTreeModel``, or ``None``
        when ``self.type`` is neither ``'NaiveBayes'`` nor ``'DecisionTree'``.
    """
    if self.type == 'NaiveBayes':
        return NaiveBayesModel.load(self.sc, path)
    elif self.type == 'DecisionTree':
        return DecisionTreeModel.load(self.sc, path)
    # Unknown classifier type: the previously implicit result, made explicit.
    return None
开发者ID:aprando,项目名称:master-thesis-social-recsys,代码行数:5,代码来源:Classifier.py
示例14: SparkContext
.setAppName("Mlib")
.set("spark.executor.memory", "1g"))
sc = SparkContext(conf=conf)

# Four ways of writing the vector (1.0, 0.0, 3.0): a NumPy array, a plain
# list, an MLlib sparse vector and a SciPy single-column CSC matrix.
dv1 = np.array([1.0, 0.0, 3.0])
dv2 = [1.0, 0.0, 3.0]
sv1 = Vectors.sparse(3, [0, 2], [1.0, 3.0])
sv2 = sps.csc_matrix((np.array([1.0, 3.0]), np.array([0, 2]), np.array([0, 2])), shape=(3, 1))
print(sv2)

data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')
(trainingData, testData) = data.randomSplit([0.7, 0.3])
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair: tuple-unpacking lambda parameters do
# not exist in Python 3.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "model_data")
sameModel = DecisionTreeModel.load(sc, "model_data")
开发者ID:Riuchando,项目名称:Spark,代码行数:30,代码来源:mllib-test.py
示例15: sets
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
# Load and parse the data file into an RDD of LabeledPoint.
data = MLUtils.loadLibSVMFile(sc, 'file')
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='entropy', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair: tuple-unpacking lambda parameters do
# not exist in Python 3.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, "myModelPath")
sameModel = DecisionTreeModel.load(sc, "myModelPath")
开发者ID:shashankadidamu,项目名称:OttoGroupClassification,代码行数:25,代码来源:decision_tree.py
示例16: SparkContext
from pyspark.mllib.util import MLUtils
# $example off$
if __name__ == "__main__":

    sc = SparkContext(appName="PythonDecisionTreeClassificationExample")

    # $example on$
    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, 'data/mllib/sample_libsvm_data.txt')
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a DecisionTree model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index the (label, prediction) pair: tuple-unpacking lambda parameters
    # do not exist in Python 3.
    testErr = labelsAndPredictions.filter(
        lambda lp: lp[0] != lp[1]).count() / float(testData.count())
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())

    # Save and load model
    model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
    sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")
    # $example off$
开发者ID:0xqq,项目名称:spark,代码行数:30,代码来源:decision_tree_classification_example.py
示例17: LabeledPoint
#Cancelled becomes the 9th column now, and total columns in the data = 9
label = clean_line_split[8]
nonLable = clean_line_split[0:8]
return LabeledPoint (label, nonLable)
parsedData = raw_data.map(parsePoint)

# Divide training and test data by the 70-30 rule.
(training, test) = parsedData.randomSplit([0.7, 0.3])
training.cache()

# Start the timer at this point.
startTime = datetime.now()

# Build the model.
model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate the model on test instances and compute the test error.
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair: tuple-unpacking lambda parameters do
# not exist in Python 3.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test.count())

print('Time consumed = {}'.format(datetime.now() - startTime))
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load the model.
model.save(sc, "DT-Class-W-00-08")
sameModel = DecisionTreeModel.load(sc, "DT-Class-W-00-08")
sc.stop()
开发者ID:bmewing,项目名称:spark_vs_r,代码行数:30,代码来源:decision_tree_classification.py
示例18: float
# Start banner. print() with a single argument behaves the same on
# Python 2 and Python 3.
print("######################################################\n")
print("######################################################\n")
print("######### Start!!! #######\n")
print("######################################################\n")
print("######################################################\n")
print("\n\n\n")
#stop_rdd = rdd_tweets.coalesce(1)
#stop_rdd.saveAsTextFile(output_path)
print("****************************************************\n")
print("Here is the last step\n")
print("****************************************************\n")

# Load the previously saved model; no training happens in this step.
binladen_model = DecisionTreeModel.load(sc, binladen_model_path)
#
#training_data = MLUtils.loadLibSVMFile(sc, training_path)
test_data = rdd_labelFeatures

# Evaluate model on test instances and compute test error
predictions = binladen_model.predict(test_data.map(lambda x: x.features))
# test the error value
labelsAndPredictions = test_data.map(lambda lp: lp.label).zip(predictions)
# Index the (label, prediction) pair: tuple-unpacking lambda parameters do
# not exist in Python 3.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(test_data.count())
tmp_str = 'Test Error = ' + str(testErr)
print(tmp_str)
log_write(tmp_str)
print("\n\n")
#featuresAndPredictions = test_data.flatMap(lambda words: resplit_only_feature(words))\
#    .zip(predictions)
开发者ID:zhuangkechen,项目名称:midm,代码行数:31,代码来源:sub_binladen_retweets_list.py
示例19: test_classification
def test_classification(self):
    """Train each MLlib classifier on a four-point dataset and check it.

    Every model must predict <= 0 for the two points labelled 0.0 and > 0 for
    the two points labelled 1.0. The tree-based models are additionally saved
    and reloaded, and the reloaded copy must report the same debug string.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel

    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def assert_predictions(model):
        # Points 0 and 2 carry label 0.0; points 1 and 3 carry label 1.0.
        self.assertTrue(model.predict(features[0]) <= 0)
        self.assertTrue(model.predict(features[1]) > 0)
        self.assertTrue(model.predict(features[2]) <= 0)
        self.assertTrue(model.predict(features[3]) > 0)

    def assert_save_load(model, model_class, name):
        # Save under temp_dir/<name>, reload and compare debug strings.
        model_dir = os.path.join(temp_dir, name)
        model.save(self.sc, model_dir)
        same_model = model_class.load(self.sc, model_dir)
        self.assertEqual(same_model.toDebugString(), model.toDebugString())

    temp_dir = tempfile.mkdtemp()
    # try/finally removes the temporary directory even when an assertion or
    # a training call fails part-way through.
    try:
        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        assert_predictions(lr_model)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        assert_predictions(svm_model)

        nb_model = NaiveBayes.train(rdd)
        assert_predictions(nb_model)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        assert_predictions(dt_model)
        assert_save_load(dt_model, DecisionTreeModel, "dt")

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        assert_predictions(rf_model)
        assert_save_load(rf_model, RandomForestModel, "rf")

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        assert_predictions(gbt_model)
        assert_save_load(gbt_model, GradientBoostedTreesModel, "gbt")
    finally:
        # Best-effort cleanup: a failure to delete must not mask test results.
        try:
            rmtree(temp_dir)
        except OSError:
            pass
开发者ID:HodaAlemi,项目名称:spark,代码行数:75,代码来源:tests.py
示例20: SparkContext
.setMaster(master)
.setAppName(app_name))
sc = SparkContext(conf=conf)

lines = sc.textFile(input)
parsedData = lines.map(parseLine)
(trainingData, testData) = parsedData.randomSplit([0.5, 0.5])

# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))
predictions.foreach(my_print)
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
labelsAndPredictions.foreach(my_print)
# Index the (label, prediction) pair: tuple-unpacking lambda parameters do
# not exist in Python 3.
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

# Save and load model
model.save(sc, output)
sameModel = DecisionTreeModel.load(sc, output)
sc.stop()
开发者ID:wangcunxin,项目名称:spark_py,代码行数:31,代码来源:decisiontree_classify_test.py
注：本文中的pyspark.mllib.tree.DecisionTreeModel类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论