本文整理汇总了Python中pyspark.mllib.tree.DecisionTree类的典型用法代码示例。如果您正苦于以下问题:Python DecisionTree类的具体用法?Python DecisionTree怎么用?Python DecisionTree使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了DecisionTree类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: main
def main():
    """Train a decision-tree classifier on ``train.csv`` and score ``test.csv``.

    Labels and features are read with ``load_data``, paired up by
    ``reformatData``, distributed with ``sc.parallelize`` and passed to
    ``DecisionTree.trainClassifier``.  The fitted model and the test RDD are
    then handed to ``testDecisionTree``.

    The SparkContext created here is always stopped before the function
    returns, including when one of the steps raises.
    """
    sc = SparkContext(appName="MyApp")
    try:
        sc.setLogLevel('ERROR')

        # Parse data
        train_labels, train_data = load_data('train.csv')
        dummy_labels, test_data = load_data('test.csv', use_labels=False)

        # Map each data point's label to its features
        train_set = reformatData(train_data, train_labels)
        test_set = reformatData(test_data, dummy_labels)

        # Parallelize the data
        parallelized_train_set = sc.parallelize(train_set)
        parallelized_test_set = sc.parallelize(test_set)

        # Split the data.  With weights [1.0, 0.0] the second split gets no
        # share of the rows, so it is deliberately left unused.
        trainSet, _validationSet = parallelized_train_set.randomSplit([1.0, 0.0], seed=42)

        # Train the model
        decisionTreeModel = DecisionTree.trainClassifier(
            trainSet, numClasses=5, categoricalFeaturesInfo={},
            impurity='gini', maxBins=55, maxDepth=30, minInstancesPerNode=2)

        # Test the model
        testDecisionTree(decisionTreeModel, parallelized_test_set)
    finally:
        # Release the context even if loading, training or testing failed.
        sc.stop()
开发者ID:adepalatis,项目名称:379K_Final_Project,代码行数:25,代码来源:DecisionTree.py
示例2: generateDecisionTree
def generateDecisionTree():
    """Train, evaluate and persist the module-level decision-tree ``model``.

    Does nothing (apart from printing a notice) when ``DT_PATH`` already
    exists.  Otherwise the lines of ``F_PATH`` are parsed with ``parseLine``,
    split 90/10 into training and test data, a gini classifier is trained,
    its test error and debug string are printed, ``modelStatistics`` is
    called with the (label, prediction) pairs, and the model is saved to
    ``DT_PATH``.

    Side effects: rebinds the global ``model`` and writes to ``DT_PATH``.
    """
    global model

    if os.path.exists(DT_PATH):
        print("DT_PATH Already available")
        return

    data = sc.textFile(F_PATH).map(parseLine)
    trainingData, testData = data.randomSplit([0.9, 0.1], seed=1)

    model = DecisionTree.trainClassifier(
        trainingData, numClasses=len(classes), categoricalFeaturesInfo={},
        impurity='gini', maxDepth=5, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index into the pair instead of unpacking it in the lambda signature:
    # tuple parameters are a syntax error on Python 3.
    mismatches = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
    testErr = mismatches / float(testData.count())

    # Build one string so the output is a plain line on both Python 2 and 3
    # (a two-argument print(...) call prints a tuple on Python 2).
    print('Test Error = ' + str(testErr))
    print('Learned classification tree model:')
    print(model.toDebugString())
    modelStatistics(labelsAndPredictions)

    # Save model
    model.save(sc, DT_PATH)
    print("Decision Tree model saved!")
开发者ID:GuruTeja,项目名称:iHear-Server,代码行数:26,代码来源:main.py
示例3: decisionTree
def decisionTree(trainingRDD, trainingRDDHashed, testRDDHashed, testRDD):
    """Train a depth-2 gini decision tree and score it on both data sets.

    Args:
        trainingRDD: RDD whose elements expose ``.features`` and ``.label``;
            used for training and for the training-set evaluation.
        trainingRDDHashed: RDD whose ``count()`` is passed to ``getAccuracy``
            as the size of the training set.
        testRDDHashed: RDD whose ``count()`` is passed to ``getAccuracy`` as
            the size of the test set.
        testRDD: RDD whose elements expose ``.features`` and ``.label``;
            used for the test-set evaluation.

    Returns:
        ``(AccuracyV, fScoreV, AccuracyT, fScoreT)`` as returned by
        ``getAccuracy`` for the training and test sets respectively.
    """
    # Get size of each RDD once; every count() is a separate Spark job.
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()

    # Train the Decision Tree Model
    trainedModel = DecisionTree.trainClassifier(
        trainingRDD,
        numClasses=2,
        categoricalFeaturesInfo={},
        impurity='gini',
        maxDepth=2,
        maxBins=3)

    def _label_prediction_counts(rdd):
        # Map each (label, prediction) pair to its number of occurrences;
        # pairs that never occur read as 0.
        predictions = trainedModel.predict(rdd.map(lambda x: x.features))
        counts = rdd.map(lambda lp: lp.label).zip(predictions).countByValue()
        return defaultdict(int, counts)

    # Test the Model on the Training Set
    AccuracyV, fScoreV = getAccuracy(_label_prediction_counts(trainingRDD), nFilesV)

    # Test the Model on the Test Set
    AccuracyT, fScoreT = getAccuracy(_label_prediction_counts(testRDD), nFilesT)

    # Print Results.  The % formatting is applied inside the call so it also
    # works where print is a function returning None.
    print(' Results for Decision Tree')
    print(' Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print(' Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))

    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
开发者ID:AkiraKane,项目名称:CityUniversity2014,代码行数:35,代码来源:ackf415-Local-LR-Optimisation.py
示例4: trainClassifier
def trainClassifier(self):
    """Fit ``self.tree_model`` on ``self.training_labeled`` and evaluate it.

    Features 0-3 are declared categorical, with their category counts taken
    from the lengths of ``self.tags``, ``self.numeric``, ``self.x`` and
    ``self.y`` in that order.  The model and the elapsed time are printed,
    then ``self.evaluate()`` is called.
    """
    started_at = time()

    # feature index -> number of categories
    category_sources = (self.tags, self.numeric, self.x, self.y)
    arity_by_feature = {}
    for feature_index, values in enumerate(category_sources):
        arity_by_feature[feature_index] = len(values)

    self.tree_model = DecisionTree.trainClassifier(
        self.training_labeled,
        numClasses=4,
        categoricalFeaturesInfo=arity_by_feature,
        impurity="gini",
        maxDepth=5,
        maxBins=1000)
    print(self.tree_model)

    # the reported duration includes printing the model above
    elapsed = time() - started_at
    print("Classifier trained in {} seconds.".format(round(elapsed, 3)))

    self.evaluate()
开发者ID:cjzamora,项目名称:machine-learning,代码行数:31,代码来源:DecisionTree.py
示例5: evaluate_dt
def evaluate_dt(train, test, maxDepth, maxBins):
    """Train a variance-impurity regression tree and return its RMSLE on ``test``.

    Args:
        train: RDD passed to ``DecisionTree.trainRegressor``.
        test: RDD whose elements expose ``.features`` and ``.label``.
        maxDepth: forwarded to ``trainRegressor`` as ``maxDepth``.
        maxBins: forwarded to ``trainRegressor`` as ``maxBins``.

    Returns:
        The square root of the mean of ``squared_log_error(actual, predicted)``
        over the test set.
    """
    model = DecisionTree.trainRegressor(
        train, {}, impurity='variance', maxDepth=maxDepth, maxBins=maxBins)
    preds = model.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    tp = actual.zip(preds)
    # Index into the (actual, predicted) pair: a tuple parameter in the
    # lambda signature is a syntax error on Python 3.
    rmsle = np.sqrt(tp.map(lambda pair: squared_log_error(pair[0], pair[1])).mean())
    return rmsle
开发者ID:zhbzz2007,项目名称:SparkProject,代码行数:7,代码来源:regression.py
示例6: test_regression
def test_regression(self):
    """Check that each regressor predicts the sign of the training label.

    Four points are used; those labelled -1.0 must be predicted <= 0 and
    those labelled 1.0 must be predicted > 0.  Finally the SGD trainers are
    run with explicit ``initialWeights`` and must not raise ``ValueError``.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    samples = ((-1.0, [0, -1]), (1.0, [0, 1]), (-1.0, [0, -2]), (1.0, [0, 2]))
    data = [LabeledPoint(label, vector) for label, vector in samples]
    rdd = self.sc.parallelize(data)
    features = [point.features.tolist() for point in data]

    def check_signs(model):
        # rows 0 and 2 carry label -1.0, rows 1 and 3 carry label 1.0
        for row, vector in enumerate(features):
            predicted = model.predict(vector)
            if row % 2 == 0:
                self.assertTrue(predicted <= 0)
            else:
                self.assertTrue(predicted > 0)

    sgd_trainers = (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
    for trainer in sgd_trainers:
        check_signs(trainer.train(rdd, iterations=10))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories

    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4))

    try:
        for trainer in sgd_trainers:
            trainer.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
开发者ID:1ambda,项目名称:spark,代码行数:59,代码来源:tests.py
示例7: train
def train(self, num_classes=2, categorical_features=None, max_depth=5):
    """Train a decision-tree classifier and wrap it in a ``DecisionTreeModel``.

    Args:
        num_classes: forwarded as ``numClasses``.
        categorical_features: forwarded as ``categoricalFeaturesInfo``; any
            falsy value (including ``None``) is replaced by an empty dict.
        max_depth: forwarded as ``maxDepth``.

    Returns:
        ``DecisionTreeModel`` built from the trained model and
        ``self.feature_cols``.
    """
    if not categorical_features:
        categorical_features = {}

    training_rdd = self._labeled_feature_vector_rdd()
    fitted_tree = DecisionTree.trainClassifier(
        training_rdd,
        numClasses=num_classes,
        categoricalFeaturesInfo=categorical_features,
        maxDepth=max_depth)
    return DecisionTreeModel(fitted_tree, self.feature_cols)
开发者ID:Atigeo,项目名称:xpatterns-xframe,代码行数:8,代码来源:classify.py
示例8: trainModel
def trainModel(trainingData):
    """Train and return a binary gini decision tree, logging start and end.

    A banner is printed and ``Utils.logTime()`` is called both before and
    after training.
    """
    tree_settings = dict(
        numClasses=2,
        categoricalFeaturesInfo={},
        impurity='gini',
        maxDepth=5,
        maxBins=32,
    )

    print('\nTraining Decision Tree model started')
    Utils.logTime()

    fitted = DecisionTree.trainClassifier(trainingData, **tree_settings)

    print('\nTraining Decision Tree model finished')
    Utils.logTime()
    return fitted
开发者ID:yfliu87,项目名称:MachineLearningModel,代码行数:8,代码来源:DecisionTree.py
示例9: RunDecisionTree
def RunDecisionTree(tf):
    """Train a gini decision tree on 80% of ``tf`` and print its training error.

    Args:
        tf: RDD whose elements are converted with ``parseAsLabeledPoints``.

    The error is the fraction of training rows whose prediction differs from
    the label.  The 20% split is produced but not evaluated here.
    """
    rdd = tf.map(parseAsLabeledPoints)
    train, _test = rdd.randomSplit([.8, .2])
    model = DecisionTree.trainClassifier(
        train, numClasses=numCat, categoricalFeaturesInfo={},
        impurity='gini', maxDepth=5, maxBins=100)

    predictions = model.predict(train.map(lambda x: x.features))
    labelsAndPredictions = train.map(lambda lp: lp.label).zip(predictions)
    # Index into the pair: tuple parameters in a lambda signature are a
    # syntax error on Python 3.
    mismatches = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
    # The predictions were made on the training split, so normalise by its
    # size (not by the size of the held-out split).
    trainErr = mismatches / float(train.count())
    print('Training Error = ' + str(trainErr))
开发者ID:Sunhick,项目名称:music-cognita,代码行数:8,代码来源:genre_classification.py
示例10: test_regression
def test_regression(self):
    """Check that each regressor predicts the sign of the training label.

    Four points are used; those labelled -1.0 must be predicted <= 0 and
    those labelled 1.0 must be predicted > 0.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    samples = ((-1.0, [0, -1]), (1.0, [0, 1]), (-1.0, [0, -2]), (1.0, [0, 2]))
    data = [LabeledPoint(label, vector) for label, vector in samples]
    rdd = self.sc.parallelize(data)
    features = [point.features.tolist() for point in data]

    # whether the prediction for the row at the same index must be > 0
    expect_positive = (False, True, False, True)

    def verify(model):
        for vector, positive in zip(features, expect_positive):
            predicted = model.predict(vector)
            if positive:
                self.assertTrue(predicted > 0)
            else:
                self.assertTrue(predicted <= 0)

    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        verify(trainer.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories

    verify(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    verify(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    verify(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
开发者ID:greatyan,项目名称:spark,代码行数:52,代码来源:tests.py
示例11: test_classification
def test_classification(self):
    """Check that each classifier reproduces the labels of its training data.

    Four points are used; those labelled 0.0 must be predicted <= 0 and
    those labelled 1.0 must be predicted > 0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    samples = (
        (0.0, [1, 0, 0]),
        (1.0, [0, 1, 1]),
        (0.0, [2, 0, 0]),
        (1.0, [0, 2, 1]),
    )
    data = [LabeledPoint(label, vector) for label, vector in samples]
    rdd = self.sc.parallelize(data)
    features = [point.features.tolist() for point in data]

    # whether the prediction for the row at the same index must be > 0
    expect_positive = (False, True, False, True)

    def verify(model):
        for vector, positive in zip(features, expect_positive):
            predicted = model.predict(vector)
            self.assertTrue(predicted > 0 if positive else predicted <= 0)

    for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
        verify(trainer.train(rdd))

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories

    verify(DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo))
    verify(RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    verify(GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
开发者ID:greatyan,项目名称:spark,代码行数:51,代码来源:tests.py
示例12: DecisionTreeProcess
def DecisionTreeProcess(trainingSet, testSet, imp, dtMaxDepth, dtMaxBins):
    """Train a 4-class decision tree and report its error on both sets.

    Args:
        trainingSet: RDD used for training; elements expose ``.features``
            and ``.label``.
        testSet: RDD evaluated after the training set; same element shape.
        imp: forwarded as ``impurity``.
        dtMaxDepth: forwarded as ``maxDepth``.
        dtMaxBins: forwarded as ``maxBins``.

    Returns:
        The trained model.  ``eva.calculateErrorRate`` is called once per
        data set with its (label, prediction) pairs.
    """
    tree = DecisionTree.trainClassifier(
        trainingSet,
        numClasses=4,
        categoricalFeaturesInfo={},
        impurity=imp,
        maxDepth=dtMaxDepth,
        maxBins=dtMaxBins)

    reports = (
        ("\nClassification model Training set", trainingSet),
        ("\nClassification model Test set", testSet),
    )
    for title, dataset in reports:
        predicted = tree.predict(dataset.map(lambda item: item.features))
        labels_and_predictions = dataset.map(lambda item: item.label).zip(predicted)
        eva.calculateErrorRate(title, labels_and_predictions)

    return tree
开发者ID:yfliu87,项目名称:VestAccountDetection,代码行数:15,代码来源:ClassificationMain.py
示例13: regression
def regression(sc, sample):
    """Fit a regression tree on ``sample`` and print its prediction for ``[8.2]``.

    Args:
        sc: SparkContext used to parallelize ``sample``.
        sample: sequence of items where index 0 holds the features and
            index 1 the label of a training point.
    """
    def to_labeled_point(row):
        return LabeledPoint(row[1], row[0])

    labeled_rdd = sc.parallelize(sample).map(to_labeled_point)
    query = [8.2]

    tree = DecisionTree.trainRegressor(labeled_rdd, {})
    print(tree.predict(query))
开发者ID:fndjjx,项目名称:practice,代码行数:15,代码来源:regression.py
示例14: classify
def classify(sc, sample):
    """Train a binary decision tree on ``sample`` and predict its first row.

    Args:
        sc: SparkContext used to parallelize ``sample``.
        sample: sequence of items where index 0 is an iterable of category
            words and index 1 is the answer (``"yes"`` maps to label 1,
            anything else to 0).

    The first encoded training point and the model's prediction for its
    features are printed.
    """
    def ff(x):
        # Encode each known word as 0, 1 or 2 according to the group it
        # belongs to; words found in no group are skipped.
        groups = (
            ["rainy", "sad", "lack"],
            ["cloudy", "soso", "enough"],
            ["sunny", "happy", "most"],
        )
        encoded = []
        for word in x:
            for code, members in enumerate(groups):
                if word in members:
                    encoded.append(code)
                    break
        return encoded

    def f(answer):
        if answer == "yes":
            return 1
        return 0

    def to_labeled_point(row):
        feature_codes = ff(row[0])
        label = f(row[1])
        return LabeledPoint(label, feature_codes)

    traindata = sc.parallelize(sample).map(to_labeled_point)
    testdata = traindata.first()
    print(testdata)

    print("decesion tree")
    detreeModel = DecisionTree.trainClassifier(traindata, 2, {})
    print(detreeModel.predict(testdata.features))
开发者ID:fndjjx,项目名称:practice,代码行数:47,代码来源:classify.py
示例15: trainModel
def trainModel(self, vectSpace, path):
    """Train the classifier selected by ``self.type`` and save it under ``path``.

    Args:
        vectSpace: training data passed to the selected trainer.
        path: directory the model is saved to; an existing directory at that
            location is removed and recreated first.

    Returns:
        The trained model.

    Raises:
        ValueError: if ``self.type`` is neither ``'NaiveBayes'`` nor
            ``'DecisionTree'``.
        Exception: any error raised while training or saving is reported on
            stdout and re-raised unchanged.
    """
    try:
        if self.type == 'NaiveBayes':
            model = NaiveBayes.train(vectSpace)
        elif self.type == 'DecisionTree':
            model = DecisionTree.trainClassifier(
                vectSpace, numClasses=len(self.category), categoricalFeaturesInfo={},
                impurity='gini', maxDepth=5, maxBins=5)
        else:
            # Fail with a clear message instead of hitting an unbound
            # ``model`` further down.
            raise ValueError("Unsupported classifier type: {!r}".format(self.type))

        # Start from an empty directory whether or not it existed before.
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)

        model.save(self.sc, path)
    except Exception:
        print("Unexpected error: {}".format(sys.exc_info()[0]))
        raise
    return model
开发者ID:aprando,项目名称:master-thesis-social-recsys,代码行数:20,代码来源:Classifier.py
示例16: create_model
def create_model(name, training):
    """Train and return the model selected by ``name``.

    Args:
        name: ``'logistic'`` (LogisticRegressionWithLBFGS), ``'tree'``
            (DecisionTree classifier) or ``'rf'`` (RandomForest classifier).
        training: training data passed to the selected trainer.

    Returns:
        The trained model.  A boxed banner naming the model is printed
        before training.

    Raises:
        ValueError: if ``name`` is not one of the supported values.
    """
    if name == 'logistic':
        print_box()
        print("Logistic Regression Model")
        print_box()
        model = LogisticRegressionWithLBFGS.train(training)
    elif name == 'tree':
        print_box()
        print("Decision Tree Model")
        print_box()
        model = DecisionTree.trainClassifier(
            training, numClasses=2, categoricalFeaturesInfo={},
            impurity='gini', maxDepth=5, maxBins=32)
    elif name == 'rf':
        print_box()
        print("Random Forest Model")
        print_box()
        model = RandomForest.trainClassifier(
            training, numClasses=2, categoricalFeaturesInfo={},
            numTrees=15, featureSubsetStrategy="auto", impurity='gini',
            maxDepth=5, maxBins=50)
    else:
        # Without this branch an unknown name would reach ``return model``
        # with ``model`` never assigned.
        raise ValueError(
            "Unknown model name: {!r} (expected 'logistic', 'tree' or 'rf')".format(name))
    return model
开发者ID:ayushsagar,项目名称:big-data-analytics,代码行数:20,代码来源:models.py
示例17: main
def main(input_file):
    """Train a 4-class decision tree on ``input_file`` and print its test error.

    Args:
        input_file: path handed to ``MLUtils.loadLabeledPoints``.

    The data is split 70/30 into training and test sets.  The printed error
    is the fraction of test rows whose prediction differs from the label,
    rounded to four decimals.  The SparkContext created here is stopped
    before returning, including when a step raises.
    """
    sc = pyspark.SparkContext(appName="DecisionTree")
    try:
        data = MLUtils.loadLabeledPoints(sc, input_file)
        trainingData, testData = data.randomSplit([0.70, 0.3])

        # Cache in memory for faster training
        trainingData.cache()
        model = DecisionTree.trainClassifier(
            trainingData, numClasses=4, impurity='gini',
            categoricalFeaturesInfo={}, maxDepth=16, maxBins=10)

        predictions = model.predict(testData.map(lambda x: x.features))
        labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
        # Index into the pair: tuple parameters in a lambda signature are a
        # syntax error on Python 3.
        mismatches = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
        testErr = mismatches / float(testData.count())

        print("")
        print("")
        print("Test Erros: {}".format(round(testErr, 4)))
    finally:
        # Release the context even if loading, training or scoring failed.
        sc.stop()
开发者ID:garethdavidjones,项目名称:Election-Contrib,代码行数:20,代码来源:decisionTree.py
示例18: process
def process(sc, dtClusterNum, dtMaxDepth, dtMaxBins, eigenVecFile, markedClusterFile):
    """Train an entropy decision tree that maps eigenvectors to cluster ids.

    Args:
        sc: SparkContext used to read both files and rebuild the RDDs.
        dtClusterNum: forwarded as ``numClasses``.
        dtMaxDepth: forwarded as ``maxDepth``.
        dtMaxBins: forwarded as ``maxBins``.
        eigenVecFile: text file whose lines are converted with
            ``removeVirtualPart`` to give the features.
        markedClusterFile: text file whose lines are converted with
            ``extractClusterID`` to give the labels.

    Returns:
        The trained model.  The labelled points are split 70/30 and
        ``eva.calculateErrorRate`` is called for the training split and then
        for the test split.
    """
    # Both sides are collected to the driver and parallelized again before
    # being zipped together.
    eigen_vectors = sc.textFile(eigenVecFile).map(lambda item: removeVirtualPart(item)).collect()
    cluster_ids = sc.textFile(markedClusterFile).map(lambda item: extractClusterID(item)).collect()

    id_rdd = sc.parallelize(cluster_ids)
    vector_rdd = sc.parallelize(eigen_vectors)
    labeled_points = id_rdd.zip(vector_rdd).map(lambda item: LabeledPoint(item[0], item[1]))

    trainingSet, testSet = labeled_points.randomSplit([0.7, 0.3])

    tree = DecisionTree.trainClassifier(
        trainingSet,
        numClasses=dtClusterNum,
        categoricalFeaturesInfo={},
        impurity='entropy',
        maxDepth=dtMaxDepth,
        maxBins=dtMaxBins)

    reports = (
        ("\nCluster model Training set", trainingSet),
        ("\nCluster model Test set", testSet),
    )
    for title, dataset in reports:
        predicted = tree.predict(dataset.map(lambda item: item.features))
        labels_and_predictions = dataset.map(lambda item: item.label).zip(predicted)
        eva.calculateErrorRate(title, labels_and_predictions)

    return tree
开发者ID:yfliu87,项目名称:VestAccountDetection,代码行数:20,代码来源:ClassificationModule.py
示例19: trainOptimalModel
def trainOptimalModel(trainingData, testData):
    """Grid-search decision-tree settings and return the lowest-error model.

    Every combination of impurity ('gini', 'entropy'), maxDepth (3-7) and
    maxBins (8, 16, 32) is trained on ``trainingData`` and scored with
    ``Evaluation.evaluate(model, testData)``.  The first element of that
    result is the error being minimised.

    Args:
        trainingData: data passed to ``DecisionTree.trainClassifier``.
        testData: data passed to ``Evaluation.evaluate``.

    Returns:
        The model with the smallest test error seen, or ``None`` if no
        combination completed.  If a combination raises, the failing
        parameters are logged through ``Utils.logMessage`` and the search
        stops with the best model found so far.
    """
    print("\nTraining optimal Decision Tree model started!")
    Utils.logTime()

    impurityVals = ['gini', 'entropy']
    maxDepthVals = [3, 4, 5, 6, 7]
    maxBinsVals = [8, 16, 32]

    optimalModel = None
    optimalMaxDepth = None
    optimalImpurity = None
    optimalBinsVal = None
    minError = None

    # Pre-bind the loop variables so the error message below can always be
    # built.
    curImpurity = curMaxDepth = curMaxBins = None

    try:
        for curImpurity in impurityVals:
            for curMaxDepth in maxDepthVals:
                for curMaxBins in maxBinsVals:
                    model = DecisionTree.trainClassifier(
                        trainingData,
                        numClasses=2,
                        categoricalFeaturesInfo={},
                        impurity=curImpurity,
                        maxDepth=curMaxDepth,
                        maxBins=curMaxBins)
                    testErr, PR, ROC = Evaluation.evaluate(model, testData)

                    # Test for None explicitly: a truthiness test would treat
                    # a perfect error of 0.0 as "no result yet", and
                    # comparing a number with None fails on Python 3.
                    if minError is None or testErr < minError:
                        minError = testErr
                        optimalImpurity = curImpurity
                        optimalMaxDepth = curMaxDepth
                        optimalBinsVal = curMaxBins
                        optimalModel = model
    except Exception:
        msg = "\nException during model training with below parameters:"
        msg += "\timpurity: " + str(curImpurity)
        msg += "\tmaxDepth: " + str(curMaxDepth)
        msg += "\tmaxBins: " + str(curMaxBins)
        Utils.logMessage(msg)

    logMessage(optimalModel, optimalMaxDepth, optimalImpurity, optimalBinsVal, minError)
    return optimalModel
开发者ID:yfliu87,项目名称:MachineLearningModel,代码行数:40,代码来源:DecisionTree.py
示例20: test_regression
def test_regression(self):
    """Check that regressors handle feature vectors built by ``self.scipy_matrix``.

    Four points are used; those labelled -1.0 must be predicted <= 0 and
    those labelled 1.0 must be predicted > 0.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree

    # (label, value stored at index 1 of the feature vector)
    samples = ((-1.0, -1.0), (1.0, 1.0), (-1.0, -2.0), (1.0, 2.0))
    data = [LabeledPoint(label, self.scipy_matrix(2, {1: value}))
            for label, value in samples]
    rdd = self.sc.parallelize(data)
    features = [point.features for point in data]

    def verify(model):
        # rows 0 and 2 carry label -1.0, rows 1 and 3 carry label 1.0
        for row, vector in enumerate(features):
            predicted = model.predict(vector)
            if row in (1, 3):
                self.assertTrue(predicted > 0)
            else:
                self.assertTrue(predicted <= 0)

    for trainer in (LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD):
        verify(trainer.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    verify(DecisionTree.trainRegressor(rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
开发者ID:drewrobb,项目名称:spark,代码行数:37,代码来源:test_linalg.py
注:本文中的pyspark.mllib.tree.DecisionTree类示例由纯净天空整理自GitHub/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论