本文整理汇总了Python中pyspark.mllib.tree.RandomForest类的典型用法代码示例。如果您正苦于以下问题：Python RandomForest类的具体用法？Python RandomForest怎么用？Python RandomForest使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了RandomForest类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: testOnce
def testOnce():
    """Train a random forest on one random split of ``data`` and print its metrics.

    Relies on names defined outside this function: ``data``, ``test_size``,
    ``num_trees``, ``strat``, ``max_depth``, ``probTest`` and ``save``.
    Label 0 is reported as "Galaxy" and label 1 as "Star".  The
    (label, probability) pairs are handed to ``save`` under the name
    ``'answers'``.
    """
    # split the data into training and testing sets
    (trainingData, testData) = data.randomSplit([1 - test_size, test_size])

    # train the random forest
    model = RandomForest.trainClassifier(
        trainingData, numClasses=2, categoricalFeaturesInfo={},
        numTrees=num_trees, featureSubsetStrategy=strat,
        impurity='gini', maxDepth=max_depth, maxBins=32)

    # test the random forest
    predictions = model.predict(testData.map(lambda x: x.features))
    labels = testData.map(lambda lp: lp.label)
    labelsAndPredictions = labels.zip(predictions)

    def count_pairs(label, prediction):
        # Number of test points with this (true label, predicted label) pair.
        # Indexing the pair replaces the Python-2-only ``lambda (v, p):`` form.
        return float(labelsAndPredictions.filter(
            lambda vp: vp[0] == label and vp[1] == prediction).count())

    testErr = (labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
               / float(testData.count()))
    Mg = count_pairs(0, 1)  # galaxies predicted as stars
    Ng = count_pairs(0, 0)  # galaxies predicted as galaxies
    Ms = count_pairs(1, 0)  # stars predicted as galaxies
    Ns = count_pairs(1, 1)  # stars predicted as stars

    probsAndScores = probTest(testData, model)
    threshold_accuracy = probsAndScores[0]
    # NOTE(review): if probTest yields integer vote counts this is integer
    # division under Python 2 -- confirm the element type.
    probs = probsAndScores[1].map(lambda x: x / num_trees)
    # The original also zipped labelsAndPredictions with probs here, but the
    # result was never read, so that dead assignment is dropped.
    labelsAndProbs = labels.zip(probs)
    save(labelsAndProbs, 'answers')

    print('Galaxy Purity = ' + str(Ng / (Ng + Ms)))
    print('Galaxy Completeness = ' + str(Ng / (Ng + Mg)))
    print('Star Purity = ' + str(Ns / (Ns + Mg)))
    print('Star Completeness = ' + str(Ns / (Ns + Ms)))
    print('Accuracy = ' + str(1 - testErr))
    print('Threshold method accuracy = ' + str(threshold_accuracy))
开发者ID:beatriceliang,项目名称:POPREU,代码行数:29,代码来源:stargalaxy.py
示例2: rfTest
def rfTest(sqlContext, dataset_rdd):
    """Train and evaluate a 3-tree binary random forest on ``dataset_rdd``.

    Each record is indexed as ``e[1]`` (label) and ``e[2:]`` (features).
    Records whose label is above 0.5 are treated as positive, below 0.5 as
    negative; a label of exactly 0.5 falls into neither group.  Each group
    is sampled at a 0.8 fraction for training and the remainder is used for
    testing.

    :param sqlContext: unused by this function; kept for interface compatibility
    :param dataset_rdd: RDD of indexable records as described above
    :return: tuple ``(trainset_nums, testset_nums, trainset_positive,
        testset_positive, positive, hitPositive, precision, recallPositive,
        accuracy, F_Value, model)``
    """
    dataset_positive = dataset_rdd.filter(lambda e: e[1] > 0.5)
    dataset_negotive = dataset_rdd.filter(lambda e: e[1] < 0.5)

    train_positive = dataset_positive.sample(False, 0.8)
    test_positive = dataset_positive.subtract(train_positive)
    train_negotive = dataset_negotive.sample(False, 0.8)
    test_negotive = dataset_negotive.subtract(train_negotive)

    trainset_rdd = train_positive.union(train_negotive)
    testset_rdd = test_positive.union(test_negotive)

    trainset = trainset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    trainset_nums = trainset.count()
    testset = testset_rdd.map(lambda e: LabeledPoint(e[1], e[2:]))
    testset_nums = testset.count()
    trainset_positive = train_positive.count()
    testset_positive = test_positive.count()

    model = RandomForest.trainClassifier(trainset, 2, {}, 3)
    predictions = model.predict(testset.map(lambda x: x.features))
    predict = testset.map(lambda lp: lp.label).zip(predictions)

    hitALL = predict.filter(lambda e: e[0] == e[1]).count()
    hitPositive = predict.filter(lambda e: e[0] == e[1] and (e[0] > 0.5)).count()
    positive = predict.filter(lambda e: e[1] > 0.5).count()

    # Guard every ratio: a model that predicts no positives (or an empty test
    # split) used to abort the whole evaluation with ZeroDivisionError.
    recallPositive = hitPositive / float(testset_positive) if testset_positive else 0.0
    precision = hitPositive / float(positive) if positive else 0.0
    # Reuse the count taken above instead of launching another count job.
    accuracy = hitALL / float(testset_nums) if testset_nums else 0.0
    if precision and recallPositive:
        F_Value = 2 / (1 / precision + 1 / recallPositive)
    else:
        # The harmonic mean is taken as 0 when either component is 0.
        F_Value = 0.0

    return (trainset_nums, testset_nums, trainset_positive, testset_positive,
            positive, hitPositive, precision, recallPositive, accuracy,
            F_Value, model)
开发者ID:fighting410381,项目名称:youmi,代码行数:26,代码来源:spark_script.py
示例3: main
def _drop_last_two_features(points):
    """Return ``points`` with the last two entries removed from every data point."""
    truncated = []
    for point in points:
        size = np.size(point)
        truncated.append(np.delete(point, [size - 2, size - 1]))
    # Hand back the same kind of container that was passed in.
    if isinstance(points, np.ndarray):
        return np.array(truncated)
    return truncated


def main():
    """Load the CSV data, train a random forest classifier and test it.

    Reads ``train.csv`` and ``test.csv`` through ``load_data``, removes the
    last two features of every point, trains a 4-class forest on the first
    part of a ``[0.01, 0.99]`` random split and passes the model plus the
    parallelized test set to ``testRandomForest``.
    """
    sc = SparkContext(appName="MyApp")
    sc.setLogLevel('ERROR')

    # Parse data
    train_labels, train_data = load_data('train.csv')
    dummy_labels, test_data = load_data('test.csv', use_labels=False)

    # Truncate the last 2 features of the data.
    # The original loops only rebound the loop variable, so nothing was ever
    # removed; the truncated points are now actually kept.
    # NOTE(review): the model is therefore trained on two fewer features than
    # before -- confirm reformatData accepts the truncated points.
    train_data = _drop_last_two_features(train_data)
    test_data = _drop_last_two_features(test_data)

    # Map each data point's label to its features
    train_set = reformatData(train_data, train_labels)
    test_set = reformatData(test_data, dummy_labels)

    # Parallelize the data
    parallelized_train_set = sc.parallelize(train_set)
    parallelized_test_set = sc.parallelize(test_set)

    # Split the data
    # NOTE(review): only the 0.01 share is used for training and
    # validationSet is never read -- confirm the weights are intended.
    trainSet, validationSet = parallelized_train_set.randomSplit([0.01, 0.99], seed=42)

    # Train the models
    randomForestModel = RandomForest.trainClassifier(
        trainSet, numClasses=4, impurity='gini', categoricalFeaturesInfo={},
        numTrees=750, seed=42, maxDepth=30, maxBins=32)

    # Test the model
    testRandomForest(randomForestModel, parallelized_test_set)
开发者ID:adepalatis,项目名称:379K_Final_Project,代码行数:34,代码来源:RandomForest.py
示例4: generateRandomForest
def generateRandomForest():
    """Train, evaluate and save a random forest classifier unless one is already saved.

    Does nothing except print a notice when ``RF_PATH`` already exists.
    Otherwise the lines of ``F_PATH`` are parsed with ``parseLine``, split
    90/10 with a fixed seed, a 4-tree forest is trained, its test error and
    debug string are printed, ``modelStatistics`` is called on the
    (label, prediction) pairs and the model is saved to ``RF_PATH``.
    """
    if os.path.exists(RF_PATH):
        print("RF_PATH Already available")
        return

    data = sc.textFile(F_PATH).map(parseLine)
    # ``1L`` was a Python 2 long literal; the plain int is the same seed.
    (trainingData, testData) = data.randomSplit([0.9, 0.1], seed=1)

    # Train a RandomForest model.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(
        trainingData, numClasses=len(classes), categoricalFeaturesInfo={},
        numTrees=4, featureSubsetStrategy="auto",
        impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = (labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
               / float(testData.count()))
    # One concatenated string prints the same text under Python 2 and 3; the
    # two-argument form printed a tuple repr under Python 2.
    print('Test Error ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
    modelStatistics(labelsAndPredictions)

    # Save the model
    model.save(sc, RF_PATH)
    print("Saved RF Model.")
开发者ID:GuruTeja,项目名称:iHear-Server,代码行数:29,代码来源:main.py
示例5: main
def main():
    """Train a random forest regressor and print its RMSE on the test file.

    ``sys.argv[1]`` is the training input path and ``sys.argv[2]`` the test
    input path.  Every line of both inputs is converted with
    ``to_labeledpoint``.

    Raises AssertionError when the running Spark version is older than 1.5.1.
    """
    import re

    input_train = sys.argv[1]
    input_test = sys.argv[2]

    conf = SparkConf().setAppName('Sentiment Analysis with Random Forest')
    sc = SparkContext(conf=conf)
    # Compare numeric components: plain string comparison orders versions
    # lexicographically (e.g. '1.10.0' < '1.5.1').
    version = tuple(int(part) for part in re.findall(r'\d+', sc.version)[:3])
    assert version >= (1, 5, 1)

    train = sc.textFile(input_train).cache()
    test = sc.textFile(input_test).cache()

    # sbaronia - get training and testing labeled points
    train_lp = train.map(to_labeledpoint).cache()
    test_lp = test.map(to_labeledpoint).cache()

    # sbaronia - run RandomForest regression on our training data with
    # numTrees = 5
    rf_model = RandomForest.trainRegressor(
        train_lp, categoricalFeaturesInfo={}, numTrees=5,
        featureSubsetStrategy="auto", impurity='variance',
        maxDepth=4, maxBins=32)

    # sbaronia - run predictions on testing data and calculate RMSE value
    predictions = rf_model.predict(test_lp.map(lambda x: x.features))
    labelsAndPredictions = test_lp.map(lambda lp: lp.label).zip(predictions)
    squared_error_sum = labelsAndPredictions.map(
        lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y)
    rmse = math.sqrt(squared_error_sum / float(test_lp.count()))
    print("RMSE = " + str(rmse))
开发者ID:gitofsid,项目名称:MyBigDataCode,代码行数:25,代码来源:randomforest.py
示例6: Random_Forest
def Random_Forest(filename, sc):
    """Train a 3-tree random forest classifier on a LibSVM file and print its test error.

    :param filename: path of the LibSVM data file; when falsy, the sample
        file path hard-coded below is used instead
    :param sc: SparkContext used to load the file
    """
    # The original unconditionally overwrote ``filename`` with this path, so
    # the argument was ignored; it is now only a fallback.
    if not filename:
        filename = "/Users/Jacob/SparkService/data/sample_libsvm_data.txt"

    # Load and parse the data file into an RDD of LabeledPoint.
    data = MLUtils.loadLibSVMFile(sc, filename)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    # Empty categoricalFeaturesInfo indicates all features are continuous.
    # Note: Use larger numTrees in practice.
    # Setting featureSubsetStrategy="auto" lets the algorithm choose.
    model = RandomForest.trainClassifier(
        trainingData, numClasses=2, categoricalFeaturesInfo={},
        numTrees=3, featureSubsetStrategy="auto",
        impurity='gini', maxDepth=4, maxBins=32)

    # Evaluate model on test instances and compute test error
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    testErr = (labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
               / float(testData.count()))
    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())

    # Save and load model
    #model.save(sc, "target/tmp/myRandomForestClassificationModel")
    #sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")
开发者ID:bangjieliu,项目名称:SparkService,代码行数:27,代码来源:random_forest.py
示例7: test_regression
def test_regression(self):
    """Check the prediction signs of several regressors on a four-point data set.

    Every model must predict a value <= 0 for the points labelled -1.0 and a
    value > 0 for the points labelled 1.0.  The SGD trainers are additionally
    required to accept explicit ``initialWeights`` without raising ValueError.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    samples = ((-1.0, -1), (1.0, 1), (-1.0, -2), (1.0, 2))
    data = [LabeledPoint(label, [0, second]) for label, second in samples]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def assert_signs(model):
        # Even positions hold the -1.0 points, odd positions the 1.0 points.
        for position, point in enumerate(features):
            prediction = model.predict(point)
            if position % 2:
                self.assertTrue(prediction > 0)
            else:
                self.assertTrue(prediction <= 0)

    assert_signs(LinearRegressionWithSGD.train(rdd, iterations=10))
    assert_signs(LassoWithSGD.train(rdd, iterations=10))
    assert_signs(RidgeRegressionWithSGD.train(rdd, iterations=10))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    assert_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4))
    assert_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=10, maxBins=4, seed=1))
    assert_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4))

    # Explicit initial weights must be accepted by all three SGD trainers.
    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
开发者ID:1ambda,项目名称:spark,代码行数:59,代码来源:tests.py
示例8: trainRandomForestModel
def trainRandomForestModel(data):
    """
    Fit a 2000-tree random forest regressor on ``data`` and hand it back.

    :param data: RDD[LabeledPoint]
    :return: random forest regression model
    """
    from pyspark.mllib.tree import RandomForest

    # All features are treated as continuous (empty categorical map).
    settings = dict(
        categoricalFeaturesInfo={},
        numTrees=2000,
        featureSubsetStrategy="auto",
        impurity="variance",
        maxDepth=4,
        maxBins=32,
    )
    return RandomForest.trainRegressor(data, **settings)
开发者ID:theseusyang,项目名称:GEQE,代码行数:9,代码来源:createROC.py
示例9: train_model
def train_model(cls, trianData, cateFeaInfo=None, trees=3, impurity="gini",
                depth=4):
    """
    Train a binary random forest classifier.

    :param trianData: training data passed to RandomForest.trainClassifier
        (the misspelled name is kept so existing keyword callers still work)
    :param cateFeaInfo: categorical feature map; an empty dict is used when
        omitted
    :param trees: number of trees
    :param impurity: impurity measure name
    :param depth: maximum tree depth
    :return: the trained model
    """
    # Avoid sharing one mutable default dict across calls.
    if cateFeaInfo is None:
        cateFeaInfo = {}
    # The original body read ``trainData``, which is not the parameter name
    # and raised NameError; use the actual parameter.
    model = RandomForest.trainClassifier(
        trianData, numClasses=2,
        categoricalFeaturesInfo=cateFeaInfo, numTrees=trees,
        featureSubsetStrategy="auto", impurity=impurity, maxDepth=depth,
        maxBins=32)
    return model
开发者ID:yidun55,项目名称:mllib,代码行数:10,代码来源:randomForest_classification_spark_xiaodai.py
示例10: evaluate
def evaluate(self, trainingData, testData=None, metric=None):
    """Train a 10-tree binary random forest and print its error on ``testData``.

    :param trainingData: data used to train the classifier
    :param testData: data to evaluate on; when None nothing is done
        (the cross-validation branch is not implemented)
    :param metric: unused by this method
    :return: None
    """
    if testData is None:
        # cross validation -- not implemented
        return

    model = RandomForest.trainClassifier(
        trainingData, numClasses=2, categoricalFeaturesInfo={},
        numTrees=10, featureSubsetStrategy="auto",
        impurity='gini', maxDepth=4, maxBins=32)
    predictions = model.predict(testData.map(lambda x: x.features))
    labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
    # Index the pair instead of the Python-2-only ``lambda (v, p):`` form.
    testErr = (labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
               / float(testData.count()))
    print('Test Error = ' + str(testErr))
开发者ID:bngonmang,项目名称:FIND,代码行数:11,代码来源:RF.py
示例11: trainModel
def trainModel(trainingData):
    """Train a 3-tree binary random forest classifier and return it.

    A start and a finish message are printed, each followed by a call to
    ``Utils.logTime()``.

    :param trainingData: data passed to RandomForest.trainClassifier
    :return: the trained model
    """
    # print() with a single string behaves the same under Python 2 and 3.
    print("\nTrainning Random Forest model started!")
    Utils.logTime()

    model = RandomForest.trainClassifier(
        trainingData, numClasses=2, categoricalFeaturesInfo={},
        numTrees=3, featureSubsetStrategy="auto", impurity='gini',
        maxDepth=5, maxBins=32)

    print('\nTraining Random Forest model finished')
    Utils.logTime()
    return model
开发者ID:yfliu87,项目名称:MachineLearningModel,代码行数:11,代码来源:RandomForest.py
示例12: getRandomForestRMSE
def getRandomForestRMSE(trees_array):
    """Return the validation RMSE of a random forest regressor for each tree count.

    Trains on ``train_featureScoreTimeRDD`` and validates on
    ``val_featureScoreTimeRDD`` (both defined outside this function).

    :param trees_array: iterable of ``numTrees`` values to try
    :return: list of ``(trees, rmse)`` tuples in the order of ``trees_array``
    """
    # These do not depend on the tree count, so build them once.
    val_features = val_featureScoreTimeRDD.map(lambda lp: lp.features)
    val_labels = val_featureScoreTimeRDD.map(lambda lp: lp.label)
    val_count = float(val_featureScoreTimeRDD.count())

    valRMSE_list = []
    for trees in trees_array:
        model = RandomForest.trainRegressor(
            train_featureScoreTimeRDD, categoricalFeaturesInfo={},
            numTrees=trees, featureSubsetStrategy="auto",
            impurity='variance', maxDepth=4, maxBins=32)
        predictions = model.predict(val_features)
        labelsAndPreds = val_labels.zip(predictions)
        # Index the pair instead of the Python-2-only ``lambda (v, p):`` form.
        valMSE = labelsAndPreds.map(
            lambda vp: (vp[0] - vp[1]) * (vp[0] - vp[1])).sum() / val_count
        valRMSE = valMSE ** 0.5
        valRMSE_list.append((trees, valRMSE))
    return valRMSE_list
开发者ID:shaileshr,项目名称:SentimentAnalysis,代码行数:12,代码来源:Qn8.py
示例13: test_regression
def test_regression(self):
    """Check the prediction signs of several regressors on a four-point data set.

    Every model must predict a value <= 0 for the points labelled -1.0 and a
    value > 0 for the points labelled 1.0.
    """
    from pyspark.mllib.regression import (
        LinearRegressionWithSGD, LassoWithSGD, RidgeRegressionWithSGD)
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    samples = ((-1.0, -1), (1.0, 1), (-1.0, -2), (1.0, 2))
    data = [LabeledPoint(label, [0, second]) for label, second in samples]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def assert_signs(model):
        # Even positions hold the -1.0 points, odd positions the 1.0 points.
        for position, point in enumerate(features):
            prediction = model.predict(point)
            if position % 2:
                self.assertTrue(prediction > 0)
            else:
                self.assertTrue(prediction <= 0)

    assert_signs(LinearRegressionWithSGD.train(rdd))
    assert_signs(LassoWithSGD.train(rdd))
    assert_signs(RidgeRegressionWithSGD.train(rdd))

    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    assert_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    assert_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    assert_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
开发者ID:greatyan,项目名称:spark,代码行数:52,代码来源:tests.py
示例14: trainOptimalModel
def trainOptimalModel(trainingData, testData):
    """Grid-search random forest hyper-parameters and return the best model.

    Every combination of the candidate values below is trained on
    ``trainingData`` and scored with ``Evaluation.evaluate(model, testData)``;
    the model with the lowest score is kept.  If training or evaluation
    raises, the failing combination is logged and the best model found so far
    (possibly None) is still reported and returned.

    :param trainingData: data passed to RandomForest.trainClassifier
    :param testData: data passed to Evaluation.evaluate
    :return: the model with the lowest test error, or None
    """
    import itertools

    print("\nTraining optimal Random Forest model started!")
    Utils.logTime()

    numTreesVals = [3, 5, 8]
    featureSubsetStrategyVals = ['auto', 'all', 'sqrt', 'log2', 'onethird']
    impurityVals = ['gini', 'entropy']
    maxDepthVals = [3, 4, 5, 6, 7]
    maxBinsVals = [8, 16, 32]

    optimalModel = None
    optimalNumTrees = None
    optimalFeatureSubsetStrategy = None
    optimalMaxDepth = None
    optimalImpurity = None
    optimalBinsVal = None
    minError = None

    # Pre-bind the loop variables so the except handler can always read them.
    curNumTree = curFeatureSubsetStrategy = curImpurity = None
    curMaxDepth = curMaxBins = None

    grid = itertools.product(numTreesVals, featureSubsetStrategyVals,
                             impurityVals, maxDepthVals, maxBinsVals)
    try:
        # product() visits combinations in the same order as the original
        # five nested loops (rightmost list varies fastest).
        for (curNumTree, curFeatureSubsetStrategy, curImpurity,
             curMaxDepth, curMaxBins) in grid:
            model = RandomForest.trainClassifier(
                trainingData,
                numClasses=2,
                categoricalFeaturesInfo={},
                numTrees=curNumTree,
                featureSubsetStrategy=curFeatureSubsetStrategy,
                impurity=curImpurity,
                maxDepth=curMaxDepth,
                maxBins=curMaxBins)
            testErr = Evaluation.evaluate(model, testData)
            # ``or not minError`` treated a best error of 0.0 as "unset" and
            # let every later model replace a perfect one; test for None.
            if minError is None or testErr < minError:
                minError = testErr
                optimalNumTrees = curNumTree
                optimalFeatureSubsetStrategy = curFeatureSubsetStrategy
                optimalImpurity = curImpurity
                optimalMaxDepth = curMaxDepth
                optimalBinsVal = curMaxBins
                optimalModel = model
    except Exception:
        # Report the combination that failed, not the best one so far, and
        # str() every value so a None cannot break the concatenation.
        msg = "\nException during model training with below parameters:"
        msg += "\tnum trees: " + str(curNumTree)
        msg += "\tfeature subset strategy: " + str(curFeatureSubsetStrategy)
        msg += "\timpurity: " + str(curImpurity)
        msg += "\tmaxDepth: " + str(curMaxDepth)
        msg += "\tmaxBins: " + str(curMaxBins)
        # NOTE(review): the original called ``Utls.logMessage``; ``Utls`` looks
        # like a typo for the ``Utils`` used above -- confirm that
        # Utils.logMessage exists.
        Utils.logMessage(msg)

    logMessage(optimalModel, optimalNumTrees, optimalFeatureSubsetStrategy,
               optimalMaxDepth, optimalImpurity, optimalBinsVal, minError)
    return optimalModel
开发者ID:yfliu87,项目名称:MachineLearningModel,代码行数:52,代码来源:RandomForest.py
示例15: test_classification
def test_classification(self):
    """Check the predictions of several classifiers on a four-point data set.

    Every model must predict a value <= 0 for the points labelled 0.0 and a
    value > 0 for the points labelled 1.0.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    samples = (
        (0.0, [1, 0, 0]),
        (1.0, [0, 1, 1]),
        (0.0, [2, 0, 0]),
        (1.0, [0, 2, 1]),
    )
    data = [LabeledPoint(label, values) for label, values in samples]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    def assert_classes(model):
        # Even positions hold the 0.0 points, odd positions the 1.0 points.
        for position, point in enumerate(features):
            prediction = model.predict(point)
            if position % 2:
                self.assertTrue(prediction > 0)
            else:
                self.assertTrue(prediction <= 0)

    assert_classes(LogisticRegressionWithSGD.train(rdd))
    assert_classes(SVMWithSGD.train(rdd))
    assert_classes(NaiveBayes.train(rdd))

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    assert_classes(DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo))
    assert_classes(RandomForest.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
        numTrees=100))
    assert_classes(GradientBoostedTrees.trainClassifier(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
开发者ID:greatyan,项目名称:spark,代码行数:51,代码来源:tests.py
示例16: testRegression
def testRegression(trainingData, testData):
    """Fit a 3-tree random forest regressor and print its test MSE and structure.

    :param trainingData: data passed to RandomForest.trainRegressor
    :param testData: data whose ``features``/``label`` attributes are used
        for evaluation
    """
    # An empty categoricalFeaturesInfo means every feature is continuous.
    # Use a larger numTrees in practice; featureSubsetStrategy="auto" lets
    # the algorithm choose.
    forest_options = dict(
        categoricalFeaturesInfo={},
        numTrees=3,
        featureSubsetStrategy="auto",
        impurity='variance',
        maxDepth=4,
        maxBins=32,
    )
    model = RandomForest.trainRegressor(trainingData, **forest_options)

    def squared_error(label_and_prediction):
        gap = label_and_prediction[0] - label_and_prediction[1]
        return gap * gap

    # Score the held-out points: mean of the squared label/prediction gaps.
    feature_rdd = testData.map(lambda x: x.features)
    predictions = model.predict(feature_rdd)
    label_rdd = testData.map(lambda lp: lp.label)
    labelsAndPredictions = label_rdd.zip(predictions)
    error_total = labelsAndPredictions.map(squared_error).sum()
    testMSE = error_total / float(testData.count())

    print('Test Mean Squared Error = ' + str(testMSE))
    print('Learned regression forest model:')
    print(model.toDebugString())
开发者ID:1ambda,项目名称:spark,代码行数:17,代码来源:random_forest_example.py
示例17: testClassification
def testClassification(trainingData, testData):
    """Fit a 3-tree binary random forest classifier and print its test error and structure.

    :param trainingData: data passed to RandomForest.trainClassifier
    :param testData: data whose ``features``/``label`` attributes are used
        for evaluation
    """
    # An empty categoricalFeaturesInfo means every feature is continuous.
    # Use a larger numTrees in practice; featureSubsetStrategy="auto" lets
    # the algorithm choose.
    forest_options = dict(
        numClasses=2,
        categoricalFeaturesInfo={},
        numTrees=3,
        featureSubsetStrategy="auto",
        impurity='gini',
        maxDepth=4,
        maxBins=32,
    )
    model = RandomForest.trainClassifier(trainingData, **forest_options)

    def is_misclassified(label_and_prediction):
        return label_and_prediction[0] != label_and_prediction[1]

    # Score the held-out points: share of label/prediction mismatches.
    feature_rdd = testData.map(lambda x: x.features)
    predictions = model.predict(feature_rdd)
    label_rdd = testData.map(lambda lp: lp.label)
    labelsAndPredictions = label_rdd.zip(predictions)
    wrong = labelsAndPredictions.filter(is_misclassified).count()
    testErr = wrong / float(testData.count())

    print('Test Error = ' + str(testErr))
    print('Learned classification forest model:')
    print(model.toDebugString())
开发者ID:1ambda,项目名称:spark,代码行数:18,代码来源:random_forest_example.py
示例18: train_trend_model
def train_trend_model(self, model, data, i):
    """Train the direction model selected by ``self.trend_prediction_method``.

    :param model: previous model; its ``weights`` seed the logistic-regression
        and SVM trainers when it is not None, and it is returned unchanged
        when the configured method matches none of the known constants
    :param data: local collection that is parallelized into an RDD
    :param i: unused by this method
    :return: the newly trained model, or ``model`` itself
    """
    self.logger.info('Start to train the direction model')
    rdd_data = self.sc.parallelize(data)

    if self.trend_prediction_method == self.RANDOM_FOREST:
        return RandomForest.trainClassifier(
            rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
            featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
            maxBins=32)

    if self.trend_prediction_method == self.NAIVE_BAYES:
        return NaiveBayes.train(rdd_data)

    if self.trend_prediction_method == self.LOGISTIC_REGRESSION:
        # Warm-start from the previous model's weights when there is one.
        if model is None:
            start_weights = None
        else:
            start_weights = model.weights
        return LogisticRegressionWithSGD.train(
            rdd_data, iterations=10000, step=0.001,
            initialWeights=start_weights)

    if self.trend_prediction_method == self.SVM:
        # Warm-start from the previous model's weights when there is one.
        if model is None:
            start_weights = None
        else:
            start_weights = model.weights
        return SVMWithSGD.train(
            rdd_data, iterations=10000, step=0.001,
            initialWeights=start_weights)

    return model
开发者ID:WarnWang,项目名称:Dissertation,代码行数:19,代码来源:composition_prediction_system.py
示例19: create_model
def create_model(name, training):
    """Train and return the classifier selected by ``name``.

    :param name: ``'logistic'``, ``'tree'`` or ``'rf'``
    :param training: training data passed to the selected trainer
    :return: the trained model
    :raises ValueError: if ``name`` is not one of the supported values
    """
    def announce(title):
        # Title framed by two print_box() calls, as in the original output.
        print_box()
        print(title)
        print_box()

    if name == 'logistic':
        announce("Logistic Regression Model")
        model = LogisticRegressionWithLBFGS.train(training)
    elif name == 'tree':
        announce("Decision Tree Model")
        model = DecisionTree.trainClassifier(
            training, numClasses=2, categoricalFeaturesInfo={},
            impurity='gini', maxDepth=5, maxBins=32)
    elif name == 'rf':
        announce("Random Forest Model")
        model = RandomForest.trainClassifier(
            training, numClasses=2, categoricalFeaturesInfo={},
            numTrees=15, featureSubsetStrategy="auto", impurity='gini',
            maxDepth=5, maxBins=50)
    else:
        # The original fell through to ``return model`` with ``model``
        # unbound, failing with an opaque UnboundLocalError.
        raise ValueError("Unknown model name: %r" % (name,))
    return model
开发者ID:ayushsagar,项目名称:big-data-analytics,代码行数:20,代码来源:models.py
示例20: kfolds
def kfolds():
    """Run a k-fold style evaluation of the random forest and print averaged metrics.

    Relies on names defined outside this function: ``data``, ``k``, ``sc``,
    ``num_trees`` and ``max_depth``.  In each of the ``k`` rounds a test fold
    is sampled from the points not tested yet, a forest is trained on the
    rest of ``data``, and purity/completeness/accuracy are accumulated.
    Label 0 is reported as "Galaxy" and label 1 as "Star".
    """
    #folds = kFold(data, k) this would work in java
    acc = 0
    spurity = 0
    scomp = 0
    gpurity = 0
    gcomp = 0
    # Floor division keeps the fold size an integer under Python 3 as well;
    # it is passed to takeSample as the sample size.
    foldsize = data.count() // k
    tested = sc.parallelize([])

    for i in range(k):
        test = sc.parallelize(data.subtract(tested).takeSample(False, foldsize))
        tested = tested.union(test)
        train = data.subtract(test)

        # train the random forest
        model = RandomForest.trainClassifier(
            train, numClasses=2, categoricalFeaturesInfo={},
            numTrees=num_trees, featureSubsetStrategy="auto",
            impurity='gini', maxDepth=max_depth, maxBins=32)

        predictions = model.predict(test.map(lambda x: x.features))
        labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)

        def count_pairs(label, prediction, pairs=labelsAndPredictions):
            # ``pairs`` is bound as a default so each round counts its own RDD.
            # Indexing replaces the Python-2-only ``lambda (v, p):`` form.
            return float(pairs.filter(
                lambda vp: vp[0] == label and vp[1] == prediction).count())

        testErr = (labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count()
                   / float(test.count()))
        Mg = count_pairs(0, 1)  # galaxies predicted as stars
        Ng = count_pairs(0, 0)  # galaxies predicted as galaxies
        Ms = count_pairs(1, 0)  # stars predicted as galaxies
        Ns = count_pairs(1, 1)  # stars predicted as stars

        gpurity += (Ng / (Ng + Ms))
        gcomp += (Ng / (Ng + Mg))
        spurity += (Ns / (Ns + Mg))
        scomp += (Ns / (Ns + Ms))
        acc += (1 - testErr)

    print('with ' + str(k) + ' folds:')
    print('Average Galaxy Purity = ' + str(gpurity / k))
    print('Average Galaxy Completeness = ' + str(gcomp / k))
    print('Average Star Purity = ' + str(spurity / k))
    print('Average Star Completeness = ' + str(scomp / k))
    print('Average Accuracy = ' + str(acc / k))
开发者ID:beatriceliang,项目名称:POPREU,代码行数:38,代码来源:stargalaxy.py
注：本文中的pyspark.mllib.tree.RandomForest类示例由纯净天空整理自GitHub/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论