This article collects typical usage examples of the Python class pyspark.ml.evaluation.RegressionEvaluator. If you have been wondering what RegressionEvaluator is for, or how to use it, the curated class examples below should help.
The following sections present 17 code examples of the RegressionEvaluator class, ordered by popularity by default.
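Before the collected examples, here is a minimal, self-contained sketch of the basic RegressionEvaluator workflow. The toy DataFrame and its values are illustrative assumptions, not taken from any of the examples below:

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.getOrCreate()
# A toy DataFrame holding the true label and a model's prediction for each row.
scored = spark.createDataFrame(
    [(3.0, 2.8), (1.0, 1.3), (4.0, 4.4)], ["label", "prediction"])
# metricName accepts "rmse" (the default), "mse", "r2" and "mae".
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
print(evaluator.evaluate(scored))
# The same evaluator instance can be reused with a different metric:
print(evaluator.setMetricName("mae").evaluate(scored))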
Example 1: main
def main(input_file):
    # Load and parse the data file, converting it to a DataFrame.
    data = MLUtils.loadLabeledPoints(sc, input_file)
    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 10 distinct values are treated as continuous.
    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=10).fit(data)
    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])
    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")
    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])
    # Train model. This also runs the indexer.
    model = pipeline.fit(trainingData)
    # Make predictions.
    predictions = model.transform(testData)
    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)
    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))
    rfModel = model.stages[1]
    print(rfModel)  # summary only
Author: garethdavidjones | Project: Election-Contrib | Lines: 35 | Source: random_forest.py
Example 2: test_fit_maximize_metric
def test_fit_maximize_metric(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])
    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")
    grid = ParamGridBuilder() \
        .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
        .build()
    tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    bestModel = tvsModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
    validationMetrics = tvsModel.validationMetrics
    self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                     "Best model should have zero induced error")
    self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
    self.assertEqual(len(grid), len(validationMetrics),
                     "validationMetrics has the same size of grid parameter")
    self.assertEqual(1.0, max(validationMetrics))
Author: Brett-A | Project: spark | Lines: 26 | Source: test_tuning.py
Example 3: test_fit_maximize_metric
def test_fit_maximize_metric(self):
    sqlContext = SQLContext(self.sc)
    dataset = sqlContext.createDataFrame(
        [(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10, ["feature", "label"]
    )
    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")
    grid = ParamGridBuilder().addGrid(iee.inducedError, [100.0, 0.0, 10000.0]).build()
    tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    bestModel = tvsModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
    self.assertEqual(0.0, bestModel.getOrDefault("inducedError"), "Best model should have zero induced error")
    self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
Author: JeremyNixon | Project: spark | Lines: 17 | Source: tests.py
Example 4: test_fit_minimize_metric
def test_fit_minimize_metric(self):
    dataset = self.spark.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])
    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="rmse")
    grid = (ParamGridBuilder()
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
            .build())
    cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    bestModel = cvModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
    self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                     "Best model should have zero induced error")
    self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")
Author: Brett-A | Project: spark | Lines: 22 | Source: test_tuning.py
Example 5: test_java_params
def test_java_params(self):
    """
    This tests a bug fixed by SPARK-18274 which causes multiple copies
    of a Params instance in Python to be linked to the same Java instance.
    """
    evaluator = RegressionEvaluator(metricName="r2")
    df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)])
    evaluator.evaluate(df)
    self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
    evaluatorCopy = evaluator.copy({evaluator.metricName: "mae"})
    evaluator.evaluate(df)
    evaluatorCopy.evaluate(df)
    self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
    self.assertEqual(evaluatorCopy._java_obj.getMetricName(), "mae")
Author: Brett-A | Project: spark | Lines: 14 | Source: test_evaluation.py
Example 6: VectorAssembler
import os
df = sqlContext.read.json(os.environ['WORKDIR'] + "user_features.json")
df_restaurants = df.filter("category = \"Restaurants\"")
assembler = VectorAssembler(
    inputCols=["average_stars", "cat_avg_review_len", "cat_avg_stars", "cat_business_count", "cat_review_count", "months_yelping", "review_count", "votes_cool", "votes_funny", "votes_useful"],
    outputCol="features")
output = assembler.transform(df_restaurants)
(trainingData, testData) = output.randomSplit([0.7, 0.3])
dt = DecisionTreeRegressor(labelCol="elite", featuresCol="features")
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "elite", "features").show(5)
evaluator = RegressionEvaluator(
    labelCol="elite", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print "Root Mean Squared Error (RMSE) on test data = %g" % rmse
Author: abhinavrungta | Project: Yelp-Challenge | Lines: 26 | Source: decisiontree_regression.py
Example 7: print
#VECTORIZE TRAIN DATA
energi_habis_train = ssc.textFileStream("train_habis.txt")
energi_habis_train_labeled = energi_habis_train.map(parse_train)
energi_habis_train_labeled_DF = SQLContext.createDataFrame(energi_habis_train_labeled["label", "features"])
print(energi_habis_train_labeled_DF)
#VECTORIZE TEST DATA
energi_habis_test = ssc.textFileStream("test_habis.txt")
energi_habis_test_labeled = energi_habis_test.map(parse_test)
energi_habis_test_labeled_DF = SQLContext.createDataFrame(energi_habis_test_labeled["label", "features"])
print(energi_habis_test_labeled_DF)
#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_habis_train_labeled_DF)
#see what the model do
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))
#Predict On the tested data
predictions = lrModel.transform(energi_habis_test_labeled_DF)
predictions.select("prediction","label", "features").show()
#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
Author: rzkhqq | Project: BigData4 | Lines: 30 | Source: A_3_b_tren_energi_habis.py
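Example 7 above passes DStreams from textFileStream directly into createDataFrame, which will not run as written; the streaming setup and the parse_train/parse_test helpers live in the original project and are not shown here. Purely as an illustration, a batch version of the same train/predict/evaluate flow might look like the sketch below. The CSV layout, column names and file paths are assumptions, not taken from the original repository:

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.getOrCreate()
# Assumed file layout: label followed by three numeric feature columns.
train_df = spark.read.csv("train_habis.txt", inferSchema=True).toDF("label", "f1", "f2", "f3")
test_df = spark.read.csv("test_habis.txt", inferSchema=True).toDF("label", "f1", "f2", "f3")
# Assemble the raw columns into the "features" vector LinearRegression expects.
assembler = VectorAssembler(inputCols=["f1", "f2", "f3"], outputCol="features")
lr = LinearRegression(maxIter=50)
model = lr.fit(assembler.transform(train_df))
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
# Predict on the held-out data and score the predictions with R2.
predictions = model.transform(assembler.transform(test_df))
evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
print(evaluator.evaluate(predictions))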
Example 8: print
# Make predictions
predictionsA = modelA.transform(trainingData)
print ('-'*70)
print ('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print ('-'*70)
predictionsB = modelB.transform(trainingData)
print ('-'*70)
print ('MODEL B : ')
predictionsB.select("prediction", "label", "features").show(30)
print ('-'*70)
# Evaluate the model
evaluator = RegressionEvaluator(metricName="rmse")
RMSE = evaluator.evaluate(predictionsA)
print ('-'*70)
print("ModelA: Root Mean Squared Error = " + str(RMSE))
print ('-'*70)
# ModelA: Root Mean Squared Error = 128.602026843
RMSE = evaluator.evaluate(predictionsB)
print ('-'*70)
print("ModelB: Root Mean Squared Error = " + str(RMSE))
print ('-'*70)
# ModelB: Root Mean Squared Error = 129.496300193
Author: yennanliu | Project: analysis | Lines: 26 | Source: Spark_ML_LinearRegression_demo.py
Example 9: spark_process
#......... part of the code is omitted here .........
######################
#
# features engineering
#
######################
# create new column based on time-delta (minutes)
# convert pickup-datetime column to hour
time_delta_udf = udf(time_delta_minutes, FloatType())
dataframe = dataframe.withColumn('time_delta', time_delta_udf(dataframe.pickup_datetime, dataframe.dropoff_datetime)) \
    .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))
dataframe = dataframe.select(dataframe.pick_up_hour, \
    dataframe.passenger_count.cast("integer"), \
    dataframe.pickup_longitude.cast("double"), \
    dataframe.pickup_latitude.cast("double"), \
    dataframe.dropoff_longitude.cast("double"), \
    dataframe.dropoff_latitude.cast("double"), \
    dataframe.time_delta.cast("double"))
dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()
# split dataframe into feature and label vector
# create feature vectors and labels for model training
feature_assembler = VectorAssembler(inputCols=['pick_up_hour', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'], outputCol='features')
transformed = feature_assembler.transform(dataframe)
vector_dataframe = transformed.select(col("time_delta").alias("label"), col("features")).cache()
######################
#
# train model
#
######################
if validate:
    ################################
    #
    # validate model on 60/40 split
    #
    ################################
    # split
    training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)
    decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
    model = decision_tree_reg.fit(training)
    train_pred = model.transform(training)
    test_pred = model.transform(test)
    evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
    r2_train = evaluator.evaluate(train_pred)
    evaluator_test = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
    r2_test = evaluator_test.evaluate(test_pred)
    output = test_pred.select("prediction", "label", "features")
    return output, r2_test, r2_train
else:
    ###################
    #
    # train on all data
    #
    ###################
    decision_tree_reg = DecisionTreeRegressor(maxDepth=12, maxBins=25)
    model = decision_tree_reg.fit(vector_dataframe)
    predictions = model.transform(vector_dataframe)
    output = predictions.select("prediction", "label", "features")
    ###########################
    #
    # process to send to Kafka
    #
    ###########################
    schema = StructType([StructField("prediction_mins", FloatType(), True),
                         StructField("pick_up_hour", IntegerType(), True),
                         StructField("pickup_longitude", DoubleType(), True),
                         StructField("pickup_latitude", DoubleType(), True),
                         StructField("dropoff_longitude", DoubleType(), True),
                         StructField("dropoff_latitude", DoubleType(), True)])
    features_from_predictions = output.map(lambda row: (float(row.prediction), int(row.features[0]), float(row.features[1]), float(row.features[2]), float(row.features[3]), float(row.features[4]))).collect()
    sqlContext.clearCache()
    dataframe_from_prediction_vector = sqlContext.createDataFrame(features_from_predictions, schema).cache()
    return dataframe_from_prediction_vector
Author: alcedok | Project: capataz | Lines: 101 | Source: spark_ML_batch.py
Example 10: LinearRegression
samples = df12.randomSplit([0.7, 0.3])
training = samples[0]
test = samples[1]
lr = LinearRegression(maxIter=5, regParam=0.3, labelCol="weight", featuresCol="features", predictionCol="predic_weight")
model = lr.fit(training)
print("결정계수(R2):%d" % model.summary.r2)
d13 = model.transform(test)
d13.cache()
d13.select("weight", "predic_weight").show(5, False)
evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight")
# root mean squared error
rmse = evaluator.evaluate(d13)
# mean squared error
mse = evaluator.setMetricName("mse").evaluate(d13)
# R2 metric
r2 = evaluator.setMetricName("r2").evaluate(d13)
# mean absolute error
mae = evaluator.setMetricName("mae").evaluate(d13)
print("rmse:%d, mse:%d, r2:%d, mae:%d" % (rmse, mse, r2, mae))
开发者ID:oopchoi,项目名称:spark,代码行数:30,代码来源:regression_sample.py
Example 11: display
# MAGIC To start, we'll generate the predictions by using the first model in `petalModels`.
# COMMAND ----------
petalPredictions = petalModels[0].transform(irisPetal)
display(petalPredictions)
# COMMAND ----------
# MAGIC %md
# MAGIC Next, we'll evaluate the model using the `RegressionEvaluator`.
# COMMAND ----------
from pyspark.ml.evaluation import RegressionEvaluator
regEval = RegressionEvaluator().setLabelCol('petalWidth')
print regEval.explainParams()
# COMMAND ----------
# MAGIC %md
# MAGIC The default value for `RegressionEvaluator` is root mean square error (RMSE). Let's view that first.
# COMMAND ----------
print regEval.evaluate(petalPredictions)
# COMMAND ----------
# MAGIC %md
Author: smoltis | Project: spark | Lines: 31 | Source: 5-regression_student.py
Example 12: str
modelprep1 = va.transform(enriched1).select('userId','movieId','rating','features')
training, testing, other = modelprep1.randomSplit([0.07, 0.03, 0.90])
print '[ INFO ] Training: ' + str(training.count()) + ' records'
print '[ INFO ] Testing: ' + str(testing.count()) + ' records'
gb = GBTRegressor(featuresCol="features", labelCol=var_target, predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)
gbmodel = gb.fit(training)
#gbmodel.save('/tmp/spark_models/kaggle_bike_sharing_gb_model')
predictions = gbmodel.transform(testing)
print '[ INFO ] Printing predictions vs label...'
predictions.select('prediction', var_target).show(10, False)
evaluator = RegressionEvaluator(labelCol=var_target, predictionCol="prediction")
print '[ INFO ] Model Fit (RMSE): ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "rmse"}))
#print '[ INFO ] Model Fit (MSE): ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "mse"}))
#print '[ INFO ] Model Fit (R2): ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "r2"}))
total_runtime_seconds = (datetime.datetime.now() - start_time).seconds
print '#'*100
print '[ INFO ] Total Runtime: ' + str(total_runtime_seconds) + ' seconds'
print '#'*100
#ZEND
Author: zaratsian | Project: pyspark | Lines: 30 | Source: kaggle_movielens.py
Example 13: test_fit_maximize_metric
    self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                     "Best model should have zero induced error")
    self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")

def test_fit_maximize_metric(self):
    sqlContext = SQLContext(self.sc)
    dataset = sqlContext.createDataFrame([
        (10, 10.0),
        (50, 50.0),
        (100, 100.0),
        (500, 500.0)] * 10,
        ["feature", "label"])
    iee = InducedErrorEstimator()
    evaluator = RegressionEvaluator(metricName="r2")
    grid = (ParamGridBuilder()
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
            .build())
    cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    bestModel = cvModel.bestModel
    bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
    self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                     "Best model should have zero induced error")
    self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")

if __name__ == "__main__":
Author: dalonso82 | Project: spark | Lines: 30 | Source: tests.py
Example 14: str
#print("Dispersion: " + str(summary.dispersion))
#print("Null Deviance: " + str(summary.nullDeviance))
#print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
#print("Deviance: " + str(summary.deviance))
#print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
#print("AIC: " + str(summary.aic))
#print("Deviance Residuals: ")
#summary.residuals().show()
# Make predictions.
predictions = glmmodel.transform(testing)
# Select example rows to display.
predictions.select("prediction", "label").show(30,False)
evaluator = RegressionEvaluator(metricName="rmse") # rmse (default)|mse|r2|mae
RMSE = evaluator.evaluate(predictions)
print 'RMSE: ' + str(RMSE)
#######################################################################################
#
# Modeling - Gradient Boosting (Regression)
#
#######################################################################################
gbt = GBTRegressor(featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)
#gbt = GBTClassifier(featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)
gbtmodel = gbt.fit(training)
Author: zaratsian | Project: pyspark | Lines: 31 | Source: NYCTaxi_PySpark.py
Example 15: found
# TRAIN WITH CROSS-VALIDATION
#cv_model = cv.fit(trainDataFrame)
cv_model = cv.fit(trainReg.toDF(['label','features']))
# EVALUATE MODEL ON TEST SET
#testDataFrame = sqlContext.createDataFrame(oneHotTESTreg, ["features", "label"])
testDataFrame = testReg.toDF(['label','features'])
# MAKE PREDICTIONS ON TEST DOCUMENTS
# cvModel uses the best model found (lrModel).
predictionAndLabels = cv_model.transform(testDataFrame)
predictionAndLabels.select("features", "label", "prediction").show()
# validate the results
# metric to measure how well a fitted Model does on held-out test data
evaluator = RegressionEvaluator(metricName="rmse")
rmse = evaluator.evaluate(predictionAndLabels)
print("Root-mean-square error = %s" % rmse)
#### LOGISTIC REGRESSION
Author: dataminelab | Project: framework-training | Lines: 22 | Source: linear_regression.py
Example 16: ALS
from pyspark.ml.recommendation import ALS
# Let's initialize our ALS learner
als = ALS()
# Now we set the parameters for the method
als.setMaxIter(5)\
   .setSeed(seed)\
   .setRegParam(0.1)\
   .setUserCol("userId").setItemCol("movieId").setRatingCol("rating")
# Now let's compute an evaluation metric for our test dataset
from pyspark.ml.evaluation import RegressionEvaluator
# Create an RMSE evaluator using the label and predicted columns
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")
tolerance = 0.03
ranks = [4, 8, 12]
errors = [0, 0, 0]
models = [0, 0, 0]
err = 0
min_error = float('inf')
best_rank = -1
for rank in ranks:
    # Set the rank here:
    als.setRank(rank)
    # Create the model with these parameters.
    model = als.fit(training_df)
    # Run the model to create a prediction. Predict against the validation_df.
    predict_df = model.transform(validation_df)
Author: 23423423424 | Project: edx | Lines: 31 | Source: cs110_lab2_als_prediction.py
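Example 16 above is cut off before the loop actually scores each rank. In this kind of rank search the held-out predictions are typically scored with the reg_eval RMSE evaluator defined earlier, and the lowest-error rank is kept. The continuation below is a sketch under that assumption: it reuses the errors, models, err, min_error and best_rank variables from the snippet, the NaN filter is an assumed way of handling users or movies unseen during training, and my_model is just an illustrative name.

    # Still inside the "for rank in ranks:" loop shown above.
    # Drop rows ALS could not score (unseen users/movies yield NaN predictions).
    predicted_ratings_df = predict_df.filter(predict_df.prediction != float('nan'))
    # Score this rank's model with the RMSE evaluator defined earlier.
    error = reg_eval.evaluate(predicted_ratings_df)
    errors[err] = error
    models[err] = model
    print('For rank %s the RMSE is %s' % (rank, error))
    # Remember the best (lowest-RMSE) rank seen so far.
    if error < min_error:
        min_error = error
        best_rank = err
    err += 1

# After the loop, pick the winning model.
my_model = models[best_rank]
print('The best model was trained with rank %s' % ranks[best_rank])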
Example 17: HashingTF
posTrain, posTest = pos.randomSplit([0.8, 0.2], seed=17)
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrPipeline = Pipeline(stages=[hashingTF, lr])
dt = DecisionTreeRegressor(maxDepth=10, maxBins=50)
dtPipeline = Pipeline(stages=[hashingTF, dt])
rf = RandomForestRegressor(maxDepth=10, maxBins=50, numTrees=50)
rfPipeline = Pipeline(stages=[hashingTF, rf])
posLR = lrPipeline.fit(posTrain)
lrPred = posLR.transform(posTest)
posDT = dtPipeline.fit(posTrain)
dtPred = posDT.transform(posTest)
posRF = rfPipeline.fit(posTrain)
rfPred = posRF.transform(posTest)
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
lr_rmse = evaluator.evaluate(lrPred)
dt_rmse = evaluator.evaluate(dtPred)
rf_rmse = evaluator.evaluate(rfPred)
print("LR RMSE %g, DT RMSE %g, RF RMSE %g" % (lr_rmse, dt_rmse, rf_rmse))
# LR RMSE 0.44829, DT RMSE 0.312846, RF RMSE 0.300322
Author: noodlefrenzy | Project: DecodedSparkML | Lines: 30 | Source: sentiment_tagging.py
Note: The pyspark.ml.evaluation.RegressionEvaluator class examples in this article were compiled by 纯净天空 (VimSky) from GitHub, MSDocs and other source-code and documentation platforms. The snippets are selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. For redistribution and use, refer to the corresponding project's license; do not reproduce without permission.