
Python evaluation.RegressionEvaluator Class Code Examples


This article collects and summarizes typical usage examples of the Python class pyspark.ml.evaluation.RegressionEvaluator. If you are wondering what RegressionEvaluator is for, how to use it, or what it looks like in practice, the curated class examples below may help.



The following shows 17 code examples of the RegressionEvaluator class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python code examples.
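Before the collected examples, here is a minimal, self-contained sketch of the pattern they all share: build a DataFrame of (label, prediction) pairs, construct a RegressionEvaluator, and call evaluate. The toy data and app name are illustrative only, not taken from any of the projects below.

from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator

spark = SparkSession.builder.appName("regression-evaluator-demo").getOrCreate()

# Toy (label, prediction) pairs; in practice these come from model.transform(testData).
df = spark.createDataFrame([(1.0, 1.1), (2.0, 1.9), (3.0, 3.2)],
                           ["label", "prediction"])

# metricName defaults to "rmse"; "mse", "r2", and "mae" are also supported.
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")
print("RMSE: %g" % evaluator.evaluate(df))

# A metric can also be overridden per call through the evaluator's param map.
print("R2:   %g" % evaluator.evaluate(df, {evaluator.metricName: "r2"}))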

Example 1: main

def main(input_file):
    # Load and parse the data file, converting it to a DataFrame
    # (loadLabeledPoints returns an RDD, so toDF() is needed for the ML pipeline).
    data = MLUtils.loadLabeledPoints(sc, input_file).toDF()

    # Automatically identify categorical features, and index them.
    # Set maxCategories so features with > 10 distinct values are treated as continuous.
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=10).fit(data)

    # Split the data into training and test sets (30% held out for testing)
    (trainingData, testData) = data.randomSplit([0.7, 0.3])

    # Train a RandomForest model.
    rf = RandomForestRegressor(featuresCol="indexedFeatures")

    # Chain indexer and forest in a Pipeline
    pipeline = Pipeline(stages=[featureIndexer, rf])

    # Train model.  This also runs the indexer.
    model = pipeline.fit(trainingData)

    # Make predictions.
    predictions = model.transform(testData)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(
        labelCol="label", predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = {}".format(rmse))

    rfModel = model.stages[1]
    print(rfModel)  # summary only
Author: garethdavidjones, Project: Election-Contrib, Lines: 35, Source: random_forest.py


Example 2: test_fit_maximize_metric

    def test_fit_maximize_metric(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder() \
            .addGrid(iee.inducedError, [100.0, 0.0, 10000.0]) \
            .build()
        tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        bestModel = tvsModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))
        validationMetrics = tvsModel.validationMetrics

        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
        self.assertEqual(len(grid), len(validationMetrics),
                         "validationMetrics should have the same size as the parameter grid")
        self.assertEqual(1.0, max(validationMetrics))
Author: Brett-A, Project: spark, Lines: 26, Source: test_tuning.py


Example 3: test_fit_maximize_metric

    def test_fit_maximize_metric(self):
        sqlContext = SQLContext(self.sc)
        dataset = sqlContext.createDataFrame(
            [(10, 10.0), (50, 50.0), (100, 100.0), (500, 500.0)] * 10, ["feature", "label"]
        )

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = ParamGridBuilder().addGrid(iee.inducedError, [100.0, 0.0, 10000.0]).build()
        tvs = TrainValidationSplit(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)
        bestModel = tvsModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

        self.assertEqual(0.0, bestModel.getOrDefault("inducedError"), "Best model should have zero induced error")
        self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")
Author: JeremyNixon, Project: spark, Lines: 17, Source: tests.py


Example 4: test_fit_minimize_metric

    def test_fit_minimize_metric(self):
        dataset = self.spark.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="rmse")

        grid = (ParamGridBuilder()
                .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
                .build())
        cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        bestModel = cvModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")
Author: Brett-A, Project: spark, Lines: 22, Source: test_tuning.py


Example 5: test_java_params

    def test_java_params(self):
        """
        This tests a bug fixed by SPARK-18274 which causes multiple copies
        of a Params instance in Python to be linked to the same Java instance.
        """
        evaluator = RegressionEvaluator(metricName="r2")
        df = self.spark.createDataFrame([Row(label=1.0, prediction=1.1)])
        evaluator.evaluate(df)
        self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
        evaluatorCopy = evaluator.copy({evaluator.metricName: "mae"})
        evaluator.evaluate(df)
        evaluatorCopy.evaluate(df)
        self.assertEqual(evaluator._java_obj.getMetricName(), "r2")
        self.assertEqual(evaluatorCopy._java_obj.getMetricName(), "mae")
Author: Brett-A, Project: spark, Lines: 14, Source: test_evaluation.py


Example 6: VectorAssembler

import os

# Imports assumed by the snippet below; sqlContext is the ambient
# SQLContext (e.g. from the PySpark shell).
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

df = sqlContext.read.json(os.environ['WORKDIR'] + "user_features.json")

df_restaurants = df.filter("category = \"Restaurants\"")


assembler = VectorAssembler(
    inputCols=["average_stars", "cat_avg_review_len", "cat_avg_stars", "cat_business_count", "cat_review_count", "months_yelping", "review_count", "votes_cool", "votes_funny", "votes_useful" ],
    outputCol="features")
output = assembler.transform(df_restaurants)

(trainingData, testData) = output.randomSplit([0.7, 0.3])

dt = DecisionTreeRegressor(labelCol = "elite", featuresCol="features")
pipeline = Pipeline(stages=[dt])
model = pipeline.fit(trainingData)
predictions = model.transform(testData)

predictions.select("prediction", "elite", "features").show(5)


evaluator = RegressionEvaluator(
    labelCol="elite", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)




Author: abhinavrungta, Project: Yelp-Challenge, Lines: 26, Source: decisiontree_regression.py


Example 7: print

# VECTORIZE TRAIN DATA
# (the original called ssc.textFileStream, but the rest of the script treats
# the data as a batch DataFrame, so a plain textFile read is used here;
# sqlContext is an ambient SQLContext instance)
energi_habis_train = sc.textFile("train_habis.txt")
energi_habis_train_labeled = energi_habis_train.map(parse_train)
energi_habis_train_labeled_DF = sqlContext.createDataFrame(energi_habis_train_labeled, ["label", "features"])
print(energi_habis_train_labeled_DF)

# VECTORIZE TEST DATA
energi_habis_test = sc.textFile("test_habis.txt")
energi_habis_test_labeled = energi_habis_test.map(parse_test)
energi_habis_test_labeled_DF = sqlContext.createDataFrame(energi_habis_test_labeled, ["label", "features"])
print(energi_habis_test_labeled_DF)

#Create Model
numFeatures = 3
lr = LinearRegression(maxIter=50)
lrModel = lr.fit(energi_habis_train_labeled_DF)

#see what the model do
print("Coefficients: "+str(lrModel.coefficients))
print("Intercept: "+str(lrModel.intercept))

#Predict On the tested data
predictions = lrModel.transform(energi_habis_test_labeled_DF)
predictions.select("prediction","label", "features").show()

#Evaluate the predictions
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(predictionCol="prediction", labelCol="label", metricName="r2")
evaluator.evaluate(predictions)
Author: rzkhqq, Project: BigData4, Lines: 30, Source: A_3_b_tren_energi_habis.py


Example 8: print

# Make predictions
predictionsA = modelA.transform(trainingData)
print ('-'*70)
print ('MODEL A : ')
predictionsA.select("prediction", "label", "features").show(30)
print ('-'*70)

predictionsB = modelB.transform(trainingData)
print ('-'*70)
print ('MODEL B : ')
predictionsB.select("prediction", "label", "features").show(30)
print ('-'*70)

# Evaluate the model
evaluator = RegressionEvaluator(metricName="rmse")
RMSE = evaluator.evaluate(predictionsA)
print ('-'*70)
print("ModelA: Root Mean Squared Error = " + str(RMSE))
print ('-'*70)
# ModelA: Root Mean Squared Error = 128.602026843

RMSE = evaluator.evaluate(predictionsB)
print ('-'*70)
print("ModelB: Root Mean Squared Error = " + str(RMSE))
print ('-'*70)
# ModelB: Root Mean Squared Error = 129.496300193



Author: yennanliu, Project: analysis, Lines: 26, Source: Spark_ML_LinearRegression_demo.py


Example 9: spark_process


#......... part of the code omitted .........


	######################
	#
	# features engineering
	#
	######################

	# create new column based on time-delta (minutes)
	# convert pickup-datetime column to hour
		
	time_delta_udf = udf(time_delta_minutes,FloatType())

	dataframe = dataframe.withColumn('time_delta', time_delta_udf(dataframe.pickup_datetime,dataframe.dropoff_datetime)) \
						 .withColumn('pick_up_hour', hour(dataframe.pickup_datetime))

	dataframe = dataframe.select(dataframe.pick_up_hour,
	                             dataframe.passenger_count.cast("integer"),
	                             dataframe.pickup_longitude.cast("double"),
	                             dataframe.pickup_latitude.cast("double"),
	                             dataframe.dropoff_longitude.cast("double"),
	                             dataframe.dropoff_latitude.cast("double"),
	                             dataframe.time_delta.cast("double"))

	dataframe = dataframe.filter(dataframe.time_delta > 1.0).cache()


 	# split dataframe into feature and label vector
	# create feature vectors and labels for model training
	feature_assembler = VectorAssembler(inputCols = ['pick_up_hour','pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'],outputCol = 'features')

	transformed = feature_assembler.transform(dataframe)
	vector_dataframe = transformed.select(col("time_delta").alias("label"),col("features")).cache()

	######################
	#
	# train model
	#
	######################

	if validate:

		################################
		#
		# validate model on 60/40 split
		#
		################################

		# split 
		training, test = vector_dataframe.randomSplit([0.6, 0.4], seed=0)

		decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25)
		model = decision_tree_reg.fit(training)

		train_pred = model.transform(training)
		test_pred = model.transform(test)

		evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
		r2_train = evaluator.evaluate(train_pred)

		evaluator_test = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="r2")
		r2_test = evaluator_test.evaluate(test_pred)

		output = test_pred.select("prediction", "label", "features")

		return output, r2_test, r2_train
	
	else:

		###################
		#
		# train on all data
		#
		###################

		decision_tree_reg = DecisionTreeRegressor(maxDepth=12,maxBins=25)
		model = decision_tree_reg.fit(vector_dataframe)

		predictions = model.transform(vector_dataframe)

		output = predictions.select("prediction", "label", "features")

		###########################
		#
		# process to send to Kafka
		#
		###########################

		schema = StructType([StructField("prediction_mins", FloatType(), True),
							StructField("pick_up_hour", IntegerType(), True),
							StructField("pickup_longitude", DoubleType(), True),
							StructField("pickup_latitude", DoubleType(), True),
							StructField("dropoff_longitude", DoubleType(), True),
							StructField("dropoff_latitude", DoubleType(), True)])

		features_from_predictions = output.map(lambda row: (float(row.prediction),int(row.features[0]),float(row.features[1]),float(row.features[2]),float(row.features[3]),float(row.features[4]) ) ).collect()
		sqlContext.clearCache()
		dataframe_from_prediction_vector = sqlContext.createDataFrame(features_from_predictions,schema).cache()

		return dataframe_from_prediction_vector
Author: alcedok, Project: capataz, Lines: 101, Source: spark_ML_batch.py


Example 10: LinearRegression

samples = df12.randomSplit([0.7, 0.3])
training = samples[0]
test = samples[1]

lr = LinearRegression(maxIter=5, regParam=0.3, labelCol="weight", featuresCol="features", predictionCol="predic_weight")

model = lr.fit(training)

print("결정계수(R2):%d" % model.summary.r2)

d13 = model.transform(test)
d13.cache()

d13.select("weight", "predic_weight").show(5, False)

evaluator = RegressionEvaluator(labelCol="weight", predictionCol="predic_weight")

# root mean squared error
rmse = evaluator.evaluate(d13)

# mean squared error
mse = evaluator.setMetricName("mse").evaluate(d13)

# R2 metric
r2 = evaluator.setMetricName("r2").evaluate(d13)

# mean absolute error
mae = evaluator.setMetricName("mae").evaluate(d13)

print("rmse:%d, mse:%d, r2:%d, mae:%d" % (rmse, mse, r2, mae))
Author: oopchoi, Project: spark, Lines: 30, Source: regression_sample.py


Example 11: display

# MAGIC To start, we'll generate the predictions by using the first model in `petalModels`.

# COMMAND ----------

petalPredictions = petalModels[0].transform(irisPetal)
display(petalPredictions)

# COMMAND ----------

# MAGIC %md
# MAGIC Next, we'll evaluate the model using the `RegressionEvaluator`.

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
regEval = RegressionEvaluator().setLabelCol('petalWidth')

print(regEval.explainParams())

# COMMAND ----------

# MAGIC %md
# MAGIC The default metric for `RegressionEvaluator` is root mean squared error (RMSE).  Let's view that first.

# COMMAND ----------

print(regEval.evaluate(petalPredictions))

# COMMAND ----------

# MAGIC %md
Author: smoltis, Project: spark, Lines: 31, Source: 5-regression_student.py


Example 12: str

modelprep1 = va.transform(enriched1).select('userId','movieId','rating','features')

training, testing, other = modelprep1.randomSplit([0.07, 0.03, 0.90])

print('[ INFO ] Training:          ' + str(training.count()) + ' records')
print('[ INFO ] Testing:           ' + str(testing.count()) + ' records')

gb = GBTRegressor(featuresCol="features", labelCol=var_target, predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)

gbmodel = gb.fit(training)
#gbmodel.save('/tmp/spark_models/kaggle_bike_sharing_gb_model')

predictions = gbmodel.transform(testing)

print('[ INFO ] Printing predictions vs label...')
predictions.select('prediction', var_target).show(10, False)

evaluator = RegressionEvaluator(labelCol=var_target, predictionCol="prediction")
print('[ INFO ] Model Fit (RMSE):  ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})))
#print('[ INFO ] Model Fit (MSE):   ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "mse"})))
#print('[ INFO ] Model Fit (R2):    ' + str(evaluator.evaluate(predictions, {evaluator.metricName: "r2"})))

total_runtime_seconds = (datetime.datetime.now() - start_time).seconds

print('#'*100)
print('[ INFO ] Total Runtime:     ' + str(total_runtime_seconds) + ' seconds')
print('#'*100)


#ZEND
Author: zaratsian, Project: pyspark, Lines: 30, Source: kaggle_movielens.py


Example 13: test_fit_maximize_metric

        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(0.0, bestModelMetric, "Best model has RMSE of 0")

    def test_fit_maximize_metric(self):
        sqlContext = SQLContext(self.sc)
        dataset = sqlContext.createDataFrame([
            (10, 10.0),
            (50, 50.0),
            (100, 100.0),
            (500, 500.0)] * 10,
            ["feature", "label"])

        iee = InducedErrorEstimator()
        evaluator = RegressionEvaluator(metricName="r2")

        grid = (ParamGridBuilder()
                .addGrid(iee.inducedError, [100.0, 0.0, 10000.0])
                .build())
        cv = CrossValidator(estimator=iee, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        bestModel = cvModel.bestModel
        bestModelMetric = evaluator.evaluate(bestModel.transform(dataset))

        self.assertEqual(0.0, bestModel.getOrDefault('inducedError'),
                         "Best model should have zero induced error")
        self.assertEqual(1.0, bestModelMetric, "Best model has R-squared of 1")


if __name__ == "__main__":
Author: dalonso82, Project: spark, Lines: 30, Source: tests.py


Example 14: str

#print("Dispersion: " + str(summary.dispersion))
#print("Null Deviance: " + str(summary.nullDeviance))
#print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
#print("Deviance: " + str(summary.deviance))
#print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
#print("AIC: " + str(summary.aic))
#print("Deviance Residuals: ")
#summary.residuals().show()

# Make predictions.
predictions = glmmodel.transform(testing)

# Select example rows to display.
predictions.select("prediction", "label").show(30,False)

evaluator = RegressionEvaluator(metricName="rmse")  # rmse (default)|mse|r2|mae
RMSE = evaluator.evaluate(predictions)
print('RMSE: ' + str(RMSE))



#######################################################################################
#
#   Modeling - Gradient Boosting (Regression)
#
#######################################################################################

gbt = GBTRegressor(featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)
#gbt = GBTClassifier(featuresCol="features", labelCol="label", predictionCol="prediction", maxDepth=5, maxBins=32, maxIter=20, seed=12345)

gbtmodel = gbt.fit(training)
Author: zaratsian, Project: pyspark, Lines: 31, Source: NYCTaxi_PySpark.py


Example 15: found

# TRAIN WITH CROSS-VALIDATION
#cv_model = cv.fit(trainDataFrame)
cv_model = cv.fit(trainReg.toDF(['label','features']))


# EVALUATE MODEL ON TEST SET
#testDataFrame = sqlContext.createDataFrame(oneHotTESTreg, ["features", "label"])
testDataFrame = testReg.toDF(['label','features'])

# MAKE PREDICTIONS ON TEST DOCUMENTS
# cvModel uses the best model found (lrModel).
predictionAndLabels = cv_model.transform(testDataFrame)
predictionAndLabels.select("features", "label", "prediction").show()

# validate the results
# metric to measure how well a fitted Model does on held-out test data
evaluator = RegressionEvaluator(metricName="rmse")
rmse = evaluator.evaluate(predictionAndLabels)
print("Root-mean-square error = %s" % rmse)


#### LOGISTIC REGRESSION







Author: dataminelab, Project: framework-training, Lines: 22, Source: linear_regression.py


Example 16: ALS

from pyspark.ml.recommendation import ALS

# Let's initialize our ALS learner
als = ALS()

# Now we set the parameters for the method
als.setMaxIter(5)\
   .setSeed(seed)\
   .setRegParam(0.1)\
   .setUserCol("userId").setItemCol("movieId").setRatingCol("rating")

# Now let's compute an evaluation metric for our test dataset
from pyspark.ml.evaluation import RegressionEvaluator

# Create an RMSE evaluator using the label and predicted columns
reg_eval = RegressionEvaluator(predictionCol="prediction", labelCol="rating", metricName="rmse")

tolerance = 0.03
ranks = [4, 8, 12]
errors = [0, 0, 0]
models = [0, 0, 0]
err = 0
min_error = float('inf')
best_rank = -1
for rank in ranks:
  # Set the rank here:
  als.setRank(rank)
  # Create the model with these parameters.
  model = als.fit(training_df)
  # Run the model to create a prediction. Predict against the validation_df.
  predict_df = model.transform(validation_df)
Author: 23423423424, Project: edx, Lines: 31, Source: cs110_lab2_als_prediction.py


Example 17: HashingTF

posTrain, posTest = pos.randomSplit([0.8, 0.2], seed=17)
hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)

from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
lrPipeline = Pipeline(stages=[hashingTF, lr])
dt = DecisionTreeRegressor(maxDepth=10, maxBins=50)
dtPipeline = Pipeline(stages=[hashingTF, dt])
rf = RandomForestRegressor(maxDepth=10, maxBins=50, numTrees=50)
rfPipeline = Pipeline(stages=[hashingTF, rf])

posLR = lrPipeline.fit(posTrain)
lrPred = posLR.transform(posTest)
posDT = dtPipeline.fit(posTrain)
dtPred = posDT.transform(posTest)
posRF = rfPipeline.fit(posTrain)
rfPred = posRF.transform(posTest)

evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
lr_rmse = evaluator.evaluate(lrPred)
dt_rmse = evaluator.evaluate(dtPred)
rf_rmse = evaluator.evaluate(rfPred)
print("LR RMSE %g, DT RMSE %g, RF RMSE %g" % (lr_rmse, dt_rmse, rf_rmse))

# LR RMSE 0.44829, DT RMSE 0.312846, RF RMSE 0.300322
Author: noodlefrenzy, Project: DecodedSparkML, Lines: 30, Source: sentiment_tagging.py



Note: The pyspark.ml.evaluation.RegressionEvaluator class examples in this article were compiled from GitHub, MSDocs, and other source-code and documentation platforms. The code snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and redistribution and use must follow the corresponding project's license. Please do not repost without permission.

