本文整理汇总了Python中pyspark.mllib.regression.LinearRegressionWithSGD类的典型用法代码示例。如果您正苦于以下问题:Python LinearRegressionWithSGD类的具体用法?Python LinearRegressionWithSGD怎么用?Python LinearRegressionWithSGD使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了LinearRegressionWithSGD类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: do_all
def do_all(f_path,out_name):
    """Fit a linear regression on a text file and save per-row predictions.

    Reads ``f_path``, parses each line with ``parseKeepD``, rescales the
    labeled points with ``conv_label_pt``, trains an unregularised
    ``LinearRegressionWithSGD`` model with an intercept, and writes
    ``(label, prediction, extra)`` triples to ``out_loc + out_name``.

    Args:
        f_path: Path of the input text file.
        out_name: Name appended to the module-level ``out_loc`` to form the
            output location.

    Returns:
        Tuple ``(intercept, weights, se, disparity, ssr, N)`` where ``se`` is
        whatever ``std_errors`` returns, ``ssr`` is the sum of squared
        residuals and ``N`` the number of rows.
    """
    # Column statistics are published as module globals; presumably
    # conv_label_pt reads them when scaling -- TODO confirm.
    global means
    global varis
    sc = SparkContext()
    try:
        data = sc.textFile(f_path)
        # Drop rows whose first element came back as None from the parser.
        data = data.map(parseKeepD).filter(lambda p: p[0] is not None)
        # Scale features
        features = data.map(lambda x: x[0].features)
        summary = Statistics.colStats(features)
        means = summary.mean()
        varis = summary.variance()
        # Scale the points, keeping the second tuple element untouched.
        data = data.map(lambda y: (conv_label_pt(y[0]),y[1]))
        # Train model
        model = LinearRegressionWithSGD.train(data.map(lambda x: x[0]), intercept=True, regType='none')
        # Calculate disparity: (actual label, predicted label, extra field)
        disparity = data.map(lambda p: (p[0].label, model.predict(p[0].features), p[1]))
        # Sum of squared residuals, needed for the standard errors.
        ssr = disparity.map(lambda x: (x[0] - x[1])**2).sum()
        N = disparity.count()
        MSE = ssr/float(N)
        se = std_errors(data,MSE,N)
        disparity.saveAsTextFile(out_loc + out_name)
    finally:
        # Always release the context, even when a Spark job above fails.
        sc.stop()
    # NOTE(review): `disparity` is an RDD of the context stopped above, so
    # callers can presumably no longer evaluate it -- confirm how it is used.
    return model.intercept,model.weights,se,disparity, ssr, N
开发者ID:ssz225,项目名称:bigdata_final,代码行数:35,代码来源:spark_reg_local.py
示例2: test_regression
def test_regression(self):
    """Check that every regressor predicts the correct sign on a toy set.

    Four dense points are used whose label sign equals the sign of the
    second feature. Linear, lasso, ridge, decision-tree, random-forest and
    gradient-boosted models are trained in turn and checked; finally the SGD
    trainers are called with explicit initial weights and must not raise
    ``ValueError``.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    points = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(points)
    features = [pt.features.tolist() for pt in points]

    def check_signs(fitted):
        # Rows 0 and 2 carry label -1, rows 1 and 3 carry label +1.
        self.assertTrue(fitted.predict(features[0]) <= 0)
        self.assertTrue(fitted.predict(features[1]) > 0)
        self.assertTrue(fitted.predict(features[2]) <= 0)
        self.assertTrue(fitted.predict(features[3]) > 0)

    # Linear models trained with SGD.
    check_signs(LinearRegressionWithSGD.train(rdd, iterations=10))
    check_signs(LassoWithSGD.train(rdd, iterations=10))
    check_signs(RidgeRegressionWithSGD.train(rdd, iterations=10))

    # Tree-based models.
    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10, maxBins=4, seed=1))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4))

    # Supplying initial weights as a numpy array must be accepted.
    try:
        LinearRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        LassoWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
        RidgeRegressionWithSGD.train(rdd, initialWeights=array([1.0, 1.0]), iterations=10)
    except ValueError:
        self.fail()
开发者ID:1ambda,项目名称:spark,代码行数:59,代码来源:tests.py
示例3: iterateLRwSGDBatch
def iterateLRwSGDBatch(iterNums, stepSizes, fractions, train, valid):
    """Grid-search mini-batch SGD settings and print the resulting errors.

    For every combination of iteration count, step size and mini-batch
    fraction a model with intercept is trained on ``train``; one line is
    printed with the settings followed by the root of the mean squared error
    on ``train`` and on ``valid``.

    Args:
        iterNums: Iterable of iteration counts.
        stepSizes: Iterable of SGD step sizes.
        fractions: Iterable of mini-batch fractions.
        train: RDD of labeled points used for fitting.
        valid: RDD of labeled points used for validation.
    """
    def root_mean_sq(dataset, fitted):
        # (prediction, label) pairs -> sqrt(mean((prediction - label)^2))
        pairs = dataset.map(lambda x: (fitted.predict(x.features), x.label))
        return math.sqrt(pairs.map(lambda p: pow(p[0]-p[1],2)).mean())

    for n_iter in iterNums:
        for step_size in stepSizes:
            for batch_frac in fractions:
                trainer = LinearRegressionWithSGD()
                fitted = trainer.train(train, intercept=True, iterations=n_iter,
                                       step=step_size, miniBatchFraction=batch_frac)
                train_err = root_mean_sq(train, fitted)
                valid_err = root_mean_sq(valid, fitted)
                print("%d, %5.3f %5.3f -> %.4f, %.4f" % (n_iter, step_size, batch_frac, train_err, valid_err))
开发者ID:AkiraKane,项目名称:first-edition,代码行数:11,代码来源:ch07-listings.py
示例4: iterateLRwSGD
def iterateLRwSGD(iterNums, stepSizes, train, valid):
    """Grid-search SGD iteration counts and step sizes, printing the errors.

    For each (iterations, step) pair a model with intercept is trained on
    ``train``; one line is printed with the settings followed by the root of
    the mean squared error on ``train`` and on ``valid``.

    Args:
        iterNums: Iterable of iteration counts.
        stepSizes: Iterable of SGD step sizes.
        train: RDD of labeled points used for fitting.
        valid: RDD of labeled points used for validation.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD
    import math

    def root_mean_sq(dataset, fitted):
        # Predictions are cast to plain floats before being compared.
        pairs = dataset.map(lambda x: (float(fitted.predict(x.features)), x.label))
        return math.sqrt(pairs.map(lambda p: pow(p[0]-p[1],2)).mean())

    for n_iter in iterNums:
        for step_size in stepSizes:
            trainer = LinearRegressionWithSGD()
            fitted = trainer.train(train, iterations=n_iter, step=step_size, intercept=True)
            train_err = root_mean_sq(train, fitted)
            valid_err = root_mean_sq(valid, fitted)
            print("%d, %5.3f -> %.4f, %.4f" % (n_iter, step_size, train_err, valid_err))
开发者ID:AkiraKane,项目名称:first-edition,代码行数:12,代码来源:ch07-listings.py
示例5: test_regression
def test_regression(self):
    """Check the SGD regressors on a toy set with scipy feature vectors.

    The four points are built through ``self.scipy_matrix`` with a single
    entry at index 1 whose sign equals the label sign. Linear, lasso and
    ridge models are trained with default settings and must predict the
    correct sign for every point.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD

    points = [
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(-1.0, self.scipy_matrix(2, {1: -2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(points)
    features = [pt.features for pt in points]

    def check_signs(fitted):
        # Rows 0 and 2 carry label -1, rows 1 and 3 carry label +1.
        self.assertTrue(fitted.predict(features[0]) <= 0)
        self.assertTrue(fitted.predict(features[1]) > 0)
        self.assertTrue(fitted.predict(features[2]) <= 0)
        self.assertTrue(fitted.predict(features[3]) > 0)

    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))
开发者ID:EronWright,项目名称:spark,代码行数:29,代码来源:tests.py
示例6: regression
def regression():
    """Train and persist a precipitation regression model.

    Reads the parquet data under ``output + "/joinedResults"``, converts each
    row to ``(avg_prcp, yy, latitude, longitude)``, builds per-column max/min
    bounds, turns the rows into training points with
    ``parsePointPrediction``, trains an L2-regularised
    ``LinearRegressionWithSGD`` model on an 80/20 split, prints the test
    mean squared error and saves the model under ``output + "/modelpath"``.

    Relies on the module-level names ``sqlContext``, ``sc`` and ``output``.
    Returns ``None``.
    """
    # Regression Point
    # Reads the data from the joinedResults directory as a parquet file
    datadf = sqlContext.read.parquet(output+"/joinedResults")
    datadf.show()
    data = datadf.rdd.map(lambda w: (float(w.avg_prcp), int(w.yy), float(w.latitude), float(w.longitude)))
    # Tuples compare element by element, so the first field of the max/min
    # tuple is the max/min precipitation.
    max_prcp = data.max()
    min_prcp = data.min()
    lat = data.map(lambda x: (x[2])).cache()
    min_lat = lat.min()
    max_lat = lat.max()
    longt = data.map(lambda x: (x[3])).cache()
    min_long = longt.min()
    max_long = longt.max()
    # Year bounds are fixed constants rather than taken from the data.
    max_ = [max_prcp[0], float(2050), max_lat, max_long]
    min_ = [min_prcp[0], float(1990), min_lat, min_long]
    # Change the format to fit in the LinearRegression library
    parsedData = data.map(lambda x: parsePointPrediction(x, max_, min_)).cache()
    # Split data approximately into training (80%) and test (20%)
    trainData, testData = parsedData.randomSplit([0.8, 0.2], seed = 0)
    trainData.cache()
    testData.cache()
    # Parameters were found by trial and error. `intercept` is a real
    # boolean: any non-empty string (even "false") would count as true.
    model = LinearRegressionWithSGD.train(trainData, iterations =500, regType="l2", regParam=10, intercept=True)
    # Evaluate the model on test data
    valuesAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
    # Index-based lambda instead of tuple parameters, which Python 3 rejects.
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    model.save(sc, output+"/modelpath")
    return
开发者ID:sasoltan,项目名称:DroughtPercipitation,代码行数:34,代码来源:finalcode.py
示例7: evaluate
def evaluate(train,test,iterations,step,regParam,regType,intercept):
    """Train a linear model and return ``[rmse, mae, rmsle]`` on ``test``.

    Args:
        train: RDD of labeled points used for fitting.
        test: RDD of labeled points used for evaluation.
        iterations: Number of SGD iterations.
        step: SGD step size.
        regParam: Regularisation strength.
        regType: Regularisation type passed through to the trainer.
        intercept: Whether to fit an intercept.

    Returns:
        List ``[rmse, mae, rmsle]``: root mean squared error, mean absolute
        error and root mean squared log error, computed with the
        module-level helpers ``squarred_error``, ``abs_error`` and
        ``squared_log_error``.
    """
    model = LinearRegressionWithSGD.train(train, iterations, step,regParam=regParam, regType=regType, intercept=intercept)
    # (true label, predicted label) pairs
    tp = test.map(lambda p: (p.label, model.predict(p.features)))
    rmse = np.sqrt(tp.map(lambda pair: squarred_error(pair[0], pair[1])).mean())
    # A mean absolute error takes no square root.
    mae = tp.map(lambda pair: abs_error(pair[0], pair[1])).mean()
    # Computed from `tp`; the previously referenced name was never defined.
    rmsle = np.sqrt(tp.map(lambda pair: squared_log_error(pair[0], pair[1])).mean())
    opt_metrics = [rmse,mae,rmsle]
    return opt_metrics
开发者ID:kevllino,项目名称:WeatherPred,代码行数:8,代码来源:weather_predict.py
示例8: get_best_result
def get_best_result(best_step_size, training_lp, testing_lp, iterations):
    """Train with the chosen step size and report the test RMSE as text.

    Args:
        best_step_size: SGD step size selected beforehand.
        training_lp: RDD of labeled points used for fitting.
        testing_lp: RDD of labeled points used for evaluation.
        iterations: Number of SGD iterations.

    Returns:
        A human-readable string with the step size and the test RMSE.
    """
    model = LinearRegressionWithSGD.train(training_lp, iterations=iterations, step=best_step_size, regType = 'l2')
    values_and_preds = testing_lp.map(lambda p: (p.label, model.predict(p.features)))
    # Mean (not sum) of the squared errors, so the root below is a true RMSE
    # that does not grow with the size of the test set.
    MSE = values_and_preds.map(lambda vp: (vp[0]-vp[1])**2).mean()
    RMSE = math.sqrt(MSE)
    result_str = 'best step size got by cross validation cv: ' + str(best_step_size) + ', lowest RMSE: ' + str(RMSE)
    return result_str
开发者ID:Veterun,项目名称:SparkPythonHanhan,代码行数:8,代码来源:tfidf_cv_lowestRMSE_normalized.py
示例9: getRMSE
def getRMSE(step_array):
    """Return the validation RMSE obtained for each SGD step size.

    Trains on the module-level ``train_featureScoreTimeRDD`` (5000
    iterations per step size) and evaluates on the module-level
    ``val_featureScoreTimeRDD``.

    Args:
        step_array: Iterable of step sizes to try.

    Returns:
        List of ``(step, validation RMSE)`` tuples in input order.
    """
    # The validation set does not change between steps: count it once
    # instead of launching an extra Spark job per step size.
    val_count = val_featureScoreTimeRDD.count()
    valRMSE_list = []
    for step in step_array:
        model = LinearRegressionWithSGD.train(train_featureScoreTimeRDD, iterations=5000, step=step)
        labelsAndPreds = val_featureScoreTimeRDD.map(lambda p: (p.label, model.predict(p.features)))
        # Index-based lambda instead of tuple parameters, which Python 3 rejects.
        valMSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / val_count
        valRMSE = valMSE**0.5
        valRMSE_list.append((step, valRMSE))
    return valRMSE_list
开发者ID:shaileshr,项目名称:SentimentAnalysis,代码行数:9,代码来源:Qn6.py
示例10: linearRegression
def linearRegression(features,sc,output_n):
    """Train on the first 70 collected rows and predict rows 70-115.

    Args:
        features: RDD whose rows expose ``.label`` and ``.features``
            (presumably LabeledPoint objects -- TODO confirm against caller).
        sc: SparkContext used to redistribute the two slices.
        output_n: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(model, prediction)`` where ``prediction`` is an RDD of
        ``(label, predicted value)`` pairs for the testing slice.
    """
    features_and_label = features.collect()
    training_features_labels = features_and_label[0:70]
    testing_features_labels = features_and_label[70:116]
    # The training/testing RDDs were referenced but never created inside the
    # function; build them from the slices computed above.
    training_data = sc.parallelize(training_features_labels)
    testing_data = sc.parallelize(testing_features_labels)
    # NOTE(review): iterations=0 appears to request zero SGD iterations --
    # confirm this is intended.
    linearregression_model = LinearRegressionWithSGD.train(training_data,iterations=0,regParam=200)
    prediction = testing_data.map(lambda line: (line.label, linearregression_model.predict(line.features)))
    return linearregression_model,prediction
开发者ID:gitofsid,项目名称:StocksPrediction-ML,代码行数:10,代码来源:classifiers_for_stocks.py
示例11: test_regression
def test_regression(self):
    """Check that every regressor predicts the correct sign on a toy set.

    Four dense points are used whose label sign equals the sign of the
    second feature. Linear, lasso, ridge, decision-tree, random-forest and
    gradient-boosted models are trained in turn and each must predict a
    non-positive value for the negative rows and a positive value for the
    positive rows.
    """
    from pyspark.mllib.regression import LinearRegressionWithSGD, LassoWithSGD, \
        RidgeRegressionWithSGD
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees

    points = [
        LabeledPoint(-1.0, [0, -1]),
        LabeledPoint(1.0, [0, 1]),
        LabeledPoint(-1.0, [0, -2]),
        LabeledPoint(1.0, [0, 2])
    ]
    rdd = self.sc.parallelize(points)
    features = [pt.features.tolist() for pt in points]

    def check_signs(fitted):
        # Rows 0 and 2 carry label -1, rows 1 and 3 carry label +1.
        self.assertTrue(fitted.predict(features[0]) <= 0)
        self.assertTrue(fitted.predict(features[1]) > 0)
        self.assertTrue(fitted.predict(features[2]) <= 0)
        self.assertTrue(fitted.predict(features[3]) > 0)

    # Linear models trained with SGD.
    check_signs(LinearRegressionWithSGD.train(rdd))
    check_signs(LassoWithSGD.train(rdd))
    check_signs(RidgeRegressionWithSGD.train(rdd))

    # Tree-based models.
    categoricalFeaturesInfo = {0: 2}  # feature 0 has 2 categories
    check_signs(DecisionTree.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
    check_signs(RandomForest.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100))
    check_signs(GradientBoostedTrees.trainRegressor(
        rdd, categoricalFeaturesInfo=categoricalFeaturesInfo))
开发者ID:greatyan,项目名称:spark,代码行数:52,代码来源:tests.py
示例12: get_best_stepsize
def get_best_stepsize(step_sizes, training_lp, testing_lp, iterations):
    """Pick the SGD step size with the lowest RMSE on ``testing_lp``.

    Args:
        step_sizes: Iterable of candidate step sizes.
        training_lp: RDD of labeled points used for fitting.
        testing_lp: RDD of labeled points used for evaluation.
        iterations: Number of SGD iterations per candidate.

    Returns:
        A human-readable string with the best step size and its RMSE. If
        ``step_sizes`` is empty the string reports step size 0 and ``inf``.
    """
    best_stepsize = 0
    lowest_RMSE = float("inf")
    for step_size in step_sizes:
        model = LinearRegressionWithSGD.train(training_lp, iterations=iterations, step=step_size)
        values_and_preds = testing_lp.map(lambda p: (p.label, model.predict(p.features)))
        # Mean (not sum) of the squared errors, so the root is a true RMSE.
        MSE = values_and_preds.map(lambda vp: (vp[0]-vp[1])**2).mean()
        RMSE = math.sqrt(MSE)
        if RMSE < lowest_RMSE:
            lowest_RMSE = RMSE
            best_stepsize = step_size
    result_str = 'best step size: ' + str(best_stepsize) + ', lowest RMSE: ' + str(lowest_RMSE)
    return result_str
开发者ID:Veterun,项目名称:SparkPythonHanhan,代码行数:14,代码来源:word2vec_best_RMSE.py
示例13: LinearRegression
def LinearRegression(filename, sc):
    """Train a linear regression on ``filename`` and print its training MSE.

    Args:
        filename: Path of the text data set. The previously hard-coded
            sample path is used only when this argument is empty or None.
        sc: SparkContext used to read the file.

    Returns:
        None. The mean squared error on the training data is printed.
    """
    # Honour the caller's path; the fixed sample file is only a fallback.
    if not filename:
        filename = "/Users/Jacob/repository/SparkService/data/lpsa.data"
    data = sc.textFile(filename)
    parsedData = data.map(parsePoint)
    # Train the model
    model = LinearRegressionWithSGD.train(parsedData)
    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    # Index-based lambda instead of tuple parameters, which Python 3 rejects.
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")
开发者ID:bangjieliu,项目名称:SparkService,代码行数:16,代码来源:linear_regression.py
示例14: test_spark
def test_spark():
    """Train a linear regression on Spark's bundled lpsa data and print results.

    Uses the module-level SparkContext ``sc``. Prints the parsed data set,
    the mean squared error on the training data and the fitted model.
    """
    def parsePoint(line):
        # Each line is "label,f1 f2 ...". Splitting on any whitespace also
        # tolerates repeated blanks, which split(' ') would turn into ''.
        values = [float(x) for x in line.replace(',', ' ').split()]
        return LabeledPoint(values[0], values[1:])

    data = sc.textFile(r"/usr/local/Cellar/apache-spark/1.6.1/libexec/data/mllib/ridge-data/lpsa.data")
    parsedData = data.map(parsePoint)
    print(parsedData.collect())
    # Build the model
    model = LinearRegressionWithSGD.train(parsedData)
    # Evaluate the model on training data
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    # Index-based lambda instead of tuple parameters, which Python 3 rejects.
    MSE = valuesAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    print("Mean Squared Error = " + str(MSE))
    # A single concatenated argument prints the same text on Python 2 and 3.
    print("Model coefficients: " + str(model))
开发者ID:WarnWang,项目名称:Dissertation,代码行数:17,代码来源:spark_test.py
示例15: algo
def algo(a):
    """Run the selected algorithm on the module-level data and print timing.

    Only ``a == 'sgd'`` is handled: a ``LinearRegressionWithSGD`` model is
    trained, the absolute prediction for ``week`` is printed, followed by the
    elapsed wall-clock time in seconds. Any other value does nothing.

    Relies on the module-level names ``data``, ``week``, ``target``, ``sc``
    and ``convert``.

    Args:
        a: Name of the algorithm to run.
    """
    global data
    global week
    global target
    test = week
    # Result is unused here; the call is kept in case `convert` matters.
    week_target = week.map(convert)
    data_final = LabeledPoint(target, data)
    # Make the RDD that is the input for the algorithm
    if a == 'sgd':
        # Start the clock here: the timing print below read a name whose
        # only assignment was commented out.
        time_0 = time.time()
        # NOTE(review): parallelize() is handed one LabeledPoint rather than
        # a list of points -- confirm this is what the trainer should get.
        lrm = LinearRegressionWithSGD.train(sc.parallelize(data_final), iterations=10, initialWeights=np.array([1.0]))
        print(abs(lrm.predict(test)))
        print(time.time() - time_0)
开发者ID:mmeoni,项目名称:LHCDataAnalysis,代码行数:17,代码来源:ensemble.py
示例16: linearRegression
def linearRegression(features,sc,output_n):
    """Train on the first 70 collected rows and predict the remaining ones.

    Args:
        features: RDD of rows where index 0 is used as the label and index 1
            as the feature vector.
        sc: SparkContext used to distribute the labeled points.
        output_n: Unused; kept for interface compatibility.

    Returns:
        RDD of ``(label, predicted value as float)`` pairs for the rows after
        the first 70.
    """
    features_and_label = features.collect()
    labeled_training = [LabeledPoint(x[0], x[1]) for x in features_and_label[0:70]]
    labeled_testing = [LabeledPoint(y[0], y[1]) for y in features_and_label[70:]]
    # The trainer works on an RDD, so distribute the training list as well
    # instead of passing the plain Python list.
    train = sc.parallelize(labeled_training)
    test = sc.parallelize(labeled_testing)
    # NOTE(review): iterations=0 appears to request zero SGD iterations --
    # confirm this is intended.
    linearregression_model = LinearRegressionWithSGD.train(train,iterations=0,regParam=200)
    predictions = test.map(lambda line: (line.label, float(linearregression_model.predict(line.features))))
    return predictions
开发者ID:gitofsid,项目名称:StocksPrediction-ML,代码行数:19,代码来源:classifiers_for_stocks_replace_feat.py
示例17: linearRegression_f
def linearRegression_f(mode):
    """Train the requested regression variant and return its training MSE.

    Uses the module-level RDD ``parsedData`` for training and evaluation.

    Args:
        mode: ``"no_reg"`` (plain linear regression), ``"L1_reg"`` (lasso)
            or ``"L2_reg"`` (ridge).

    Returns:
        Mean squared error of the fitted model on ``parsedData``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode == "no_reg":
        model = LinearRegressionWithSGD.train(parsedData)
    elif mode == "L1_reg":
        model = LassoWithSGD.train(parsedData)
    elif mode == "L2_reg":
        model = RidgeRegressionWithSGD.train(parsedData)
    else:
        # Stop here with a clear error instead of failing later on an
        # unbound `model`.
        print("ERROR Mode")
        raise ValueError("ERROR Mode: %r" % (mode,))
    # Evaluate the model on training data: (label, prediction) pairs
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    # Average the squared differences to get the MSE
    MSE = valuesAndPreds.map(lambda vp: (vp[0]-vp[1])**2).reduce(lambda x, y: x+y)/valuesAndPreds.count()
    return MSE
开发者ID:ZaphyrRobin,项目名称:linear_regression_bill_vs_tip,代码行数:19,代码来源:tip_linear_regression.py
示例18: LinearRegression
def LinearRegression(trainFile, testFile, taskid,sc):
    """Train on a LibSVM file and print the MSE measured on a second file.

    Args:
        trainFile: Path of the LibSVM-format training data.
        testFile: Path of the LibSVM-format test data.
        taskid: Unused; kept for interface compatibility.
        sc: SparkContext used to load both files.

    Returns:
        None. The mean squared error on the test data is printed.
    """
    trainData = MLUtils.loadLibSVMFile(sc, trainFile)
    testData = MLUtils.loadLibSVMFile(sc, testFile)
    # Train the model
    model = LinearRegressionWithSGD.train(trainData)
    # Evaluate the model on the test data: (label, prediction) pairs
    labelsAndPredictions = testData.map(lambda p: (p.label, model.predict(p.features)))
    # Index-based lambda instead of tuple parameters, which Python 3 rejects.
    MSE = labelsAndPredictions.map(lambda vp: (vp[0] - vp[1])**2).reduce(lambda x, y: x + y) / labelsAndPredictions.count()
    print("\n\n\n\n\n\nMean Squared Error = " + str(MSE) + "\n\n\n\n\n")
开发者ID:honeycombcmu,项目名称:SparkService,代码行数:20,代码来源:linear_regression.py
示例19: get_best_stepsize
def get_best_stepsize(step_sizes, training_lp, iterations, cv_trails):
    """Choose the SGD step size with the lowest 4-fold cross-validated RMSE.

    ``training_lp`` is split randomly into four equally weighted folds. For
    each candidate step size, each fold in turn is held out, a model is
    trained on the rest and the held-out RMSE is recorded; the candidate with
    the lowest average RMSE wins.

    Args:
        step_sizes: Iterable of candidate step sizes.
        training_lp: RDD of labeled points.
        iterations: Number of SGD iterations per model.
        cv_trails: Unused; kept for interface compatibility. The average is
            taken over the folds actually evaluated.

    Returns:
        The best step size, or 0 if ``step_sizes`` is empty.
    """
    best_stepsize = 0
    lowest_RMSE = float("inf")
    num_folds = 4
    cv_data = training_lp.randomSplit([1]*num_folds)  # 4 folds
    for step_size in step_sizes:
        total_RMSE = 0.0
        for cv_testing in cv_data:
            cv_training = training_lp.subtract(cv_testing)
            model = LinearRegressionWithSGD.train(cv_training, iterations=iterations, step=step_size)
            values_and_preds = cv_testing.map(lambda p: (p.label, model.predict(p.features)))
            # Mean (not sum) of squared errors, so folds of different sizes
            # contribute comparable RMSE values.
            MSE = values_and_preds.map(lambda vp: (vp[0]-vp[1])**2).mean()
            total_RMSE += math.sqrt(MSE)
        # num_folds RMSE values were summed, so divide by num_folds.
        avg_RMSE = total_RMSE/num_folds
        if avg_RMSE < lowest_RMSE:
            lowest_RMSE = avg_RMSE
            best_stepsize = step_size
    return best_stepsize
开发者ID:Veterun,项目名称:SparkPythonHanhan,代码行数:22,代码来源:tfidf_cv_lowestRMSE_normalized.py
示例20: train_amount_model
def train_amount_model(self, model, data, i):
    """Train the amount-prediction model selected on this instance.

    Args:
        model: Previously trained model or None. The neural-network branch
            passes it on to the trainer; the linear-regression branch reuses
            its ``weights`` as initial weights when it is not None.
        data: Local collection of training rows; distributed with
            ``self.sc.parallelize``.
        i: Unused in this method.

    Returns:
        The newly trained model.

    Raises:
        ValueError: If ``self.amount_prediction_method`` is not one of the
            supported methods.
    """
    rdd_data = self.sc.parallelize(data)
    self.logger.info('Start to train the amount model')
    if self.amount_prediction_method == self.ARTIFICIAL_NEURAL_NETWORK:
        input_num = self.feature_num
        # Floor division keeps the layer sizes integral on Python 3 as well.
        layers = [input_num, input_num // 3 * 2, input_num // 3, 1]
        neural_network = NeuralNetworkSpark(layers=layers, bias=0)
        model = neural_network.train(rdd_data, method=neural_network.BP, seed=1234, learn_rate=0.0001,
                                     iteration=15, model=model)
    elif self.amount_prediction_method == self.RANDOM_FOREST:
        model = RandomForest.trainRegressor(rdd_data, categoricalFeaturesInfo={}, numTrees=40,
                                            featureSubsetStrategy="auto", impurity='variance', maxDepth=20,
                                            maxBins=32)
    elif self.amount_prediction_method == self.LINEAR_REGRESSION:
        model = LinearRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                              initialWeights=model.weights if model is not None else None)
    else:
        # Build the message once so the log and the exception cannot drift.
        message = "Unknown training method {}".format(self.amount_prediction_method)
        self.logger.error(message)
        raise ValueError(message)
    return model
开发者ID:WarnWang,项目名称:Dissertation,代码行数:23,代码来源:composition_prediction_system.py
注:本文中的pyspark.mllib.regression.LinearRegressionWithSGD类示例由纯净天空整理自GitHub/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论