本文整理汇总了Python中pyspark.mllib.classification.LogisticRegressionWithSGD类的典型用法代码示例。如果您正苦于以下问题:Python LogisticRegressionWithSGD类的具体用法?Python LogisticRegressionWithSGD怎么用?Python LogisticRegressionWithSGD使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了LogisticRegressionWithSGD类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: main
def main():
    """Train and evaluate a logistic-regression image classifier on Spark.

    Reads positive examples from ``pos.csv`` and negative examples from
    ``neg.csv`` (one comma-separated feature row per line), trains on a
    random 70% split, prints the error rate measured on the remaining 30%
    and pickles the trained model to ``imageModel.pk1``.
    """
    # NOTE(review): presumably this call is what produces pos.csv / neg.csv
    # from the training images -- confirm against the helper's definition.
    MakePixelFileFromImages("./CarData/TrainImages/*pgm")
    sc = SparkContext(appName="Image Classifier 01")
    try:
        p = sc.textFile("pos.csv")
        n = sc.textFile("neg.csv")
        pFeatures = p.map(lambda image: image.split(","))
        nFeatures = n.map(lambda image: image.split(","))
        pExamples = pFeatures.map(lambda features: LabeledPoint(1, features))
        nExamples = nFeatures.map(lambda features: LabeledPoint(0, features))
        data = pExamples.union(nExamples)
        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        trainingData.cache()
        model = LogisticRegressionWithSGD.train(trainingData)
        labels_and_predictions = testData.map(
            lambda image: (image.label, model.predict(image.features)))
        # Index into the (label, prediction) pair instead of using a
        # tuple-parameter lambda, which is a syntax error on Python 3.
        wrong = labels_and_predictions.filter(lambda pair: pair[0] != pair[1]).count()
        error_rate = wrong / float(testData.count())
        print("************* RESULTS *******************")
        print("Error Rate: " + str(error_rate))
        # The context manager closes (and flushes) the file even if
        # pickling raises.
        with open("imageModel.pk1", "wb") as model_file:
            pickle.dump(model, model_file)
    finally:
        # Always release the Spark context, including on failure.
        sc.stop()
开发者ID:samacart,项目名称:Park-Or-Bird,代码行数:28,代码来源:spark-ImageClassifier.py
示例2: modelWithLogisticRegression
def modelWithLogisticRegression(trainingData, validationData):
    """Pick the SGD logistic-regression model with the best validation accuracy.

    Trains one model per regularization value (200 iterations, step size
    1.0) and scores each on ``validationData``.

    Args:
        trainingData: RDD of LabeledPoint used for fitting.
        validationData: RDD of LabeledPoint used for scoring.

    Returns:
        A tuple ``(bestLRModel, visualizationData)``. ``visualizationData``
        is a list of ``(regularizer, accuracy)`` pairs in the order tried.
        ``bestLRModel`` is ``None`` when no model scores an accuracy above 0.
    """
    # eta = [0.1, 0.3, 0.5, 1.0, 5.0]
    regularizationParamater = [.00000001, .0000005, 1., 1000., 100000.]
    bestLRModel = None
    bestAccuracy = 0
    numOfIterations = 200
    visualizationData = []
    # The validation set does not change between candidates, so count it
    # once instead of launching the same Spark job for every regularizer.
    totalValidationAds = validationData.count()
    for regularizer in regularizationParamater:
        model = LogisticRegressionWithSGD.train(
            trainingData, numOfIterations, 1.0, regParam=regularizer)
        # The lambda closes over ``model``; the count() below forces
        # evaluation before ``model`` is rebound by the next iteration.
        predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
        correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correctlyPredicted) / totalValidationAds
        visualizationData.append((regularizer, accuracy))
        if accuracy > bestAccuracy:
            bestAccuracy = accuracy
            bestLRModel = model
    return bestLRModel, visualizationData
开发者ID:Abhishek-Arora,项目名称:Ad-Click-Prediction,代码行数:29,代码来源:predictClick.py
示例3: test_classification
def test_classification(self):
    """Check that LR, SVM and naive Bayes separate a tiny dense data set."""
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    dense_rows = [point.features.tolist() for point in data]

    # Train each classifier in turn and verify it right away, so the order
    # of training and assertions matches one-model-at-a-time testing.
    for trainer in (LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes):
        fitted = trainer.train(rdd)
        self.assertTrue(fitted.predict(dense_rows[0]) <= 0)
        self.assertTrue(fitted.predict(dense_rows[1]) > 0)
        self.assertTrue(fitted.predict(dense_rows[2]) <= 0)
        self.assertTrue(fitted.predict(dense_rows[3]) > 0)
开发者ID:Altiscale,项目名称:OBSOLETE-spark,代码行数:28,代码来源:tests.py
示例4: train_committee
def train_committee(train_features, test_features, size=5):
    """Build a committee of logistic-regression models trained on resamples.

    Each attempt trains a model on a with-replacement sample of the
    training pairs and keeps it only if its area under the ROC curve on
    the test pairs exceeds 0.7.

    Args:
        train_features: RDD whose elements are fed to ``process_batch``.
        test_features: RDD whose elements are fed to ``process_batch``.
        size: number of models wanted; at most ``size * 4`` attempts are made.

    Returns:
        List of accepted models; it may hold fewer than ``size`` entries
        if the attempt budget runs out.
    """
    committee = []
    attempts = 0
    max_attempts = size * 4
    roc_threshold = 0.7

    test_pairs_features = test_features.map(lambda p: process_batch(p, is_train=True))
    test_labeled_pairs = test_pairs_features.map(to_labeled_point)

    # These transformations do not depend on the attempt, so define them
    # once; only the resampling below differs between attempts.
    train_pairs_features = train_features.map(lambda p: process_batch(p, is_train=True))
    train_labeled_pairs = train_pairs_features.map(to_labeled_point)

    while len(committee) < size and attempts < max_attempts:
        attempts += 1
        # Sample with replacement at fraction 1: a fresh resample per model.
        labeled_points = train_labeled_pairs.sample(True, 1)
        model = LogisticRegressionWithSGD.train(labeled_points)
        # Clear the decision threshold so predict() yields raw scores,
        # which is what the ROC computation needs.
        model.clearThreshold()
        scores_and_labels = test_labeled_pairs.map(
            lambda p: (model.predict(p.features), p.label))
        metrics = BinaryClassificationMetrics(scores_and_labels)
        if metrics.areaUnderROC > roc_threshold:
            print(attempts, metrics.areaUnderROC)
            committee.append(model)
    return committee
开发者ID:siauPatrick,项目名称:seimur,代码行数:25,代码来源:convert_pairs.py
示例5: test_classification
def test_classification(self):
    """Check that LR, SVM and naive Bayes separate a tiny sparse data set."""
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]
    # Whether each sample, in order, must be predicted as positive.
    expect_positive = (False, True, False, True)

    def check_predictions(model):
        # Walk the samples in their original order, one assertion each.
        for vector, positive in zip(features, expect_positive):
            prediction = model.predict(vector)
            if positive:
                self.assertTrue(prediction > 0)
            else:
                self.assertTrue(prediction <= 0)

    check_predictions(LogisticRegressionWithSGD.train(rdd))
    check_predictions(SVMWithSGD.train(rdd))
    check_predictions(NaiveBayes.train(rdd))
开发者ID:EronWright,项目名称:spark,代码行数:28,代码来源:tests.py
示例6: logistic_l2_accuracy
def logistic_l2_accuracy(x_train, x_test, regParam):
    """Return the test accuracy of an L2-regularized SGD logistic regression.

    Args:
        x_train: RDD of LabeledPoint used for training (cached here).
        x_test: RDD of LabeledPoint used for evaluation.
        regParam: regularization strength passed to the trainer.

    Returns:
        Fraction of ``x_test`` points whose predicted label equals the true
        label, as a float.
    """
    # Cache the training data: SGD makes repeated passes over it.
    xc = x_train.cache()
    model = LogisticRegressionWithSGD.train(xc, regParam=regParam, regType="l2")
    yhat = x_test.map(lambda p: (p.label, model.predict(p.features)))
    # Index into the (label, prediction) pair; a tuple-parameter lambda
    # is a syntax error on Python 3.
    n_correct = yhat.filter(lambda pair: pair[0] == pair[1]).count()
    return n_correct / float(x_test.count())
开发者ID:SemanticBeeng,项目名称:optunity,代码行数:9,代码来源:logistic.py
示例7: main
def main():
    """
    Driver program for a spam filter using Spark and MLLib.

    Trains a logistic-regression model on hashed word features, prints the
    error rate on a held-out 30% split and pickles the model to
    ``spamFilter.pkl``.
    """
    # Consolidate the individual email files into a single spam file
    # and a single ham file
    makeDataFileFromEmails("data/spam_2/", "data/spam.txt")
    makeDataFileFromEmails("data/easy_ham_2/", "data/ham.txt")

    # Create the Spark Context for parallel processing
    sc = SparkContext(appName="Spam Filter")
    try:
        # Load the spam and ham data files into RDDs
        spam = sc.textFile("data/spam.txt")
        ham = sc.textFile("data/ham.txt")

        # Create a HashingTF instance to map email text to vectors of 10,000 features.
        tf = HashingTF(numFeatures=10000)

        # Each email is split into words, and each word is mapped to one feature.
        spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
        hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

        # Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
        positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
        negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

        # Combine positive and negative datasets into one
        data = positiveExamples.union(negativeExamples)

        # Split the data into 70% for training and 30% test data sets
        (trainingData, testData) = data.randomSplit([0.7, 0.3])

        # Cache the training data to optimize the Logistic Regression
        trainingData.cache()

        # Train the model with Logistic Regression using the SGD algorithm.
        model = LogisticRegressionWithSGD.train(trainingData)

        # Create tuples of actual and predicted values
        labels_and_predictions = testData.map(
            lambda email: (email.label, model.predict(email.features)))

        # Calculate the error rate as number wrong / total number.
        # The pair is indexed rather than unpacked in the lambda signature,
        # because tuple parameters are a syntax error on Python 3.
        wrong = labels_and_predictions.filter(lambda pair: pair[0] != pair[1]).count()
        error_rate = wrong / float(testData.count())

        print("*********** SPAM FILTER RESULTS **********")
        print("\n")
        print("Error Rate: " + str(error_rate))
        print("\n")

        # Serialize the model for persistence; the context manager closes
        # the file even if pickling fails.
        with open("spamFilter.pkl", "wb") as model_file:
            pickle.dump(model, model_file)
    finally:
        # Always release the Spark context, including on failure.
        sc.stop()
开发者ID:badpaper,项目名称:coursework,代码行数:54,代码来源:spamFilter.py
示例8: test_classification
def test_classification(self):
    """Check six MLlib classifiers on a tiny, linearly separable data set."""
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]
    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories

    # One deferred training call per classifier. Each is invoked only when
    # its turn comes, so training and assertions stay interleaved.
    builders = [
        lambda: LogisticRegressionWithSGD.train(rdd),
        lambda: SVMWithSGD.train(rdd),
        lambda: NaiveBayes.train(rdd),
        lambda: DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo),
        lambda: RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100),
        lambda: GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo),
    ]
    for build in builders:
        classifier = build()
        self.assertTrue(classifier.predict(features[0]) <= 0)
        self.assertTrue(classifier.predict(features[1]) > 0)
        self.assertTrue(classifier.predict(features[2]) <= 0)
        self.assertTrue(classifier.predict(features[3]) > 0)
开发者ID:greatyan,项目名称:spark,代码行数:51,代码来源:tests.py
示例9: getLogisticRegressionModel
def getLogisticRegressionModel(Train_Data):
    """Train an SGD logistic-regression model with fixed hyperparameters.

    Uses 10 iterations, step size 10, a mini-batch fraction of 0.1, L2
    regularization of 1e-6 and a fitted intercept.

    Args:
        Train_Data: training data handed to the trainer as ``data``.

    Returns:
        The model returned by ``LogisticRegressionWithSGD.train``.
    """
    hyperparameters = {
        'iterations': 10,
        'miniBatchFraction': 0.1,
        'step': 10.,
        'regParam': 1e-6,
        'regType': 'l2',
        'intercept': True,
    }
    return LogisticRegressionWithSGD.train(data=Train_Data, **hyperparameters)
开发者ID:danielnazareth89,项目名称:603-Masters-Project-Apache-Spark-,代码行数:16,代码来源:FeatureHashing.py
示例10: logisticRegression
def logisticRegression(trainingRDD, trainingRDDHashed,
                       testRDDHashed, iterations, minibatch, stepsize):
    """Train an L2 logistic regression and report accuracy and F-score.

    Args:
        trainingRDD: data the model is trained on.
        trainingRDDHashed: RDD scored as the first (validation/training)
            set; element ``[0]`` is compared with the prediction made from
            element ``[1]`` (presumably label and features -- confirm
            against the caller).
        testRDDHashed: RDD scored as the test set, with the same layout.
        iterations: number of SGD iterations.
        minibatch: mini-batch fraction for SGD.
        stepsize: SGD step size.

    Returns:
        Tuple ``(AccuracyV, fScoreV, AccuracyT, fScoreT)``.
    """
    # Train a Logistic Regression Model
    trainedModel = LogisticRegressionWithSGD.train(
        trainingRDD,
        iterations=iterations,
        miniBatchFraction=minibatch,
        regType="l2",
        intercept=True,
        regParam=0.1,
        step=stepsize)

    # Test on Validation and Test Sets: count how often each outcome
    # state, as classified by checkState, occurs.
    resultsValidation = trainingRDDHashed.map(
        lambda l_v: ((l_v[0], trainedModel.predict(l_v[1])), 1)
    ).map(
        lambda x_y: (checkState(x_y[0]), x_y[1])
    ).reduceByKey(add).collectAsMap()
    resultsTest = testRDDHashed.map(
        lambda l_v: ((l_v[0], trainedModel.predict(l_v[1])), 1)
    ).map(
        lambda x_y: (checkState(x_y[0]), x_y[1])
    ).reduceByKey(add).collectAsMap()

    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()

    # Create a dictionary of the Values; missing states count as 0
    resultsValidation = defaultdict(lambda: 0, resultsValidation)
    resultsTest = defaultdict(lambda: 0, resultsTest)

    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)

    # Print Results. The formatting must happen inside the print() call:
    # ``print(...) % (...)`` applies % to print's return value (None) and
    # raises TypeError on Python 3.
    print('   Results for Logistic Regression')
    print('      Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print('      Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))

    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
开发者ID:AkiraKane,项目名称:CityUniversity2014,代码行数:47,代码来源:ackf415-Local-LR-Optimisation.py
示例11: train_trend_model
def train_trend_model(self, model, data, i):
    """Train the direction (trend) classifier selected by this instance.

    Args:
        model: previously trained model or ``None``. For the SGD-based
            methods its weights seed the new training run.
        data: local collection of training points, parallelized here.
        i: not used by this method.

    Returns:
        The newly trained model, or ``model`` unchanged when the configured
        method matches none of the supported ones.
    """
    self.logger.info('Start to train the direction model')
    rdd_data = self.sc.parallelize(data)

    if self.trend_prediction_method == self.RANDOM_FOREST:
        return RandomForest.trainClassifier(
            rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
            featureSubsetStrategy="auto", impurity='gini', maxDepth=20, maxBins=32)

    if self.trend_prediction_method == self.NAIVE_BAYES:
        return NaiveBayes.train(rdd_data)

    if self.trend_prediction_method == self.LOGISTIC_REGRESSION:
        # Warm-start from the previous model's weights when one exists.
        start_weights = model.weights if model is not None else None
        return LogisticRegressionWithSGD.train(
            rdd_data, iterations=10000, step=0.001, initialWeights=start_weights)

    if self.trend_prediction_method == self.SVM:
        start_weights = model.weights if model is not None else None
        return SVMWithSGD.train(
            rdd_data, iterations=10000, step=0.001, initialWeights=start_weights)

    return model
开发者ID:WarnWang,项目名称:Dissertation,代码行数:19,代码来源:composition_prediction_system.py
示例12: main
def main(input_file_path):
    """Train a gender classifier and write predictions for unlabeled rows.

    Rows of the CSV whose fourth field is non-empty (and whose first field
    is not the ``INDEX`` header) are used for training; rows whose fourth
    field is empty are predicted and written to ``result.csv``.

    Args:
        input_file_path: path of the input CSV, read through ``sc.textFile``.
    """
    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    traning_data_RDD = data.filter(lambda line: line.split(',')[3] != '' and line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[3] == '')

    traning_data_pddf = create_pddf(traning_data_RDD)
    traning_data_df = sqlContext.createDataFrame(traning_data_pddf)
    print(traning_data_df.head())

    parsed_data = rdd_to_labeled_point(traning_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]

    logisticRegressionWithSGD = LogisticRegressionWithSGD.train(parsed_data, iterations=100)

    labels_and_preds = parsed_data.map(lambda lp: [lp.label, logisticRegressionWithSGD.predict(lp.features)])
    Accuracy = labels_and_preds.filter(lambda ele: int(ele[0]) == int(ele[1])).count() / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    # The context manager guarantees the result file is flushed and closed.
    # The handle and the loop variable get their own names so they do not
    # shadow the builtin ``file`` or the ``data`` RDD.
    with open('/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result.csv', 'w',
              encoding='utf-8') as result_file:
        result_file.write('INDEX,GENDER\n')
        for row in unseen_parsed_data.collect():
            # row[0] is the index and row[1] the features; the predicted
            # class is shifted by 1 before being written.
            result_file.write(str(row[0]) + ',' + str(logisticRegressionWithSGD.predict(row[1]) + 1) + '\n')

    # print(labels_and_preds.collect())

    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
开发者ID:Ggoals,项目名称:SNU-project,代码行数:41,代码来源:1-logisticRegressionWithSGD.py
示例13: test_classification
def test_classification(self):
    """Check LR, SVM, naive Bayes and a decision tree on sparse inputs."""
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree
    data = [
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
        LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
        LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features for p in data]

    def assert_separates(classifier):
        # Samples 0 and 2 are negatives; samples 1 and 3 are positives.
        negative_a, positive_a, negative_b, positive_b = features
        self.assertTrue(classifier.predict(negative_a) <= 0)
        self.assertTrue(classifier.predict(positive_a) > 0)
        self.assertTrue(classifier.predict(negative_b) <= 0)
        self.assertTrue(classifier.predict(positive_b) > 0)

    lr_model = LogisticRegressionWithSGD.train(rdd)
    assert_separates(lr_model)

    svm_model = SVMWithSGD.train(rdd)
    assert_separates(svm_model)

    nb_model = NaiveBayes.train(rdd)
    assert_separates(nb_model)

    categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
    dt_model = DecisionTree.trainClassifier(
        rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
    assert_separates(dt_model)
开发者ID:drewrobb,项目名称:spark,代码行数:37,代码来源:test_linalg.py
示例14: anom_with_lr
def anom_with_lr():
    """Score unlabeled patients for anomalousness with logistic regression.

    Trains an SGD logistic-regression model on the known anomalous patients
    plus a random sample of the remaining ("benign") ones, prints accuracy,
    false-positive and false-negative rates on a 50% hold-out, then scores
    the benign patients left out of the sample and writes the 10,000
    highest-scoring ones to CSV.

    Returns:
        An RDD of ``(features, predicted score, label)`` tuples for the
        patients that were not sampled into the model. Any exception is
        caught and its traceback printed; in that case the return value is
        whatever ``for_finding_more`` held when the failure happened
        (``None`` if it failed before the first assignment).
    """
    # Bound up front so the final ``return`` cannot raise a NameError when
    # the try block fails before the first assignment.
    for_finding_more = None
    try:
        plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv")  # 69.2 MB
        pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep=",")
        anom = pat_proc.filter(pat_proc.is_anomalous == 1)
        benign = pat_proc.filter(pat_proc.is_anomalous == 0)
        n_benign = benign.count()

        # Take a random sample of 50K from the unlabeled 100K
        sqlContext.registerFunction("my_random", lambda x: x - x + random())
        sqlContext.registerDataFrameAsTable(benign, "benign")
        benign = sqlContext.sql("SELECT *, my_random(is_anomalous) as random_number FROM benign")
        # Float literal: under Python 2 integer division the ratio would
        # truncate to 0 and nothing would be sampled.
        threshold = 50000.0 / n_benign
        into_model = benign.filter(benign.random_number <= threshold)
        for_finding_more = benign.filter(benign.random_number > threshold)

        for_modeling = anom.unionAll(into_model.drop(into_model.random_number))
        for_finding_more = for_finding_more.drop(for_finding_more.random_number)
        # Try to pull this from a much larger sample, or, the entire data, because the ones with lowest probabilities, among
        # the selected 10,000, have probabilities around 0.05
        print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count()) + ", into_model.count() = " + str(into_model.count())
              + ", for_modeling.count() = " + str(for_modeling.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))

        all_columns = for_modeling.columns
        features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
        categorical_features = ["age_group", "gender", "income_range"]  # We are listing these 3 as categorical features only as the procedure features have 0-1 values anyway
        procedure_features = [x for x in features if (x not in categorical_features)]

        # Unlike decision tree, logistic regression does not need the map categoricalFeaturesInfo, just an RDD of LabeledPoint objects.
        # Create a dictionary where the key-value pairs are as follows: key is the name of the categorical feature, and value is a list with the following entries:
        # 1) an id of the feature that is incremented sequentially, 2) no. of distinct values of the feature, 3) a list of the distinct values of the feature.
        cat_feature_number = 0
        dict_cat_features = {}

        for feature in categorical_features:
            agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
            # list(...) keeps this working where dict.values() is a view
            # that cannot be indexed (Python 3).
            distinct_values = [list(row.asDict().values())[0] for row in agvalues]
            distinct_values = sorted(unicode_val.encode('ascii', 'ignore') for unicode_val in distinct_values)
            dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]
            cat_feature_number += 1

        for_modeling = for_modeling.rdd
        print("for_modeling.getNumPartitions() = " + str(for_modeling.getNumPartitions()))  # 4 partitions: the default should be the number of logical cores, which is 8
        (train, test) = for_modeling.randomSplit([0.5, 0.5])
        test_data_size = test.count()
        print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
        training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
        print("training_data.count() = " + str(training_data.count()))

        t0 = time()
        # model = LogisticRegressionWithLBFGS.train(training_data) #LBFGS took 66.766 seconds
        model = LogisticRegressionWithSGD.train(training_data)  # SGD took 69.261 seconds
        tt = time() - t0
        print("Classifier trained in {} seconds".format(round(tt, 3)))

        test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))

        t0 = time()
        # Result is never consumed, so this only times the call itself.
        predictions = model.predict(test_data.map(lambda p: p.features))
        tt = time() - t0
        print("Prediction made in {} seconds".format(round(tt, 3)))  # Reports as 0.0 seconds

        labelsAndPreds = test_data.map(lambda p: (p.label, model.predict(p.features)))
        # Pairs are indexed ([0] = label, [1] = prediction) because
        # tuple-parameter lambdas are a syntax error on Python 3. The
        # denominators are converted to float so the rates never truncate.
        test_accuracy = labelsAndPreds.filter(lambda vp: vp[0] == vp[1]).count() / float(test_data_size)
        fpr = labelsAndPreds.filter(lambda vp: (vp[0] == 0 and vp[1] == 1)).count() / float(labelsAndPreds.filter(lambda vp: vp[0] == 0).count())
        fnr = labelsAndPreds.filter(lambda vp: (vp[0] == 1 and vp[1] == 0)).count() / float(labelsAndPreds.filter(lambda vp: vp[0] == 1).count())
        print("Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4)))  # Test accuracy is 0.9057, fpr is 0.1634, fnr is 0.0282

        # Clear the decision threshold so predict() returns scores rather
        # than 0/1 labels for the ranking below.
        model.clearThreshold()
        for_finding_more = for_finding_more.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))  # OK
        for_finding_more = for_finding_more.map(lambda p: (p.features, model.predict(p.features), p.label))  # OK
        try:
            for_finding_more.first()  # We perform an action here because otherwise the output will be a PipelinedRDD.
            # Reverse-sort the additional patients by their predicted probabilities of being anomalous and take the top 10,000
            # for_finding_more.take(5)
        except EOFError:
            print("EOF handled")
        df = sqlContext.createDataFrame(for_finding_more.collect(), ['features', 'predicted_prob', 'is_anom'])
        df = df.orderBy(df.predicted_prob.desc())  # The orderBy is not actually called if collect() is not called. Can be also triggered by calling take(). We are triggering it by the writing in the next statement.
        df.select('is_anom', 'predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save('file:///Users/blahiri/healthcare/data/cloudera_challenge/additional_10000_from_spark.csv')  # Top one has
        # probability of 0.86818, last one has probability 0.5928958
    except Exception:
        # Top-level boundary: report the failure and fall through to the return.
        print("Exception in user code:")
        traceback.print_exc(file=sys.stdout)
    return for_finding_more
开发者ID:bibudhlahiri,项目名称:healthcare,代码行数:93,代码来源:analyze_anomaly_with_spark.py
示例15: test_classification
def test_classification(self):
    """Check six MLlib classifiers and the save/load round trip of tree models.

    Every classifier must separate a tiny dense data set. Each tree-based
    model is additionally saved to a temporary directory, reloaded and
    compared through its debug string.
    """
    from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
    from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
        RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
    data = [
        LabeledPoint(0.0, [1, 0, 0]),
        LabeledPoint(1.0, [0, 1, 1]),
        LabeledPoint(0.0, [2, 0, 0]),
        LabeledPoint(1.0, [0, 2, 1])
    ]
    rdd = self.sc.parallelize(data)
    features = [p.features.tolist() for p in data]

    temp_dir = tempfile.mkdtemp()
    # Everything that can fail runs inside try/finally so the temporary
    # directory is removed even when an assertion or training call raises.
    try:
        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())
    finally:
        # Best-effort cleanup: a failure to delete must not mask the
        # outcome of the test itself.
        try:
            rmtree(temp_dir)
        except OSError:
            pass
示例16: parsePoint
from pyspark.mllib.regression import LabeledPoint
from numpy import array
import parse
# Script: train a logistic-regression model on the UCI Adult training file
# and print (label, prediction) pairs for the UCI Adult test file.
#
# Each line is converted by parse.parsePoint. An earlier inline version of
# that parser is kept below for reference (presumably the `parse` module
# now holds the current one -- confirm):
#def parsePoint(line): # Creating vector(array) with first input as y and others as xi's
# values = [float(x) for x in line.split(',')]
# return LabeledPoint(values[10], values[0:9])
# Initialize the SparkContext.
sc = SparkContext("local[4]", "Logistic Regression")
# Load the raw training lines into an RDD of strings.
data = sc.textFile("/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.data")
# Convert each input line with parse.parsePoint (presumably into a LabeledPoint -- confirm).
parsedData = data.map(parse.parsePoint)
# Build the model by passing the parsed RDD to LogisticRegressionWithSGD.train.
model = LogisticRegressionWithSGD.train(parsedData)
# The model is used below to create the output.
#model.predict().collect() # in "predict" method we have to pass an array
# Read the test data and parse it the same way as the training data.
Testdata = sc.textFile("/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.test")
parsedTestData = Testdata.map(parse.parsePoint)
# Predict a label for every test point. This evaluates on the TEST set
# (not the training data): each element p contributes the pair
# (true label, predicted label).
labelsAndPreds = parsedTestData.map(lambda p: (p.label, model.predict(p.features)))
# Wall-clock timestamp in milliseconds.
# NOTE(review): `time` is not imported in the visible part of this script -- confirm it is imported elsewhere.
millis2 = int(round(time.time() * 1000))
print labelsAndPreds.collect()
# Printing the testing error would follow here (not part of this excerpt).
开发者ID:Ayush-iitkgp,项目名称:Spark-MLlib-Python,代码行数:31,代码来源:LogisticRegression.py
示例17: len
.map(lambda lp: len(lp.features.indices))
.sum())
Test.assertEquals(numNZVal, 372080, 'incorrect number of features')
# ** CTR prediction and logloss evaluation **
from pyspark.mllib.classification import LogisticRegressionWithSGD
# Fixed hyperparameters for the baseline click-through-rate model.
numIters = 50
stepSize = 10.
regParam = 1e-6
regType = 'l2'
includeIntercept = True
# Positional arguments after the data are, per the PySpark signature:
# iterations, step, miniBatchFraction (1.0), initialWeights (None),
# regParam, regType, intercept.
model0 = LogisticRegressionWithSGD.train(OHETrainData, numIters, stepSize, 1.0, None, regParam, regType, includeIntercept)
# Sort the learned weights in ascending order so the five smallest can be
# printed and checked.
sortedWeights = sorted(model0.weights)
print sortedWeights[:5], model0.intercept
# TEST Logistic regression
# The expected values are exact numeric results for this data set and these
# hyperparameters; changing either invalidates them.
Test.assertTrue(np.allclose(model0.intercept, 0.56455084025), 'incorrect value for model0.intercept')
Test.assertTrue(np.allclose(sortedWeights[0:5],
[-0.45899236853575609, -0.37973707648623956, -0.36996558266753304,
-0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights')
# ** Log loss **
from math import log
def computeLogLoss(p, y):
开发者ID:samkujovich,项目名称:SparkExperience,代码行数:31,代码来源:ClickThroughPrediction.py
示例18: train
def train(self, num_iterations=10):
    """Fit an SGD logistic regression and wrap it with the feature columns.

    Args:
        num_iterations: number of SGD iterations passed to the trainer.

    Returns:
        A ``LogisticRegressionModel`` built from the trained MLlib model and
        ``self.feature_cols``.
    """
    training_rdd = self._labeled_feature_vector_rdd()
    fitted = LogisticRegressionWithSGD.train(training_rdd, num_iterations)
    return LogisticRegressionModel(fitted, self.feature_cols)
开发者ID:Atigeo,项目名称:xpatterns-xframe,代码行数:5,代码来源:classify.py
示例19: float
# Fraction of rock samples among all samples; used below as the keep
# probability for points whose label is not 1.0.
cutoff = float(nrock) / (nrock + nxrock)
# recombine
# NOTE(review): this union is overwritten by the very next statement, so it
# has no effect on the result -- confirm which of the two was intended.
equalSampleData = labeledRock.union(labeledNotRock)
# Keep every point labeled 1.0; keep each other point with probability `cutoff`.
equalSampleData = labeledData.filter(lambda p: random.random() < cutoff if p.label != 1.0 else True)
# split data
# NOTE(review): randomSplit is called here as a free function, not as an RDD
# method -- presumably a helper defined elsewhere in the file; confirm.
trainData, testData = randomSplit(equalSampleData, [0.9, 0.1])
# The result of this take(3) is discarded (looks like a leftover inspection step).
trainData.map(lambda p: (p.label, p.features)).take(3)
# train model
model = LogisticRegressionWithSGD.train(trainData, intercept=False, iterations=10000)
# Alternative models that were tried:
# model = LinearRegressionWithSGD.train(trainData, step = 0.1, iterations=1000)
# model = SVMWithSGD.train(trainData, step=1, iterations=1000, intercept=True)
# evaluate model
# labelsAndPreds = testData.map(lambda p: (p.label, 1 if model.predict(p.features) > 0.5 else 0))
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))
# Accuracy: share of test points whose prediction equals the label.
# NOTE: the tuple-parameter lambdas below are Python 2-only syntax.
accuracy = labelsAndPreds.filter(lambda (v, p): v == p).count() / float(testData.count())
# Precision for class 1: correct predictions among the points predicted as 1.
guess1 = labelsAndPreds.filter(lambda (v, p): p == 1)
precision1 = guess1.filter(lambda (v, p): v == p).count() / float(guess1.count())
# Recall for class 1: correct predictions among the points actually labeled 1.
act1 = labelsAndPreds.filter(lambda (v, p): v == 1)
recall1 = act1.filter(lambda (v, p): v == p).count() / float(act1.count())
开发者ID:ScalingUpMusic,项目名称:SUMsandbox,代码行数:29,代码来源:rock_ml.py
示例20: print
# Script: load a CSV through spark-csv, turn its rows into LabeledPoints and
# train several MLlib classifiers on a 70/30 random split.
print(BASE_DATA_PATH)
# Local Spark with two worker threads.
conf = (SparkConf().setMaster("local[2]").setAppName("Testing MLLib With DataFrame SQL"))
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# read the dataset (comma-delimited, first line is the header)
df_test = sqlContext.read.format("com.databricks.spark.csv").options(delimiter=",").options(header="true").load(
BASE_DATA_PATH + '/test.csv')
# Label is the IsClick column; features are five numeric columns cast to float.
# NOTE(review): calling .map directly on a DataFrame presumably relies on an
# older Spark API where that is available -- confirm the Spark version.
training = df_test.map(lambda row: LabeledPoint(row.IsClick,
[float(row.SearchID), float(row.AdID), float(row.Position),
float(row.HistCTR), float(row.Price)]))
# Random 70% / 30% split into training and test sets.
(trainingData, testData) = training.randomSplit([0.7, 0.3])
# Build the logistic-regression model.
model = LogisticRegressionWithSGD.train(trainingData,iterations = 100,step=0.4)
# Build the SVM model.
model1 = SVMWithSGD.train(trainingData, iterations=100)
# A random-forest model is trained next (that statement lies beyond this block).
model2 = RandomForest.trainClassifier(trainingData, numClasses=2,
categoricalFeaturesInfo={},
numTrees=3, featureSubsetStrategy="auto",
开发者ID:abhishek-ch,项目名称:evolveML,代码行数:31,代码来源:SampleTest.py
注:本文中的pyspark.mllib.classification.LogisticRegressionWithSGD类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论