Python classification.LogisticRegressionWithSGD Class Code Examples


This article collects typical usage examples of the Python class pyspark.mllib.classification.LogisticRegressionWithSGD. If you have been wondering what the LogisticRegressionWithSGD class is for, how to call it, or what real code that uses it looks like, the curated examples below should help.



Twenty code examples of the LogisticRegressionWithSGD class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python examples.
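
Before the examples, here is a minimal, self-contained sketch of the core API (the toy dataset below is invented purely for illustration): LogisticRegressionWithSGD.train consumes an RDD of LabeledPoint objects, and the returned model's predict method maps a feature vector to a 0/1 label.

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD

sc = SparkContext(appName="LRWithSGD-minimal")

# Toy training set: the label simply follows the second feature.
data = sc.parallelize([
    LabeledPoint(0.0, [1.0, 0.0]),
    LabeledPoint(1.0, [0.0, 1.0]),
    LabeledPoint(0.0, [2.0, 0.0]),
    LabeledPoint(1.0, [0.0, 2.0]),
])

model = LogisticRegressionWithSGD.train(data, iterations=100)
print(model.predict([0.0, 1.5]))  # expected output: 1

sc.stop()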

Example 1: main

def main():
    MakePixelFileFromImages("./CarData/TrainImages/*pgm")
    sc = SparkContext(appName="Image Classifier 01")

    p = sc.textFile("pos.csv")
    n = sc.textFile("neg.csv")

    pFeatures = p.map(lambda image: image.split(","))
    nFeatures = n.map(lambda image: image.split(","))

    pExamples = pFeatures.map(lambda features: LabeledPoint(1, features))
    nExamples = nFeatures.map(lambda features: LabeledPoint(0, features))

    data = pExamples.union(nExamples)
    (trainingData, testData) = data.randomSplit([0.7,0.3])

    trainingData.cache()

    model = LogisticRegressionWithSGD.train(trainingData)
    labels_and_predictions = testData.map(lambda image:(image.label, model.predict(image.features)))
    error_rate = labels_and_predictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())

    print("************* RESULTS *******************")
    print("Error Rate: " + str(error_rate))

    pickle.dump(model, open("imageModel.pk1","wb"))

    sc.stop()
Author: samacart | Project: Park-Or-Bird | Lines: 28 | Source: spark-ImageClassifier.py


Example 2: modelWithLogisticRegression

def modelWithLogisticRegression(trainingData, validationData):

	## Train the model using logistic regression with stochastic gradient descent,
	## trying different regularization parameters (values of lambda); the step size is fixed at 1.0.
	## Return the LR model with the best accuracy rate.
	
	#eta = [0.1, 0.3, 0.5, 1.0, 5.0]
	regularizationParameters = [.00000001, .0000005, 1., 1000., 100000.]
	bestLRModel = None
	bestAccuracy = 0
	numOfIterations = 200
	visualizationData = []
	
	
	for regularizer in regularizationParameters:

		model = LogisticRegressionWithSGD.train(trainingData, numOfIterations, 1.0, regParam=regularizer)
		predict = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
		totalValidationAds = validationData.count()
		correctlyPredicted = predict.filter(lambda x: x[0] == x[1]).count()
		accuracy = float(correctlyPredicted)/totalValidationAds
		
		visualizationData += [(regularizer, accuracy)]
		
		if accuracy > bestAccuracy:
			bestAccuracy = accuracy
			bestLRModel = model
				
	return bestLRModel, visualizationData
Author: Abhishek-Arora | Project: Ad-Click-Prediction | Lines: 29 | Source: predictClick.py


Example 3: test_classification

    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Author: Altiscale | Project: OBSOLETE-spark | Lines: 28 | Source: tests.py


Example 4: train_committee

def train_committee(train_features, test_features, size=5):
    committee = []
    attempts = 0
    max_attempts = size * 4
    roc_threshold = 0.7

    test_pairs_features = test_features.map(lambda p: process_batch(p, is_train=True))
    test_labeled_pairs = test_pairs_features.map(to_labeled_point)

    while len(committee) < size and attempts < max_attempts:
        attempts += 1

        pairs_features = train_features.map(lambda p: process_batch(p, is_train=True))
        labeled_points = pairs_features.map(to_labeled_point).sample(True, 1)

        model = LogisticRegressionWithSGD.train(labeled_points)
        model.clearThreshold()
        scores_and_labels = test_labeled_pairs.map(lambda p: (model.predict(p.features), p.label))

        metrics = BinaryClassificationMetrics(scores_and_labels)
        if metrics.areaUnderROC > roc_threshold:
            print(attempts, metrics.areaUnderROC)
            committee.append(model)

    return committee
Author: siauPatrick | Project: seimur | Lines: 25 | Source: convert_pairs.py


Example 5: test_classification

    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)
Author: EronWright | Project: spark | Lines: 28 | Source: tests.py


Example 6: logistic_l2_accuracy

def logistic_l2_accuracy(x_train, x_test, regParam):
    # cache data to get reasonable speeds for methods like LogisticRegression and SVM
    xc = x_train.cache()
    # training logistic regression with L2 regularization
    model = LogisticRegressionWithSGD.train(xc, regParam=regParam, regType="l2")
    # making prediction on x_test
    yhat  = x_test.map(lambda p: (p.label, model.predict(p.features)))
    # returning accuracy on x_test
    return yhat.filter(lambda vp: vp[0] == vp[1]).count() / float(x_test.count())
Author: SemanticBeeng | Project: optunity | Lines: 9 | Source: logistic.py


Example 7: main

def main():
	"""
	Driver program for a spam filter using Spark and MLLib
	"""

	# Consolidate the individual email files into a single spam file
	# and a single ham file
	makeDataFileFromEmails( "data/spam_2/", "data/spam.txt")
	makeDataFileFromEmails( "data/easy_ham_2/", "data/ham.txt" )

	# Create the Spark Context for parallel processing
	sc = SparkContext( appName="Spam Filter")

	# Load the spam and ham data files into RDDs
	spam = sc.textFile( "data/spam.txt" )
	ham = sc.textFile( "data/ham.txt" )

	# Create a HashingTF instance to map email text to vectors of 10,000 features.
	tf = HashingTF(numFeatures = 10000)

	# Each email is split into words, and each word is mapped to one feature.
	spamFeatures = spam.map(lambda email: tf.transform(email.split(" ")))
	hamFeatures = ham.map(lambda email: tf.transform(email.split(" ")))

	# Create LabeledPoint datasets for positive (spam) and negative (ham) data points.
	positiveExamples = spamFeatures.map(lambda features: LabeledPoint(1, features))
	negativeExamples = hamFeatures.map(lambda features: LabeledPoint(0, features))

	# Combine positive and negative datasets into one
	data = positiveExamples.union(negativeExamples)

	# Split the data into 70% for training and 30% test data sets 
	( trainingData, testData ) = data.randomSplit( [0.7, 0.3] )

	# Cache the training data to optimize the Logistic Regression
	trainingData.cache() 

	# Train the model with Logistic Regression using the SGD algorithm.
	model = LogisticRegressionWithSGD.train(trainingData)

	# Create tuples of actual and predicted values
	labels_and_predictions = testData.map( lambda email: (email.label, model.predict( email.features) ) )

	# Calculate the error rate as number wrong / total number
	error_rate = labels_and_predictions.filter( lambda vp: vp[0] != vp[1] ).count() / float( testData.count() )
	print( "*********** SPAM FILTER RESULTS **********" )
	print( "\n" )
	print( "Error Rate: " + str( error_rate ) )
	print( "\n" )

	# Serialize the model for persistence
	pickle.dump( model, open( "spamFilter.pkl", "wb" ) )

	sc.stop()
Author: badpaper | Project: coursework | Lines: 54 | Source: spamFilter.py


Example 8: test_classification

    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=100)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)
Author: greatyan | Project: spark | Lines: 51 | Source: tests.py


Example 9: getLogisticRegressionModel

def getLogisticRegressionModel(Train_Data):  
    
    numIters = 10
    stepSize = 10.
    regParam = 1e-6
    regType = 'l2'
    includeIntercept = True
    
    
    return LogisticRegressionWithSGD.train(data = Train_Data,
                                   iterations = numIters,
                                   miniBatchFraction=0.1,
                                   step = stepSize,
                                   regParam = regParam,
                                   regType = regType,
                                   intercept = includeIntercept)
Author: danielnazareth89 | Project: 603-Masters-Project-Apache-Spark- | Lines: 16 | Source: FeatureHashing.py


Example 10: logisticRegression

def logisticRegression(trainingRDD, trainingRDDHashed,
                       testRDDHashed, iterations, minibatch, stepsize):
    # Train a Logistic Regression model
    trainedModel = LogisticRegressionWithSGD.train(
        trainingRDD,
        iterations=iterations,
        miniBatchFraction=minibatch,
        regType="l2",
        intercept=True,
        regParam=0.1,
        step=stepsize)
    # Test on Validation and Test Sets
    resultsValidation = trainingRDDHashed.map(
        lambda l_v24: (
            (l_v24[0],
             trainedModel.predict(
                l_v24[1])),
            1)).map(
        lambda x_y25: (
            checkState(
                x_y25[0]),
            x_y25[1])).reduceByKey(add).collectAsMap()
    resultsTest = testRDDHashed.map(
        lambda l_v26: (
            (l_v26[0],
             trainedModel.predict(
                l_v26[1])),
            1)).map(
        lambda x_y27: (
            checkState(
                x_y27[0]),
            x_y27[1])).reduceByKey(add).collectAsMap()
    # Get Counts
    nFilesV = trainingRDDHashed.count()
    nFilesT = testRDDHashed.count()
    # Create a dictionary of the Values
    resultsValidation = defaultdict(lambda: 0, resultsValidation)
    resultsTest = defaultdict(lambda: 0, resultsTest)
    # Get F-Score and Accuracy Values
    AccuracyV, fScoreV = getAccuracy(resultsValidation, nFilesV)
    AccuracyT, fScoreT = getAccuracy(resultsTest, nFilesT)
    # Print Results
    print('   Results for Logistic Regression')
    print('      Training Set: %.3f and F-Score: %.3f' % (AccuracyV, fScoreV))
    print('      Test Set: %.3f and F-Score: %.3f' % (AccuracyT, fScoreT))
    # Return the Result List
    return AccuracyV, fScoreV, AccuracyT, fScoreT
Author: AkiraKane | Project: CityUniversity2014 | Lines: 47 | Source: ackf415-Local-LR-Optimisation.py


Example 11: train_trend_model

    def train_trend_model(self, model, data, i):
        self.logger.info('Start to train the direction model')
        rdd_data = self.sc.parallelize(data)
        if self.trend_prediction_method == self.RANDOM_FOREST:
            model = RandomForest.trainClassifier(rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
                                                 featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
                                                 maxBins=32)
        elif self.trend_prediction_method == self.NAIVE_BAYES:
            model = NaiveBayes.train(rdd_data)

        elif self.trend_prediction_method == self.LOGISTIC_REGRESSION:
            model = LogisticRegressionWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                                    initialWeights=None if model is None else model.weights)

        elif self.trend_prediction_method == self.SVM:
            model = SVMWithSGD.train(rdd_data, iterations=10000, step=0.001,
                                     initialWeights=None if model is None else model.weights)

        return model
Author: WarnWang | Project: Dissertation | Lines: 19 | Source: composition_prediction_system.py


Example 12: main

def main(input_file_path):

    print('=====>>>>>')
    print('ddd')
    data = sc.textFile(input_file_path)
    training_data_RDD = data.filter(lambda line: line.split(',')[3] != '' and line.split(',')[0] != 'INDEX')
    unseen_data_RDD = data.filter(lambda line: line.split(',')[3] == '')

    training_data_pddf = create_pddf(training_data_RDD)
    training_data_df = sqlContext.createDataFrame(training_data_pddf)
    print(training_data_df.head())

    parsed_data = rdd_to_labeled_point(training_data_df.rdd)
    parsed_data.persist()
    # Correct print: [LabeledPoint(1.0, [1.0,8.6662186586,6.98047693487])]
    logisticRegressionWithSGD = LogisticRegressionWithSGD.train(parsed_data, iterations=100)

    labels_and_preds = parsed_data.map(lambda lp: [lp.label, logisticRegressionWithSGD.predict(lp.features)])
    Accuracy = labels_and_preds.filter(lambda ele: int(ele[0]) == int(ele[1])).count() / float(parsed_data.count())
    print("Training Accuracy on training data = " + str(Accuracy))

    unseen_data_pddf = create_pddf(unseen_data_RDD)
    unseen_data_df = sqlContext.createDataFrame(unseen_data_pddf)
    unseen_parsed_data = rdd_to_index_featurs(unseen_data_df.rdd)
    unseen_parsed_data.persist()

    file = open('/Users/1002720/Documents/workspace/SNU-project/data/BDA2Project/1-GenderPrediction/result.csv', 'w',
                encoding='utf-8')
    file.write('INDEX,GENDER\n')
    for data in unseen_parsed_data.collect():
        file.write(str(data[0]) + ',' + str(logisticRegressionWithSGD.predict(data[1]) + 1) + '\n')
    # print(labels_and_preds.collect())



    parsed_data.unpersist()
    unseen_parsed_data.unpersist()
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
    print('=====>>>>>')
Author: Ggoals | Project: SNU-project | Lines: 41 | Source: 1-logisticRegressionWithSGD.py


Example 13: test_classification

    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        data = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features for p in data]

        lr_model = LogisticRegressionWithSGD.train(rdd)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(rdd, numClasses=2,
                                                categoricalFeaturesInfo=categoricalFeaturesInfo)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)
Author: drewrobb | Project: spark | Lines: 37 | Source: test_linalg.py


Example 14: anom_with_lr

def anom_with_lr():
  try:
    plaintext_rdd = sc.textFile("file:///Users/blahiri/healthcare/data/cloudera_challenge/pat_proc_larger.csv") #69.2 MB
    pat_proc = pycsv.csvToDataFrame(sqlContext, plaintext_rdd, sep = ",")
    anom = pat_proc.filter(pat_proc.is_anomalous == 1)
    benign = pat_proc.filter(pat_proc.is_anomalous == 0)
    n_benign = benign.count()
    
    #Take a random sample of 50K from the unlabeled 100K
    sqlContext.registerFunction("my_random", lambda x: x - x + random())
    sqlContext.registerDataFrameAsTable(benign, "benign")
    benign = sqlContext.sql("SELECT *, my_random(is_anomalous) as random_number FROM benign")
    
    threshold = 50000.0/n_benign  # float division so the sampling fraction is not truncated to 0
    into_model = benign.filter(benign.random_number <= threshold)
    for_finding_more = benign.filter(benign.random_number > threshold)
    
    for_modeling = anom.unionAll(into_model.drop(into_model.random_number))
    for_finding_more = for_finding_more.drop(for_finding_more.random_number)
    #Try to pull this from a much larger sample, or, the entire data, because the ones with lowest probabilities, among
    #the selected 10,000, have probabilities around 0.05
    
    print("anom.count() = " + str(anom.count()) + ", benign.count() = " + str(benign.count()) + ", into_model.count() = " + str(into_model.count()) 
            + ", for_modeling.count() = " + str(for_modeling.count()) + ", for_finding_more.count() = " + str(for_finding_more.count()))
    
    all_columns = for_modeling.columns
    features = [x for x in all_columns if (x not in ["patient_id", "is_anomalous"])]
    categorical_features = ["age_group", "gender", "income_range"] #We are listing these 3 as categorical features only as the procedure features have 0-1 values anyway 
    procedure_features = [x for x in features if (x not in categorical_features)]

    #Unlike decision tree, logistic regression does not need the map categoricalFeaturesInfo, just an RDD of LabeledPoint objects.
    
    #Create a dictionary where the key-value pairs are as follows: key is the name of the categorical feature, and value is a list with the following entries:
    #1) an id of the feature that is incremented sequentially, 2) no. of distinct values of the feature, 3) a list of the distinct values of the feature.
    cat_feature_number = 0
    dict_cat_features = {}
    
    for feature in categorical_features:
       agvalues = pat_proc.select(pat_proc[feature].cast("string").alias("feature")).distinct().collect()
       distinct_values = [list(row.asDict().values())[0] for row in agvalues]
       distinct_values = sorted(str(value) for value in distinct_values)  # plain-string sort (the original Python 2 code encoded unicode to ascii)
       dict_cat_features[feature] = [cat_feature_number, len(distinct_values), distinct_values]
       cat_feature_number += 1
       
    for_modeling = for_modeling.rdd
    print("for_modeling.getNumPartitions() = " + str(for_modeling.getNumPartitions())) #4 partitions: the default should be the number of logical cores, which is 8
    
    (train, test) = for_modeling.randomSplit([0.5, 0.5])
    test_data_size = test.count()
    print("train.count() = " + str(train.count()) + ", test.count() = " + str(test_data_size))
    training_data = train.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
    print("training_data.count() = " + str(training_data.count()))
    
    t0 = time()
    #model = LogisticRegressionWithLBFGS.train(training_data) #LBFGS took 66.766 seconds
    model = LogisticRegressionWithSGD.train(training_data) #SGD took 69.261 seconds
    tt = time() - t0
    print("Classifier trained in {} seconds".format(round(tt, 3)))
    
    test_data = test.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features))
    
    t0 = time()
    predictions = model.predict(test_data.map(lambda p: p.features))
    tt = time() - t0
    print "Prediction made in {} seconds".format(round(tt,3)) #Reports as 0.0 seconds
    
    labelsAndPreds = test_data.map(lambda p: (p.label, model.predict(p.features)))
    test_accuracy = labelsAndPreds.filter(lambda vp: vp[0] == vp[1]).count()/float(test_data_size)

    fpr = labelsAndPreds.filter(lambda vp: vp[0] == 0 and vp[1] == 1).count()/float(labelsAndPreds.filter(lambda vp: vp[0] == 0).count())
    fnr = labelsAndPreds.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count()/float(labelsAndPreds.filter(lambda vp: vp[0] == 1).count())
    print("Test accuracy is {}, fpr is {}, fnr is {}".format(round(test_accuracy, 4), round(fpr, 4), round(fnr, 4))) #Test accuracy is 0.9057, fpr is 0.1634, fnr is 0.0282
    
    model.clearThreshold()
    for_finding_more = for_finding_more.map(lambda x: create_labeled_point(x, features, categorical_features, dict_cat_features, procedure_features)) #OK
    for_finding_more = for_finding_more.map(lambda p: (p.features, model.predict(p.features), p.label)) #OK
    
    try:
      for_finding_more.first() #We perform an action here because otherwise the output will be a PipelinedRDD.
      #Reverse-sort the additional patients by their predicted probabilities of being anomalous and take the top 10,000
      #for_finding_more.take(5)
    except EOFError:
      print("EOF handled")
      
    df = sqlContext.createDataFrame(for_finding_more.collect(), ['features', 'predicted_prob', 'is_anom'])
    df = df.orderBy(df.predicted_prob.desc()) #The orderBy is not actually called if collect() is not called. Can be also triggered by calling take(). We are triggering it by the writing in the next statement.
    df.select('is_anom', 'predicted_prob').limit(10000).write.format('com.databricks.spark.csv').save('file:///Users/blahiri/healthcare/data/cloudera_challenge/additional_10000_from_spark.csv') #Top one has 
    #probability of 0.86818, last one has probability 0.5928958
    
  except Exception:
    print("Exception in user code:")
    traceback.print_exc(file = sys.stdout)
  return for_finding_more
Author: bibudhlahiri | Project: healthcare | Lines: 93 | Source: analyze_anomaly_with_spark.py


Example 15: test_classification

    def test_classification(self):
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

        try:
            rmtree(temp_dir)
        except OSError:
            pass
Author: HodaAlemi | Project: spark | Lines: 75 | Source: tests.py


Example 16: parsePoint

import time

from pyspark import SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from numpy import array
import parse
# Load and parse the data

#def parsePoint(line):   # Creating vector(array) with first input as y and others as xi's   
#    values = [float(x) for x in line.split(',')]
#    return LabeledPoint(values[10], values[0:9])


sc = SparkContext("local[4]", "Logistic Regression")      #Initialized SparkContext
data = sc.textFile("/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.data")  #Created an RDD
parsedData = data.map(parse.parsePoint) #RDD Transformation on the input RDD which is string and converting them to labeled points and each labeled points is a tuple of float(label) and ndrarray(features)

# Build the model
model = LogisticRegressionWithSGD.train(parsedData)   #Pass an RDD to "train" method of class LogisticRegressionwithSGD
#Use model to create output
#model.predict().collect()    # in "predict" method we have to pass an array
#Read Test data

Testdata = sc.textFile("/home/ayush/Data /Data for Machine Learning/UCI Adult Data Set/UCI adult.test")
parsedTestData = Testdata.map(parse.parsePoint)
#predict result for each Test Data

# Evaluating the model on training data

labelsAndPreds = parsedTestData.map(lambda p: (p.label, model.predict(p.features)))  #Taking each array of the RDD of parsedTestData which is a tuple(LabeledPoint) and then calculating its label and features , p is an input to lambda function and p is a tuple point(a LabeledPoint) 
millis2 = int(round(time.time() * 1000))

print(labelsAndPreds.collect())
#Print testing Error
Author: Ayush-iitkgp | Project: Spark-MLlib-Python | Lines: 31 | Source: LogisticRegression.py


Example 17: len

            .map(lambda lp: len(lp.features.indices))
            .sum())
Test.assertEquals(numNZVal, 372080, 'incorrect number of features')


# ** CTR prediction and logloss evaluation **
from pyspark.mllib.classification import LogisticRegressionWithSGD

# fixed hyperparameters
numIters = 50
stepSize = 10.
regParam = 1e-6
regType = 'l2'
includeIntercept = True

model0 = LogisticRegressionWithSGD.train(OHETrainData, numIters, stepSize, 1.0, None, regParam, regType, includeIntercept)
sortedWeights = sorted(model0.weights)
print(sortedWeights[:5], model0.intercept)


# TEST Logistic regression
Test.assertTrue(np.allclose(model0.intercept,  0.56455084025), 'incorrect value for model0.intercept')
Test.assertTrue(np.allclose(sortedWeights[0:5],
                [-0.45899236853575609, -0.37973707648623956, -0.36996558266753304,
                 -0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights')


# ** Log loss **
from math import log

def computeLogLoss(p, y):
Author: samkujovich | Project: SparkExperience | Lines: 31 | Source: ClickThroughPrediction.py
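
The excerpt above is cut off right at the definition of computeLogLoss. For reference, a minimal log-loss implementation consistent with the surrounding code might look like the sketch below; the epsilon clamp is an assumption to keep log() away from zero.

from math import log

def computeLogLoss(p, y):
    # Binary log loss for predicted probability p and true label y (0 or 1).
    # Clamp p into (epsilon, 1 - epsilon) so log() never receives 0 (assumed safeguard).
    epsilon = 1e-11
    p = min(max(p, epsilon), 1 - epsilon)
    return -log(p) if y == 1 else -log(1 - p)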


Example 18: train

 def train(self, num_iterations=10):
     model = LogisticRegressionWithSGD.train(
         self._labeled_feature_vector_rdd(), 
         num_iterations)
     return LogisticRegressionModel(model, self.feature_cols)
Author: Atigeo | Project: xpatterns-xframe | Lines: 5 | Source: classify.py


Example 19: float

cutoff = float(nrock) / (nrock + nxrock)

# recombine
equalSampleData = labeledRock.union(labeledNotRock)


equalSampleData = labeledData.filter(lambda p: random.random() < cutoff if p.label != 1.0 else True)

# split data
trainData, testData = equalSampleData.randomSplit([0.9, 0.1])

trainData.map(lambda p: (p.label, p.features)).take(3)

# train model
model = LogisticRegressionWithSGD.train(trainData, intercept=False, iterations=10000)
# model = LinearRegressionWithSGD.train(trainData, step = 0.1, iterations=1000)
# model = SVMWithSGD.train(trainData, step=1, iterations=1000, intercept=True)

# evaluate model
# labelsAndPreds = testData.map(lambda p: (p.label, 1 if model.predict(p.features) > 0.5 else 0))
labelsAndPreds = testData.map(lambda p: (p.label, model.predict(p.features)))

accuracy = labelsAndPreds.filter(lambda vp: vp[0] == vp[1]).count() / float(testData.count())

guess1 = labelsAndPreds.filter(lambda vp: vp[1] == 1)
precision1 = guess1.filter(lambda vp: vp[0] == vp[1]).count() / float(guess1.count())

act1 = labelsAndPreds.filter(lambda vp: vp[0] == 1)
recall1 = act1.filter(lambda vp: vp[0] == vp[1]).count() / float(act1.count())
Author: ScalingUpMusic | Project: SUMsandbox | Lines: 29 | Source: rock_ml.py


示例20: print

    print(BASE_DATA_PATH)

    conf = (SparkConf().setMaster("local[2]").setAppName("Testing MLLib With DataFrame SQL"))
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # read the dataset
    df_test = sqlContext.read.format("com.databricks.spark.csv").options(delimiter=",").options(header="true").load(
        BASE_DATA_PATH + '/test.csv')

    training = df_test.map(lambda row: LabeledPoint(row.IsClick,
                                                    [float(row.SearchID), float(row.AdID), float(row.Position),
                                                     float(row.HistCTR), float(row.Price)]))

    (trainingData, testData) = training.randomSplit([0.7, 0.3])

    model = LogisticRegressionWithSGD.train(trainingData, iterations=100, step=0.4)



    # Build the model
    model1 = SVMWithSGD.train(trainingData, iterations=100)




    # Evaluate the model on training data


    model2 = RandomForest.trainClassifier(trainingData, numClasses=2,
                                         categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
Author: abhishek-ch | Project: evolveML | Lines: 31 | Source: SampleTest.py



Note: The pyspark.mllib.classification.LogisticRegressionWithSGD class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright remains with the original authors. Refer to each project's license before distributing or reusing the code, and do not reproduce this article without permission.

