• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python classification.SVMWithSGD类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中pyspark.mllib.classification.SVMWithSGD的典型用法代码示例。如果您正苦于以下问题:Python SVMWithSGD类的具体用法?Python SVMWithSGD怎么用?Python SVMWithSGD使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了SVMWithSGD类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: test_classification

    def test_classification(self):
        """LR, SVM and NaiveBayes must separate two trivially separable sparse classes.

        Raw classifier scores are checked only for sign: <= 0 for class-0
        points, > 0 for class-1 points.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        points = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(points)
        vectors = [p.features for p in points]

        models = [
            LogisticRegressionWithSGD.train(rdd),
            SVMWithSGD.train(rdd),
            NaiveBayes.train(rdd),
        ]
        for model in models:
            # Labels alternate 0/1 in `points`; scores must match that pattern.
            self.assertTrue(model.predict(vectors[0]) <= 0)
            self.assertTrue(model.predict(vectors[1]) > 0)
            self.assertTrue(model.predict(vectors[2]) <= 0)
            self.assertTrue(model.predict(vectors[3]) > 0)
开发者ID:EronWright,项目名称:spark,代码行数:28,代码来源:tests.py


示例2: test_classification

    def test_classification(self):
        """LR, SVM and NaiveBayes must separate two trivially separable dense classes.

        Raw classifier scores are checked only for sign: <= 0 for class-0
        points, > 0 for class-1 points.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        points = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(points)
        vectors = [p.features.tolist() for p in points]

        models = [
            LogisticRegressionWithSGD.train(rdd),
            SVMWithSGD.train(rdd),
            NaiveBayes.train(rdd),
        ]
        for model in models:
            # Labels alternate 0/1 in `points`; scores must match that pattern.
            self.assertTrue(model.predict(vectors[0]) <= 0)
            self.assertTrue(model.predict(vectors[1]) > 0)
            self.assertTrue(model.predict(vectors[2]) <= 0)
            self.assertTrue(model.predict(vectors[3]) > 0)
开发者ID:Altiscale,项目名称:OBSOLETE-spark,代码行数:28,代码来源:tests.py


示例3: modelWithSVM

def modelWithSVM(trainingData, validationData):
    """Train SVM models over a grid of regularization strengths and keep the best.

    Trains one SVMWithSGD model per regularization parameter (100 iterations,
    step 1.0), measures its accuracy on validationData, and returns
    (best_model, [(regParam, accuracy), ...]) for visualization.
    """
    reg_params = [.0000001, 1., 5000., 10000., 200000.]
    num_iterations = 100
    best_model = None
    best_accuracy = 0
    history = []

    for reg in reg_params:
        model = SVMWithSGD.train(trainingData, num_iterations, 1.0, regParam=reg)
        predictions = validationData.map(lambda ad: (ad.label, model.predict(ad.features)))
        total = validationData.count()
        correct = predictions.filter(lambda x: x[0] == x[1]).count()
        accuracy = float(correct) / total

        history.append((reg, accuracy))

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model

    return best_model, history
开发者ID:Abhishek-Arora,项目名称:Ad-Click-Prediction,代码行数:30,代码来源:predictClick.py


示例4: main

def main():
    """Parse a stock CSV into five next-day prediction datasets and train an
    SVM-with-SGD model for the next-day opening target.

    Expects argv[1] = input stock CSV, argv[2] = output prediction file path.
    """
    stock_file = sys.argv[1]
    output_predict_file = sys.argv[2]

    conf = SparkConf().setAppName('Stock Prediction Machine Learning with Twitter')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'

    # Strip the CSV header row; cache the data for the five mappings below.
    file_data_all = sc.textFile(stock_file)
    file_header = file_data_all.first()
    file_data = file_data_all.filter(lambda line: line != file_header).cache()

    # One parsed dataset per next-day prediction target.
    parsedFileData_NextDayActualOpening = file_data.map(parseNextDayActualOpening)
    parsedFileData_NextDayActualHigh = file_data.map(parseNextDayActualHigh)
    parsedFileData_NextDayActualLow = file_data.map(parseNextDayActualLow)
    parsedFileData_NextDayActualClose = file_data.map(parseNextDayActualClose)
    parsedFileData_NextDayActualVolume = file_data.map(parseNextDayActualVolume)

    print(parsedFileData_NextDayActualOpening.collect())

    # SVM with stochastic gradient descent, trained on the opening dataset.
    svm_model_nxtdayactopn = SVMWithSGD.train(parsedFileData_NextDayActualOpening, iterations=10)

    lpreds = parsedFileData_NextDayActualOpening.map(
        lambda line: (line.label, svm_model_nxtdayactopn.predict(line.features)))

    print(lpreds.collect())
开发者ID:gitofsid,项目名称:StocksPrediction-ML,代码行数:29,代码来源:svm_predicts_stocks.py


示例5: trainSVMModel

def trainSVMModel(data):
    """Fit and return a linear SVM classifier (100 SGD iterations).

    :param data: RDD[LabeledPoint]
    :return: svm classification model
    """
    from pyspark.mllib.classification import SVMWithSGD, SVMModel
    return SVMWithSGD.train(data, iterations=100)
开发者ID:theseusyang,项目名称:GEQE,代码行数:9,代码来源:createROC.py


示例6: main

def main():
    """Train an SVM on the 2007 flight data, predict on the 2008 data, and
    print the misclassification rate.

    Fix: the final report used a Python 2 `print` statement, which is a
    syntax error under Python 3; it is now the `print()` function.
    """
    # prepare training data -- additional years ('1987.csv' .. '2006.csv')
    # can be appended to this list to enlarge the training set
    RDDTrainData = sc.textFile(','.join([
        '2007.csv',
    ]))
    RDDTrainHeader = RDDTrainData.take(1)[0]
    trainData = RDDTrainData.filter(lambda line: line != RDDTrainHeader)\
                            .map(split)\
                            .map(parseTrain)

    # prepare testing data
    RDDTestData = sc.textFile('2008.csv')
    RDDTestHeader = RDDTestData.take(1)[0]
    testData = RDDTestData.filter(lambda line: line != RDDTestHeader)\
                          .map(split)\
                          .map(parseTest)

    # do prediction

    # SVM (a LogisticRegressionWithLBFGS alternative was considered too)
    model = SVMWithSGD.train(trainData, iterations=100)

    predictionData = testData.map(lambda d:
        (int(d.label), model.predict(d.features))
    )

    # evaluate error rate
    errorCount = predictionData.filter(lambda d: int(d[0]) != int(d[1])).count()
    totalCount = predictionData.count()
    print('error rate =', errorCount, '/', totalCount, '=',
          float(errorCount) / float(totalCount))
开发者ID:gocreating,项目名称:big-data-hw4,代码行数:54,代码来源:main.py


示例7: train

def train(sc, file_positive, files_negative, file_output):
    """
    Builds a binary classification model from positive samples in file_positive
    and negative samples in files_negative, then writes it to file_output.

    :param sc: The spark context
    :type sc: SparkContext
    :param file_positive: The file with positive tweets (relevant ones)
    :type file_positive: str
    :param files_negative: The file with negative tweets (non-relevant ones)
    :type files_negative: list[str]
    :param file_output: The output where to store the trained model
    :type file_output: str
    """
    data = sc.textFile(file_positive).map(parse_json).filter(is_valid).map(parse_positive)
    # Union every negative file's parsed tweets into the training set.
    for path in files_negative:
        negative = sc.textFile(path).map(parse_json).filter(is_valid).map(parse_negative)
        data = data.union(negative)

    try:
        print("Training classification model")
        model = SVMWithSGD.train(data, iterations=150, step=1000.0, regType='l1', regParam=1e-7)
        print("Saving classification model to file")
        pickle.dump(model, open(file_output, 'wb'))
        print("Done!")
    except Exception as e:
        print("Error:")
        print(e)
开发者ID:alialavia,项目名称:TwitterNews,代码行数:31,代码来源:train.py


示例8: gen_predictors

def gen_predictors(training_data):
    """Train one one-vs-rest SVM per entry of the module-level `label_map`.

    Fix: `dict.iteritems()` and the `print` statement are Python 2 only;
    replaced with `items()` and the `print()` function.

    :param training_data: RDD[LabeledPoint] with the original multi-class labels
    :return: dict mapping label id -> trained SVM classifier
    """
    classifiers = dict()
    for name, label_id in label_map.items():
        print("Gen predictor for label '{0}' ...".format(name))

        # transform_label reads this module-level global to binarize the labels.
        global processed_label
        processed_label = label_id
        svm = SVMWithSGD.train(training_data.map(transform_label))
        classifiers[label_id] = svm

    return classifiers
开发者ID:hwaohung,项目名称:spark_train,代码行数:11,代码来源:main.py


示例9: SVM_module

def SVM_module(training):
  """Fit a support-vector-machine classifier (300 SGD iterations) on the data.

  :param training: (REQUIRED) - the training data
  :return: SVM model

  Use it as (Be sure to call split_data() to get the training data):

  >>> model = SVM_module(trainingData)
  """
  model = SVMWithSGD.train(training, iterations=300)
  return model
开发者ID:Lab41,项目名称:pythia,代码行数:12,代码来源:spark.py


示例10: run_iterations

def run_iterations(parsedData, iter, seed):
    """Run 10 train/test SVM rounds, collect FP/TP rates, and plot a ROC-style curve.

    Fixes: lambda tuple-unpacking (`lambda (v, p): ...`) and `print` statements
    are Python 2 only (PEP 3113 removed the former); converted to indexed
    lambdas and `print()` calls.

    :param parsedData: RDD[LabeledPoint]
    :param iter: iteration count, used only in the progress message
    :param seed: seed passed to randomSplit (same split every round)
    """
    fp_rates = []
    tp_rates = []
    for i in range(0, 10):
        trainingData, testingData = parsedData.randomSplit([70, 30], seed)
        print("For " + str(iter) + " iterations:")
        # Build the model
        model = SVMWithSGD.train(trainingData, iterations=100)

        # Evaluating the model on training data
        labelsAndPreds = trainingData.map(lambda p: (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainingData.count())
        MSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / labelsAndPreds.count()
        print("Training Error = " + str(trainErr))
        print("MSE = " + str(MSE))

        labelsAndPreds = testingData.map(lambda p: (p.label, model.predict(p.features)))
        testErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testingData.count())
        MSE = labelsAndPreds.map(lambda vp: (vp[0] - vp[1]) ** 2).reduce(lambda x, y: x + y) / labelsAndPreds.count()
        print("Testing Error = " + str(testErr))
        print("MSE = " + str(MSE))

        info = labelsAndPreds.collect()
        actual = [int(pair[0]) for pair in info]
        predictions = [pair[1] for pair in info]

        # NOTE(review): the naming looks inverted -- this counts label==1
        # predicted as 0 for the "false positive" rate and label==0 predicted
        # as 0 for the "true positive" rate; behavior kept as the original.
        false_positive_rate = labelsAndPreds.filter(lambda vp: vp[0] == 1 and vp[1] == 0).count() / float(labelsAndPreds.filter(lambda vp: vp[0] == 1).count())
        true_positive_rate = labelsAndPreds.filter(lambda vp: vp[0] == 0 and vp[1] == 0).count() / float(labelsAndPreds.filter(lambda vp: vp[0] == 0).count())
        fpr, tpr, thresholds = roc_curve(actual, predictions)
        print(false_positive_rate)
        print(true_positive_rate)
        fp_rates.append(false_positive_rate)
        tp_rates.append(true_positive_rate)

        print(fp_rates)
        print(tp_rates)
        roc_auc = auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fp_rates, tp_rates, 'b',
    label='AUC = %0.2f'% roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([-0.1,1.2])
    plt.ylim([-0.1,1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    # NOTE(review): show() before savefig() can save a blank figure on some
    # backends; kept in the original order.
    plt.show()
    plt.savefig('fig.png')


示例11: test_classification

    def test_classification(self):
        """Six MLlib classifiers must separate two trivially separable dense classes.

        Raw classifier scores are checked only for sign: <= 0 for class-0
        points, > 0 for class-1 points.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, RandomForest, GradientBoostedTrees
        points = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(points)
        vectors = [p.features.tolist() for p in points]

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        models = [
            LogisticRegressionWithSGD.train(rdd),
            SVMWithSGD.train(rdd),
            NaiveBayes.train(rdd),
            DecisionTree.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo),
            RandomForest.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
                numTrees=100),
            GradientBoostedTrees.trainClassifier(
                rdd, categoricalFeaturesInfo=categoricalFeaturesInfo),
        ]
        for model in models:
            # Labels alternate 0/1 in `points`; scores must match that pattern.
            self.assertTrue(model.predict(vectors[0]) <= 0)
            self.assertTrue(model.predict(vectors[1]) > 0)
            self.assertTrue(model.predict(vectors[2]) <= 0)
            self.assertTrue(model.predict(vectors[3]) > 0)
开发者ID:greatyan,项目名称:spark,代码行数:51,代码来源:tests.py


示例12: main

def main(sc):
    """Train (or load a cached) SVM on the sample data and dump predictions.

    Loads the model from ./model when present, otherwise trains and saves it,
    then writes tab-separated (label, prediction) lines to ./labelsAndPreds.
    """
    train_data = '/usr/local/spark/data/mllib/sample_svm_data.txt'
    points = sc.textFile(train_data).map(parse)

    # Reuse a previously saved model when one exists on disk.
    if os.path.exists('model'):
        model = SVMModel.load(sc, 'model')
    else:
        model = SVMWithSGD.train(points, iterations=100)
        model.save(sc, 'model')

    labelsAndPreds = points.map(lambda p: (p.label, model.predict(p.features)))

    labelsAndPreds.map(lambda x: str(x[0]) + '\t' + str(x[1])).saveAsTextFile('labelsAndPreds')
开发者ID:feng1008,项目名称:spark,代码行数:16,代码来源:svm_test.py


示例13: training

def training(path):
    """Build LR, Naive Bayes and SVM models from a labeled text dataset.

    Splits the parsed data 60/40 (seed 17), builds a bag-of-words feature set
    from the training split only, and returns
    (lrModel, nbModel, svmModel, labeled test points, features).
    """
    raw_data = sc.textFile(path)
    # Parse each raw line into a (label, bag-of-words) pair.
    parsed_data = raw_data.map(lambda line: parse_line(line))
    training_set, test_set = parsed_data.randomSplit([0.6, 0.4], 17)
    # Feature vocabulary comes from the training split to avoid leakage.
    features = feature_extraction(training_set)
    labeled_points_training = training_set.map(lambda line: construct_labeled_point(line, features))
    labeled_points_test = test_set.map(lambda line: construct_labeled_point(line, features))
    # Three classifiers trained on the same labeled points.
    lrModel = LogisticRegressionWithLBFGS.train(labeled_points_training)
    nbModel = NaiveBayes.train(labeled_points_training)
    svmModel = SVMWithSGD.train(labeled_points_training)
    return lrModel, nbModel, svmModel, labeled_points_test, features
开发者ID:JiayingYu,项目名称:twitter_event_monitor_Spark,代码行数:17,代码来源:app.py


示例14: main

def main(sc):
    """Train an SVM on LabeledPoints parsed from argv[1] and save it to argv[2].

    Fix: the training-error filter used a Python 2 tuple-unpacking lambda
    (`lambda (v, p): ...`), removed by PEP 3113; it now indexes the tuple.
    Prints the training error, saves the model, then stops the context.
    """
    inputFile = sys.argv[1]
    modelPath = sys.argv[2]

    data = sc.textFile(inputFile)
    parsedData = data.map(parsePoint)

    # Build the model
    model = SVMWithSGD.train(parsedData, iterations=100)

    # Evaluating the model on training data
    labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
    print("Training Error = " + str(trainErr))

    # Save the model; reload later with SVMModel.load(sc, modelPath)
    model.save(sc, modelPath)
    sc.stop()
开发者ID:feng1008,项目名称:spark,代码行数:19,代码来源:qianka_SVM.py


示例15: train_trend_model

    def train_trend_model(self, model, data, i):
        """Fit the up/down direction classifier selected by self.trend_prediction_method.

        :param model: previous model; its weights warm-start the SGD-based
            methods (may be None for a cold start)
        :param data: sequence of LabeledPoint training examples
        :param i: unused here; kept for interface compatibility with callers
        :return: the trained classification model
        """
        self.logger.info('Start to train the direction model')
        rdd_data = self.sc.parallelize(data)
        method = self.trend_prediction_method
        if method == self.RANDOM_FOREST:
            model = RandomForest.trainClassifier(
                rdd_data, numClasses=2, categoricalFeaturesInfo={}, numTrees=40,
                featureSubsetStrategy="auto", impurity='gini', maxDepth=20,
                maxBins=32)
        elif method == self.NAIVE_BAYES:
            model = NaiveBayes.train(rdd_data)
        elif method == self.LOGISTIC_REGRESSION:
            model = LogisticRegressionWithSGD.train(
                rdd_data, iterations=10000, step=0.001,
                initialWeights=None if model is None else model.weights)
        elif method == self.SVM:
            model = SVMWithSGD.train(
                rdd_data, iterations=10000, step=0.001,
                initialWeights=None if model is None else model.weights)
        return model
开发者ID:WarnWang,项目名称:Dissertation,代码行数:19,代码来源:composition_prediction_system.py


示例16: svm_train

def svm_train(sc, top_path, stopwords_dict=None):
    """Train a binary SVM text classifier from the two class sub-folders of top_path.

    Selects the most discriminative words as features, builds TF-IDF vectors
    for every document, trains SVMWithSGD, and pickles
    (features, IDF, classifier) next to this module.

    :param sc: SparkContext used to parallelize the training vectors
    :param top_path: directory that must contain exactly two sub-folders,
        one per class
    :param stopwords_dict: if not None, load stopwords_dict.txt from this
        module's directory instead of the default stopwords.txt
    """
    # Stop-word dictionary hook: drop a replacement dictionary file into this
    # module's directory to use it instead of the default.
    curpath = os.path.normpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    if stopwords_dict is None:
        stopwords = set(read_file(os.path.join(curpath, u"stopwords.txt")).split())
    else:
        stopwords = set(read_file(os.path.join(curpath, u"stopwords_dict.txt")).split())

    # Tokenize every document in both class folders, drop stop words, and build
    # per-class word-frequency results, e.g. {'pos': [counter, ...], 'neg': [counter, ...]}

    sub_folder = os.listdir(top_path)
    if len(sub_folder) != 2:
        raise OSError("need and only need two folder")

    top_folder_dict = {}
    for name in sub_folder:
        top_folder_dict[name] = pre_process(os.path.join(top_path, name), stopwords)

    # Select the topk words that best separate the two classes as the feature set.
    topk = 500
    features = feature_selection(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[0]], topk)

    # Compute the IDF over the two classes.
    # NOTE(review): both arguments are top_folder_dict[sub_folder[1]] -- the
    # second was presumably meant to be sub_folder[0] (as in the
    # feature_selection call above); confirm against idf()'s contract.
    IDF = idf(top_folder_dict[sub_folder[1]], top_folder_dict[sub_folder[1]], features)

    # TF-IDF vector representation of every document of each class: [(...), ...]
    vector1 = {'1.0': feature_vector(tf(top_folder_dict[sub_folder[1]], features), IDF)}
    vector0 = {'0.0': feature_vector(tf(top_folder_dict[sub_folder[0]], features), IDF)}

    # Convert to Spark's expected input format: [LabeledPoint(0.0, [...]), ...]
    labpoint1 = [LabeledPoint(1.0, list) for list in vector1['1.0']]
    labpoint0 = [LabeledPoint(0.0, list) for list in vector0['0.0']]
    train_data = labpoint1 + labpoint0

    classifier = SVMWithSGD.train(sc.parallelize(train_data))

    # Overwrite any previously pickled model for this class pair.
    path = os.path.join(curpath, 'svm_' + sub_folder[1] + '_' + sub_folder[0] + '.pkl')
    if os.path.isfile(path): os.remove(path)

    with open(path, 'wb') as output:
        pickle.dump((features, IDF, classifier), output)
开发者ID:hanwei2008,项目名称:Virtual_Environment,代码行数:41,代码来源:svmtrain.py


示例17: train_model

def train_model(training_data, iterations, model_file_path, calculate_error=True):
    """Train an SVM model from a text file of points and persist it.

    Fixes: the `print` statement and the tuple-unpacking lambda
    (`lambda (v, p): ...`) are Python 2 only; converted to `print()` and an
    indexed lambda. Also corrected the "SVG" typo in the original docstring.

    :param training_data: path to the input text file (one point per line)
    :param iterations: number of SGD iterations
    :param model_file_path: directory where the trained model is saved
    :param calculate_error: when True, also compute and print the training error
    :return: None
    """
    parsed_data = sc.textFile(training_data).map(parse_point)

    # Build the model
    model = SVMWithSGD.train(parsed_data, iterations=iterations)

    # Save the model
    model.save(sc, model_file_path)
    print("Model saved in: ", model_file_path)

    if calculate_error:
        # Predictions over the training set itself.
        labelsAndPreds = parsed_data.map(lambda p: (p.label, model.predict(p.features)))
        trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsed_data.count())
        print("============Training Error = " + str(trainErr))
开发者ID:cmantas,项目名称:asap.cslab,代码行数:22,代码来源:svm_spark.py


示例18: test_classification

    def test_classification(self):
        """LR, SVM, NaiveBayes and DecisionTree must separate two trivially
        separable sparse classes.

        Raw classifier scores are checked only for sign: <= 0 for class-0
        points, > 0 for class-1 points.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree
        points = [
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 1.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 1.0})),
            LabeledPoint(0.0, self.scipy_matrix(2, {0: 2.0})),
            LabeledPoint(1.0, self.scipy_matrix(2, {1: 2.0}))
        ]
        rdd = self.sc.parallelize(points)
        vectors = [p.features for p in points]

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        models = [
            LogisticRegressionWithSGD.train(rdd),
            SVMWithSGD.train(rdd),
            NaiveBayes.train(rdd),
            DecisionTree.trainClassifier(
                rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo),
        ]
        for model in models:
            # Labels alternate 0/1 in `points`; scores must match that pattern.
            self.assertTrue(model.predict(vectors[0]) <= 0)
            self.assertTrue(model.predict(vectors[1]) > 0)
            self.assertTrue(model.predict(vectors[2]) <= 0)
            self.assertTrue(model.predict(vectors[3]) > 0)
开发者ID:drewrobb,项目名称:spark,代码行数:37,代码来源:test_linalg.py


示例19: parsePoint

# Build train/test RDDs of LabeledPoints from collected Rows, train an SVM,
# and report the train/test error rates plus ROC / PR AUC.
#
# Fixes: Python 2 `print` statements and tuple-unpacking lambdas converted to
# Python 3; the redundant deepcopy + lazy `map(float, ...)` in parsePoint
# removed (the values are already floats, and a lazy map object would not be
# a materialized feature list under Python 3).
train_dict = [row.asDict() for row in feats_train]

feats_test = test.collect()
test_dict = [row.asDict() for row in feats_test]

def parsePoint(d):
    """Turn a feature dict into a LabeledPoint labeled by its 'success_class'.

    NOTE: mutates `d` by removing the 'success_class' key so the remaining
    values become the feature vector (same as the original behavior).
    """
    pred = d['success_class']
    d.pop('success_class', None)
    values = [float(x) for x in d.values()]
    return LabeledPoint(pred, values)

trainParsed = sc.parallelize([parsePoint(d) for d in train_dict])
testParsed = sc.parallelize([parsePoint(d) for d in test_dict])

model = SVMWithSGD.train(trainParsed, iterations=100)

# Training Error
trainLabelsAndPreds = trainParsed.map(lambda p: (p.label, float(model.predict(p.features))))
trainErr = trainLabelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(trainParsed.count())
print(trainErr)

# Test Error
testLabelsAndPreds = testParsed.map(lambda p: (p.label, float(model.predict(p.features))))
testErr = testLabelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(testParsed.count())
print(testErr)

metrics = BinaryClassificationMetrics(testLabelsAndPreds)

print(metrics.areaUnderROC)
print(metrics.areaUnderPR)
开发者ID:arifyali,项目名称:Yelp,代码行数:31,代码来源:jordan_hive_spark.py


示例20: test_classification

    def test_classification(self):
        """Train six MLlib classifiers on two trivially separable dense classes
        and check their predictions; additionally round-trip the tree models
        (DecisionTree, RandomForest, GradientBoostedTrees) through save/load
        and compare debug strings.

        Raw classifier scores are checked only for sign: <= 0 for class-0
        points, > 0 for class-1 points.
        """
        from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
        from pyspark.mllib.tree import DecisionTree, DecisionTreeModel, RandomForest,\
            RandomForestModel, GradientBoostedTrees, GradientBoostedTreesModel
        # Two linearly separable classes in 3-D dense vectors.
        data = [
            LabeledPoint(0.0, [1, 0, 0]),
            LabeledPoint(1.0, [0, 1, 1]),
            LabeledPoint(0.0, [2, 0, 0]),
            LabeledPoint(1.0, [0, 2, 1])
        ]
        rdd = self.sc.parallelize(data)
        features = [p.features.tolist() for p in data]

        # Scratch directory for the model save/load round-trips below.
        temp_dir = tempfile.mkdtemp()

        lr_model = LogisticRegressionWithSGD.train(rdd, iterations=10)
        self.assertTrue(lr_model.predict(features[0]) <= 0)
        self.assertTrue(lr_model.predict(features[1]) > 0)
        self.assertTrue(lr_model.predict(features[2]) <= 0)
        self.assertTrue(lr_model.predict(features[3]) > 0)

        svm_model = SVMWithSGD.train(rdd, iterations=10)
        self.assertTrue(svm_model.predict(features[0]) <= 0)
        self.assertTrue(svm_model.predict(features[1]) > 0)
        self.assertTrue(svm_model.predict(features[2]) <= 0)
        self.assertTrue(svm_model.predict(features[3]) > 0)

        nb_model = NaiveBayes.train(rdd)
        self.assertTrue(nb_model.predict(features[0]) <= 0)
        self.assertTrue(nb_model.predict(features[1]) > 0)
        self.assertTrue(nb_model.predict(features[2]) <= 0)
        self.assertTrue(nb_model.predict(features[3]) > 0)

        categoricalFeaturesInfo = {0: 3}  # feature 0 has 3 categories
        dt_model = DecisionTree.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, maxBins=4)
        self.assertTrue(dt_model.predict(features[0]) <= 0)
        self.assertTrue(dt_model.predict(features[1]) > 0)
        self.assertTrue(dt_model.predict(features[2]) <= 0)
        self.assertTrue(dt_model.predict(features[3]) > 0)

        # Decision tree must survive a save/load round-trip unchanged.
        dt_model_dir = os.path.join(temp_dir, "dt")
        dt_model.save(self.sc, dt_model_dir)
        same_dt_model = DecisionTreeModel.load(self.sc, dt_model_dir)
        self.assertEqual(same_dt_model.toDebugString(), dt_model.toDebugString())

        rf_model = RandomForest.trainClassifier(
            rdd, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo, numTrees=10,
            maxBins=4, seed=1)
        self.assertTrue(rf_model.predict(features[0]) <= 0)
        self.assertTrue(rf_model.predict(features[1]) > 0)
        self.assertTrue(rf_model.predict(features[2]) <= 0)
        self.assertTrue(rf_model.predict(features[3]) > 0)

        # Random forest save/load round-trip.
        rf_model_dir = os.path.join(temp_dir, "rf")
        rf_model.save(self.sc, rf_model_dir)
        same_rf_model = RandomForestModel.load(self.sc, rf_model_dir)
        self.assertEqual(same_rf_model.toDebugString(), rf_model.toDebugString())

        gbt_model = GradientBoostedTrees.trainClassifier(
            rdd, categoricalFeaturesInfo=categoricalFeaturesInfo, numIterations=4)
        self.assertTrue(gbt_model.predict(features[0]) <= 0)
        self.assertTrue(gbt_model.predict(features[1]) > 0)
        self.assertTrue(gbt_model.predict(features[2]) <= 0)
        self.assertTrue(gbt_model.predict(features[3]) > 0)

        # Gradient-boosted trees save/load round-trip.
        gbt_model_dir = os.path.join(temp_dir, "gbt")
        gbt_model.save(self.sc, gbt_model_dir)
        same_gbt_model = GradientBoostedTreesModel.load(self.sc, gbt_model_dir)
        self.assertEqual(same_gbt_model.toDebugString(), gbt_model.toDebugString())

        # Best-effort cleanup of the scratch directory.
        try:
            rmtree(temp_dir)
        except OSError:
            pass
开发者ID:HodaAlemi,项目名称:spark,代码行数:75,代码来源:tests.py



注:本文中的pyspark.mllib.classification.SVMWithSGD类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python clustering.GaussianMixture类代码示例发布时间:2022-05-26
下一篇:
Python classification.LogisticRegressionWithSGD类代码示例发布时间:2022-05-26
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2023 极客世界.|Sitemap