• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python evaluation.MulticlassClassificationEvaluator类代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中pyspark.ml.evaluation.MulticlassClassificationEvaluator的典型用法代码示例。如果您正苦于以下问题:Python MulticlassClassificationEvaluator类的具体用法?Python MulticlassClassificationEvaluator怎么用?Python MulticlassClassificationEvaluator使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。



在下文中一共展示了MulticlassClassificationEvaluator类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: RunRandomForest

def RunRandomForest(tf, ctx):
	sqlContext = SQLContext(ctx)
	rdd = tf.map(parseForRandomForest)
	# The schema is encoded in a string.
	schema = ['genre', 'track_id', 'features']
	# Apply the schema to the RDD.
	songDF = sqlContext.createDataFrame(rdd, schema)

	# Register the DataFrame as a table.
	songDF.registerTempTable("genclass")
	labelIndexer = StringIndexer().setInputCol("genre").setOutputCol("indexedLabel").fit(songDF)

	trainingData, testData = songDF.randomSplit([0.8, 0.2])

	labelConverter = IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels)

	rfc = RandomForestClassifier().setMaxDepth(10).setNumTrees(2).setLabelCol("indexedLabel").setFeaturesCol("features")
	#rfc = SVMModel([.5, 10, 20], 5)
	#rfc = LogisticRegression(maxIter=10, regParam=0.01).setLabelCol("indexedLabel").setFeaturesCol("features")

	pipeline = Pipeline(stages=[labelIndexer, rfc, labelConverter])
	model = pipeline.fit(trainingData)

	predictions = model.transform(testData)
	predictions.show()

	evaluator = MulticlassClassificationEvaluator().setLabelCol("indexedLabel").setPredictionCol("prediction").setMetricName("precision")
	accuracy = evaluator.evaluate(predictions)
	print 'Accuracy of RandomForest = ', accuracy * 100
	print "Test Error = ", (1.0 - accuracy) * 100
开发者ID:Sunhick,项目名称:music-cognita,代码行数:30,代码来源:genre_classification.py


示例2: testLogisticMLPipeline1

 def testLogisticMLPipeline1(self):
     """Fit a Tokenizer -> HashingTF -> LogisticRegression pipeline and score it.

     Trains on twelve labelled sentences, predicts four held-out sentences
     and asserts that the default MulticlassClassificationEvaluator score on
     those predictions is exactly 1.0.
     """
     training = sqlCtx.createDataFrame([
         ("a b c d e spark", 1.0),
         ("b d", 2.0),
         ("spark f g h", 1.0),
         ("hadoop mapreduce", 2.0),
         ("b spark who", 1.0),
         ("g d a y", 2.0),
         ("spark fly", 1.0),
         ("was mapreduce", 2.0),
         ("e spark program", 1.0),
         ("a e c l", 2.0),
         ("spark compile", 1.0),
         ("hadoop software", 2.0)
         ], ["text", "label"])
     tokenizer = Tokenizer(inputCol="text", outputCol="words")
     hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20)
     lr = LogisticRegression(sqlCtx)
     pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
     model = pipeline.fit(training)
     test = sqlCtx.createDataFrame([
         ("spark i j k", 1.0),
         ("l m n", 2.0),
         ("mapreduce spark", 1.0),
         ("apache hadoop", 2.0)], ["text", "label"])
     result = model.transform(test)
     predictionAndLabels = result.select("prediction", "label")
     evaluator = MulticlassClassificationEvaluator()
     score = evaluator.evaluate(predictionAndLabels)
     # `failUnless` is a deprecated unittest alias (removed in Python 3.12);
     # assertEqual performs the same `==` check and reports both values on failure.
     self.assertEqual(score, 1.0)
开发者ID:d-behi,项目名称:incubator-systemml,代码行数:30,代码来源:test_mllearn.py


示例3: main

def main(sc, spark):
    """Train a multinomial logistic regression on the corpus and print its test error.

    :param sc: passed through to ``load_corpus``.
    :param spark: passed through to ``load_corpus``.
    """
    # Load the corpus and fit the vectorizer on all of it.
    corpus = load_corpus(sc, spark)
    vectorizer_model = make_vectorizer().fit(corpus)

    # Fit the label -> index mapping on the full corpus as well.
    label_indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    label_index_model = label_indexer.fit(corpus)

    # 80/20 train/test split.
    train_df, test_df = corpus.randomSplit([0.8, 0.2])

    classifier = LogisticRegression(
        featuresCol="tfidf",
        labelCol="indexedLabel",
        family="multinomial",
        maxIter=10,
        regParam=0.3,
        elasticNetParam=0.8,
    )

    stages = [vectorizer_model, label_index_model, classifier]
    fitted = Pipeline(stages=stages).fit(train_df)

    # Score the held-out rows and preview a few of them.
    scored = fitted.transform(test_df)
    scored.select("prediction", "indexedLabel", "tfidf").show(5)

    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    test_error = 1.0 - evaluator.evaluate(scored)
    print("Test Error = %g" % test_error)

    # The third pipeline stage is the fitted classifier; print its summary.
    fitted_classifier = fitted.stages[2]
    print(fitted_classifier)
开发者ID:yokeyong,项目名称:atap,代码行数:34,代码来源:sc_classification.py


示例4: textPredict

def textPredict(request):
    """6. Text clustering and popularity prediction.

    Reads ``label`` and ``title`` from ``request.POST``, trains a decision
    tree on TF-IDF features of the stored news titles, scores the submitted
    title with it, prints the error measured on the held-out split and
    renders the prediction for the submitted title.

    The SparkContext is stopped in a ``finally`` clause, so it is released
    even when one of the Spark steps raises.

    :param request: request object exposing ``POST['label']`` and
        ``POST['title']``.
    :return: whatever ``render`` returns.
    """
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    try:
        sqlContext = SQLContext(sc)

        # Prepare the data set and build the feature vectors.
        dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
        print(dfTitles.dtypes)
        tokenizer = Tokenizer(inputCol="title", outputCol="words")
        wordsData = tokenizer.transform(dfTitles)
        hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
        featurizedData = hashingTF.transform(wordsData)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)
        rescaledData.show()
        for features_label in rescaledData.select("features", "rawFeatures").take(3):
            print(features_label)

        # Train the decision-tree model.
        labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
        featureIndexer = VectorIndexer(
            inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
        (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
        dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
        model = pipeline.fit(trainingData)

        # Test the model on the held-out split.
        predictions = model.transform(testData)
        predictions.show()
        predictions.select("prediction", "indexedLabel", "features").show(5)

        # Score the single news item supplied by the user. The tokenizer and
        # hashing stage are stateless and configured identically, so the
        # instances created above are reused instead of being rebuilt.
        sentenceData = sqlContext.createDataFrame([
            (label, title),
        ], ['label', "title"])
        userWords = tokenizer.transform(sentenceData)
        userFeaturized = hashingTF.transform(userWords)
        userRescaled = idfModel.transform(userFeaturized)
        myprediction = model.transform(userRescaled)
        print("==================================================")
        myprediction.show()
        resultList = convertDfToList(myprediction)

        # Evaluate the model on the held-out predictions.
        evaluator = MulticlassClassificationEvaluator(
            labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
        accuracy = evaluator.evaluate(predictions)
        print("Test Error = %g " % (1.0 - accuracy))

        # The third pipeline stage is the fitted decision tree.
        treeModel = model.stages[2]
        print(treeModel)
    finally:
        # Always release the cluster connection, including on failure.
        sc.stop()

    # NOTE(review): the context dict is passed as the second positional
    # argument; if `render` is django.shortcuts.render, that slot is the
    # template name -- confirm against the import.
    return render(request,{'resultList':resultList})
开发者ID:JallyHe,项目名称:networkPublicOpinionAnalysisSystem,代码行数:58,代码来源:views.py


示例5: sparking_your_interest

def sparking_your_interest():
    """Build n-gram/vocabulary features for speeches and train a speaker classifier.

    Loads ``speeches_dataset.json``, runs the cleaning and feature helpers
    defined elsewhere in this file, assembles a ``features`` vector, splits
    the rows 80/20 with seed 123, fits a random forest on the training split
    and prints the error measured on the test split.

    Fixes relative to the previous version:

    * the pipeline is fitted on ``train_df`` instead of the full ``output``
      frame, so the test rows are no longer seen during training;
    * the assembler combines the 2-, 3- and 4-gram feature columns instead
      of listing ``2gramsfeatures`` three times;
    * the 5-gram and 6-gram steps request n=5 and n=6 instead of n=4;
    * the null-filled frame is the one passed on to the cleaning helpers;
    * ``show()``/``printSchema()`` are no longer wrapped in ``print``, which
      only added a stray ``None`` line.
    """
    df = SQLContext.read.json('speeches_dataset.json')
    df_fillna = df.fillna("")
    print(df_fillna.count())
    df_fillna.printSchema()

    # Continue with the null-filled frame; it was previously computed and
    # then ignored in favour of the raw one.
    df_utf = call_utf_encoder(df_fillna)
    df_cleaned = call_para_cleanup(df_utf)
    print(df_cleaned)
    df_with_bigrams = call_ngrams(df_cleaned, 2)
    df_with_trigrams = call_ngrams(df_with_bigrams, 3)
    df_with_4grams = call_ngrams(df_with_trigrams, 4)
    df_with_5grams = call_ngrams(df_with_4grams, 5)
    df_with_6grams = call_ngrams(df_with_5grams, 6)
    df_with_vocab_score = call_speech_vocab(df_with_6grams)

    df_with_2grams_idf_vectors = tf_feature_vectorizer(df_with_vocab_score, 100, '2grams')
    df_with_3grams_idf_vectors = tf_feature_vectorizer(df_with_2grams_idf_vectors, 100, '3grams')
    df_with_4grams_idf_vectors = tf_feature_vectorizer(df_with_3grams_idf_vectors, 100, '4grams')

    # NOTE(review): the "<n>gramsfeatures" names follow the pattern of the
    # original "2gramsfeatures" entry -- confirm against tf_feature_vectorizer.
    assembler = VectorAssembler(
        inputCols=["2gramsfeatures", "3gramsfeatures", "4gramsfeatures", "vocab_score"],
        outputCol="features")
    assembler_output = assembler.transform(df_with_4grams_idf_vectors)
    output = assembler_output.selectExpr('speaker', 'speech_id', 'para_cleaned_text', 'features')
    output.show()
    print(output.count())

    # Seeded 80/20 split, done at the RDD level and converted back to frames.
    output_tordd = output.rdd
    train_rdd, test_rdd = output_tordd.randomSplit([0.8, 0.2], 123)
    train_df = train_rdd.toDF()
    test_df = test_rdd.toDF()
    print(train_df)
    print(test_df)

    print('Train DF - Count: ')
    print(train_df.count())
    print('Test DF - Count: ')
    print(test_df.count())

    print("Initializing RF Model")
    labelIndexer = StringIndexer(inputCol="speaker", outputCol="indexedLabel").fit(train_df)
    rf = RandomForestClassifier(
        labelCol="indexedLabel", featuresCol="features", numTrees=1000,
        featureSubsetStrategy="auto", impurity='gini', maxDepth=4, maxBins=32)
    pipeline = Pipeline(stages=[labelIndexer, rf])
    # Fit on the training split only; fitting on `output` leaked the test rows.
    model = pipeline.fit(train_df)
    print("Completed RF Model")

    predictions = model.transform(test_df)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))
    rfModel = model.stages[1]
    print(rfModel)  # summary only
    print("Predictions: ")
    predictions.show()
开发者ID:vikaasa,项目名称:Spark_Workshop,代码行数:54,代码来源:sparking_your_interest.py


示例6: model

def model(classifier, ftrain, fvalid, fprediction):
    """Train the named classifier on ``ftrain``, score ``fvalid`` and save results.

    Saves the joined validation/prediction rows under ``output`` and a
    ``<classifier>,<seconds>`` timing row under ``timing`` (both through
    ``saveAsTextFile``), and prints the test error.

    :param classifier: key into the local classifier table; only
        ``"RandomForestClassifier"`` is registered, other keys raise KeyError.
    :param ftrain: training input handed to ``prepData``.
    :param fvalid: validation input handed to ``prepData``.
    :param fprediction: not used by this function.
    """
    startTime = time.time()

    ctx = SparkContext(appName="model_on_Spark")
    sqlContext = SQLContext(ctx)
    logger = SparkLogger(ctx)
    logger.set_level('ERROR')

    # Load and prepare the training and validation data.
    rawTrain, train = prepData(sqlContext, ctx, ftrain)
    rawValid, valid = prepData(sqlContext, ctx, fvalid)

    # Needed for the join below (presumably adds the "index" column -- confirm
    # against indexData).
    valid = indexData(valid)
    rawValid = indexData(rawValid)

    available = {"RandomForestClassifier": RFC}
    estimator = available[classifier]()

    stages = [
        StringIndexer(inputCol="label", outputCol="indexed"),
        VectorIndexer(inputCol="features", outputCol="indexedFeatures"),
        estimator,
    ]

    # Train on the training frame, then score the validation frame.
    fitted = Pipeline(stages=stages).fit(train)
    scored = fitted.transform(valid)

    # Pair each validation row's "dataset" value with its prediction and
    # write the result out as CSV lines.
    predicted = scored.select("prediction", "index")
    datasets = rawValid.select("dataset", "index")
    joined = datasets.join(predicted, predicted.index == datasets.index)
    joined = joined.drop("index").drop("index")
    joined.map(toCSVLine).saveAsTextFile('output')

    evaluator = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(scored)
    print "Test Error = %g" % (1.0 - accuracy)

    # Record how long the whole run took.
    elapsed = time.time() - startTime
    timing_row = classifier + ',' + str(elapsed)
    ctx.parallelize([timing_row]).saveAsTextFile("timing")
开发者ID:KiprasKancys,项目名称:DMWMAnalytics,代码行数:53,代码来源:pyspark_ml.py


示例7: build_decision_tree

def build_decision_tree(sqlContext, features, interested):
	print '-----------------------------------------'
	data = sqlContext.createDataFrame(
			[Row(label=interested[i],features=Vectors.dense(features[i])) for i in xrange(len(features))])
	data.printSchema()
	data.show(5)
	print 'created data frame'

	# Index the label column & adding metadata.
	labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
	print 'created label indexer'

	# Mark the features with < 4 distinct values as categorical
	featureIndexer = VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

	# Split the data into training and test sets
	(trainingData, testData) = data.randomSplit([0.8, 0.2])

	# Train a DecisionTree model
	dt = RandomForestClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
#	dt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

	# Chain the indexers together with DecisionTree
	pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])

	# Train the model
	model = pipeline.fit(trainingData)

	# Make predictions
	predictions = model.transform(testData)

	predictions.select("prediction", "indexedLabel", "features").show(5)

	# Select (prediction, true label) & compute test error
	evaluator = MulticlassClassificationEvaluator(
			labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
	precision = evaluator.evaluate(predictions)

	treeModel = model.stages[2]
	return (1 - precision, model)
开发者ID:ngthanhtrung23,项目名称:Personalized-Events-Recommendation,代码行数:41,代码来源:decision_tree.py


示例8: naiveBayeseian

def naiveBayeseian():
    """Train a multilayer perceptron on the two-class CSV and print its precision.

    NOTE(review): the name mentions naive Bayes, but the estimator built here
    is a MultilayerPerceptronClassifier.
    """

    def _to_floats(line):
        # One CSV line -> list of floats.
        return [float(token) for token in line.split(",")]

    raw_lines = sc.textFile("/home/ubantu/TwoClassfeatureSet.csv")
    parsed = raw_lines.map(_to_floats)

    # Seeded 80/20 split.
    parts = parsed.randomSplit([0.8, 0.2], 1234)
    train, test = parts[0], parts[1]

    # Layer sizes: 30, two of 20, then 2.
    layer_sizes = [30, 20, 20, 2]
    # NOTE(review): `train` derives from a mapped text file of float lists --
    # confirm that it is acceptable input for `fit` below.
    perceptron = MultilayerPerceptronClassifier(
        maxIter=100, layers=layer_sizes, blockSize=128, seed=1234)
    fitted = perceptron.fit(train)

    # Compute precision on the test split.
    scored = fitted.transform(test)
    prediction_and_labels = scored.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    precision = evaluator.evaluate(prediction_and_labels)
    print("Precision:" + str(precision))
开发者ID:vikaspalkar,项目名称:Parallel-Music-Genre-Classification,代码行数:21,代码来源:SparkClassn.py


示例9: price_predict

def price_predict(path, windows=5, spark_contest=None, sql_context=None):
    """Train one perceptron on open-price data and one on close-price data.

    Each model is fitted on its training frame, applied to its test frame,
    and the evaluator's score is printed with a ``Precision:`` prefix.

    :param path: data location handed to ``DataParser``.
    :param windows: window size handed to ``DataParser``.
    :param spark_contest: Spark context; when ``None`` both contexts are
        obtained from ``load_spark_context()``.
    :param sql_context: SQL context, used as given unless ``spark_contest``
        is ``None``.
    """
    if spark_contest is None:
        spark_contest, sql_context = load_spark_context()

    parser = DataParser(path=path, window_size=windows)
    frames = parser.get_n_days_history_data(
        data_type=DATA_FRAME, spark_context=spark_contest, sql_context=sql_context)
    close_train_df, close_test_df, open_train_df, open_test_df = frames

    # NOTE(review): PREDICTION is also the column name selected below; passing
    # it as metricName looks unintended -- confirm.
    evaluator = MulticlassClassificationEvaluator(metricName=PREDICTION)

    def _fit_and_report(train_df, test_df, max_iter):
        # Fit a 4-5-4-3 perceptron, score the test frame, print the metric.
        trainer = MultilayerPerceptronClassifier(
            maxIter=max_iter, layers=[4, 5, 4, 3], blockSize=128,
            featuresCol=FEATURES, labelCol=LABEL, seed=1234)
        fitted = trainer.fit(train_df)
        scored = fitted.transform(test_df)
        prediction_labels = scored.select(PREDICTION, LABEL)
        print("Precision:" + str(evaluator.evaluate(prediction_labels)))

    # Open data is trained with a single iteration, close data with 100.
    _fit_and_report(open_train_df, open_test_df, 1)
    _fit_and_report(close_train_df, close_test_df, 100)
开发者ID:WarnWang,项目名称:Dissertation,代码行数:23,代码来源:neutral_network_MLP.py


示例10: print_evaluation_metrics

def print_evaluation_metrics(model, test_df, labelCol="label", featuresCol="features"):
    """
    Prints evaluation metrics.
    :param model: Used model.
    :param test_df: dataframe containing test data.
    :param labelCol: label column.
    :param featuresCol: features column.
    :return: A DataFrame.
    """
    predictions = model.transform(test_df)


    # Select (prediction, true label) and compute test error
    evaluator = MulticlassClassificationEvaluator(
        labelCol=labelCol, predictionCol="prediction",)
    accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
    f1 = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})
    weighted_precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
    weighted_recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
    print "Accuracy:", accuracy
    print "f1:", f1
    print "Precision:", weighted_precision
    print "Recall:", weighted_recall
开发者ID:rhasan,项目名称:machine-learning,代码行数:23,代码来源:Quora.py


示例11: calculate_accuracy_metrics

def calculate_accuracy_metrics(predictions):

    """
    Calculates accuracy metrics for a Prediction DataFrame

    :param predictions: DataFrame with ``indexedLabel`` and ``prediction``
        columns; labels 1.0 and 0.0 are treated as positive and negative.
    :return: list ``[precision, recall, positives, negatives,
        false_positives, false_negatives]``; the first two are rounded to
        two decimals, the rest are row counts.
    """
    evaluator = MulticlassClassificationEvaluator(labelCol="indexedLabel",
                                                  predictionCol="prediction")
    accuracy = round(evaluator.evaluate(predictions, {evaluator.metricName: "precision"}), 2)
    recall = round(evaluator.evaluate(predictions, {evaluator.metricName: "recall"}), 2)

    positive_cases = predictions.filter(predictions["indexedLabel"] == 1.0)
    negative_cases = predictions.filter(predictions["indexedLabel"] == 0.0)
    # Each filter uses a column from the frame it is applied to. The
    # false-positive filter previously borrowed the column from
    # `positive_cases`, a different DataFrame.
    false_positive_cases = negative_cases.filter(negative_cases["prediction"] == 1.0)
    false_negative_cases = positive_cases.filter(positive_cases["prediction"] == 0.0)

    return [accuracy,
            recall,
            positive_cases.count(),
            negative_cases.count(),
            false_positive_cases.count(),
            false_negative_cases.count()]
开发者ID:USF-ML2,项目名称:SKYNET-,代码行数:24,代码来源:modeling_utils.py


示例12: display

# Select results to view
display(predictions.select("label", "prediction", "probability"))

# COMMAND ----------

# MAGIC %md
# MAGIC #### Model Evaluation
# MAGIC 
# MAGIC To evaluate our model, we will be making use of the Evaluator in MulticlassClassification. Note that f1-score is the default metric for the MulticlassClassificationEvaluator.

# COMMAND ----------

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare the "prediction" column with the "label" column using the
# "precision" metric instead of the default.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="precision")
# NOTE: the value stored and printed as "accuracy" is the evaluator's
# "precision" metric configured above.
accuracy = evaluator.evaluate(predictions)
print "Model Accuracy: ", accuracy

# COMMAND ----------

# MAGIC %md
# MAGIC The Evaluator is able to use a few metrics such as f1-score, precision, recall, weightedPrecision and weightedRecall.
# MAGIC 
# MAGIC evaluator.setMetricName("insert_metric_here") can be used to change the metric used to evaluate models.

# COMMAND ----------

# Describe the metricName parameter of the evaluator.
evaluator.explainParam("metricName")

# COMMAND ----------
开发者ID:Inscrutive,项目名称:spark,代码行数:31,代码来源:NaiveBayes.py


示例13: LogisticRegression

    # $example on$
    # load data file.
    inputData = spark.read.format("libsvm") \
        .load("data/mllib/sample_multiclass_classification_data.txt")

    # generate the train/test split.
    (train, test) = inputData.randomSplit([0.8, 0.2])

    # instantiate the base classifier.
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

    # instantiate the One Vs Rest Classifier.
    ovr = OneVsRest(classifier=lr)

    # train the multiclass model.
    ovrModel = ovr.fit(train)

    # score the model on test data.
    predictions = ovrModel.transform(test)

    # obtain evaluator.
    evaluator = MulticlassClassificationEvaluator(metricName="precision")

    # compute the classification error on test data.
    precision = evaluator.evaluate(predictions)
    print("Test Error : " + str(1 - precision))
    # $example off$

    spark.stop()
开发者ID:15652101501,项目名称:spark,代码行数:29,代码来源:one_vs_rest_example.py


示例14: HashingTF

)
# Feature stages: term frequencies of the tokenizer output, then TF-IDF.
hashing_tf = HashingTF(inputCol=tokenizerNoSw.getOutputCol(), outputCol="reviews_tf")
idf = IDF(inputCol=hashing_tf.getOutputCol(), outputCol="reviews_tfidf")
# Map the "label" column to the numeric "target_indexed" column.
string_indexer = StringIndexer(inputCol="label", outputCol="target_indexed")
# Decision tree trained on the TF-IDF vectors against the indexed label.
dt = DecisionTreeClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)

# Full pipeline: tokenizer -> TF -> IDF -> label indexer -> decision tree.
pipeline = Pipeline(stages=[tokenizerNoSw, hashing_tf, idf, string_indexer, dt])


# ****************************************************************
# *********************CROSS VALIDATION: 80%/20%******************
# *******************Model: DecisionTreeClassifier*****************
# *****************************************************************

# Scores predictions against the indexed label using the "precision" metric.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction", labelCol="target_indexed", metricName="precision"
)

# Parameter grid: metric fixed to "precision", tree depth tried at 10 and 20.
grid = ParamGridBuilder().baseOn([evaluator.metricName, "precision"]).addGrid(dt.maxDepth, [10, 20]).build()

print "Grid is build"

# Cross-validate the whole pipeline over the grid (numFolds is not set here).
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=grid, evaluator=evaluator)

print "CV Estimator is defined"

cv_model = cv.fit(dfTrain)

print "Model is fitted"

# Apply the fitted cross-validation model to the test frame.
df_test_pred = cv_model.transform(dfTest)
开发者ID:pifouuu,项目名称:ProjetBigData,代码行数:31,代码来源:script1.py


示例15: time

print "Done in {} second".format(round(tt,3))


# In[18]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


print "Fitting the classifier on selected features"
t0 = time()

# Label indexer, logistic regression on the selected features, and an
# evaluator configured for the "precision" metric.
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='selectedFeatures',labelCol='target_indexed',maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

# Index the training labels, cache the indexed frame, then fit the classifier.
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect).cache()
lrModel = lr.fit(dfTrainIndexed)

# Report the elapsed time since t0.
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[19]:

print "Testing precision of the model"
t0 = time()

# Vectorize the validation rows with the broadcast dictionary and cache the
# resulting (selectedFeatures, label) frame.
dfValidSelect=dfValid.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(['selectedFeatures','label']).cache()
开发者ID:pifouuu,项目名称:ProjetBigData,代码行数:31,代码来源:script3_ter.py


示例16: GBTClassifier

# Keep only the columns the classifier needs; seeded 80/20 train/test split.
data1 = output.select("label", "features")
(training, test) = data1.randomSplit([0.8, 0.2], seed = 12345)


# The former `impurityType = gini` argument was removed: `gini` was an
# undefined name (NameError before the classifier was even built) and
# `impurityType` is not among GBTClassifier's documented parameters.
gbt = GBTClassifier(maxIter = 30, maxDepth = 2)

pipeline = Pipeline(stages=[gbt])
pipelineModel = pipeline.fit(training)

# Score the hold-out rows and preview a few of them.
testPredictions = pipelineModel.transform(test)
testPredictions.select("prediction", "label", "features").show(5)

# NOTE: despite the "auc" in the variable names and in the printed text,
# the metric evaluated here is f1.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
evaluatorParaMap = {evaluator.metricName: "f1"}
aucTest = evaluator.evaluate(testPredictions, evaluatorParaMap)


from pyspark.ml.tuning import *

# 3-fold cross-validation over the number of boosting iterations.
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [1,5]).build()

cv = CrossValidator().setEstimator(pipeline).setEvaluator(evaluator).setEstimatorParamMaps(paramGrid).setNumFolds(3)

cvModel = cv.fit(training)
cvPredictions = cvModel.transform(test)
cvAUCTest = evaluator.evaluate(cvPredictions, evaluatorParaMap)

print("pipeline Test AUC: %g" % aucTest)
开发者ID:jianwei20,项目名称:Big-Data-Project,代码行数:31,代码来源:gbt_test.py


示例17: print

    data = sqlContext.read.format("libsvm")\
        .load("data/mllib/sample_multiclass_classification_data.txt")
    # Split the data into train and test
    
    data.show() 
    data.printSchema()
    data.select('features').show()
    splits = data.randomSplit([0.6, 0.4], 1234)
    train = splits[0]
    print (train.count())
    train.show()
    test = splits[1]
    
    
    # specify layers for the neural network:
    # input layer of size 4 (features), two intermediate of size 5 and 4
    # and output of size 3 (classes)
    layers = [4, 5, 4, 3]
    # create the trainer and set its parameters
    trainer = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
    # train the model
    model = trainer.fit(train)
    # compute precision on the test set
    result = model.transform(test)
    predictionAndLabels = result.select("prediction", "label")
    evaluator = MulticlassClassificationEvaluator(metricName="precision")
    print("Precision:" + str(evaluator.evaluate(predictionAndLabels)))
    # $example off$

    sc.stop()
开发者ID:hugomathien,项目名称:football-machine-learning,代码行数:30,代码来源:multilayer_perceptron_classification.py


示例18: main


#.........这里部分代码省略.........
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit random forest classifier on all the data
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
开发者ID:rjurney,项目名称:Agile_Data_Code_2,代码行数:67,代码来源:spark_model_with_airplanes.py


示例19: StringIndexer

from pyspark.ml.feature import StringIndexer
# Map the "label" column to the numeric "target_indexed" column; the mapping
# is fitted on, and applied to, the training frame.
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect)


# In[329]:

from pyspark.ml.classification import DecisionTreeClassifier
# Decision tree on the "bigramVectors" column against the indexed label.
dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol='target_indexed', maxDepth=10)


# In[330]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Scores predictions against the indexed label using the "precision" metric.
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')


# In[331]:

from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
# Parameter grid: metric fixed to "precision", tree depth tried at 10 and 20.
grid=(ParamGridBuilder()
     .baseOn([evaluator.metricName,'precision'])
     .addGrid(dt.maxDepth, [10,20])
     .build())
# Cross-validate the decision tree alone over the grid (numFolds is not set here).
cv = CrossValidator(estimator=dt, estimatorParamMaps=grid,evaluator=evaluator)


# In[332]:
开发者ID:pifouuu,项目名称:ProjetBigData,代码行数:30,代码来源:script3_bis.py


示例20: print

# Distribute the in-memory training samples.
data = sc.parallelize(Xtrain)

print("\nSplitting data into 60","%"," training and 40","%","testing")
# Seeded 60/40 split; only the training part is converted to a DataFrame here.
training_data, testing_data = data.randomSplit([0.6, 0.4], seed=0)
vectorizedData = training_data.toDF()
print("Creating MultilayerPerceptronClassifier...")
MLP = MultilayerPerceptronClassifier(labelCol='indexedLabel', featuresCol='indexedFeatures')
# The label indexer is fitted on the training frame only.
labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(vectorizedData)
# NOTE: the feature indexer is fitted on the full data set (data.toDF()),
# i.e. training and testing rows together.
featureIndexer = VectorIndexer(inputCol='features',
                               outputCol='indexedFeatures',
                               maxCategories=2).fit(data.toDF())
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, MLP])

# Grid over the layer layout: 3072 inputs, 200 or 500 hidden units, 10 outputs.
paramGrid_MLP = ParamGridBuilder().addGrid(MLP.layers,[[3072, neuron, 10] for neuron in [200, 500]]).build()
# Candidate models are compared by f1 on the indexed label.
evaluator = MulticlassClassificationEvaluator(labelCol='indexedLabel',
                                      predictionCol='prediction', metricName='f1')
print("Processing crossvalidation with 3-fold & 200/500 hidden layer units")
crossval = CrossValidator(estimator=pipeline,
                  estimatorParamMaps=paramGrid_MLP,
                  evaluator=evaluator,
                  numFolds=3)
# Time the cross-validated fit.
starttime = datetime.datetime.now()
CV_model = crossval.fit(vectorizedData)
# The third stage of the best pipeline is the fitted perceptron.
print CV_model.bestModel.stages[2]
print('Done on fitting model:%s'%(datetime.datetime.now()-starttime))

print("Transforming testing data...")
vectorized_test_data = testing_data.toDF()

#transformed_data1 = CV_model.transform(vectorizedData)
#print evaluator.getMetricName(), 'accuracy:', evaluator.evaluate(transformed_data1)
开发者ID:kyoyachuan,项目名称:datascienceProject,代码行数:32,代码来源:imgtrain_parrellal_load_final_version_multilayer.py



注:本文中的pyspark.ml.evaluation.MulticlassClassificationEvaluator类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python evaluation.RegressionEvaluator类代码示例发布时间:2022-05-26
下一篇:
Python clustering.KMeans类代码示例发布时间:2022-05-26
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap