Java PipelineStage Class Code Examples


This article collects typical usage examples of the Java class org.apache.spark.ml.PipelineStage. If you are wondering what the PipelineStage class is for and how it is used in practice, the curated examples below should help.



The PipelineStage class belongs to the org.apache.spark.ml package. The following 20 code examples show typical uses of the class, sorted by popularity.
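Every example below follows the same pattern: each Spark ML feature transformer and estimator is a PipelineStage, so heterogeneous stages can be packed into a single PipelineStage[] and passed to Pipeline.setStages. Here is a minimal sketch of that pattern, not taken from any of the examples; the Dataset<Row> named df and its "text" column are assumptions for illustration.

import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.feature.HashingTF;
import org.apache.spark.ml.feature.Tokenizer;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Two feature stages chained by column names: the tokenizer's output feeds the hasher.
Tokenizer tokenizer = new Tokenizer()
        .setInputCol("text")
        .setOutputCol("words");
HashingTF hashingTF = new HashingTF()
        .setInputCol(tokenizer.getOutputCol())
        .setOutputCol("features");

// Both stages fit in one PipelineStage[]; fit() runs them in declaration order.
Pipeline pipeline = new Pipeline()
        .setStages(new PipelineStage[]{tokenizer, hashingTF});
PipelineModel model = pipeline.fit(df);         // df: an assumed Dataset<Row> with a "text" column
Dataset<Row> transformed = model.transform(df);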

Example 1: createPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
private Pipeline createPipeline() {
	Tokenizer tokenizer = new Tokenizer()
		.setInputCol("featureStrings")
		.setOutputCol("tokens");
	CountVectorizer countVectorizer = new CountVectorizer()
		.setInputCol("tokens")
		.setOutputCol("features")
		.setMinDF((Double)params.getOrDefault(params.getMinFF()))
		.setVocabSize((Integer)params.getOrDefault(params.getNumFeatures()));  
	StringIndexer tagIndexer = new StringIndexer()
		.setInputCol("tag")
		.setOutputCol("label");
	
	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{tokenizer, countVectorizer, tagIndexer});
	return pipeline;
}
 
Developer: phuonglh | Project: vn.vitk | Lines: 21 | Source: CMM.java


Example 2: createPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
/**
 * Creates a processing pipeline.
 * @return a pipeline
 */
protected Pipeline createPipeline() {
	Tokenizer tokenizer = new Tokenizer()
		.setInputCol("text")
		.setOutputCol("tokens");
	CountVectorizer countVectorizer = new CountVectorizer()
		.setInputCol("tokens")
		.setOutputCol("features")
		.setMinDF((Double)params.getOrDefault(params.getMinFF()))
		.setVocabSize((Integer)params.getOrDefault(params.getNumFeatures()));  
	StringIndexer transitionIndexer = new StringIndexer()
		.setInputCol("transition")
		.setOutputCol("label");
	
	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[]{tokenizer, countVectorizer, transitionIndexer});
	return pipeline;
}
 
Developer: phuonglh | Project: vn.vitk | Lines: 21 | Source: TransitionClassifier.java


Example 3: testNetwork

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testNetwork() {
    DatasetFacade df = DatasetFacade.dataRows(sqlContext.read().json("src/test/resources/dl4jnetwork"));
    Pipeline p = new Pipeline().setStages(new PipelineStage[] {getAssembler(new String[] {"x", "y"}, "features")});
    DatasetFacade part2 = DatasetFacade.dataRows(p.fit(df.get()).transform(df.get()).select("features", "label"));

    ParamSerializer ps = new ParamHelper();
    MultiLayerConfiguration mc = getNNConfiguration();
    Collection<IterationListener> il = new ArrayList<>();
    il.add(new ScoreIterationListener(1));

    SparkDl4jNetwork sparkDl4jNetwork =
                    new SparkDl4jNetwork(mc, 2, ps, 1, il, true).setFeaturesCol("features").setLabelCol("label");

    SparkDl4jModel sm = sparkDl4jNetwork.fit(part2.get());
    MultiLayerNetwork mln = sm.getMultiLayerNetwork();
    Assert.assertNotNull(mln);
    DatasetFacade transformed = DatasetFacade.dataRows(sm.transform(part2.get()));
    List<?> rows = transformed.get().collectAsList();
    Assert.assertNotNull(sm.getTrainingStats());
    Assert.assertNotNull(rows);
}
 
Developer: deeplearning4j | Project: deeplearning4j | Lines: 23 | Source: SparkDl4jNetworkTest.java


Example 4: testNetworkLoader

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testNetworkLoader() throws Exception {
    DatasetFacade df = DatasetFacade.dataRows(sqlContext.read().json("src/test/resources/dl4jnetwork"));
    Pipeline p = new Pipeline().setStages(new PipelineStage[] {getAssembler(new String[] {"x", "y"}, "features")});
    DatasetFacade part2 = DatasetFacade.dataRows(p.fit(df.get()).transform(df.get()).select("features", "label"));

    ParamSerializer ps = new ParamHelper();
    MultiLayerConfiguration mc = getNNConfiguration();
    Collection<IterationListener> il = new ArrayList<>();
    il.add(new ScoreIterationListener(1));

    SparkDl4jNetwork sparkDl4jNetwork =
                    new SparkDl4jNetwork(mc, 2, ps, 1, il, true).setFeaturesCol("features").setLabelCol("label");

    String fileName = UUID.randomUUID().toString();
    SparkDl4jModel sm = sparkDl4jNetwork.fit(part2.get());
    sm.write().overwrite().save(fileName);
    SparkDl4jModel spdm = SparkDl4jModel.load(fileName);
    Assert.assertNotNull(spdm);

    File file1 = new File(fileName);
    File file2 = new File(fileName + "_metadata");
    FileUtils.deleteDirectory(file1);
    FileUtils.deleteDirectory(file2);
}
 
Developer: deeplearning4j | Project: deeplearning4j | Lines: 26 | Source: SparkDl4jNetworkTest.java


Example 5: testAutoencoderSave

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testAutoencoderSave() throws IOException {
    DatasetFacade df = DatasetFacade.dataRows(sqlContext.read().json("src/test/resources/autoencoders"));
    Pipeline p = new Pipeline().setStages(new PipelineStage[] {
                    getAssembler(new String[] {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"}, "features")});
    DatasetFacade part2 = DatasetFacade.dataRows(p.fit(df.get()).transform(df.get()).select("features"));

    AutoEncoder sparkDl4jNetwork = new AutoEncoder().setInputCol("features").setOutputCol("auto_encoded")
                    .setCompressedLayer(2).setTrainingMaster(new ParamHelper())
                    .setMultiLayerConfiguration(getNNConfiguration());

    AutoEncoderModel sm = sparkDl4jNetwork.fit(part2.get());

    String fileName = UUID.randomUUID().toString();
    sm.write().save(fileName);
    AutoEncoderModel spdm = AutoEncoderModel.load(fileName);
    Assert.assertNotNull(spdm);
    Assert.assertNotNull(spdm.transform(part2.get()));

    File file = new File(fileName);
    File file2 = new File(fileName + "_metadata");
    FileUtils.deleteDirectory(file);
    FileUtils.deleteDirectory(file2);
}
 
Developer: deeplearning4j | Project: deeplearning4j | Lines: 25 | Source: AutoEncoderNetworkTest.java


Example 6: createPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
static
private Pipeline createPipeline(FunctionType function, String formulaString){
	RFormula formula = new RFormula()
		.setFormula(formulaString);

	Predictor<?, ?, ?> predictor;

	switch(function){
		case CLASSIFICATION:
			predictor = new DecisionTreeClassifier()
				.setMinInstancesPerNode(10);
			break;
		case REGRESSION:
			predictor = new DecisionTreeRegressor()
				.setMinInstancesPerNode(10);
			break;
		default:
			throw new IllegalArgumentException();
	}

	predictor
		.setLabelCol(formula.getLabelCol())
		.setFeaturesCol(formula.getFeaturesCol());

	Pipeline pipeline = new Pipeline()
		.setStages(new PipelineStage[]{formula, predictor});

	return pipeline;
}
 
Developer: jpmml | Project: jpmml-sparkml-bootstrap | Lines: 30 | Source: Main.java
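The RFormula stage above is what determines the predictor's label and features columns. For context, a minimal sketch of what RFormula does on its own; the input Dataset<Row> named dataset and the columns "clicked", "country", and "hour" are hypothetical:

import org.apache.spark.ml.feature.RFormula;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// R-style formula: model "clicked" as a function of "country" and "hour".
RFormula formula = new RFormula()
        .setFormula("clicked ~ country + hour");

// fit() resolves the formula against the schema; transform() appends the
// assembled feature vector and label column that a downstream predictor consumes.
Dataset<Row> prepared = formula.fit(dataset).transform(dataset);
prepared.select(formula.getFeaturesCol(), formula.getLabelCol()).show();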


Example 7: testNetwork

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testNetwork() {
    DatasetFacade df = DatasetFacade.dataRows(sqlContext.read().json("src/test/resources/autoencoders"));
    Pipeline p = new Pipeline().setStages(new PipelineStage[] {
                    getAssembler(new String[] {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"}, "features")});
    DatasetFacade part2 = DatasetFacade.dataRows(p.fit(df.get()).transform(df.get()).select("features"));

    AutoEncoder sparkDl4jNetwork = new AutoEncoder().setInputCol("features").setOutputCol("auto_encoded")
                    .setCompressedLayer(2).setTrainingMaster(new ParamHelper())
                    .setMultiLayerConfiguration(getNNConfiguration());

    AutoEncoderModel sm = sparkDl4jNetwork.fit(part2.get());
    MultiLayerNetwork mln = sm.getNetwork();
    Assert.assertNotNull(mln);
}
 
Developer: deeplearning4j | Project: deeplearning4j | Lines: 16 | Source: AutoEncoderNetworkTest.java


Example 8: train

import org.apache.spark.ml.PipelineStage; // import the required package/class
/**
 * Trains a whitespace classifier model and saves the resulting pipeline model
 * to an external file. 
 * @param sentences a list of tokenized sentences.
 * @param pipelineModelFileName
 * @param numFeatures
 */
public void train(List<String> sentences, String pipelineModelFileName, int numFeatures) {
	List<WhitespaceContext> contexts = new ArrayList<WhitespaceContext>(sentences.size());
	int id = 0;
	for (String sentence : sentences) {
		sentence = sentence.trim();
		for (int j = 0; j < sentence.length(); j++) {
			char c = sentence.charAt(j);
			if (c == ' ' || c == '_') {
				WhitespaceContext context = new WhitespaceContext();
				context.setId(id++);
				context.setContext(extractContext(sentence, j));
				context.setLabel(c == ' ' ? 0d : 1d);
				contexts.add(context);
			}
		}
	}
	JavaRDD<WhitespaceContext> jrdd = jsc.parallelize(contexts);
	DataFrame df = sqlContext.createDataFrame(jrdd, WhitespaceContext.class);
	df.show(false);
	System.out.println("N = " + df.count());
	df.groupBy("label").count().show();
	
	org.apache.spark.ml.feature.Tokenizer tokenizer = new Tokenizer()
			.setInputCol("context").setOutputCol("words");
	HashingTF hashingTF = new HashingTF().setNumFeatures(numFeatures)
			.setInputCol(tokenizer.getOutputCol()).setOutputCol("features");
	LogisticRegression lr = new LogisticRegression().setMaxIter(100)
			.setRegParam(0.01);
	Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] {
			tokenizer, hashingTF, lr });
	model = pipeline.fit(df);
	
	try {
		model.write().overwrite().save(pipelineModelFileName);
	} catch (IOException e) {
		e.printStackTrace();
	}
	
	DataFrame predictions = model.transform(df);
	predictions.show();
	MulticlassClassificationEvaluator evaluator = new MulticlassClassificationEvaluator().setMetricName("precision");
	double accuracy = evaluator.evaluate(predictions);
	System.out.println("training accuracy = " + accuracy);
	
	LogisticRegressionModel lrModel = (LogisticRegressionModel) model.stages()[2];
	LogisticRegressionTrainingSummary trainingSummary = lrModel.summary();
	double[] objectiveHistory = trainingSummary.objectiveHistory();
	System.out.println("#(iterations) = " + objectiveHistory.length);
	for (double lossPerIteration : objectiveHistory) {
	  System.out.println(lossPerIteration);
	}
	
}
 
Developer: phuonglh | Project: vn.vitk | Lines: 61 | Source: WhitespaceClassifier.java
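Because the method above persists the fitted pipeline with model.write().overwrite().save(...), it can be restored later without retraining. A minimal sketch, assuming the same pipelineModelFileName and a DataFrame newContexts with the training schema:

import org.apache.spark.ml.PipelineModel;
import org.apache.spark.sql.DataFrame;

// Restore the persisted tokenizer -> HashingTF -> LogisticRegression pipeline.
PipelineModel restored = PipelineModel.load(pipelineModelFileName);

// Score new whitespace contexts; the "prediction" column comes from the final LR stage.
DataFrame scored = restored.transform(newContexts);
scored.select("context", "prediction").show();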


Example 9: testDecisionTreeRegressionPrediction

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testDecisionTreeRegressionPrediction() {
    // Load the data stored in LIBSVM format as a DataFrame.
    String datapath = "src/test/resources/regression_test.libsvm";

    Dataset<Row> data = spark.read().format("libsvm").load(datapath);

    // Split the data into training and test sets (30% held out for testing)
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex").setHandleInvalid("skip");

    DecisionTreeRegressor regressionModel =
            new DecisionTreeRegressor().setLabelCol("labelIndex").setFeaturesCol("features");

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, regressionModel});

    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    byte[] exportedModel = ModelExporter.export(sparkPipeline);

    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);
    List<Row> output = sparkPipeline.transform(testData).select("features", "prediction", "label").collectAsList();

    // compare predictions
    for (Row row : output) {
        Map<String, Object> data_ = new HashMap<>();
        data_.put("features", ((SparseVector) row.get(0)).toArray());
        data_.put("label", (row.get(2)).toString());
        transformer.transform(data_);
        System.out.println(data_);
        System.out.println(data_.get("prediction"));
        assertEquals((double) data_.get("prediction"), (double) row.get(1), EPSILON);
    }
}
 
Developer: flipkart-incubator | Project: spark-transformers | Lines: 42 | Source: DecisionTreeRegressionModelBridgePipelineTest.java


Example 10: testGradientBoostClassification

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testGradientBoostClassification() {
    // Load the data stored in LIBSVM format as a DataFrame.
    String datapath = "src/test/resources/binary_classification_test.libsvm";

    Dataset<Row> data = spark.read().format("libsvm").load(datapath);
    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");
    // Split the data into training and test sets (30% held out for testing)
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    // Train a gradient-boosted tree classifier.
    GBTClassifier classificationModel = new GBTClassifier().setLabelCol("labelIndex")
            .setFeaturesCol("features");

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classificationModel});

    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    // Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline);

    // Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    List<Row> sparkOutput = sparkPipeline.transform(testData).select("features", "prediction", "label").collectAsList();

    // compare predictions
    for (Row row : sparkOutput) {
        Map<String, Object> data_ = new HashMap<>();
        data_.put("features", ((SparseVector) row.get(0)).toArray());
        data_.put("label", (row.get(2)).toString());
        transformer.transform(data_);
        System.out.println(data_);
        System.out.println(data_.get("prediction") + " ," + row.get(1));
        assertEquals((double) data_.get("prediction"), (double) row.get(1), EPSILON);
    }
}
 
Developer: flipkart-incubator | Project: spark-transformers | Lines: 45 | Source: GradientBoostClassificationModelPipelineTest.java


Example 11: testDecisionTreeClassificationWithPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testDecisionTreeClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    String datapath = "src/test/resources/classification_test.libsvm";
    Dataset<Row> data = spark.read().format("libsvm").load(datapath);

    // Split the data into training and test sets (30% held out for testing)
    Dataset<Row>[] splits = data.randomSplit(new double[]{0.7, 0.3});

    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");

    // Train a DecisionTree model.
    DecisionTreeClassifier classificationModel = new DecisionTreeClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features");

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classificationModel});


    // Train model.  This also runs the indexer.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    List<Row> output = sparkPipeline.transform(testData).select("features", "label","prediction","rawPrediction").collectAsList();

    //compare predictions
    for (Row row : output) {
    	Map<String, Object> data_ = new HashMap<>();
    	double [] actualRawPrediction = ((DenseVector) row.get(3)).toArray();
        data_.put("features", ((SparseVector) row.get(0)).toArray());
        data_.put("label", (row.get(1)).toString());
        transformer.transform(data_);
        System.out.println(data_);
        System.out.println(data_.get("prediction"));
        assertEquals((double)data_.get("prediction"), (double)row.get(2), EPSILON);
        assertArrayEquals((double[]) data_.get("rawPrediction"), actualRawPrediction, EPSILON);
    }
}
 
Developer: flipkart-incubator | Project: spark-transformers | Lines: 54 | Source: DecisionTreeClassificationModelBridgePipelineTest.java


Example 12: testPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testPipeline() {
    // Prepare training documents, which are labeled.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
            createStructField("label", DoubleType, false)
    });
    Dataset<Row> trainingData = spark.createDataFrame(Arrays.asList(
            cr(0L, "a b c d e spark", 1.0),
            cr(1L, "b d", 0.0),
            cr(2L, "spark f g h", 1.0),
            cr(3L, "hadoop mapreduce", 0.0)
    ), schema);

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression.
    RegexTokenizer tokenizer = new RegexTokenizer()
            .setInputCol("text")
            .setOutputCol("words")
            .setPattern("\\s")
            .setGaps(true)
            .setToLowercase(false);

    HashingTF hashingTF = new HashingTF()
            .setNumFeatures(1000)
            .setInputCol(tokenizer.getOutputCol())
            .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
            .setMaxIter(10)
            .setRegParam(0.01);
    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{tokenizer, hashingTF, lr});

    // Fit the pipeline to training documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);


    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipelineModel);
    System.out.println(new String(exportedModel));

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //prepare test data
    StructType testSchema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
    });
    Dataset<Row> testData = spark.createDataFrame(Arrays.asList(
            cr(4L, "spark i j k"),
            cr(5L, "l m n"),
            cr(6L, "mapreduce spark"),
            cr(7L, "apache hadoop")
    ), testSchema);

    //verify that predictions for spark pipeline and exported pipeline are the same
    List<Row> predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collectAsList();
    for (Row r : predictions) {
        System.out.println(r);
        double sparkPipelineOp = r.getDouble(3);
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("text", r.getString(1));
        transformer.transform(data);
        double exportedPipelineOp = (double) data.get("prediction");
        double exportedPipelineProb = (double) data.get("probability");
        assertEquals(sparkPipelineOp, exportedPipelineOp, 0.01);
    }
}
 
Developer: flipkart-incubator | Project: spark-transformers | Lines: 70 | Source: PipelineBridgeTest.java


Example 13: testRandomForestRegressionWithPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testRandomForestRegressionWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/regression_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a RandomForest model.
    RandomForestRegressionModel regressionModel = new RandomForestRegressor()
            .setFeaturesCol("features").fit(trainingData);

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{regressionModel});

    // Train model.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkPipeline.transform(testData).select("features", "prediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Developer: flipkart-incubator | Project: spark-transformers | Lines: 42 | Source: RandomForestRegressionModelInfoAdapterBridgeTest.java


Example 14: testDecisionTreeRegressionWithPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testDecisionTreeRegressionWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/regression_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    // Train a DecisionTree model.
    DecisionTreeRegressor dt = new DecisionTreeRegressor()
            .setFeaturesCol("features");

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{dt});

    // Train model.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkPipeline.transform(testData).select("features", "prediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(0);
        double actual = row.getDouble(1);

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put(transformer.getInputKeys().iterator().next(), v.toArray());
        transformer.transform(inputData);
        double predicted = (double) inputData.get(transformer.getOutputKeys().iterator().next());

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Developer: flipkart-incubator | Project: spark-transformers | Lines: 42 | Source: DecisionTreeRegressionModelBridgeTest.java


Example 15: testPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testPipeline() {
    // Prepare training documents, which are labeled.
    StructType schema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
            createStructField("label", DoubleType, false)
    });
    DataFrame trainingData = sqlContext.createDataFrame(Arrays.asList(
            cr(0L, "a b c d e spark", 1.0),
            cr(1L, "b d", 0.0),
            cr(2L, "spark f g h", 1.0),
            cr(3L, "hadoop mapreduce", 0.0)
    ), schema);

    // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and LogisticRegression.
    RegexTokenizer tokenizer = new RegexTokenizer()
            .setInputCol("text")
            .setOutputCol("words")
            .setPattern("\\s")
            .setGaps(true)
            .setToLowercase(false);

    HashingTF hashingTF = new HashingTF()
            .setNumFeatures(1000)
            .setInputCol(tokenizer.getOutputCol())
            .setOutputCol("features");
    LogisticRegression lr = new LogisticRegression()
            .setMaxIter(10)
            .setRegParam(0.01);
    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{tokenizer, hashingTF, lr});

    // Fit the pipeline to training documents.
    PipelineModel sparkPipelineModel = pipeline.fit(trainingData);


    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipelineModel, trainingData);
    System.out.println(new String(exportedModel));

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //prepare test data
    StructType testSchema = createStructType(new StructField[]{
            createStructField("id", LongType, false),
            createStructField("text", StringType, false),
    });
    DataFrame testData = sqlContext.createDataFrame(Arrays.asList(
            cr(4L, "spark i j k"),
            cr(5L, "l m n"),
            cr(6L, "mapreduce spark"),
            cr(7L, "apache hadoop")
    ), testSchema);

    //verify that predictions for spark pipeline and exported pipeline are the same
    Row[] predictions = sparkPipelineModel.transform(testData).select("id", "text", "probability", "prediction").collect();
    for (Row r : predictions) {
        System.out.println(r);
        double sparkPipelineOp = r.getDouble(3);
        Map<String, Object> data = new HashMap<String, Object>();
        data.put("text", r.getString(1));
        transformer.transform(data);
        double exportedPipelineOp = (double) data.get("prediction");
        double exportedPipelineProb = (double) data.get("probability");
        assertEquals(sparkPipelineOp, exportedPipelineOp, EPSILON);
    }
}
 
Developer: flipkart-incubator | Project: spark-transformers | Lines: 70 | Source: PipelineBridgeTest.java


Example 16: testRandomForestClassificationWithPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testRandomForestClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");

    // Train a RandomForest model.
    RandomForestClassifier classifier = new RandomForestClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features")
            .setPredictionCol("prediction")
            .setRawPredictionCol("rawPrediction")
            .setProbabilityCol("probability");


    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classifier});

    // Train model.  This also runs the indexer.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkPipeline.transform(testData).select("label", "features", "prediction", "rawPrediction", "probability").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(1);
        double actual = row.getDouble(2);
        double [] actualProbability = ((Vector) row.get(4)).toArray();
        double[] actualRaw = ((Vector) row.get(3)).toArray();

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put("features", v.toArray());
        inputData.put("label", row.get(0).toString());
        transformer.transform(inputData);
        double predicted = (double) inputData.get("prediction");
        double[] probability = (double[]) inputData.get("probability");
        double[] rawPrediction = (double[]) inputData.get("rawPrediction");

        assertEquals(actual, predicted, EPSILON);
        assertArrayEquals(actualProbability, probability, EPSILON);
        assertArrayEquals(actualRaw, rawPrediction, EPSILON);
    }
}
 
Developer: flipkart-incubator | Project: spark-transformers | Lines: 58 | Source: RandomForestClassificationModelInfoAdapterBridgeTest.java


Example 17: testDecisionTreeClassificationWithPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void testDecisionTreeClassificationWithPipeline() {
    // Load the data stored in LIBSVM format as a DataFrame.
    DataFrame data = sqlContext.read().format("libsvm").load("src/test/resources/classification_test.libsvm");

    // Split the data into training and test sets (30% held out for testing)
    DataFrame[] splits = data.randomSplit(new double[]{0.7, 0.3});
    DataFrame trainingData = splits[0];
    DataFrame testData = splits[1];

    StringIndexer indexer = new StringIndexer()
            .setInputCol("label")
            .setOutputCol("labelIndex");

    // Train a DecisionTree model.
    DecisionTreeClassifier classificationModel = new DecisionTreeClassifier()
            .setLabelCol("labelIndex")
            .setFeaturesCol("features");

    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[]{indexer, classificationModel});

    // Train model.  This also runs the indexer.
    PipelineModel sparkPipeline = pipeline.fit(trainingData);

    //Export this model
    byte[] exportedModel = ModelExporter.export(sparkPipeline, null);

    //Import and get Transformer
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    Row[] sparkOutput = sparkPipeline.transform(testData).select("label", "features", "prediction").collect();

    //compare predictions
    for (Row row : sparkOutput) {
        Vector v = (Vector) row.get(1);
        double actual = row.getDouble(2);

        Map<String, Object> inputData = new HashMap<String, Object>();
        inputData.put("features", v.toArray());
        inputData.put("label", row.get(0).toString());
        transformer.transform(inputData);
        double predicted = (double) inputData.get("prediction");

        assertEquals(actual, predicted, EPSILON);
    }
}
 
Developer: flipkart-incubator | Project: spark-transformers | Lines: 48 | Source: DecisionTreeClassificationModelBridgeTest.java


Example 18: shouldWorkCorrectlyWithPipeline

import org.apache.spark.ml.PipelineStage; // import the required package/class
@Test
public void shouldWorkCorrectlyWithPipeline() {

    //Prepare test data
    DataFrame df = getDataFrame();
    Row[] originalData = df.orderBy("id").select("id", "a", "b", "c", "d").collect();

    //prepare transformation pipeline
    FillNAValuesTransformer fillNAValuesTransformer = new FillNAValuesTransformer();
    fillNAValuesTransformer.setNAValueMap( getFillNAMap() );
    Pipeline pipeline = new Pipeline();
    pipeline.setStages(new PipelineStage[]{fillNAValuesTransformer});
    PipelineModel model = pipeline.fit(df);

    //predict
    Row[] sparkOutput = model.transform(df).orderBy("id").select("id", "a", "b", "c", "d").collect();

    //export
    byte[] exportedModel = ModelExporter.export(model, df);
    Transformer transformer = ModelImporter.importAndGetTransformer(exportedModel);

    //verify correctness
    assertTrue(transformer.getInputKeys().size() == 4);
    assertTrue(transformer.getInputKeys().containsAll(Arrays.asList("a", "b", "c", "d")));
    assertTrue(transformer.getOutputKeys().size() == 4);
    assertTrue(transformer.getOutputKeys().containsAll(Arrays.asList("a", "b", "c", "d")));
    for( int i=0; i < originalData.length; i++) {
        Map<String, Object> input = new HashMap<String, Object>();
        input.put("a", originalData[i].get(1));
        input.put("b", originalData[i].get(2));
        input.put("c", originalData[i].get(3));
        input.put("d", originalData[i].get(4));

        transformer.transform(input);

        assertEquals(sparkOutput[i].get(1), input.get("a"));
        assertEquals(sparkOutput[i].get(2), input.get("b"));
        assertEquals(sparkOutput[i].get(3), input.get("c"));
        assertEquals(sparkOutput[i].get(4), input.get("d"));
    }
}
 
Developer: flipkart-incubator | Project: spark-transformers | Lines: 42 | Source: FillNAValuesTransformerBridgeTest.java


Example 19: trainModel

import org.apache.spark.ml.PipelineStage; // import the required package/class
private static Transformer trainModel(SQLContext sqlContxt, DataFrame train, String tokenizerOutputCol, boolean useCV) {
		train = getCommonFeatures(sqlContxt, train, TOKENIZER_OUTPUT);
		
		VectorAssembler featuresForNorm = new VectorAssembler()
				.setInputCols(new String[] {"commonfeatures"})
				.setOutputCol("commonfeatures_norm");
		
		Normalizer norm = new Normalizer()
				.setInputCol(featuresForNorm.getOutputCol())
				.setOutputCol("norm_features");
		
		HashingTF hashingTF = new HashingTF()
				.setInputCol("ngrams")
				.setOutputCol("tf");
		
		IDF idf = new IDF()
				.setInputCol(hashingTF.getOutputCol())
				.setOutputCol("idf");
		
		// Learn a mapping from words to Vectors.
		Word2Vec word2Vec = new Word2Vec()
		  .setInputCol(tokenizerOutputCol)
		  .setOutputCol("w2v");
		
		List<String> assmeblerInput = new ArrayList<>();
			assmeblerInput.add("commonfeatures");
//			assmeblerInput.add(norm.getOutputCol());
//			assmeblerInput.add(idf.getOutputCol());
//			assmeblerInput.add(word2Vec.getOutputCol());
			assmeblerInput.add(W2V_DB);
		
		VectorAssembler assembler = new VectorAssembler()
				  .setInputCols(assmeblerInput.toArray(new String[assmeblerInput.size()]))
				  .setOutputCol("features");
		
		LogisticRegression lr = new LogisticRegression();
		
//		int[] layers = new int[] {108, 10, 10, 2};
//		// create the trainer and set its parameters
//		MultilayerPerceptronClassifier perceptron = new MultilayerPerceptronClassifier()
//		  .setLayers(layers)
//		  .setBlockSize(128)
//		  .setSeed(1234L)
//		  .setMaxIter(100);
//				.setRegParam(0.03);
//				.setElasticNetParam(0.3);
		
//			ngramTransformer, hashingTF, idf,
		PipelineStage[] pipelineStages = new PipelineStage[] {  /*hashingTF, idf,  word2Vec,*/  w2vModel, /*featuresForNorm, norm, */assembler, lr};
		Pipeline pipeline = new Pipeline()
				  .setStages(pipelineStages);
		
		stagesToString = ("commonfeatures_suff1x\t" + StringUtils.join(pipelineStages, "\t")).replaceAll("([A-Za-z]+)_[0-9A-Za-z]+", "$1");
					
		// We use a ParamGridBuilder to construct a grid of parameters to search over.
		// With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
		// this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
		ParamMap[] paramGrid = new ParamGridBuilder()
//				.addGrid(word2Vec.vectorSize(), new int[] {100, 500})
//				.addGrid(word2Vec.minCount(), new int[] {2, 3, 4})
//				.addGrid(ngramTransformer.n(), new int[] {2, 3})
//				.addGrid(hashingTF.numFeatures(), new int[] {1000, 2000})
			.addGrid(lr.maxIter(), new int[] {10})
//		    .addGrid(lr.regParam(), new double[] {0.0, 0.1, 0.4, 0.8, 1, 3, 5, 10})
//		    .addGrid(lr.fitIntercept())
//		    .addGrid(lr.elasticNetParam(), new double[] {0.0, 0.2, 0.5, 0.8, 1.0} )
//			    .addGrid(idf.minDocFreq(), new int[]{2, 4})
		    .build();
		
		Transformer model;
		
		if (!useCV) {
			model = trainWithValidationSplit(train, pipeline, paramGrid);
		} else {
			model = trainWithCrossValidation(train, pipeline, paramGrid);
		}
		
		return model;
	}
 
Developer: mhardalov | Project: news-credibility | Lines: 80 | Source: NewsCredibilityMain.java
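trainWithValidationSplit and trainWithCrossValidation are project helpers whose bodies are not shown here. A sketch of what the cross-validation branch could look like using Spark's built-in CrossValidator; the evaluator choice and fold count are assumptions, not the project's actual settings:

import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.Transformer;
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator;
import org.apache.spark.ml.param.ParamMap;
import org.apache.spark.ml.tuning.CrossValidator;
import org.apache.spark.ml.tuning.CrossValidatorModel;
import org.apache.spark.sql.DataFrame;

private static Transformer trainWithCrossValidation(DataFrame train, Pipeline pipeline, ParamMap[] paramGrid) {
    CrossValidator cv = new CrossValidator()
            .setEstimator(pipeline)                            // the whole pipeline is tuned as one estimator
            .setEstimatorParamMaps(paramGrid)                  // grid built with ParamGridBuilder above
            .setEvaluator(new BinaryClassificationEvaluator()) // assumption: binary credibility labels
            .setNumFolds(3);                                   // assumption: 3 folds
    CrossValidatorModel cvModel = cv.fit(train);
    return cvModel.bestModel();                                // best PipelineModel found on the grid
}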


Example 20: main

import org.apache.spark.ml.PipelineStage; // import the required package/class
