本文整理汇总了Python中pyspark.ml.linalg.Vectors类的典型用法代码示例。如果您正苦于以下问题:Python Vectors类的具体用法?Python Vectors怎么用?Python Vectors使用的例子?那么恭喜您,这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Vectors类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_save_load_simple_estimator
def test_save_load_simple_estimator(self):
    """Round-trip a CrossValidator and its fitted model through save/load.

    A CrossValidator over LogisticRegression is fitted on a tiny labelled
    dataset, then both the estimator and the resulting model are saved under
    a temporary directory and loaded back.  The reloaded objects must carry
    the same uids (and, for the estimator, the same param maps).
    """
    import shutil  # local import: only needed for temp-dir cleanup below

    temp_path = tempfile.mkdtemp()
    try:
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
        self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
    finally:
        # mkdtemp() leaves deletion to the caller; remove the directory even
        # when an assertion above fails so test runs do not accumulate files.
        shutil.rmtree(temp_path, ignore_errors=True)
开发者ID:Brett-A,项目名称:spark,代码行数:29,代码来源:test_tuning.py
示例2: test_equals
def test_equals(self):
    """Exercise Vectors._equals with one matching and three non-matching inputs.

    The first (indices, values) pair is always ``[1, 2, 4]`` / ``[1., 3., 2.]``;
    the second pair always uses indices ``0..4`` with varying value lists.
    """
    sparse_indices = [1, 2, 4]
    sparse_values = [1., 3., 2.]

    matching_values = [0., 1., 3., 0., 2.]
    non_matching_values = (
        [0., 3., 1., 0., 2.],
        [0., 3., 0., 2.],
        [0., 1., 3., 2., 2.],
    )

    self.assertTrue(
        Vectors._equals(sparse_indices, sparse_values, list(range(5)), matching_values))
    for other_values in non_matching_values:
        self.assertFalse(
            Vectors._equals(sparse_indices, sparse_values, list(range(5)), other_values))
开发者ID:JkSelf,项目名称:spark,代码行数:7,代码来源:test_linalg.py
示例3: test_expose_sub_models
def test_expose_sub_models(self):
    """Check that TrainValidationSplit sub-models are collected and persisted.

    With ``collectSubModels=True`` the fitted model must expose one sub-model
    per param map.  Saving without options must keep the sub-models, copying
    a reloaded model must keep them too, and saving with
    ``persistSubModels=false`` must yield a reloaded model without them.
    """
    import shutil  # local import: only needed for temp-dir cleanup below

    temp_path = tempfile.mkdtemp()
    try:
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                                   collectSubModels=True)
        tvsModel = tvs.fit(dataset)
        self.assertEqual(len(tvsModel.subModels), len(grid))

        # Saving without any writer option is expected to persist sub-models.
        testSubPath = temp_path + "/testTrainValidationSplitSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        tvsModel.save(savingPathWithSubModels)
        tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
        self.assertEqual(len(tvsModel3.subModels), len(grid))
        tvsModel4 = tvsModel3.copy()
        self.assertEqual(len(tvsModel4.subModels), len(grid))

        # Explicitly opting out must drop the sub-models on reload.
        savingPathWithoutSubModels = testSubPath + "cvModel2"
        tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
        self.assertIsNone(tvsModel2.subModels)

        # Both sequences were asserted above to have len(grid) entries, so
        # pairing them with zip compares every sub-model.
        for fitted, reloaded in zip(tvsModel.subModels, tvsModel3.subModels):
            self.assertEqual(fitted.uid, reloaded.uid)
    finally:
        # mkdtemp() leaves deletion to the caller; clean up even on failure.
        shutil.rmtree(temp_path, ignore_errors=True)
开发者ID:Brett-A,项目名称:spark,代码行数:33,代码来源:test_tuning.py
示例4: test_java_object_gets_detached
def test_java_object_gets_detached(self):
    """Verify that ``__del__`` on a wrapper detaches its underlying Java object.

    After ``model.__del__()`` the model's Java object must no longer be
    reachable through the gateway while the summary's still is; after
    ``summary.__del__()`` neither may be reachable.
    """
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                          fitIntercept=False)

    model = lr.fit(df)
    summary = model.summary

    self.assertIsInstance(model, JavaWrapper)
    self.assertIsInstance(summary, JavaWrapper)
    self.assertIsInstance(model, JavaParams)
    self.assertNotIsInstance(summary, JavaParams)

    error_no_object = 'Target Object ID does not exist for this gateway'

    self.assertIn("LinearRegression_", model._java_obj.toString())
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    model.__del__()

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    # The summary is a separate wrapper and must survive the model's deletion.
    self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

    try:
        summary.__del__()
    except Exception:
        # Best-effort: a failure while deleting the summary is tolerated, but
        # KeyboardInterrupt/SystemExit must still propagate (a bare
        # ``except:`` would have swallowed them).
        pass

    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        model._java_obj.toString()
    with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
        summary._java_obj.toString()
开发者ID:Brett-A,项目名称:spark,代码行数:35,代码来源:test_wrapper.py
示例5: test_persistence
def test_persistence(self):
    """Test save/load for LDA, LocalLDAModel and DistributedLDAModel.

    Each object is saved under a temporary directory, loaded back, and
    compared with the original via ``self._compare``.
    """
    df = self.spark.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])

    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())

    path = tempfile.mkdtemp()
    try:
        # Define paths
        lda_path = path + "/lda"
        dist_model_path = path + "/distLDAModel"
        local_model_path = path + "/localLDAModel"

        # Test LDA
        lda.save(lda_path)
        lda2 = LDA.load(lda_path)
        self._compare(lda, lda2)

        # Test DistributedLDAModel
        distributedModel.save(dist_model_path)
        distributedModel2 = DistributedLDAModel.load(dist_model_path)
        self._compare(distributedModel, distributedModel2)

        # Test LocalLDAModel
        localModel.save(local_model_path)
        localModel2 = LocalLDAModel.load(local_model_path)
        self._compare(localModel, localModel2)
    finally:
        # Clean up in ``finally`` so a failing comparison above does not
        # leave the temporary directory behind.
        try:
            rmtree(path)
        except OSError:
            pass
开发者ID:Brett-A,项目名称:spark,代码行数:34,代码来源:test_algorithms.py
示例6: test_output_columns
def test_output_columns(self):
    """Assert the column names produced by a fitted OneVsRest model's transform."""
    rows = [
        (0.0, Vectors.dense(1.0, 0.8)),
        (1.0, Vectors.sparse(2, [], [])),
        (2.0, Vectors.dense(0.5, 0.5)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features"])

    base_classifier = LogisticRegression(maxIter=5, regParam=0.01)
    one_vs_rest = OneVsRest(classifier=base_classifier, parallelism=1)
    fitted = one_vs_rest.fit(df)

    transformed = fitted.transform(df)
    self.assertEqual(
        transformed.columns,
        ["label", "features", "rawPrediction", "prediction"])
开发者ID:Brett-A,项目名称:spark,代码行数:10,代码来源:test_algorithms.py
示例7: test_copy
def test_copy(self):
    """Check that ``copy`` with extra params works for OneVsRest and its model.

    Copying the estimator with an overridden ``maxIter`` must leave the
    original classifier untouched, and copying the fitted model with an
    overridden ``predictionCol`` must be reflected by the copy's getter.
    """
    rows = [
        (0.0, Vectors.dense(1.0, 0.8)),
        (1.0, Vectors.sparse(2, [], [])),
        (2.0, Vectors.dense(0.5, 0.5)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features"])

    lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=lr)
    ovr_copy = ovr.copy({lr.maxIter: 10})
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr_copy.getClassifier().getMaxIter(), 10)

    fitted = ovr.fit(df)
    fitted_copy = fitted.copy({fitted.predictionCol: "indexed"})
    self.assertEqual(fitted_copy.getPredictionCol(), "indexed")
开发者ID:Brett-A,项目名称:spark,代码行数:13,代码来源:test_algorithms.py
示例8: test_parallelism_doesnt_change_output
def test_parallelism_doesnt_change_output(self):
    """Fit OneVsRest with parallelism 1 and 2 and compare the per-class models.

    Coefficients and intercepts of corresponding sub-models must agree
    within an absolute tolerance of 1e-4.
    """
    rows = [
        (0.0, Vectors.dense(1.0, 0.8)),
        (1.0, Vectors.sparse(2, [], [])),
        (2.0, Vectors.dense(0.5, 0.5)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features"])

    ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
    modelPar1 = ovrPar1.fit(df)
    ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
    modelPar2 = ovrPar2.fit(df)

    for position, serial_model in enumerate(modelPar1.models):
        same_coefficients = np.allclose(
            serial_model.coefficients.toArray(),
            modelPar2.models[position].coefficients.toArray(),
            atol=1E-4)
        self.assertTrue(same_coefficients)
        same_intercept = np.allclose(
            serial_model.intercept, modelPar2.models[position].intercept, atol=1E-4)
        self.assertTrue(same_intercept)
开发者ID:Brett-A,项目名称:spark,代码行数:13,代码来源:test_algorithms.py
示例9: test_support_for_weightCol
def test_support_for_weightCol(self):
    """Fit OneVsRest with ``weightCol`` set, for two kinds of base classifier."""
    rows = [
        (0.0, Vectors.dense(1.0, 0.8), 1.0),
        (1.0, Vectors.sparse(2, [], []), 1.0),
        (2.0, Vectors.dense(0.5, 0.5), 1.0),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features", "weight"])

    # classifier inherits hasWeightCol
    weighted_ovr = OneVsRest(
        classifier=LogisticRegression(maxIter=5, regParam=0.01),
        weightCol="weight")
    self.assertIsNotNone(weighted_ovr.fit(df))

    # classifier doesn't inherit hasWeightCol
    tree_ovr = OneVsRest(classifier=DecisionTreeClassifier(), weightCol="weight")
    self.assertIsNotNone(tree_ovr.fit(df))
开发者ID:Brett-A,项目名称:spark,代码行数:13,代码来源:test_algorithms.py
示例10: test_offset
def test_offset(self):
    """Fit a Poisson GLM with weight and offset columns and check its parameters.

    The fitted coefficients and intercept are compared against fixed
    reference values with an absolute tolerance of 1e-4.
    """
    rows = [
        (0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
        (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
        (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
        (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "weight", "offset", "features"])

    estimator = GeneralizedLinearRegression(
        family="poisson", weightCol="weight", offsetCol="offset")
    fitted = estimator.fit(df)

    coefficients_ok = np.allclose(
        fitted.coefficients.toArray(), [0.664647, -0.3192581], atol=1E-4)
    self.assertTrue(coefficients_ok)
    intercept_ok = np.isclose(fitted.intercept, -1.561613, atol=1E-4)
    self.assertTrue(intercept_ok)
开发者ID:Brett-A,项目名称:spark,代码行数:13,代码来源:test_algorithms.py
示例11: ztest_toPandas
def ztest_toPandas(self):
    """Convert a DataFrame of vectors via ``self.converter.toPandas`` and inspect it.

    The first entry of the resulting ``features`` column must be a
    ``csr_matrix`` of shape (3, 2) whose first row holds 0.1 and 0.2.
    """
    rows = [
        (Vectors.dense([0.1, 0.2]),),
        (Vectors.sparse(2, {0: 0.3, 1: 0.4}),),
        (Vectors.sparse(2, {0: 0.5, 1: 0.6}),),
    ]
    df = self.sql.createDataFrame(rows, ["features"])
    self.assertEqual(df.count(), 3)

    frame = self.converter.toPandas(df)
    self.assertEqual(len(frame), 3)

    is_sparse = isinstance(frame.features[0], csr_matrix)
    failure_message = ("Expected pd.features[0] to be csr_matrix but found: %s" %
                       type(frame.features[0]))
    self.assertTrue(is_sparse, failure_message)

    self.assertEqual(frame.features[0].shape[0], 3)
    self.assertEqual(frame.features[0].shape[1], 2)
    self.assertEqual(frame.features[0][0, 0], 0.1)
    self.assertEqual(frame.features[0][0, 1], 0.2)
开发者ID:Sandy4321,项目名称:spark-sklearn,代码行数:15,代码来源:converter_test.py
示例12: test_binomial_logistic_regression_with_bound
def test_binomial_logistic_regression_with_bound(self):
    """Fit a weighted LogisticRegression with coefficient and intercept bounds.

    The fitted coefficients and intercept are compared against fixed
    reference values with an absolute tolerance of 1e-4.
    """
    rows = [
        (1.0, 1.0, Vectors.dense(0.0, 5.0)),
        (0.0, 2.0, Vectors.dense(1.0, 2.0)),
        (1.0, 3.0, Vectors.dense(2.0, 1.0)),
        (0.0, 4.0, Vectors.dense(3.0, 3.0)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "weight", "features"])

    coefficient_lower_bounds = Matrices.dense(1, 2, [-1.0, -1.0])
    intercept_upper_bounds = Vectors.dense(0.0)
    estimator = LogisticRegression(regParam=0.01, weightCol="weight",
                                   lowerBoundsOnCoefficients=coefficient_lower_bounds,
                                   upperBoundsOnIntercepts=intercept_upper_bounds)
    fitted = estimator.fit(df)

    coefficients_ok = np.allclose(
        fitted.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4)
    self.assertTrue(coefficients_ok)
    intercept_ok = np.isclose(fitted.intercept, 0.0, atol=1E-4)
    self.assertTrue(intercept_ok)
开发者ID:Brett-A,项目名称:spark,代码行数:15,代码来源:test_algorithms.py
示例13: test_bisecting_kmeans_summary
def test_bisecting_kmeans_summary(self):
    """Fit BisectingKMeans with k=2 and check the fields of its training summary."""
    points = [
        (Vectors.dense(1.0),),
        (Vectors.dense(5.0),),
        (Vectors.dense(10.0),),
        (Vectors.sparse(1, [], []),),
    ]
    df = self.spark.createDataFrame(points, ["features"])

    fitted = BisectingKMeans(k=2).fit(df)
    self.assertTrue(fitted.hasSummary)

    summary = fitted.summary
    self.assertTrue(isinstance(summary.predictions, DataFrame))
    self.assertEqual(summary.featuresCol, "features")
    self.assertEqual(summary.predictionCol, "prediction")
    self.assertTrue(isinstance(summary.cluster, DataFrame))
    self.assertEqual(len(summary.clusterSizes), 2)
    self.assertEqual(summary.k, 2)
    self.assertEqual(summary.numIter, 20)
开发者ID:Brett-A,项目名称:spark,代码行数:15,代码来源:test_training_summary.py
示例14: test_kmeans_summary
def test_kmeans_summary(self):
    """Fit KMeans with k=2, seed=1 and check the fields of its training summary."""
    points = [
        (Vectors.dense([0.0, 0.0]),),
        (Vectors.dense([1.0, 1.0]),),
        (Vectors.dense([9.0, 8.0]),),
        (Vectors.dense([8.0, 9.0]),),
    ]
    df = self.spark.createDataFrame(points, ["features"])

    fitted = KMeans(k=2, seed=1).fit(df)
    self.assertTrue(fitted.hasSummary)

    summary = fitted.summary
    self.assertTrue(isinstance(summary.predictions, DataFrame))
    self.assertEqual(summary.featuresCol, "features")
    self.assertEqual(summary.predictionCol, "prediction")
    self.assertTrue(isinstance(summary.cluster, DataFrame))
    self.assertEqual(len(summary.clusterSizes), 2)
    self.assertEqual(summary.k, 2)
    self.assertEqual(summary.numIter, 1)
开发者ID:Brett-A,项目名称:spark,代码行数:15,代码来源:test_training_summary.py
示例15: test_kmean_pmml_basic
def test_kmean_pmml_basic(self):
    """Export a fitted KMeans model as PMML and check that text was written.

    Most of the validation is done in the Scala side, here we just check
    that we output text rather than parquet (e.g. that the format flag
    was respected).
    """
    import shutil  # local import: only needed for temp-dir cleanup below

    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)

    path = tempfile.mkdtemp()
    try:
        km_path = path + "/km-pmml"
        model.write().format("pmml").save(km_path)
        # Read the export back as plain text lines and join them for searching.
        pmml_text_list = self.sc.textFile(km_path).collect()
        pmml_text = "\n".join(pmml_text_list)
        self.assertIn("Apache Spark", pmml_text)
        self.assertIn("PMML", pmml_text)
    finally:
        # mkdtemp() leaves deletion to the caller; clean up even on failure.
        shutil.rmtree(path, ignore_errors=True)
开发者ID:Brett-A,项目名称:spark,代码行数:16,代码来源:test_persistence.py
示例16: test_vector_size_hint
def test_vector_size_hint(self):
    """Check VectorSizeHint's size setter/getter and its transform output.

    With ``handleInvalid="skip"`` and size 3, the first row of the
    transformed DataFrame must hold the three-element input vector.
    """
    rows = [
        (0, Vectors.dense([0.0, 10.0, 0.5])),
        (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
        (2, Vectors.dense([2.0, 12.0])),
    ]
    df = self.spark.createDataFrame(rows, ["id", "vector"])

    hint = VectorSizeHint(inputCol="vector", handleInvalid="skip")
    hint.setSize(3)
    self.assertEqual(hint.getSize(), 3)

    first_row = hint.transform(df).head()
    self.assertEqual(first_row.vector, DenseVector([0.0, 10.0, 0.5]))
开发者ID:Brett-A,项目名称:spark,代码行数:16,代码来源:test_feature.py
示例17: test_linear_regression_pmml_basic
def test_linear_regression_pmml_basic(self):
    """Export a fitted LinearRegression model as PMML and check that text was written.

    Most of the validation is done in the Scala side, here we just check
    that we output text rather than parquet (e.g. that the format flag
    was respected).
    """
    import shutil  # local import: only needed for temp-dir cleanup below

    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LinearRegression(maxIter=1)
    model = lr.fit(df)

    path = tempfile.mkdtemp()
    try:
        lr_path = path + "/lr-pmml"
        model.write().format("pmml").save(lr_path)
        # Read the export back as plain text lines and join them for searching.
        pmml_text_list = self.sc.textFile(lr_path).collect()
        pmml_text = "\n".join(pmml_text_list)
        self.assertIn("Apache Spark", pmml_text)
        self.assertIn("PMML", pmml_text)
    finally:
        # mkdtemp() leaves deletion to the caller; clean up even on failure.
        shutil.rmtree(path, ignore_errors=True)
开发者ID:JingchengDu,项目名称:spark,代码行数:16,代码来源:test_persistence.py
示例18: test_tweedie_distribution
def test_tweedie_distribution(self):
    """Fit a tweedie GLM twice (before and after setting linkPower) and check parameters.

    Coefficients and intercepts of both fits are compared against fixed
    reference values with an absolute tolerance of 1e-4.
    """
    rows = [
        (1.0, Vectors.dense(0.0, 0.0)),
        (1.0, Vectors.dense(1.0, 2.0)),
        (2.0, Vectors.dense(0.0, 0.0)),
        (2.0, Vectors.dense(1.0, 1.0)),
    ]
    df = self.spark.createDataFrame(rows, ["label", "features"])

    glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)

    first_fit = glr.fit(df)
    first_coefficients_ok = np.allclose(
        first_fit.coefficients.toArray(), [-0.4645, 0.3402], atol=1E-4)
    self.assertTrue(first_coefficients_ok)
    self.assertTrue(np.isclose(first_fit.intercept, 0.7841, atol=1E-4))

    # Second fit reuses the same estimator with linkPower set to -1.0.
    second_fit = glr.setLinkPower(-1.0).fit(df)
    second_coefficients_ok = np.allclose(
        second_fit.coefficients.toArray(), [-0.6667, 0.5], atol=1E-4)
    self.assertTrue(second_coefficients_ok)
    self.assertTrue(np.isclose(second_fit.intercept, 0.6667, atol=1E-4))
开发者ID:Brett-A,项目名称:spark,代码行数:16,代码来源:test_algorithms.py
示例19: reduce
def reduce(inputpath, alg, k):
    """Run a dimensionality reduction over a space-separated file and summarise it.

    The input file is scanned once to count its rows and the number of
    space-separated fields on its last line.  If ``k`` is not smaller than
    that field count, nothing is computed and an error message is returned.
    Otherwise the file is loaded through the module-level ``sc`` as rows of
    (label, dense feature vector), ``pca`` is invoked when ``alg`` is
    ``"pca"``, and the file ``<inputdir>/<alg><k>_Data`` is read back to
    report its dimensions and size.

    Args:
        inputpath: Path of the input file; the first field of each line is
            used as the label and the remaining fields are parsed as floats.
        alg: Algorithm name; only ``"pca"`` triggers a computation here.
        k: Requested number of output features (anything ``int()`` accepts).

    Returns:
        A dict with keys ``'n_data'``, ``'n_features'`` and ``'result'``.
        On the error path ``n_data`` is the integer row count of the input
        and ``n_features`` is ``0``; on success both are strings describing
        the output file.
    """
    n_data = 0
    n_features = 0
    result = "successful!"
    inputdir = os.path.dirname(inputpath)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("inputdir: " + inputdir + result)

    # Start at 0 so an empty input file takes the error path below instead
    # of failing on an unbound name.
    input_n = 0
    with open(inputpath, 'r') as inputfile:
        for line in inputfile:
            input_n = len(line.split(" "))
            n_data += 1

    if int(k) >= input_n:
        result = "reduced features must be smaller than input features."
        print(result)
    else:
        lines = sc.textFile(inputpath).map(lambda x: x.split(" "))
        lines = lines.map(lambda x: (x[0], [float(y) for y in x[1:]]))
        df = lines.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()
        if alg == "pca":
            # NOTE(review): the return value was never used (it was
            # overwritten right away); presumably pca() writes the output
            # file read below -- confirm against its definition.
            pca(inputdir, df, alg, k)

        output_data = inputdir + "/" + alg + str(k) + "_Data"
        counter = 0
        n_features = '0'
        with open(output_data, 'r') as outputfile:
            file_size = str(os.stat(output_data).st_size)
            for line in outputfile:
                # Keeps the field count of the last line read.
                n_features = str(len(line.split(" ")))
                counter += 1
        n_data = str(counter)

        result = "File: " + os.path.basename(output_data) + '</br>'
        result += "Path: " + os.path.dirname(output_data) + '/' + alg + str(k) + "_Features/" + '</br>'
        result += "Dimension: " + n_data + " x " + n_features + "</br>"
        result += "Size: " + file_size + ' bytes'
        print(result)

    print("Dimension reduction finished!")
    return {'n_data': n_data, 'n_features': n_features, 'result': result}
开发者ID:eason001,项目名称:imPro,代码行数:60,代码来源:views.py
示例20: test_parallel_evaluation
def test_parallel_evaluation(self):
    """Fit TrainValidationSplit with parallelism 1 and 2 and compare the metrics.

    The same estimator instance is fitted twice on the same dataset; the
    ``validationMetrics`` of both resulting models must be equal.
    """
    base_rows = [
        (Vectors.dense([0.0]), 0.0),
        (Vectors.dense([0.4]), 1.0),
        (Vectors.dense([0.5]), 0.0),
        (Vectors.dense([0.6]), 1.0),
        (Vectors.dense([1.0]), 1.0),
    ]
    dataset = self.spark.createDataFrame(base_rows * 10, ["features", "label"])

    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)

    tvs.setParallelism(1)
    serial_model = tvs.fit(dataset)

    tvs.setParallelism(2)
    parallel_model = tvs.fit(dataset)

    self.assertEqual(serial_model.validationMetrics, parallel_model.validationMetrics)
开发者ID:Brett-A,项目名称:spark,代码行数:17,代码来源:test_tuning.py
注:本文中的pyspark.ml.linalg.Vectors类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论