本文整理汇总了Python中pyspark.mllib.linalg.Vectors类的典型用法代码示例。如果您正苦于以下问题:Python Vectors类的具体用法?Python Vectors怎么用?Python Vectors使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Vectors类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_glr_summary
def test_glr_summary(self):
    """Smoke-test the GeneralizedLinearRegressionSummary API surface.

    Fits a weighted gaussian/identity GLR on a 2-row DataFrame and checks
    that every summary accessor is callable and returns the expected type,
    then verifies ``model.evaluate`` produces a consistent summary.
    """
    from pyspark.mllib.linalg import Vectors
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                      fitIntercept=False)
    model = glr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.residuals(), DataFrame))
    self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    self.assertEqual(s.degreesOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedomNull, 2)
    self.assertEqual(s.rank, 1)
    # NOTE(review): `basestring` is Python 2 only — presumably the enclosing
    # test module defines a Py3 shim (basestring = str); confirm before porting.
    self.assertTrue(isinstance(s.solver, basestring))
    self.assertTrue(isinstance(s.aic, float))
    self.assertTrue(isinstance(s.deviance, float))
    self.assertTrue(isinstance(s.nullDeviance, float))
    self.assertTrue(isinstance(s.dispersion, float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.deviance, s.deviance)
开发者ID:A7mech,项目名称:spark,代码行数:35,代码来源:tests.py
示例2: test_equals
def test_equals(self):
    """Vectors._equals compares a sparse (indices, values) pair to a dense array."""
    sparse_indices = [1, 2, 4]
    sparse_values = [1., 3., 2.]
    dense_indices = list(range(5))
    # Matching dense representation of the sparse vector.
    self.assertTrue(Vectors._equals(sparse_indices, sparse_values, dense_indices, [0., 1., 3., 0., 2.]))
    # Same values but in the wrong positions.
    self.assertFalse(Vectors._equals(sparse_indices, sparse_values, dense_indices, [0., 3., 1., 0., 2.]))
    # Wrong length.
    self.assertFalse(Vectors._equals(sparse_indices, sparse_values, dense_indices, [0., 3., 0., 2.]))
    # One extra non-zero entry in the dense side.
    self.assertFalse(Vectors._equals(sparse_indices, sparse_values, dense_indices, [0., 1., 3., 2., 2.]))
开发者ID:drewrobb,项目名称:spark,代码行数:7,代码来源:test_linalg.py
示例3: test_logistic_regression_summary
def test_logistic_regression_summary(self):
    """Smoke-test the BinaryLogisticRegressionSummary API surface.

    Fits a weighted logistic regression on a 2-row DataFrame, checks every
    summary accessor is callable and returns the expected type, then
    verifies ``model.evaluate`` produces a consistent summary.
    """
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.roc, DataFrame))
    # The 2-point data set is perfectly separable, so AUC should be ~1.0.
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
开发者ID:Bella-Lin,项目名称:spark,代码行数:28,代码来源:tests.py
示例4: test_nnclassifier_in_pipeline
def test_nnclassifier_in_pipeline(self):
    """Fit a MinMaxScaler -> NNClassifier ML Pipeline end-to-end on a toy frame.

    Guarded to Spark 1.x only (``sc.version.startswith("1")``) — presumably
    because this path relies on ``pyspark.mllib.linalg`` vectors; on Spark 2+
    the test is silently skipped.
    """
    if self.sc.version.startswith("1"):
        from pyspark.mllib.linalg import Vectors
        df = self.sqlContext.createDataFrame(
            [(Vectors.dense([2.0, 1.0]), 1.0),
             (Vectors.dense([1.0, 2.0]), 2.0),
             (Vectors.dense([2.0, 1.0]), 1.0),
             (Vectors.dense([1.0, 2.0]), 2.0),
             ], ["features", "label"])
        scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")
        # Minimal 2-in/2-out linear model with NLL loss for a 2-class problem.
        model = Sequential().add(Linear(2, 2))
        criterion = ClassNLLCriterion()
        # MLlibVectorToTensor([2]) converts the "scaled" vector column into 2-element tensors.
        classifier = NNClassifier(model, criterion, MLlibVectorToTensor([2]))\
            .setBatchSize(4) \
            .setLearningRate(0.01).setMaxEpoch(1).setFeaturesCol("scaled")
        pipeline = Pipeline(stages=[scaler, classifier])
        pipelineModel = pipeline.fit(df)
        res = pipelineModel.transform(df)
        # Only checks the transform returns a DataFrame, not the predictions themselves.
        assert type(res).__name__ == 'DataFrame'
开发者ID:ru003ar,项目名称:analytics-zoo,代码行数:25,代码来源:test_nn_classifier.py
示例5: test_save_load
def test_save_load(self):
    """Round-trip CrossValidator and CrossValidatorModel through save/load.

    Fits a CrossValidator over a small logistic-regression grid, persists
    both the estimator and the fitted model to a temp dir, reloads them,
    and checks the reloaded objects carry the same uids / param maps.
    Note: the temp directory is not cleaned up here.
    """
    temp_path = tempfile.mkdtemp()
    sqlContext = SQLContext(self.sc)
    dataset = sqlContext.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,  # replicate so each CV fold is non-trivial
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
开发者ID:Bella-Lin,项目名称:spark,代码行数:25,代码来源:tests.py
示例6: test_append_bias_with_sp_vector
def test_append_bias_with_sp_vector(self):
    """MLUtils.appendBias on a SparseVector appends a 1.0 and stays sparse."""
    original = Vectors.sparse(3, {0: 2.0, 2: 2.0})
    with_bias = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
    result = MLUtils.appendBias(original)
    self.assertEqual(result, with_bias)
    # The result must not have been densified.
    self.assertEqual(type(result), SparseVector)
开发者ID:drewrobb,项目名称:spark,代码行数:7,代码来源:test_util.py
示例7: test_persistence
def test_persistence(self):
    """Test save/load for LDA, LocalLDAModel, and DistributedLDAModel.

    Fits a distributed (EM) LDA model on a 2-document corpus, converts it to
    a local model, round-trips all three objects through disk, and compares
    each reloaded object to the original via ``self._compare``.
    """
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model (EM optimizer yields a DistributedLDAModel)
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up (best-effort; ignore races on temp-dir removal)
    try:
        rmtree(path)
    except OSError:
        pass
开发者ID:bsangee,项目名称:spark,代码行数:35,代码来源:tests.py
示例8: test_model_transform
def test_model_transform(self):
    """ElementwiseProduct scales dense and sparse vectors component-wise."""
    scaling = Vectors.dense([3, 2, 1])
    transformer = ElementwiseProduct(scaling)
    dense_input = Vectors.dense([4, 5, 6])
    sparse_input = Vectors.sparse(3, [0], [1])
    # Dense: [4*3, 5*2, 6*1]; sparse keeps only the populated index.
    self.assertEqual(transformer.transform(dense_input), DenseVector([12, 10, 6]))
    self.assertEqual(transformer.transform(sparse_input), SparseVector(3, [0], [3]))
开发者ID:HodaAlemi,项目名称:spark,代码行数:9,代码来源:tests.py
示例9: test_right_number_of_results
def test_right_number_of_results(self):
    """chiSqTest returns one result per feature column, even for sparse data."""
    num_cols = 1001
    labeled_points = [
        LabeledPoint(0.0, Vectors.sparse(num_cols, [(100, 2.0)])),
        LabeledPoint(0.1, Vectors.sparse(num_cols, [(200, 1.0)])),
    ]
    results = Statistics.chiSqTest(self.sc.parallelize(labeled_points))
    # Exactly one chi-square result per column, including the last one.
    self.assertEqual(len(results), num_cols)
    self.assertIsNotNone(results[1000])
开发者ID:greatyan,项目名称:spark,代码行数:9,代码来源:tests.py
示例10: _get_train_data
def _get_train_data(self):
    """Build a tiny 4-row (id, features, label) DataFrame for training tests."""
    sql_context = SQLContext(self.sc)
    rows = [(row_id, Vectors.dense([1, 2, 3]), label)
            for row_id, label in [(1, 1.0), (2, 0.0), (3, 1.0), (4, 0.0)]]
    return sql_context.createDataFrame(rows, ['id', 'features', 'label'])
开发者ID:ngarneau,项目名称:sentiment-analysis,代码行数:9,代码来源:pipelines.py
示例11: test_parse_vector
def test_parse_vector(self):
    """str()/parse round-trips for DenseVector and SparseVector.

    Bug fix: the original used ``self.assertTrue(a, b)``, which treats the
    second argument as the failure *message* and therefore asserts nothing
    beyond truthiness — the comparisons were never performed. Replaced with
    ``assertEqual`` so the expected values are actually checked.
    """
    a = DenseVector([3, 4, 6, 7])
    self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]')
    self.assertEqual(Vectors.parse(str(a)), a)
    a = SparseVector(4, [0, 2], [3, 4])
    self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])')
    self.assertEqual(Vectors.parse(str(a)), a)
    a = SparseVector(10, [0, 1], [4, 5])
    # Parsing should tolerate arbitrary whitespace inside the literal.
    self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a)
开发者ID:HodaAlemi,项目名称:spark,代码行数:9,代码来源:tests.py
示例12: test_idf_model
def test_idf_model(self):
    """IDF fit on 4 term-frequency vectors yields one idf weight per term."""
    term_freqs = [
        Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
        Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
        Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
        Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9]),
    ]
    fitted = IDF().fit(self.sc.parallelize(term_freqs, 2))
    # The model exposes an idf weight for each of the 11 vocabulary terms.
    self.assertEqual(len(fitted.idf()), 11)
开发者ID:HodaAlemi,项目名称:spark,代码行数:10,代码来源:tests.py
示例13: test_output_columns
def test_output_columns(self):
    """OneVsRest.transform appends exactly one 'prediction' column."""
    training = self.spark.createDataFrame(
        [(0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5))],
        ["label", "features"])
    base_classifier = LogisticRegression(maxIter=5, regParam=0.01)
    ovr_model = OneVsRest(classifier=base_classifier).fit(training)
    transformed = ovr_model.transform(training)
    self.assertEqual(transformed.columns, ["label", "features", "prediction"])
开发者ID:A7mech,项目名称:spark,代码行数:10,代码来源:tests.py
示例14: load_data_rdd
def load_data_rdd(csv_file, shuffle=True, train=True):
    """Load a CSV into an RDD of (features, label-string) pairs.

    Drops the header row (first field == 'id'). In train mode the last CSV
    column is the label with its 'Class_' prefix stripped; in test mode a
    dummy label "1" is attached so the structure matches training data.
    """
    if shuffle:
        shuffle_csv(csv_file)
    lines = sc.textFile(data_path + csv_file)
    fields = lines.filter(lambda rec: rec.split(',')[0] != 'id') \
                  .map(lambda rec: rec.split(','))
    if train:
        return fields.map(lambda cols: (
            Vectors.dense(np.asarray(cols[1:-1]).astype(np.float32)),
            str(cols[-1]).replace('Class_', '')))
    return fields.map(lambda cols: (
        Vectors.dense(np.asarray(cols[1:]).astype(np.float32)), "1"))
开发者ID:thisiskofi,项目名称:elephas,代码行数:12,代码来源:ml_pipeline_otto.py
示例15: parseEntry
def parseEntry(xx):
    """Parse one tab-separated log line into a pyspark Row of visit features.

    Expected field order: virtual-id, browser, referrer, user-key, birthyear,
    gender, registration-date ('NAN' allowed), device, date, hour, then a
    tail of comma-decimal metric columns.

    NOTE(review): relies on module-level globals ``maxInd`` and
    ``intervalIndDict`` (mapping (weekday, hour) -> sparse index) — confirm
    they are broadcast/defined before using this in an RDD map.
    """
    # Sentinel meaning "no registration date known".
    mindate = datetime.datetime(datetime.MINYEAR, 1, 1, 1, 1)
    xx = xx.split('\t')
    a_virtual = xx[0]
    browser = xx[1]
    referrer = xx[2]
    a_user_key = xx[3]
    try:
        birthyear = int(xx[4])
        # NOTE(review): hard-coded reference year 2015 for age computation.
        age = 2015 - birthyear
    except Exception as _:
        # Non-numeric birth year: keep the raw string, mark age unknown.
        birthyear = xx[4]
        age = -1
    gender = xx[5]
    if xx[6] != 'NAN':
        reg_date = datetime.datetime.strptime(xx[6], '%Y-%m-%d')
    else:
        reg_date = mindate
    device = xx[7]
    date = datetime.datetime.strptime(xx[8], '%d-%m-%Y')
    # Fold the hour-of-day column into the timestamp.
    tdiff = datetime.timedelta(hours=int(xx[9]))
    date = date + tdiff
    year = date.year
    month = date.month
    day = date.day
    hour = int(xx[9])
    weekday = date.weekday()
    if reg_date > mindate:
        days_since_registration = (date - reg_date).days
    else:
        # No registration date available.
        days_since_registration = -1
    # Metrics arrive formatted like "12,0" — strip the ",0" decimal tail.
    metrics = list([int(x.replace(',0', '')) for x in xx[10:]])
    visits = metrics[0]
    visits_betalt = metrics[1]
    pageviews = metrics[2]
    pageview_nothome = metrics[3]
    pageview_betalt = metrics[4]
    # Sparse vectors with a single non-zero at this (weekday, hour) time slot:
    # one weighted by pageviews, one a plain visit indicator.
    timegroup_pvs = Vectors.sparse(maxInd, [(intervalIndDict[(weekday, hour)], pageviews)])
    timegroup_visit = Vectors.sparse(maxInd, [(intervalIndDict[(weekday, hour)], 1.)])
    return Row(browser=browser, a_user_key=a_user_key, age=age,
               day=day, hour=hour, date=date, weekday=weekday, pv=pageviews,
               pv_nh=pageview_nothome, pv_bet=pageview_betalt, referrer=referrer,
               device=device, gender=gender, days_since_registration=days_since_registration,
               reg_date=reg_date, timegroup_pvs=timegroup_pvs, timegroup_visit=timegroup_visit,
               a_virtual=a_virtual)
开发者ID:Froskekongen,项目名称:content-consumption,代码行数:52,代码来源:consume_profiles_spark_2.py
示例16: remove_time_dependent_effects
def remove_time_dependent_effects(self, ts):
    """
    Given a timeseries, apply inverse operations to obtain the original series of underlying errors.
    Parameters
    ----------
    ts:
        Time series of observations with this model's characteristics as a Numpy array
    returns the time series with removed time-dependent effects as a Numpy array
    """
    # Zero-filled destination vector the JVM side writes its output into.
    placeholder = Vectors.dense(np.array([0] * len(ts)))
    series_java = _py2java(self._ctx, Vectors.dense(ts))
    dest_java = _py2java(self._ctx, placeholder)
    java_result = self._jmodel.removeTimeDependentEffects(series_java, dest_java)
    return _java2py(self._ctx, java_result.toArray())
开发者ID:BabelTower,项目名称:spark-timeseries,代码行数:13,代码来源:_model.py
示例17: create_rows_for_rdd
def create_rows_for_rdd(x):
    """Convert an (meta_data, values) pair into a labeled Row.

    The final element of x[1] is the label; the preceding elements form the
    feature vector. x[0] is carried through as a dense meta_data vector.
    """
    values = list(x[1])
    label = float(values.pop())  # trailing element is the label
    return Row(label=label,
               features=Vectors.dense(values),
               meta_data=Vectors.dense(x[0]))
开发者ID:USF-ML2,项目名称:SKYNET-,代码行数:13,代码来源:modeling_utils.py
示例18: load_data_frame
def load_data_frame(csv_file, shuffle=True, train=True):
    """Load a CSV into a DataFrame with features, category name, and label.

    Skips the header row (first field == 'label'). Test data reuses its
    first column as a dummy label so the schema matches training data.
    """
    if shuffle:
        shuffle_csv(csv_file)
    # This is an RDD, which will later be transformed to a data frame
    raw = sc.textFile('/home/minglu/dist_spark/data/' + csv_file)
    fields = raw.filter(lambda rec: rec.split(',')[0] != 'label') \
                .map(lambda rec: rec.split(','))

    def to_triple(cols):
        # (dense features, 'class_<n>' category string, integer label)
        return (Vectors.dense(np.asarray(cols[1:]).astype(np.float32)),
                'class_' + str(cols[0]), int(cols[0]))

    data = fields.map(to_triple)
    return sqlcontext.createDataFrame(data, ['features', 'category', 'label'])
开发者ID:ChienHsiung,项目名称:python,代码行数:13,代码来源:spark101.py
示例19: test_copy
def test_copy(self):
    """copy() with extra params affects only the copy, for OneVsRest and its model."""
    training = self.spark.createDataFrame(
        [(0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5))],
        ["label", "features"])
    base_lr = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=base_lr)
    ovr_copy = ovr.copy({base_lr.maxIter: 10})
    # Original keeps maxIter=5; the copy picks up the override.
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr_copy.getClassifier().getMaxIter(), 10)
    fitted = ovr.fit(training)
    fitted_copy = fitted.copy({fitted.predictionCol: "indexed"})
    self.assertEqual(fitted_copy.getPredictionCol(), "indexed")
开发者ID:A7mech,项目名称:spark,代码行数:13,代码来源:tests.py
示例20: add_time_dependent_effects
def add_time_dependent_effects(self, ts):
    """
    Given a timeseries, apply a model to it.
    Parameters
    ----------
    ts:
        Time series of i.i.d. observations as a Numpy array
    returns the time series with added time-dependent effects as a Numpy array.
    """
    # Zero-filled destination vector the JVM side writes its output into.
    placeholder = Vectors.dense([0] * len(ts))
    series_java = _py2java(self._ctx, Vectors.dense(ts))
    dest_java = _py2java(self._ctx, placeholder)
    java_result = self._jmodel.addTimeDependentEffects(series_java, dest_java)
    return _java2py(self._ctx, java_result.toArray())
开发者ID:BabelTower,项目名称:spark-timeseries,代码行数:14,代码来源:_model.py
注:本文中的pyspark.mllib.linalg.Vectors类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论