本文整理汇总了Python中pyspark.mllib.feature.StandardScaler类的典型用法代码示例。如果您正苦于以下问题:Python StandardScaler类的具体用法?Python StandardScaler怎么用?Python StandardScaler使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了StandardScaler类的17个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_model_transform
def test_model_transform(self):
    """A scaler fitted with default flags must leave this data unchanged.

    Every column of the training matrix has unit sample standard
    deviation, so the default divide-by-std transform is a no-op here.
    """
    training = [
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [3.0, 4.0, 5.0],
    ]
    fitted = StandardScaler().fit(self.sc.parallelize(training))
    self.assertEqual(fitted.transform([1.0, 2.0, 3.0]),
                     DenseVector([1.0, 2.0, 3.0]))
开发者ID:HodaAlemi,项目名称:spark,代码行数:8,代码来源:tests.py
示例2: test_model_setters
def test_model_setters(self):
    """setWithMean/setWithStd on a fitted model are chainable and effective.

    With centering enabled, the first training row maps to -1 in every
    column (column means are [2, 3, 4]; sample std is 1 throughout).
    """
    training = [
        [1.0, 2.0, 3.0],
        [2.0, 3.0, 4.0],
        [3.0, 4.0, 5.0],
    ]
    fitted = StandardScaler().fit(self.sc.parallelize(training))
    # Both setters should return the model (non-None) to allow chaining.
    self.assertIsNotNone(fitted.setWithMean(True))
    self.assertIsNotNone(fitted.setWithStd(True))
    self.assertEqual(fitted.transform([1.0, 2.0, 3.0]),
                     DenseVector([-1.0, -1.0, -1.0]))
开发者ID:HodaAlemi,项目名称:spark,代码行数:10,代码来源:tests.py
示例3: extract_features
def extract_features(self, feat='tfidf', **kwargs):
    """
    Convert each subtitle in ``self.RDD`` into a hashed TF (or TF-IDF)
    feature vector, standardizing the vectors when the model type is
    logistic regression.

    Parameters
    ----------
    feat : 'tf' or 'tfidf'
        Whether to keep raw term frequencies or re-weight them by IDF.
    kwargs : num_features, minDocFreq, or other arguments to be passed
        to the MLLib objects.

    Returns
    -------
    RDD of (key, feature vector) pairs.
    """
    # transform BOW into TF vectors via feature hashing
    num_features = kwargs.get('num_features', 10000)
    htf = HashingTF(num_features)
    feat_rdd = self.RDD.mapValues(htf.transform).cache()
    # transform TF vectors into IDF vectors
    if feat == 'tfidf':
        # keys and values are split apart and re-joined with zip below;
        # this relies on both RDDs keeping identical partition order.
        keys, tf_vecs = feat_rdd.keys(), feat_rdd.values()
        minDocFreq = kwargs.get('minDocFreq', 2)
        idf = IDF(minDocFreq=minDocFreq)
        idf_model = idf.fit(tf_vecs)
        idf_rdd = idf_model.transform(tf_vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(idf_rdd)
    if self.model_type == 'log_reg':
        # logistic regression benefits from zero-mean/unit-variance input
        normalizer = StandardScaler(withMean=True, withStd=True)
        keys, vecs = feat_rdd.keys(), feat_rdd.values()
        norm_model = normalizer.fit(vecs)
        # toArray() densifies each vector so withMean centering can apply
        norm_rdd = norm_model.transform(vecs.map(lambda vec: vec.toArray()))
        feat_rdd = keys.zip(norm_rdd)
    return feat_rdd
开发者ID:Nathx,项目名称:parental_advisory_ml,代码行数:38,代码来源:spark_model.py
示例4: LabeledPoint
# Fragment of a larger Python 2 script: `trackRocks`, `songData` and the
# imports are defined in lines not shown here.
# 24 = mode
# 27 = tempo
# 28 = time_signature
# Join track->label with track->features; tracks with no label default to 0.0.
# (Python 2 tuple-unpacking lambdas are used throughout this script.)
allData = trackRocks.join(songData).map(lambda (tr, (rocks, data)): (tr, (0.0 if rocks is None else rocks, data)))
allData.take(3)
# label data
# only uses one feature for now
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [data[6]]))
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [random.random() + (.5 if rocks == 1 else 0)]))
labels = allData.map(lambda (tr, (rocks, data)): rocks)
features = allData.map(lambda (tr, (rocks, data)): data)
# StandardScaler(True, True): subtract the mean and scale to unit variance.
std = StandardScaler(True, True).fit(features)
scaledFeatures = std.transform(features)
# zip relies on labels/features deriving from the same parent RDD so that
# partition order lines up element-for-element.
labeledData = labels.zip(scaledFeatures).map(lambda (label, data): LabeledPoint(label, data))
# uses all extracted
# labeledData = allData.map(lambda (tr, (rocks, data)): LabeledPoint(rocks, [x for x in data]))
labeledData.take(3)
# make sample sizes equal
labeledRock = labeledData.filter(lambda p: p.label == 1.0)
labeledRock.count()
labeledRock.map(lambda p: p.features[0]).mean()
nrock = labeledRock.count()
开发者ID:ScalingUpMusic,项目名称:SUMsandbox,代码行数:30,代码来源:rock_ml.py
示例5: StandardScaler
df = sqlContext.createDataFrame(dictList)
df.show()
pdf = df.toPandas
table = pd.pivot_table(pdf, index=['datetime'], columns=['data:temp'], aggfunc=numpy.mean)
print table.values
# For Testing
#df.show()
#df.describe(['data:temp', 'datetime', 'sensorName', 'data:humidity']).show()
df = df.select('data:temp', 'data:humidity', 'data:chlPPM', 'data:co2', 'data:flo', 'data:psi')
#df.show()
temp = df.map(lambda line:LabeledPoint(line[0], [line[1:]]))
# Scale the data
features = df.map(lambda row: row[1:])
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
print features_transform.take(5)
lab = df.map(lambda row: row[0])
transformedData = lab.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0], [row[1]]))
trainingData, testingData = transformedData.randomSplit([.8, .2], seed=1234)
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .0002)
开发者ID:stevekludt,项目名称:sparkModels,代码行数:31,代码来源:HBaseRead.py
示例6: main
def main():
    # Train and compare four classifiers (LR with LBFGS, LR with SGD,
    # decision tree, random forest) on pickled, standardized feature data,
    # printing F1 and accuracy computed by the external test/fone/accuracy
    # helpers. (Python 2 print statements throughout.)
    appName = "BadOrGood;zl"
    # NOTE(review): the standard config key is "spark.executor.instances"
    # (plural) — confirm "spark.executor.instance" is not a silent no-op.
    conf = (SparkConf()
            .setAppName(appName)
            .set("spark.executor.memory", "5g")
            .set("spark.executor.cores","3")
            .set("spark.executor.instance", "3")
            )
    sc = SparkContext(conf = conf)
    hc = HiveContext(sc)
    #fetch data
    #filepath = '/sshomework_zl/BadOrGood/AllDataRowrdd'
    #fetchDataToFile(hc, filepath)
    #load data
    # AllDataRawrdd = sc.pickleFile(filepath) \
    #     .map( lambda _: {'label':int(_.status), 'feature':extractFeature(_)} ) \
    #     .repartition(10)
    AllDataRawrdd = sc.pickleFile('/pickleData').repartition(10)
    #standardizer for train and test data
    # Fit the scaler on every feature vector (zero mean, unit variance).
    model = StandardScaler(True, True) \
        .fit( AllDataRawrdd \
            .map( lambda _: Vectors.dense(_['feature']) )
        )
    labels = AllDataRawrdd.map(lambda _: _['label'])
    featureTransformed = model.transform( AllDataRawrdd.map(lambda _: _['feature']) )
    # zip label and scaled-feature RDDs back into dict records; relies on
    # both deriving from the same parent RDD so partition order matches.
    AllDataRawrdd = labels \
        .zip(featureTransformed) \
        .map( lambda _: { 'label':_[0], 'feature':_[1] } )
    #sampling
    trainDataRawrdd, testDataRawrdd = AllDataRawrdd.randomSplit(weights=[0.7, 0.3], seed=100)
    trainDatardd = trainDataRawrdd.map( lambda _: LabeledPoint( _['label'], _['feature'] ) ).persist()
    testDatardd = testDataRawrdd.map( lambda _: {'label': _['label'], 'feature': list(_['feature']) } ).persist()
    #prediction & test
    lrmLBFGS = LogisticRegressionWithLBFGS.train(trainDatardd, iterations=3000, regParam=0.01, regType="l1")
    resultrdd = test(lrmLBFGS, testDatardd)
    lrmLBFGSFone = fone(resultrdd)
    lrmLBFGSac = accuracy(resultrdd)
    lrmSGD = LogisticRegressionWithSGD.train(trainDatardd, iterations=3000, step=0.1, regParam=0.01, regType="l1")
    resultrdd = test(lrmSGD, testDatardd)
    lrmSGDFone = fone(resultrdd)
    lrmSGDac = accuracy(resultrdd)
    # Binary classification (2 classes), no categorical feature info ({}).
    dt = DecisionTree.trainClassifier(trainDatardd, 2, {}, maxDepth=10)
    resultrdd = test(dt, testDatardd)
    dtFone = fone(resultrdd)
    dtac = accuracy(resultrdd)
    # Random forest with 10 trees.
    rf = RandomForest.trainClassifier(trainDatardd, 2, {}, 10)
    resultrdd = test(rf, testDatardd)
    rfFone = fone(resultrdd)
    rfac = accuracy(resultrdd)
    print "LR_LBFGS f1 is : %f, ac is : %f" % (lrmLBFGSFone, lrmLBFGSac)
    print "LR_SGD f1 is : %f, ac is : %f" % (lrmSGDFone, lrmSGDac)
    print "Decision Tree f1 is: %f, ac is : %f" % (dtFone, dtac)
    print "Random Forest f1 is: %f, ac is : %f" % (rfFone, rfac)
    print lrmLBFGS.weights
    print lrmSGD.weights
    sc.stop()
开发者ID:retanoj,项目名称:ss_homework,代码行数:69,代码来源:BadOrGood.py
示例7: norm
def norm(features):
    """Run *features* through a StandardScaler with both flags disabled.

    NOTE(review): withMean=False and withStd=False means the fitted model
    neither centers nor scales, so this is effectively an identity
    transform — confirm whether withStd=True was intended.
    """
    return StandardScaler(withMean=False, withStd=False).fit(features).transform(features)
开发者ID:aymen82,项目名称:kaggler-competitions-scripts,代码行数:3,代码来源:script.py
示例8: return
parts = line.strip().split("::")
return (int(parts[0])-1, int(parts[1])-1, float(parts[2]))
# Fragment of a larger script: `sc`, `sys` and `MLUtils` are set up in
# lines not shown here.
#load in input file
path = sys.argv[1]
#path = "/Users/jamesledoux/Documents/BigData/netflixrecommender/movie_features_dataset.dat/"
data = MLUtils.loadLibSVMFile(sc, path)
labels = data.map(lambda x: x.label)
features = data.map(lambda x: x.features)
#normalize:
#scaler = StandardScaler(withMean = True, withStd = True).fit(features) #data needs to be dense (zeros included)
scaler = StandardScaler(withMean = False, withStd = True).fit(features) #becomes dense if using withMean. may run out of memory locally
#convert data to dense vector to be normalized
#data2 = labels.zip(scaler.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
data2 = labels.zip(scaler.transform(features)) #use this line if having memory issues
#hide 10% of the data for final test
data, test = data2.randomSplit([.9, .1])
#get size of chunks for 10-fold cross-validation
num_folds = 10
# NOTE(review): data.count() would measure the RDD's size without
# collecting every element to the driver first.
partitionSize = (len(data.collect())/num_folds) #parameterize this value as num_folds (in loop as well)
#train/validate 10 times on each k
i = 0
j = partitionSize
开发者ID:Aniketsaoji,项目名称:NetflixRecommender,代码行数:31,代码来源:KMeans_content.py
示例9: OrderedDict
# Fragment of a larger Python 2 script: `labels`, `raw_data`,
# `parse_interaction`, `clustering_score` and `max_k` are defined in
# lines not shown here.
label_counts = labels.countByValue()
# Order labels by frequency, most common first.
sorted_labels = OrderedDict(sorted(label_counts.items(), key=lambda t: t[1], reverse=True))
for label, count in sorted_labels.items():
    print label, count
# Prepare data for clustering input
# the data contains non-numeric features, we want to exclude them since
# k-means works with numeric features. These are the first three and the last
# column in each data row
print "Parsing dataset..."
parsed_data = raw_data.map(parse_interaction)
parsed_data_values = parsed_data.values().cache()
# Standardize data
print "Standardizing data..."
standardizer = StandardScaler(True, True)
standardizer_model = standardizer.fit(parsed_data_values)
standardized_data_values = standardizer_model.transform(parsed_data_values)
# Evaluate values of k from 5 to 40
# NOTE(review): the range below sweeps 10..max_k in steps of 10, which
# does not match the "5 to 40" comment above — confirm the intended sweep.
print "Calculating total in within cluster distance for different k values (10 to %(max_k)d):" % {"max_k": max_k}
scores = map(lambda k: clustering_score(standardized_data_values, k), range(10,max_k+1,10))
# Obtain min score k
# clustering_score presumably yields (k, model, score) tuples: x[2] is the
# score being minimized, x[0] the k, x[1] the model — verify in its source.
min_k = min(scores, key=lambda x: x[2])[0]
print "Best k value is %(best_k)d" % {"best_k": min_k}
# Use the best model to assign a cluster to each datum
# We use here standardized data - it is more appropriate for exploratory purposes
print "Obtaining clustering result sample for k=%(min_k)d..." % {"min_k": min_k}
best_model = min(scores, key=lambda x: x[2])[1]
开发者ID:4sp1r3,项目名称:kdd-cup-99-spark,代码行数:31,代码来源:KDDCup99.py
示例10: StandardScaler
# Fragment of a larger script: `df` (rows with a .timestamp attribute) and
# the tail of the while-loop are in lines not shown here.
# This should be the maximum possible time
max_time = 23 * 3600 + 59 * 60 + 59
#max_time = 16 * 60
low = 0
high = 15 * 60
modelList = []
while low < max_time: # Temp should run once
    # NOTE(review): low/high are never advanced within the visible span;
    # presumably the truncated tail of the loop increments them, otherwise
    # this loop never terminates — confirm against the full source.
    timeseries = df.filter(lambda x: low < x.timestamp < high)
    #if timeseries.count() > 0:
    features = timeseries.map(lambda row: row[1:])
    #print "Possible points"
    #print features.collect()
    # `model` first holds the fitted scaler...
    model = StandardScaler().fit(features)
    features_t = model.transform(features)
    label = timeseries.map(lambda row: row[0])
    labeled_data = label.zip(features_t)
    final_data = labeled_data.map(lambda row: LabeledPoint(row[0], row[1]))
    # ...and is then rebound to the regression model that gets appended.
    model = LinearRegressionWithSGD.train(final_data, 1000, .0000001, intercept=True)
    #model = RidgeRegressionWithSGD.train(final_data, 1000, .00000001, intercept=True)
    #model = LassoWithSGD.train(final_data, 1000, .00000001, intercept=True)
    modelList.append(model)
    #print ""
    #print "Model1 weights " + str(model.weights)
开发者ID:benCoomes,项目名称:projectSol,代码行数:31,代码来源:spark_linear_regression.py
示例11: SparkContext
from pyspark import SparkContext
# $example on$
from pyspark.mllib.feature import StandardScaler, StandardScalerModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.util import MLUtils
# $example off$

# Official Spark example: contrast the default scaler (std only) with a
# withMean+withStd scaler. (Tail of the script is truncated in this view.)
if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext
    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)
    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)
    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))
    # data2 will be unit variance and zero mean.
    # Centering needs dense vectors, hence the explicit densification.
    data2 = label.zip(scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$
    print("data1:")
    for each in data1.collect():
        print(each)
    print("data2:")
for each in data2.collect():
开发者ID:11wzy001,项目名称:spark,代码行数:31,代码来源:standard_scaler_example.py
示例12: SparkContext
#Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

# Demo of StandardScaler flag combinations on a 2-row dataset.
# (Python 2 `print r` statements; the third flag combination that the
# final comment introduces is truncated out of this view.)
sc = SparkContext()
vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]
dataset = sc.parallelize(vs)
#all false, do nothing.
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect(): print r
print("\n")
#deducts the mean
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect(): print r
print("\n")
#divides the length of vector
开发者ID:aviyashchin,项目名称:CollabFiltering-Netflix-PySpark,代码行数:29,代码来源:classification.py
示例13: toLabeledPoint
# Fragment of book listing code: `housingVals` and `Vectors` come from
# lines not shown here.
#Section 7.4.4
from pyspark.mllib.regression import LabeledPoint

def toLabeledPoint(x):
    # Last array element is the regression target; the rest are features.
    a = x.toArray()
    return LabeledPoint(a[-1], Vectors.dense(a[0:-1]))

housingData = housingVals.map(toLabeledPoint)
#Section 7.4.5
sets = housingData.randomSplit([0.8, 0.2])
housingTrain = sets[0]
housingValid = sets[1]
#Section 7.4.6
from pyspark.mllib.feature import StandardScaler
# Fit the scaler on the training features only, then apply it to both splits.
scaler = StandardScaler(True, True).fit(housingTrain.map(lambda x: x.features))
trainLabel = housingTrain.map(lambda x: x.label)
trainFeatures = housingTrain.map(lambda x: x.features)
validLabel = housingValid.map(lambda x: x.label)
validFeatures = housingValid.map(lambda x: x.features)
# zip relies on the label/feature RDDs sharing a parent so ordering matches.
trainScaled = trainLabel.zip(scaler.transform(trainFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
validScaled = validLabel.zip(scaler.transform(validFeatures)).map(lambda x: LabeledPoint(x[0], x[1]))
#Section 7.5
from pyspark.mllib.regression import LinearRegressionWithSGD
alg = LinearRegressionWithSGD()
# Cache both sets: SGD iterates over them repeatedly.
trainScaled.cache()
validScaled.cache()
model = alg.train(trainScaled, iterations=200, intercept=True)
#Section 7.5.1
#Section 7.5.1
开发者ID:AkiraKane,项目名称:first-edition,代码行数:31,代码来源:ch07-listings.py
示例14: main
def main(argv):
verbose = False
dbpath = '/root/data/AdditionalFiles/'
tagstring = 'rock'
usealldata = False
holdout = 0.1
model_iterations = 100
model_step = 1.0
model_intercept = True
# possible types logistic and svm
model_type = 'logistic'
try:
opts, args = getopt.getopt(argv,"hvd:t:am:s:i:o:c",["help","verbose","datapath=","tagstring=","alldata","model=","step=","iterations=","holdout=","intercept"])
except getopt.GetoptError:
print 'rockTag.py -d <data path> -t <tag string>'
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
print('rockTag.py -d <data path> -t <tag string>')
sys.exit()
elif opt in ("-v", "--verbose"):
verbose = True
elif opt in ("-d", "--datapath"):
dbpath = arg
elif opt in ("-t", "--tagstring"):
tagstring = str(arg).lower()
elif opt in ("-a", "--alldata"):
usealldata = True
elif opt in ("-m", "--model"):
if str(arg).lower() in ['logistic','svm']:
model_type = str(arg).lower
else:
print('valid models are logistic and svm')
sys.exit()
elif opt in ("-s", "--step"):
model_step = float(arg)
elif opt in ("-i", "--iterations"):
model_iterations = int(arg)
elif opt in ("-o", "--holdout"):
holdout = float(arg)
if holdout <= 0 | holdout >= 1:
print('holdout must be greater than 0 and less than 1')
elif opt in ("-c", "--intercept"):
model_intercept = True
if verbose:
print('data path: ' + dbpath)
print('tag string: ' + tagstring)
labels, features = getLabelsAndFeatures(dbpath, tagstring=tagstring, verbose=verbose, usealldata=usealldata)
# scale features
std = StandardScaler(True, True).fit(features)
features = std.transform(features)
# make labeled data
labeledData = labels.zip(features).map(lambda (label, data): LabeledPoint(label, data))
if verbose: labeledData.take(3)
# rebalance samples
equalSampleData = rebalanceSample(labeledData, verbose=verbose)
# split data
trainData, testData = randomSplit(equalSampleData, [1-holdout, holdout])
if verbose: trainData.map(lambda p: (p.label, p.features)).take(3)
# train model
if model_type == 'logistic':
model = LogisticRegressionWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
elif model_type == 'svm':
model = SVMWithSGD.train(trainData, intercept=model_intercept, iterations=model_iterations, step=model_step)
evalString = evaluateModel(model, testData)
print(evalString)
开发者ID:ScalingUpMusic,项目名称:SUMapplication,代码行数:79,代码来源:oneTag_1_4.py
示例15: SparkConf
# step 1 - create spark context
conf = SparkConf().setAppName("KMeans-Content")\
.set("spark.executor.memory","1g")
sc = SparkContext()
# step 2 - load in input file
data = MLUtils.loadLibSVMFile(sc,"/Users/Ellen/Desktop/movie_features_dataset.dat")
labels = data.map(lambda x:x.label)
features = data.map(lambda x:x.features)
# step 3 - standarize the data with unit values and 0 mean
scaler = StandardScaler(withMean=False,withStd=True).fit(features)
data2 = labels.zip(scaler.transform(features))
numFeatures = len(data2.values().take(10)[0])
print "Type of data2: ",type(data2) #RDD
print "Type of data2.values(): ",type(data2.values()) # pipelinedrdd
print "Sample: ",data2.values().take(1)[0]
# splitting up the data to training, validation and testing models.
train,val,test = data2.randomSplit([.80,.10,.10])
print "Training Dataset Size:",train.count()
print "Validation Dataset size:",val.count()
print "Test Dataset Size:",test.count()
开发者ID:ellenkimsy,项目名称:Big-Data-Homework,代码行数:29,代码来源:Content_KMeans.py
示例16: setLevel
# Fragment of a larger Python 2 script: `sc` (a Cassandra-enabled
# SparkContext), LabeledPoint and LinearRegressionWithSGD come from lines
# not shown here.
# Silence Spark/Akka logging below ERROR level via the JVM log4j bridge.
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )

def parsePoint(data):
    # First element (scaled song_hotttnesss) is the target, rest features.
    #return LabeledPoint(data[3],np.append(data[0:3],data[4:]))
    return LabeledPoint(data[0],data[1:])

# store the data from cassandra to a data frame and remove the NA value
data=sc.cassandraTable("msd_01", "songs").select("song_hotttnesss","loudness","year","sentiment","tempo","unique_words").toDF()
data=data.filter("year>0").na.drop()
print data.count()
# Scale the features with Standard Scaler
# NOTE(review): the target column is standardized together with the
# features here — confirm that scaling the label is intended.
data2=data.map(lambda x: [x.song_hotttnesss, x.loudness,x.year, x.sentiment,x.tempo,x.unique_words])#Convert each sql.row to an array
scaler= StandardScaler(withMean=True, withStd=True).fit(data2) #fit a scaler on the every column
scaledData = scaler.transform(data2)# transform our data
# Transform to a labelled vector
parsedData = scaledData.map(parsePoint)
# # Build the model
model = LinearRegressionWithSGD.train(parsedData, iterations=1000,regParam=1.0,regType="l2",intercept=True)
# Evaluate the model on training data
print ("intercept",model.intercept)
print zip(["loudness","year","sentiment","tempo","unique_words"],model.weights)
sc.stop()
开发者ID:StephTruong,项目名称:W251-MillionSong,代码行数:31,代码来源:songHotnessRegression.py
示例17: fit
def fit(self, dataset):
    """Compute the statistics used to standardize data.

    For RDD input the work is delegated to MLlib's StandardScaler (stored
    in ``self.model``); for array-like input the mean and/or sample
    standard deviation are computed locally with NumPy, honouring the
    ``flag_mean`` / ``flag_std`` switches.

    :param dataset: pyspark.rdd.RDD or numpy.ndarray or :class:`.LabeledDataSet`
    """
    if isinstance(dataset, LabeledDataSet):
        # Strip labels; only the feature columns are standardized.
        dataset = dataset.features
    if isinstance(dataset, pyspark.rdd.RDD):
        self.model = StdSc(self.flag_mean, self.flag_std).fit(dataset)
        return
    values = dataset if type(dataset) is np.ndarray else np.array(dataset)
    if self.flag_mean is True:
        self.mean = values.mean(axis=0)
    if self.flag_std is True:
        # ddof=1: sample (not population) standard deviation.
        self.std = values.std(axis=0, ddof=1)
    return
开发者ID:leferrad,项目名称:learninspy,代码行数:20,代码来源:data.py
注:本文中的pyspark.mllib.feature.StandardScaler类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论