本文整理汇总了Python中pyspark.ml.clustering.KMeans类的典型用法代码示例。如果您正苦于以下问题:Python KMeans类的具体用法?Python KMeans怎么用?Python KMeans使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了KMeans类的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: kmeans
def kmeans(df):
    """Cluster ``df`` into two groups with KMeans and hand off the predictions.

    Fits ``KMeans(k=2, seed=1)`` on ``df``, prints how many cluster centers
    the model holds, and passes the ``features``/``prediction`` columns of
    the transformed DataFrame to ``dfwrite`` under the name ``'kmFeatures'``.

    :param df: DataFrame to cluster; it must expose the ``features`` column
        that is selected from the transformed output below.
    """
    estimator = KMeans(k=2, seed=1)
    model = estimator.fit(df)
    centers = model.clusterCenters()
    # The call form of print behaves the same on Python 2 and Python 3; the
    # original statement form is a SyntaxError on Python 3.
    print(len(centers))
    kmFeatures = model.transform(df).select("features", "prediction")
    dfwrite(kmFeatures, 'kmFeatures')
开发者ID:eason001,项目名称:imBot,代码行数:7,代码来源:yispark.py
示例2: test_kmeans_cosine_distance
def test_kmeans_cosine_distance(self):
    """Check that cosine-distance KMeans puts the expected row pairs together."""
    points = [[1.0, 1.0], [10.0, 10.0],
              [1.0, 0.5], [10.0, 4.4],
              [-1.0, 1.0], [-100.0, 90.0]]
    data = [(Vectors.dense(point),) for point in points]
    df = self.spark.createDataFrame(data, ["features"])
    model = KMeans(k=3, seed=1, distanceMeasure="cosine").fit(df)
    result = model.transform(df).collect()
    # Rows (0, 1), (2, 3) and (4, 5) are each asserted to share one cluster.
    for first, second in ((0, 1), (2, 3), (4, 5)):
        self.assertTrue(result[first].prediction == result[second].prediction)
开发者ID:Brett-A,项目名称:spark,代码行数:11,代码来源:test_algorithms.py
示例3: clustering
def clustering(input_df, input_col_name, n):
    """Run KMeans on L1-normalized features, then project them with PCA.

    The input is narrowed to ``state``, ``categories``, ``stars`` and
    ``input_col_name``; ``input_col_name`` is normalized (p=1.0) into a
    ``features`` column, clustered with ``KMeans(k=n, seed=2)``, and finally
    reduced to two principal components stored in a ``pc`` column.

    :param input_df: DataFrame containing ``state``, ``categories``, ``stars``
        and the column named by ``input_col_name``.
    :param input_col_name: name of the vector column to normalize and cluster.
    :param n: number of clusters passed to KMeans as ``k``.
    :return: cached DataFrame with the KMeans output columns plus ``pc``.
    """
    input_df = input_df.select('state', 'categories', 'stars', input_col_name)
    norm = Normalizer(inputCol=input_col_name, outputCol="features", p=1.0)
    df = norm.transform(input_df)
    kmeans = KMeans(k=n, seed=2)
    KMmodel = kmeans.fit(df)
    predicted = KMmodel.transform(df).cache()
    pca = PCA(k=2, inputCol='features', outputCol="pc")
    # The original fitted PCA on `dfsample`, a name this function never
    # defines, while the cached `predicted` frame went unused. PCA now runs on
    # `predicted`.
    # NOTE(review): if `dfsample` was meant to be a sample of `predicted`,
    # reinstate the sampling step here.
    return pca.fit(predicted).transform(predicted).cache()
开发者ID:sam46,项目名称:Yelper,代码行数:11,代码来源:project.py
示例4: test_kmeans_param
def test_kmeans_param(self):
    """Verify the default init mode and the k / initSteps setter round-trips."""
    estimator = KMeans()
    self.assertEqual(estimator.getInitMode(), "k-means||")

    # Each parameter is set and then read back through its getter.
    expected = 10
    estimator.setK(expected)
    self.assertEqual(estimator.getK(), expected)
    estimator.setInitSteps(expected)
    self.assertEqual(estimator.getInitSteps(), expected)
开发者ID:Bella-Lin,项目名称:spark,代码行数:7,代码来源:tests.py
示例5: test_kmeans_summary
def test_kmeans_summary(self):
    """Fit KMeans on four points and check the fields of its training summary."""
    rows = [(Vectors.dense(point),)
            for point in ([0.0, 0.0], [1.0, 1.0], [9.0, 8.0], [8.0, 9.0])]
    df = self.spark.createDataFrame(rows, ["features"])
    model = KMeans(k=2, seed=1).fit(df)
    self.assertTrue(model.hasSummary)

    summary = model.summary
    self.assertTrue(isinstance(summary.predictions, DataFrame))
    self.assertEqual(summary.featuresCol, "features")
    self.assertEqual(summary.predictionCol, "prediction")
    self.assertTrue(isinstance(summary.cluster, DataFrame))
    self.assertEqual(len(summary.clusterSizes), 2)
    self.assertEqual(summary.k, 2)
    self.assertEqual(summary.numIter, 1)
开发者ID:Brett-A,项目名称:spark,代码行数:15,代码来源:test_training_summary.py
示例6: test_kmean_pmml_basic
def test_kmean_pmml_basic(self):
    """Export a fitted KMeans model as PMML and check that text was written.

    The model is saved with ``format("pmml")`` into a temporary directory,
    read back as plain text, and checked for the ``"Apache Spark"`` and
    ``"PMML"`` markers.
    """
    import shutil

    # Most of the validation is done in the Scala side, here we just check
    # that we output text rather than parquet (e.g. that the format flag
    # was respected).
    data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
            (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
    df = self.spark.createDataFrame(data, ["features"])
    kmeans = KMeans(k=2, seed=1)
    model = kmeans.fit(df)
    path = tempfile.mkdtemp()
    try:
        km_path = path + "/km-pmml"
        model.write().format("pmml").save(km_path)
        pmml_text_list = self.sc.textFile(km_path).collect()
        pmml_text = "\n".join(pmml_text_list)
        self.assertIn("Apache Spark", pmml_text)
        self.assertIn("PMML", pmml_text)
    finally:
        # The original never removed the temporary directory. Cleanup is
        # best-effort so it cannot mask an assertion failure raised above.
        shutil.rmtree(path, ignore_errors=True)
开发者ID:Brett-A,项目名称:spark,代码行数:16,代码来源:test_persistence.py
示例7: kmeans
def kmeans(inputdir, df, alg, k):
    """Cluster ``df`` with KMeans, score the result, and write it out.

    :param inputdir: forwarded unchanged to ``writeOutClu``.
    :param df: DataFrame with a ``features`` column (used for fitting and
        scoring) and a ``labels`` column (written out next to the prediction).
    :param alg: forwarded unchanged to ``writeOutClu``.
    :param k: number of clusters; converted with ``int()`` before use.
    :return: whatever ``writeOutClu`` returns.
    """
    from pyspark.ml.clustering import KMeans
    from math import sqrt

    estimator = KMeans(k=int(k), seed=1, initSteps=5, tol=1e-4, maxIter=20,
                       initMode="k-means||", featuresCol="features")
    model = estimator.fit(df)

    # Build the transformed frame once and derive both projections from it.
    transformed = model.transform(df)
    kmFeatures = transformed.select("labels", "prediction")
    erFeatures = transformed.select("features", "prediction")

    # Evaluation
    # The centers do not change between rows, so fetch them once instead of
    # calling model.clusterCenters() on every loop iteration.
    centers = model.clusterCenters()
    # NOTE(review): each term is sqrt(sum of squares), i.e. a Euclidean
    # distance, although the printed label says "Squared Error". The value is
    # left as the original computed it.
    WSSSE = 0
    for features, prediction in erFeatures.collect():
        WSSSE += sqrt(sum([x**2 for x in (centers[prediction] - features)]))
    print("Within Set Sum of Squared Error = " + str(WSSSE))

    output_data = writeOutClu(inputdir, kmFeatures, alg, k, WSSSE)
    return output_data
开发者ID:eason001,项目名称:imPro,代码行数:17,代码来源:views.py
示例8: test_kmeans
def test_kmeans(self):
    """Save an unfitted KMeans estimator, reload it, and compare uid and defaults."""
    kmeans = KMeans(k=2, seed=1)
    path = tempfile.mkdtemp()
    try:
        km_path = path + "/km"
        kmeans.save(km_path)
        kmeans2 = KMeans.load(km_path)
        self.assertEqual(kmeans.uid, kmeans2.uid)
        self.assertEqual(type(kmeans.uid), type(kmeans2.uid))
        self.assertEqual(kmeans2.uid, kmeans2.k.parent,
                         "Loaded KMeans instance uid (%s) did not match Param's uid (%s)"
                         % (kmeans2.uid, kmeans2.k.parent))
        self.assertEqual(kmeans._defaultParamMap[kmeans.k], kmeans2._defaultParamMap[kmeans2.k],
                         "Loaded KMeans instance default params did not match " +
                         "original defaults")
    finally:
        # Cleanup now runs even when an assertion above fails; previously a
        # failing assertion skipped it and left the directory behind. Removal
        # stays best-effort, as in the original.
        try:
            rmtree(path)
        except OSError:
            pass
开发者ID:Brett-A,项目名称:spark,代码行数:18,代码来源:test_persistence.py
示例9: cluster
def cluster():
    """Build a tiny DataFrame, assemble features, and show KMeans predictions.

    Loads ``olangdict.json`` from below ``DATAP``, creates a local Spark
    session, builds a five-row DataFrame with an ``id`` column, runs a
    ``VectorAssembler`` followed by ``KMeans(k=2, seed=1)``, and shows the
    transformed frame.
    """
    # `with` closes the handle that the original left open. The path spells
    # out the second backslash that the invalid escape '\o' only kept by
    # accident; the resulting string is unchanged.
    with open(DATAP + '\\temp\\olangdict.json', 'r', encoding='UTF-8') as handle:
        ld = load(handle)  # NOTE(review): loaded but not used below
    spark = (SparkSession.builder
             .master("local")
             .appName("Word Count")
             .config("spark.some.config.option", "some-value")
             .getOrCreate())
    df = spark.createDataFrame([["0"], ["1"], ["2"], ["3"], ["4"]], ["id"])
    df.show()
    # NOTE(review): `df` only has an "id" column, yet the assembler reads
    # "feat1" and "feat2" -- confirm where those columns should come from.
    vecAssembler = VectorAssembler(inputCols=["feat1", "feat2"], outputCol="features")
    new_df = vecAssembler.transform(df)
    kmeans = KMeans(k=2, seed=1)  # 2 clusters here
    model = kmeans.fit(new_df.select('features'))
    transformed = model.transform(new_df)
    # show() prints the table itself and returns None, so wrapping it in
    # print() only appended a stray "None" line.
    transformed.show()
开发者ID:softlang,项目名称:wikionto,代码行数:24,代码来源:explore.py
示例10: SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.ml.clustering import KMeans
from pyspark import SparkContext
from pyspark.sql import SQLContext
# NOTE: the context setup below is commented out, so `sqlContext` has to be
# provided by the environment that runs this snippet.
# sc = SparkContext(appName="test")
# sqlContext = SQLContext(sc)

# Four two-dimensional points, one single-column row each.
data = [
    (Vectors.dense([0.0, 0.0]),),
    (Vectors.dense([1.0, 1.0]),),
    (Vectors.dense([9.0, 8.0]),),
    (Vectors.dense([8.0, 9.0]),),
]
df = sqlContext.createDataFrame(data, ["features"])

# Fit a two-cluster model, then read back its centers and per-row predictions.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(df)
centers = model.clusterCenters()
model.transform(df).select("features", "prediction").collect()
开发者ID:zjffdu,项目名称:hadoop-spark,代码行数:15,代码来源:kmeans.py
示例11: KMeans
# Read the by-day retail CSVs (header row, inferred schema), keep at most 50
# rows in a single partition, drop rows without a Description, and run the
# result through the `va` transformer.
sales = va.transform(spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/data/retail-data/by-day/*.csv")
    .limit(50)
    .coalesce(1)
    .where("Description IS NOT NULL"))

sales.cache()

# COMMAND ----------

from pyspark.ml.clustering import KMeans
km = KMeans().setK(5)
# print() call form: valid on Python 2 and 3, and consistent with the
# print(...) calls further down in this snippet.
print(km.explainParams())
kmModel = km.fit(sales)

# COMMAND ----------

summary = kmModel.summary
print(summary.clusterSizes) # number of points
kmModel.computeCost(sales)
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)
开发者ID:yehonatc,项目名称:Spark-The-Definitive-Guide,代码行数:28,代码来源:Advanced_Analytics_and_Machine_Learning-Chapter_29_Unsupervised_Learning.py
示例12: assign_cluster
def assign_cluster(data):
    """Fit a two-cluster KMeans on ``features_scaled`` and label ``data``.

    :param data: DataFrame holding a ``features_scaled`` column.
    :return: ``data`` transformed by the fitted model, with the cluster
        assignment stored in a ``label`` column.
    """
    estimator = KMeans(k=2, seed=1, featuresCol="features_scaled", predictionCol="label")
    return estimator.fit(data).transform(data)
开发者ID:datitran,项目名称:spark-tdd-example,代码行数:6,代码来源:clustering.py
示例13: KMeans
# $example off$
from pyspark.sql import SparkSession
if __name__ == "__main__":
spark = SparkSession\
.builder\
.appName("KMeansExample")\
.getOrCreate()
# $example on$
# Loads data.
dataset = spark.read.format("libsvm").load("data/mllib/sample_kmeans_data.txt")
# Trains a k-means model.
kmeans = KMeans().setK(2).setSeed(1)
model = kmeans.fit(dataset)
# Make predictions
predictions = model.transform(dataset)
# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))
# Shows the result.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
开发者ID:BaiBenny,项目名称:spark,代码行数:31,代码来源:kmeans_example.py
示例14: print
print(colStdDev)

# Place the means and std.dev values in a broadcast variable
bcMeans = sc.broadcast(colMeans)
bcStdDev = sc.broadcast(colStdDev)
csAuto = autoVector.map(centerAndScale)
#csAuto.collect()
#csAuto.foreach(println)
print(csAuto)

# Create Spark Data Frame
autoRows = csAuto.map(lambda f: Row(features=f))
# NOTE(review): createDataFrame is called on the name `SQLContext` itself;
# confirm that this name is bound to a context instance rather than the class.
autoDf = SQLContext.createDataFrame(autoRows)
autoDf.select("features").show(10)

# Cluster the scaled rows into three groups.
kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(autoDf)
predictions = model.transform(autoDf)
predictions.collect()
# NOTE(review): `println` is not defined in the visible code -- confirm it
# exists elsewhere in the file.
predictions.foreach(println)

# Plot the results in a scatter plot
# Map over the underlying RDD: DataFrame.map was removed in Spark 2.0, while
# the .rdd property is also available on the older releases.
unstripped = predictions.rdd.map(unstripData)
predList = unstripped.collect()
predPd = pd.DataFrame(predList)

# preparing to save the clustered data
list_current_gni_final_maped = current_gni_final_maped.collect()
list_current_gni_rdd = current_gni_rdd.collect()
list_predictions_pandas = predictions.toPandas()
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and exists in both old and new pandas releases.
list_predictions_temp = list_predictions_pandas.values
开发者ID:rzkhqq,项目名称:BigData4,代码行数:31,代码来源:current_gni.py
示例15: VectorAssembler
trainingData = VectorAssembler(
    inputCols=["duration", "tempo", "loudness"],
    outputCol="features",
).transform(table("songsTable"))

# COMMAND ----------

# MAGIC %md We can now pass this new DataFrame to the `KMeans` model and ask it to categorize different rows in our data to two different classes (`setK(2)`). We place the model in a variable named `model`.
# MAGIC
# MAGIC **Note:** This command launches multiple Spark jobs (one job per iteration in the KMeans algorithm). You will see the progress bar starting over and over again.

# COMMAND ----------

from pyspark.ml.clustering import KMeans
model = (
    KMeans()
    .setK(2)
    .fit(trainingData)
)

# COMMAND ----------

# MAGIC %md To see the result of our clustering, we produce a scatter plot matrix that shows interaction between input variables and learned clusters. To get that we apply the model on the original data and pick four columns: `prediction` and the original features (`duration`, `tempo`, and `loudness`).

# COMMAND ----------

transformed = (
    model.transform(trainingData)
    .select("duration", "tempo", "loudness", "prediction")
)
# COMMAND ----------
# MAGIC %md To comfortably visualize the data we produce a random sample.
# MAGIC Remember the `display()` function? We can use it to produce a nicely rendered table of transformed DataFrame.
# COMMAND ----------
开发者ID:,项目名称:,代码行数:30,代码来源:
示例16: test_kmeans_param
def test_kmeans_param(self):
    """Verify KMeans defaults and the k / initSteps / distanceMeasure setters."""
    estimator = KMeans()
    self.assertEqual(estimator.getInitMode(), "k-means||")

    # Each parameter is set and then read back through its getter.
    expected = 10
    estimator.setK(expected)
    self.assertEqual(estimator.getK(), expected)
    estimator.setInitSteps(expected)
    self.assertEqual(estimator.getInitSteps(), expected)

    # The distance measure is checked before and after it is overridden.
    self.assertEqual(estimator.getDistanceMeasure(), "euclidean")
    estimator.setDistanceMeasure("cosine")
    self.assertEqual(estimator.getDistanceMeasure(), "cosine")
开发者ID:Brett-A,项目名称:spark,代码行数:10,代码来源:test_param.py
示例17: Row
twDF = tw.map(lambda p: Row(text=p)).toDF()

## Train Word2Vec on the tweets and time it:
t0 = time.time()
word2Vec = Word2Vec(vectorSize=100, minCount=5, stepSize=0.025, inputCol="text", outputCol="result")
modelW2V = word2Vec.fit(twDF)
wordVectorsDF = modelW2V.getVectors()
timeW2V = time.time() - t0

## Train K-means on top of the Word2Vec matrix:
t0 = time.time()
vocabSize = wordVectorsDF.count()
K = int(math.floor(math.sqrt(float(vocabSize)/2)))
# K ~ sqrt(n/2) this is a rule of thumb for choosing K,
# where n is the number of words in the model
# feel free to choose K with a fancier algorithm
dfW2V = wordVectorsDF.select('vector').withColumnRenamed('vector','features')
kmeans = KMeans(k=K, seed=1)
modelK = kmeans.fit(dfW2V)
labelsDF = modelK.transform(dfW2V).select('prediction').withColumnRenamed('prediction','labels')
# vocabSize was already computed above from the same DataFrame; the second
# count() only added another Spark action to the timed section, so it is gone.
timeKmeans = time.time() - t0

sc.stop()

## Print Some Results
printResults = 1 # set t
if (printResults):
    ## Read Tweets
    # print() call form runs on both Python 2 and Python 3.
    print("="*80)
    print("Read Tweets...")
开发者ID:agilemobiledev,项目名称:w2v,代码行数:31,代码来源:w2vAndKmeans.py
示例18: display
# COMMAND ----------

display(transformed)

# COMMAND ----------

# MAGIC %md
# MAGIC #### K-Means Visualized

# COMMAND ----------

# Fit one model per iteration budget and record the cluster centers each
# budget produces.
modelCenters = []
iterations = [0, 2, 4, 7, 10, 20]
for i in iterations:
    kmeans = KMeans(k=3, seed=5, maxIter=i, initSteps=1)
    model = kmeans.fit(irisTwoFeatures)
    modelCenters.append(model.clusterCenters())

# COMMAND ----------

# print() call form: same output on Python 2, and valid syntax on Python 3.
print('modelCenters:')
for centroids in modelCenters:
    print(centroids)
# COMMAND ----------
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
开发者ID:smoltis,项目名称:spark,代码行数:29,代码来源:2-etl-kmeans_student.py
示例19: KMeans
# For now, analysis is still required. We cache the output because we are going to perform
# multiple runs on the dataset.
df0 = tfs.analyze(df).cache()
mllib_df.count()
df0.count()

# Seeded random starting centers, reused by both kmeanstf runs below.
np.random.seed(2)
init_centers = np.random.randn(k, num_features)
start_centers = init_centers
dataframe = df0

# Run A: KMeans estimator fitted on mllib_df.
ta_0 = time.time()
kmeans = (KMeans()
          .setK(k)
          .setSeed(1)
          .setFeaturesCol(FEATURES_COL)
          .setInitMode("random")
          .setMaxIter(num_iters))
mod = kmeans.fit(mllib_df)
ta_1 = time.time()

# Run B: kmeanstf with tf_aggregate=False.
tb_0 = time.time()
centers, agg_distances = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=False)
tb_1 = time.time()

# Run C: kmeanstf with tf_aggregate=True.
tc_0 = time.time()
centers, agg_distances = kmeanstf(df0, init_centers, num_iters=num_iters, tf_aggregate=True)
tc_1 = time.time()

# Elapsed wall-clock time of each run.
mllib_dt = ta_1 - ta_0
tf_dt = tb_1 - tb_0
tf2_dt = tc_1 - tc_0
开发者ID:databricks,项目名称:tensorframes,代码行数:31,代码来源:kmeans_demo.py
示例20: KMeans
# COMMAND ----------

fittedPipeline = transformationPipeline.fit(trainDataFrame)

# COMMAND ----------

transformedTraining = fittedPipeline.transform(trainDataFrame)

# COMMAND ----------

from pyspark.ml.clustering import KMeans
# The seed is a plain int literal: the Python 2 long suffix (`1L`) is a
# SyntaxError on Python 3, and `1` is accepted on both.
kmeans = KMeans()\
    .setK(20)\
    .setSeed(1)

# COMMAND ----------

kmModel = kmeans.fit(transformedTraining)

# COMMAND ----------

transformedTest = fittedPipeline.transform(testDataFrame)

# COMMAND ----------
开发者ID:yehonatc,项目名称:Spark-The-Definitive-Guide,代码行数:29,代码来源:A_Gentle_Introduction_to_Spark-Chapter_3_A_Tour_of_Sparks_Toolset.py
注:本文中的pyspark.ml.clustering.KMeans类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论