This article collects typical usage examples of the row_norms function from Python's sklearn.utils.extmath. If you have been wondering what exactly row_norms does, how to call it, or what it looks like in real code, the curated examples below should help.
Twenty code examples of row_norms are shown below, sorted by popularity by default.
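Before the examples, here is a minimal, self-contained sketch of what row_norms computes; the array values are illustrative:

import numpy as np
from scipy import sparse
from sklearn.utils.extmath import row_norms

X = np.array([[3.0, 4.0], [0.0, 5.0]])
print(row_norms(X))                  # [5. 5.]  -- L2 norm of each row
print(row_norms(X, squared=True))    # [25. 25.] -- squared norms, no sqrt taken
# Sparse input works too, without densifying the matrix:
print(row_norms(sparse.csr_matrix(X), squared=True))   # [25. 25.]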
Example 1: test_row_norms
def test_row_norms():
    X = np.random.RandomState(42).randn(100, 100)

    for dtype in (np.float32, np.float64):
        if dtype is np.float32:
            precision = 4
        else:
            precision = 5

        X = X.astype(dtype)
        sq_norm = (X ** 2).sum(axis=1)

        assert_array_almost_equal(sq_norm, row_norms(X, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

        for csr_index_dtype in [np.int32, np.int64]:
            Xcsr = sparse.csr_matrix(X, dtype=dtype)
            # csr_matrix will use int32 indices by default,
            # up-casting those to int64 when necessary
            if csr_index_dtype is np.int64:
                Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype)
                Xcsr.indices = Xcsr.indices.astype(csr_index_dtype)
            assert Xcsr.indices.dtype == csr_index_dtype
            assert Xcsr.indptr.dtype == csr_index_dtype

            assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
                                      precision)
            assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr),
                                      precision)
Author: BasilBeirouti | Project: scikit-learn | Lines: 28 | Source: test_extmath.py
Example 2: test_row_norms
def test_row_norms():
    X = np.random.RandomState(42).randn(100, 100)

    sq_norm = (X ** 2).sum(axis=1)
    assert_array_almost_equal(sq_norm, row_norms(X, squared=True), 5)
    assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X))

    Xcsr = sparse.csr_matrix(X, dtype=np.float32)
    assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), 5)
    assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr))
Author: 93sam | Project: scikit-learn | Lines: 10 | Source: test_extmath.py
Example 3: euclidean_distances
def euclidean_distances(X, Y=None):
    if Y is None:  # treat a missing Y as Y = X, matching the default
        Y = X
    YY = row_norms(Y, squared=True)[np.newaxis, :]
    if X is Y:  # shortcut in the common case euclidean_distances(X, X)
        XX = YY.T
    else:
        XX = row_norms(X, squared=True)[:, np.newaxis]
    distances = np.dot(X, Y.T)
    distances *= -2
    distances += XX
    distances += YY
    np.maximum(distances, 0, out=distances)
    return distances
Author: haoopeng | Project: MLAlgorithms | Lines: 14 | Source: different_kernels.py
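As a sanity check of the expansion ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2 used above, the sketch below (illustrative names, assuming the euclidean_distances defined above and its numpy/row_norms imports are in scope) compares it against a naive broadcast computation:

import numpy as np
from sklearn.utils.extmath import row_norms

rng = np.random.RandomState(0)
A = rng.randn(5, 3)
B = rng.randn(4, 3)

fast = euclidean_distances(A, B)   # squared distances via the expansion trick
# naive pairwise squared distances by broadcasting
naive = ((A[:, None, :] - B[None, :, :]) ** 2).sum(axis=-1)
assert np.allclose(fast, naive)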
Example 4: fit
def fit(self, X, y):
    """Fit factorization machine to training data.

    Parameters
    ----------
    X : array-like or sparse, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : Estimator
        Returns self.
    """
    if self.degree > 3:
        raise ValueError("FMs with degree >3 not yet supported.")

    X, y = self._check_X_y(X, y)
    X = self._augment(X)
    n_features = X.shape[1]  # augmented
    X_col_norms = row_norms(X.T, squared=True)
    dataset = get_dataset(X, order="fortran")
    rng = check_random_state(self.random_state)
    loss_obj = self._get_loss(self.loss)

    if not (self.warm_start and hasattr(self, 'w_')):
        self.w_ = np.zeros(n_features, dtype=np.double)

    if self.fit_lower == 'explicit':
        n_orders = self.degree - 1
    else:
        n_orders = 1

    if not (self.warm_start and hasattr(self, 'P_')):
        self.P_ = 0.01 * rng.randn(n_orders, self.n_components, n_features)

    if not (self.warm_start and hasattr(self, 'lams_')):
        if self.init_lambdas == 'ones':
            self.lams_ = np.ones(self.n_components)
        elif self.init_lambdas == 'random_signs':
            self.lams_ = np.sign(rng.randn(self.n_components))
        else:
            raise ValueError("Lambdas must be initialized as ones "
                             "(init_lambdas='ones') or as random "
                             "+/- 1 (init_lambdas='random_signs').")

    y_pred = self._get_output(X)

    converged = _cd_direct_ho(self.P_, self.w_, dataset, X_col_norms, y,
                              y_pred, self.lams_, self.degree, self.alpha,
                              self.beta, self.fit_linear,
                              self.fit_lower == 'explicit', loss_obj,
                              self.max_iter, self.tol, self.verbose)
    if not converged:
        warnings.warn("Objective did not converge. Increase max_iter.")

    return self
Author: vene | Project: polylearn | Lines: 60 | Source: factorization_machine.py
Example 5: test_get_auto_step_size
def test_get_auto_step_size():
    X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64)
    alpha = 1.2
    fit_intercept = False
    # sum the squares of the second sample because that's the largest
    max_squared_sum = 4 + 9 + 16
    max_squared_sum_ = row_norms(X, squared=True).max()
    assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4)

    for fit_intercept in (True, False):
        step_size_sqr = 1.0 / (max_squared_sum + alpha + int(fit_intercept))
        step_size_log = 4.0 / (max_squared_sum + 4.0 * alpha +
                               int(fit_intercept))
        step_size_sqr_ = get_auto_step_size(max_squared_sum_, alpha, "squared",
                                            fit_intercept)
        step_size_log_ = get_auto_step_size(max_squared_sum_, alpha, "log",
                                            fit_intercept)
        assert_almost_equal(step_size_sqr, step_size_sqr_, decimal=4)
        assert_almost_equal(step_size_log, step_size_log_, decimal=4)

    msg = 'Unknown loss function for SAG solver, got wrong instead of'
    assert_raise_message(ValueError, msg, get_auto_step_size,
                         max_squared_sum_, alpha, "wrong", fit_intercept)
Author: 1992huanghai | Project: scikit-learn | Lines: 25 | Source: test_sag.py
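For reference, the closed-form step sizes this test checks against can be written as follows, where L is the largest squared sample norm and delta is 1 when an intercept is fitted, 0 otherwise (notation mine):

\eta_{\text{squared}} = \frac{1}{L + \alpha + \delta},
\qquad
\eta_{\text{log}} = \frac{4}{L + 4\alpha + \delta},
\qquad
L = \max_i \lVert x_i \rVert_2^2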
Example 6: compute_distances
def compute_distances(self, x1, x2=None):
    """
    The method

    - extracts normalized continuous attributes and then uses `row_norms`
      and `safe_sparse_dot` to compute the distance as x^2 - 2xy + y^2
      (the trick from sklearn);
    - calls a function in Cython that adds the contributions of discrete
      columns
    """
    if self.normalize:
        x1 = x1 - self.means
        x1 /= np.sqrt(2 * self.vars)

    # adapted from sklearn.metrics.euclidean_distances
    xx = row_norms(x1.T, squared=True)[:, np.newaxis]
    distances = safe_sparse_dot(x1.T, x1, dense_output=True)
    distances *= -2
    distances += xx
    distances += xx.T
    with np.errstate(invalid="ignore"):  # NaNs are fixed below
        np.maximum(distances, 0, out=distances)
    distances.flat[::distances.shape[0] + 1] = 0.0

    fixer = _distance.fix_euclidean_cols_normalized if self.normalize \
        else _distance.fix_euclidean_cols
    fixer(distances, x1, self.means, self.vars)
    return np.sqrt(distances)
Author: acopar | Project: orange3 | Lines: 27 | Source: distance.py
Example 7: test_labels_assignment_and_inertia
def test_labels_assignment_and_inertia():
    # pure numpy implementation as easily auditable reference gold
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = -np.ones(n_samples, dtype=int)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert_true((mindist >= 0.0).all())
    assert_true((labels_gold != -1).all())

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
Author: Lavanya-Basavaraju | Project: scikit-learn | Lines: 29 | Source: test_k_means.py
Example 8: get_kpp_init
def get_kpp_init(X, n_clusters, random_state=None):
    random_state = check_random_state(random_state)
    x_squared_norms = row_norms(X, squared=True)
    # n_clusters x D
    centers = sklearn.cluster.k_means_._k_init(X, n_clusters,
                                               random_state=random_state,
                                               x_squared_norms=x_squared_norms)
    W = np.transpose(centers)  # D x D^(1)
    W_tf = tf.constant(W)
    return centers, W, W_tf
Author: brando90 | Project: hbf_tensorflow_code | Lines: 8 | Source: initializations.py
Example 9: _kmeans_spark
def _kmeans_spark(X, n_clusters, max_iter=300, worker_nums=10,
                  init='k-means++', random_state=None, tol=1e-4):
    from pyspark import SparkContext, SparkConf
    conf = SparkConf().setAppName('K-Means_Spark').setMaster('local[%d]' % worker_nums)
    sc = SparkContext(conf=conf)

    data = sc.parallelize(X)
    data.cache()

    random_state = check_random_state(random_state)
    best_labels, best_inertia, best_centers = None, None, None

    x_squared_norms = row_norms(X, squared=True)
    # x_squared_norms = data.map(lambda x: (x*x).sum(axis=0)).collect()
    # x_squared_norms = np.array(x_squared_norms, dtype='float64')

    centers = _init_centroids(X, n_clusters, init, random_state,
                              x_squared_norms=x_squared_norms)

    # Split the rows into one chunk per worker; integer division keeps the
    # slice bounds valid indices.
    bs = X.shape[0] // worker_nums
    data_temp = []
    for i in range(worker_nums - 1):
        data_temp.append(X[i * bs:(i + 1) * bs])
    data_temp.append(X[(worker_nums - 1) * bs:])
    data_temp = np.array(data_temp, dtype='float64')
    data_temp = sc.parallelize(data_temp)
    data_temp.cache()

    for i in range(max_iter):
        centers_old = centers.copy()

        all_distances = data_temp.map(
            lambda x: euclidean_distances(centers, x, squared=True)).collect()
        temp_all_distances = all_distances[0]
        for j in range(1, worker_nums):  # j, so the iteration counter i survives
            temp_all_distances = np.hstack((temp_all_distances, all_distances[j]))
        all_distances = temp_all_distances

        # all_distances = data.map(lambda x: euclidean_distances(centers, x, squared=True)).collect()
        # # reshape, from (1, n_samples, k) to (k, n_samples)
        # all_distances = np.asarray(all_distances, dtype="float64").T[0]

        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers,
                                          all_distances=all_distances)
        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    return best_centers, best_labels, best_inertia
Author: cyh24 | Project: PySparkML | Lines: 56 | Source: k_means_.py
Example 10: test_row_norms
def test_row_norms():
    X = np.random.RandomState(42).randn(100, 100)

    for dtype in (np.float32, np.float64):
        if dtype is np.float32:
            precision = 4
        else:
            precision = 5

        X = X.astype(dtype)
        sq_norm = (X ** 2).sum(axis=1)

        assert_array_almost_equal(sq_norm, row_norms(X, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

        Xcsr = sparse.csr_matrix(X, dtype=dtype)
        assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True),
                                  precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), precision)
Author: antoinewdg | Project: scikit-learn | Lines: 19 | Source: test_extmath.py
Example 11: get_auto_step_size
def get_auto_step_size(X, alpha, loss, gamma=None, sample_weight=None):
    """Compute automatic step size for SAG solver.

    Step size computed using the following objective:

        minimize_w 1 / n_samples * \sum_i loss(w^T x_i, y_i)
                   + alpha * 0.5 * ||w||^2_2

    Parameters
    ----------
    X : ndarray
        Array of samples x_i.

    alpha : float
        Constant that multiplies the l2 penalty term.

    loss : string, in {"log", "squared"}
        The loss function used in SAG solver.

    Returns
    -------
    step_size : float
        Step size used in SAG/SAGA solver.
    """
    if sample_weight is None:
        weighted_norms = row_norms(X, squared=True)
    else:
        weighted_norms = sample_weight * row_norms(X, squared=True)
    L = np.max(weighted_norms)

    n_samples = X.shape[0]

    if loss == 'log':
        # Lipschitz constant for the log loss; the step size is its inverse
        lipschitz_constant = 0.25 * L + alpha
    elif loss == 'squared':
        lipschitz_constant = L + alpha
    elif loss == 'modified_huber':
        lipschitz_constant = 2 * L + alpha
    elif loss == 'smooth_hinge':
        lipschitz_constant = L + gamma + alpha
    elif loss == 'squared_hinge':
        lipschitz_constant = 2 * L + alpha
    else:
        raise ValueError("`auto` stepsize is only available for `squared` or "
                         "`log` losses (got `%s` loss). Please specify a "
                         "stepsize." % loss)
    return 1.0 / lipschitz_constant
Author: casotto | Project: lightning | Lines: 42 | Source: sag.py
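A minimal usage sketch, assuming the lightning-style function above is in scope (note it takes X directly, unlike sklearn's own get_auto_step_size, which takes the maximum squared norm instead); the array values are illustrative:

import numpy as np
from sklearn.utils.extmath import row_norms

X = np.array([[1., 2.], [3., 4.]])
alpha = 0.5

# For squared loss the returned step size is 1 / (max_i ||x_i||^2 + alpha).
eta = get_auto_step_size(X, alpha, 'squared')
L = row_norms(X, squared=True).max()   # 9 + 16 = 25
assert np.isclose(eta, 1.0 / (L + alpha))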
Example 12: prepare_data
def prepare_data(x):
    if self.discrete.any():
        data = Cosine.discrete_to_indicators(x, self.discrete)
    else:
        data = x.copy()
    for col, mean in enumerate(self.means):
        column = data[:, col]
        column[np.isnan(column)] = mean
    if self.axis == 0:
        data = data.T
    data /= row_norms(data)[:, np.newaxis]
    return data
Author: acopar | Project: orange3 | Lines: 12 | Source: distance.py
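A quick illustration of the final normalization step above (values illustrative): dividing by row_norms(data)[:, np.newaxis] rescales every row to unit L2 norm, which is what lets cosine distances be computed from plain dot products afterwards:

import numpy as np
from sklearn.utils.extmath import row_norms

data = np.array([[3.0, 4.0], [1.0, 1.0]])
data /= row_norms(data)[:, np.newaxis]
print(row_norms(data))   # [1. 1.] -- every row now has unit norm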
Example 13: fit
def fit(self, X):
    x_squared_norms = row_norms(X, squared=True)
    rng = np.random.RandomState(self.random_state)

    if self.init == "kmeans++":
        # Private function of sklearn.cluster.k_means_, to get the
        # initial centers.
        init_centers = _k_init(X, self.n_clusters, x_squared_norms, rng)
    elif self.init == "random":
        # Sample row indices in [0, n_samples).
        random_samples = rng.randint(0, X.shape[0], size=self.n_clusters)
        init_centers = X[random_samples, :]
    else:
        raise ValueError("init should be either kmeans++ or random")

    # Assign initial labels. skip norm of x**2
    init_distances = (np.sum(init_centers ** 2, axis=1) -
                      2 * np.dot(X, init_centers.T))
    init_labels = np.argmin(init_distances, axis=1)
    self.labels_ = init_labels
    self.centers_ = init_centers
    self.n_samples_ = np.zeros(self.n_clusters)

    # Count the number of samples in each cluster.
    for i in range(self.n_clusters):
        self.n_samples_[i] = np.sum(self.labels_ == i)

    for i, (sample, label) in enumerate(zip(X, self.labels_)):
        curr_label = label
        max_cost = np.inf
        while max_cost > 0:
            distances = (x_squared_norms[i]
                         - 2 * np.dot(sample, self.centers_.T)
                         + np.sum(self.centers_ ** 2, axis=1))
            curr_distance = distances[curr_label]
            other_distance = np.delete(distances, curr_label)
            curr_n_samples = self.n_samples_[curr_label]
            other_n_samples = np.delete(self.n_samples_, curr_label)
            cost = ((curr_n_samples / (curr_n_samples - 1)) * curr_distance -
                    (other_n_samples / (other_n_samples + 1)) * other_distance)
            max_cost_ind = np.argmax(cost)
            max_cost = cost[max_cost_ind]
            if max_cost > 0:
                # We deleted the label index from other_n_samples, so
                # indices at or past curr_label shift back by one.
                if max_cost_ind >= curr_label:
                    max_cost_ind += 1
                # Reassign the clusters
                self.labels_[i] = max_cost_ind
                self.centers_[curr_label] = (
                    curr_n_samples * self.centers_[curr_label] - sample
                ) / (curr_n_samples - 1)
                moved_n_samples = self.n_samples_[max_cost_ind]
                self.centers_[max_cost_ind] = (
                    moved_n_samples * self.centers_[max_cost_ind] + sample
                ) / (moved_n_samples + 1)
                self.n_samples_[curr_label] -= 1
                self.n_samples_[max_cost_ind] += 1
                curr_label = max_cost_ind
Author: MechCoder | Project: Hartigan | Lines: 53 | Source: naive_hartigan.py
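The cost vector in the loop above appears to implement Hartigan's reassignment rule: a sample x is moved from its current cluster c (with n_c members and mean mu_c) to the cluster k (n_k members, mean mu_k) maximizing the gain below, whenever that gain is positive (LaTeX notation mine, inferred from the code):

\Delta(x, c \to k) = \frac{n_c}{n_c - 1}\,\lVert x - \mu_c \rVert^2
                   - \frac{n_k}{n_k + 1}\,\lVert x - \mu_k \rVert^2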
Example 14: kmeans_subsample
def kmeans_subsample(X, n_clusters, random_state=None, n_local_trials=10):
    random_state = check_random_state(random_state)
    n_samples, n_features = X.shape
    x_squared_norms = row_norms(X, squared=True)
    centers = np.empty((n_clusters, n_features))

    # Pick first center randomly
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id]

    # Initialize list of closest distances and calculate current potential
    closest_dist_sq = euclidean_distances(
        centers[0].reshape(1, -1), X, Y_norm_squared=x_squared_norms,
        squared=True)
    current_pot = closest_dist_sq.sum()

    # Pick the remaining n_clusters-1 points
    for c in range(1, n_clusters):
        # Choose center candidates by sampling with probability proportional
        # to the squared distance to the closest existing center
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(closest_dist_sq.cumsum(), rand_vals)

        # Compute distances to center candidates
        distance_to_candidates = euclidean_distances(
            X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)

        # Decide which candidate is the best
        best_candidate = None
        best_pot = None
        best_dist_sq = None
        for trial in range(n_local_trials):
            # Compute potential when including center candidate
            new_dist_sq = np.minimum(closest_dist_sq,
                                     distance_to_candidates[trial])
            new_pot = new_dist_sq.sum()

            # Store result if it is the best local trial so far
            if (best_candidate is None) or (new_pot < best_pot):
                best_candidate = candidate_ids[trial]
                best_pot = new_pot
                best_dist_sq = new_dist_sq

        # Permanently add best center candidate found in local tries
        centers[c] = X[best_candidate]
        current_pot = best_pot
        closest_dist_sq = best_dist_sq

    return centers
Author: eiriniar | Project: CellCnn | Lines: 47 | Source: downsample.py
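For reference, the candidate-sampling step above is the standard k-means++ D^2 weighting: each new center is drawn with probability proportional to the squared distance to the nearest already-chosen center (notation mine):

P(x_i) = \frac{D(x_i)^2}{\sum_j D(x_j)^2},
\qquad
D(x) = \min_{c \in \mathcal{C}} \lVert x - c \rVert_2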
Example 15: kmeanspp
def kmeanspp(X, k, seed):
    # That we need to do this is a bug in _init_centroids
    x_squared_norms = row_norms(X, squared=True)
    # Use k-means++ to initialise the centroids
    centroids = _init_centroids(X, k, 'k-means++', random_state=seed,
                                x_squared_norms=x_squared_norms)
    # OK, we should just short-circuit and get these from k-means++...
    # quick and dirty solution
    nns = NearestNeighbors()
    nns.fit(X)
    centroid_candidatess = nns.radius_neighbors(X=centroids, radius=0,
                                                return_distance=False)
    # Account for "degenerate" solutions: several voxels at distance 0,
    # each becoming a centroid
    centroids = set()
    for centroid_candidates in centroid_candidatess:
        centroid_candidates = set(centroid_candidates) - centroids
        if len(centroid_candidates) == 0:
            raise Exception('Cannot get an unambiguous set of centers; '
                            'theoretically this cannot happen, so check for bugs')
        centroids.add(centroid_candidates.pop())
    return np.array(sorted(centroids))
Author: strawlab | Project: braincode | Lines: 19 | Source: stability.py
Example 16: _init_centroids
def _init_centroids(X, k, init, random_state, x_squared_norms=None):
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)

    if n_samples < k:
        raise ValueError("n_samples=%d should be larger than k=%d" % (n_samples, k))

    if init == 'k-means++':
        centers = _k_init(X, k, random_state=random_state,
                          x_squared_norms=x_squared_norms)
    elif init == 'random':
        seeds = random_state.permutation(n_samples)[:k]
        centers = X[seeds]
    else:
        # Guard against silently returning an undefined `centers`.
        raise ValueError("init should be 'k-means++' or 'random', got %r" % init)
    return centers
Author: cyh24 | Project: PySparkML | Lines: 19 | Source: k_means_.py
Example 17: test_get_auto_step_size
def test_get_auto_step_size():
    X = np.array([[1, 2, 3], [2, 3, 4], [2, 3, 2]], dtype=np.float64)
    alpha = 1.2
    fit_intercept = False
    # sum the squares of the second sample because that's the largest
    max_squared_sum = 4 + 9 + 16
    max_squared_sum_ = row_norms(X, squared=True).max()
    n_samples = X.shape[0]
    assert_almost_equal(max_squared_sum, max_squared_sum_, decimal=4)

    for saga in [True, False]:
        for fit_intercept in (True, False):
            if saga:
                L_sqr = (max_squared_sum + alpha + int(fit_intercept))
                L_log = (max_squared_sum + 4.0 * alpha +
                         int(fit_intercept)) / 4.0
                mun_sqr = min(2 * n_samples * alpha, L_sqr)
                mun_log = min(2 * n_samples * alpha, L_log)
                step_size_sqr = 1 / (2 * L_sqr + mun_sqr)
                step_size_log = 1 / (2 * L_log + mun_log)
            else:
                step_size_sqr = 1.0 / (max_squared_sum +
                                       alpha + int(fit_intercept))
                step_size_log = 4.0 / (max_squared_sum + 4.0 * alpha +
                                       int(fit_intercept))

            step_size_sqr_ = get_auto_step_size(max_squared_sum_, alpha,
                                                "squared",
                                                fit_intercept,
                                                n_samples=n_samples,
                                                is_saga=saga)
            step_size_log_ = get_auto_step_size(max_squared_sum_, alpha, "log",
                                                fit_intercept,
                                                n_samples=n_samples,
                                                is_saga=saga)
            assert_almost_equal(step_size_sqr, step_size_sqr_, decimal=4)
            assert_almost_equal(step_size_log, step_size_log_, decimal=4)

    msg = 'Unknown loss function for SAG solver, got wrong instead of'
    assert_raise_message(ValueError, msg, get_auto_step_size,
                         max_squared_sum_, alpha, "wrong", fit_intercept)
Author: AlexisMignon | Project: scikit-learn | Lines: 42 | Source: test_sag.py
Example 18: predict
def predict(self, X):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    # check_is_fitted(self, 'cluster_centers_')
    X = self._check_test_data(X)
    x_squared_norms = row_norms(X, squared=True)
    return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]
Author: AnilSener | Project: semiKmeans | Lines: 22 | Source: semiKMeans.py
Example 19: run_step
def run_step(self, run_number, step_size, howlong):
    df_slot = self.get_input_slot('df')
    df_slot.update(run_number, buffer_created=True, buffer_updated=True)
    if df_slot.has_deleted():
        self.reset()
        df_slot.reset()
        df_slot.update(run_number)
    input_df = df_slot.data()
    columns = self.get_columns(input_df)
    if input_df is None or len(input_df) == 0:
        return self._return_run_step(self.state_blocked, steps_run=0)

    indices = df_slot.next_created(step_size)
    steps = indices_len(indices)
    step_size -= steps
    steps_run = steps
    if steps != 0:
        indices = fix_loc(indices)
        self._buffer.append(input_df.loc[indices])
        self._df = self._buffer.df()
        self._df.loc[indices, self.UPDATE_COLUMN] = run_number

    if step_size > 0 and df_slot.has_updated():
        indices = df_slot.next_updated(step_size, as_slice=False)
        steps = indices_len(indices)
        if steps != 0:
            steps_run += steps
            indices = fix_loc(indices)  # no need, but stick to the stereotype
            updated = self.filter_columns(input_df, indices)
            df = self.filter_columns(self._df, indices)
            norms = row_norms(updated - df)
            selected = (norms > (self._delta * self.get_scale()))
            indices = indices[selected]
            if selected.any():
                logger.debug('updating at %d', run_number)
                self._df.loc[indices, self._columns] = updated.loc[indices, self._columns]
                self._df.loc[indices, self.UPDATE_COLUMN] = run_number
            else:
                logger.debug('Not updating at %d', run_number)

    return self._return_run_step(df_slot.next_state(), steps_run=steps_run)
Author: jdfekete | Project: progressivis | Lines: 38 | Source: select_delta.py
Example 20: _kmeans_single
def _kmeans_single(X, n_clusters, max_iter=300, init='k-means++',
                   random_state=None, tol=1e-4):
    random_state = check_random_state(random_state)
    best_labels, best_inertia, best_centers = None, None, None

    # init
    x_squared_norms = row_norms(X, squared=True)
    centers = _init_centroids(X, n_clusters, init, random_state,
                              x_squared_norms=x_squared_norms)
    # distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # Assignment, also called E-step of EM
        labels, inertia = _labels_inertia(X, x_squared_norms, centers)
        # re-computation of the centroids, also called M-step of EM
        centers = _centers(X, labels, n_clusters)

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers)

    return best_centers, best_labels, best_inertia
Author: cyh24 | Project: PySparkML | Lines: 37 | Source: k_means_.py
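The stopping rule above declares convergence once the centers stop moving appreciably between iterations; with squared_norm computing a squared Frobenius norm, the criterion is (notation mine):

\lVert C_{\text{old}} - C_{\text{new}} \rVert_F^2 \le \text{tol}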
Note: the sklearn.utils.extmath.row_norms examples in this article were compiled by 纯净天空 from source-code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various developers; copyright of the source code remains with the original authors. Consult each project's license before redistributing or using the code; do not republish without permission.