This article collects typical usage examples of the Python function sklearn.metrics.pairwise.pairwise_distances. If you have been wondering what exactly pairwise_distances does, how to call it, or what real-world uses look like, the hand-picked code samples below should help.
Below are 20 code examples of pairwise_distances, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help our system recommend better Python code samples.
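Before the project examples, here is a minimal sketch of the basic API, written for this article rather than taken from any project below: pairwise_distances(X) returns the square distance matrix over the rows of X; passing a second array Y yields the rectangular X-versus-Y matrix; and metric='precomputed' passes an existing square distance matrix through unchanged.

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

X = np.random.RandomState(0).rand(4, 3)   # 4 samples, 3 features (illustrative data)
Y = np.random.RandomState(1).rand(2, 3)   # 2 further samples

D_xx = pairwise_distances(X, metric='euclidean')        # shape (4, 4), zero diagonal
D_xy = pairwise_distances(X, Y, metric='cosine')        # shape (4, 2); cosine distance = 1 - cosine similarity
D_pre = pairwise_distances(D_xx, metric='precomputed')  # returned as-is; input must be square

print(D_xx.shape, D_xy.shape, np.allclose(D_pre, D_xx))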
Example 1: test_kneighbors_regressor_sparse
def test_kneighbors_regressor_sparse(n_samples=40,
                                     n_features=5,
                                     n_test_pts=10,
                                     n_neighbors=5,
                                     random_state=0):
    # Test k-nearest neighbors regression on sparse matrices
    # Like the dense test above, but with various types of sparse matrices
    rng = np.random.RandomState(random_state)
    X = 2 * rng.rand(n_samples, n_features) - 1
    y = ((X ** 2).sum(axis=1) < .25).astype(np.int)
    for sparsemat in SPARSE_TYPES:
        knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                            algorithm='auto')
        knn.fit(sparsemat(X), y)
        knn_pre = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                                metric='precomputed')
        knn_pre.fit(pairwise_distances(X, metric='euclidean'), y)
        for sparsev in SPARSE_OR_DENSE:
            X2 = sparsev(X)
            assert_true(np.mean(knn.predict(X2).round() == y) > 0.95)
            X2_pre = sparsev(pairwise_distances(X, metric='euclidean'))
            if issparse(sparsev(X2_pre)):
                assert_raises(ValueError, knn_pre.predict, X2_pre)
            else:
                assert_true(
                    np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95)
Author: BasilBeirouti, Project: scikit-learn, Lines: 30, Source: test_neighbors.py
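Example 1 asserts that predict raises ValueError when a sparse matrix is passed to a regressor fitted with metric='precomputed'. Here is a self-contained sketch of that contract with made-up toy data; note that whether sparse precomputed input is rejected or accepted depends on the scikit-learn version, so the snippet handles both outcomes:

import numpy as np
from scipy.sparse import csr_matrix
from sklearn import neighbors
from sklearn.metrics.pairwise import pairwise_distances

X = np.random.RandomState(0).rand(20, 3)      # illustrative data
y = (X.sum(axis=1) > 1.5).astype(int)
D = pairwise_distances(X, metric='euclidean')

knn_pre = neighbors.KNeighborsRegressor(n_neighbors=3, metric='precomputed')
knn_pre.fit(D, y)
print(knn_pre.predict(D)[:5])  # dense precomputed distances work

try:
    # behavior here is version-dependent in scikit-learn
    knn_pre.predict(csr_matrix(D))
    print('this scikit-learn version accepts sparse precomputed input')
except ValueError as exc:
    print('sparse precomputed input rejected:', exc)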
Example 2: make_rbf
def make_rbf(x, sigma, metric='euclidean', x2=None):
    if x.ndim == 1:
        x = np.expand_dims(x, 1)
    if x2 is None:
        x2 = x
    if metric == 'cosine':
        # This code may be faster for some matrices
        # Code from http://stackoverflow.com/questions/17627219/whats-the-fastest-way-in-python-to-calculate-cosine-similarity-given-sparse-mat
        '''
        tic()
        #x = x.toarray()
        #similarity = np.dot(x, x.T)
        similarity = (x.dot(x.T)).toarray()
        square_mag = np.diag(similarity)
        inv_square_mag = 1 / square_mag
        inv_square_mag[np.isinf(inv_square_mag)] = 0
        inv_mag = np.sqrt(inv_square_mag)
        W = similarity * inv_mag
        W = W.T * inv_mag
        W = 1 - W
        toc()
        tic()
        W2 = pairwise.pairwise_distances(x, x, metric)
        toc()
        '''
        W = pairwise.pairwise_distances(x, x2, metric)
    else:
        #tic()
        W = pairwise.pairwise_distances(x, x2, metric)
        #toc()
    W = np.square(W)
    W = -sigma * W
    W = np.exp(W)
    return W
Author: adgress, Project: PythonFramework, Lines: 34, Source: array_functions.py
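For metric='euclidean', make_rbf computes the Gaussian (RBF) affinity W_ij = exp(-sigma * d(x_i, x_j)^2). A hedged equivalence check, with illustrative data; sklearn's rbf_kernel with gamma=sigma computes the same quantity:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances, rbf_kernel

x = np.random.RandomState(0).rand(5, 2)  # illustrative data
sigma = 2.0

# what make_rbf(x, sigma) computes for the Euclidean metric
W = np.exp(-sigma * pairwise_distances(x, x) ** 2)

# rbf_kernel evaluates exp(-gamma * ||a - b||^2) directly
assert np.allclose(W, rbf_kernel(x, gamma=sigma))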
Example 3: bipartite_clustering
def bipartite_clustering(D2W, word_cluster_num, doc_cluster_num, metric, criteria):
    W2D = D2W.transpose()
    W2WC = kmean(W2D, word_cluster_num, criteria)
    #word_cluster_num = np.amax(W2WC) + 1
    #print "wc:", word_cluster_num
    for loop in range(4):
        #D2WC = D2W.dot(transform_from_index_array(W2WC, W2WC.size, word_cluster_num))
        #print D2WC
        #print loop
        new_centroids = get_new_centroids(W2D, W2WC)
        new_distance_matrix = pairwise_distances(W2D, new_centroids, metric=metric)  # how to calculate distance? maybe 1-matrix?
        #print new_distance_matrix
        D2WC = D2W.dot(new_distance_matrix)
        if loop == 0:
            D2DC = kmean(D2WC, doc_cluster_num, criteria)
        else:
            new_centroids = get_new_centroids(D2WC, D2DC)
            D2DC = kmean(D2WC, doc_cluster_num, criteria, new_centroids)
        #doc_cluster_num = np.amax(D2DC) + 1
        #print "dc:", doc_cluster_num
        new_centroids = get_new_centroids(D2W, D2DC)
        new_distance_matrix = pairwise_distances(D2W, new_centroids, metric=metric)
        W2DC = W2D.dot(new_distance_matrix)
        new_centroids = get_new_centroids(W2DC, W2WC)
        W2WC = kmean(W2DC, word_cluster_num, criteria, new_centroids)
        #word_cluster_num = np.amax(W2WC) + 1
        #print "wc:", word_cluster_num
    return D2DC, W2WC
Author: Smuzi, Project: Text-Mining, Lines: 28, Source: bpc.py
Example 4: generate_dist_stats_feat
def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test, indices_dict):
    ## stats parameters
    quantiles_range = np.arange(0, 1.5, 0.5)
    stats_func = [np.mean, np.std]
    stats_feat_num = len(quantiles_range) + len(stats_func)
    n_class_relevance = 13
    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test), stats_feat_num * n_class_relevance), dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test), stats_feat_num * n_class_relevance), dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    print("pairwise_distances generated!")
    for i in range(len(ids_test)):
        id = ids_test[i]
        for j in range(n_class_relevance):
            key = j
            if key in indices_dict:
                inds = indices_dict[key]
                # exclude this sample itself from the list of indices
                inds = [ind for ind in inds if id != ids_train[ind]]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [func(sim_tmp) for func in stats_func]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i, j * stats_feat_num:(j + 1) * stats_feat_num] = feat
    return stats_feat
Author: aaxwaz, Project: Kaggle_HomeDepot_Stacking, Lines: 32, Source: utils.py
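The cosine branch above relies on the identity cosine similarity = 1 - cosine distance. A quick sanity check of that identity on illustrative data:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity

X = np.random.RandomState(0).rand(3, 4)  # illustrative data
assert np.allclose(1. - pairwise_distances(X, metric='cosine'),
                   cosine_similarity(X))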
Example 5: dunn
def dunn(max_nc, all_labels, dataset):
    dunn = []
    print "DUNN (MAX)..."
    for nc in xrange(2, max_nc + 1):
        dn = 0.0
        max_intra = 0.0
        for cluster_i in xrange(nc):
            instances_i = dataset[np.where(all_labels[nc - 2] == cluster_i)[0]]
            pairwise_matrix_intra = pairwise_distances(instances_i, n_jobs=1)
            new_max_intra = np.amax(pairwise_matrix_intra)
            if new_max_intra > max_intra:
                max_intra = new_max_intra
        for cluster_i in xrange(nc):
            instances_i = dataset[np.where(all_labels[nc - 2] == cluster_i)[0]]
            for cluster_j in xrange(nc):
                if cluster_j > cluster_i:
                    instances_j = dataset[np.where(all_labels[nc - 2] == cluster_j)[0]]
                    pairwise_matrix_inter = pairwise_distances(instances_i, instances_j, n_jobs=1)
                    min_inter = np.amin(pairwise_matrix_inter)
                    if dn == 0.0:
                        dn = min_inter / max_intra
                    elif min_inter / max_intra < dn:
                        dn = min_inter / max_intra
        print 'DUNN for k = ' + str(nc) + ' is ' + str(dn) + ' ...'
        dunn += [dn]
    return dunn
Author: nielsenrechia, Project: clustering-measurements, Lines: 27, Source: measurements.py
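In formula form, what the loops above compute is the Dunn index (a LaTeX transcription of the code, where delta is the single-linkage inter-cluster distance and Delta the cluster diameter):

D = \frac{\min_{i \neq j} \delta(C_i, C_j)}{\max_k \Delta(C_k)}, \qquad
\delta(C_i, C_j) = \min_{x \in C_i,\, y \in C_j} d(x, y), \qquad
\Delta(C_k) = \max_{x, y \in C_k} d(x, y)

Larger values indicate compact, well-separated clusterings.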
Example 6: pairwise_distances
def pairwise_distances(X, Y=None, index=None, metric="euclidean"):
    '''
    Compute the distance matrix from a vector array X and optional Y.

    This method takes either a vector array or a distance matrix,
    and returns a distance matrix. If the input is a vector array,
    the distances are computed. If the input is a distance matrix,
    it is returned instead.
    This method provides a safe way to take a distance matrix as input,
    while preserving compatibility with many other algorithms that take
    a vector array.

    :param X: array [n_samples_a, n_samples_a]
        Array of pairwise distances between samples, or a feature array.
    :param Y: array [n_samples_b, n_features]
        A second feature array, only if X has shape [n_samples_a, n_features].
    :param index: int, the index of an element in the X array
    :param metric: The metric to use when calculating distances between
        instances in a feature array. If metric='rmsd', it is computed by MDTraj.
    :return: The distances
    '''
    if metric == "rmsd":
        if Y is None:
            distances_ = md.rmsd(X, X, index, parallel=True, precentered=True)
        else:
            #distances_ = np.empty((len(X), len(Y)), dtype=np.float32)
            #for i in xrange(len(Y)):
            distances_ = md.rmsd(X, Y, index, parallel=True, precentered=True)
        return distances_
    else:
        if Y is None:
            print "if Y is None"
            return sp.pairwise_distances(X, X[index], metric=metric)
        if index is None:
            print "if index is None, pairwise XX"
            return sp.pairwise_distances(X, X, metric=metric)
Author: liusong299, Project: HK_DataMiner, Lines: 35, Source: pairwise.py
Example 7: generate_dist_stats_feat
def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test, indices_dict, qids_test=None):
    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test), stats_feat_num * n_classes), dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test), stats_feat_num * n_classes), dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    for i in range(len(ids_test)):
        id = ids_test[i]
        if qids_test is not None:
            qid = qids_test[i]
        for j in range(n_classes):
            key = (qid, j + 1) if qids_test is not None else j + 1
            if indices_dict.has_key(key):
                inds = indices_dict[key]
                # exclude this sample itself from the list of indices
                inds = [ind for ind in inds if id != ids_train[ind]]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [func(sim_tmp) for func in stats_func]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i, j * stats_feat_num:(j + 1) * stats_feat_num] = feat
    return stats_feat
Author: venkataravuri, Project: kaggle_homedepot, Lines: 27, Source: feature_extract_basic_tiidf.py
Example 8: test_no_data_conversion_warning
def test_no_data_conversion_warning():
    # No warnings issued if metric is not a boolean distance function
    rng = np.random.RandomState(0)
    X = rng.randn(5, 4)
    with pytest.warns(None) as records:
        pairwise_distances(X, metric="minkowski")
    assert len(records) == 0
Author: scikit-learn, Project: scikit-learn, Lines: 7, Source: test_pairwise.py
Example 9: trustworthiness
def trustworthiness(X, X_embedded, n_neighbors=5, precomputed=False):
    """Expresses to what extent the local structure is retained.

    The trustworthiness is within [0, 1]. It is defined as

    .. math::

        T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1}
            \sum_{j \in U^{(k)}_i} (r(i, j) - k)

    where :math:`r(i, j)` is the rank of the embedded datapoint j
    according to the pairwise distances between the embedded datapoints,
    and :math:`U^{(k)}_i` is the set of points that are among the k nearest
    neighbors in the embedded space but not in the original space.

    * "Neighborhood Preservation in Nonlinear Projection Methods: An
      Experimental Study"
      J. Venna, S. Kaski
    * "Learning a Parametric Embedding by Preserving Local Structure"
      L.J.P. van der Maaten

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance
        matrix. Otherwise it contains a sample per row.
    X_embedded : array, shape (n_samples, n_components)
        Embedding of the training data in low-dimensional space.
    n_neighbors : int, optional (default: 5)
        Number of neighbors k that will be considered.
    precomputed : bool, optional (default: False)
        Set this flag if X is a precomputed square distance matrix.

    Returns
    -------
    trustworthiness : float
        Trustworthiness of the low-dimensional embedding.
    """
    if precomputed:
        dist_X = X
    else:
        dist_X = pairwise_distances(X, squared=True)
    dist_X_embedded = pairwise_distances(X_embedded, squared=True)
    ind_X = np.argsort(dist_X, axis=1)
    ind_X_embedded = np.argsort(dist_X_embedded, axis=1)[:, 1:n_neighbors + 1]
    n_samples = X.shape[0]
    t = 0.0
    ranks = np.zeros(n_neighbors)
    for i in range(n_samples):
        for j in range(n_neighbors):
            ranks[j] = np.where(ind_X[i] == ind_X_embedded[i, j])[0][0]
        ranks -= n_neighbors
        t += np.sum(ranks[ranks > 0])
    t = 1.0 - t * (2.0 / (n_samples * n_neighbors *
                          (2.0 * n_samples - 3.0 * n_neighbors - 1.0)))
    return t
Author: lmcinnes, Project: sstsne, Lines: 60, Source: ss_t_sne.py
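A hedged usage sketch of the metric defined above, assuming the imports of that snippet (numpy as np and pairwise_distances) are in scope; the data and the choice of PCA are illustrative:

import numpy as np
from sklearn.decomposition import PCA

X = np.random.RandomState(0).rand(100, 10)           # illustrative data
X_embedded = PCA(n_components=2).fit_transform(X)    # any embedding works here

# 1.0 means every embedded k-neighborhood is also a true k-neighborhood
t = trustworthiness(X, X_embedded, n_neighbors=5)
print('trustworthiness in [0, 1]:', t)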
Example 10: test_radius_neighbors
def test_radius_neighbors():
    # Checks whether returned distances are less than `radius`.
    # At least one point should be returned when the `radius` is set
    # to the mean distance from the query point to the other points in
    # the database.
    # Moreover, this test compares the radius neighbors of LSHForest
    # with `sklearn.neighbors.NearestNeighbors`.
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])
    ignore_warnings(lshf.fit)(X)
    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)
        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)
    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    # dists and inds should not be 1D arrays or arrays of variable lengths,
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)
    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
Author: AlexisMignon, Project: scikit-learn, Lines: 59, Source: test_approximate.py
Example 11: getSimMat
def getSimMat(self, type='euclidean', ftr_type='data', orderFlag=True, pca_dim=20):
    if ftr_type == 'ftr':
        # use input features
        self.slctData = [ts for ts in self.slctData if ((ts.ftr is not None) and (len(ts.ftr) > 0))]
        dataMat = [ts.ftr for ts in self.slctData]
    elif ftr_type == 'data':
        # use input data
        dataMat = [ts.val for ts in self.slctData]
    else:
        print 'unknown ftr_type for ftr_type:', ftr_type
    if pca_dim > len(dataMat):
        pca_dim = int(math.ceil(len(dataMat) / 2.0))
    if type == 'euclidean':  # euclidean distance based on time series data
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'pca_euc':  # extract features with PCA, then use Euclidean distance
        pca = skd.PCA(n_components=pca_dim)
        dataMat = pca.fit_transform(dataMat)
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'nmf_euc':  # extract features with NMF, then use Euclidean distance
        nmf = skd.NMF(n_components=pca_dim)
        dataMat = nmf.fit_transform(dataMat)
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'ica_euc':  # extract features with ICA, then use Euclidean distance
        ica = skd.FastICA(n_components=pca_dim)
        dataMat = ica.fit_transform(dataMat)
        self.simMat = skmpw.euclidean_distances(dataMat)
    elif type == 'cosine':
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    elif type == 'pca_cos':  # extract features with PCA, then use cosine distance
        pca = skd.PCA(n_components=pca_dim)
        dataMat = pca.fit_transform(dataMat)
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    elif type == 'nmf_cos':  # extract features with NMF, then use cosine distance
        nmf = skd.NMF(n_components=pca_dim)
        dataMat = nmf.fit_transform(dataMat)
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    elif type == 'ica_cos':  # extract features with ICA, then use cosine distance
        ica = skd.FastICA(n_components=pca_dim)
        dataMat = ica.fit_transform(dataMat)
        self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
    else:
        print 'unknown type for similarity matrix: ', type
    # rearrange the order of data in simMat
    self.slctDataMat = dataMat
    if orderFlag:
        link = spc.hierarchy.linkage(self.simMat)
        dend = spc.hierarchy.dendrogram(link, no_plot=True)
        order = dend['leaves']
        self.slctData = [self.slctData[i] for i in order]  # rearrange order
        self.simMat = [self.simMat[i] for i in order]
        for i in xrange(len(self.simMat)):
            self.simMat[i] = [self.simMat[i][j] for j in order]
        self.slctDataMat = [self.slctDataMat[i] for i in order]
    # self.patchOrdering = [ts.ptchNm for ts in self.slctData]  # record new ordering
    self.patchOrdering = JSONifyData(self.slctData)  # Deok wants all the data for each patch in the response
    self.clstData = self.slctData
    self.clstSimMat = self.simMat
Author: KonstantinosX, Project: TimeGrouper, Lines: 59, Source: ts_clustering.py
Example 12: predict
def predict(dialogue_session, line):
    lowest = ('x', 1)
    data = dataDict[dialogue_session][1][line, :]
    for vector in vDict:
        predictor = vDict[vector]
        if pair.pairwise_distances(predictor, data, 'cosine') < lowest[1]:
            lowest = (vector, pair.pairwise_distances(predictor, data, 'cosine'))
    return lowest
Author: DmitriyLeybel, Project: multimodal_prediction, Lines: 8, Source: predictability_svd_final.py
Example 13: cramer_statistic
def cramer_statistic(self, n_jobs=1):
    '''
    Applies the Cramer Statistic to the datasets.

    Parameters
    ----------
    n_jobs : int, optional
        Sets the number of cores to use to calculate
        pairwise distances. Default is 1.
    '''
    # Adjust what we call n, m based on the larger dimension.
    # Then the looping below is valid.
    if self.data_matrix1.shape[0] >= self.data_matrix2.shape[0]:
        m = self.data_matrix1.shape[0]
        n = self.data_matrix2.shape[0]
        larger = self.data_matrix1
        smaller = self.data_matrix2
    else:
        n = self.data_matrix1.shape[0]
        m = self.data_matrix2.shape[0]
        larger = self.data_matrix2
        smaller = self.data_matrix1
    pairdist11 = pairwise_distances(larger, metric="euclidean",
                                    n_jobs=n_jobs)
    pairdist22 = pairwise_distances(smaller, metric="euclidean",
                                    n_jobs=n_jobs)
    pairdist12 = pairwise_distances(larger, smaller,
                                    metric="euclidean", n_jobs=n_jobs)
    # Take sqrt of each.
    # We default to using the Cramer kernel in Baringhaus & Franz (2004):
    #   phi(dist) = sqrt(dist) / 2
    # The normalization values below reflect this.
    pairdist11 = np.sqrt(pairdist11)
    pairdist12 = np.sqrt(pairdist12)
    pairdist22 = np.sqrt(pairdist22)
    term1 = 0.0
    term2 = 0.0
    term3 = 0.0
    for i in range(m):
        for j in range(n):
            term1 += pairdist12[i, j]
        for ii in range(m):
            term2 += pairdist11[i, ii]
        if i < n:
            for jj in range(n):
                term3 += pairdist22[i, jj]
    m, n = float(m), float(n)
    term1 *= (1 / (m * n))
    term2 *= (1 / (2 * m ** 2.))
    term3 *= (1 / (2 * n ** 2.))
    self._distance = (m * n / (m + n)) * (term1 - term2 - term3)
Author: Astroua, Project: TurbuStat, Lines: 58, Source: cramer.py
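In formula form, the loops above evaluate the Baringhaus & Franz (2004) statistic with kernelized distance phi(d) = sqrt(d); this is a LaTeX transcription of the code, where d^{XY}, d^{XX}, d^{YY} denote the cross- and within-set Euclidean distance matrices:

T = \frac{mn}{m+n}\left( \frac{1}{mn}\sum_{i=1}^{m}\sum_{j=1}^{n}\sqrt{d^{XY}_{ij}}
    - \frac{1}{2m^2}\sum_{i=1}^{m}\sum_{i'=1}^{m}\sqrt{d^{XX}_{ii'}}
    - \frac{1}{2n^2}\sum_{j=1}^{n}\sum_{j'=1}^{n}\sqrt{d^{YY}_{jj'}} \right)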
Example 14: run_step
def run_step(self, run_number, step_size, howlong):
    dfslot = self.get_input_slot("df")
    df = dfslot.data()
    dfslot.update(run_number)
    if dfslot.has_updated() or dfslot.has_deleted():
        dfslot.reset()
        logger.info("Resetting history because of changes in the input df")
        dfslot.update(run_number, df)
        # TODO: be smarter with changed values
    m = step_size
    indices = dfslot.next_created(m)
    m = indices_len(indices)
    i = None
    j = None
    Si = self._buf.matrix()
    arrayslot = self.get_input_slot("array")
    if arrayslot is not None and arrayslot.data() is not None:
        array = arrayslot.data()
        logger.debug("Using array instead of DataFrame columns")
        if Si is not None:
            i = array[self._last_index]
        j = array[indices]
    if j is None:
        if self.columns is None:
            self.columns = df.columns.delete(np.where(df.columns == Module.UPDATE_COLUMN))
        elif not isinstance(self.columns, pd.Index):
            self.columns = pd.Index(self.columns)
        rows = df[self.columns]
        if Si is not None:
            i = rows.loc[self._last_index]
            assert len(i) == len(self._last_index)
        j = rows.loc[fix_loc(indices)]
        assert len(j) == indices_len(indices)
    Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
    if Si is None:
        mat = self._buf.resize(Sj.shape[0])
        mat[:, :] = Sj
        self._last_index = dfslot.last_index[indices]
    else:
        Sij = pairwise_distances(i, j, metric=self._metric, n_jobs=self._n_jobs)
        n0 = i.shape[0]
        n1 = n0 + j.shape[0]
        mat = self._buf.resize(n1)
        mat[0:n0, n0:n1] = Sij
        mat[n0:n1, 0:n0] = Sij.T
        mat[n0:n1, n0:n1] = Sj
        self._last_index = self._last_index.append(df.index[indices])
        # truth = pairwise_distances(array[0:n1], metric=self._metric)
        # import pdb
        # pdb.set_trace()
        # assert np.allclose(mat, truth)
    return self._return_run_step(dfslot.next_state(), steps_run=m)
Author: jdfekete, Project: progressivis, Lines: 57, Source: pairwise.py
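The block assignments above implement an incremental update of the symmetric distance matrix: with S_ii the already-buffered matrix over the old rows i and S_jj the matrix over the newly arrived rows j, the resized buffer is filled as

S = \begin{pmatrix} S_{ii} & S_{ij} \\ S_{ij}^{\top} & S_{jj} \end{pmatrix}

so each step recomputes only the cross block S_ij and the new diagonal block S_jj, never the already-stored S_ii.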
Example 15: test_radius_neighbors
def test_radius_neighbors():
    """Checks whether returned distances are less than `radius`.

    At least one point should be returned when the `radius` is set
    to the mean distance from the query point to the other points in
    the database.
    Moreover, this test compares the radius neighbors of LSHForest
    with `sklearn.neighbors.NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)
    lshf = LSHForest()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])
    lshf.fit(X)
    for i in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor should be returned.
        assert_greater(neighbors.shape[0], 0)
        # All distances should be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)
    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # dists and inds should not be 2D arrays
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)
    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Distances of exact neighbors are less than or equal to approximate
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
Author: CC-Fu-CC, Project: scikit-learn, Lines: 56, Source: test_approximate.py
Example 16: fit
def fit(self, X, y=None, c=None):
    """Fit the model using X as training data.

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance
        matrix. Otherwise it contains a sample per row.
    """
    X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64)
    random_state = check_random_state(self.random_state)
    if self.early_exaggeration < 1.0:
        raise ValueError("early_exaggeration must be at least 1, but is "
                         "%f" % self.early_exaggeration)
    if self.n_iter < 200:
        raise ValueError("n_iter should be at least 200")
    if self.metric == "precomputed":
        if self.init == 'pca':
            raise ValueError("The parameter init=\"pca\" cannot be used "
                             "with metric=\"precomputed\".")
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square distance matrix")
        distances = X
    else:
        if self.verbose:
            print("[t-SNE] Computing pairwise distances...")
        if self.metric == "euclidean":
            distances = pairwise_distances(X, metric=self.metric, squared=True)
        else:
            distances = pairwise_distances(X, metric=self.metric)
    # Degrees of freedom of the Student's t-distribution. The suggestion
    # alpha = n_components - 1 comes from "Learning a Parametric Embedding
    # by Preserving Local Structure", Laurens van der Maaten, 2009.
    alpha = max(self.n_components - 1.0, 1)
    n_samples = X.shape[0]
    self.training_data_ = X
    P = _joint_probabilities(distances, self.perplexity, self.verbose)
    self.P = deepcopy(P)
    if self.init == 'pca':
        pca = RandomizedPCA(n_components=self.n_components,
                            random_state=random_state)
        X_embedded = pca.fit_transform(X)
    elif self.init == 'random':
        X_embedded = None
    else:
        raise ValueError("Unsupported initialization scheme: %s"
                         % self.init)
    self.embedding_ = self._tsne(P, alpha, n_samples, random_state,
                                 X_embedded=X_embedded, c=c)
Author: Kazjon, Project: deep_creeval, Lines: 56, Source: AnimTSNE_experiment.py
Example 17: knn_dist
def knn_dist(x, x_ctrl, s=100, p=1):
    x_tmp = random_subsample(x_ctrl, 200000, replace=False)
    xs = kmeans_subsample(x_tmp, s)
    if p == 1:
        min_dist = np.min(pairwise_distances(X=x, Y=xs, metric="l1"), axis=1)
    elif p == 2:
        min_dist = np.min(pairwise_distances(X=x, Y=xs, metric="l2"), axis=1)
    assert len(min_dist) == x.shape[0]
    return min_dist
Author: eiriniar, Project: CellCnn, Lines: 10, Source: downsample.py
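random_subsample and kmeans_subsample are project-internal helpers not shown in this excerpt. A rough, hypothetical stand-in (names and behavior are assumptions, not CellCnn's actual helpers) that uses MiniBatchKMeans cluster centers of the control data as the s landmarks:

import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import pairwise_distances

def knn_dist_sketch(x, x_ctrl, s=100):
    # hypothetical stand-in for the project's kmeans_subsample helper:
    # use the s k-means centers of the control data as landmarks
    xs = MiniBatchKMeans(n_clusters=s, random_state=0).fit(x_ctrl).cluster_centers_
    # distance from each row of x to its nearest landmark (L1, as in p=1 above)
    return np.min(pairwise_distances(x, xs, metric='l1'), axis=1)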
Example 18: multiQuadricKernel
def multiQuadricKernel(X, X2=None, offset=1.0, jobs=1, *args, **kwargs):
    offset = float(offset)
    if X2 is not None:
        distanceMatrix = pairwise.pairwise_distances(X, X2, n_jobs=jobs)
    else:
        distanceMatrix = pairwise.pairwise_distances(X, n_jobs=jobs)
    result = np.sqrt(distanceMatrix**2 + offset**2)
    return result
Author: FabianIsensee, Project: machine_learning_1, Lines: 10, Source: ex6jens.py
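The function above evaluates the multiquadric kernel

k(x, x') = \sqrt{\lVert x - x' \rVert_2^2 + c^2}

where c is the offset parameter. Unlike the RBF kernel, it grows with distance, which is why it appears more often in radial-basis-function interpolation than as a similarity measure.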
Example 19: kpca_cluster
def kpca_cluster(data, nclusters=100, ncomponents=40, topwhat=10, zscored=False):
    '''
    Computes clustering of bag-of-words vectors of articles

    INPUT
    folder     model folder
    nclusters  number of clusters
    '''
    from sklearn.cluster import KMeans
    # filtering out some noise words
    stops = map(lambda x: x.lower().strip(), open('stopwords.txt').readlines()[6:])
    # vectorize non-stopwords
    bow = TfidfVectorizer(min_df=2, stop_words=stops)
    X = bow.fit_transform(data)
    # creating bow-index-to-word map
    idx2word = dict(zip(bow.vocabulary_.values(), bow.vocabulary_.keys()))
    # using non-stopwords and filtering out digits
    print 'Computing pairwise distances'
    K = pairwise_distances(X, metric='l2', n_jobs=1)
    perc = 50.0
    width = percentile(K.flatten(), perc)
    # KPCA transform bow vectors
    Xc = KernelPCA(n_components=ncomponents, kernel='rbf', gamma=width).fit_transform(X)
    if zscored:
        Xc = zscore(Xc)
    # compute clusters
    km = KMeans(n_clusters=nclusters).fit(Xc)
    Xc = km.predict(Xc)
    clusters = []
    for icluster in range(nclusters):
        nmembers = (Xc == icluster).sum()
        if True:  # nmembers < len(data) / 5.0 and nmembers > 1: only group clusters big enough but not too big
            members = (Xc == icluster).nonzero()[0]
            topwordidx = array(X[members, :].sum(axis=0))[0].argsort()[-topwhat:][::-1]
            topwords = ' '.join([idx2word[wi] for wi in topwordidx])
            meanDist = triu(pairwise_distances(X[members, :], metric='l2', n_jobs=1)).sum()
            meanDist = meanDist / (len(members) + (len(members)**2 - len(members)) / 2.0)
            # print u'Cluster %d' % icluster + u' %d members' % nmembers + u' mean Distance %f' % meanDist + u'\n\t' + topwords
            clusters.append({
                'name': 'Cluster-%d' % icluster,
                'description': topwords,
                'members': list(members),
                'meanL2Distances': meanDist
            })
    return clusters
Author: christinakraus, Project: political-affiliation-prediction, Lines: 55, Source: newsreader.py
Example 20: cramer_statistic
def cramer_statistic(self, n_jobs=1):
    '''
    Applies the Cramer Statistic to the datasets.

    Parameters
    ----------
    n_jobs : int, optional
        Sets the number of cores to use to calculate
        pairwise distances
    '''
    # Adjust what we call n, m based on the larger dimension.
    # Then the looping below is valid.
    if self.data_matrix1.shape[0] >= self.data_matrix2.shape[0]:
        m = self.data_matrix1.shape[0]
        n = self.data_matrix2.shape[0]
        larger = self.data_matrix1
        smaller = self.data_matrix2
    else:
        n = self.data_matrix1.shape[0]
        m = self.data_matrix2.shape[0]
        larger = self.data_matrix2
        smaller = self.data_matrix1
    pairdist11 = pairwise_distances(
        larger, metric="euclidean", n_jobs=n_jobs)
    pairdist22 = pairwise_distances(
        smaller, metric="euclidean", n_jobs=n_jobs)
    pairdist12 = pairwise_distances(
        larger, smaller,
        metric="euclidean", n_jobs=n_jobs)
    term1 = 0.0
    term2 = 0.0
    term3 = 0.0
    for i in range(m):
        for j in range(n):
            term1 += pairdist12[i, j]
        for ii in range(m):
            term2 += pairdist11[i, ii]
        if i < n:
            for jj in range(n):
                term3 += pairdist22[i, jj]
    m, n = float(m), float(n)
    term1 *= (1 / (m * n))
    term2 *= (1 / (2 * m ** 2.))
    term3 *= (1 / (2 * n ** 2.))
    self.distance = (m * n / (m + n)) * (term1 - term2 - term3)
    return self
Author: keflavich, Project: TurbuStat, Lines: 54, Source: cramer.py
Note: the sklearn.metrics.pairwise.pairwise_distances examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by various programmers; copyright of the source code remains with the original authors. For distribution and use, please refer to each project's license; do not repost without permission.