This article collects typical usage examples of the Python function sklearn.metrics.pairwise.cosine_distances. If you have been wondering what exactly cosine_distances does, how to call it, or what it looks like in real code, the curated examples below should help.
A total of 20 code examples of cosine_distances are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps our system recommend better Python examples.
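Before the examples, here is a minimal, self-contained sketch of what cosine_distances computes: for two sets of row vectors it returns the matrix of 1 − cosine similarity between every row of X and every row of Y. The array values below are illustrative only, not taken from any of the projects quoted later.

import numpy as np
from sklearn.metrics.pairwise import cosine_distances

# cosine_distances compares every row of X against every row of Y
# and returns an (n_X, n_Y) matrix of 1 - cos(angle) values.
X = np.array([[1.0, 0.0], [0.0, 1.0]])
Y = np.array([[1.0, 1.0]])
D = cosine_distances(X, Y)
print(D)  # both rows are 45 degrees from Y, so both entries are ~0.2929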
Example 1: test_cosine_distances
def test_cosine_distances():
    # Check the pairwise cosine distances computation
    rng = np.random.RandomState(1337)
    x = np.abs(rng.rand(910))
    XA = np.vstack([x, x])
    D = cosine_distances(XA)
    assert_array_almost_equal(D, [[0., 0.], [0., 0.]])
    # check that all elements are in [0, 2]
    assert np.all(D >= 0.)
    assert np.all(D <= 2.)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0., 0.])

    XB = np.vstack([x, -x])
    D2 = cosine_distances(XB)
    # check that all elements are in [0, 2]
    assert np.all(D2 >= 0.)
    assert np.all(D2 <= 2.)
    # check that diagonal elements are equal to 0 and non-diagonal to 2
    assert_array_almost_equal(D2, [[0., 2.], [2., 0.]])

    # check large random matrix
    X = np.abs(rng.rand(1000, 5000))
    D = cosine_distances(X)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0.] * D.shape[0])
    assert np.all(D >= 0.)
    assert np.all(D <= 2.)
Developer: scikit-learn | Project: scikit-learn | Lines: 28 | Source: test_pairwise.py
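The bounds asserted above follow directly from the definition: cosine_distances returns 1 − cos(θ), and cos(θ) lies in [−1, 1], so the distance lies in [0, 2], with 2 reached only by exactly opposite vectors. A quick check (illustrative values only):

import numpy as np
from sklearn.metrics.pairwise import cosine_distances

v = np.array([[3.0, 4.0]])
print(cosine_distances(v, v))   # [[0.]]  identical direction -> distance 0
print(cosine_distances(v, -v))  # [[2.]]  opposite direction  -> distance 2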
Example 2: get_features
def get_features(vocab):
    vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_head = vectorizer_head.fit_transform(headlines)

    vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
    X_train_body = vectorizer_body.fit_transform(bodies)

    # Calculate the n most important topics of the bodies. Each topic contains
    # all words, ordered by importance: the more important topic words a body
    # contains for a certain topic, the higher its value for that topic.
    lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

    print("latent_dirichlet_allocation_cos: fit and transform body")
    t0 = time()
    lda_body_matrix = lda_body.fit_transform(X_train_body)
    print("done in %0.3fs." % (time() - t0))

    print("latent_dirichlet_allocation_cos: transform head")
    # Use the LDA trained on body topics on the headlines => if the headlines
    # and bodies share topics, their vectors should be similar.
    lda_head_matrix = lda_body.transform(X_train_head)

    #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

    print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
    # calculate cosine distance between the body and head
    X = []
    for i in range(len(lda_head_matrix)):
        X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # 1-d arrays are deprecated
        X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
        cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
        X.append(cos_dist.tolist())
    return X
Developer: paris5020 | Project: athene_system | Lines: 32 | Source: topic_models.py
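The per-row loop at the end can be expressed more compactly with scikit-learn's paired_cosine_distances, which computes one distance per pair of corresponding rows. A minimal sketch, assuming the two matrices have the same number of rows as in the function above:

from sklearn.metrics.pairwise import paired_cosine_distances

# One distance per (headline, body) pair, without an explicit Python loop.
# lda_head_matrix and lda_body_matrix are the (n_docs, n_topics) arrays
# produced by the LDA transforms above.
cos_dists = paired_cosine_distances(lda_head_matrix, lda_body_matrix)
X = [[d] for d in cos_dists]  # same nested-list shape as the original return value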
Example 3: sumACluster
def sumACluster(dist, vecsIn, topK_t, sameTweetThred):
    if dist == "cosine":
        distMatrix = pairwise.cosine_distances(vecsIn)
    elif dist == "eu":
        distMatrix = pairwise.euclidean_distances(vecsIn, vecsIn)

    sameTweetClusters = [[0]]
    for seqid, text in enumerate(vecsIn[1:], start=1):
        added = None
        for stcid, stc in enumerate(sameTweetClusters):
            sameFlag = False
            if distMatrix[seqid][stc[0]] <= sameTweetThred:
                sameFlag = True
            if sameFlag:
                stc.append(seqid)
                added = (stcid, stc)
                break
        if added is None:
            sameTweetClusters.append([seqid])
        else:
            sameTweetClusters[added[0]] = added[1]

    sameTweetClusterNum = [(stcid, len(stc)) for stcid, stc in enumerate(sameTweetClusters)]
    numIn = len(sameTweetClusterNum)
    top = sorted(sameTweetClusterNum, key=lambda a: a[1], reverse=True)[:min(topK_t, numIn)]
    top = [(sameTweetClusters[item[0]][0], item[1]) for item in top]
    return top
Developer: qolina | Project: DBED | Lines: 27 | Source: tweetClustering.py
Example 4: test_linkage_misc
def test_linkage_misc():
    # Misc tests on linkage
    X = np.ones((5, 5))
    assert_raises(ValueError,
                  AgglomerativeClustering(linkage='foobar').fit,
                  X)
    assert_raises(ValueError, linkage_tree, X, linkage='foobar')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        # Use the copy argument, to raise a warning
        Ward(copy=True).fit(X)
    # We should be getting 2 warnings: one for using Ward that is
    # deprecated, one for using the copy argument
    assert_equal(len(warning_list), 2)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Developer: Arezou1 | Project: scikit-learn | Lines: 28 | Source: test_hierarchical.py
Example 5: test_linkage_misc
def test_linkage_misc():
    # Misc tests on linkage
    rnd = np.random.RandomState(42)
    X = rnd.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # Deprecation of Ward class
    with warnings.catch_warnings(record=True) as warning_list:
        warnings.simplefilter("always", DeprecationWarning)
        Ward().fit(X)
    assert_equal(len(warning_list), 1)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Developer: MatteoFu | Project: scikit-learn | Lines: 25 | Source: test_hierarchical.py
Example 6: getModelInfo
def getModelInfo(model, features):
    print("Shape of the transformed features = {}".format(features.shape))
    # Uncomment for more info:
    # vocab = model.get_feature_names()
    # dist = np.sum(features, axis=0)
    # for tag, count in izip(vocab, dist):
    #     print("word = {}, frequency = {}".format(tag, count))
    return cosine_distances(features)
Developer: geekman2 | Project: GutenTag | Lines: 8 | Source: sklearn_model.py
Example 7: _build_metastore
def _build_metastore(self):
    medians = np.median(self.X, axis=0).reshape(1, self.dim)
    # how far each data point is from the global median
    dists = cosine_distances(self.X, Y=medians).reshape(-1)
    sorted_index = [self.index[i] for i in dists.argsort()]
    return {'sorted_index': sorted_index}
Developer: ashishyadavppe | Project: Skater | Lines: 10 | Source: datamanager.py
Example 8: calcurate_centroid_Matrix
def calcurate_centroid_Matrix(veclist, word2vecdic, DimentionN):
    centroid_Matrix = np.zeros((DimentionN, 200))
    distance_arrays = np.zeros(DimentionN)
    for word in veclist:
        label = word2vecdic[word]
        centroid_Matrix[label] += veclist[word]
    for word in veclist:
        label = word2vecdic[word]
        distance_arrays[label] += cosine_distances(veclist[word], centroid_Matrix[label])
    return centroid_Matrix, distance_arrays
Developer: ItoTomoki | Project: ruiternews | Lines: 10 | Source: yahoofinancil_board_preprocess.py
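Note that this example passes 1-D vectors straight to cosine_distances; current scikit-learn releases require 2-D inputs of shape (n_samples, n_features), so on modern versions each vector needs a reshape. A hedged adaptation of the second loop (same logic, reshaped inputs):

for word in veclist:
    label = word2vecdic[word]
    # cosine_distances expects 2-D arrays, so wrap each vector as a
    # single-row matrix and unpack the resulting 1x1 distance matrix.
    d = cosine_distances(veclist[word].reshape(1, -1),
                         centroid_Matrix[label].reshape(1, -1))[0, 0]
    distance_arrays[label] += d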
Example 9: memory_cf
def memory_cf(users, movies, k, similarity_measure, weight_schema,
              repr_matrix=rating_matrix_orig, rating_matrix=rating_matrix_orig):
    """
    Memory-based collaborative filtering.

    :param users: a user list.
    :param movies: a movie list.
    :param k: number of nearest users
    :param similarity_measure: 'cosine' or 'dot_product'
    :param weight_schema: 'mean' or 'weighted_mean'
    :param repr_matrix: data point representation
    :param rating_matrix: ratings based on user-movie or cluster centroids
    :return: recommended ratings for the queries
    """
    # construct mapping between input users and unique users
    ratings, user_unique = [], list(set(users))
    user_index_map = dict((u, i) for i, u in enumerate(user_unique))
    users = [(u, user_index_map[u]) for u in users]

    # find the k nearest neighbors for each user
    if similarity_measure == 'cosine':
        dist = cosine_distances(repr_matrix[user_unique, :], repr_matrix)
        sims = 1 - dist
    elif similarity_measure == 'dot_product':
        sims = repr_matrix[user_unique, :].dot(repr_matrix.T)
        if issparse(sims):
            sims = sims.toarray()
        dist = -sims
    sorted_neighbors = np.argsort(dist, axis=1)

    # make rating matrix dense for fast access
    rating_matrix = rating_matrix.toarray()

    weight_method = mean if weight_schema == 'mean' else weighted_mean
    for (user_index, neighbor_index), movie in zip(users, movies):
        neighbors = list(islice(ifilter(lambda u: (u, movie) in entry_set,
                                        sorted_neighbors[neighbor_index]),
                                k + 1))

        # no neighbors, regarded as 3
        if not neighbors:
            ratings.append(3)
            continue

        # exclude itself
        if user_index in neighbors:
            neighbors.remove(user_index)

        rating = weight_method(rating_matrix[neighbors, movie],
                               sims[neighbor_index, neighbors])
        ratings.append(rating)

    return ratings
Developer: EDFward | Project: TrivialCF | Lines: 54 | Source: cf.py
Example 10: get_sparse_dist_matrix
def get_sparse_dist_matrix(tweets_tfidf_matrix, eps):
    """Get the sparse distance matrix from the pairwise cosine distance
    computations from the given tfidf vectors. Only distances less than or
    equal to eps are put into the matrix."""
    rows = []
    cols = []
    data = []
    for ndx, tweet in enumerate(tweets_tfidf_matrix):
        rows.append(len(cols))
        distances = cosine_distances(tweet, tweets_tfidf_matrix)[0]
        for other_ndx, dist in enumerate(distances):
            if ndx != other_ndx and dist <= eps:
                cols.append(other_ndx)
                data.append(dist)
    # indptr needs one final entry marking the end of the last row
    rows.append(len(cols))
    # distances are fractional, so store them as floats (int would truncate to 0)
    return csr_matrix((data, cols, rows), dtype=float)
Developer: jiwu14 | Project: TweetAnalyzer | Lines: 15 | Source: TweetAnalyzer.py
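For larger corpora, the same eps-thresholded sparse cosine-distance matrix can be built without the explicit double loop. A minimal sketch using scikit-learn's radius_neighbors_graph, assuming the tf-idf matrix is manageable for a brute-force neighbor search:

from sklearn.neighbors import radius_neighbors_graph

# Sparse (n_tweets, n_tweets) matrix holding cosine distances <= eps;
# mode='distance' stores the actual distance values, not just connectivity.
dist_graph = radius_neighbors_graph(tweets_tfidf_matrix, radius=eps,
                                    mode='distance', metric='cosine',
                                    include_self=False)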
Example 11: test_linkage_misc
def test_linkage_misc():
    # Misc tests on linkage
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, 5))
    assert_raises(ValueError, AgglomerativeClustering(linkage='foo').fit, X)
    assert_raises(ValueError, linkage_tree, X, linkage='foo')
    assert_raises(ValueError, linkage_tree, X, connectivity=np.ones((4, 4)))

    # Smoke test FeatureAgglomeration
    FeatureAgglomeration().fit(X)

    # test hierarchical clustering on a precomputed distances matrix
    dis = cosine_distances(X)
    res = linkage_tree(dis, affinity="precomputed")
    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])

    # test hierarchical clustering with a callable affinity
    res = linkage_tree(X, affinity=manhattan_distances)
    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
Developer: foresthz | Project: scikit-learn | Lines: 20 | Source: test_hierarchical.py
Example 12: cluster_cf_memory
def cluster_cf_memory():
    """
    Cluster-based memory CF.
    """
    rating_matrix_cluster = np.empty([k_user, rating_matrix_orig.shape[1]],
                                     dtype=np.float64)
    # build rating matrix for each user cluster, on each movie
    for i in range(k_user):
        cluster_indicator = np.where(user_belonging == i)[0]
        rating_cluster = rating_matrix_orig[cluster_indicator, :]
        rating_sum = rating_cluster.sum(axis=0)
        # take average by dividing by the count
        rating_cluster.data = np.ones(len(rating_cluster.data))
        mu = rating_sum / rating_cluster.sum(axis=0)
        # fill 0 for nan
        mu[np.isnan(mu)] = 0
        rating_matrix_cluster[i, :] = mu

    # construct mapping between input users and unique users
    ratings, user_unique = [], list(set(users))
    user_index_map = dict((u, i) for i, u in enumerate(user_unique))
    users_neighbors = [user_index_map[u] for u in users]

    if similarity_measure == 'cosine':
        dist = cosine_distances(rating_matrix_orig[user_unique, :], m2uc.T)
        sims = 1 - dist
    else:
        sims = rating_matrix_orig[user_unique, :].dot(m2uc).toarray()
        dist = -sims
    nearest_neighbors = np.argpartition(dist, k, axis=1)[:, :k]

    weight_method = mean if weight_schema == 'mean' else weighted_mean
    for neighbor_index, movie in zip(users_neighbors, movies):
        neighbors = nearest_neighbors[neighbor_index]
        rating = weight_method(rating_matrix_cluster[neighbors, movie],
                               sims[neighbor_index, neighbors])
        ratings.append(rating)
    return ratings
Developer: EDFward | Project: TrivialCF | Lines: 41 | Source: cf.py
Example 13: plot_mds
def plot_mds(points, genres, n_points=500):
    '''
    Plots a set of documents in MDS space

    Args:
        points: dense array with coordinates of each document
        genres: list of genres for each entry in points

    Returns:
        None
    '''
    genres = np.array(genres)
    genre_sel = np.not_equal(genres, None)

    X, y = points[genre_sel], genres[genre_sel]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, train_size=n_points)

    distances = cosine_distances(X_train, X_train)
    mds = MDS(n_components=2, dissimilarity='precomputed')
    mds.fit(distances)

    plot_embedding(mds.embedding_, y_train)
Developer: lwoloszy | Project: albumpitch | Lines: 23 | Source: genres.py
Example 14: test_fp16_cosine_metric
def test_fp16_cosine_metric(self):
    arr = numpy.empty((10000, 2), dtype=numpy.float16)
    angs = numpy.random.rand(10000) * 2 * numpy.pi
    for i in range(10000):
        arr[i] = numpy.sin(angs[i]), numpy.cos(angs[i])
    with self.stdout:
        centroids, assignments = kmeans_cuda(
            arr, 4, init="kmeans++", metric="cos", device=1, verbosity=2,
            seed=3)
    self.assertEqual(self._get_iters_number(self.stdout), 5)
    self.assertEqual(len(centroids), 4)
    for c in centroids:
        norm = numpy.linalg.norm(c)
        self.assertTrue(0.9995 < norm < 1.0005)
    dists = numpy.round(cosine_distances(centroids)).astype(int)
    self.assertTrue((dists == [
        [0, 2, 1, 1],
        [2, 0, 1, 1],
        [1, 1, 0, 2],
        [1, 1, 2, 0],
    ]).all())
    self.assertEqual(numpy.min(assignments), 0)
    self.assertEqual(numpy.max(assignments), 3)
Developer: src-d | Project: kmcuda | Lines: 23 | Source: test.py
Example 15: cosine_similarity
def cosine_similarity(vector_a, vector_b):
    return 1 - cosine_distances(vector_a, vector_b)
Developer: hanveiga | Project: master-thesis | Lines: 2 | Source: information_measure.py
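Since cosine_distances is defined as 1 − cosine similarity, this helper simply inverts it; scikit-learn also ships the similarity directly as sklearn.metrics.pairwise.cosine_similarity. A quick illustrative check (array values are made up):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

a = np.array([[1.0, 2.0, 3.0]])
b = np.array([[3.0, 2.0, 1.0]])
# The two expressions agree up to floating-point rounding.
print(cosine_similarity(a, b))     # [[0.71428571]]
print(1 - cosine_distances(a, b))  # [[0.71428571]]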
Example 16: len
# tf-idf weights of the words in a tweet (snippet; the dict opening is
# reconstructed here from the .keys()/.values() calls below)
tweet = {
    'control': 3.721765211295327,
    'democratic': 3.1026721743330414,
    'governments': 4.167571323949673,
    'in': 0.0009654063501214492,
    'law': 2.4538226269605703,
    'popular': 2.764478952022998,
    'response': 4.261461747058352,
    'to': 0.04694493768179923,
}

word_indices = [map_index_to_word[word] for word in tweet.keys()]
tweet_tf_idf = scipy.sparse.csr_matrix((list(tweet.values()), ([0] * len(word_indices), word_indices)),
                                       shape=(1, tf_idf.shape[1]))
obama_tf_idf = tf_idf[obama_id]
print("The cosine distance between Obama's article and the tweet is {:.6e}."
      .format(cosine_distances(obama_tf_idf, tweet_tf_idf)[0, 0]))

print('''
With cosine distances, the tweet is "nearer" to Barack Obama.
Ignoring article lengths completely resulted in nonsensical results.
In practice, it is common to enforce maximum or minimum document lengths.
''')

# QUIZ QUESTIONS:
print("Quiz Questions:")

# 1. Among the words that appear in both Barack Obama and Francisco Barrio,
#    take the 5 that appear most frequently in Obama.
#    How many of the articles in the Wikipedia dataset contain all of those 5 words?
print("1. Among the words that appear in both Barack Obama and Francisco Barrio, ")
print("   take the 5 that appear most frequently in Obama.")
print("   There are {:d} articles in the Wikipedia dataset that contain all of those 5 words.\n"
      .format(has_top_words_count[True]))
Developer: yf23 | Project: Machine_Learning_UW | Lines: 31 | Source: assignment_1.py
Example 17: cosine_similarity
"""
Testing the change in embeddings over time. Assumes
that we've already generated embeddings in output/.
"""
import pandas as pd
import numpy as np
import os, codecs
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

if __name__ == '__main__':
    out_dir = 'output'
    embedding_files = [os.path.join(out_dir, f) for f in os.listdir(out_dir)]

    # test 0: do the embeddings make semantic sense?
    end_embedding = pd.read_csv(embedding_files[-1], sep='\t', index_col=0)
    test_words = ['you', 'go', 'road', 'give', 'cold']
    for test_word in test_words:
        sims = end_embedding.apply(lambda r: cosine_similarity(r.reshape(1, -1),
                                                               end_embedding.loc[test_word].reshape(1, -1))[0][0],
                                   axis=1)
        print('test word %s has top 10 similarities \n%s' %
              (test_word, sims.sort_values(ascending=False)[:10]))
    # TL;DR the embeddings aren't perfect but they work for more common words

    # test 1: how much have embeddings changed from start to end of data?
    start_embedding = pd.read_csv(embedding_files[1], sep='\t', index_col=0)
    embedding_deltas = abs(cosine_distances(end_embedding, start_embedding))
    embedding_deltas = pd.Series(np.diagonal(embedding_deltas),
                                 index=end_embedding.index).sort_values(ascending=True)
    print('got embedding deltas %s' % (embedding_deltas))
Developer: ianbstewart | Project: concept-dynamics | Lines: 28 | Source: test_temporal_embeddings.py
Example 18: manhattan_distances
# extract the terms-by-documents matrix
# in scipy compressed sparse column format
sparse_movies_tdm = tdm_method.fit_transform(parsed_text)
# convert sparse matrix into regular terms-by-documents matrix
movies_tdm = sparse_movies_tdm.todense()
# define the documents-by-terms matrix
movies_dtm = movies_tdm.transpose()

# dissimilarity measures and multidimensional scaling
# consider alternative pairwise distance metrics from sklearn modules
# euclidean_distances, cosine_distances, manhattan_distances (city-block)
# note that different metrics provide different solutions
# movies_distance_matrix = euclidean_distances(movies_tdm)
# movies_distance_matrix = manhattan_distances(movies_tdm)
movies_distance_matrix = cosine_distances(movies_tdm)

mds_method = manifold.MDS(n_components=2, random_state=9999,
                          dissimilarity='precomputed')
mds_fit = mds_method.fit(movies_distance_matrix)
mds_coordinates = mds_method.fit_transform(movies_distance_matrix)

# plot tagline text for years in two dimensions
# defined by multidimensional scaling
plt.figure()
plt.scatter(mds_coordinates[:, 0], mds_coordinates[:, 1],
            facecolors='none', edgecolors='none')  # plot points in white (invisible)
labels = []
for iyear in range(1974, 2014):
    labels.append(str(iyear))
for label, x, y in zip(labels, mds_coordinates[:, 0], mds_coordinates[:, 1]):
Developer: Alextnelson | Project: mtpa | Lines: 30 | Source: chapter_7_program.py
Example 19: test_pairwise_distances
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)

    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)

    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)

    # Test haversine distance
    # The data should be valid latitude and longitude
    X = rng.random_sample((5, 2))
    X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi / 2
    X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, metric="haversine")
    S2 = haversine_distances(X)
    assert_array_almost_equal(S, S2)

    # Test haversine distance, with Y != X
    Y = rng.random_sample((2, 2))
    Y[:, 0] = (Y[:, 0] - 0.5) * 2 * np.pi / 2
    Y[:, 1] = (Y[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, Y, metric="haversine")
    S2 = haversine_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # "cityblock" uses scikit-learn metric, cityblock (function) is
    # scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)

    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)

    # Test cosine as a string metric versus cosine callable
    # The string "cosine" uses sklearn.metric,
    # while the function cosine is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)

    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
Developer: scikit-learn | Project: scikit-learn | Lines: 98 | Source: test_pairwise.py
Example 20: test_pairwise_distances
def test_pairwise_distances():
    """Test the pairwise_distance helper function."""
    rng = np.random.RandomState(0)

    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)

    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)

    # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)

    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)

    # manhattan does not support sparse matrices atm.
    assert_raises(ValueError, pairwise_distances, csr_matrix(X),
                  metric="manhattan")

    # Low-level function for manhattan can divide in blocks to avoid
    # using too much memory during the broadcasting
    S3 = manhattan_distances(X, Y, size_threshold=10)
    assert_array_almost_equal(S, S3)

    # Test cosine as a string metric versus cosine callable
    # "cosine" uses sklearn metric, cosine (function) is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)

    # Test that a precomputed metric returns a pointer to, and not a copy of, X.
    S = np.dot(X, X.T)
    S2 = pairwise_distances(S, metric="precomputed")
    assert_true(S is S2)

    # Test with sparse X and Y,
    # currently only supported for euclidean and cosine
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)

    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")
Developer: SHoltzen | Project: scikit-learn | Lines: 72 | Source: test_pairwise.py
Note: The sklearn.metrics.pairwise.cosine_distances examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors. For redistribution and use, please refer to the License of the corresponding project; do not republish without permission.