• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    迪恩网络公众号

Python pairwise.pairwise_distances函数代码示例

原作者: [db:作者] 来自: [db:来源] 收藏 邀请

本文整理汇总了Python中sklearn.metrics.pairwise.pairwise_distances函数的典型用法代码示例。如果您正苦于以下问题:Python pairwise_distances函数的具体用法?Python pairwise_distances怎么用?Python pairwise_distances使用的例子?那么恭喜您, 这里精选的函数代码示例或许可以为您提供帮助。



在下文中一共展示了pairwise_distances函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。

示例1: test_kneighbors_regressor_sparse

def test_kneighbors_regressor_sparse(n_samples=40,
                                     n_features=5,
                                     n_test_pts=10,
                                     n_neighbors=5,
                                     random_state=0):
    """Check KNeighborsRegressor on sparse inputs.

    Fits one regressor per sparse matrix type and requires >95% of the
    rounded predictions to match the binary labels; a 'precomputed'
    regressor must raise ValueError when queried with a sparse matrix.
    """
    # Test radius-based regression on sparse matrices
    # Like the above, but with various types of sparse matrices
    rng = np.random.RandomState(random_state)
    X = 2 * rng.rand(n_samples, n_features) - 1
    # np.int was removed in NumPy 1.24; the builtin int is the documented
    # replacement and produces the same integer labels.
    y = ((X ** 2).sum(axis=1) < .25).astype(int)

    for sparsemat in SPARSE_TYPES:
        knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                            algorithm='auto')
        knn.fit(sparsemat(X), y)

        knn_pre = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                                metric='precomputed')
        knn_pre.fit(pairwise_distances(X, metric='euclidean'), y)

        for sparsev in SPARSE_OR_DENSE:
            X2 = sparsev(X)
            assert_true(np.mean(knn.predict(X2).round() == y) > 0.95)

            X2_pre = sparsev(pairwise_distances(X, metric='euclidean'))
            if issparse(sparsev(X2_pre)):
                # precomputed metric rejects sparse query matrices
                assert_raises(ValueError, knn_pre.predict, X2_pre)
            else:
                assert_true(
                    np.mean(knn_pre.predict(X2_pre).round() == y) > 0.95)
开发者ID:BasilBeirouti,项目名称:scikit-learn,代码行数:30,代码来源:test_neighbors.py


示例2: make_rbf

def make_rbf(x, sigma, metric='euclidean', x2=None):
    """Return the RBF (Gaussian) affinity matrix exp(-sigma * d(x, x2)**2).

    :param x: array of shape (n,) or (n, d); a 1-D input is treated as a
        single feature column.
    :param sigma: scale factor applied to the squared distances.
    :param metric: any metric accepted by pairwise.pairwise_distances.
    :param x2: optional second sample set; defaults to x (self-affinity).
    :return: array of shape (len(x), len(x2)) of RBF weights.
    """
    if x.ndim == 1:
        x = np.expand_dims(x, 1)
    if x2 is None:
        x2 = x
    # The original code special-cased metric == 'cosine' with a large
    # commented-out hand-rolled sparse cosine routine, but both branches
    # ended up making the identical pairwise_distances call, so the
    # branch is folded into a single call (behavior unchanged).
    W = pairwise.pairwise_distances(x, x2, metric)
    W = np.exp(-sigma * np.square(W))
    return W
开发者ID:adgress,项目名称:PythonFramework,代码行数:34,代码来源:array_functions.py


示例3: bipartite_clustering

def bipartite_clustering(D2W,word_cluster_num,doc_cluster_num,metric,criteria):
	"""Alternately co-cluster words and documents of a doc-term matrix.

	Runs four rounds in which word clusters and document clusters are
	refined against projections of each other's centroid distances.

	:param D2W: document-to-word matrix (documents x words).
	:param word_cluster_num: number of word clusters.
	:param doc_cluster_num: number of document clusters.
	:param metric: metric name passed to pairwise_distances.
	:param criteria: convergence criteria forwarded to kmean().
	:return: tuple (D2DC, W2WC) of document and word cluster labels.
	"""
	W2D = D2W.transpose()
	# Initial word clustering on the word-to-document matrix.
	W2WC = kmean(W2D,word_cluster_num,criteria)
	#word_cluster_num = np.amax(W2WC)+1
	#print "wc:",word_cluster_num
	for loop in range(4):
		#D2WC = D2W.dot(transform_from_index_array(W2WC,W2WC.size,word_cluster_num))
		#print D2WC
		#print loop
		# Project documents into word-cluster space via centroid distances.
		new_centroids = get_new_centroids(W2D,W2WC)
		new_distance_matrix = pairwise_distances(W2D,new_centroids,metric=metric) #how to calculate distance? maybe 1-matrix?
		#print new_distance_matrix
		D2WC = D2W.dot(new_distance_matrix)
		if loop==0:
			D2DC = kmean(D2WC,doc_cluster_num,criteria)
		else:
			# Warm-start k-means from the previous document clustering.
			new_centroids = get_new_centroids(D2WC,D2DC)
			D2DC = kmean(D2WC,doc_cluster_num,criteria,new_centroids)
		#doc_cluster_num = np.amax(D2DC)+1
		#print "dc:",doc_cluster_num
		# Project words into document-cluster space and re-cluster them.
		new_centroids = get_new_centroids(D2W,D2DC)
		new_distance_matrix = pairwise_distances(D2W,new_centroids,metric=metric) 
		W2DC = W2D.dot(new_distance_matrix)
		new_centroids = get_new_centroids(W2DC,W2WC)
		W2WC = kmean(W2DC,word_cluster_num,criteria,new_centroids)
		#word_cluster_num = np.amax(W2WC)+1
		#print "wc:",word_cluster_num 
	return D2DC,W2WC
开发者ID:Smuzi,项目名称:Text-Mining,代码行数:28,代码来源:bpc.py


示例4: generate_dist_stats_feat

def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test, indices_dict):
    """Build per-class distance-distribution features for each test sample.

    For every test sample and each of the 13 relevance classes, computes
    mean, std and quantiles of its similarity to the training samples of
    that class (excluding the sample itself when ids match).

    :param metric: 'cosine' (similarity = 1 - distance) or 'euclidean'
        (raw distances).
    :param X_train: training feature matrix.
    :param ids_train: identifiers aligned with the rows of X_train.
    :param X_test: test feature matrix.
    :param ids_test: identifiers aligned with the rows of X_test.
    :param indices_dict: maps class index -> list of training row indices.
    :return: array of shape (len(ids_test), stats_feat_num * 13).
    :raises ValueError: for an unsupported metric (the original fell
        through with `sim` unbound and crashed later with NameError).
    """
    ## stats parameters 
    quantiles_range = np.arange(0, 1.5, 0.5)
    stats_func = [ np.mean, np.std ]
    stats_feat_num = len(quantiles_range) + len(stats_func)
    n_class_relevance = 13

    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test), stats_feat_num*n_class_relevance), dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test), stats_feat_num*n_class_relevance), dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    else:
        raise ValueError("unsupported metric: %r" % (metric,))

    print("pairwise_distances generated!")
    for i in range(len(ids_test)):
        sample_id = ids_test[i]  # renamed from `id` to avoid shadowing the builtin
        for j in range(n_class_relevance):
            if j in indices_dict:
                inds = indices_dict[j]
                # exclude this sample itself from the list of indices
                inds = [ ind for ind in inds if sample_id != ids_train[ind] ]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [ func(sim_tmp) for func in stats_func ]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i,j*stats_feat_num:(j+1)*stats_feat_num] = feat
    return stats_feat
开发者ID:aaxwaz,项目名称:Kaggle_HomeDepot_Stacking,代码行数:32,代码来源:utils.py


示例5: dunn

    def dunn(max_nc, all_labels, dataset):
        """Compute the Dunn index for every cluster count in [2, max_nc].

        Dunn index = (minimum inter-cluster distance) /
        (maximum intra-cluster distance); larger values indicate better
        separated, more compact clusterings.

        :param max_nc: largest number of clusters to evaluate.
        :param all_labels: all_labels[nc - 2] is the label vector of the
            nc-cluster solution over `dataset` rows.
        :param dataset: data matrix the labels refer to.
        :return: list of Dunn values, one per nc from 2 to max_nc.
        """
        dunn = []
        print "DUNN (MAX)..."
        for nc in xrange(2, max_nc + 1):
            dn = 0.0
            max_intra = 0.0
            # Pass 1: largest pairwise distance inside any single cluster.
            for cluster_i in xrange(nc):
                instances_i = dataset[np.where(all_labels[nc - 2] == cluster_i)[0]]
                pairwase_matrix_intra = pairwise_distances(instances_i, n_jobs=1)
                new_max_intra = np.amax(pairwase_matrix_intra)
                if new_max_intra > max_intra:
                    max_intra = new_max_intra
            # Pass 2: smallest distance between points of distinct clusters,
            # tracking the minimum ratio min_inter / max_intra over pairs.
            for cluster_i in xrange(nc):
                instances_i = dataset[np.where(all_labels[nc - 2] == cluster_i)[0]]
                for cluster_j in xrange(nc):
                    if cluster_j > cluster_i:
                        instances_j = dataset[np.where(all_labels[nc - 2] == cluster_j)[0]]
                        pairwase_matrix_inter = pairwise_distances(instances_i, instances_j, n_jobs=1)
                        min_inter = np.amin(pairwase_matrix_inter)

                        if dn == 0.0:
                            dn = min_inter / max_intra
                        elif min_inter / max_intra < dn:
                            dn = min_inter / max_intra
            print 'DUNN for k = ' + str(nc) + ' is ' + str(dn) + ' ...'
            dunn += [dn]
        return dunn
开发者ID:nielsenrechia,项目名称:clustering-measurements,代码行数:27,代码来源:measurements.py


示例6: pairwise_distances

def pairwise_distances(X, Y=None, index=None, metric="euclidean"):
    '''
    Compute the distance matrix from a vector array X and optional Y.
    This method takes either a vector array or a distance matrix,
    and returns a distance matrix. If the input is a vector array,
    the distances are computed. If the input is a distances matrix,
    it is returned instead.
    This method provides a safe way to take a distance matrix as input,
    while preserving compatibility with many other algorithms that take
    a vector array.

    :param X:  array [n_samples_a, n_samples_a]
        Array of pairwise distances between samples, or a feature array.
    :param Y:   array [n_samples_b, n_features]
        A second feature array only if X has shape [n_samples_a, n_features].
    :param index:  int, the index of element in X array
    :param metric: The metric to use when calculating distance between instances in a feature array.
        If metric ='rmsd', it should be computed by MDTraj
    :return: The distances
    '''
    if metric == "rmsd":
        # RMSD path: delegate to MDTraj; `index` selects the reference frame.
        if Y is None:
            distances_ = md.rmsd(X, X, index, parallel=True, precentered=True)
        else:
            #distances_ = np.empty((len(X), len(Y)), dtype=np.float32)
           # for i in xrange(len(Y)):
            distances_ = md.rmsd(X, Y, index, parallel=True, precentered=True)
        return distances_
    else:
        # Non-RMSD path: delegate to sklearn's pairwise_distances (sp).
        if Y is None:
            print "if Y is None"
            return sp.pairwise_distances(X, X[index], metric=metric)
        if index is None:
            print "if index is None, pairwise XX"
            return sp.pairwise_distances(X, X, metric=metric)
        # NOTE(review): when metric != "rmsd" and BOTH Y and index are
        # given, no branch matches and the function implicitly returns
        # None; Y is never used on this path either. Confirm callers
        # never reach this combination.
开发者ID:liusong299,项目名称:HK_DataMiner,代码行数:35,代码来源:pairwise.py


示例7: generate_dist_stats_feat

def generate_dist_stats_feat(metric, X_train, ids_train, X_test, ids_test, indices_dict, qids_test=None):
    """Build per-class distance-distribution features for each test sample.

    Relies on the module-level constants `stats_feat_num`, `n_classes`,
    `stats_func` and `quantiles_range`.

    :param metric: 'cosine' (similarity = 1 - distance) or 'euclidean'.
    :param X_train: training feature matrix.
    :param ids_train: identifiers aligned with the rows of X_train.
    :param X_test: test feature matrix.
    :param ids_test: identifiers aligned with the rows of X_test.
    :param indices_dict: maps class (or (qid, class)) -> train row indices.
    :param qids_test: optional per-sample query ids; when given,
        indices_dict is keyed by (qid, class) instead of class alone.
    :return: array of shape (len(ids_test), stats_feat_num * n_classes).
    :raises ValueError: for an unsupported metric (the original fell
        through with `sim` unbound and crashed later with NameError).
    """
    if metric == "cosine":
        stats_feat = 0 * np.ones((len(ids_test), stats_feat_num * n_classes), dtype=float)
        sim = 1. - pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    elif metric == "euclidean":
        stats_feat = -1 * np.ones((len(ids_test), stats_feat_num * n_classes), dtype=float)
        sim = pairwise_distances(X_test, X_train, metric=metric, n_jobs=1)
    else:
        raise ValueError("unsupported metric: %r" % (metric,))

    for i in range(len(ids_test)):
        sample_id = ids_test[i]  # renamed from `id` to avoid shadowing the builtin
        if qids_test is not None:
            qid = qids_test[i]
        for j in range(n_classes):
            key = (qid, j + 1) if qids_test is not None else j + 1
            # dict.has_key() was removed in Python 3; `in` is equivalent
            # and works on both Python 2 and 3.
            if key in indices_dict:
                inds = indices_dict[key]
                # exclude this sample itself from the list of indices
                inds = [ind for ind in inds if sample_id != ids_train[ind]]
                sim_tmp = sim[i][inds]
                if len(sim_tmp) != 0:
                    feat = [func(sim_tmp) for func in stats_func]
                    ## quantile
                    sim_tmp = pd.Series(sim_tmp)
                    quantiles = sim_tmp.quantile(quantiles_range)
                    feat = np.hstack((feat, quantiles))
                    stats_feat[i, j * stats_feat_num:(j + 1) * stats_feat_num] = feat
    return stats_feat
开发者ID:venkataravuri,项目名称:kaggle_homedepot,代码行数:27,代码来源:feature_extract_basic_tiidf.py


示例8: test_no_data_conversion_warning

def test_no_data_conversion_warning():
    """pairwise_distances must stay silent for non-boolean metrics."""
    # No warnings issued if metric is not a boolean distance function
    random_state = np.random.RandomState(0)
    data = random_state.randn(5, 4)
    with pytest.warns(None) as captured:
        pairwise_distances(data, metric="minkowski")
    assert len(captured) == 0
开发者ID:scikit-learn,项目名称:scikit-learn,代码行数:7,代码来源:test_pairwise.py


示例9: trustworthiness

def trustworthiness(X, X_embedded, n_neighbors=5, precomputed=False):
    r"""Expresses to what extent the local structure is retained.

    The trustworthiness is within [0, 1]. It is defined as

    .. math::

        T(k) = 1 - \frac{2}{nk (2n - 3k - 1)} \sum^n_{i=1}
            \sum_{j \in U^{(k)}_i} (r(i, j) - k)

    where :math:`r(i, j)` is the rank of the embedded datapoint j
    according to the pairwise distances between the embedded datapoints,
    :math:`U^{(k)}_i` is the set of points that are in the k nearest
    neighbors in the embedded space but not in the original space.

    * "Neighborhood Preservation in Nonlinear Projection Methods: An
      Experimental Study"
      J. Venna, S. Kaski
    * "Learning a Parametric Embedding by Preserving Local Structure"
      L.J.P. van der Maaten

    Parameters
    ----------
    X : array, shape (n_samples, n_features) or (n_samples, n_samples)
        If the metric is 'precomputed' X must be a square distance
        matrix. Otherwise it contains a sample per row.

    X_embedded : array, shape (n_samples, n_components)
        Embedding of the training data in low-dimensional space.

    n_neighbors : int, optional (default: 5)
        Number of neighbors k that will be considered.

    precomputed : bool, optional (default: False)
        Set this flag if X is a precomputed square distance matrix.

    Returns
    -------
    trustworthiness : float
        Trustworthiness of the low-dimensional embedding.

    Notes
    -----
    The docstring is now a raw string: previously ``\frac`` embedded a
    literal form-feed character (``\f`` escape) into the rendered doc,
    and the sum was written ``\sum_{j \in U^{(k)}_i (r(i, j) - k)}``
    with the closing brace in the wrong place.
    """
    if precomputed:
        dist_X = X
    else:
        dist_X = pairwise_distances(X, squared=True)
    dist_X_embedded = pairwise_distances(X_embedded, squared=True)
    # Ranks in the original and embedded spaces; the embedded ranking
    # skips index 0 (each point is its own nearest neighbor).
    ind_X = np.argsort(dist_X, axis=1)
    ind_X_embedded = np.argsort(dist_X_embedded, axis=1)[:, 1:n_neighbors + 1]

    n_samples = X.shape[0]
    t = 0.0
    ranks = np.zeros(n_neighbors)
    for i in range(n_samples):
        for j in range(n_neighbors):
            # Rank of embedded neighbor j in the original-space ordering.
            ranks[j] = np.where(ind_X[i] == ind_X_embedded[i, j])[0][0]
        ranks -= n_neighbors
        # Only neighbors that fell outside the original k-NN set penalize.
        t += np.sum(ranks[ranks > 0])
    t = 1.0 - t * (2.0 / (n_samples * n_neighbors *
                          (2.0 * n_samples - 3.0 * n_neighbors - 1.0)))
    return t
开发者ID:lmcinnes,项目名称:sstsne,代码行数:60,代码来源:ss_t_sne.py


示例10: test_radius_neighbors

def test_radius_neighbors():
    """Radius queries must only return neighbors within `radius`.

    Checks whether returned distances are less than `radius`: at least
    one point should be returned when the `radius` equals the mean
    distance from the query to the rest of the dataset.  Also compares
    LSHForest's radius neighbors with exact `NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = ignore_warnings(LSHForest, category=DeprecationWarning)()
    # Test unfitted estimator
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    ignore_warnings(lshf.fit)(X)

    for i in range(n_iter):
        # Select a random point in the dataset as the query
        query = X[rng.randint(0, n_samples)].reshape(1, -1)

        # At least one neighbor should be returned when the radius is the
        # mean distance from the query to the points of the dataset.
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)

        assert_equal(neighbors.shape, (1,))
        assert_equal(neighbors.dtype, object)
        assert_greater(neighbors[0].shape[0], 0)
        # All distances to points in the results of the radius query should
        # be less than mean_dist
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Multiple points
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)

    # dists and inds should not be 1D arrays or arrays of variable lengths
    # hence the use of the object dtype.
    assert_equal(distances.shape, (n_queries,))
    assert_equal(distances.dtype, object)
    assert_equal(neighbors.shape, (n_queries,))
    assert_equal(neighbors.dtype, object)

    # Compare with exact neighbor search
    query = X[rng.randint(0, n_samples)].reshape(1, -1)
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine').fit(X)

    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)

    # BUG FIX: the exact/approximate distances were computed but never
    # compared (the function ended abruptly). Exact-search distances must
    # be less than or equal to the approximate ones.
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
开发者ID:AlexisMignon,项目名称:scikit-learn,代码行数:59,代码来源:test_approximate.py


示例11: getSimMat

    def getSimMat(self, type = 'euclidean', ftr_type = 'data', orderFlag = True, pca_dim=20):
        """Build the similarity matrix for the selected time series.

        Optionally reduces the data with PCA/NMF/ICA before computing
        euclidean or cosine distances, then (if orderFlag) reorders the
        series and the matrix by hierarchical-clustering leaf order.
        Results are stored on self (simMat, slctData, slctDataMat,
        patchOrdering, clstData, clstSimMat); nothing is returned.

        :param type: one of 'euclidean', 'pca_euc', 'nmf_euc', 'ica_euc',
            'cosine', 'pca_cos', 'nmf_cos', 'ica_cos'.
        :param ftr_type: 'ftr' to use precomputed features, 'data' for raw values.
        :param orderFlag: reorder rows/columns by dendrogram leaf order.
        :param pca_dim: target dimensionality for the reductions
            (capped at ceil(len(data)/2) for small datasets).
        """
        if ftr_type == 'ftr':
            #use input features
            self.slctData = [ts for ts in self.slctData if ((ts.ftr is not None) and (len(ts.ftr) > 0))]
            dataMat = [ts.ftr for ts in self.slctData]
        elif ftr_type == 'data':
            #use input data
            dataMat = [ts.val for ts in self.slctData]
        else:
            print 'unknown ftr_type for ftr_type:', ftr_type
        # Cap the reduction dimensionality for small datasets.
        if pca_dim > len(dataMat):
            pca_dim = int(math.ceil(len(dataMat)/2.0))

        if type  == 'euclidean': #euclidean distance based on time series data
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type == 'pca_euc': #extract feature based on PCA, then use Euclidean distance
            pca = skd.PCA(n_components=pca_dim)
            dataMat = pca.fit_transform(dataMat)
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type == 'nmf_euc': #extract feature based on NMF, then use Euclidean distance
            nmf = skd.NMF(n_components=pca_dim)
            dataMat = nmf.fit_transform(dataMat)
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type =='ica_euc': #extract feature based on ICA, then use Euclidean distance
            ica = skd.FastICA(n_components=pca_dim)
            dataMat = ica.fit_transform(dataMat)
            self.simMat = skmpw.euclidean_distances(dataMat)
        elif type =='cosine':
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        elif type == 'pca_cos': #extract feature based on PCA, then use cosine distance
            pca = skd.PCA(n_components=pca_dim)
            dataMat = pca.fit_transform(dataMat)
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        elif type == 'nmf_cos': #extract feature based on NMF, then use cosine distance
            nmf = skd.NMF(n_components=pca_dim)
            dataMat = nmf.fit_transform(dataMat)
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        elif type =='ica_cos': #extract feature based on ICA, then use cosine distance
            ica = skd.FastICA(n_components=pca_dim)
            dataMat = ica.fit_transform(dataMat)
            self.simMat = skmpw.pairwise_distances(dataMat, metric='cosine')
        else:
            print 'unknown type for similarity matrix: ', type

        #rearrange the order of data in simMat
        self.slctDataMat = dataMat
        if orderFlag:
            # Reorder everything by the dendrogram leaf order so similar
            # series end up adjacent in the matrix.
            link = spc.hierarchy.linkage(self.simMat)
            dend = spc.hierarchy.dendrogram(link, no_plot=True)
            order = dend['leaves']
            self.slctData = [self.slctData[i] for i in order] #rearrange order
            self.simMat = [self.simMat[i] for i in order]
            for i in xrange(len(self.simMat)):
                self.simMat[i] = [self.simMat[i][j] for j in order]
            self.slctDataMat = [self.slctDataMat[i] for i in order]
        # self.patchOrdering = [ts.ptchNm for ts in self.slctData] #record new ordering
        self.patchOrdering = JSONifyData(self.slctData) # Deok wants all the data for each patch in the response
        self.clstData = self.slctData
        self.clstSimMat = self.simMat
开发者ID:KonstantinosX,项目名称:TimeGrouper,代码行数:59,代码来源:ts_clustering.py


示例12: predict

def predict(dialogue_session, line):
    """Return (vector_key, cosine_distance) of the predictor closest to
    the given line of a dialogue session.

    Falls back to ('x', 1) when no predictor is nearer than distance 1.

    :param dialogue_session: key into the module-level dataDict.
    :param line: row index of the line vector within that session.
    """
    lowest = ('x', 1)
    data = dataDict[dialogue_session][1][line, :]
    for vector in vDict:
        # Compute the distance once per candidate; the original called
        # pairwise_distances twice (once for the comparison, once again
        # for the stored value).
        dist = pair.pairwise_distances(vDict[vector], data, 'cosine')
        if dist < lowest[1]:
            lowest = (vector, dist)
    return lowest
开发者ID:DmitriyLeybel,项目名称:multimodal_prediction,代码行数:8,代码来源:predictability_svd_final.py


示例13: cramer_statistic

    def cramer_statistic(self, n_jobs=1):
        '''
        Applies the Cramer Statistic to the datasets.

        Uses self.data_matrix1 and self.data_matrix2 and stores the
        resulting statistic in self._distance.

        Parameters
        ----------
        n_jobs : int, optional
            Sets the number of cores to use to calculate
            pairwise distances. Default is 1.
        '''
        # Adjust what we call n,m based on the larger dimension.
        # Then the looping below is valid.
        if self.data_matrix1.shape[0] >= self.data_matrix2.shape[0]:
            m = self.data_matrix1.shape[0]
            n = self.data_matrix2.shape[0]
            larger = self.data_matrix1
            smaller = self.data_matrix2
        else:
            n = self.data_matrix1.shape[0]
            m = self.data_matrix2.shape[0]
            larger = self.data_matrix2
            smaller = self.data_matrix1

        # Within-set distances (11, 22) and the between-set distances (12).
        pairdist11 = pairwise_distances(larger, metric="euclidean",
                                        n_jobs=n_jobs)
        pairdist22 = pairwise_distances(smaller, metric="euclidean",
                                        n_jobs=n_jobs)
        pairdist12 = pairwise_distances(larger, smaller,
                                        metric="euclidean", n_jobs=n_jobs)

        # Take sqrt of each
        # We default to using the Cramer kernel in Baringhaus & Franz (2004)
        # \phi(dist) = sqrt(dist) / 2.
        # The normalization values below reflect this
        pairdist11 = np.sqrt(pairdist11)
        pairdist12 = np.sqrt(pairdist12)
        pairdist22 = np.sqrt(pairdist22)

        # Accumulate the three energy-statistic terms: between-set,
        # within-larger, and within-smaller sums.
        term1 = 0.0
        term2 = 0.0
        term3 = 0.0
        for i in range(m):
            for j in range(n):
                term1 += pairdist12[i, j]
            for ii in range(m):
                term2 += pairdist11[i, ii]

            # Only the first n rows index into the smaller set.
            if i < n:
                for jj in range(n):
                    term3 += pairdist22[i, jj]

        m, n = float(m), float(n)

        term1 *= (1 / (m * n))
        term2 *= (1 / (2 * m ** 2.))
        term3 *= (1 / (2 * n ** 2.))

        self._distance = (m * n / (m + n)) * (term1 - term2 - term3)
开发者ID:Astroua,项目名称:TurbuStat,代码行数:58,代码来源:cramer.py


示例14: run_step

    def run_step(self, run_number, step_size, howlong):
        """Incrementally extend the pairwise-distance matrix with rows
        newly created in the "df" input slot.

        Only the distances involving new rows are computed (Sj among the
        new rows, Sij between old and new) and spliced into the growing
        buffer, so the full matrix is never recomputed.

        :param run_number: scheduler run identifier passed to the slots.
        :param step_size: maximum number of new rows to process this step.
        :param howlong: time budget.  # NOTE(review): unused in this body — confirm
        """
        dfslot = self.get_input_slot("df")
        df = dfslot.data()
        dfslot.update(run_number)
        # Updates/deletions invalidate the incremental state: start over.
        if dfslot.has_updated() or dfslot.has_deleted():
            dfslot.reset()
            logger.info("Reseting history because of changes in the input df")
            dfslot.update(run_number, df)
            # TODO: be smarter with changed values

        m = step_size

        indices = dfslot.next_created(m)
        m = indices_len(indices)

        # i: rows already in the matrix, j: newly created rows.
        i = None
        j = None
        Si = self._buf.matrix()

        # Prefer the raw array input when available; otherwise extract
        # the relevant columns from the DataFrame.
        arrayslot = self.get_input_slot("array")
        if arrayslot is not None and arrayslot.data() is not None:
            array = arrayslot.data()
            logger.debug("Using array instead of DataFrame columns")
            if Si is not None:
                i = array[self._last_index]
            j = array[indices]
        if j is None:
            if self.columns is None:
                self.columns = df.columns.delete(np.where(df.columns == Module.UPDATE_COLUMN))
            elif not isinstance(self.columns, pd.Index):
                self.columns = pd.Index(self.columns)
            rows = df[self.columns]
            if Si is not None:
                i = rows.loc[self._last_index]
                assert len(i) == len(self._last_index)
            j = rows.loc[fix_loc(indices)]
            assert len(j) == indices_len(indices)

        Sj = pairwise_distances(j, metric=self._metric, n_jobs=self._n_jobs)
        if Si is None:
            # First batch: the new block is the whole matrix.
            mat = self._buf.resize(Sj.shape[0])
            mat[:, :] = Sj
            self._last_index = dfslot.last_index[indices]
        else:
            # Splice the old-vs-new block (and its transpose) plus the
            # new-vs-new block into the enlarged matrix.
            Sij = pairwise_distances(i, j, metric=self._metric, n_jobs=self._n_jobs)
            n0 = i.shape[0]
            n1 = n0 + j.shape[0]
            mat = self._buf.resize(n1)
            mat[0:n0, n0:n1] = Sij
            mat[n0:n1, 0:n0] = Sij.T
            mat[n0:n1, n0:n1] = Sj
            self._last_index = self._last_index.append(df.index[indices])
            # truth = pairwise_distances(array[0:n1], metric=self._metric)
            # import pdb
            # pdb.set_trace()
            # assert np.allclose(mat,truth)
        return self._return_run_step(dfslot.next_state(), steps_run=m)
开发者ID:jdfekete,项目名称:progressivis,代码行数:57,代码来源:pairwise.py


示例15: test_radius_neighbors

def test_radius_neighbors():
    """Radius queries must only return neighbors within `radius`.

    With `radius` set to the mean distance from a query point to the
    rest of the dataset, at least one neighbor must come back and every
    returned distance must stay below that radius.  LSHForest results
    are also compared against exact `NearestNeighbors`.
    """
    n_samples = 12
    n_features = 2
    n_iter = 10
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features)

    lshf = LSHForest()
    # Querying before fit must raise.
    assert_raises(ValueError, lshf.radius_neighbors, X[0])

    lshf.fit(X)

    for _ in range(n_iter):
        query = X[rng.randint(0, n_samples)]
        mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
        neighbors = lshf.radius_neighbors(query, radius=mean_dist,
                                          return_distance=False)
        # At least one neighbor must come back at the mean-distance radius.
        assert_greater(neighbors.shape[0], 0)
        # Every returned distance must stay below mean_dist.
        distances, neighbors = lshf.radius_neighbors(query,
                                                     radius=mean_dist,
                                                     return_distance=True)
        assert_array_less(distances[0], mean_dist)

    # Batched queries: one result row per query point.
    n_queries = 5
    queries = X[rng.randint(0, n_samples, n_queries)]
    distances, neighbors = lshf.radius_neighbors(queries,
                                                 return_distance=True)
    assert_equal(neighbors.shape[0], n_queries)
    assert_equal(distances.shape[0], n_queries)
    # Variable-length results, so outputs are 1-D object arrays, not 2-D.
    assert_equal(distances.ndim, 1)
    assert_equal(neighbors.ndim, 1)

    # Cross-check against brute-force exact search.
    query = X[rng.randint(0, n_samples)]
    mean_dist = np.mean(pairwise_distances(query, X, metric='cosine'))
    nbrs = NearestNeighbors(algorithm='brute', metric='cosine')
    nbrs.fit(X)

    distances_approx, _ = lshf.radius_neighbors(query, radius=mean_dist)
    distances_exact, _ = nbrs.radius_neighbors(query, radius=mean_dist)
    # Exact distances must be no greater than the approximate ones.
    assert_true(np.all(np.less_equal(np.sort(distances_exact[0]),
                                     np.sort(distances_approx[0]))))
开发者ID:CC-Fu-CC,项目名称:scikit-learn,代码行数:56,代码来源:test_approximate.py


示例16: fit

	def fit(self, X, y=None, c=None):
		"""Fit the model using X as training data.

		Parameters
		----------
		X : array, shape (n_samples, n_features) or (n_samples, n_samples)
			If the metric is 'precomputed' X must be a square distance
			matrix. Otherwise it contains a sample per row.
		y : ignored, present for scikit-learn estimator API compatibility.
		c : optional, forwarded unchanged to the internal ``_tsne`` call.
		"""
		X = check_array(X, accept_sparse=['csr', 'csc', 'coo'], dtype=np.float64)
		random_state = check_random_state(self.random_state)

		if self.early_exaggeration < 1.0:
			raise ValueError("early_exaggeration must be at least 1, but is "
							 "%f" % self.early_exaggeration)

		if self.n_iter < 200:
			raise ValueError("n_iter should be at least 200")

		if self.metric == "precomputed":
			if self.init == 'pca':
				raise ValueError("The parameter init=\"pca\" cannot be used "
								 "with metric=\"precomputed\".")
			if X.shape[0] != X.shape[1]:
				raise ValueError("X should be a square distance matrix")
			distances = X
		else:
			if self.verbose:
				print("[t-SNE] Computing pairwise distances...")

			# Squared distances for euclidean skip a redundant sqrt;
			# other metrics are used as-is.
			if self.metric == "euclidean":
				distances = pairwise_distances(X, metric=self.metric, squared=True)
			else:
				distances = pairwise_distances(X, metric=self.metric)

		# Degrees of freedom of the Student's t-distribution. The suggestion
		# alpha = n_components - 1 comes from "Learning a Parametric Embedding
		# by Preserving Local Structure" Laurens van der Maaten, 2009.
		alpha = max(self.n_components - 1.0, 1)
		n_samples = X.shape[0]
		self.training_data_ = X

		P = _joint_probabilities(distances, self.perplexity, self.verbose)
		# Keep a copy of the joint probabilities for later inspection.
		self.P = deepcopy(P)
		if self.init == 'pca':
			pca = RandomizedPCA(n_components=self.n_components,
								random_state=random_state)
			X_embedded = pca.fit_transform(X)
		elif self.init == 'random':
			# None tells _tsne to draw a random initial embedding.
			X_embedded = None
		else:
			raise ValueError("Unsupported initialization scheme: %s"
							 % self.init)

		self.embedding_ = self._tsne(P, alpha, n_samples, random_state,
									 X_embedded=X_embedded, c=c)
开发者ID:Kazjon,项目名称:deep_creeval,代码行数:56,代码来源:AnimTSNE_experiment.py


示例17: knn_dist

def knn_dist(x, x_ctrl, s=100, p=1):
    """Distance from each row of x to its nearest k-means center of x_ctrl.

    The control set is first randomly subsampled to at most 200000 rows,
    then reduced to s k-means centers; each query row's minimum distance
    to those centers is returned.

    :param x: query samples, shape (n, d).
    :param x_ctrl: control samples to summarize.
    :param s: number of k-means centers kept from the control set.
    :param p: norm order, 1 (L1) or 2 (L2).
    :return: array of n minimum distances.
    :raises ValueError: if p is not 1 or 2 (previously `min_dist` was
        left unbound and the function crashed with NameError).
    """
    x_tmp = random_subsample(x_ctrl, 200000, replace=False)
    xs = kmeans_subsample(x_tmp, s)
    if p == 1:
        metric = "l1"
    elif p == 2:
        metric = "l2"
    else:
        raise ValueError("p must be 1 or 2, got %r" % (p,))
    min_dist = np.min(pairwise_distances(X=x, Y=xs, metric=metric), axis=1)
    assert len(min_dist) == x.shape[0]

    return min_dist
开发者ID:eiriniar,项目名称:CellCnn,代码行数:10,代码来源:downsample.py


示例18: multiQuadricKernel

    def multiQuadricKernel(X, X2=None, offset=1.0, jobs=1, *args, **kwargs):
        """Multiquadric kernel matrix: sqrt(d(x, y)**2 + offset**2).

        X2 defaults to X (self-kernel).  Extra positional/keyword
        arguments are accepted and ignored to match the shared kernel
        signature.
        """
        offset = float(offset)
        if X2 is None:
            dists = pairwise.pairwise_distances(X, n_jobs=jobs)
        else:
            dists = pairwise.pairwise_distances(X, X2, n_jobs=jobs)
        return np.sqrt(dists ** 2 + offset ** 2)
开发者ID:FabianIsensee,项目名称:machine_learning_1,代码行数:10,代码来源:ex6jens.py


示例19: kpca_cluster

def kpca_cluster(data,nclusters=100,ncomponents=40,topwhat=10,zscored=False):
    '''

    Computes clustering of bag-of-words vectors of articles

    INPUT
    data        iterable of raw article texts
    nclusters   number of clusters
    ncomponents number of KernelPCA components to keep
    topwhat     number of top words kept to describe each cluster
    zscored     whether to z-score the KPCA projection before k-means

    OUTPUT
    list of dicts per cluster: name, top-word description, member
    indices, and mean intra-cluster L2 distance

    '''
    from sklearn.cluster import KMeans
    # filtering out some noise words
    stops = map(lambda x:x.lower().strip(),open('stopwords.txt').readlines()[6:])

    # vectorize non-stopwords 
    bow = TfidfVectorizer(min_df=2,stop_words=stops)
    X = bow.fit_transform(data)

    # creating bow-index-to-word map
    idx2word = dict(zip(bow.vocabulary_.values(),bow.vocabulary_.keys()))

    # using now stopwords and filtering out digits
    print 'Computing pairwise distances' 
    K = pairwise_distances(X,metric='l2',n_jobs=1)
    # Kernel width taken as the 50th percentile of all pairwise distances.
    # NOTE(review): this value is passed directly as gamma (not 1/width) —
    # confirm that is intended.
    perc = 50.0
    width = percentile(K.flatten(),perc)

    # KPCA transform bow vectors
    Xc = KernelPCA(n_components=ncomponents,kernel='rbf',gamma=width).fit_transform(X)
    
    if zscored:
        Xc = zscore(Xc)
    
    # compute clusters
    km = KMeans(n_clusters=nclusters).fit(Xc)
    Xc = km.predict(Xc)

    clusters = []
    for icluster in range(nclusters):
        nmembers = (Xc==icluster).sum()
        if True:#nmembers < len(data) / 5.0 and nmembers > 1: # only group clusters big enough but not too big
            members = (Xc==icluster).nonzero()[0]
            # Highest-weight words of the cluster's summed tf-idf vector.
            topwordidx = array(X[members,:].sum(axis=0))[0].argsort()[-topwhat:][::-1]
            topwords = ' '.join([idx2word[wi] for wi in topwordidx])
            # Mean pairwise L2 distance within the cluster (upper triangle
            # only, normalized by the number of unordered pairs + diagonal).
            meanDist = triu(pairwise_distances(X[members,:],metric='l2',n_jobs=1)).sum()
            meanDist = meanDist / (len(members) + (len(members)**2 - len(members))/2.0)
            # print u'Cluster %d'%icluster + u' %d members'%nmembers + u' mean Distance %f'%meanDist + u'\n\t'+topwords
            clusters.append({
                'name':'Cluster-%d'%icluster,
                'description': topwords,
                'members': list(members),
                'meanL2Distances': meanDist
                })

    return clusters
开发者ID:christinakraus,项目名称:political-affiliation-prediction,代码行数:55,代码来源:newsreader.py


示例20: cramer_statistic

    def cramer_statistic(self, n_jobs=1):
        '''
        Applies the Cramer Statistic to the datasets.

        Uses self.data_matrix1 and self.data_matrix2, stores the result
        in ``self.distance`` and returns ``self``.

        Parameters
        ----------

        n_jobs : int, optional
            Sets the number of cores to use to calculate
            pairwise distances
        '''
        # Adjust what we call n,m based on the larger dimension.
        # Then the looping below is valid.
        if self.data_matrix1.shape[0] >= self.data_matrix2.shape[0]:
            m = self.data_matrix1.shape[0]
            n = self.data_matrix2.shape[0]
            larger = self.data_matrix1
            smaller = self.data_matrix2
        else:
            n = self.data_matrix1.shape[0]
            m = self.data_matrix2.shape[0]
            larger = self.data_matrix2
            smaller = self.data_matrix1

        # Within-set distances (11, 22) and between-set distances (12).
        pairdist11 = pairwise_distances(
            larger, metric="euclidean", n_jobs=n_jobs)
        pairdist22 = pairwise_distances(
            smaller, metric="euclidean", n_jobs=n_jobs)
        pairdist12 = pairwise_distances(
            larger, smaller,
            metric="euclidean", n_jobs=n_jobs)

        # Accumulate the three energy-statistic terms: between-set,
        # within-larger, and within-smaller sums.
        term1 = 0.0
        term2 = 0.0
        term3 = 0.0
        for i in range(m):
            for j in range(n):
                term1 += pairdist12[i, j]
            for ii in range(m):
                term2 += pairdist11[i, ii]

            # Only the first n rows index into the smaller set.
            if i < n:
                for jj in range(n):
                    term3 += pairdist22[i, jj]

        m, n = float(m), float(n)

        term1 *= (1 / (m * n))
        term2 *= (1 / (2 * m ** 2.))
        term3 *= (1 / (2 * n ** 2.))

        self.distance = (m * n / (m + n)) * (term1 - term2 - term3)

        return self
开发者ID:keflavich,项目名称:TurbuStat,代码行数:54,代码来源:cramer.py



注:本文中的sklearn.metrics.pairwise.pairwise_distances函数示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。


鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
Python pairwise.pairwise_distances_chunked函数代码示例发布时间:2022-05-27
下一篇:
Python pairwise.manhattan_distances函数代码示例发布时间:2022-05-27
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2023 极客世界.|Sitemap