
Python metrics.pairwise_distances Function Code Examples


This article collects typical usage examples of the Python function sklearn.metrics.pairwise_distances. If you have been wondering what exactly pairwise_distances does, how to call it, or what it looks like in real code, the hand-picked examples below should help.



The following presents 20 code examples of the pairwise_distances function, sorted by popularity by default. You can upvote the examples you find useful; ratings help surface better Python examples.
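
For orientation, here is a minimal sketch of the function itself (illustrative values only): given an (n_samples, n_features) array, pairwise_distances returns the square (n_samples, n_samples) distance matrix; passing a second array Y yields the rectangular cross-distance matrix between the two sets instead.

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[0.0, 0.0], [3.0, 4.0]])

# Square distance matrix over a single set of points
print(pairwise_distances(X, metric='euclidean'))
# [[0. 5.]
#  [5. 0.]]

# Rectangular cross-distance matrix between two sets
print(pairwise_distances(X, np.array([[0.0, 0.0]]), metric='euclidean'))
# [[0.]
#  [5.]]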

Example 1: test_silhouette

def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X = dataset.data
    y = dataset.target
    D = pairwise_distances(X, metric='euclidean')
    # Given that the actual labels are used, we can assume that S would be
    # positive.
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)
    # Test without calculating D
    silhouette_metric = silhouette_score(X, y, metric='euclidean')
    assert_almost_equal(silhouette, silhouette_metric)
    # Test with sampling
    silhouette = silhouette_score(D, y, metric='precomputed',
                                  sample_size=int(X.shape[0] / 2),
                                  random_state=0)
    silhouette_metric = silhouette_score(X, y, metric='euclidean',
                                         sample_size=int(X.shape[0] / 2),
                                         random_state=0)
    assert(silhouette > 0)
    assert(silhouette_metric > 0)
    assert_almost_equal(silhouette_metric, silhouette)
    # Test with sparse X
    X_sparse = csr_matrix(X)
    D = pairwise_distances(X_sparse, metric='euclidean')
    silhouette = silhouette_score(D, y, metric='precomputed')
    assert(silhouette > 0)
Developer: 0664j35t3r, Project: scikit-learn, Lines: 28, Source: test_unsupervised.py


Example 2: _hdbscan_generic

def _hdbscan_generic(X, min_samples=5, alpha=1.0,
                     metric='minkowski', p=2, leaf_size=None, gen_min_span_tree=False):
    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')

        distance_matrix = pairwise_distances(X, metric=metric, p=p)
    else:
        distance_matrix = pairwise_distances(X, metric=metric)

    mutual_reachability_ = mutual_reachability(distance_matrix,
                                               min_samples, alpha)

    min_spanning_tree = mst_linkage_core(mutual_reachability_)

    if gen_min_span_tree:
        result_min_span_tree = min_spanning_tree.copy()
        for index, row in enumerate(result_min_span_tree[1:], 1):
            candidates = np.where(np.isclose(mutual_reachability_[row[1]], row[2]))[0]
            candidates = np.intersect1d(candidates, min_spanning_tree[:index, :2].astype(int))
            candidates = candidates[candidates != row[1]]
            assert (len(candidates) > 0)
            row[0] = candidates[0]
    else:
        result_min_span_tree = None

    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, result_min_span_tree
Developer: xsongx, Project: hdbscan, Lines: 33, Source: hdbscan_.py
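
For context, the mutual_reachability step above rescales the raw distance matrix using each point's core distance. A minimal illustrative sketch of that transform (not hdbscan's actual implementation, and ignoring the alpha scaling):

import numpy as np
from sklearn.metrics import pairwise_distances

def mutual_reachability_sketch(X, min_samples=5):
    # Raw pairwise distances between all points.
    D = pairwise_distances(X, metric='euclidean')
    # Core distance of each point: distance to its min_samples-th nearest neighbor
    # (column 0 of the sorted matrix is the zero self-distance).
    core = np.sort(D, axis=1)[:, min_samples]
    # Mutual reachability distance: max(core[i], core[j], D[i, j]) for every pair.
    return np.maximum(np.maximum(core[:, None], core[None, :]), D)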


Example 3: _hdbscan_small_kdtree

def _hdbscan_small_kdtree(X, min_cluster_size=5, min_samples=None, 
                          metric='minkowski', p=2):

    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')

        distance_matrix = pairwise_distances(X, metric=metric, p=p)
    else:
        distance_matrix = pairwise_distances(X, metric=metric)

    mutual_reachability_ = kdtree_mutual_reachability(X, 
                                                      distance_matrix,
                                                      metric,
                                                      p=p,
                                                      min_points=min_samples)

    min_spanning_tree = mst_linkage_core(mutual_reachability_)
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    
    single_linkage_tree = label(min_spanning_tree)
    condensed_tree = condense_tree(single_linkage_tree,
                                   min_cluster_size)
    stability_dict = compute_stability(condensed_tree)
    cluster_list = get_clusters(condensed_tree, stability_dict)
    
    labels = -1 * np.ones(X.shape[0], dtype=int)
    for index, cluster in enumerate(cluster_list):
        labels[cluster] = index
    return labels, condensed_tree, single_linkage_tree, min_spanning_tree
Developer: rbkreisberg, Project: hdbscan, Lines: 32, Source: hdbscan_.py


Example 4: smart_initialize

def smart_initialize(data, k, seed=None):
    """
    Use k-means++ to initialize a good set of centroids
    :param data: whole dataset
    :param k: number of centroids
    :param seed: random seed
    :return: initial centroids
    """
    if seed is not None:  # useful for obtaining consistent results
        np.random.seed(seed)
    centroids = np.zeros((k, data.shape[1]))

    # Randomly choose the first centroid.
    # Since we have no prior knowledge, choose uniformly at random
    idx = np.random.randint(data.shape[0])
    centroids[0] = data[idx, :].toarray()
    # Compute distances from the first centroid chosen to all the other data points
    distances = pairwise_distances(data, centroids[0:1], metric='euclidean').flatten()

    for i in range(1, k):
        # Choose the next centroid randomly, so that the probability for each data point to be chosen
        # is directly proportional to its squared distance from the nearest centroid.
        # Roughly speaking, a new centroid should be as far from the existing centroids as possible.
        idx = np.random.choice(data.shape[0], 1, p=distances ** 2 / np.sum(distances ** 2))
        centroids[i] = data[idx, :].toarray()
        # Now compute distances from the centroids to all data points
        distances = np.min(pairwise_distances(data, centroids[0:i + 1], metric='euclidean'), axis=1)

    return centroids
Developer: yf23, Project: Machine_Learning_UW, Lines: 29, Source: assignment.py
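
A hypothetical usage sketch for smart_initialize: the .toarray() calls above imply the data is a SciPy sparse matrix, so the synthetic input below (names and sizes are illustrative only) is built with csr_matrix.

import numpy as np
from scipy.sparse import csr_matrix

rng = np.random.RandomState(0)
data = csr_matrix(rng.rand(100, 20))  # sparse input, as the .toarray() calls require
centroids = smart_initialize(data, k=3, seed=0)
print(centroids.shape)  # (3, 20)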


Example 5: _rsl_small_kdtree

def _rsl_small_kdtree(X, cut, k=5, alpha=1.4142135623730951, gamma=5, metric='minkowski', p=2):

    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not defined!')

        distance_matrix = pairwise_distances(X, metric=metric, p=p)
    else:
        distance_matrix = pairwise_distances(X, metric=metric)

    mutual_reachability_ = kdtree_mutual_reachability(X,
                                                      distance_matrix,
                                                      metric,
                                                      p=p,
                                                      min_points=k,
                                                      alpha=alpha)

    min_spanning_tree = mst_linkage_core(mutual_reachability_)
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    single_linkage_tree = label(min_spanning_tree)
    single_linkage_tree = SingleLinkageTree(single_linkage_tree)

    labels = single_linkage_tree.get_clusters(cut, gamma)

    return labels, single_linkage_tree
Developer: evelynmitchell, Project: hdbscan, Lines: 28, Source: robust_single_linkage_.py


Example 6: visualize_class_separation

def visualize_class_separation(X, labels):
  _, (ax1,ax2) = pyplot.subplots(ncols=2)
  label_order = np.argsort(labels)
  ax1.imshow(pairwise_distances(X[label_order]), interpolation='nearest')
  ax2.imshow(pairwise_distances(labels[label_order,None]),
             interpolation='nearest')
  pyplot.show()
Developer: EdwardBetts, Project: metaviro, Lines: 7, Source: sandwich.py


Example 7: class_separation

def class_separation(X, labels):
  unique_labels, label_inds = np.unique(labels, return_inverse=True)
  ratio = 0
  for li in range(len(unique_labels)):
    Xc = X[label_inds==li]
    Xnc = X[label_inds!=li]
    ratio += pairwise_distances(Xc).mean() / pairwise_distances(Xc,Xnc).mean()
  return ratio / len(unique_labels)
Developer: lyleaf, Project: metric_learn, Lines: 8, Source: metric_learn_test.py
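
A hypothetical usage example, assuming class_separation from above is in scope; lower values indicate classes that are compact relative to their separation (iris is used purely for illustration).

import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import pairwise_distances

iris = load_iris()
print(class_separation(iris.data, iris.target))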


Example 8: eval

    def eval(self, X):
        """Evaluate the kernel density estimation

        Parameters
        ----------
        X : array_like
            array of points at which to evaluate the KDE.  Shape is
            (n_points, n_dim), where n_dim matches the dimension of
            the training points.

        Returns
        -------
        dens : ndarray
            array of shape (n_points,) giving the density at each point.
            The density will be normalized for metric='gaussian' or
            metric='tophat', and will be unnormalized otherwise.
        """
        X = np.atleast_2d(X)
        if X.ndim != 2:
            raise ValueError('X must be two-dimensional')

        if X.shape[1] != self.X_.shape[1]:
            raise ValueError('dimensions of X do not match training dimension')

        if self.metric == 'gaussian':
            # wrangle gaussian into scikit-learn's 'rbf' kernel
            gamma = 0.5 / self.h / self.h
            D = pairwise_kernels(X, self.X_, metric='rbf', gamma=gamma)
            D /= np.sqrt(2 * np.pi * self.h ** (2 * X.shape[1]))
            dens = D.sum(1)

        elif self.metric == 'tophat':
            # use Ball Tree to efficiently count neighbors
            bt = BallTree(self.X_)
            counts = bt.query_radius(X, self.h,
                                     count_only=True)
            dens = counts / n_volume(self.h, X.shape[1])

        elif self.metric == 'exponential':
            D = pairwise_distances(X, self.X_)
            dens = np.exp(-abs(D) / self.h)
            dens = dens.sum(1)
            dens /= n_volume(self.h, X.shape[1]) * special.gamma(X.shape[1])

        elif self.metric == 'quadratic':
            D = pairwise_distances(X, self.X_)
            dens = (1 - (D / self.h) ** 2)
            dens[D > self.h] = 0
            dens = dens.sum(1)
            dens /= 2. * n_volume(self.h, X.shape[1]) / (X.shape[1] + 2)

        else:
            D = pairwise_kernels(X, self.X_, metric=self.metric, **self.kwargs)
            dens = D.sum(1)

        return dens
Developer: BTY2684, Project: astroML, Lines: 56, Source: density_estimation.py
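
The 'exponential' branch above also reads as a standalone recipe. A minimal sketch, assuming n_volume is the standard volume of a d-dimensional ball of radius h (the original imports this helper from its own module):

import numpy as np
from scipy import special
from sklearn.metrics import pairwise_distances

def n_volume(h, d):
    # Volume of a d-dimensional ball of radius h (assumed to match the helper above).
    return np.pi ** (d / 2.0) * h ** d / special.gamma(d / 2.0 + 1)

def exponential_kde(X_eval, X_train, h):
    # Unnormalized exponential-kernel sums, normalized as in Example 8.
    D = pairwise_distances(X_eval, X_train)
    dens = np.exp(-np.abs(D) / h).sum(axis=1)
    return dens / (n_volume(h, X_eval.shape[1]) * special.gamma(X_eval.shape[1]))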


Example 9: __call__

    def __call__(self, X_train, X_test, y_train, y_test):
        X = np.vstack([X_train, X_test])
        y = np.hstack([y_train, y_test])
        unique_labels, label_inds = np.unique(y, return_inverse=True)
        ratio = 0
        for li in range(len(unique_labels)):
            Xc = X[label_inds == li]
            Xnc = X[label_inds != li]
            ratio += pairwise_distances(Xc).mean() \
                / pairwise_distances(Xc, Xnc).mean()

        return -ratio / len(unique_labels)
Developer: svecon, Project: metric-learn, Lines: 12, Source: class_separation.py


Example 10: outlier_clusters_ward

def outlier_clusters_ward(x, y, skill=None, memory=None):
    # TODO: incorporate skill
    data = np.vstack((x, y)).T

    if len(data) == 0:
        # uh.
        print('clustering: NO cluster members!')
        cluster_centers = np.array([[-1, -1]])
        cluster_labels = []
        labels = []
        n_clusters = 0
        dist_within = np.array([])

    elif len(data) == 1:
        print('clustering: only 1 data point!')
        cluster_centers = data
        cluster_labels = [0]
        labels = np.array([0])
        n_clusters = 1
        dist_within = np.array([0])

    else:
        dist_within = 1000
        dist_max = 75
        n_clusters = 0
        n_clusters_max = 10

        clusterer = AgglomerativeClustering(n_clusters=n_clusters,
                memory=memory)

        # while dist_within > dist_max, keep adding clusters
        while (dist_within > dist_max) * (n_clusters < n_clusters_max):
            # iterate n_clusters
            n_clusters += 1
            clusterer.set_params(n_clusters=n_clusters)

            # cluster
            labels = clusterer.fit_predict(data)

            # get cluster_centers
            cluster_labels = range(n_clusters)
            cluster_centers = np.array([np.mean(data[labels == i], axis=0)
                                        for i in cluster_labels])

            # find dist_within: the maximum pairwise distance inside a cluster
            dist_within = np.max([np.max(pairwise_distances(
                                  data[labels == i]))
                                  for i in cluster_labels])

    dist_within_final = np.array([np.max(pairwise_distances(
            data[labels == i])) for i in cluster_labels])

    return cluster_centers, cluster_labels, labels, n_clusters, dist_within_final
Developer: kapadia, Project: SpaceWarps, Lines: 53, Source: make_lens_catalog.py


Example 11: test_precomputed

def test_precomputed(random_state=42):
    """Tests unsupervised NearestNeighbors with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    for method in ['kneighbors']:
        # TODO: also test radius_neighbors, but requires different assertion

        # As a feature matrix (n_samples by n_features)
        nbrs_X = neighbors.NearestNeighbors(n_neighbors=3)
        nbrs_X.fit(X)
        dist_X, ind_X = getattr(nbrs_X, method)(Y)

        # As a dense distance matrix (n_samples by n_samples)
        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='brute',
                                            metric='precomputed')
        nbrs_D.fit(DXX)
        dist_D, ind_D = getattr(nbrs_D, method)(DYX)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Check auto works too
        nbrs_D = neighbors.NearestNeighbors(n_neighbors=3, algorithm='auto',
                                            metric='precomputed')
        nbrs_D.fit(DXX)
        dist_D, ind_D = getattr(nbrs_D, method)(DYX)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Check X=None in prediction
        dist_X, ind_X = getattr(nbrs_X, method)(None)
        dist_D, ind_D = getattr(nbrs_D, method)(None)
        assert_array_almost_equal(dist_X, dist_D)
        assert_array_almost_equal(ind_X, ind_D)

        # Must raise a ValueError if the matrix is not of correct shape
        assert_raises(ValueError, getattr(nbrs_D, method), X)

    target = np.arange(X.shape[0])
    for Est in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        print(Est)
        est = Est(metric='euclidean')
        est.radius = est.n_neighbors = 1
        pred_X = est.fit(X, target).predict(Y)
        est.metric = 'precomputed'
        pred_D = est.fit(DXX, target).predict(DYX)
        assert_array_almost_equal(pred_X, pred_D)
Developer: AlexandreAbraham, Project: scikit-learn, Lines: 53, Source: test_neighbors.py


Example 12: find_distance_matrix

 def find_distance_matrix(self, metric='cosine'):
     '''
     compute distance matrix between topics using cosine or euclidean
     distance (default: cosine distance)
     '''
     if metric == 'cosine':
         self.distance_matrix = pairwise_distances(self.topics,
                                                   metric='cosine')
         # diagonals should be exactly zero, so remove rounding errors
         numpy.fill_diagonal(self.distance_matrix, 0)
     if metric == 'euclidean':
         self.distance_matrix = pairwise_distances(self.topics,
                                                   metric='euclidean')
Developer: nlesc-sherlock, Project: analyzing-corpora, Lines: 13, Source: clustering.py
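
The same idea in a self-contained sketch, with random vectors standing in for the topic matrix: cosine distances lie in [0, 2], and the diagonal is forced to exactly zero to scrub floating-point residue.

import numpy as np
from sklearn.metrics import pairwise_distances

topics = np.random.RandomState(0).rand(10, 50)  # stand-in for self.topics
dist = pairwise_distances(topics, metric='cosine')
np.fill_diagonal(dist, 0)  # diagonals should be exactly zero
print(dist.shape)  # (10, 10)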


Example 13: update_clfs_M

  def update_clfs_M(self, clfs, M):
    self.clfs = clfs
    self.M = M

    self.knn_test_dist, self.knn_test =  NearestNeighbors(self.k,  algorithm='brute', metric='mahalanobis', VI=self.M).fit(self.X_train).kneighbors(self.X_test)
    self.preds_train = np.array([e.predict(self.X_train) for e in clfs]).T
    self.preds_proba_train = np.array([e.predict_proba(self.X_train) for e in clfs]).swapaxes(0,1)
    self.preds_proba_train_smoothed = self.preds_proba_train + 0.01
    self.preds_test = np.array([e.predict(self.X_test) for e in clfs]).T
    self.preds_proba_test = np.array([e.predict_proba(self.X_test) for e in clfs]).swapaxes(0,1)
    self.pp_train = np.array([pt==yt for pt,yt in zip(self.preds_train, self.y_train)])
    self.pp_test = np.array([pt==yt for pt,yt in zip(self.preds_test, self.y_test)])
    self.pd_pp_test = pairwise_distances(self.pp_test, self.pp_train, metric='hamming')
    self.pd_preds_test = pairwise_distances(self.preds_test, self.preds_train, metric='hamming')
Developer: hippozhu, Project: dcs, Lines: 14, Source: DES.py


Example 14: update_input

 def update_input(self, clf):
   preds_train = np.array([e.predict(self.X_train) for e in clf.estimators_]).T
   self.pp_train = np.array([pt==yt for pt,yt in zip(preds_train, self.y_train)])
   preds_test = np.array([e.predict(self.X_test) for e in clf.estimators_]).T
   self.pp_test = np.array([pt==yt for pt,yt in zip(preds_test, self.y_test)])
   self.G = np.zeros(self.M.shape)
   self.active_set = None
   self.ij = []
   self.ijl = []
   self.loss = np.inf
   self.pd_pp = pairwise_distances(self.pp_train, metric='hamming')
   np.fill_diagonal(self.pd_pp, np.inf)
   self.pd_pp_test = pairwise_distances(self.pp_test, self.pp_train, metric='hamming')
   self.step_size = self.alpha
   self.step_size_break = False
Developer: hippozhu, Project: dcs, Lines: 15, Source: lmnn_pp.py


Example 15: visualize_document_clusters

def visualize_document_clusters(wiki, tf_idf, centroids, cluster_assignment, k,
                                map_index_to_word, display_content=True):
    '''wiki: original dataframe
       tf_idf: data matrix, sparse matrix format
       map_index_to_word: SFrame specifying the mapping between words and column indices
       display_content: if True, display 8 nearest neighbors of each centroid'''
    
    print('==========================================================')

    # Visualize each cluster c
    for c in range(k):
        # Cluster heading
        print('Cluster {0:d}    '.format(c), end='')
        # Print top 5 words with largest TF-IDF weights in the cluster
        idx = centroids[c].argsort()[::-1]
        for i in range(5): # Print each word along with the TF-IDF weight
            print('{0:s}:{1:.3f}'.format(map_index_to_word['category'][idx[i]], centroids[c,idx[i]]), end=' ')
        print('')
        
        if display_content:
            # Compute distances from the centroid to all data points in the cluster,
            # and compute nearest neighbors of the centroids within the cluster.
            distances = pairwise_distances(tf_idf, [centroids[c]], metric='euclidean').flatten()
            distances[cluster_assignment!=c] = float('inf') # remove non-members from consideration
            nearest_neighbors = distances.argsort()
            # For 8 nearest neighbors, print the title as well as first 180 characters of text.
            # Wrap the text at 80-character mark.
            for i in range(8):
                text = ' '.join(wiki[nearest_neighbors[i]]['text'].split(None, 25)[0:25])
                print('\n* {0:50s} {1:.5f}\n  {2:s}\n  {3:s}'.format(wiki[nearest_neighbors[i]]['name'],
                    distances[nearest_neighbors[i]], text[:90], text[90:180] if len(text) > 90 else ''))
        print('==========================================================')
Developer: howardx, Project: machinelearningspecwashington, Lines: 32, Source: kmeans_text_data.py


Example 16: display_single_tf_idf_cluster

def display_single_tf_idf_cluster(cluster, df_map):
    '''map_index_to_word: SFrame specifying the mapping between words and column indices'''

    wiki_subset = cluster['dataframe']
    tf_idf_subset = cluster['matrix']
    centroid = cluster['centroid']

    # Print top 5 words with largest TF-IDF weights in the cluster
    idx = centroid.argsort()[::-1]
    for i in range(5):
        print('{0:s}:{1:.3f}'.format(df_map[df_map['idx'] == idx[i]]['word'].values[0], centroid[idx[i]]))
    print('')

    # Compute distances from the centroid to all data points in the cluster.
    distances = pairwise_distances(tf_idf_subset, [centroid], metric='euclidean').flatten()
    # compute nearest neighbors of the centroid within the cluster.
    nearest_neighbors = distances.argsort()
    # For 8 nearest neighbors, print the title as well as first 180 characters of text.
    # Wrap the text at 80-character mark.
    for i in range(8):
        text = ' '.join(wiki_subset.iloc[nearest_neighbors[i]]['text'].split(None, 25)[0:25])
        print('* {0:50s} {1:.5f}\n  {2:s}\n  {3:s}'.format(wiki_subset.iloc[nearest_neighbors[i]]['name'],
              distances[nearest_neighbors[i]], text[:90], text[90:180] if len(text) > 90 else ''))
        print(text)
    print()
Developer: chern-git, Project: ml-clustering, Lines: 25, Source: ml_cluster6.py
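
The centroid-neighbor pattern shared by Examples 15 and 16, in isolation (data below is illustrative only): compute distances from every row to one centroid, then argsort to rank the nearest documents.

import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
docs = rng.rand(20, 8)  # stand-in for a tf-idf matrix
centroid = docs.mean(axis=0)
distances = pairwise_distances(docs, [centroid], metric='euclidean').flatten()
nearest_neighbors = distances.argsort()[:8]  # indices of the 8 closest documents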


Example 17: sim_calc

 def sim_calc(self):
     nt = self.corpora[0]
     self.scores = {}
     for corp in self.corpora:
         i_nt = []
         i_c2 = []
         rows = self.ekk_rows[corp[0]]
         for i, word in enumerate(self.ekk_rows['NT']):
             if word in rows:
                 i_nt.append(i)
                 i_c2.append(self.ekk_rows[corp[0]].index(word))
         d_c2 = np.memmap(
             '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_NORMED.dat'.format(
                 self.base, corp[0], corp[1], corp[2], self.english, self.prefix, self.svd),
             dtype='float32', shape=(len(rows), len(rows)))[i_c2]
         d_c2 = d_c2[:, i_c2]
         d_nt = np.memmap(
             '{0}{1}/{4}/{2}/{5}_{2}_lems=False_{4}_min_occ={3}_{6}no_stops=False_NORMED.dat'.format(
                 self.base, nt[0], nt[1], nt[2], self.english, self.prefix,
                 self.svd), dtype='float32',
             shape=(len(self.ekk_rows['NT']), len(self.ekk_rows['NT'])))[
             i_nt]
         d_nt = d_nt[:, i_nt]
         self.scores['{0}_{1}'.format('NT', corp[0])] = np.average(np.diag(
             1 - pairwise_distances(d_nt, d_c2, metric='cosine',
                                    n_jobs=12)))
Developer: sonofmun, Project: DissProject, Lines: 26, Source: compare_vectors.py


Example 18: train

    def train(self, reactions, predictor_headers, response_headers, filename):
        print "Preparing arrays"
        data, labels = self._prepareArrays(reactions, predictor_headers, response_headers)
        old_settings = np.seterr(divide='raise') # we don't want division by zero to pass

        # This is how metric learn determines bounds internally
        # but the lower bound can be zero this way (especially for low-dimensional data)
        # which causes divide by zero errors
        print "Calculating bounds"
        pair_dists = pairwise_distances(data)
        bounds = np.percentile(pair_dists, (5, 95))
        # the extra check ensures against divide-by-zero errors later
        if bounds[0] == 0:
            bounds[0] = min(pair_dists[np.nonzero(pair_dists)])
            print "Lowerbound was 0. Set to {}".format(bounds[0])
        
        print "Preparing {} constraints with bounds of ({}, {})".format(self.num_constraints, bounds[0], bounds[1])
        constraints = self.metric_object.prepare_constraints(labels, data.shape[0], self.num_constraints)
        print "Fitting"
        self.metric_object.fit(data, constraints, bounds=bounds)
        
        self.save(filename)
        np.seterr(**old_settings)
        
        print "Transforming training set"
        return self.metric_object.transform()
Developer: nihaoCC, Project: DRP, Lines: 26, Source: ITML.py
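
The bounds trick from Example 18 in isolation: take the 5th and 95th percentiles of all pairwise distances, then floor the lower bound at the smallest nonzero distance to guard against divide-by-zero (synthetic data for illustration).

import numpy as np
from sklearn.metrics import pairwise_distances

data = np.random.RandomState(0).rand(50, 4)
pair_dists = pairwise_distances(data)
bounds = np.percentile(pair_dists, (5, 95))
if bounds[0] == 0:
    bounds[0] = pair_dists[np.nonzero(pair_dists)].min()
print(bounds)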


Example 19: ds_clustering

def ds_clustering(clusters,support_vectors, f_values, new_element):
    '''
    Cluster the new element.
    Efficient out-of-sample extension of dominant-set clusters
    (Pavan et al., NIPS 2004).
    For all h in S: if sum(a(h,i)*x(h)) > f(x), then i is assigned to S.
    '''
    if clusters is None or support_vectors is None or new_element is None:
        return None
    sum_axs = []
    for i in np.arange(len(clusters)):
        S = clusters[i]
        S_old = S.copy()
        x = support_vectors[i]
        
        #print 'len S ', len(S), 'len x', len(x)
        
        from sklearn.metrics import euclidean_distances , pairwise_distances
        #euclidean_distances(X, Y=None, Y_norm_squared=None, squared=False):
        new_arr = [new_element]
        dis = pairwise_distances(new_arr,S, metric='sqeuclidean')
        sigma2 = np.median(dis)    
        a_hj = np.exp(-dis / sigma2)      

        #print dis, a_hj
        sum_ax = 0.        
        for h in np.arange(len(S_old)):            
            sum_ax = sum_ax + a_hj[0][h]*x[h]
        #print 'i =',i,' sum_ax', sum_ax, 'f_values ', f_values[i]   
        sum_axs.append(sum_ax)
        
    #print np.argmax(sum_axs), '  ', np.max(sum_axs)   
    if np.max(sum_axs) >= 0.5 * f_values[np.argmax(sum_axs)]:
        return np.argmax(sum_axs)
    return None
Developer: baothien, Project: tiensy, Lines: 35, Source: ds_outlier.py


Example 20: rankInCluster

    def rankInCluster(self,labels,centers_features,K,X,tweets=None):
        clusters = dict((clusId,{'all':[],'best':"",'first':"",'words':"","n":0,'sentiment':0}) for clusId in range(K))
        if not tweets:
            tweets = self.tweets
        # In each cluster, do the following :
        # 1) sort tweets by created time in descending order
        # 2) get the first tweet (in terms of time)
        # 3) find the tweet that is closest to the cluster centroid (best tweet)

        for i,label in enumerate(labels):
            clusters[label]['all'].append(tweets[i])
            clusters[label]['n'] += 1

        for label in labels:
            clusters[label]['all'] = sorted(clusters[label]['all'], key=lambda x:x.time, reverse=True)
            clusters[label]['first'] = clusters[label]['all'][-1].printTweet()

        # Find the best tweet and avg sentiment in each cluster
        for clusId in range(K):
            print("{} tweets in cluster {}".format(len(clusters[clusId]['all']), clusId))
            tweetIdxInClus = np.where(labels == clusId)
            clusters[clusId]['sentiment'] = np.mean(X[tweetIdxInClus,-1])
            if not clusters[clusId]["n"]:
                break
            #print tweetIdxInClus
            centerCoord = centers_features[clusId].reshape(1,-1)
            distToCtr = pairwise_distances(X[tweetIdxInClus], centerCoord)  # dimension: (n_tweets, 1)


            # Calculate tweet popularity/quality feature
            popularity = []
            for i,t in enumerate(tweets):
                if i in tweetIdxInClus[0]:
                    popularity.append([t.retweetCnt,t.favCnt,t.isRetweet,t.followers])
            popularity = np.array(popularity)  # n_tweet X 5
            coef = np.array([.5,.5,-.8,.2]) # hard-coded coefficient
            #print "popularity:{}".format(popularity.dot(coef).shape)
            norm_popularity = normalize(popularity).dot(coef).reshape(-1,1)
            #print norm_popularity
            #print norm_popularity.shape

            feat = np.add(distToCtr, norm_popularity)
            bestTweetId = np.argmax(feat)
            clusters[clusId]['best'] = tweets[tweetIdxInClus[0][bestTweetId]].printTweet()

        # Get the top words in each cluster
        sorted_centers_features = centers_features.argsort()[:, ::-1]
        for ctr in range(K):
            top3words = []
            found = 0
            for field in sorted_centers_features[ctr]: # Get the top 3 common words
                try:
                    top3words.append(self.tfidfDict[field].encode('utf-8', 'ignore'))
                    if found == 2:
                        break
                    found +=1
                except IndexError:
                    continue
            clusters[ctr]['words'] = "/".join(top3words)
        return clusters
开发者ID:acatwang,项目名称:tweetsCluster,代码行数:60,代码来源:app.py



Note: The sklearn.metrics.pairwise_distances examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets are drawn from open-source projects and copyright remains with the original authors; consult each project's license before distributing or reusing the code, and do not reproduce this compilation without permission.

