This article compiles typical usage examples of Python's sklearn.cluster.DBSCAN class. If you have been wondering what the DBSCAN class is for, how to use it, or what working examples look like, the curated class code examples below may help.
Listed below are 20 code examples of the DBSCAN class, sorted by popularity by default. You can upvote the examples you like or find useful; your ratings help the system recommend better Python code samples.
Example 1: current_datapoints_dbscan
def current_datapoints_dbscan(self):
    """
    Clusters outlier points (left after current_datapoints_threshold_filter and current_datapoints_outliers_filter) into slice-clusters using DBSCAN.
    Returns a dict of slice-clusters - the basis for event candidates. Uses the self.eps attribute to estimate cluster boundaries.
    """
    nets = self.current_datapoints.keys()
    ids = concatenate([self.current_datapoints[x]['ids'] for x in nets])
    coords = concatenate([self.current_datapoints[x]['array'] for x in nets])
    weights = concatenate([self.current_datapoints[x]['weights'] for x in nets])
    if len(ids) > 0:
        clustering = DBSCAN(eps=self.eps, min_samples=5)
        labels = clustering.fit_predict(coords)
        core_ids = ids[clustering.core_sample_indices_]
        # drop noise points (label -1) before grouping
        ids = ids[labels > -1]
        coords = coords[labels > -1]
        weights = weights[labels > -1]
        labels = labels[labels > -1]
        ret_tab = {}
        for i in range(len(labels)):
            point = {'id': ids[i], 'lng': coords[i, 0], 'lat': coords[i, 1],
                     'weight': weights[i], 'is_core': ids[i] in core_ids}
            ret_tab.setdefault(labels[i], []).append(point)
        return ret_tab
    else:
        return {}
Author: city-pulse | Project: mskpulse.backend | Lines: 26 | Source: detector.py
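The method above depends on class state (self.current_datapoints, self.eps), but the clustering core is easy to exercise on its own. A minimal standalone sketch with synthetic coordinates and an illustrative eps (none of these values come from the mskpulse project):

import numpy as np
from sklearn.cluster import DBSCAN

# Synthetic lng/lat points: two dense blobs plus a few scattered outliers.
rng = np.random.RandomState(0)
coords = np.vstack([
    rng.normal(loc=(37.61, 55.75), scale=0.01, size=(50, 2)),
    rng.normal(loc=(37.70, 55.80), scale=0.01, size=(50, 2)),
    rng.uniform(low=37.5, high=37.9, size=(5, 2)),
])

clustering = DBSCAN(eps=0.02, min_samples=5)  # eps chosen for this synthetic scale
labels = clustering.fit_predict(coords)
core_mask = np.zeros(len(coords), dtype=bool)
core_mask[clustering.core_sample_indices_] = True

# Group points by cluster label, dropping noise (-1), as the method above does.
clusters = {}
for label, point, is_core in zip(labels, coords, core_mask):
    if label == -1:
        continue
    clusters.setdefault(label, []).append(
        {'lng': point[0], 'lat': point[1], 'is_core': bool(is_core)})
print({k: len(v) for k, v in clusters.items()})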
Example 2: cluster_dbscan
def cluster_dbscan(matrix, distance_measure="sts", eps=1):
    """Clusters the distance matrix for a given epsilon value, if the distance
    measure is sts. Other available distance measures are: ['cityblock',
    'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra',
    'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
    'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
    'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'].

    Parameters
    ----------
    matrix: np.matrix
        The input matrix. If the distance measure is sts, this should be the
        sts distance matrix. For any other distance, this should be the
        time-series matrix of size ngenes x nsamples.
    distance_measure: str
        The distance measure, default is sts, the short time-series distance.
        Any distance measure available in scikit-learn may be used here.
        Note: multiple time series are NOT supported for distances other than
        "sts".
    eps: float
        The epsilon (neighbourhood radius) passed to DBSCAN.

    Returns
    -------
    cluster_labels: list of int
        A list of size ngenes that defines cluster membership.
    """
    if distance_measure == "sts":
        dbs = DBSCAN(eps=eps, metric='precomputed', min_samples=2)
    else:
        dbs = DBSCAN(eps=eps, metric=distance_measure, min_samples=2)
    cluster_labels = dbs.fit_predict(matrix)
    return cluster_labels
Author: beiko-lab | Project: ananke | Lines: 31 | Source: _cluster.py
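A hedged usage sketch: the real STS matrix computation lives elsewhere in ananke, so a plain Euclidean matrix stands in for it here just to show the precomputed-metric call path:

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN

# Hypothetical time-series matrix: 6 "genes" x 10 samples.
rng = np.random.RandomState(42)
ts = rng.rand(6, 10)

# Stand-in for an STS distance matrix -- here simply Euclidean distances.
dist = pairwise_distances(ts, metric="euclidean")

labels = DBSCAN(eps=1.0, metric="precomputed", min_samples=2).fit_predict(dist)
print(labels)  # -1 marks noise points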
Example 3: _fit_dbscan
def _fit_dbscan(self, x):
    # clustering
    for r in xrange(self.repeats):
        # fit and evaluate model
        model = DBSCAN(eps=1.0, min_samples=100)
        model.fit_predict(x)
        k = len(set(model.labels_)) - (1 if -1 in model.labels_ else 0)
        # info
        if self.debug is True:
            print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1),
        self._labels[r] = model.labels_
        self._parameters[r] = model.core_sample_indices_
        # build equivalent gmm
        model_gmm = GMM(n_components=k, covariance_type="full")
        # per-cluster means serve as the component centres
        model_gmm.means_ = sp.array(
            [x[model.labels_ == i].mean(axis=0) for i in xrange(k)])
        model_gmm.covars_ = sp.ones(
            (k, self.input_dim)) * self.sigma_factor
        model_gmm.weights_ = sp.array(
            [(self._labels[r] == i).sum() for i in xrange(k)])
        # evaluate goodness of fit
        self._ll[r] = model_gmm.score(x).sum()
        if self.gof_type == 'aic':
            self._gof[r] = model_gmm.aic(x)
        if self.gof_type == 'bic':
            self._gof[r] = model_gmm.bic(x)
        # debug info
        if self.debug is True:
            print self._gof[r]
Author: pmeier82 | Project: BOTMpy | Lines: 32 | Source: cluster.py
Example 4: cluster_mappings
def cluster_mappings(vector_inpath, do_pca=False, target_dim=100, indices_inpath=None, epsilon=2.5, min_s=20):
    # TODO: Clustering parameters
    # TODO: Metric - cosine similarity or euclidean distance
    print alt("Load mappings...")
    indices, model = load_mappings_from_model(vector_inpath)
    X = numpy.array([model[key] for key in indices])
    # del model
    if do_pca:
        print alt("Truncate vectors with PCA to %i dimensions..." % (target_dim))
        pca = PCA(n_components=target_dim)
        pca.fit(X)
        X = pca.transform(X)
    print alt("Cluster points...")
    # k = 2 * X[0].shape[0] - 1
    # min_pts = k + 1
    # dbscan = DBSCAN(eps=0.1, min_samples=20, metric='cosine', algorithm='brute')
    dbscan = DBSCAN(eps=epsilon, min_samples=min_s)
    dbscan.fit(X)
    labels = dbscan.labels_
    print get_cluster_size(labels)
    print alt("Finished clustering!")
    sscore = silhouette_score(X, labels)
    print("Silhouette Coefficient: %0.3f" % (sscore))
    if indices_inpath:
        resolve_indices(indices, labels, indices_inpath, model)
Author: dboth | Project: thesis_ba | Lines: 25 | Source: cluster_mappings.py
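One caveat worth illustrating: silhouette_score treats DBSCAN's noise label (-1) as a cluster of its own, which can distort the coefficient. A small sketch on synthetic blobs (all parameters illustrative) that scores only the clustered points:

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

X, _ = make_blobs(n_samples=200, centers=3, random_state=1)
labels = DBSCAN(eps=1.0, min_samples=5).fit_predict(X)

mask = labels != -1  # drop noise points before scoring
if len(set(labels[mask])) > 1:
    print("Silhouette (clustered points only): %0.3f"
          % silhouette_score(X[mask], labels[mask]))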
Example 5: dbscan_outliers
def dbscan_outliers(data, genes, eps, min_samples, max_samples=1, as_json=True):
    db = DBSCAN(eps=eps, min_samples=min_samples)
    # sd_scaler = StandardScaler()
    res = dr.get_dataset_ensembl_info()
    outliers_id = []
    for g in genes:
        # scaled = sd_scaler.fit(data.loc[g, :])
        fit = db.fit(np.reshape(data.loc[g, :], (196, 1)))
        candidates = itemfreq(fit.labels_)
        try:
            class_zero = candidates[0][1]
            class_one = candidates[1][1]
            support = min(class_one, class_zero)
            if min_samples < support <= max_samples:
                info = [gene for gene in res if gene.ensemblgeneid == g][0]
                formatted_info = {"id": g, "name": info.genename, "type": info.genetype,
                                  "samples": str(support), "distance": "NA"}
                jinfo = json.dumps(formatted_info)
                jinfo += ","
                outliers_id.append(g)
                print("outlier found :" + g)
                if as_json:
                    yield (jinfo)
                else:
                    yield (formatted_info)
        except IndexError:  # fewer than two label classes, or unknown gene id
            pass
Author: armell | Project: RNASEqTool | Lines: 31 | Source: outliers.py
Example 6: cluster
def cluster():
    eps_set = 0.5 * np.arange(1, 7)
    npt_set = np.arange(1, 6)
    scores = []
    global res
    res = []
    for eps in eps_set:
        for npt in npt_set:
            est = DBSCAN(eps=eps, min_samples=npt)
            est.fit(x)
            ari = metrics.adjusted_rand_score(y, est.labels_)
            scores.append(ari)
            n_noise = len([l for l in est.labels_ if l == -1])
            res.append((ari, np.max(est.labels_) + 1, n_noise))
            print ari
    max_score = np.max(scores)
    max_idx = scores.index(max_score)
    max_eps = eps_set[max_idx // len(npt_set)]  # row: eps index
    max_npt = npt_set[max_idx % len(npt_set)]   # column: min_samples index
    print max_score, max_eps, max_npt
    scores = np.array(scores).reshape(len(eps_set), len(npt_set))
    pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
    pl.colorbar()
    pl.xticks(np.arange(len(npt_set)), npt_set)
    pl.yticks(np.arange(len(eps_set)), eps_set)
    pl.ylabel('eps')
    pl.xlabel('min_samples')
    pl.show()
Author: harrylclc | Project: ist557 | Lines: 28 | Source: dbscan.py
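For reference, a Python 3 rendering of the same grid search, with make_blobs standing in for the module-level x and y used above, and divmod recovering the best (eps, min_samples) pair:

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

x, y = make_blobs(n_samples=300, centers=3, random_state=0)

eps_set = 0.5 * np.arange(1, 7)
npt_set = np.arange(1, 6)
scores = []
for eps in eps_set:
    for npt in npt_set:
        labels = DBSCAN(eps=eps, min_samples=int(npt)).fit_predict(x)
        scores.append(adjusted_rand_score(y, labels))

best = int(np.argmax(scores))
eps_idx, npt_idx = divmod(best, len(npt_set))  # row-major layout of the grid
print(max(scores), eps_set[eps_idx], npt_set[npt_idx])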
Example 7: clusterMalwareNames
def clusterMalwareNames(malwareNames):
    # strictly lexical clustering over malware-names
    wordCount = {}
    # create a distance matrix
    matrix = np.zeros((len(malwareNames), len(malwareNames)))
    for i in range(len(malwareNames)):
        for j in range(len(malwareNames)):
            if matrix[i, j] == 0.0:
                matrix[i, j] = computeSimilarity(malwareNames[i], malwareNames[j])
                matrix[j, i] = matrix[i, j]
    # Scikit-Learn's DBSCAN implementation to cluster the malware-names
    clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed")
    clust.fit(matrix)
    preds = clust.labels_
    clabels = np.unique(preds)
    # create Word-Count Map
    for i in range(clabels.shape[0]):
        if clabels[i] < 0:
            continue
        cmem_ids = np.where(preds == clabels[i])[0]
        cmembers = []
        for cmem_id in cmem_ids:
            cmembers.append(malwareNames[cmem_id])
        wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids)
    return wordCount
Author: M0nk2y | Project: malware-crawler | Lines: 31 | Source: vtTool.py
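computeSimilarity is project-specific, but note that DBSCAN's "precomputed" metric expects distances, not similarities. A standalone sketch using difflib's SequenceMatcher as a stand-in string distance (the names and eps are illustrative):

import numpy as np
from difflib import SequenceMatcher
from sklearn.cluster import DBSCAN

names = ["Trojan.Win32.Agent", "Trojan.Win32.Agent.b", "Worm.Win32.Mydoom",
         "Worm.Win32.Mydoom.a", "Adware.Generic"]

# Stand-in for computeSimilarity: 1 - ratio turns a similarity into a distance.
n = len(names)
matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        d = 1.0 - SequenceMatcher(None, names[i], names[j]).ratio()
        matrix[i, j] = matrix[j, i] = d

labels = DBSCAN(eps=0.2, min_samples=2, metric="precomputed").fit_predict(matrix)
print(dict(zip(names, labels)))  # the two Trojan and two Mydoom names pair up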
Example 8: find_tracks
def find_tracks(data, eps=20, min_samples=20):
    """Applies the DBSCAN algorithm from scikit-learn to find tracks in the data.

    Parameters
    ----------
    data : array-like
        An array of (x, y, z, hits) data points
    eps : number, optional
        The minimum distance between adjacent points in a cluster
    min_samples : number, optional
        The min number of points in a cluster

    Returns
    -------
    tracks : list
        A list of tracks. Each track is an ndarray of points.
    """
    xyz = data[:, 0:3]
    dbs = DBSCAN(eps=eps, min_samples=min_samples)
    dbs.fit(xyz)
    tracks = []
    for track in (np.where(dbs.labels_ == n)[0] for n in np.unique(dbs.labels_) if n != -1):
        tracks.append(data[track])
    return tracks
Author: tarvos14 | Project: pytpc | Lines: 27 | Source: tracking.py
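Assuming find_tracks as defined above is in scope, a synthetic usage example: one dense linear track plus uniform noise (all values illustrative, not from pytpc):

import numpy as np

rng = np.random.RandomState(0)
# A dense "track": 80 points along a line in (x, y, z), with a hits column.
track = np.column_stack([np.linspace(0, 100, 80),
                         np.linspace(0, 50, 80),
                         np.linspace(0, 200, 80),
                         rng.poisson(10, 80)])
# Sparse background noise scattered over the same volume.
noise = np.column_stack([rng.uniform(0, 100, 30), rng.uniform(0, 50, 30),
                         rng.uniform(0, 200, 30), rng.poisson(10, 30)])
data = np.vstack([track, noise])

tracks = find_tracks(data, eps=10, min_samples=10)
print(len(tracks), [t.shape for t in tracks])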
Example 9: classify_core
def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):
    BEGIN_TIME_FRAME = begin_time * self.griddy.TIME_GRID_SPACING
    END_TIME_FRAME = end_time * self.griddy.TIME_GRID_SPACING
    data = data_for_trial_type[:, BEGIN_TIME_FRAME:END_TIME_FRAME, self.griddy.VEL_X]
    labels = None
    if clusterType == 'kmeans':
        kmeans = KMeans(n_clusters=N_CLUSTERS)
        kmeans.fit(data)
        labels = kmeans.labels_
    elif clusterType == 'affinity_propagation':
        ap = AffinityPropagation(damping=0.75)
        ap.fit(data)
        labels = ap.labels_
        N_CLUSTERS = np.max(labels) + 1
    elif clusterType == 'DBSCAN':
        dbscan = DBSCAN()
        dbscan.fit(data)
        labels = dbscan.labels_
        N_CLUSTERS = np.max(labels) + 1
        print 'N_CLUSTERS=' + str(N_CLUSTERS)
    elif clusterType == 'AgglomerativeClustering':
        ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
        ac.fit(data)
        labels = ac.labels_
    else:
        print 'ERROR: clusterType: ' + clusterType + ' is not recognized'
    return (labels, N_CLUSTERS)
Author: SashaRayshubskiy | Project: osmotropotaxis_analysis_python | Lines: 31 | Source: fly_trajectory_classifier.py
Example 10: cluster_tweets
def cluster_tweets(tweets):
    #TODO get TFIDF vector
    #do clustering
    ner_tags = [get_ner_tags(tweet).tolist() for tweet in tweets['tweet']]
    vectorizer = TfidfVectorizer(preprocessor=_dummy_preprocess, tokenizer=lambda x: x,
                                 binary=True,
                                 min_df=0, use_idf=True, smooth_idf=True)
    tfidf = vectorizer.fit_transform(ner_tags)
    #ner_tags = [get_ner_tags(tweet) for tweet in tweets['tweet']]
    print "clustering started"
    t0 = time()
    #cluster = AgglomerativeClustering(n_clusters=3, affinity="cosine")
    #cluster = MiniBatchKMeans(n_clusters=10, max_iter=100, batch_size=100)
    #metric=sklearn.metrics.pairwise.cosine_distances
    cluster = DBSCAN(min_samples=2, eps=0.5)
    clustered = cluster.fit(tfidf.todense())
    #clustered = cluster.fit(ner_tags)
    labels = clustered.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print "clustering finished in %.3f seconds" % (time() - t0)
    print "%d clusters detected" % n_clusters_
    tweets['cluster'] = labels
    tweets['ner'] = ner_tags
    return tweets
Author: Kaushalya | Project: tweet_summary | Lines: 28 | Source: summarizer.py
Example 11: cluster_with_dbscan
def cluster_with_dbscan(vectors, epsilon=0.5, min_samples=5, distances=None, metric="euclidean"):
    # precomputing our distances will be faster as we can use multiple cores
    if distances is None:
        distances = pairwise_distances(vectors, n_jobs=-1, metric=metric)
    dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric="precomputed")
    return dbscan.fit_predict(distances)
Author: CylanceSPEAR | Project: NMAP-Cluster | Lines: 7 | Source: clustering.py
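The distances parameter exists so one matrix can be reused across several epsilon values. A usage sketch, assuming cluster_with_dbscan above is in scope (data and eps values are illustrative):

import numpy as np
from sklearn.metrics import pairwise_distances

rng = np.random.RandomState(0)
vectors = rng.rand(100, 16)

# Compute the distance matrix once, then sweep epsilon without recomputing.
distances = pairwise_distances(vectors, n_jobs=-1, metric="cosine")
for eps in (0.1, 0.2, 0.3):
    labels = cluster_with_dbscan(vectors, epsilon=eps,
                                 distances=distances, metric="cosine")
    print(eps, len(set(labels)) - (1 if -1 in labels else 0))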
Example 12: search_charges
def search_charges(self, data, z=0, threshold=30):
    A = deriv(data, z)
    print 'Searching charges...'
    time0 = time.time()
    det = A[3]*A[5] - A[4]**2
    dx = -(A[1]*A[5] - A[2]*A[4])/det
    dy = -(A[2]*A[3] - A[1]*A[4])/det
    datamax = A[0] + A[1]*dx + A[2]*dy + A[3]*dx**2/2 + A[4]*dx*dy + A[5]*dy**2/2
    t = np.where((np.abs(dx) < 1)*(np.abs(dy) < 1)*(np.abs(datamax) > threshold)*(det > 0))
    x = np.array([t[1] + dx[t], t[0] + dy[t]]).T
    db = DBSCAN(min_samples=1, eps=1)
    db.fit_predict(x)
    n_charges = np.max(db.labels_) + 1
    qi = np.zeros(n_charges)
    xi = np.zeros((3, n_charges))
    for i in range(0, n_charges):
        xi[0:2, i] = np.mean(x[db.labels_ == i, :], axis=0)
        qi[i] = np.mean(datamax[t][db.labels_ == i])
    self.set_charges(qi, xi)
    print 'Done! Elapsed time: ' + str(time.time() - time0)
    return self
Author: temik42 | Project: lib | Lines: 31 | Source: pyfield.py
Example 13: on_squaremsg_received
def on_squaremsg_received(self, msg):
    detected_squares = []
    for square_msg in msg.squares:
        detected_squares.append(TrackedSquare.from_msg(square_msg))
    self._prev_squares.append(detected_squares)
    all_squares = list(itertools.chain.from_iterable(self._prev_squares))
    square_centers = [list(s.center) + [s.hue] for s in all_squares]
    data = np.array(square_centers)
    ms = DBSCAN(eps=64, min_samples=3)
    ms.fit(data)
    labels = ms.labels_
    ts_msg = TrackedSquares()
    for i, s in enumerate(all_squares):
        label = np.int0(labels[i])
        if label < 0:
            continue
        s.tracking_colour = TrackedSquare.TRACKING_COLOURS[label % len(TrackedSquare.TRACKING_COLOURS)]
        s.tracking_detected = True
        ts_msg.squares.append(s.to_msg())
    self._squares_pub.publish(ts_msg)
Author: Knifa | Project: Glasgow-Baxter | Lines: 27 | Source: understanding.py
Example 14: plot_dbscan
def plot_dbscan():
    X, y = make_blobs(random_state=0, n_samples=12)
    dbscan = DBSCAN()
    clusters = dbscan.fit_predict(X)
    fig, axes = plt.subplots(3, 4, figsize=(11, 8), subplot_kw={'xticks': (), 'yticks': ()})
    # Plot clusters as red, green and blue, and outliers (-1) as white
    colors = ['r', 'g', 'b']
    markers = ['o', '^', 'v']
    # iterate over settings of min_samples and eps
    for i, min_samples in enumerate([2, 3, 5]):
        for j, eps in enumerate([1, 1.5, 2, 3]):
            # instantiate DBSCAN with a particular setting
            dbscan = DBSCAN(min_samples=min_samples, eps=eps)
            # get cluster assignments
            clusters = dbscan.fit_predict(X)
            print("min_samples: %d eps: %f cluster: %s" % (min_samples, eps, clusters))
            if np.any(clusters == -1):
                c = ['w'] + colors
                m = ['o'] + markers
            else:
                c = colors
                m = markers
            discrete_scatter(X[:, 0], X[:, 1], clusters, ax=axes[i, j], c=c, s=8, markers=m)
            inds = dbscan.core_sample_indices_
            # visualize core samples and clusters
            if len(inds):
                discrete_scatter(X[inds, 0], X[inds, 1], clusters[inds],
                                 ax=axes[i, j], s=15, c=colors,
                                 markers=markers)
            axes[i, j].set_title("min_samples: %d eps: %.1f" % (min_samples, eps))
    fig.tight_layout()
Author: ABcDexter | Project: introduction_to_ml_with_python | Lines: 35 | Source: plot_dbscan.py
Example 15: cluster_DBSCAN
def cluster_DBSCAN(args):
    """
    Clustering with DBSCAN: groups densely connected instances and marks low-density instances as noise.
    """
    # load data
    g_it = node_link_data.node_link_data_to_eden(input=args.input_file, input_type="file")
    vec = graph.Vectorizer(r=args.radius, d=args.distance, nbits=args.nbits)
    logger.info('Vectorizer: %s' % vec)
    X = vec.transform(g_it, n_jobs=args.n_jobs)
    logger.info('Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1], X.getnnz() / X.shape[0]))
    # project to a lower dimensional space to use clustering algorithms
    transformer = TruncatedSVD(n_components=args.n_components)
    X_dense = transformer.fit_transform(X)
    # log statistics on data
    logger.info('Dimensionality reduction Instances: %d Features: %d with an avg of %d features per instance' % (X_dense.shape[0], X_dense.shape[1], X.getnnz() / X.shape[0]))
    # clustering
    clustering_algo = DBSCAN(eps=args.eps)
    y = clustering_algo.fit_predict(X_dense)
    msg = 'Predictions statistics: '
    msg += util.report_base_statistics(y)
    logger.info(msg)
    # save model for vectorizer
    out_file_name = "vectorizer"
    eden_io.dump(vec, output_dir_path=args.output_dir_path, out_file_name=out_file_name)
    logger.info("Written file: %s/%s", args.output_dir_path, out_file_name)
    # save result
    out_file_name = "labels"
    eden_io.store_matrix(matrix=y, output_dir_path=args.output_dir_path, out_file_name=out_file_name, output_format="text")
    logger.info("Written file: %s/%s", args.output_dir_path, out_file_name)
Author: nickgentoo | Project: pyEDeN | Lines: 35 | Source: cluster_DBSCAN.py
Example 16: train_dbscan
def train_dbscan():
    print "starting dbscan clustering..."
    model = DBSCAN(eps=dbs_eps, min_samples=dbs_min_samples, metric=dbs_metric, algorithm='auto')
    model.fit(X)
    core_points = model.core_sample_indices_
    if output_core_points:
        print "core points data index"
        print core_points
    print "num of core points %d" % (len(core_points))
    print "all points cluster index"
    cluster_index = model.labels_
    if output_cluster_members:
        cluster_members = {}
        for i, c in enumerate(cluster_index):
            index_list = cluster_members.get(c, list())
            index_list.append(i)
            cluster_members[c] = index_list
        for cl, indx_list in cluster_members.iteritems():
            if cl >= 0:  # noise points carry the label -1
                print "cluster index %d size %d" % (cl, len(indx_list))
            else:
                print "noise points size %d" % (len(indx_list))
            print indx_list
    print "num of clusters %d" % (cluster_index.max() + 1)
Author: ahnqirage | Project: avenir | Lines: 28 | Source: cluster.py
Example 17: DBScan_Flux
def DBScan_Flux(phots, ycenters, xcenters, dbsClean=0, useTheForce=False):
    """Identify "clean" photometry frames with DBSCAN.

    Args:
        phots: Array of photometric flux values.
        ycenters: Array of aperture y-centres.
        xcenters: Array of aperture x-centres.
        dbsClean: DBSCAN label treated as the clean cluster (default 0).
    Returns:
        Boolean array, True where a frame belongs to the clean cluster.
    """
    dbsPhots = DBSCAN()  # n_jobs=-1
    stdScaler = StandardScaler()
    phots = np.copy(phots.ravel())
    phots[~np.isfinite(phots)] = np.median(phots[np.isfinite(phots)])
    featuresNow = np.transpose([stdScaler.fit_transform(ycenters[:, None]).ravel(),
                                stdScaler.fit_transform(xcenters[:, None]).ravel(),
                                stdScaler.fit_transform(phots[:, None]).ravel()])
    # print(featuresNow.shape)
    dbsPhotsPred = dbsPhots.fit_predict(featuresNow)
    return dbsPhotsPred == dbsClean
Author: exowanderer | Project: ExoplanetTSO | Lines: 29 | Source: bak_auxiliary.py
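A usage sketch, assuming DBScan_Flux above is in scope; the flux values and injected outliers are synthetic, not ExoplanetTSO data:

import numpy as np

rng = np.random.RandomState(0)
nframes = 500
phots = rng.normal(1.0, 0.01, nframes)
phots[::50] = 1.5                       # inject a few photometric outliers
ycenters = rng.normal(15.0, 0.1, nframes)
xcenters = rng.normal(15.0, 0.1, nframes)

keep = DBScan_Flux(phots, ycenters, xcenters)  # True for frames in the clean cluster
print(keep.sum(), "of", nframes, "frames kept")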
Example 18: dbscan
def dbscan(similarity, concepts=2, euclid=False):
    if euclid:
        model = DBSCAN(eps=0.6, min_samples=10, algorithm='auto', leaf_size=30)
        return model.fit_predict(similarity)
    else:
        model = DBSCAN(eps=0.6, min_samples=10, metric='precomputed', algorithm='auto', leaf_size=30)
        return model.fit_predict(1 - similarity)
Author: thran | Project: experiments2.0 | Lines: 7 | Source: clusterings.py
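The 1 - similarity conversion in the else branch is the key detail: the precomputed metric expects distances. A self-contained sketch of the same conversion starting from cosine similarities (data and eps are illustrative):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN

rng = np.random.RandomState(0)
X = rng.rand(20, 5)

similarity = cosine_similarity(X)
distance = 1.0 - similarity        # similarity in [0, 1] -> distance in [0, 1]
np.fill_diagonal(distance, 0.0)    # guard against floating-point drift
labels = DBSCAN(eps=0.3, min_samples=3, metric="precomputed").fit_predict(distance)
print(labels)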
Example 19: dbscan_outliers
def dbscan_outliers(df):
    """
    Find outliers (noise points) using DBSCAN.

    Parameters
    ----------
    df: A pandas.DataFrame

    Returns
    -------
    A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
    """
    scaler = StandardScaler()
    scaler.fit(df)
    scaled = scaler.transform(df)
    dbs = DBSCAN()
    db = dbs.fit(scaled)
    outliers = db.labels_
    # noise points carry the label -1
    df_o = df[outliers == -1]
    return db, df_o
Author: nwngeek212 | Project: MachineLearningConcepts | Lines: 25 | Source: helper.py
Example 20: get_clusters
def get_clusters(tracks):
    neighbors = g.m.neighborsSpin.value()
    dist = g.m.neighborDistanceSpin.value()
    data = np.array([[tr['mean_x'], tr['mean_y']] for tr in tracks])
    scanner = DBSCAN(eps=dist, min_samples=neighbors)
    ids = scanner.fit_predict(data)
    return ids
Author: BrettJSettle | Project: MotilityTracking | Lines: 7 | Source: MotilityTracking.py
Note: The sklearn.cluster.DBSCAN class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The code snippets are drawn from open-source projects contributed by various developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.