This page collects typical usage examples of the Python function sklearn.metrics.silhouette_samples. If you have been wondering what silhouette_samples does, how to call it, or what real code that uses it looks like, the hand-picked examples below should help.
Twenty code examples of silhouette_samples are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
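Before diving into the examples, here is a minimal sketch of the basic call, using synthetic data from make_blobs (the dataset and parameters are illustrative, not taken from any example below). silhouette_samples(X, labels) returns one coefficient per sample in [-1, 1]: values near 1 mean a sample sits well inside its own cluster, values near 0 mean it lies between clusters, and negative values suggest it may be assigned to the wrong cluster.

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples

# Toy data: 300 points drawn from 4 Gaussian blobs
X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(X)

scores = silhouette_samples(X, labels)  # one score per sample, in [-1, 1]
print(scores.shape, scores.mean())      # the mean equals silhouette_score(X, labels)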
Example 1: crank_feats
def crank_feats(fargs):
    rss, ccs, lv, installed_in, dfile, nfeatures = fargs
    noaa_init(installed_in)
    wat = pd.read_csv(dfile).set_index('station')
    es = ['e' + str(x) for x in range(0, nfeatures)]
    #ccs = [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40]
    #rss = [0, 1]
    prefix = 'eigen' + str(nfeatures)
    #let's do some clustering with the six eigenvectors and see how they hold together
    flatnew, nmeans, nstds = flatten(wat[es])  #strictly speaking not necessary since
    flatold, omeans, ostds = flatten(wat[lv])
    #note: this method flattens wat internally
    produce_kmeans_climates(wat, es, ccs, rss, prefix)
    for rs in rss:
        kf = pd.read_csv(noaafile('climates/' + prefix + '_rs_' + str(rs) + '.csv'))
        for cc in ccs:
            #this silhouettes thing gobbles memory, I'm guessing because each worker
            #creates an entire new metric matrix.
            kf['sil_eigen_' + str(cc)] = silhouette_samples(flatnew, kf['vtx' + str(cc)].values)
            #pull out silhouette scores on the old metric too, just for fun...
            kf['sil_old_' + str(cc)] = silhouette_samples(flatold, kf['vtx' + str(cc)].values)
        kf.to_csv(noaafile('climates/' + prefix + '_sil_rs_' + str(rs) + '.csv'), index=False)
Author: lpriccio, Project: noaa, Lines: 27, Source: nfeats_20160407.py
Example 2: bestRep
import numpy as np
from sklearn import metrics

def bestRep(dat, labels, outName):
    bestExample = []
    silSamp = metrics.silhouette_samples(dat, labels)
    for num in np.unique(labels):
        clusterMask = labels == num
        # keep the member with the highest silhouette value in this cluster
        bestExample.append(outName[clusterMask][np.argmax(silSamp[clusterMask])])
    return bestExample
Author: dstuck, Project: CompChemClustering, Lines: 7, Source: chemClustering.py
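A hypothetical call to bestRep on synthetic data (the names array and blob parameters are made up for illustration): each returned entry is the name of the member with the highest silhouette value in its cluster, i.e. the most central, best-fitting representative.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

dat, _ = make_blobs(n_samples=60, centers=3, random_state=0)
labels = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(dat)
names = np.array(['sample_%d' % i for i in range(len(dat))])
print(bestRep(dat, labels, names))  # one representative name per cluster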
Example 3: test_silhouette_samples
def test_silhouette_samples(self):
    result = self.df.metrics.silhouette_samples()
    expected = metrics.silhouette_samples(self.data, self.pred)

    self.assertTrue(isinstance(result, pdml.ModelSeries))
    self.assert_index_equal(result.index, self.df.index)
    self.assert_numpy_array_almost_equal(result.values, expected)
Author: Sandy4321, Project: pandas-ml, Lines: 7, Source: test_metrics.py
Example 4: get_silhouette
def get_silhouette(df):
    df = df[(df.AB != ".")].copy()
    df.loc[:, 'AB'] = pd.to_numeric(df.loc[:, 'AB'])
    df.loc[:, 'CN'] = pd.to_numeric(df.loc[:, 'CN'])
    tp = df.iloc[0, :].loc['svtype']
    [mn_CN, mn_AB] = df.loc[:, ['CN', 'AB']].mean(skipna=True)
    [sd_CN, sd_AB] = df.loc[:, ['CN', 'AB']].std(skipna=True)
    if df.loc[:, 'GT'].unique().size == 1:
        df.loc[:, 'sil_gt_avg'] = 1
        df.loc[:, 'sil_gt'] = 1
        df = df[['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
        return df
    #standardize the 2 dims
    if sd_AB > 0.01:
        df.loc[:, 'AB1'] = (df.loc[:, 'AB'] - mn_AB) / sd_AB
    else:
        df.loc[:, 'AB1'] = df.loc[:, 'AB']
    if tp in ['DEL', 'DUP', 'MEI'] or sd_CN > 0.01:
        df.loc[:, 'CN1'] = (df.loc[:, 'CN'] - mn_CN) / sd_CN
    else:
        df.loc[:, 'CN1'] = df.loc[:, 'CN']
    gt_code = {'0/0': 1, '0/1': 2, '1/1': 3}
    df.loc[:, 'gtn'] = df.loc[:, 'GT'].map(gt_code)
    dist_2d_sq = spatial.distance.squareform(spatial.distance.pdist(df[['AB1', 'CN1']], metric='cityblock'))
    df.loc[:, 'sil_gt_avg'] = metrics.silhouette_score(dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df.loc[:, 'sil_gt'] = metrics.silhouette_samples(dist_2d_sq, df.loc[:, 'gtn'].values, metric='precomputed')
    df = df[['var_id', 'sample', 'svtype', 'AF', 'GT', 'CN', 'AB', 'sil_gt_avg', 'sil_gt']]
    return df
Author: abelhj, Project: svtools, Lines: 35, Source: gt_silhouette.py
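Example 4 passes a square city-block distance matrix with metric='precomputed' rather than raw features. A condensed sketch of that pattern on toy data (array shapes, labels, and values are illustrative only):

import numpy as np
from scipy.spatial import distance
from sklearn.metrics import silhouette_samples

X = np.random.RandomState(0).rand(10, 2)   # 10 toy points in 2-D
labels = np.array([0] * 5 + [1] * 5)       # two made-up clusters
D = distance.squareform(distance.pdist(X, metric='cityblock'))
per_sample = silhouette_samples(D, labels, metric='precomputed')

With metric='precomputed', the first argument must be an n-by-n pairwise distance matrix (distances, not similarities); examples 5, 7, and 17 below use the function the same way.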
Example 5: silhouette_original_clusterings
def silhouette_original_clusterings(dataset='CB1', neuropil='Antennal_lobe', clusterer_or_k=60):
    """Returns a pandas dataframe with the silhouette index of each cluster member.
    The dataframe has columns (cluster_id, member_id, silhouette).
    """
    # Read the expression matrix
    print('Reading expression matrix')
    Xdf = ExpressionDataset.dataset(dset=dataset, neuropil=neuropil).Xdf(index_type='string')
    # Generate a flat map cluster_id -> members
    print('Finding cluster assignments')
    clusters_df, _ = get_original_clustering(dataset=dataset, neuropil=neuropil,
                                             clusterer_or_k=clusterer_or_k)
    dfs = []
    for cluster_id, members in zip(clusters_df.cluster_id,
                                   clusters_df.original_voxels_in_cluster):
        dfs.append(pd.DataFrame({'cluster_id': cluster_id, 'member_id': members}))
    members_df = pd.concat(dfs).set_index('member_id').loc[Xdf.index]
    # Compute the distance matrix - this must be parameterised
    print('Computing distance')
    import mkl
    mkl.set_num_threads(6)
    D = dicedist_metric(Xdf)
    # Compute silhouette
    # Here we could go for the faster implementation in third_party, if needed
    print('Computing silhouette index')
    members_df['silhouette'] = silhouette_samples(D.values,
                                                  members_df.cluster_id.values,
                                                  metric='precomputed')
    return (members_df.
            reset_index().
            rename(columns=lambda col: {'index': 'member_id'}.get(col, col))
            [['cluster_id', 'member_id', 'silhouette']])
Author: strawlab, Project: braincode, Lines: 35, Source: clusters_quality.py
Example 6: cluster_driver
def cluster_driver(a_driver):
    # print a_driver['DStats']
    # print "#############################DStats Above##################################################"
    X = StandardScaler().fit_transform(a_driver['DStats'])
    # print X
    # print "DStats are.....::", a_driver['DStats']
    # print "X is...........::", X
    # print "############################Scaled X Above###################################################"
    db = DBSCAN(eps=0.6, min_samples=5).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print("###############################################################################")
    # print('Estimated number of clusters: %d' % n_clusters_)
    # print 'Count of Predicts::', len(X)
    # print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
    # print "##############################DBSCAN X Below#################################################"
    # Rescale the per-sample silhouette coefficients from [-1, 1] to [0, 1].
    return (metrics.silhouette_samples(X, labels) + 1) / 2
Author: RobbieShan, Project: MindOnData, Lines: 27, Source: eda+3.0.py
Example 7: silhouette_analysis
def silhouette_analysis(clustering, labels=None):
    distance_df = clustering['distance_df']
    if labels is None:
        labels = clustering['labels']
    sample_scores = silhouette_samples(distance_df, metric='precomputed', labels=labels)
    score = np.mean(sample_scores)
    return sample_scores, score
Author: IanEisenberg, Project: Self_Regulation_Ontology, Lines: 7, Source: utils.py
Example 8: cluster
def cluster(algorithm, data, topics, make_silhouette=False):
    print(str(algorithm))
    clusters = algorithm.fit_predict(data)
    labels = algorithm.labels_
    print('Homogeneity: %0.3f' % metrics.homogeneity_score(topics, labels))
    print('Completeness: %0.3f' % metrics.completeness_score(topics, labels))
    print('V-measure: %0.3f' % metrics.v_measure_score(topics, labels))
    print('Adjusted Rand index: %0.3f' % metrics.adjusted_rand_score(topics, labels))
    print('Silhouette test: %0.3f' % metrics.silhouette_score(data, labels))
    print(' ***************** ')

    silhouettes = metrics.silhouette_samples(data, labels)
    num_clusters = len(set(clusters))
    print('num clusters: %d' % num_clusters)
    print('num fitted: %d' % len(clusters))

    # Make a silhouette plot if the flag is set
    if make_silhouette:
        order = numpy.lexsort((-silhouettes, clusters))
        # indices of the samples belonging to cluster k, in display order
        indices = [numpy.flatnonzero(clusters[order] == k) for k in range(num_clusters)]
        ytick = [(numpy.max(ind) + numpy.min(ind)) / 2 for ind in indices]
        ytickLabels = ["%d" % x for x in range(num_clusters)]
        cmap = cm.jet(numpy.linspace(0, 1, num_clusters)).tolist()
        clr = [cmap[i] for i in clusters[order]]

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.barh(range(data.shape[0]), silhouettes[order], height=1.0,
                edgecolor='none', color=clr)
        ax.set_ylim(ax.get_ylim()[::-1])
        plt.yticks(ytick, ytickLabels)
        plt.xlabel('Silhouette Value')
        plt.ylabel('Cluster')
        plt.savefig('cluster.png')
Author: RuthRainbow, Project: DataMining, Lines: 34, Source: scilearn.py
Example 9: visualize_silhouette_score
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn import metrics

def visualize_silhouette_score(X, y_km):
    cluster_labels = np.unique(y_km)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = metrics.silhouette_samples(X,
                                                 y_km,
                                                 metric='euclidean')
    y_ax_lower, y_ax_upper = 0, 0
    yticks = []
    for i, c in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[y_km == c]
        c_silhouette_vals.sort()
        y_ax_upper += len(c_silhouette_vals)
        color = cm.jet(i / n_clusters)
        plt.barh(range(y_ax_lower, y_ax_upper),
                 c_silhouette_vals,
                 height=1.0,
                 edgecolor='none',
                 color=color)
        yticks.append((y_ax_lower + y_ax_upper) / 2)
        y_ax_lower += len(c_silhouette_vals)

    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg,
                color="red",
                linestyle="--")
    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')
    plt.show()
Author: wislish, Project: Python-Data-Analysis, Lines: 30, Source: userClassify.py
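A hypothetical driver for visualize_silhouette_score, assuming y_km comes from a fitted KMeans on a feature matrix X (both made up here): the plot shows one horizontal bar per sample, grouped and sorted by cluster, with a red dashed line at the mean silhouette value.

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=1)
y_km = KMeans(n_clusters=3, n_init=10, random_state=1).fit_predict(X)
visualize_silhouette_score(X, y_km)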
Example 10: fit
def fit(self, X, y=None, **kwargs):
    """
    Fits the model and generates the silhouette visualization.
    """
    # TODO: decide to use this method or the score method to draw.
    # NOTE: Probably this would be better in score, but the standard score
    # is a little different and I'm not sure how it's used.

    # Fit the wrapped estimator
    self.estimator.fit(X, y, **kwargs)

    # Get the properties of the dataset
    self.n_samples_ = X.shape[0]
    self.n_clusters_ = self.estimator.n_clusters

    # Compute the scores of the cluster
    labels = self.estimator.predict(X)
    self.silhouette_score_ = silhouette_score(X, labels)
    self.silhouette_samples_ = silhouette_samples(X, labels)

    # Draw the silhouette figure
    self.draw(labels)

    # Return the estimator
    return self
Author: DistrictDataLabs, Project: yellowbrick, Lines: 25, Source: silhouette.py
Example 11: cluster_driver
def cluster_driver(a_driver):
    # print a_driver['DStats']
    # print "#############################DStats Above##################################################"
    X = StandardScaler().fit_transform(a_driver['DStats'])
    # print X
    # print "DStats are.....::", a_driver['DStats']
    # print "X is...........::", X
    # print "############################Scaled X Above###################################################"
    # db = KMeans(n_clusters=20, n_jobs=-1).fit(X)
    db = DBSCAN(eps=0.45).fit(X)
    # core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    # core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    # n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print("###############################################################################")
    # print('Estimated number of clusters: %d' % n_clusters_)
    # print 'Count of Predicts::', len(X)
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, metric="mahalanobis"))
    # print "##############################DBSCAN X Below#################################################"
    # Rescale the per-sample silhouette coefficients from [-1, 1] to [0, 1].
    return (metrics.silhouette_samples(X, labels, metric="mahalanobis") + 1) / 2
Author: RobbieShan, Project: MindOnData, Lines: 27, Source: eda+11.0.py
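Examples 6 and 11 both rescale the per-sample scores from [-1, 1] into [0, 1] before returning them. A condensed sketch of that trick on synthetic data (parameters are made up, and it assumes DBSCAN produces at least two distinct labels; noise points, labelled -1, are simply treated as one more group by silhouette_samples):

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples
from sklearn.preprocessing import StandardScaler

X, _ = make_blobs(n_samples=200, centers=3, random_state=4)
X = StandardScaler().fit_transform(X)
labels = DBSCAN(eps=0.45, min_samples=5).fit(X).labels_
rescaled = (silhouette_samples(X, labels) + 1) / 2  # map [-1, 1] onto [0, 1]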
Example 12: run_clutering
def run_clutering(n_sites, order_dict, sim_mat):
    n_clusters = 6
    name_file = 'clustering_sil' + str(n_clusters)
    output_file = open(name_file, 'w')
    name_file1 = 'clustering_labels' + str(n_clusters)
    output_file1 = open(name_file1, 'w')
    spectral = cluster.SpectralClustering(n_clusters=n_clusters,
                                          eigen_solver='arpack',
                                          affinity='precomputed')
    labels = spectral.fit_predict(sim_mat)
    silhouette_avg = metrics.silhouette_score(sim_mat, labels)
    output_file.write(" ".join(["aver silhouette_score:", str(silhouette_avg)]))
    # Compute the silhouette scores for each sample
    sample_silhouette_values = metrics.silhouette_samples(sim_mat, labels)
    for siteid in order_dict:
        stringa = ' '.join([siteid,
                            str(sample_silhouette_values[order_dict[siteid]])])
        output_file.write(stringa + '\n')
    for siteid in order_dict:
        stringa = ' '.join([str(siteid),
                            str(labels[order_dict[siteid]])])
        output_file1.write(stringa + '\n')
Author: SherazT, Project: Radiumone_code, Lines: 29, Source: compute_spectral_clustering.py
Example 13: calculateNumberOfIdealClusters
def calculateNumberOfIdealClusters(maxAmount, corpus):
    print("Initializing silhouette analysis")
    range_n_clusters = range(2, maxAmount)  # max amount of clusters equal to amount of jobs
    silhouette_high = 0
    silhouette_high_n_clusters = 2
    for n_clusters in range_n_clusters:
        # Initialize the clusterer with the n_clusters value
        cluster = AgglomerativeClustering(n_clusters=n_clusters, linkage="ward", affinity="euclidean")
        cluster_labels = cluster.fit_predict(corpus)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed clusters.
        silhouette_avg = silhouette_score(corpus, cluster_labels)
        print("For n_clusters = %d, the average silhouette_score is: %.5f" % (n_clusters, silhouette_avg))

        if silhouette_avg > silhouette_high:
            silhouette_high = silhouette_avg
            silhouette_high_n_clusters = n_clusters

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(corpus, cluster_labels)

    print("Highest score = %f for n_clusters = %d" % (silhouette_high, silhouette_high_n_clusters))
    return silhouette_high_n_clusters
Author: edwardmp, Project: clustering-job-offers-and-assessing-job-similarity, Lines: 27, Source: clustering.py
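A hypothetical invocation of calculateNumberOfIdealClusters on synthetic data. Note the snippet assumes an older scikit-learn in which AgglomerativeClustering still accepts the affinity keyword (newer releases renamed it to metric), plus top-level imports for AgglomerativeClustering, silhouette_score, and silhouette_samples:

from sklearn.datasets import make_blobs

corpus, _ = make_blobs(n_samples=120, centers=4, random_state=5)
best_k = calculateNumberOfIdealClusters(10, corpus)  # searches k = 2..9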
Example 14: find_clusters
def find_clusters(df, k_vals=[4, 9, 16, 25], how='hierarchical'):
    '''Find clusters, and if method is k-means run silhouette analysis
    to determine the value of k.

    Args:
        df (data frame): A data frame with normalised expression data.
        k_vals (list or range): The range over which to test k.
        how ('hierarchical' or 'kmeans'): Clustering method.

    Returns:
        A list of cluster numbers.
    '''
    ## Don't run the silhouette analysis for hierarchical clustering,
    ## just calculate the clusters using estimate of k.
    if how == 'hierarchical':
        k = int(np.sqrt((len(df) / 2.0)))
        hc = hac.linkage(df, method='average')
        optimal_clusters = hac.fcluster(hc, t=k, criterion='maxclust')

    ## If method is k-means, run silhouette analysis.
    elif how == 'kmeans':
        best_combined_score = 0
        optimal_k = 2

        ## Try values of k from range and keep track of optimal k according
        ## to silhouette score.
        for k in k_vals:
            km = KMeans(n_clusters=k, random_state=10)
            clusters = km.fit_predict(df)
            silhouette_avg = silhouette_score(df, clusters)
            sample_silhouette_values = silhouette_samples(df, clusters)
            above_mean = 0
            silhouette_sizes = []
            for i in range(k):
                ith_cluster_silhouette_values = sample_silhouette_values[clusters == i]
                size_cluster_i = ith_cluster_silhouette_values.shape[0]
                silhouette_sizes.append(size_cluster_i)
                if max(ith_cluster_silhouette_values) > silhouette_avg:
                    above_mean += 1

            ## This combined score should pick the best value of k
            above_mean_score = float(above_mean) / k
            std_score = 1.0 / np.std(silhouette_sizes) if np.std(silhouette_sizes) > 1.0 else 1.0
            combined_score = (silhouette_avg + above_mean_score + std_score) / 3

            ## Put the clusters in the new column in the data frame.
            if combined_score > best_combined_score:
                best_combined_score = combined_score
                optimal_k = k
                optimal_clusters = clusters

        optimal_clusters = [cluster + 1 for cluster in optimal_clusters]

    return optimal_clusters
Author: peteashton, Project: dots_for_microarrays, Lines: 57, Source: dots_analysis.py
Example 15: test_gmm
def test_gmm():
    sil = pyclust.validate.Silhouette()
    sil_score = sil.score(X, ypred, sample_size=None)
    print(sil_score[0])
    print(sil.sample_scores[:10])

    print(silhouette_score(X, ypred, sample_size=None))
    print(silhouette_samples(X, ypred)[:10])
Author: dominguezus, Project: pyclust, Lines: 11, Source: test_silhouette.py
Example 16: compute_sil_score_vector
def compute_sil_score_vector(filelist):
    """Returns a dictionary indexed by num_clusters whose values
    are vectors of silhouette scores for all samples.
    """
    silscore = dict()
    for f in filelist:
        y, X = get_labels_features(f)
        num_clusters = np.unique(y).shape[0]
        silscore[num_clusters] = sklm.silhouette_samples(X, y)
    return silscore
Author: akhil137, Project: nasa, Lines: 11, Source: silscore.py
Example 17: grind_kmeans
def grind_kmeans(fargs):
    rss, ccs, mus, lv, installed_in, mdat, prefix = fargs
    noaa_init(installed_in)
    produce_kmeans_climates(mdat, lv, ccs, rss, prefix)
    for rs in rss:
        kf = pd.read_csv(noaafile('climates/' + prefix + '_rs_' + str(rs) + '.csv'))
        for cc in ccs:
            #this silhouettes thing gobbles memory, I'm guessing because each worker
            #creates an entire new metric matrix.
            kf['sil' + str(cc)] = silhouette_samples(mus, kf['vtx' + str(cc)].values, metric='precomputed')
        kf.to_csv(noaafile('climates/' + prefix + '_sil_rs_' + str(rs) + '.csv'), index=False)
Author: lpriccio, Project: noaa, Lines: 13, Source: nfeats_20160407.py
Example 18: silhouette_samples
def silhouette_samples(clusters, word2vec_model):
    labels = []
    matrix = []
    for i in range(len(clusters)):
        words = clusters[i][-1]
        _, mat = get_words_matrix(words, word2vec_model)
        for j in range(len(mat)):
            matrix.append(list(mat[j]))
            labels.append(i)
    matrix = np.array(matrix)
    labels = np.array(labels)
    samples_score = metrics.silhouette_samples(matrix, labels)
    return labels, samples_score
Author: hxiaofeng, Project: HTopicModel, Lines: 13, Source: terms_analysis.py
Example 19: identify_accurate_number_of_clusters
def identify_accurate_number_of_clusters(self, model, compounds, max_range=3):
    silhouette_avg_scores = []
    for n_cluster in range(2, max_range):
        assigned_cluster = cluster.KMeans(n_clusters=n_cluster,
                                          n_init=20).fit_predict(model)
        silhouette_avg = silhouette_score(model, assigned_cluster)
        silhouette_avg_scores.append(silhouette_avg)
    max_silhouette_score = max(silhouette_avg_scores)
    index_max_score = silhouette_avg_scores.index(max_silhouette_score)
    final_cluster_num = range(2, max_range)[index_max_score]
    final_assigned_cluster = cluster.KMeans(n_init=20,
                                            n_clusters=final_cluster_num).fit_predict(model)
    final_sample_sil_vals = silhouette_samples(model, final_assigned_cluster)
    return final_assigned_cluster, final_cluster_num, final_sample_sil_vals
Author: sandialabs, Project: BioCompoundML, Lines: 14, Source: cluster.py
Example 20: get_silhouette_scores
def get_silhouette_scores(X, km, nc):
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters.
    cluster_labels = km.labels_
    silhouette_avg = silhouette_score(X, cluster_labels)
    # print("For n_clusters = " + str(nc) + " the average silhouette_score is: "
    #       + str(silhouette_avg))

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    return silhouette_avg, sample_silhouette_values
Author: ferranc, Project: blogproject, Lines: 14, Source: my_functions.py
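A recurring assumption across these examples is that silhouette_score is simply the mean of silhouette_samples over all points (when no subsampling via sample_size is used). A quick sanity check on toy data:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score

X, _ = make_blobs(n_samples=150, centers=3, random_state=2)
labels = KMeans(n_clusters=3, n_init=10, random_state=2).fit_predict(X)
assert np.isclose(silhouette_score(X, labels), silhouette_samples(X, labels).mean())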
Note: the sklearn.metrics.silhouette_samples examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by various authors, and copyright remains with the original authors; before redistributing or reusing any snippet, consult the license of the corresponding project. Do not repost without permission.