本文整理汇总了Python中sklearn.covariance.MinCovDet类的典型用法代码示例。如果您正苦于以下问题:Python MinCovDet类的具体用法?Python MinCovDet怎么用?Python MinCovDet使用的例子?那么恭喜您, 这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了MinCovDet类的14个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: launch_mcd_on_dataset
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
                          tol_support):
    """Fit MinCovDet on contaminated Gaussian data and check its estimates.

    Standard-normal data of shape (n_samples, n_features) is generated from a
    fixed seed, ``n_outliers`` randomly chosen rows are shifted by +/-5 per
    feature, and the robust fit is compared with statistics computed on the
    untouched rows only.

    Parameters
    ----------
    n_samples, n_features : int
        Shape of the generated data set.
    n_outliers : int
        Number of rows that get shifted.
    tol_loc, tol_cov : float
        Upper bounds on the mean squared error of the location and of the
        covariance estimate.
    tol_support : int
        Minimum number of observations the fitted support must contain.

    Raises
    ------
    AssertionError
        If any of the tolerances is violated, or if the stored distances do
        not match ``mahalanobis(data)``.
    """
    rng = np.random.RandomState(0)
    data = rng.randn(n_samples, n_features)

    # Contaminate a random subset of rows; each coordinate moves by -5 or +5.
    outliers_index = rng.permutation(n_samples)[:n_outliers]
    half_steps = rng.randint(2, size=(n_outliers, n_features)) - 0.5
    data[outliers_index] += 10. * half_steps

    inliers_mask = np.ones(n_samples, dtype=bool)
    inliers_mask[outliers_index] = False
    pure_data = data[inliers_mask]

    # Robust fit; the same generator is handed over as random_state.
    mcd_fit = MinCovDet(random_state=rng).fit(data)
    location = mcd_fit.location_
    covariance = mcd_fit.covariance_
    support = mcd_fit.support_

    # Compare with the estimates learnt from the inliers alone.
    location_gap = pure_data.mean(0) - location
    error_location = np.mean(location_gap ** 2)
    assert error_location < tol_loc

    covariance_gap = empirical_covariance(pure_data) - covariance
    error_cov = np.mean(covariance_gap ** 2)
    assert error_cov < tol_cov

    assert np.sum(support) >= tol_support
    assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)
开发者ID:JeongSeonGyo,项目名称:EnergyData,代码行数:26,代码来源:test_robust_covariance.py
示例2: test_mcd_issue1127
def test_mcd_issue1127():
    """Check that fitting does not break with X.shape = (3, 1).

    With three samples and one feature, n_support equals n_samples.
    """
    generator = np.random.RandomState(0)
    samples = generator.normal(size=(3, 1))
    estimator = MinCovDet()
    estimator.fit(samples)
开发者ID:JeongSeonGyo,项目名称:EnergyData,代码行数:7,代码来源:test_robust_covariance.py
示例3: getMahalanobisRobust
def getMahalanobisRobust(dat, critical_alpha=0.01, good_rows=None):
    """Calculate robust Mahalanobis distances and flag outliers.

    A MinCovDet estimator is fitted on ``dat[good_rows]`` and the Mahalanobis
    distance of every row of ``dat`` to that fit is returned.

    Parameters
    ----------
    dat : ndarray, (n_samples, n_features)
        Observations, one per row.
    critical_alpha : float
        Tail probability of the chi-square distribution used for the cut-off.
    good_rows : boolean ndarray or None
        Rows used for fitting. ``None`` or an empty array selects every row
        that holds at least one non-NaN value.

    Returns
    -------
    mahalanobis_dist : ndarray, (n_samples,)
        Mahalanobis distances (square root of the squared distances returned
        by the estimator); all zeros if the fit fails.
    outliers : boolean ndarray, (n_samples,)
        ``mahalanobis_dist > maha_lim``.
    maha_lim : float
        Critical distance, expressed on the same scale as
        ``mahalanobis_dist``.
    """
    if good_rows is None or np.size(good_rows) == 0:
        good_rows = np.any(~np.isnan(dat), axis=1)

    try:
        robust_cov = MinCovDet().fit(dat[good_rows])
        # MinCovDet.mahalanobis returns *squared* distances.
        mahalanobis_dist = np.sqrt(robust_cov.mahalanobis(dat))
    except ValueError:
        # This step fails if the covariance matrix is singular. That happens
        # when the data is not a unimodal symmetric distribution, for example
        # when there are too many small noisy particles. Take the safe option
        # and return zeros as the Mahalanobis distance in that case.
        mahalanobis_dist = np.zeros(dat.shape[0])

    # Critical value of the Mahalanobis distance from the chi-square
    # distribution. The chi-square quantile applies to the squared distance,
    # so its square root is taken to match the scale of mahalanobis_dist.
    # https://en.wikiversity.org/wiki/Mahalanobis%27_distance
    # http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html
    maha_lim = np.sqrt(chi2.ppf(1 - critical_alpha, dat.shape[1]))
    outliers = mahalanobis_dist > maha_lim

    return mahalanobis_dist, outliers, maha_lim
开发者ID:KezhiLi,项目名称:Multiworm_Tracking,代码行数:28,代码来源:getFilteredSkels.py
示例4: mahalanobis_plot
def mahalanobis_plot(ctry=None, df=None, weighted=True, inliers=False):
    """
    Scatter a two-column data set and overlay Mahalanobis distance contours.

    Contours of the distance under the empirical (MLE) covariance are drawn
    dashed, contours under the MinCovDet robust covariance are drawn dotted.

    See
    http://scikit-learn.org/0.13/modules/outlier_detection.html#fitting-an-elliptic-envelop
    for details.

    Parameters
    ----------
    ctry : country key, optional
        Used to load the data with ``load_res`` when ``df`` is not given, and
        to look up the plot title in ``country_code``.
    df : DataFrame, optional
        Data to plot. The contour grid is two-dimensional, so the data is
        expected to have two columns.
    weighted : bool
        Forwarded to ``load_res``.
    inliers : bool
        If true, the data is passed through ``get_inliers`` first.

    Returns
    -------
    (fig, ax1, ctry)

    Raises
    ------
    ValueError
        If neither ``ctry`` nor ``df`` is supplied.
    """
    if df is None and ctry is None:
        raise ValueError('Either the country or a dataframe must be supplied')
    elif df is None:
        df = load_res(ctry, weighted=weighted)
    if inliers:
        df = get_inliers(df=df)
    X = df.values
    robust_cov = MinCovDet().fit(X)
    # -------------------------------------------------------------------------
    # compare estimators learnt from the full data set with true parameters
    emp_cov = EmpiricalCovariance().fit(X)
    # -------------------------------------------------------------------------
    # Display results
    fig = plt.figure()
    fig.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
    # -------------------------------------------------------------------------
    # Show data set
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.scatter(X[:, 0], X[:, 1], alpha=.5, color='k', marker='.')
    # A caller may pass only a dataframe; there is no country to look up then.
    if ctry is not None:
        ax1.set_title(country_code[ctry])
    # -------------------------------------------------------------------------
    # Show contours of the distance functions on a 100 x 100 grid spanning
    # the axis limits chosen by the scatter plot.
    x_low, x_high = ax1.get_xlim()
    y_low, y_high = ax1.get_ylim()
    xx, yy = np.meshgrid(np.linspace(x_low, x_high, 100),
                         np.linspace(y_low, y_high, 100))
    zz = np.c_[xx.ravel(), yy.ravel()]
    # -------------------------------------------------------------------------
    # mahalanobis() yields squared distances, hence the square root below.
    mahal_emp_cov = emp_cov.mahalanobis(zz)
    mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
    emp_cov_contour = ax1.contour(xx, yy, np.sqrt(mahal_emp_cov),
                                  cmap=plt.cm.PuBu_r,
                                  linestyles='dashed')
    # -------------------------------------------------------------------------
    mahal_robust_cov = robust_cov.mahalanobis(zz)
    mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
    robust_contour = ax1.contour(xx, yy, np.sqrt(mahal_robust_cov),
                                 cmap=plt.cm.YlOrBr_r, linestyles='dotted')
    # NOTE(review): ContourSet.collections is deprecated in recent Matplotlib
    # releases -- confirm against the version this project pins.
    ax1.legend([emp_cov_contour.collections[1], robust_contour.collections[1]],
               ['MLE dist', 'robust dist'],
               loc="upper right", borderaxespad=0)
    ax1.grid()
    return (fig, ax1, ctry)
开发者ID:RaoUmer,项目名称:data-wrangling,代码行数:50,代码来源:outliers_after_weighting.py
示例5: analyze
def analyze(self, mahalanobis_tolerance=2):
    """Estimate grid angle and step size from point offsets, then snap inliers.

    For every point the offset to the point returned by ``closest_point`` is
    stored in ``self.polardata`` (angle multiplied by 4). A MinCovDet fit of
    these offsets gives ``self.theta`` and ``self.step_size``. Points whose
    transformed robust distance is below ``mahalanobis_tolerance`` are rotated
    and scaled into ``self.normalized_points``; those within 0.4 of an integer
    position are rounded to it and ``self.bounds`` tracks their extent as
    [min x, min y, max x, max y].

    Writes to ``self.linedata``, ``self.polardata`` and
    ``self.inlier_indicies``, which are not created here.
    """
    self.inlier_points = np.zeros((len(self.points), 2))
    for id1 in range(len(self.points)):
        # presumably the index of the nearest point other than id1 -- verify
        # against closest_point
        id2 = closest_point(self.points, self.points[id1], id1)[0]
        # keep lines for plotting purposes: start, end, then a [None, None] row
        self.linedata[3*id1] = self.points[id1]
        self.linedata[3*id1+1] = self.points[id2]
        self.linedata[3*id1+2] = [None, None]
        # we are repeating every pi/2, so we compress the angle space by 4x
        a = 4*math.atan2((self.points[id1, 1] - self.points[id2, 1]), (self.points[id1, 0] - self.points[id2, 0]))
        r = np.linalg.norm(self.points[id1] - self.points[id2])
        self.polardata[id1] = [r*math.cos(a), r*math.sin(a)]
    # find the minimal covariance inlier cluster
    self.polar_cov = MinCovDet().fit(self.polardata)
    # extract the grid angle and size. angle is divided by 4 because
    # we previously scaled it up to repeat every 90 deg
    self.theta = math.atan2(-self.polar_cov.location_[1], self.polar_cov.location_[0])/4
    self.step_size = np.linalg.norm(self.polar_cov.location_)
    # extract inlier points
    # NOTE(review): mahalanobis() returns squared distances; the 0.33 exponent
    # is not a square root -- confirm it is intentional.
    polar_mahal = self.polar_cov.mahalanobis(self.polardata)**(0.33)
    inlier_count = 0
    for i in range(len(polar_mahal)):
        if polar_mahal[i] < mahalanobis_tolerance: # stdev tolerance to outliers
            self.inlier_points[inlier_count] = self.points[i]
            self.inlier_indicies[inlier_count] = i
            inlier_count += 1
    # rotate by -theta and divide by the step size
    self.normalized_points = rotate(self.inlier_points[:inlier_count], -self.theta)/self.step_size
    # enumerate grid IDs
    # NOTE(review): np.mean without an axis is a single scalar over all
    # coordinates -- confirm that is what closest_point should receive.
    origin_id = closest_point(self.normalized_points, np.mean(self.normalized_points))[0]
    # shift so that the chosen origin point sits at (0, 0)
    self.normalized_points = self.normalized_points - self.normalized_points[origin_id]
    inlier_count = 0
    # NOTE(review): sys.maxint exists only on Python 2.
    self.bounds = [sys.maxint, sys.maxint, -sys.maxint, -sys.maxint]
    for p in self.normalized_points:
        x = round(p[0])
        y = round(p[1])
        d = np.linalg.norm(p-[x, y])
        if d < 0.4: #tolerance from unit position
            # kept points are compacted to the front of the array in place
            self.normalized_points[inlier_count] = [x, y]
            if (x < self.bounds[0]):
                self.bounds[0] = x
            if (x > self.bounds[2]):
                self.bounds[2] = x
            if (y < self.bounds[1]):
                self.bounds[1] = y
            if (y > self.bounds[3]):
                self.bounds[3] = y
            inlier_count += 1
    # drop the tail left over after compaction
    self.normalized_points = self.normalized_points[:inlier_count]
开发者ID:jcl5m1,项目名称:CVToolsPython,代码行数:57,代码来源:GridTest.py
示例6: estimateGaussian
def estimateGaussian(nb_objects_init, nb_objects_final, thr, who, genes, siRNA,
                     loadingFolder='../resultData/thrivisions/predictions',
                     threshold=0.05,):
    """Score experiments by robust distance to the controls and list hits.

    The three measurements are stacked into one row per experiment. siRNAs
    that occur only once are dropped, a MinCovDet estimator is fitted on the
    rows whose gene is ``'ctrl'``, and every row receives the squared
    Mahalanobis distance to that fit, signed by whether its ``thr`` value
    lies above or below the mean ``thr``.

    Parameters
    ----------
    nb_objects_init, nb_objects_final, thr : sequences of equal length
        Measurements per experiment.
    who, genes, siRNA : sequences of the same length
        Per-experiment labels; ``genes == 'ctrl'`` marks the controls.
    loadingFolder : str
        Forwarded to ``empiricalPvalues`` as ``folder``.
    threshold : float
        q-value below which an experiment counts towards a hit.

    Returns
    -------
    robdist : ndarray
        Signed squared robust distance of every retained experiment.
    pval, qval
        Output of ``empiricalPvalues`` for the non-control experiments with
        a positive distance.
    hits : list
        siRNAs for which at least half of all their experiments fall below
        ``threshold``.
    gene_hits : Counter
        Genes of the hit siRNAs with their multiplicities.
    """
    arr = np.vstack((thr, nb_objects_init, nb_objects_final)).T

    # deleting siRNAs that have only one experiment
    print(len(siRNA))
    all_ = Counter(siRNA)
    siRNA = np.array(siRNA)
    single_use = [si for si in all_ if all_[si] == 1]
    toDelInd = []
    for si in single_use:
        toDelInd.extend(np.where(siRNA == si)[0])
    print(len(toDelInd))

    # np.delete also turns list inputs into arrays, which the element-wise
    # comparisons below rely on.
    arr, who, genes, siRNA = [np.delete(column, toDelInd, 0)
                              for column in (arr, who, genes, siRNA)]
    print(arr.shape)

    is_ctrl = genes == 'ctrl'
    ctrlcov = MinCovDet().fit(arr[is_ctrl])
    robdist = ctrlcov.mahalanobis(arr) * np.sign(arr[:, 0] - np.mean(arr[:, 0]))

    candidates = (genes != 'ctrl') & (robdist > 0)
    new_siRNA = siRNA[candidates]
    pval, qval = empiricalPvalues(np.absolute(robdist[is_ctrl])[:, np.newaxis],
                                  robdist[candidates][:, np.newaxis],
                                  folder=loadingFolder, name="thrivision",
                                  sup=True, also_pval=True)
    assert new_siRNA.shape == qval.shape

    # A list (not a lazy filter object) so that the hits can be both iterated
    # here and returned to the caller.
    hit_counts = Counter(new_siRNA[np.where(qval < threshold)[0]])
    hits = [si for si in hit_counts
            if float(hit_counts[si]) / all_[si] >= 0.5]

    siRNA_list = list(siRNA)
    gene_hits = Counter(genes[siRNA_list.index(el)] for el in hits)

    return robdist, pval, qval, hits, gene_hits
开发者ID:PeterJackNaylor,项目名称:Xb_screen,代码行数:36,代码来源:thrivision.py
示例7: len
#X1 = preprocessing.scale(X2)
n_samples = len(X)
# The last 5% of the rows are highlighted as outliers. The count is used as a
# slice bound, so it has to be an integer.
n_outliers = int(n_samples * 0.05)
n_features = 2

# generate data
# gen_cov = np.eye(n_features)
# gen_cov[0, 0] = 2.
# X = np.dot(np.random.randn(n_samples, n_features), gen_cov)
# # add some outliers
# outliers_cov = np.eye(n_features)
# outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
# X[-n_outliers:] = np.dot(np.random.randn(n_outliers, n_features), outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

###############################################################################
# Display results
fig = plt.figure()
plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

# Show data set
subfig1 = plt.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
                              color='black', label='inliers')
# Slice from an explicit start index: with n_outliers == 0 this selects no
# rows, whereas X[-0:] would select all of them.
outlier_start = n_samples - n_outliers
outlier_plot = subfig1.scatter(X[outlier_start:, 0], X[outlier_start:, 1],
                               color='red', label='outliers')
开发者ID:tkskow,项目名称:ProsjektOppgave,代码行数:31,代码来源:mahalanobis_test.py
示例8: Determinant
n_samples = 125
n_outliers = 25
n_features = 2

# generate data: standard-normal samples multiplied by a diagonal matrix that
# doubles the first feature
gen_cov = np.eye(n_features)
gen_cov[0, 0] = 2.
X = np.random.randn(n_samples, n_features).dot(gen_cov)

# add some outliers: overwrite the last rows with samples whose features
# other than the first are multiplied by 7
outliers_cov = np.eye(n_features)
outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
X[-n_outliers:] = np.random.randn(n_outliers, n_features).dot(outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

###############################################################################
# Display results
fig = pl.figure()
pl.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)

# Show data set: every sample in black, the trailing outlier rows again in red
subfig1 = pl.subplot(3, 1, 1)
inlier_plot = subfig1.scatter(
    X[:, 0], X[:, 1], color='black', label='inliers')
outlier_plot = subfig1.scatter(
    X[-n_outliers:, 0], X[-n_outliers:, 1], color='red', label='outliers')
开发者ID:FH235918,项目名称:scikit-learn,代码行数:30,代码来源:plot_mahalanobis_distances.py
示例9: main
def main():
parser = argparse.ArgumentParser(
description='Plot outlier-like distances for a 2-dimensional dataset')
parser.add_argument(
'dataset', type=argparse.FileType('r'),
help='a CSV file containing the dataset')
parser.add_argument(
'--plot', type=str, choices=['train', 'grid'], default='grid',
help='plot the dataset or a grid evenly distributed over its span')
parser.add_argument(
'--plotdims', type=int, choices=[2, 3], default=2,
help='the number of dimensions to plot')
args = parser.parse_args()
X = np.loadtxt(args.dataset, delimiter=',')
fig = plt.figure()
xformer = NullTransformer()
if X.shape[1] > 2:
xformer = PCA(n_components=2)
X = xformer.fit_transform(X)
if args.plotdims == 2:
plt.scatter(X[:, 0], X[:, 1], s=60, linewidth='0')
else:
plt.scatter(X[:, 0], X[:, 1])
plt.show(block=False)
path_to_script = os.path.realpath(__file__)
dir_of_script = os.path.dirname(path_to_script)
dataset_path = dir_of_script + '/outliers.npy'
np.save(dataset_path, X)
###########################################################################
# Train autoencoder with the n samples until convergence. Run
# evenly distributed samples through the autoencoder and compute
# their reconstruction error.
###########################################################################
maxseq_orig = np.max(X)
minseq_orig = np.min(X)
seqrange = np.abs(maxseq_orig - minseq_orig)
maxseq = maxseq_orig + 0.5 * seqrange
minseq = minseq_orig - 0.5 * seqrange
print("minseq", minseq, "maxseq", maxseq)
if args.plot == 'grid':
seq = np.linspace(minseq, maxseq, num=50, endpoint=True)
Xplot = np.array([_ for _ in product(seq, seq)])
else:
Xplot = X
robust_cov = MinCovDet().fit(X)
robust_md = robust_cov.mahalanobis(Xplot)
empirical_cov = EmpiricalCovariance().fit(X)
empirical_md = empirical_cov.mahalanobis(Xplot)
# Assume Xplot is at least 2-dimensional.
if Xplot.shape[1] > 2:
Xplot2d = bh_sne(Xplot)
else:
Xplot2d = Xplot
robust_md01 = robust_md - np.nanmin(robust_md)
robust_md01 = robust_md01 / np.nanmax(robust_md01)
empirical_md01 = empirical_md - np.nanmin(empirical_md)
empirical_md01 = empirical_md01 / np.nanmax(empirical_md01)
fig = plt.figure()
if args.plotdims == 2:
ax = fig.add_subplot(1, 1, 1)
ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
cmap=plt.cm.jet, c=robust_md01, s=60, linewidth='0')
else:
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], robust_md01,
cmap=plt.cm.jet, color=robust_md01)
ax.set_zlabel('Mahalanobis distance')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Mahalanobis distance (robust covariance)')
fig = plt.figure()
if args.plotdims == 2:
ax = fig.add_subplot(1, 1, 1)
ax.scatter(Xplot2d[:, 0], Xplot2d[:, 1],
cmap=plt.cm.jet, c=empirical_md01, s=60, linewidth='0')
else:
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.plot_trisurf(Xplot2d[:, 0], Xplot2d[:, 1], empirical_md01,
cmap=plt.cm.jet, color=empirical_md01)
ax.set_zlabel('Mahalanobis distance')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Mahalanobis distance (empirical covariance)')
#.........这里部分代码省略.........
开发者ID:Libardo1,项目名称:modeling,代码行数:101,代码来源:outliers.py
示例10: enumerate
# computation: for every contamination level, repeat the experiment and record
# location / covariance errors of three estimators
for i, n_outliers in enumerate(range_n_outliers):
    for j in range(repeat):
        # generate data
        X = np.random.randn(n_samples, n_features)

        # add some outliers: shift randomly chosen rows by -5 or +5 per feature
        outliers_index = np.random.permutation(n_samples)[:n_outliers]
        outliers_offset = 10. * (
            np.random.randint(2, size=(n_outliers, n_features)) - 0.5)
        X[outliers_index] += outliers_offset
        inliers_mask = np.ones(n_samples, dtype=bool)
        inliers_mask[outliers_index] = False

        # fit a Minimum Covariance Determinant (MCD) robust estimator to data
        S = MinCovDet().fit(X)

        # compare raw robust estimates with the true location (zero) and
        # covariance (identity)
        err_loc_mcd[i, j] = np.sum(S.location_ ** 2)
        err_cov_mcd[i, j] = S.error_norm(np.eye(n_features))

        # compare estimators learnt from the full data set with true parameters
        err_loc_emp_full[i, j] = np.sum(X.mean(0) ** 2)
        err_cov_emp_full[i, j] = (
            EmpiricalCovariance().fit(X).error_norm(np.eye(n_features)))

        # compare with an empirical covariance learnt from a pure data set
        # (i.e. "perfect" MCD)
        pure_X = X[inliers_mask]
        pure_location = pure_X.mean(0)
        pure_emp_cov = EmpiricalCovariance().fit(pure_X)
        err_loc_emp_pure[i, j] = np.sum(pure_location ** 2)
        err_cov_emp_pure[i, j] = pure_emp_cov.error_norm(np.eye(n_features))
开发者ID:AlexLerman,项目名称:scikit-learn,代码行数:29,代码来源:plot_robust_vs_empirical_covariance.py
示例11: ols
# Linear model of word_diff on Age with Centre_ID as a categorical factor.
lm2 = ols(
    'word_diff ~ Age + C(Centre_ID)',
    data=clean_st,
    subset=subset,
).fit()
print(lm2.summary())

# <markdowncell>

# # Snippets. Might come back to this later:

# <codecell>

from scipy.stats import pearsonr
from sklearn.covariance import MinCovDet

# just look at what's interesting for now, and drop the NAs involved
clean = st_v_merged.loc[
    :, ['norm_diff', 'Interview_Suggested_Ranking_numerical_']
].dropna(axis=0)

# calculate robust covariance estimate, calculate what's too far away
mcd = MinCovDet()
mcd.fit(clean)

# correlation between the two retained columns (last expression of the cell)
pearsonr(clean.iloc[:, 0], clean.iloc[:, 1])

# <codecell>

# robust distances of every row, sorted in place in ascending order
d = mcd.mahalanobis(clean)
d.sort()
d
开发者ID:kenben,项目名称:Suas,代码行数:29,代码来源:volunteer_quickLook.py
示例12: Determinant
n_samples = 125
n_outliers = 25
n_features = 2

# generate data: standard-normal samples multiplied by a diagonal matrix that
# doubles the first feature
gen_cov = np.eye(n_features)
gen_cov[0, 0] = 2.
X = np.random.randn(n_samples, n_features).dot(gen_cov)

# add some outliers: overwrite the last rows with samples whose features
# other than the first are multiplied by 7
outliers_cov = np.eye(n_features)
outliers_cov[np.arange(1, n_features), np.arange(1, n_features)] = 7.
X[-n_outliers:] = np.random.randn(n_outliers, n_features).dot(outliers_cov)

# fit a Minimum Covariance Determinant (MCD) robust estimator to data
robust_cov = MinCovDet().fit(X)

# compare estimators learnt from the full data set with true parameters
emp_cov = EmpiricalCovariance().fit(X)

# Display results
fig = pl.figure()

# Show data set: every sample in black, the trailing outlier rows again in red
subfig1 = pl.subplot(3, 1, 1)
subfig1.scatter(X[:, 0], X[:, 1], color='black', label='inliers')
subfig1.scatter(
    X[-n_outliers:, 0], X[-n_outliers:, 1], color='red', label='outliers')
# keep the automatic lower x-limit, pin the upper one to 11
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
开发者ID:aravindgd,项目名称:scikit-learn,代码行数:30,代码来源:plot_mahalanobis_distances.py
示例13: __init__
def __init__(self, support_fraction = 0.95, verbose = True, chi2_percentile = 0.995):
    """Store the settings and create the MCD estimator.

    support_fraction: forwarded to the MCD estimator and kept on the instance.
    verbose: stored as ``self.verbose``.
    chi2_percentile: stored as ``self.chi2_percentile``.
    """
    self.verbose, self.support_fraction = verbose, support_fraction
    self.chi2 = stats.chi2
    # store_precision=True asks the estimator to keep its precision matrix
    self.mcd = MCD(
        store_precision=True,
        support_fraction=support_fraction,
    )
    self.chi2_percentile = chi2_percentile
开发者ID:Alkesten,项目名称:Python-Numerics,代码行数:6,代码来源:outlier.py
示例14: Outlier_detection
class Outlier_detection(object):
    """Flag multivariate outliers from robust (MCD) Mahalanobis distances.

    After ``fit`` the instance exposes:

    d2 : squared Mahalanobis distance of every observation
    degrees_of_freedom_ : number of columns of the fitted data
    iextreme_values : boolean mask of observations whose squared distance
        exceeds the chi-square quantile at ``chi2_percentile``
    """

    def __init__(self, support_fraction = 0.95, verbose = True, chi2_percentile = 0.995):
        """Store the settings and create the MCD estimator.

        support_fraction: forwarded to the MCD estimator.
        verbose: if true, ``fit`` prints a short summary.
        chi2_percentile: chi-square quantile (as a fraction) used as the
            cut-off on squared distances.
        """
        self.verbose = verbose
        self.support_fraction = support_fraction
        self.chi2 = stats.chi2
        self.mcd = MCD(store_precision = True, support_fraction = support_fraction)
        self.chi2_percentile = chi2_percentile

    def fit(self, X):
        """Fit the MCD estimator on X and mark the extreme observations.

        Prints some summary stats if ``verbose`` is set.

        X: array with one observation per row.
        Returns self; the mask of extreme rows is in ``iextreme_values``.
        """
        self.mcd.fit(X)
        location = self.mcd.location_
        precision = self.mcd.precision_
        # A list is built explicitly: on Python 3, map() is lazy and
        # np.array(map(...)) would not produce a numeric array.
        d = np.array([distance.mahalanobis(p, location, precision) for p in X])  # Mahalanobis distance values
        self.d2 = d ** 2  # MD squared
        n, self.degrees_of_freedom_ = X.shape
        # Use the configured percentile so that the mask agrees with the
        # threshold line drawn by plot() and with the printed summary.
        cutoff = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_)
        self.iextreme_values = (self.d2 > cutoff)
        if self.verbose:
            # chi2_percentile is a fraction; scale it to match the '%' sign.
            print("%.3f proportion of outliers at %.1f%% chi2 percentile, " % (
                self.iextreme_values.sum() / float(n), 100 * self.chi2_percentile))
            print("with support fraction %.2f." % self.support_fraction)
        return self

    def plot(self, log=False, sort = False ):
        """
        Cause plotting is always fun.

        log: transform the distance-sq to a log ( distance-sq )
        sort: sort the data according to distance before plotting; the
              chi-square quantile curve is drawn alongside for comparison.
              Without sorting, the extreme observations are marked in red.
        """
        n = self.d2.shape[0]
        fig = plt.figure()
        x = np.arange( n )
        ax = fig.add_subplot(111)

        transform = (lambda x: x ) if not log else (lambda x: np.log(x))
        chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_)

        chi_line = transform( chi_line )
        d2 = transform( self.d2 )
        if sort:
            isort = np.argsort( d2 )
            ax.scatter(x, d2[isort], alpha = 0.7, facecolors='none' )
            plt.plot( x, transform(self.chi2.ppf( np.linspace(0,1,n),self.degrees_of_freedom_ )), c="r", label="distribution assuming normal" )
        else:
            ax.scatter(x, d2 )
            extreme_values = d2[ self.iextreme_values ]
            ax.scatter( x[self.iextreme_values], extreme_values, color="r" )

        # Raw string: "\c" is not a valid escape sequence.
        ax.hlines( chi_line, 0, n,
                   label = r"%.1f%% $\chi^2$ quantile"%(100*self.chi2_percentile), linestyles = "dotted" )

        ax.legend()
        ax.set_ylabel("distance squared")
        ax.set_xlabel("observation")
        ax.set_xlim(0, self.d2.shape[0])

        plt.show()
开发者ID:Alkesten,项目名称:Python-Numerics,代码行数:64,代码来源:outlier.py
注:本文中的sklearn.covariance.MinCovDet类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论