本文整理汇总了Python中sklearn.utils.resample函数的典型用法代码示例。如果您正苦于以下问题:Python resample函数的具体用法?Python resample怎么用?Python resample使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了resample函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: initialize
def initialize(self, X, k, random_seed, method='naive'):
    """Pick the initial centroids for k-means from the data points in ``X``.

    Parameters
    ----------
    X : array-like
        The data points; ``X[j]`` is one point.
    k : int
        Number of centroids to select.
    random_seed : int, numpy.random.RandomState or None
        Seed for the random selection. It drives every random draw made
        here, so a fixed seed gives a reproducible initialisation.
    method : {'naive', 'kmeans++'}
        ``'naive'`` picks ``k`` distinct points uniformly at random.
        ``'kmeans++'`` picks the first centroid uniformly and every further
        one with probability proportional to its squared distance to the
        nearest centroid chosen so far
        (https://en.wikipedia.org/wiki/K-means%2B%2B).

    Returns
    -------
    The selected centroids (``k`` of them for ``k >= 1``).

    Raises
    ------
    ValueError
        If ``method`` is unknown, or if ``'kmeans++'`` cannot find a further
        point that differs from the centroids already chosen.
    """
    if method == 'naive':
        # Randomly pick k data points to be the centroids of the k clusters
        return resample(X, n_samples=k, random_state=random_seed, replace=False)
    if method != 'kmeans++':
        raise ValueError(
            "Unknown initialization method %r; expected 'naive' or 'kmeans++'" % (method,)
        )

    # One generator for all draws, so that random_seed also controls the
    # weighted sampling below and not only the first pick.
    if isinstance(random_seed, np.random.RandomState):
        rng = random_seed
    else:
        rng = np.random.RandomState(random_seed)

    # Step 1: Choose one center uniformly at random from among the data points
    centroids = resample(X, n_samples=1, random_state=rng, replace=False)

    n_points = len(X)
    points = np.asarray(X, dtype=float).reshape(n_points, -1)
    # Squared distance of every point to its nearest chosen centroid. It is
    # updated incrementally: only the newest centroid can lower it.
    closest_sq = np.full(n_points, np.inf)
    for _ in range(1, k):
        # Step 2: For each data point x, compute D(x)^2
        newest = np.asarray(centroids[-1], dtype=float).ravel()
        closest_sq = np.minimum(closest_sq, ((points - newest) ** 2).sum(axis=1))
        total_sq = closest_sq.sum()
        if not total_sq > 0:
            raise ValueError(
                "Cannot choose %d distinct centroids: every data point coincides "
                "with an already selected centroid" % (k,)
            )
        # Step 3: Choose one new data point at random as a new center, with
        # probability proportional to D(x)^2. Already selected points are
        # naturally excluded because their probability is 0.
        new_centroid_index = rng.choice(n_points, p=closest_sq / total_sq)
        centroids = np.append(centroids, [X[new_centroid_index]], axis=0)
    return centroids
开发者ID:bluesilence,项目名称:python,代码行数:28,代码来源:KMeans.py
示例2: test_resample
def test_resample():
    """Exercise the border cases and argument validation of ``resample``."""
    # Border case not worth mentioning in doctests
    assert resample() is None

    # Check that invalid arguments yield ValueError
    invalid_calls = (
        (([0], [0, 1]), {}),
        (([0, 1], [0, 1]), {'replace': False, 'n_samples': 3}),
        (([0, 1], [0, 1]), {'meaning_of_life': 42}),
    )
    for bad_args, bad_kwargs in invalid_calls:
        assert_raises(ValueError, resample, *bad_args, **bad_kwargs)

    # Issue:6581, n_samples can be more when replace is True (default).
    oversampled = resample([1, 2], n_samples=5)
    assert_equal(len(oversampled), 5)
开发者ID:allefpablo,项目名称:scikit-learn,代码行数:11,代码来源:test_utils.py
示例3: run_scikit_digits
def run_scikit_digits(epochs=0, layers=0, neuron_count=0):
    """Train a network on the Scikit-Learn handwritten digits dataset.

    The dataset is shuffled with a fixed seed. The first 1250 vectors are
    used for training and the next 260 for testing; the remainder (the
    intended validation share) is not used by this function.

    The trained network is also serialized with ``dill`` to
    ``my_net.pickle`` in the current working directory.

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop for the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not count
        the bias term

    Returns
    -------
    Whatever ``network.run_unseen`` returns for the testing vectors.
    """
    temp_digits = datasets.load_digits()
    # Shuffle data and targets together in one call so they stay aligned.
    # replace=False makes this a permutation: sampling with replacement
    # would duplicate rows and let training vectors leak into the test slice.
    digits, temp_answers = utils.resample(
        temp_digits.data, temp_digits.target, replace=False, random_state=3
    )

    num_of_training_vectors = 1250
    num_of_testing_vectors = 260
    testing_end = num_of_training_vectors + num_of_testing_vectors

    training_set = digits[:num_of_training_vectors]
    answers = temp_answers[:num_of_training_vectors]
    testing_set = digits[num_of_training_vectors:testing_end]

    network = Network(layers, neuron_count, training_set[0])
    network.train(training_set, answers, epochs)

    # The context manager closes the file even if serialization fails.
    with open("my_net.pickle", "wb") as f:
        dill.dump(network, f)

    return network.run_unseen(testing_set)
开发者ID:totalgood,项目名称:capstone,代码行数:54,代码来源:net_launch.py
示例4: resample_training_dataset
def resample_training_dataset(self, labels, feature_array, sizes = (5000,500)):
    """Resample every class to a requested number of training chunks.

    Inputs:
    - labels: array of class labels, 1-D or a single column.
    - feature_array: 2-D array of features, one row per label.
    - sizes: tuple, for each class (0, 1, etc.) the number of training
      chunks you want, in the order of the sorted unique labels.
      i.e. for 500 seizures, 5000 baseline, sizes = (5000, 500), as 0 is
      baseline, 1 is seizure.

    A class with fewer rows than requested is over-sampled: it is repeated
    whole as often as fits and topped up with a random subset drawn without
    replacement. A class with more rows than requested is under-sampled
    without replacement. A class with exactly the requested number of rows
    is passed through unchanged.

    Returns (resampled_labels, resampled_features); labels come back as a
    column vector.

    WARNING: Up-sampling target class prevents random forest oob from being accurate.
    """
    if len(labels.shape) == 1:
        labels = labels[:, None]

    resampled_labels = []
    resampled_features = []
    for i, label in enumerate(np.unique(labels.astype('int'))):
        target_size = sizes[i]
        class_inds = np.where(labels == label)[0]
        class_labels = labels[class_inds]
        class_features = feature_array[class_inds, :]
        n_available = class_features.shape[0]

        if n_available < target_size:  # need to oversample
            n_copies = target_size // n_available
            n_extra_needed = target_size - n_copies * n_available
            feature_parts = [class_features] * n_copies
            label_parts = [class_labels] * n_copies
            if n_extra_needed:
                # A single call keeps the extra features and labels aligned.
                extra_features, extra_labels = resample(
                    class_features, class_labels,
                    n_samples=n_extra_needed, random_state=7, replace=False)
                feature_parts.append(extra_features)
                label_parts.append(extra_labels)
            boot_array = np.vstack(feature_parts)
            boot_labels = np.vstack(label_parts)
        elif n_available > target_size:  # need to undersample
            boot_array, boot_labels = resample(
                class_features, class_labels,
                n_samples=target_size, random_state=7, replace=False)
        else:
            logging.debug('label '+str(label)+ ' had exact n as sample, doing nothing!')
            boot_array = class_features
            boot_labels = class_labels

        resampled_features.append(boot_array)
        resampled_labels.append(boot_labels)

    # stack both up...
    resampled_labels = np.vstack(resampled_labels)
    resampled_features = np.vstack(resampled_features)
    logging.debug('Original label counts: '+str(pd.Series(labels[:,0]).value_counts()))
    logging.debug('Resampled label counts: '+str(pd.Series(resampled_labels[:,0]).value_counts()))
    return resampled_labels, resampled_features
开发者ID:jcornford,项目名称:pyecog,代码行数:53,代码来源:classifier.py
示例5: run_mnist
def run_mnist(epochs, layers, neuron_count):
    """Train on the MNIST ``train.csv`` and write guesses for ``test.csv``.

    Both CSV files are read from the current working directory and their
    first (header) line is skipped. The first column of ``train.csv`` is the
    answer, the remaining columns are the training vector. One guess per
    line is written to ``digits.txt``.

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop for the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not count
        the bias term
    """
    def _load_int_rows(path):
        # Parse every row after the header line into a list of ints.
        with open(path, 'r') as handle:
            all_rows = list(csv.reader(handle))
        return [[int(cell) for cell in row] for row in all_rows[1:]]

    train = _load_int_rows('train.csv')
    test_set = _load_int_rows('test.csv')

    ans_train = [row[0] for row in train]
    train_set = [row[1:] for row in train]
    # NOTE(review): the header was already skipped above, so this drops the
    # first data row as well -- confirm that is intended.
    ans_train.pop(0)
    train_set.pop(0)

    # The same random_state in both calls keeps vectors and answers aligned.
    train_set = utils.resample(train_set, random_state=2)
    ans_train = utils.resample(ans_train, random_state=2)

    network = Network(layers, neuron_count, train_set[0])
    network.train(train_set, ans_train, epochs)

    guess_list = network.run_unseen(test_set)
    with open('digits.txt', 'w') as d:
        d.writelines(str(elem) + '\n' for elem in guess_list)
开发者ID:uglyboxer,项目名称:finnegan,代码行数:52,代码来源:net_launch.py
示例6: test_resample_stratified
def test_resample_stratified():
    """Make sure ``resample`` can stratify."""
    seeded = np.random.RandomState(0)
    total = 100
    positive_rate = .9
    features = seeded.normal(size=(total, 1))
    targets = seeded.binomial(1, positive_rate, size=total)

    shared_kwargs = dict(n_samples=10, random_state=0)

    # Without stratification this draw is expected to contain only 1s.
    plain_targets = resample(features, targets, stratify=None, **shared_kwargs)[1]
    assert (plain_targets == 1).all()

    stratified_targets = resample(features, targets, stratify=targets, **shared_kwargs)[1]
    assert (stratified_targets != 1).any()
    assert stratified_targets.sum() == 9  # all 1s, one 0
开发者ID:daniel-perry,项目名称:scikit-learn,代码行数:15,代码来源:test_utils.py
示例7: eval_prox_random
def eval_prox_random(self, n_sample_node=5, sample_nodes=()):
    """Compare coordinate-based proximity with ground proximity on test nodes.

    Parameters
    ----------
    n_sample_node : int
        Number of nodes to draw with ``resample`` when ``sample_nodes`` is
        empty.
    sample_nodes : sequence of str or sequence of int
        Explicit test nodes. Strings are used as node names directly;
        ints are positions into ``self.cs.nodes()``. The type of the first
        element decides how the whole sequence is interpreted; any other
        element type yields an empty test set.

    Returns
    -------
    dict
        ``"nae"``: a Series of ``abs(c - g) / g`` for every (test node, node)
        pair, where ``c`` is the coordinate-based and ``g`` the ground
        proximity. ``"nae_plot"``: a Series of evenly spaced values from 0.0
        to 1.0 indexed by the sorted ``nae`` values.
    """
    cs = self.cs
    nodes = cs.nodes()

    if len(sample_nodes):
        first = sample_nodes[0]
        if isinstance(first, str):
            test_nodes = list(sample_nodes)
        elif isinstance(first, int):
            test_nodes = [nodes[i] for i in sample_nodes]
        else:
            test_nodes = []
    else:
        test_nodes = resample(nodes, n_samples=n_sample_node)

    # nae of coordinate-based proximity vs ground-proximity
    coor_test = self.coor_all[test_nodes]
    # `.values` replaces DataFrame.as_matrix(), which newer pandas removed.
    ground_prox = (
        cs.proximity_to(sources=test_nodes, dests=cs.nodes()).values.transpose()
    )  # shape: test_nodes x all_nodes
    coor_prox = np.dot(coor_test.values.transpose(), self.coor_all.values)

    nae = pd.Series.combine(
        pd.Series(coor_prox.flatten()), pd.Series(ground_prox.flatten()), lambda c, g: abs(c - g) / g
    )
    # sort_values() replaces Series.order(), which newer pandas removed.
    nae_plot = pd.Series(np.linspace(0.0, 1.0, num=len(nae)), index=nae.sort_values())

    return {"nae": nae, "nae_plot": nae_plot}
开发者ID:blublud,项目名称:coordinate_learning,代码行数:30,代码来源:path_accum_coorsys.py
示例8: bootstrap_auc
def bootstrap_auc(df, col, pred_col, n_bootstrap=1000):
    """
    Calculate the bootstrapped AUC for a given col trying to predict a pred_col.

    Rows where ``col`` is NaN are ignored. The caller's DataFrame is left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        column to retrieve the values from
    pred_col : str
        the column we're trying to predict
    n_bootstrap : int
        the number of bootstrap samples

    Returns
    -------
    numpy.ndarray : AUCs for each sampling. A sampling that contains only a
        single class is skipped and its entry stays at 0.0.
    """
    scores = np.zeros(n_bootstrap)
    old_len = len(df)
    # Work on a filtered copy instead of dropping rows from the caller's frame.
    df = df.dropna(subset=[col])
    new_len = len(df)
    if new_len < old_len:
        logger.info("Dropping NaN values in %s to go from %d to %d rows", col, old_len, new_len)
    values = df[col]
    preds = df[pred_col].astype(int)
    for i in range(n_bootstrap):
        sampled_counts, sampled_pred = resample(values, preds)
        if is_single_class(sampled_pred, col=pred_col):
            continue
        scores[i] = roc_auc_score(sampled_pred, sampled_counts)
    return scores
开发者ID:hammerlab,项目名称:cohorts,代码行数:31,代码来源:model.py
示例9: fit
def fit(self, dataSet):
    """Fit each member of ``self.forest`` on its own resampled ``dataSet``.

    Every row of ``dataSet`` holds the target in its first position and the
    training values in the remaining positions.
    """
    for estimator in self.forest:
        bootstrap_rows = resample(dataSet)
        labels = [row[0] for row in bootstrap_rows]
        attributes = [row[1:] for row in bootstrap_rows]
        estimator.fit(attributes, labels)
开发者ID:agag4510118,项目名称:CS412-Introduction-to-Data-Mining,代码行数:7,代码来源:RandomForest.py
示例10: boot_estimates
def boot_estimates(model, X, y, nboot):
    '''
    Evaluate coefficient estimates for nboot bootstrap samples.

    For every bootstrap sample of (X, y) the model is fitted once and its
    intercept and flattened coefficients are stored as one row.

    Returns an array with one row per bootstrap sample, laid out as
    [intercept..., coefficients...].
    '''
    coefs = []
    for _ in range(nboot):
        boot_X, boot_y = resample(X, y)
        # Fit once per sample and read both attributes from that fit.
        fitted = model.fit(boot_X, boot_y)
        coefs.append(np.hstack([fitted.intercept_, fitted.coef_.ravel()]))
    return np.vstack(coefs)
开发者ID:thomasbrawner,项目名称:python_tools,代码行数:7,代码来源:marginal_effects_example.py
示例11: downsample
def downsample(y, sizes=(30000, 3000)):
    """Draw, for every class, a sample of positions in ``y`` holding that class.

    Parameters
    ----------
    y : array-like
        Class label per element; treated as one-dimensional. Classes are
        expected to be labelled 0, 1, ... in the order of ``sizes``.
    sizes : sequence of int
        Number of positions to draw (with replacement) for class 0, 1, ...

    Returns
    -------
    tuple of list
        One list of sampled positions into ``y`` per entry of ``sizes``.
    """
    labels = np.asarray(y)
    res = []
    for class_i, sz in enumerate(sizes):
        # Positions where y equals this class. The previous comprehension
        # collected the True values of the mask instead of their positions.
        indices = np.flatnonzero(labels == class_i).tolist()
        res.append(resample(indices, replace=True, n_samples=sz))
    return tuple(res)
开发者ID:vadimnazarov,项目名称:llama,代码行数:7,代码来源:llama.py
示例12: run_method_usage
def run_method_usage(methods,cases):
    """Plot how often each method occurs, with bootstrapped error bars.

    ``methods`` is a sequence whose items carry the method name in position
    0. ``cases`` is resampled 10000 times to estimate the mean occurrence
    percentage and its spread (1.96 standard deviations) per method. The bar
    chart is saved to ``figure_path + 'method_occurrence.pdf'`` and shown.

    Returns the method order (most frequent first) and the mean percentages
    in that order.
    """
    method_names = [entry[0] for entry in methods]
    n_methods = len(method_names)

    # Bootstrap the percentage error bars:
    boot_percents = np.array([
        100 * np.sum(sample, axis=0) / len(sample)
        for sample in (resample(cases) for _ in range(10000))
    ])
    mean_percents = np.mean(boot_percents, axis=0)
    std_percents = np.std(boot_percents, axis=0) * 1.96

    # Descending order of mean occurrence.
    inds = np.argsort(mean_percents)[::-1].tolist()
    avg_usage = np.mean(mean_percents)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    positions = np.arange(n_methods)
    # Faint horizontal line marking the average usage across methods.
    ax.plot(positions, [avg_usage] * n_methods, '-', color='0.25', lw=1, alpha=0.2)
    ax.bar(
        positions,
        mean_percents[inds],
        0.6,
        color=paired[0],
        linewidth=0,
        yerr=std_percents[inds],
        ecolor=paired[1],
    )
    ax.set_ylabel('Occurrence %', fontsize=30)
    ax.set_xlabel('Method', fontsize=30)
    ax.set_xticks(np.arange(n_methods))
    ax.set_xticklabels(np.array(method_names)[inds], fontsize=8)
    fig.autofmt_xdate()
    fix_axes()
    plt.tight_layout()
    fig.savefig(figure_path+'method_occurrence.pdf', bbox_inches=0)
    fig.show()
    return inds, mean_percents[inds]
开发者ID:IDEALLab,项目名称:design_method_recommendation_JMD_2014,代码行数:30,代码来源:paper_experiments.py
示例13: balanced_resample
def balanced_resample(data, labels):
    """Resample ``data`` and ``labels`` so every label occurs equally often.

    Each label is resampled (with replacement) to the count of the most
    common label. Returns the resampled data and labels, cast back to the
    dtypes of the inputs.
    """
    num_required = mstats.mode(labels)[1]

    balanced_data = []
    balanced_labels = []
    for possible_label in np.unique(labels):
        mask = labels == possible_label
        class_data = data[mask]
        class_labels = labels[mask]

        # Repeat this label's rows until the pool holds at least
        # num_required of them.
        data_pool = np.empty((0, data.shape[1]))
        label_pool = np.array([])
        while len(data_pool) < num_required:
            data_pool = np.concatenate([data_pool, class_data])
            label_pool = np.concatenate([label_pool, class_labels])

        drawn_data, drawn_labels = utils.resample(
            data_pool,
            label_pool,
            n_samples=int(num_required),
            replace=True
        )
        balanced_data.append(drawn_data)
        balanced_labels.append(drawn_labels)

    stacked_data = np.vstack(balanced_data).astype(data.dtype)
    stacked_labels = np.hstack(balanced_labels).astype(labels.dtype)
    return stacked_data, stacked_labels
开发者ID:DSsoto,项目名称:Sub8,代码行数:31,代码来源:utils.py
示例14: fit
def fit(self, X, Y):
    """Train on the examples ``X`` with targets ``Y``.

    ``X`` is stored on the instance as ``self.data`` and ``Y`` is converted
    to a float array. Training runs in three phases: seeding the instance
    state from a small random sample, up to five passes of
    ``process``/``reprocess`` over all examples, and a final ``reprocess``
    loop that runs until its returned delta drops below ``self.tau``.

    NOTE(review): the indentation of this snippet was lost in extraction;
    the nesting below is the reconstruction that matches the usual
    process-then-reprocess structure -- confirm against the original file.
    NOTE(review): ``xrange`` makes this Python 2 code.
    """
    num_examples = len(X)
    data_indices = np.arange(num_examples)
    self.data = X
    Y = np.array(Y, dtype=float)
    # Seed the state with up to 20 example indices drawn without replacement;
    # the fixed random_state picks the same indices on every run.
    sample = resample(data_indices, replace=False, n_samples=min(20, num_examples), random_state=0)
    for i in sample:
        y = Y[i]
        self.S.add(i)
        self.y[i] = y
        self.alpha[i] = 0.0
        self.g[i] = y
    # At most five passes over the data, stopping early once the smallest
    # delta seen in a pass falls below self.tau.
    for i in xrange(5):
        min_delta = 999999999
        # NOTE(review): this loop reuses the name `i` of the pass counter above.
        for i in data_indices:
            self.process(i, Y[i])
            delta = self.reprocess()
            min_delta = min(min_delta, delta)
        if min_delta < self.tau: break
        # Visit the examples in a different order on the next pass.
        data_indices = shuffle(data_indices)
    # Finishing phase: keep calling reprocess() until delta is below self.tau.
    while True:
        delta = self.reprocess()
        if delta < self.tau: break
开发者ID:woohp,项目名称:ai_tidbits,代码行数:28,代码来源:lasvm.py
示例15: test_mnist
def test_mnist(self):
    """LASVM should tell digit 0 from the rest on more than 95% of 300 test samples."""
    def _draw(dataset, count, seed):
        # Sample `count` examples without replacement; label digit 0 as +1
        # and every other digit as -1.
        images, digits = resample(
            dataset.data, dataset.target,
            replace=False, n_samples=count, random_state=seed)
        return images.astype(float), [1 if digit == 0 else -1 for digit in digits]

    mnist = fetch_mldata('MNIST original')

    X, Y = _draw(mnist, 1000, 0)
    svm = LASVM(C=10, tau=0.001)
    svm.fit(X, Y)

    X_test, Y_test = _draw(mnist, 300, 2)
    Y_predict = svm.predict(X_test)

    percent_correct = np.sum(Y_predict == Y_test) / 300.0
    self.assertGreater(percent_correct, 0.95)
开发者ID:woohp,项目名称:ai_tidbits,代码行数:16,代码来源:lasvm.py
示例16: Reduce_scikit_kmeans
def Reduce_scikit_kmeans(img, number_of_colors):
    """Quantize ``img`` to ``number_of_colors`` colors with k-means.

    The image is scaled to [0, 1] floats and flattened to one row per pixel.
    K-means is fitted on a random sample of at most 1000 pixels and then
    used to label every pixel.

    Returns the cluster centers and the per-pixel cluster labels.
    """
    t0 = time()
    from sklearn.cluster import KMeans
    from sklearn.utils import resample

    scaled = np.array(img, dtype=np.float64) / 255
    rows, cols, channels = scaled.shape
    assert channels == 3
    image_array = scaled.reshape(rows * cols, channels)
    LOGGER.info("shape=%s", image_array.shape)

    sample_size = min(image_array.shape[0], 1000)
    image_array_sample = resample(
        image_array, replace=True, n_samples=sample_size, random_state=1)

    settings = dict(
        n_clusters=number_of_colors,
        random_state=1,
        precompute_distances=True,
    )
    kmeans = KMeans(**settings).fit(image_array_sample)
    labels = kmeans.predict(image_array)
    LOGGER.info("ms=%s", ms(t0))
    return kmeans.cluster_centers_, labels
开发者ID:rdefeo,项目名称:image_processing,代码行数:26,代码来源:color.py
示例17: show_bootstrap_statistics
def show_bootstrap_statistics(clf, X, y, features):
    """Print bootstrap statistics of the normalized coefficients of ``clf``.

    ``clf`` is refitted on BOOTSTRAP_ITERATIONS resamples of (X, y). For
    every feature the 95% percentile interval, the ratio mean/std
    ("t-value") and a probability derived from it are printed as a table.
    """
    coefs = [[] for _ in range(len(features))]

    for _ in range(BOOTSTRAP_ITERATIONS):
        X_sample, y_sample = resample(X, y)
        clf.fit(X_sample, y_sample)
        for position, coef in enumerate(get_normalized_coefs(clf)):
            coefs[position].append(coef)

    # The 'Building' row reuses the samples collected for 'POI'.
    poi_index = features.index('POI')
    building_index = features.index('Building')
    coefs[building_index] = coefs[poi_index]

    row_format = '{:<20}{:<20}{:<10}{:<10}'
    print()
    print('***** Bootstrap statistics *****')
    print(row_format.format('Feature', '95% interval', 't-value', 'Pr(>|t|)'))
    print()

    for name, samples in zip(features, coefs):
        values = np.array(samples)
        lo, hi = np.percentile(values, [2.5, 97.5])
        tv = np.mean(values) / np.std(values)
        pr = (1.0 - t.cdf(x=abs(tv), df=len(values))) * 0.5
        print(row_format.format(
            name,
            '({:.3f}, {:.3f})'.format(lo, hi),
            '{:.3f}'.format(tv),
            '{:.3f}'.format(pr),
        ))
开发者ID:milchakov,项目名称:omim,代码行数:34,代码来源:scoring_model.py
示例18: test_resample_stratify_2dy
def test_resample_stratify_2dy():
    """Make sure y can be 2d when stratifying."""
    prng = np.random.RandomState(0)
    n_rows = 100
    data = prng.normal(size=(n_rows, 1))
    targets = prng.randint(0, 2, size=(n_rows, 2))

    _, sampled_targets = resample(
        data, targets, n_samples=50, random_state=prng, stratify=targets)

    assert sampled_targets.ndim == 2
开发者ID:daniel-perry,项目名称:scikit-learn,代码行数:8,代码来源:test_utils.py
示例19: bootstrap_auc
def bootstrap_auc(y_c,y_pred,N=100):
    """Bootstrap the AUC score and print its mean and standard deviation.

    ``y_c`` holds the true classes and ``y_pred`` the predicted scores. The
    two are resampled together ``N`` times and the AUC of every resample is
    computed. Nothing is returned; the summary is printed as
    ``Score is : <mean> +- <std>``.
    """
    # Pair classes and predictions once so each resample keeps rows aligned.
    paired = np.column_stack([y_c, y_pred])
    scores = []
    for _ in range(N):
        res_y = resample(paired)
        scores.append(roc_auc_score(res_y[:, 0], res_y[:, 1]))
    # One pre-formatted argument prints the same text on Python 2 and 3.
    print('Score is : %.4f +- %.4f' % (np.mean(scores), np.std(scores)))
开发者ID:riblidezso,项目名称:mhc_pred,代码行数:9,代码来源:utils.py
示例20: make_pred_prob_plot_data
def make_pred_prob_plot_data(model, df, column, n_boot=1000):
    """Bootstrap the mean predicted probability while sweeping one column.

    ``column`` is set, for all rows of a copy of ``df``, to each value of an
    evenly spaced grid between its observed minimum and maximum. For every
    grid value the probabilities in column 1 of ``model.predict_proba`` are
    resampled ``n_boot`` times and the mean of each resample is kept.

    Parameters
    ----------
    model : object with a ``predict_proba`` method
    df : pandas.DataFrame
        Input rows; not modified.
    column : str
        Name of the column to sweep.
    n_boot : int
        Number of bootstrap resamples per grid value.

    Returns
    -------
    rng : numpy.ndarray
        The grid values.
    probs : numpy.ndarray
        Bootstrap means with shape (n_boot, len(rng)).
    """
    dfc = df.copy()
    rng = np.linspace(df[column].min(), df[column].max())
    probs = []
    for val in rng:
        dfc[column] = val
        pred_probs = model.predict_proba(dfc)[:, 1]
        probs.append([resample(pred_probs).mean() for _ in range(n_boot)])
    return rng, np.array(probs).T
开发者ID:jessedow24,项目名称:Fraud_Detection_Case_Study,代码行数:9,代码来源:make_graphs.py
注:本文中的sklearn.utils.resample函数示例由纯净天空整理自GitHub/MSDocs等源码及文档管理平台,相关代码片段筛选自各路编程大神贡献的开源项目,源码版权归原作者所有,传播和使用请参考对应项目的License;未经允许,请勿转载。
请发表评论