This article collects typical usage examples of the bincount function from Python's sklearn.utils.fixes module. If you have been wondering what bincount does, how to call it, and what real uses of it look like, the hand-picked code examples below should help.
Below are 20 code examples of the bincount function, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps the system recommend better Python examples.
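Before the examples, a brief note on the function itself: sklearn.utils.fixes.bincount was a compatibility wrapper around numpy.bincount, kept in sklearn.utils.fixes so that the minlength keyword was available even on old NumPy releases; it has since been removed from scikit-learn, so on a modern install you can substitute np.bincount directly. A minimal sketch of the behavior every example below relies on (pure NumPy, our own variable names):

import numpy as np

# np.bincount counts occurrences of each non-negative integer value.
labels = np.array([0, 1, 1, 3])
print(np.bincount(labels))               # [1 2 0 1]

# minlength pads the result so every bin up to minlength-1 is present,
# even when its count is zero -- the keyword the fixes shim guaranteed.
print(np.bincount(labels, minlength=6))  # [1 2 0 1 0 0]

# weights=... sums per-sample weights per bin instead of raw counts.
w = np.array([0.5, 1.0, 1.0, 2.0])
print(np.bincount(labels, weights=w))    # [0.5 2.  0.  2. ]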
Example 1: check_min_samples_leaf
def check_min_samples_leaf(name):
    X, y = hastie_X, hastie_y

    # Test if leaves contain more than leaf_count training examples
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test boundary value
    assert_raises(ValueError,
                  ForestEstimator(min_samples_leaf=-1).fit, X, y)
    assert_raises(ValueError,
                  ForestEstimator(min_samples_leaf=0).fit, X, y)

    est = ForestEstimator(min_samples_leaf=5, n_estimators=1, random_state=0)
    est.fit(X, y)
    out = est.estimators_[0].tree_.apply(X)
    node_counts = bincount(out)
    # drop inner nodes
    leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count), 4,
                   "Failed with {0}".format(name))

    est = ForestEstimator(min_samples_leaf=0.25, n_estimators=1,
                          random_state=0)
    est.fit(X, y)
    out = est.estimators_[0].tree_.apply(X)
    node_counts = np.bincount(out)
    # drop inner nodes
    leaf_count = node_counts[node_counts != 0]
    assert_greater(np.min(leaf_count), len(X) * 0.25 - 1,
                   "Failed with {0}".format(name))
Developer: nelson-liu, Project: scikit-learn, Lines: 25, Source: test_forest.py
Example 2: check_min_weight_fraction_leaf
def check_min_weight_fraction_leaf(name, X, y):
    # Test if leaves contain at least min_weight_fraction_leaf of the
    # training set
    ForestEstimator = FOREST_ESTIMATORS[name]
    rng = np.random.RandomState(0)
    weights = rng.rand(X.shape[0])
    total_weight = np.sum(weights)

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for max_leaf_nodes in (None, 1000):
        for frac in np.linspace(0, 0.5, 6):
            est = ForestEstimator(min_weight_fraction_leaf=frac,
                                  max_leaf_nodes=max_leaf_nodes,
                                  random_state=0)
            if isinstance(est, (RandomForestClassifier,
                                RandomForestRegressor)):
                est.bootstrap = False

            est.fit(X, y, sample_weight=weights)
            out = est.estimators_[0].tree_.apply(X)
            node_weights = bincount(out, weights=weights)
            # drop inner nodes
            leaf_weights = node_weights[node_weights != 0]
            assert_greater_equal(
                np.min(leaf_weights),
                total_weight * est.min_weight_fraction_leaf,
                "Failed with {0} "
                "min_weight_fraction_leaf={1}".format(
                    name, est.min_weight_fraction_leaf))
Developer: EddieBurning, Project: scikit-learn, Lines: 29, Source: test_forest.py
Example 3: _make_test_folds
def _make_test_folds(self, X, y=None, groups=None):
    if self.shuffle:
        rng = check_random_state(self.random_state)
    else:
        rng = self.random_state
    y = np.asarray(y)
    n_samples = len(X)
    y = ','.join(y).split(',')
    unique_y, y_inversed = np.unique(y, return_inverse=True)
    y_counts = bincount(y_inversed)
    min_groups = np.min(y_counts)
    if np.all(self.n_splits > y_counts):
        raise ValueError("All the n_groups for individual classes"
                         " are less than n_splits=%d."
                         % (self.n_splits))
    if self.n_splits > min_groups:
        warnings.warn(("The least populated class in y has only %d"
                       " members, which is too few. The minimum"
                       " number of groups for any class cannot"
                       " be less than n_splits=%d."
                       % (min_groups, self.n_splits)), Warning)

    # pre-assign each sample to a test fold index using individual KFold
    # splitting strategies for each class so as to respect the balance of
    # classes
    # NOTE: Passing the data corresponding to the ith class, say
    # X[y == class_i], will break when the data is not 100% stratifiable
    # for all classes. So we pass np.zeros(max(c, n_splits)) as data to
    # the KFold
    test_folds = iterative_stratification(X, set(y), self.n_splits, rng)
    return test_folds
Developer: daniaki, Project: ppi_wrangler, Lines: 31, Source: cross_validation.py
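The np.unique(..., return_inverse=True) followed by bincount pattern in Example 3 is the standard idiom for counting class frequencies over arbitrary (even non-integer) labels: return_inverse encodes each label as an index into the sorted unique values, which bincount can then tally. A small self-contained illustration (pure NumPy, variable names are ours):

import numpy as np

y = np.array(['cat', 'dog', 'cat', 'bird', 'cat'])
unique_y, y_inversed = np.unique(y, return_inverse=True)
# unique_y   -> ['bird' 'cat' 'dog']
# y_inversed -> [1 2 1 0 1]  (integer codes into unique_y)
y_counts = np.bincount(y_inversed)
# y_counts   -> [1 3 1], counts aligned with unique_y
print(dict(zip(unique_y, y_counts)))  # {'bird': 1, 'cat': 3, 'dog': 1}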
Example 4: _recompute_centers
def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)

    # Compute a center for each label
    # For each label, average over samples and features
    # TODO: IMPLEMENT
    # Take all of the samples in a cluster and average their features

    return centers
Developer: AkiraKaneshiro, Project: gadsdc, Lines: 33, Source: kmeans_exercise.py
Example 5: _generate_unsampled_indices
def _generate_unsampled_indices(random_state, n_samples):
    '''Samples out of bag'''
    sample_indices = _generate_sample_indices(random_state, n_samples)
    sample_counts = bincount(sample_indices, minlength=n_samples)
    unsampled_mask = sample_counts == 0
    indices_range = np.arange(n_samples)
    unsampled_indices = indices_range[unsampled_mask]
    return unsampled_indices
Developer: thomasbrawner, Project: regime_failure, Lines: 8, Source: oob_validation.py
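To see why bincount(..., minlength=n_samples) recovers the out-of-bag samples: any index never drawn by the bootstrap gets a zero count, so sample_counts == 0 is exactly the OOB mask. A runnable sketch, where we inline a plain bootstrap draw as a stand-in for scikit-learn's private _generate_sample_indices helper (an assumption about its behavior, not a quote of it):

import numpy as np
from sklearn.utils import check_random_state

def _generate_sample_indices(random_state, n_samples):
    # stand-in for sklearn's private helper: draw n_samples indices
    # uniformly with replacement
    rng = check_random_state(random_state)
    return rng.randint(0, n_samples, n_samples)

n_samples = 10
sampled = _generate_sample_indices(0, n_samples)
counts = np.bincount(sampled, minlength=n_samples)
oob = np.arange(n_samples)[counts == 0]
print(sampled)  # in-bag indices, with repeats
print(oob)      # indices never drawn -> out of bag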
Example 6: entropy
def entropy(samples):
    n_samples = len(samples)
    entropy = 0.

    for count in bincount(samples):
        p = 1. * count / n_samples
        if p > 0:
            entropy -= p * np.log2(p)

    return entropy
Developer: EddieBurning, Project: scikit-learn, Lines: 10, Source: test_forest.py
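A quick sanity check of Example 6's helper: a perfectly balanced binary sample has one bit of entropy, while a pure sample has zero. A self-contained version using np.bincount in place of the fixes import (we rename the accumulator to avoid shadowing the function name):

import numpy as np

def entropy(samples):
    n_samples = len(samples)
    h = 0.
    for count in np.bincount(samples):  # counts per integer class
        p = 1. * count / n_samples
        if p > 0:
            h -= p * np.log2(p)
    return h

print(entropy(np.array([0, 0, 1, 1])))  # 1.0 (maximally mixed)
print(entropy(np.array([1, 1, 1, 1])))  # 0.0 (pure)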
Example 7: test_sample_weight
def test_sample_weight():
    """Check sample weighting."""
    # Test that zero-weighted samples are not taken into account
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    sample_weight = np.ones(100)
    sample_weight[y == 0] = 0.0

    clf = DecisionTreeClassifier(random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(clf.predict(X), np.ones(100))

    # Test that low weighted samples are not taken into account at low depth
    X = np.arange(200)[:, np.newaxis]
    y = np.zeros(200)
    y[50:100] = 1
    y[100:200] = 2
    X[100:200, 0] = 200

    sample_weight = np.ones(200)
    sample_weight[y == 2] = .51  # Samples of class '2' are still weightier
    clf = DecisionTreeClassifier(max_depth=1, random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 149.5)

    sample_weight[y == 2] = .50  # Samples of class '2' are no longer weightier
    clf = DecisionTreeClassifier(max_depth=1, random_state=0)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 49.5)  # Threshold should have moved

    # Test that sample weighting is the same as having duplicates
    X = iris.data
    y = iris.target

    duplicates = rng.randint(0, X.shape[0], 200)

    clf = DecisionTreeClassifier(random_state=1)
    clf.fit(X[duplicates], y[duplicates])

    sample_weight = bincount(duplicates, minlength=X.shape[0])
    clf2 = DecisionTreeClassifier(random_state=1)
    clf2.fit(X, y, sample_weight=sample_weight)

    internal = clf.tree_.children_left != tree._tree.TREE_LEAF
    assert_array_almost_equal(clf.tree_.threshold[internal],
                              clf2.tree_.threshold[internal])
Developer: Niteloser, Project: scikit-learn, Lines: 49, Source: test_tree.py
Example 8: _recompute_centers
def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)

    # Accumulate each sample's feature vector into its cluster's row:
    # for each sample, look up its label and add the sample to that center
    for sample_idx in xrange(n_samples):
        label = labels[sample_idx]
        centers[label] += X[sample_idx]

    # Normalize by the size of the cluster to turn sums into means
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
Developer: vijayvenkatesh, Project: datascienceplusMLcode, Lines: 47, Source: kmeans_exercise.py
Example 9: check_min_samples_leaf
def check_min_samples_leaf(name, X, y):
    # Test if leaves contain more than leaf_count training examples
    ForestEstimator = FOREST_ESTIMATORS[name]

    # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
    # by setting max_leaf_nodes
    for max_leaf_nodes in (None, 1000):
        est = ForestEstimator(min_samples_leaf=5,
                              max_leaf_nodes=max_leaf_nodes,
                              random_state=0)
        est.fit(X, y)
        out = est.estimators_[0].tree_.apply(X)
        node_counts = bincount(out)
        # drop inner nodes
        leaf_count = node_counts[node_counts != 0]
        assert_greater(np.min(leaf_count), 4,
                       "Failed with {0}".format(name))
Developer: EddieBurning, Project: scikit-learn, Lines: 17, Source: test_forest.py
Example 10: _balanced_parallel_build_trees
def _balanced_parallel_build_trees(n_trees, forest, X, y, sample_weight,
                                   sample_mask, X_argsorted, seed, verbose):
    """Private function used to build a batch of trees within a job"""
    from sklearn.utils import check_random_state
    from sklearn.utils.fixes import bincount
    import random

    MAX_INT = numpy.iinfo(numpy.int32).max
    random_state = check_random_state(seed)

    trees = []
    for i in xrange(n_trees):
        if verbose > 1:
            print("building tree %d of %d" % (i + 1, n_trees))
        seed = random_state.randint(MAX_INT)

        tree = forest._make_estimator(append=False)
        tree.set_params(compute_importances=forest.compute_importances)
        tree.set_params(random_state=check_random_state(seed))

        if forest.bootstrap:
            n_samples = X.shape[0]
            if sample_weight is None:
                curr_sample_weight = numpy.ones((n_samples,),
                                                dtype=numpy.float64)
            else:
                curr_sample_weight = sample_weight.copy()

            ty = list(enumerate(y))
            indices = DataUtils.FilterData(ty, val=1, frac=0.5, col=1,
                                           indicesToUse=0, indicesOnly=1)[0]
            indices2 = random_state.randint(0, len(indices), len(indices))
            indices = [indices[j] for j in indices2]

            sample_counts = bincount(indices, minlength=n_samples)
            curr_sample_weight *= sample_counts
            curr_sample_mask = sample_mask.copy()
            curr_sample_mask[sample_counts == 0] = False

            tree.fit(X, y, sample_weight=curr_sample_weight,
                     sample_mask=curr_sample_mask, X_argsorted=X_argsorted,
                     check_input=False)
            tree.indices = curr_sample_mask
        else:
            tree.fit(X, y, sample_weight=sample_weight,
                     sample_mask=sample_mask, X_argsorted=X_argsorted,
                     check_input=False)

        trees.append(tree)
    return trees
Developer: ryancoleman, Project: TDT-tutorial-2014, Lines: 41, Source: random_forest_functions.py
Example 11: _recompute_centers
def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)

    # Compute a center for each label
    # For each label, average over samples and features
    # TODO: IMPLEMENT
    # 1. For each sample
    # 2. What label is it? Let's say its label is 'label'
    # 3. Add the sample's feature i to centers[label] at feature value i

    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
Developer: asbhat, Project: Data-Science-Class, Lines: 39, Source: template_kmeans_exercise.py
Example 12: _iter_indices
def _iter_indices(self):
    rng = np.random.RandomState(self.random_state)
    cls_count = bincount(self.y_indices)

    for n in range(self.n_iter):
        train = []
        test = []

        for i, cls in enumerate(self.classes):
            sample_size = int(cls_count[i] * (1 - self.test_size))
            randint = rng.randint(cls_count[i], size=sample_size)
            aidx = np.where((self.y == cls))[0]
            iidx = aidx[randint]
            oidx = aidx[list(set(range(cls_count[i])).difference(
                set(randint)))]

            train.extend(iidx)
            test.extend(oidx)

        train = rng.permutation(train)
        test = rng.permutation(test)

        yield train, test
Developer: Anhmike, Project: Kaggle_HomeDepot, Lines: 22, Source: extreme_ensemble_selection.py
Example 13: _recompute_centers
def _recompute_centers(X, labels, n_clusters):
    """
    Computation of cluster centers / means.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)

    labels: array of integers, shape (n_samples)
        Current label assignment

    n_clusters: int
        Number of desired clusters

    Returns
    -------
    centers: array, shape (n_clusters, n_features)
        The resulting centers
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    # Initialize centers to all zero
    centers = np.zeros((n_clusters, n_features))
    n_samples_in_cluster = bincount(labels, minlength=n_clusters)

    # Compute a center for each label: accumulate every sample into the
    # row of the cluster it belongs to
    for i in range(n_samples):
        for j in range(n_features):
            centers[labels[i], j] += X[i, j]

    # Normalize by the size of the cluster
    centers /= n_samples_in_cluster[:, np.newaxis]

    return centers
Developer: GusSand, Project: GADS7, Lines: 39, Source: kmeans_exercise_final.py
Example 14: test_sample_weight
def test_sample_weight():
    """Check sample weighting."""
    # Test that zero-weighted samples are not taken into account
    X = np.arange(100)[:, np.newaxis]
    y = np.ones(100)
    y[:50] = 0.0

    sample_weight = np.ones(100)
    sample_weight[y == 0] = 0.0

    clf = tree.DecisionTreeClassifier()
    clf.fit(X, y, sample_weight=sample_weight)
    assert_array_equal(clf.predict(X), np.ones(100))

    # Test that low weighted samples are not taken into account at low depth
    X = np.arange(200)[:, np.newaxis]
    y = np.zeros(200)
    y[50:100] = 1
    y[100:200] = 2
    X[100:200, 0] = 200

    sample_weight = np.ones(200)
    sample_weight[y == 2] = .51  # Samples of class '2' are still weightier
    clf = tree.DecisionTreeClassifier(max_depth=1)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 149.5)

    sample_weight[y == 2] = .50  # Samples of class '2' are no longer weightier
    clf = tree.DecisionTreeClassifier(max_depth=1)
    clf.fit(X, y, sample_weight=sample_weight)
    assert_equal(clf.tree_.threshold[0], 49.5)  # Threshold should have moved

    # Test that sample weighting is the same as having duplicates
    X = iris.data
    y = iris.target

    duplicates = rng.randint(0, X.shape[0], 1000)

    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X[duplicates], y[duplicates])

    from sklearn.utils.fixes import bincount
    sample_weight = bincount(duplicates, minlength=X.shape[0])
    clf2 = tree.DecisionTreeClassifier(random_state=1)
    clf2.fit(X, y, sample_weight=sample_weight)

    internal = clf.tree_.children_left != tree._tree.TREE_LEAF
    assert_array_equal(clf.tree_.threshold[internal],
                       clf2.tree_.threshold[internal])

    # Test negative weights
    X = iris.data
    y = iris.target

    sample_weight = -np.ones(X.shape[0])
    clf = tree.DecisionTreeClassifier(random_state=1)
    assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)

    sample_weight = np.ones(X.shape[0])
    sample_weight[0] = -1
    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    # Check that predict_proba returns valid probabilities in the presence of
    # samples with negative weight
    X = iris.data
    y = iris.target

    sample_weight = rng.normal(.5, 1.0, X.shape[0])
    clf = tree.DecisionTreeClassifier(random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)
    proba = clf.predict_proba(X)
    assert (proba >= 0).all() and (proba <= 1).all()
Developer: Calvin-O, Project: scikit-learn, Lines: 74, Source: test_tree.py
Example 15: sensitivity_specificity_support
#......... (the beginning of this function is omitted here) .........
            raise ValueError("Target is %s but average='binary'. Please "
                             "choose another average setting." % y_type)
    elif pos_label not in (None, 1):
        warnings.warn("Note that pos_label (set to %r) is ignored when "
                      "average != 'binary' (got %r). You may use "
                      "labels=[pos_label] to specify a single positive class."
                      % (pos_label, average), UserWarning)

    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        n_labels = len(labels)
        labels = np.hstack(
            [labels, np.setdiff1d(
                present_labels, labels, assume_unique=True)])

    # Calculate tp_sum, pred_sum, true_sum ###
    if y_type.startswith('multilabel'):
        raise ValueError('imblearn does not support multilabel')
    elif average == 'samples':
        raise ValueError("Sample-based precision, recall, fscore is "
                         "not meaningful outside multilabel "
                         "classification. See the accuracy_score instead.")
    else:
        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]

        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = bincount(
                tp_bins, weights=tp_bins_weights, minlength=len(labels))
        else:
            # Pathological case
            true_sum = pred_sum = tp_sum = np.zeros(len(labels))
        if len(y_pred):
            pred_sum = bincount(
                y_pred, weights=sample_weight, minlength=len(labels))
        if len(y_true):
            true_sum = bincount(
                y_true, weights=sample_weight, minlength=len(labels))

        # Compute the true negatives
        tn_sum = y_true.size - (pred_sum + true_sum - tp_sum)

        # Retain only selected labels
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]
        pred_sum = pred_sum[indices]
        tn_sum = tn_sum[indices]

    if average == 'micro':
        tp_sum = np.array([tp_sum.sum()])
        pred_sum = np.array([pred_sum.sum()])
        true_sum = np.array([true_sum.sum()])
        tn_sum = np.array([tn_sum.sum()])

    # Finally, we have all our sufficient statistics. Divide! #
    with np.errstate(divide='ignore', invalid='ignore'):
        # Divide, and on zero-division, set scores to 0 and warn:
        # Oddly, we may get an "invalid" rather than a "divide" error
        # here.
        specificity = _prf_divide(tn_sum, tn_sum + pred_sum - tp_sum,
                                  'specificity', 'predicted', average,
                                  warn_for)
        sensitivity = _prf_divide(tp_sum, true_sum, 'sensitivity', 'true',
                                  average, warn_for)

    # Average the results
    if average == 'weighted':
        weights = true_sum
        if weights.sum() == 0:
            return 0, 0, None
    elif average == 'samples':
        weights = sample_weight
    else:
        weights = None

    if average is not None:
        assert average != 'binary' or len(specificity) == 1
        specificity = np.average(specificity, weights=weights)
        sensitivity = np.average(sensitivity, weights=weights)
        true_sum = None  # return no support

    return sensitivity, specificity, true_sum
Developer: kellyhennigan, Project: cueexp_scripts, Lines: 101, Source: classification.py
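The core bincount trick in Example 15 is worth isolating: once labels are encoded as 0..n_classes-1, the true-positive count per class is just a bincount of the true labels restricted to the positions where prediction equals truth, and minlength keeps classes with zero hits in the output. A minimal sketch (our own toy arrays):

import numpy as np

y_true = np.array([0, 0, 1, 1, 2, 2])
y_pred = np.array([0, 1, 1, 1, 2, 0])
n_classes = 3

tp_bins = y_true[y_true == y_pred]  # true labels at correctly predicted positions
tp_sum = np.bincount(tp_bins, minlength=n_classes)    # TP per class   -> [1 2 1]
pred_sum = np.bincount(y_pred, minlength=n_classes)   # predicted count -> [2 3 1]
true_sum = np.bincount(y_true, minlength=n_classes)   # actual count    -> [2 2 2]

recall = tp_sum / true_sum  # per-class sensitivity -> [0.5 1.  0.5]
print(tp_sum, pred_sum, true_sum, recall)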
Example 16: _parallel_build_estimators
def _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                               seeds, verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = ("sample_weight" in
                             getargspec(ensemble.base_estimator_.fit)[0])

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features,
                                                  max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices = sample_without_replacement(
                    n_samples,
                    n_samples - max_samples,
                    random_state=random_state)
                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples,
                                                     max_samples,
                                                     random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)
            estimator.fit((X[indices])[:, features], y[indices])
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
Developer: orazaro, Project: kgml, Lines: 89, Source: bag.py
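The bootstrap handling in Example 16 (also used in Examples 10 and 19) relies on a useful equivalence: fitting on rows drawn with replacement is the same as fitting once on all rows with sample_weight set to how many times each row was drawn, and bincount(indices, minlength=n_samples) computes exactly those multiplicities. A standalone sketch of the idea on synthetic data, assuming an estimator that supports sample_weight; the scikit-learn test suite asserts this same equivalence for tree thresholds (see Examples 7 and 14):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X = rng.rand(50, 3)
y = (X[:, 0] > 0.5).astype(int)

indices = rng.randint(0, 50, 50)             # bootstrap draw, with replacement
counts = np.bincount(indices, minlength=50)  # multiplicity of each row

clf_rows = DecisionTreeClassifier(random_state=0).fit(X[indices], y[indices])
clf_wts = DecisionTreeClassifier(random_state=0).fit(X, y, sample_weight=counts)

# Both trees were grown from the same bootstrap sample, expressed two ways;
# this comparison should print True.
print(np.array_equal(clf_rows.predict(X), clf_wts.predict(X)))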
Example 17: _document_frequency
def _document_frequency(X):
    """Count the number of non-zero values for each feature in sparse X."""
    if sp.isspmatrix_csr(X):
        return bincount(X.indices, minlength=X.shape[1])
    else:
        return np.diff(sp.csc_matrix(X, copy=False).indptr)
Developer: NPSDC, Project: Online-News-Clustering-SMAI-PROJECT-, Lines: 6, Source: featureExtraction.py
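For a CSR matrix, X.indices is the flat array of column indices of all stored non-zeros, so a bincount over it yields the number of non-zero entries per column, i.e. the document frequency of each term in a document-term matrix. A quick demonstration with a toy matrix of ours:

import numpy as np
import scipy.sparse as sp

# 3 documents x 4 terms; a non-zero entry means the term occurs in the doc
X = sp.csr_matrix(np.array([[1, 0, 2, 0],
                            [0, 0, 1, 0],
                            [3, 0, 1, 0]]))

df = np.bincount(X.indices, minlength=X.shape[1])
print(df)  # [2 0 3 0] -> term 0 appears in 2 docs, term 2 in all 3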
Example 18: _iter_indices
def _iter_indices(self, frame, y):
    """Iterate the indices with stratification.

    Parameters
    ----------
    frame : H2OFrame
        The frame to split

    y : string
        The column to stratify.

    Returns
    -------
    train : np.ndarray, shape=(n_samples,)
        The train indices

    test : np.ndarray, shape=(n_samples,)
        The test indices
    """
    n_samples = frame.shape[0]
    n_train, n_test = _validate_shuffle_split(n_samples,
                                              self.test_size,
                                              self.train_size)

    # need to validate y...
    y = _val_y(y)
    target = np.asarray(frame[y].as_data_frame(use_pandas=True)[y].tolist())

    classes, y_indices = np.unique(target, return_inverse=True)
    n_classes = classes.shape[0]

    class_counts = bincount(y_indices)
    if np.min(class_counts) < 2:
        raise ValueError('The least populated class in y has only 1 '
                         'member, which is too few. The minimum number '
                         'of labels for any class cannot be less than 2.')

    if n_train < n_classes:
        raise ValueError('The train_size=%d should be greater than or '
                         'equal to the number of classes=%d'
                         % (n_train, n_classes))

    if n_test < n_classes:
        raise ValueError('The test_size=%d should be greater than or '
                         'equal to the number of classes=%d'
                         % (n_test, n_classes))

    rng = check_random_state(self.random_state)
    p_i = class_counts / float(n_samples)
    n_i = np.round(n_train * p_i).astype(int)
    t_i = np.minimum(class_counts - n_i,
                     np.round(n_test * p_i).astype(int))

    for _ in range(self.n_splits):
        train = []
        test = []

        for i, class_i in enumerate(classes):
            permutation = rng.permutation(class_counts[i])
            perm_indices_class_i = np.where(
                (target == class_i))[0][permutation]

            train.extend(perm_indices_class_i[:n_i[i]])
            test.extend(perm_indices_class_i[n_i[i]:n_i[i] + t_i[i]])

        # Might end up here with fewer samples in train and test than we
        # asked for, due to rounding errors.
        if len(train) + len(test) < n_train + n_test:
            missing_indices = np.where(
                bincount(train + test, minlength=len(target)) == 0)[0]
            missing_indices = rng.permutation(missing_indices)

            n_missing_train = n_train - len(train)
            n_missing_test = n_test - len(test)

            if n_missing_train > 0:
                train.extend(missing_indices[:n_missing_train])
            if n_missing_test > 0:
                test.extend(missing_indices[-n_missing_test:])

        train = rng.permutation(train)
        test = rng.permutation(test)

        yield train, test
Developer: tgsmith61591, Project: skutil, Lines: 79, Source: split.py
Example 19: grow_forest
def grow_forest(forest, X, y, seeds, labels=None):
    """Grow a forest of random trees"""
    # Convert data
    X, = check_arrays(X, dtype=DTYPE, sparse_format="dense")
    # Make a list container for grown trees
    n_trees = forest.n_estimators
    trees = []
    # For each tree in the forest
    for i in range(n_trees):
        # Make a np.random.RandomState instance from the tree's planting seed
        random_state = check_random_state(seeds[i])
        # Generate a random seed for a branching seed
        seed = random_state.randint(MAX_INT)
        # Make a decision tree object
        tree = forest._make_estimator(append=False)
        # Init the tree's RandomState instance with the generated seed;
        # this will randomize which features the tree will use
        tree.set_params(random_state=check_random_state(seed))
        # If we are bootstrapping
        if forest.bootstrap:
            # If we are given labels
            if labels is not None:
                # Then we need to bootstrap via labels.
                # We can do this by using StratifiedShuffleSplit
                # to gain a random sample from each label
                sss = cross_validation.StratifiedShuffleSplit(
                    labels,
                    n_iter=1,
                    test_size=np.unique(labels).size,
                    random_state=check_random_state(seed))
                # Then we'll bootstrap our X and y for the label samples chosen
                for train, test in sss:
                    X_lbs = X[test]
                    y_lbs = y[test]
                    break
                # Then get the number of samples
                n_samples = X_lbs.shape[0]
                # Generate a uniform sample weight
                curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
                # Then randomly choose n_samples from all samples,
                # with replacement
                indices = random_state.randint(0, n_samples, n_samples)
                # Use bincount to turn the drawn indices into a random
                # binning histogram that sums to n_samples
                sample_counts = bincount(indices, minlength=n_samples)
                # Apply these randomized counts to the old uniform weights
                curr_sample_weight *= sample_counts
                # Fit the tree using these new sample weights
                tree.fit(X_lbs, y_lbs, sample_weight=curr_sample_weight,
                         check_input=False)
                # Then restrict the tree's indices to the samples that had
                # non-zero weights
                tree.indices_ = sample_counts > 0.
            else:
                # Then get the number of samples
                n_samples = X.shape[0]
                # Generate a uniform sample weight
                curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
                # Then randomly choose n_samples from all samples,
                # with replacement
                indices = random_state.randint(0, n_samples, n_samples)
                # Use bincount to turn the drawn indices into a random
                # binning histogram that sums to n_samples
                sample_counts = bincount(indices, minlength=n_samples)
                # Apply these randomized counts to the old uniform weights
                curr_sample_weight *= sample_counts
                # Fit the tree using these new sample weights
                tree.fit(X, y, sample_weight=curr_sample_weight,
                         check_input=False)
                # Then restrict the tree's indices to the samples that had
                # non-zero weights
                tree.indices_ = sample_counts > 0.
        # If we aren't bootstrapping
        else:
            # Just fit the data with no random weights
            tree.fit(X, y, check_input=False)
        # Add the grown tree to the container
        trees.append(tree)
    # Return all of the trained trees
    return trees
Developer: ruffsl, Project: CS7616P1, Lines: 74, Source: labeled_bootstraping.py
Example 20: _make_test_folds
def _make_test_folds(self, frame, y):
    if self.shuffle:
        rng = check_random_state(self.random_state)
    else:
        rng = self.random_state

    # validate that it's a string
    y = _val_y(y)  # gets a string back or None
    if y is None:
        raise ValueError('H2OStratifiedKFold requires a target name '
                         '(got None)')

    target = frame[y].as_data_frame(use_pandas=True)[y].values
    n_samples = target.shape[0]
    unique_y, y_inversed = np.unique(target, return_inverse=True)
    y_counts = bincount(y_inversed)
    min_labels = np.min(y_counts)

    if np.all(self.n_folds > y_counts):
        raise ValueError(('All the n_labels for individual classes'
                          ' are less than %d folds.'
                          % self.n_folds), Warning)
    if self.n_folds > min_labels:
        warnings.warn(('The least populated class in y has only %d'
                       ' members, which is too few. The minimum'
                       ' number of labels for any class cannot'
                       ' be less than n_folds=%d.'
                       % (min_labels, self.n_folds)), Warning)

    # NOTE FROM SKLEARN:
    # pre-assign each sample to a test fold index using individual KFold
    # splitting strategies for each class so as to respect the balance of
    # classes
    # NOTE: Passing the data corresponding to the ith class, say
    # X[y == class_i], will break when the data is not 100% stratifiable
    # for all classes. So we pass np.zeros(max(c, n_folds)) as data to
    # the KFold.
    # Remember, however, that we might be using the older KFold API,
    # which doesn't have a split method...
    if SK18:
        per_cls_cvs = [
            KFold(self.n_folds,  # using sklearn's KFold here
                  shuffle=self.shuffle,
                  random_state=rng).split(np.zeros(max(count, self.n_folds)))
            for count in y_counts
        ]
    else:
        per_cls_cvs = [
            KFold(max(count, self.n_folds),  # using sklearn's KFold here
                  self.n_folds,
                  shuffle=self.shuffle,
                  random_state=rng)
            for count in y_counts
        ]

    test_folds = np.zeros(n_samples, dtype=np.int)
    for test_fold_indices, per_cls_splits in enumerate(zip(*per_cls_cvs)):
        for cls, (_, test_split) in zip(unique_y, per_cls_splits):
            cls_test_folds = test_folds[target == cls]

            # the test split can be too big because we used
            # KFold(...).split(X[:max(c, n_folds)]) when data is not 100%
            # stratifiable for all the classes
            # (we use a warning instead of raising an exception)
            # If this is the case, let's trim it:
            test_split = test_split[test_split < len(cls_test_folds)]
            cls_test_folds[test_split] = test_fold_indices
            test_folds[target == cls] = cls_test_folds

    return test_folds
Developer: tgsmith61591, Project: skutil, Lines: 70, Source: split.py
Note: The sklearn.utils.fixes.bincount examples in this article were compiled by 纯净天空 from GitHub/MSDocs and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by many developers, and copyright in each fragment remains with its original author. When distributing or using the code, please consult the license of the corresponding project; do not reproduce this compilation without permission.