本文整理汇总了Python中sklearn.utils.validation._num_samples函数的典型用法代码示例。如果您正苦于以下问题:Python _num_samples函数的具体用法?Python _num_samples怎么用?Python _num_samples使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了_num_samples函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: test_check_sample_weight
def test_check_sample_weight():
    """A ``None`` sample weight must match ``X`` in length, total and dtype."""
    from sklearn.cluster.k_means_ import _check_sample_weight

    # NOTE(review): ``X`` is not defined in this snippet; presumably a
    # module-level fixture of the original test file.
    weights = _check_sample_weight(X, None)
    n_rows = _num_samples(X)
    assert_equal(n_rows, _num_samples(weights))
    assert_almost_equal(weights.sum(), n_rows)
    assert_equal(X.dtype, weights.dtype)
开发者ID:daniel-perry,项目名称:scikit-learn,代码行数:7,代码来源:test_k_means.py
示例2: _index_param_value
def _index_param_value(X, v, indices):
    """Private helper function for parameter value indexing.

    ``v`` is returned untouched unless it is array-like and has as many
    samples as ``X``. In that case it is indexed with ``indices``; sparse
    input is converted to CSR first.
    """
    is_per_sample = _is_arraylike(v) and _num_samples(v) == _num_samples(X)
    if not is_per_sample:
        # pass through: skip indexing
        return v
    indexable_value = v.tocsr() if sp.issparse(v) else v
    return safe_indexing(indexable_value, indices)
开发者ID:Meyenhofer,项目名称:pattern-recognition-2016,代码行数:8,代码来源:_validation.py
示例3: _fit_and_score
def _fit_and_score(estimator, depthmaps, offset_points_projected,
                   direction_vectors, true_joints, scorer, train, test,
                   verbose, parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on one train split and score it on the test split.

    Returns a list laid out as ``[train_score (only if return_train_score),
    test_score, n_test_samples, scoring_time, parameters (only if
    return_parameters)]``.

    If fitting raises, ``error_score`` decides the outcome: ``'raise'``
    re-raises, a number is used as the score and a ``FitFailedWarning`` is
    issued, anything else raises ``ValueError``.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (name, value)
                            for name, value in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # Adjust length of sample weights
    if fit_params is None:
        fit_params = {}
    fit_params = {key: _index_param_value(depthmaps, value, train)
                  for key, value in fit_params.items()}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    depth_train, offsets_train, directions_train, truths_train = _safe_split(
        depthmaps, offset_points_projected, direction_vectors, true_joints,
        train)
    depth_test, offsets_test, directions_test, truths_test = _safe_split(
        depthmaps, offset_points_projected, direction_vectors, true_joints,
        test)

    try:
        estimator.fit(depth_train, offsets_train, directions_train,
                      **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
        # A numeric error_score stands in for every requested score.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn("Classifier fit failed. The score on this train-test"
                      " partition for these parameters will be set to %f. "
                      "Details: \n%r" % (error_score, e), FitFailedWarning)
    else:
        test_score = _score(estimator, depth_test, truths_test, scorer)
        if return_train_score:
            train_score = _score(estimator, depth_train, truths_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    result = []
    if return_train_score:
        result.append(train_score)
    result += [test_score, _num_samples(depth_test), scoring_time]
    if return_parameters:
        result.append(parameters)
    return result
开发者ID:aoikaneko,项目名称:RandomTreeWalk,代码行数:59,代码来源:grid_search.py
示例4: _fit_and_score
def _fit_and_score(estimator, Z, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on ``Z[train]`` and score it on ``Z[test]``.

    Returns a list laid out as ``[train_score (only if return_train_score),
    test_score, n_test_samples, scoring_time, parameters (only if
    return_parameters)]``.

    If fitting raises, ``error_score`` decides the outcome: ``'raise'``
    re-raises, a number is used as the score and a ``FitFailedWarning`` is
    issued, anything else raises ``ValueError``.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (name, value)
                            for name, value in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    if fit_params is None:
        fit_params = {}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    Z_train = Z[train]
    Z_test = Z[test]

    try:
        estimator.fit(Z_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
        # A numeric error_score stands in for every requested score.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn("Classifier fit failed. The score on this train-test"
                      " partition for these parameters will be set to %f. "
                      "Details: \n%r" % (error_score, e), FitFailedWarning)
    else:
        test_score = _score(estimator, Z_test, scorer)
        if return_train_score:
            train_score = _score(estimator, Z_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    result = []
    if return_train_score:
        result.append(train_score)
    result += [test_score, _num_samples(Z_test), scoring_time]
    if return_parameters:
        result.append(parameters)
    return result
开发者ID:KartikPadmanabhan,项目名称:sparkit-learn,代码行数:57,代码来源:cross_validation.py
示例5: score_each_boost
def score_each_boost(estimator, parameters,
                     min_n_estimators,
                     X, y, sample_weight,
                     score_func, train, test,
                     verbose):
    """Score every boosting stage of ``estimator`` on the test split.

    Stages with fewer than ``min_n_estimators`` estimators are skipped. If
    ``staged_predict`` yields fewer stages than
    ``estimator.n_estimators - min_n_estimators + 1``, the last observed score
    is repeated to pad the result to that length.

    Returns
    -------
    all_scores : list
        One score per retained (or padded) stage.
    all_clf_params : list of dict
        A copy of ``parameters`` per stage with ``'n_estimators'`` set to the
        number of estimators of that stage.
    n_test_samples : list of int
        The number of test samples, repeated once per stage.
    """
    if verbose > 1:
        start_time = time.time()
        msg = ', '.join('%s=%s' % (k, v) for k, v in parameters.items())
        # Single-argument parenthesised print behaves the same on Python 2
        # and Python 3.
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X_test, y_test, sample_weight_test = _safe_split(
        estimator, X, y, sample_weight, test, train)

    test_score_params = {}
    if sample_weight is not None:
        test_score_params['sample_weight'] = sample_weight_test

    this_n_test_samples = _num_samples(X_test)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    for i, y_pred in enumerate(estimator.staged_predict(X_test)):
        if i + 1 < min_n_estimators:
            continue
        score = score_func(y_test, y_pred, **test_score_params)
        all_scores.append(score)
        clf_para = copy(parameters)
        clf_para['n_estimators'] = i + 1
        all_clf_params.append(clf_para)
        n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early
    n_expected = estimator.n_estimators - min_n_estimators + 1
    if len(all_scores) < n_expected:
        # NOTE(review): if no stage reached ``min_n_estimators`` the lists are
        # empty and the lookups below raise IndexError (unchanged behaviour).
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        # Entry ``j`` of the result lists describes the stage with
        # ``j + first_stage`` estimators; the first retained stage is never
        # smaller than 1 because the skip test above compares ``i + 1``.
        first_stage = max(min_n_estimators, 1)
        for i in range(len(all_scores), n_expected):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = i + first_stage
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print("[BoostGridSearchCV] %s %s" % ((64 - len(end_msg)) * '.',
                                             end_msg))
    return all_scores, all_clf_params, n_test_samples
开发者ID:Reikyo,项目名称:hhana,代码行数:56,代码来源:grid_search.py
示例6: test_retrieve_samples_from_non_standard_shape
def test_retrieve_samples_from_non_standard_shape():
    """``_num_samples`` must agree with ``len`` when ``shape`` is not numeric."""

    class TestNonNumericShape:
        """Container whose ``shape`` holds a string but which defines ``len``."""

        def __init__(self):
            self.shape = ("not numeric",)

        def __len__(self):
            return len([1, 2, 3])

    odd_container = TestNonNumericShape()
    assert _num_samples(odd_container) == len(odd_container)
开发者ID:daniel-perry,项目名称:scikit-learn,代码行数:10,代码来源:test_validation.py
示例7: fit
def fit(self, X, y):
    """Actual fitting, performing the search over parameters.

    Samples candidate parameter settings with ``ParameterSampler``, evaluates
    each one through ``cv_fit_and_score`` in parallel, and stores the best
    result in ``best_score_`` / ``best_params_``. When ``self.refit`` is set,
    a clone of the estimator with the best parameters is fitted on all of the
    data and stored in ``best_estimator_``.

    Raises ``ValueError`` if ``y`` is given and its length differs from the
    number of samples in ``X``. Returns ``self``.
    """
    parameter_iterable = ParameterSampler(self.param_distributions,
                                          self.n_iter,
                                          random_state=self.random_state)
    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = indexable(X, y)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if self.verbose > 0:
        if isinstance(parameter_iterable, Sized):
            n_candidates = len(parameter_iterable)
            print("Fitting {0} folds for each of {1} candidates, totalling"
                  " {2} fits".format(len(cv), n_candidates,
                                     n_candidates * len(cv)))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    out = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )(
        delayed(cv_fit_and_score)(clone(base_estimator), X, y, self.scoring,
                                  parameters, cv=cv)
        for parameters in parameter_iterable)

    # Rank on the score only. Comparing whole result tuples falls through to
    # the parameter dicts on a score tie, which raises TypeError on Python 3.
    # ``sorted`` is stable, so ties keep the earliest candidate.
    best = sorted(out, key=lambda result: result[0], reverse=True)[0]
    self.best_params_ = best[1]
    self.best_score_ = best[0]

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best[1])
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator

    return self
开发者ID:MD2Korg,项目名称:cStress-model,代码行数:54,代码来源:puffMarker.py
示例8: _iter_test_indices
def _iter_test_indices(self, X, y, groups=None):
    """Yield test indices for each fold by striding over the sorted targets.

    Samples are ordered by ``y``; fold ``i`` receives every ``n_splits``-th
    index starting at position ``i``. With ``self.shuffle`` set, each
    consecutive run of ``n_splits`` sorted indices (and the remaining tail) is
    shuffled before the folds are drawn.
    """
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    order = np.argsort(np.asarray(y))

    if self.shuffle:
        rng = check_random_state(self.random_state)
        offset = 0
        for _ in range(n_samples // int(n_splits)):
            block_end = offset + n_splits
            # Slices of a NumPy array are views, so this shuffles in place.
            rng.shuffle(order[offset:block_end])
            offset = block_end
        rng.shuffle(order[offset:])

    for fold in range(n_splits):
        yield order[fold:n_samples:n_splits]
开发者ID:bgruening,项目名称:galaxytools,代码行数:16,代码来源:model_validations.py
示例9: check_holdout
def check_holdout(holdout, X, y, classifier=True):
    """Turn an integer ``holdout`` into a shuffle-split object.

    ``None`` is replaced by ``0.8``. An integral ``holdout`` is used as the
    ``train_size`` of a ``StratifiedShuffleSplit`` (classifier with a binary
    or multiclass target) or of a ``ShuffleSplit`` (all other cases). Any
    non-integral value, including the ``0.8`` default, is returned unchanged.
    """
    if holdout is None:
        holdout = 0.8
    if isinstance(holdout, numbers.Integral):
        if classifier:
            if type_of_target(y) in ['binary', 'multiclass']:
                holdout = StratifiedShuffleSplit(y, train_size=holdout)
            else:
                holdout = ShuffleSplit(_num_samples(y), train_size=holdout)
        else:
            # ``_num_samples`` handles dense and sparse input alike, so no
            # separate ``len`` / ``shape[0]`` branch is needed here.
            holdout = ShuffleSplit(_num_samples(X), train_size=holdout)
    return holdout
开发者ID:blauigris,项目名称:htcluster,代码行数:17,代码来源:grid.py
示例10: check_cv_coverage
def check_cv_coverage(cv, X, y, labels, expected_n_iter=None):
    """Assert that ``cv`` yields valid splits whose test sets cover all samples.

    When ``expected_n_iter`` is given it must equal ``cv.get_n_splits``;
    otherwise that value is used as the expected number of iterations.
    """
    n_samples = _num_samples(X)

    # Check that a all the samples appear at least once in a test fold
    reported_splits = cv.get_n_splits(X, y, labels)
    if expected_n_iter is None:
        expected_n_iter = reported_splits
    else:
        assert_equal(reported_splits, expected_n_iter)

    seen_test_samples = set()
    n_iterations = 0
    for n_iterations, (train, test) in enumerate(cv.split(X, y, labels), 1):
        check_valid_split(train, test, n_samples=n_samples)
        seen_test_samples.update(test)

    # Check that the accumulated test samples cover the whole dataset
    assert_equal(n_iterations, expected_n_iter)
    if n_samples is not None:
        assert_equal(seen_test_samples, set(range(n_samples)))
开发者ID:absolutelyNoWarranty,项目名称:scikit-learn,代码行数:19,代码来源:test_split.py
示例11: my_cross_val_predict
def my_cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1,
                         verbose=0, fit_params=None, pre_dispatch='2*n_jobs',
                         method='predict'):
    """Cross-validated predictions plus the per-fold scores.

    Each fold is handled by ``_my_fit_and_predict`` on a clone of
    ``estimator``. The per-fold predictions are concatenated and re-ordered to
    match the sample order of ``X``.

    Returns ``(predictions, scores)``. Raises ``AttributeError`` if
    ``method`` is not callable on the estimator and ``ValueError`` if the test
    folds do not form a partition of the samples.
    """
    X, y, groups = indexable(X, y, groups)
    cv = check_cv(cv, y, classifier=is_classifier(estimator))

    # Ensure the estimator has implemented the passed decision function
    if not callable(getattr(estimator, method)):
        raise AttributeError('{} not implemented in estimator'
                             .format(method))

    if method in ('decision_function', 'predict_proba', 'predict_log_proba'):
        y = LabelEncoder().fit_transform(y)

    # We clone the estimator to make sure that all the folds are
    # independent, and that it is pickle-able.
    parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
                        pre_dispatch=pre_dispatch)
    fold_results = parallel(delayed(_my_fit_and_predict)(
        clone(estimator), X, y, train, test, verbose, fit_params, method)
        for train, test in cv.split(X, y, groups))

    # Concatenate the predictions
    predictions = [fold_pred for fold_pred, _, _ in fold_results]
    test_indices = np.concatenate([fold_idx for _, fold_idx, _ in fold_results])
    scores = np.concatenate([fold_score for _, _, fold_score in fold_results])

    if not _check_is_permutation(test_indices, _num_samples(X)):
        raise ValueError('cross_val_predict only works for partitions')

    # Invert the permutation so predictions line up with the rows of X.
    n_predicted = len(test_indices)
    inv_test_indices = np.empty(n_predicted, dtype=int)
    inv_test_indices[test_indices] = np.arange(n_predicted)

    # Check for sparse predictions
    if sp.issparse(predictions[0]):
        predictions = sp.vstack(predictions, format=predictions[0].format)
    else:
        predictions = np.concatenate(predictions)
    return predictions[inv_test_indices], scores
开发者ID:teopir,项目名称:ifqi,代码行数:42,代码来源:ifs.py
示例12: _fit
def _fit(self, X, y, parameter_iterable):
    """Actual fitting, performing the search over parameters.

    Validates ``X`` / ``y``, makes sure the dataset files exist, and submits
    one ``evaluate`` task per (parameter setting, dataset file) pair to a
    load-balanced view. Returns ``self``.
    """
    estimator = self.estimator
    cv = self.cv

    n_samples = _num_samples(X)
    X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))
        y = np.asarray(y)
    cv = check_cv(cv, X, y, classifier=is_classifier(estimator))

    if not self.dataset_filenames:
        self.save_dataset_filename(X, y, cv)
    dataset_filenames = self.dataset_filenames

    client = Client()
    lb_view = client.load_balanced_view()

    if self.verbose > 0:
        print("Number of CPU core %d" % len(client.ids()))

    # One entry per parameter setting: (list of submitted tasks, params).
    self.tasks = [([lb_view.apply(evaluate, estimator, dataset_filename, params)
                    for dataset_filename in dataset_filenames], params)
                  for params in parameter_iterable]

    # NOTE(review): the nesting below is reconstructed (the source had lost
    # its indentation); presumably results are only collected after waiting
    # for the tasks -- confirm against the original project.
    if self.sync:
        self.wait()
        self.set_grid_scores()
        self.set_best_score_params()

        if self.refit:
            self.set_best_estimator(estimator)
    return self
开发者ID:brenden17,项目名称:IPyGridSearchCV,代码行数:38,代码来源:grid_search_ipy.py
示例13: predict
def predict(self, X):
    """Predict targets for ``X`` using only the neighbour-selected estimators.

    The estimators to consult are the indices returned by
    ``get_neighbors_above_threshold`` for the neighbours of the first row of
    ``X``; every other estimator is ignored.

    For a multiclass target the class whose selected estimator gives the
    largest binary prediction is returned. Otherwise a sparse indicator matrix
    of shape ``(n_samples, n_estimators)`` is built and mapped back through
    the label binarizer.
    """
    # Validate before touching any fitted state, so an unfitted model fails
    # with the dedicated not-fitted error rather than an incidental one.
    check_is_fitted(self, 'estimators_')

    neighbors = self.nbrs.kneighbors(X, self.n_neighbors, return_distance=False)
    neighbors_set = get_neighbors_above_threshold(self._fit_y, neighbors[0], self.threshold)

    if (hasattr(self.estimators_[0], "decision_function") and
            is_classifier(self.estimators_[0])):
        thresh = 0
    else:
        thresh = .5

    n_samples = _num_samples(X)
    if self.label_binarizer_.y_type_ == "multiclass":
        maxima = np.empty(n_samples, dtype=float)
        maxima.fill(-np.inf)
        argmaxima = np.zeros(n_samples, dtype=int)
        for i, e in enumerate(self.estimators_):
            if i not in neighbors_set:
                continue
            pred = _predict_binary(e, X)
            np.maximum(maxima, pred, out=maxima)
            argmaxima[maxima == pred] = i
        return self.label_binarizer_.classes_[np.array(argmaxima.T)]
    else:
        indices = array.array('i')
        indptr = array.array('i', [0])
        for i, e in enumerate(self.estimators_):
            if i in neighbors_set:
                indices.extend(np.where(_predict_binary(e, X) > thresh)[0])
            # Close the column even for skipped estimators: a CSC matrix
            # needs exactly n_columns + 1 index pointers, so a skipped
            # estimator must contribute an empty column.
            indptr.append(len(indices))
        data = np.ones(len(indices), dtype=int)
        indicator = sp.csc_matrix((data, indices, indptr),
                                  shape=(n_samples, len(self.estimators_)))
        return self.label_binarizer_.inverse_transform(indicator)
开发者ID:piotrchmiel,项目名称:ziwm_hypertension,代码行数:36,代码来源:dynamic_OvR.py
示例14: _fit_and_score
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on the train split and score it with ``estimator.score``.

    Returns ``[test_score, n_test_samples, scoring_time]`` with ``parameters``
    appended when ``return_parameters`` is true. ``scorer``, ``verbose``,
    ``return_train_score`` and ``error_score`` are accepted but not used here.
    """
    if parameters is not None:
        estimator.set_params(**parameters)

    started_at = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    # Leave ``y`` out of the call entirely when there is no target.
    fit_args = (X_train,) if y_train is None else (X_train, y_train)
    estimator.fit(*fit_args, **fit_params)
    test_score = estimator.score(X_test, y_test)

    scoring_time = time.time() - started_at

    result = [test_score, _num_samples(X_test), scoring_time]
    if return_parameters:
        result.append(parameters)
    return result
开发者ID:amueller,项目名称:dask-learn,代码行数:24,代码来源:grid_search.py
示例15: _fit
def _fit(self, X, y, parameter_iterable):
"""Actual fitting, performing the search over parameters."""
estimator = self.estimator
cv = self.cv
n_samples = _num_samples(X)
X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
self.scorer_ = _deprecate_loss_and_score_funcs(
self.loss_func, self.score_func, self.scoring)
if y is not None:
if len(y) != n_samples:
raise ValueError('Target variable (y) has a different number '
'of samples (%i) than data (X: %i samples)'
% (len(y), n_samples))
y = np.asarray(y)
cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
if self.verbose > 0:
if isinstance(parameter_iterable, Sized):
n_candidates = len(parameter_iterable)
print("Fitting {0} folds for each of {1} candidates, totalling"
" {2} fits".format(len(cv), n_candidates,
n_candidates * len(cv)))
base_estimator = clone(self.estimator)
pre_dispatch = self.pre_dispatch
out = Parallel(
n_jobs=self.n_jobs, verbose=self.verbose,
pre_dispatch=pre_dispatch)(
delayed(fit_grid_point_extended)(
X, y, base_estimator, parameters, train, test,
self.scorer_, self.verbose, **self.fit_params)
for parameters in parameter_iterable
for train, test in cv)
# out = []
# for parameters in parameter_iterable:
# fold = 1
# for train, test in cv:
# print "Processing fold", fold, self.fit_params
# out.append(fit_grid_point_extended(X, y, base_estimator, parameters, train, test, self.scorer_, self.verbose, **self.fit_params))
# fold += 1
# Out is a list of triplet: score, estimator, n_test_samples
n_fits = len(out)
n_folds = len(cv)
scores = list()
grid_extras = list()
grid_scores = list()
for grid_start in range(0, n_fits, n_folds):
n_test_samples = 0
score = 0
all_scores = []
all_extras = list()
for this_score, parameters, this_n_test_samples, extra in \
out[grid_start:grid_start + n_folds]:
all_scores.append(this_score)
all_extras.append(extra)
if self.iid:
this_score *= this_n_test_samples
n_test_samples += this_n_test_samples
score += this_score
if self.iid:
score /= float(n_test_samples)
else:
score /= float(n_folds)
scores.append((score, parameters))
# TODO: shall we also store the test_fold_sizes?
grid_scores.append(_CVScoreTuple(
parameters,
score,
np.array(all_scores)))
grid_extras.append(all_extras)
# Store the computed scores
self.grid_scores_ = grid_scores
self.extras_ = grid_extras
# Find the best parameters by comparing on the mean validation score:
# note that `sorted` is deterministic in the way it breaks ties
best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
reverse=True)[0]
self.best_params_ = best.parameters
self.best_score_ = best.mean_validation_score
if self.refit:
# fit the best estimator using the entire dataset
# clone first to work around broken estimators
best_estimator = clone(base_estimator).set_params(
**best.parameters)
if y is not None:
best_estimator.fit(X, y, **self.fit_params)
else:
best_estimator.fit(X, **self.fit_params)
self.best_estimator_ = best_estimator
#.........这里部分代码省略.........
开发者ID:jbjorne,项目名称:CAMDA2014,代码行数:101,代码来源:gridSearch.py
示例16: fit_grid_point_extended
#.........这里部分代码省略.........
test : ndarray, dtype int or bool
Boolean mask or indices for test set.
scorer : callable or None.
If provided must be a scorer callable object / function with signature
``scorer(estimator, X, y)``.
verbose : int
Verbosity level.
**fit_params : kwargs
Additional parameter passed to the fit function of the estimator.
Returns
-------
score : float
Score of this parameter setting on given training / test split.
parameters : dict
The parameters that have been evaluated.
n_samples_test : int
Number of test samples in this split.
"""
if verbose > 1:
start_time = time.time()
msg = '%s' % (', '.join('%s=%s' % (k, v)
for k, v in parameters.items()))
print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))
# update parameters of the classifier after a copy of its base structure
clf = clone(base_estimator)
clf.set_params(**parameters)
if hasattr(base_estimator, 'kernel') and callable(base_estimator.kernel):
# cannot compute the kernel values with custom function
raise ValueError("Cannot use a custom kernel function. "
"Precompute the kernel matrix instead.")
if not hasattr(X, "shape"):
if getattr(base_estimator, "_pairwise", False):
raise ValueError("Precomputed kernels or affinity matrices have "
"to be passed as arrays or sparse matrices.")
X_train = [X[idx] for idx in train]
X_test = [X[idx] for idx in test]
else:
if getattr(base_estimator, "_pairwise", False):
# X is a precomputed square kernel matrix
if X.shape[0] != X.shape[1]:
raise ValueError("X should be a square kernel matrix")
X_train = X[np.ix_(train, train)]
X_test = X[np.ix_(test, train)]
else:
X_train = X[safe_mask(X, train)]
X_test = X[safe_mask(X, test)]
if y is not None:
y_test = y[safe_mask(y, test)]
y_train = y[safe_mask(y, train)]
clf.fit(X_train, y_train, **fit_params)
if scorer is not None:
this_score = scorer(clf, X_test, y_test)
else:
this_score = clf.score(X_test, y_test)
else:
clf.fit(X_train, **fit_params)
if scorer is not None:
this_score = scorer(clf, X_test)
else:
this_score = clf.score(X_test)
if not isinstance(this_score, numbers.Number):
raise ValueError("scoring must return a number, got %s (%s)"
" instead." % (str(this_score), type(this_score)))
if verbose > 2:
msg += ", score=%f" % this_score
if verbose > 1:
end_msg = "%s -%s" % (msg,
logger.short_format_time(time.time() -
start_time))
print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
extraRVs = {}
if extraOut != None:
if "estimator" in extraOut:
extraRVs["estimator"] = clf
if extraOut == "auto" or "predictions" in extraOut:
predictions = clf.predict(X)
predictionIndex = 0
predictionByIndex = {}
for exampleIndex in safe_mask(X, test):
predictionByIndex[exampleIndex] = predictions[predictionIndex]
predictionIndex += 1
extraRVs["predictions"] = predictionByIndex
if (extraOut == "auto" or "importances" in extraOut) and hasattr(clf, "feature_importances_"):
extraRVs["importances"] = clf.feature_importances_
rvs = [this_score, parameters, _num_samples(X_test), extraRVs]
return rvs
开发者ID:jbjorne,项目名称:CAMDA2014,代码行数:101,代码来源:gridSearch.py
示例17: _fit
def _fit(self, X, y, parameter_iterable):
"""Actual fitting, performing the search over parameters."""
estimator = self.estimator
cv = self.cv
self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)
n_samples = _num_samples(X)
X, y = indexable(X, y)
if y is not None:
if len(y) != n_samples:
raise ValueError('Target variable (y) has a different number '
'of samples (%i) than data (X: %i samples)'
% (len(y), n_samples))
cv = check_cv(cv, X, y, classifier=is_classifier(estimator))
if self.verbose > 0:
if isinstance(parameter_iterable, Sized):
n_candidates = len(parameter_iterable)
print("Fitting {0} folds for each of {1} candidates, totalling"
" {2} fits".format(len(cv), n_candidates,
n_candidates * len(cv)))
base_estimator = clone(self.estimator)
param_grid = [(parameters, train, test)
for parameters in parameter_iterable
for (train, test) in cv]
# Because the original python code expects a certain order for the elements, we need to
# respect it.
indexed_param_grid = list(zip(range(len(param_grid)), param_grid))
par_param_grid = self.sc.parallelize(indexed_param_grid, len(indexed_param_grid))
X_bc = self.sc.broadcast(X)
y_bc = self.sc.broadcast(y)
scorer = self.scorer_
verbose = self.verbose
fit_params = self.fit_params
error_score = self.error_score
fas = _fit_and_score
def fun(tup):
(index, (parameters, train, test)) = tup
local_estimator = clone(base_estimator)
local_X = X_bc.value
local_y = y_bc.value
res = fas(local_estimator, local_X, local_y, scorer, train, test, verbose,
parameters, fit_params,
return_parameters=True, error_score=error_score)
return (index, res)
indexed_out0 = dict(par_param_grid.map(fun).collect())
out = [indexed_out0[idx] for idx in range(len(param_grid))]
X_bc.unpersist()
y_bc.unpersist()
# Out is a list of triplet: score, estimator, n_test_samples
n_fits = len(out)
n_folds = len(cv)
scores = list()
grid_scores = list()
for grid_start in range(0, n_fits, n_folds):
n_test_samples = 0
score = 0
all_scores = []
for this_score, this_n_test_samples, _, parameters in \
out[grid_start:grid_start + n_folds]:
all_scores.append(this_score)
if self.iid:
this_score *= this_n_test_samples
n_test_samples += this_n_test_samples
score += this_score
if self.iid:
score /= float(n_test_samples)
else:
score /= float(n_folds)
scores.append((score, parameters))
# TODO: shall we also store the test_fold_sizes?
grid_scores.append(_CVScoreTuple(
parameters,
score,
np.array(all_scores)))
# Store the computed scores
self.grid_scores_ = grid_scores
# Find the best parameters by comparing on the mean validation score:
# note that `sorted` is deterministic in the way it breaks ties
best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
reverse=True)[0]
self.best_params_ = best.parameters
self.best_score_ = best.mean_validation_score
if self.refit:
# fit the best estimator using the entire dataset
# clone first to work around broken estimators
best_estimator = clone(base_estimator).set_params(
**best.parameters)
if y is not None:
#.........这里部分代码省略.........
开发者ID:Sandy4321,项目名称:spark-sklearn,代码行数:101,代码来源:grid_search.py
示例18: _fit_and_score
def _fit_and_score(estimator, X, y, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, return_n_test_samples=False,
                   return_times=False, error_score='raise'):
    """
    Fit estimator and compute scores for a given dataset split.

    ``scorer`` is iterated, so one score per scorer is produced. The returned
    list is laid out as ``[train_score (only if return_train_score),
    test_score, n_test_samples (only if return_n_test_samples), fit_time,
    score_time (both only if return_times), parameters (only if
    return_parameters)]``.

    If fitting raises, ``error_score`` decides the outcome: ``'raise'``
    re-raises, a number is used as the (scalar) score and a
    ``FitFailedWarning`` is issued, anything else raises ``ValueError``.
    """
    if verbose > 1:
        if parameters is None:
            msg = ''
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in parameters.items()))
        LOG.info("[CV] %s %s", msg, (64 - len(msg)) * '.')

    # Adjust length of sample weights
    fit_params = fit_params if fit_params is not None else {}
    fit_params = dict([(k, _index_param_value(X, v, train))
                       for k, v in fit_params.items()])

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)

    try:
        if y_train is None:
            estimator.fit(X_train, **fit_params)
        else:
            estimator.fit(X_train, y_train, **fit_params)

    except Exception as e:
        # Note fit time as time until error
        fit_time = time.time() - start_time
        score_time = 0.0
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)")

    else:
        fit_time = time.time() - start_time
        test_score = [_score(estimator, X_test, y_test, s) for s in scorer]
        score_time = time.time() - start_time - fit_time
        if return_train_score:
            train_score = [_score(estimator, X_train, y_train, s)
                           for s in scorer]

    if verbose > 2:
        # After a failed fit ``test_score`` is the scalar ``error_score``
        # rather than a list, so normalise before formatting. Every score
        # gets its own ", score=" prefix, including the first one.
        shown_scores = test_score if isinstance(test_score, list) else [test_score]
        msg += "".join(", score=%f" % ts for ts in shown_scores)
    if verbose > 1:
        total_time = score_time + fit_time
        end_msg = "%s, total=%s" % (msg, logger.short_format_time(total_time))
        LOG.info("[CV] %s %s", (64 - len(end_msg)) * '.', end_msg)

    ret = [train_score, test_score] if return_train_score else [test_score]

    if return_n_test_samples:
        ret.append(_num_samples(X_test))
    if return_times:
        ret.extend([fit_time, score_time])
    if return_parameters:
        ret.append(parameters)
    return ret
开发者ID:oesteban,项目名称:mriqc,代码行数:76,代码来源:_validation.py
示例19: _fit
def _fit(self, X, y, labels, parameter_iterable):
    """Actual fitting, performing the search over parameters.

    Every parameter setting is evaluated on every split with
    ``_fit_and_score``; the per-split scores are averaged (weighted by test
    set size when ``self.iid`` is true) and stored in ``grid_scores_``. The
    best setting goes to ``best_params_`` / ``best_score_`` and, when
    ``self.refit`` is set, a clone fitted on all data is stored in
    ``best_estimator_``.

    Raises ``ValueError`` if ``y`` is given and its length differs from the
    number of samples in ``X``. Returns ``self``.
    """
    estimator = self.estimator
    cv = check_cv(self.cv, y, classifier=is_classifier(estimator))
    self.scorer_ = check_scoring(self.estimator, scoring=self.scoring)

    n_samples = _num_samples(X)
    X, y, labels = indexable(X, y, labels)

    if y is not None:
        if len(y) != n_samples:
            raise ValueError('Target variable (y) has a different number '
                             'of samples (%i) than data (X: %i samples)'
                             % (len(y), n_samples))

    n_splits = cv.get_n_splits(X, y, labels)

    if self.verbose > 0 and isinstance(parameter_iterable, Sized):
        n_candidates = len(parameter_iterable)
        print("Fitting {0} folds for each of {1} candidates, totalling"
              " {2} fits".format(n_splits, n_candidates,
                                 n_candidates * n_splits))

    base_estimator = clone(self.estimator)

    pre_dispatch = self.pre_dispatch

    runner = Parallel(
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=pre_dispatch
    )
    out = runner(delayed(_fit_and_score)(clone(base_estimator), X, y,
                                         self.scorer_, train, test,
                                         self.verbose, parameters,
                                         self.fit_params,
                                         return_parameters=True,
                                         error_score=self.error_score)
                 for parameters in parameter_iterable
                 for train, test in cv.split(X, y, labels))

    # Out is a list of triplet: score, estimator, n_test_samples
    n_fits = len(out)

    scores = []
    grid_scores = []
    # Results arrive grouped by parameter setting, n_splits entries each.
    for grid_start in range(0, n_fits, n_splits):
        n_test_samples = 0
        score = 0
        all_scores = []
        fold_results = out[grid_start:grid_start + n_splits]
        for this_score, this_n_test_samples, _, parameters in fold_results:
            all_scores.append(this_score)
            if self.iid:
                this_score *= this_n_test_samples
                n_test_samples += this_n_test_samples
            score += this_score
        divisor = float(n_test_samples) if self.iid else float(n_splits)
        score /= divisor
        scores.append((score, parameters))
        # TODO: shall we also store the test_fold_sizes?
        grid_scores.append(_CVScoreTuple(
            parameters,
            score,
            np.array(all_scores)))
    # Store the computed scores
    self.grid_scores_ = grid_scores

    # Find the best parameters by comparing on the mean validation score:
    # note that `sorted` is deterministic in the way it breaks ties
    best = sorted(grid_scores, key=lambda x: x.mean_validation_score,
                  reverse=True)[0]
    self.best_params_ = best.parameters
    self.best_score_ = best.mean_validation_score

    if self.refit:
        # fit the best estimator using the entire dataset
        # clone first to work around broken estimators
        best_estimator = clone(base_estimator).set_params(
            **best.parameters)
        if y is not None:
            best_estimator.fit(X, y, **self.fit_params)
        else:
            best_estimator.fit(X, **self.fit_params)
        self.best_estimator_ = best_estimator
    return self
开发者ID:Meyenhofer,项目名称:pattern-recognition-2016,代码行数:84,代码来源:_search.py
|
请发表评论