本文整理汇总了Python中sklearn.preprocessing.Binarizer类的典型用法代码示例。如果您正苦于以下问题：Python Binarizer类的具体用法？Python Binarizer怎么用？Python Binarizer使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了Binarizer类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: cv_mean_std_array
def cv_mean_std_array(X, y, alphas, ks, n_a, n_k, cv=20):
    """Run Lasso selection per alpha, then kNN cross-validation per (alpha, k).

    ``alphas`` and ``ks`` are indexed as flat sequences of length
    ``n_a * n_k`` at position ``n_k * i + j``; the alpha used for the Lasso
    step of outer iteration ``i`` is ``alphas[i * n_k]``.

    Parameters
    ----------
    X : feature data, forwarded unchanged to ``df_Lasso``.
    y : target values; passed raw to ``df_Lasso`` and binarized at a
        threshold of 1400 for the kNN step.
    alphas, ks : flat parameter grids (see above).
    n_a : number of outer (alpha) iterations.
    n_k : number of inner (k) iterations per alpha.
    cv : forwarded to ``knn_cv_mean_and_std`` as ``cv``.

    Returns
    -------
    tuple ``(cv_mean, cv_std, regressors)`` where the first two are arrays of
    length ``n_a * n_k`` and ``regressors`` is a DataFrame with one column
    appended per alpha.
    """
    # The original read the undefined names ``n_alphas`` / ``n_ks`` here and
    # in every index below; the grid sizes are the ``n_a`` / ``n_k`` arguments.
    n = n_a * n_k
    cv_mean = np.empty(n)
    cv_std = np.empty(n)
    regressors = pd.DataFrame()
    binarizer = Binarizer(threshold=1400)
    # NOTE(review): assumes ``y`` is already in a shape Binarizer accepts
    # (e.g. a single-column 2-D structure) -- confirm against the caller.
    y_binary = binarizer.transform(y).transpose().ravel()
    itt_counter = 0
    print('size n_a: %d n_k: %d' % (n_a, n_k))
    for i in range(n_a):
        base = i * n_k
        print('reg. column : %d' % base)
        temp_string = 'alpha=%f' % alphas[base]
        print(temp_string)
        print(regressors.shape)
        df_temp = pd.DataFrame()
        print('computing for alpha = %f' % (alphas[base]))
        X_lasso, df_temp[temp_string] = df_Lasso(X, y, alphas[base])
        regressors = pd.concat([regressors, df_temp], ignore_index=True, axis=1)
        for j in range(n_k):
            pos = base + j
            print('i:%d, j:%d' % (i, j))
            print('computing for alpha = %f and k = %f' % (alphas[pos], ks[pos]))
            print('X_lasso shape:')
            print(X_lasso.shape)
            cv_mean[pos], cv_std[pos] = knn_cv_mean_and_std(
                X_lasso, y_binary, alphas[pos], ks[pos], cv=cv)
            itt_counter = itt_counter + 1
            print('completed %dth iteration of knn cv mean:%f std:%f, at pos:%d'
                  % (itt_counter, cv_mean[pos], cv_std[pos], pos))
    return cv_mean, cv_std, regressors
开发者ID:AveryLiu,项目名称:Data-Mining,代码行数:29,代码来源:kNN-iterator.py
示例2: cv_mean_std_array
def cv_mean_std_array(X, y, alphas, n_a, cv=20):
    """Collect CV mean/std for OLS, Lasso and Ridge over the first ``n_a`` alphas.

    Parameters
    ----------
    X : feature data forwarded to ``lm_cv_mean_and_std``.
    y : target values, binarized at a threshold of 1400.
    alphas : sequence of regularisation strengths; ``alphas[0:n_a]`` is used.
    n_a : number of alphas to evaluate.
    cv : accepted but not used by the visible body.

    Returns
    -------
    Six arrays of length ``n_a``: OLS means, OLS stds, Lasso means,
    Lasso stds, Ridge means, Ridge stds.
    """
    binarizer = Binarizer(threshold=1400)
    y_binary = binarizer.transform(y).transpose().ravel()
    cv_ols_means, cv_ols_stds = np.empty(n_a), np.empty(n_a)
    cv_lasso_means, cv_lasso_stds = np.empty(n_a), np.empty(n_a)
    cv_ridge_means, cv_ridge_stds = np.empty(n_a), np.empty(n_a)
    for i in range(n_a):
        print('computing for alpha=%f' % alphas[i])
        # NOTE(review): the original call read ``lm_cv_mean_and_std(X, , alphas[i])``
        # (a syntax error, second argument missing). ``y_binary`` is used here
        # because it is computed above and otherwise unused -- confirm that the
        # helper expects the binarized target rather than the raw ``y``.
        (cv_ols_means[i], cv_ols_stds[i],
         cv_lasso_means[i], cv_lasso_stds[i],
         cv_ridge_means[i], cv_ridge_stds[i]) = lm_cv_mean_and_std(X, y_binary, alphas[i])
        print('successfully computed iteration %d' % i)
    return (cv_ols_means, cv_ols_stds, cv_lasso_means, cv_lasso_stds,
            cv_ridge_means, cv_ridge_stds)
开发者ID:AveryLiu,项目名称:Data-Mining,代码行数:10,代码来源:linear-models-iterator.py
示例3: initialize
def initialize():
    """Return a fitted 3-nearest-neighbour classifier over binarized images.

    The images and labels come from ``load_mnist_data()``; the images are
    binarized with a default ``Binarizer`` before the classifier (Jaccard
    metric) is fitted on them.
    """
    images, labels = load_mnist_data()
    images_binarized = Binarizer().fit(images).transform(images)
    classifier = KNeighborsClassifier(n_neighbors=3, metric='jaccard')
    classifier.fit(images_binarized, labels)
    return classifier
开发者ID:mikokm,项目名称:DigitGuesser,代码行数:10,代码来源:classifiers.py
示例4: binarizeMatrix
def binarizeMatrix(dataMatrix, threshold):
    """
    Return dataMatrix converted to a 0/1 matrix by a Binarizer built with the
    given threshold (for scikit-learn's Binarizer, entries strictly greater
    than the threshold become 1 and all others become 0).
    """
    return Binarizer(threshold=threshold).fit_transform(dataMatrix)
开发者ID:Gliganu,项目名称:DMC_Fashion_2016,代码行数:10,代码来源:DatasetManipulator.py
示例5: test_binarizer
def test_binarizer():
    """Check Binarizer output counts, result type and copy semantics.

    Runs the same checks for a dense array and for CSR / CSC sparse matrices.
    """
    def check_counts(X_out, n_zeros, n_ones):
        # Number of entries equal to 0 and to 1 in the binarized output.
        assert_equal(np.sum(X_out == 0), n_zeros)
        assert_equal(np.sum(X_out == 1), n_ones)

    X_ = np.array([[1, 0, 5], [2, 3, 0]])

    for init in (np.array, sp.csr_matrix, sp.csc_matrix):
        X = init(X_.copy())

        # Explicit threshold: only 5 and 3 exceed 2.0.
        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        check_counts(X_bin, 4, 2)
        X_bin = binarizer.transform(X)
        assert_equal(type(X), type(X_bin))

        # Default threshold after an explicit fit.
        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        check_counts(X_bin, 2, 4)

        # copy=True must hand back a new object.
        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        X_bin = toarray(X_bin)
        check_counts(X_bin, 2, 4)

        # copy=False works in place and returns the input object itself.
        # Kept last because it overwrites X.
        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is X)
        X_bin = toarray(X_bin)
        check_counts(X_bin, 2, 4)
开发者ID:Big-Data,项目名称:scikit-learn,代码行数:33,代码来源:test_preprocessing.py
示例6: test_binarizer_vs_sklearn
def test_binarizer_vs_sklearn():
    """The list-of-trajectories Binarizer must agree with the reference one.

    ``BinarizerR`` is fitted on all trajectories concatenated and applied to
    the first trajectory; ``Binarizer`` is fitted on the list of trajectories
    and its first output is compared against that reference.
    """
    # Compare msmbuilder.preprocessing.Binarizer
    # with sklearn.preprocessing.Binarizer
    reference = BinarizerR()
    reference.fit(np.concatenate(trajs))

    candidate = Binarizer()
    candidate.fit(trajs)

    expected = reference.transform(trajs[0])
    actual = candidate.transform(trajs)[0]

    np.testing.assert_array_almost_equal(expected, actual)
开发者ID:Eigenstate,项目名称:msmbuilder,代码行数:14,代码来源:test_preprocessing.py
示例7: wine_quality_white
def wine_quality_white():
    """Convert the raw white-wine-quality CSV into an ``.npz`` file.

    Reads the first 11 columns as float features and the 12th column as the
    integer quality score, turns the score into a binary label that is 1 when
    the score is <= 4 and 0 otherwise, and saves ``data`` and ``label`` to
    ``../../data/clean/uci-wine-quality-white.npz``.
    """
    # white wine quality dataset
    filename = '../../data/raw/mldata/winequality-white.csv'
    # The data corresponds to the 11 first column of the csv file
    data = np.loadtxt(filename, usecols=tuple(range(11)), delimiter=';', dtype=float)
    # Read the label
    quality = np.loadtxt(filename, usecols=(11,), delimiter=';', dtype=int)
    # We need to binarise the label using a threshold at 4
    bn = Binarizer(threshold=4)
    # Binarizer is a 2-D transformer: present the labels as a single column.
    # A bare 1-D vector is rejected by current scikit-learn; the values after
    # the ravel below are the same either way.
    label = bn.fit_transform(quality.reshape(-1, 1))
    # We need to inverse the label -> 1=0 and 0=1
    label = np.ravel(np.abs(label - 1))
    np.savez('../../data/clean/uci-wine-quality-white.npz', data=data, label=label)
开发者ID:I2Cvb,项目名称:data_balancing,代码行数:15,代码来源:conversion.py
示例8: fit
def fit(self, X, y=None):
    """
    Fit the binarizer: choose one binarization threshold per feature.

    Parameters
    ----------
    X : array-like or sparse matrix; validated with ``check_array`` and,
        when sparse, converted to CSC.
    y : labels, required when ``self.method`` is a supervised method.

    Returns
    -------
    self, with ``joint_thresholds_``, ``joint_scores_`` and
    ``binarize_transformer_`` set (and ``classes_`` for supervised methods).

    Raises
    ------
    ValueError
        If ``self.method`` is unknown, if ``y`` is missing for a supervised
        method, or if ``X`` and ``y`` have different numbers of samples.
    """
    methods = Binarizer._UNSUPERVISED_METHODS + Binarizer._SUPERVISED_METHODS
    if self.method not in methods:
        raise ValueError("Method should be one of {0}".format(", ".join(methods)))
    X = check_array(X, accept_sparse=['csr', 'csc'])
    if issparse(X):
        X = X.tocsc()
    if self.method in Binarizer._UNSUPERVISED_METHODS:
        self._fit_unsupervised(X)
        self.joint_thresholds_ = self.thresholds_
        self.joint_scores_ = self.scores_
    else:
        if y is None:
            raise ValueError("y must not be None for supervised binarizers.")
        # Turn the labels into a per-class indicator matrix.
        label_binarizer = SK_LabelBinarizer()
        Y_new = label_binarizer.fit_transform(y)
        self.classes_ = label_binarizer.classes_
        if X.shape[0] != Y_new.shape[0]:
            raise ValueError("X and y have incompatible shapes.\n"
                             "X has %s samples, but y has %s." %
                             (X.shape[0], Y_new.shape[0]))
        self._fit_supervised(X, Y_new)
        if len(self.classes_) <= 2:
            # At most two classes: use the first (only) column of thresholds.
            self.joint_thresholds_ = self.thresholds_[:, 0]
            self.joint_scores_ = self.scores_[:, 0]
        else:
            # Min-max normalise the scores within each class column so that
            # classes are comparable; constant columns get a divisor of 1.
            min_class_scores = np.min(self.scores_, axis=0)
            max_class_scores = np.max(self.scores_, axis=0)
            diffs = max_class_scores - min_class_scores
            diffs[diffs == 0] = 1
            normalized_scores = (self.scores_ - min_class_scores) / diffs
            # For every feature find the class for which it is most useful.
            # TODO: this could probably be done differently.
            optimal_indexes = np.argmax(normalized_scores, axis=1)
            nfeat = self.thresholds_.shape[0]
            # As the threshold of each feature take the value computed for
            # the class where that feature is most useful.
            self.joint_thresholds_ = self.thresholds_[np.arange(nfeat), optimal_indexes]
            self.joint_scores_ = self.scores_[np.arange(nfeat), optimal_indexes]
    # Hand the thresholds over to the wrapped SK_Binarizer. The argument is
    # passed by keyword because recent scikit-learn releases make the
    # constructor parameters keyword-only.
    self.binarize_transformer_ = SK_Binarizer(threshold=self.joint_thresholds_)
    return self
开发者ID:AlexeySorokin,项目名称:pyparadigm,代码行数:56,代码来源:feature_selector.py
示例9: do_logreg
def do_logreg():
    """Tune and cross-validate a logistic regression on above/below-average mpg.

    Reads ``auto_mpg.csv`` from the working directory, binarizes ``mpg`` at
    its mean, scales the remaining columns, grid-searches ``C`` on one half
    of the data and cross-validates the best estimator on the other half.
    Results are printed; nothing is returned.

    NOTE(review): ``nfolds`` is not defined in this function and is expected
    to exist at module level -- confirm.
    """
    import numpy as np
    from sklearn.preprocessing import Binarizer, scale
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
    from sklearn.grid_search import GridSearchCV
    from scipy.stats import expon
    import pandas

    ### load data
    col_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                 'acceleration', 'model_year', 'origin', 'car_name']
    df = pandas.read_csv('auto_mpg.csv')
    df.columns = col_names
    # ``axis`` is passed by keyword: the positional form is not accepted by
    # current pandas.
    df = df.drop('car_name', axis=1)
    lr = LogisticRegression()
    bn = Binarizer(threshold=df['mpg'].mean())
    print("Performing binarization of the mpg variable into above/below average classes")
    # Binarize a one-column frame and flatten, so the target has exactly one
    # entry per sample. A bare Series is rejected (or treated as one row).
    target = bn.fit_transform(df[['mpg']]).ravel()
    data = df.drop('mpg', axis=1)
    data = scale(data)
    print("Splitting into training and test sets")
    data_train, data_test, target_train, target_test = train_test_split(
        data, target, test_size=0.5, random_state=0)
    grid = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    print('Searching for optimal C in {} using {}-fold validation on test set '.format(grid, nfolds))
    tuned_parameters = [{'C': grid}]
    clf = GridSearchCV(lr, tuned_parameters, cv=nfolds, scoring='accuracy')
    clf.fit(data_train, target_train)
    for params, mean_score, _ in clf.grid_scores_:
        print("{}: Mean accuracy {}".format(params, mean_score))
    print(("Cross-validating above/below average mpg prediction\n"
           "using {}-fold validation on the test dataset.\n"
           "Using the best estimator: {}\n").format(nfolds, clf.best_estimator_))
    mean_cross = np.mean(cross_val_score(clf.best_estimator_, data_test, target_test, cv=nfolds))
    print("Mean cross-validated accuracy after optimization is: {}".format(mean_cross))
开发者ID:jmccutchan,项目名称:GA_homework,代码行数:42,代码来源:sklearn_logreg.py
示例10: us_crime
def us_crime():
    """Convert the raw US-crime ``communities.data`` file into an ``.npz`` file.

    Drops the first five columns, imputes missing values, uses every
    remaining column but the last as features, and binarizes the last column
    at 0.65 to obtain the label. ``data`` and ``label`` are saved to
    ``../../data/clean/uci-us-crime.npz``.
    """
    # US crime dataset
    filename = '../../data/raw/mldata/communities.data'
    # The missing data will be consider as NaN
    # Only use 122 continuous features
    tmp_data = np.genfromtxt(filename, delimiter=',')
    tmp_data = tmp_data[:, 5:]
    # replace missing value by the mean
    imp = Imputer(verbose=1)
    tmp_data = imp.fit_transform(tmp_data)
    # extract the data to be saved
    data = tmp_data[:, :-1]
    bn = Binarizer(threshold=0.65)
    # Binarizer is a 2-D transformer: present the label as a single column
    # (a bare 1-D vector is rejected by current scikit-learn), then flatten.
    label = np.ravel(bn.fit_transform(tmp_data[:, -1].reshape(-1, 1)))
    np.savez('../../data/clean/uci-us-crime.npz', data=data, label=label)
开发者ID:I2Cvb,项目名称:data_balancing,代码行数:20,代码来源:conversion.py
示例11: OneHotEncoder
from sklearn.preprocessing import Binarizer, LabelEncoder, OneHotEncoder
# Toy categorical feature used to compare the three encoders below.
x = ['a', 'b', 'c']

# Integer-encode the categories and shape them as a single column.
label_encoder = LabelEncoder()
label_x = label_encoder.fit_transform(x).reshape((len(x), 1))
print(label_x)

# One-hot encode the integer codes.
onehot_encoder = OneHotEncoder()
print(onehot_encoder.fit_transform(label_x).toarray())

# Binarize the integer codes at a threshold of 1.0.
binarizer = Binarizer(threshold=1.0).fit(label_x)
print(binarizer.transform(label_x))
开发者ID:yaochitc,项目名称:learning_libraries,代码行数:13,代码来源:features.py
示例12: Binarizer
# In[3]:

# Import csv data
raw_data = pd.read_csv('OnlineNewsPopularity_wLabels_deleteNoise.csv').iloc[:, 1:]  # read in csv, omit the first column of url
raw_data = raw_data.iloc[:, :-1]  # drop the last column of the file
news_data = raw_data.iloc[:, :-1]  # Take up to the second last column
news_labels = raw_data.iloc[:, -1]  # Take shares column for labels

# Binarize
print('\nBinary Threshold:')
binary_threshold = np.median(raw_data[' shares'])
# ``axis`` is passed by keyword: the positional form is not accepted by
# current pandas.
news_data = news_data.drop(' n_non_stop_words', axis=1)
print(binary_threshold)
binarizer = Binarizer(threshold=binary_threshold)
# Binarizer is a 2-D transformer: feed the labels as one column and flatten
# the result back to one value per sample.
y_binary = binarizer.transform(news_labels.values.reshape(-1, 1)).ravel()

# In[ ]:

# Discretize

# In[25]:

# Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
print('Decision Tree Classifier Accuracy Rate')
tree_score = cross_val_score(tree, news_data, y_binary, cv=10)
开发者ID:AveryLiu,项目名称:Data-Mining,代码行数:29,代码来源:DecisionTree&NB.py
示例13: DictVectorizer
news_data = extracted_data.iloc[:, :-1]  # Take up to the second last column
news_labels = extracted_data[' shares']  # Take shares column for labels

# Data Preprocessing
news_data_transpose = news_data.transpose()
data_into_dict = news_data_transpose.to_dict()
# Build the per-row dicts in the frame's own row order so they stay aligned
# with ``news_labels``. The original iterated ``data_into_dict.iteritems()``,
# which exists only on Python 2 and follows dict order, not row order.
list_data = [data_into_dict[row_label] for row_label in news_data.index]

# Encode
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
transformed_data = dv.fit_transform(list_data).toarray()

# Label Encoder - Binarization
from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=1400)  # Threshold at 1400 because median of shares is 1400
# Binarizer is a 2-D transformer: feed the labels as one column ...
transformed_labels = binarizer.transform(news_labels.values.reshape(-1, 1))
# ... and flatten back to one value per sample for the classifiers.
transformed_labels = transformed_labels.ravel()

############## Classification #################
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC

# Decision Tree Classifier
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
gnb = GaussianNB()
开发者ID:AveryLiu,项目名称:Data-Mining,代码行数:31,代码来源:Data_Preprocessing_Script.py
示例14: ngram
#---------------------------------------------------------------------------------------
#
# Comment section below out if you already have made pickle files
#
#---------------------------------------------------------------------------------------
all_bigr = ngram(X_train, 'bigram')  # starting with all features

print("Starting counting bigrams...")
X_train_bi_counted = count(X_train, all_bigr, 'bigram')
print("Done counting train set")
X_test_bi_counted = count(X_test, all_bigr, 'bigram')
print("Done counting test set")

print("Binarizing and dumping files")
# NOTE: ``bin`` shadows the builtin; the name is kept in case code beyond
# this section refers to it.
bin = Binarizer()
X_train_bi_binary = bin.fit_transform(X_train_bi_counted)
X_test_bi_binary = bin.transform(X_test_bi_counted)
# ``with`` closes (and flushes) each pickle file; the original passed a bare
# ``open(...)`` to ``pickle.dump`` and never closed it.
with open("X_train_bi_binary.p", "wb") as out_file:
    pickle.dump(X_train_bi_binary, out_file)
with open("X_test_bi_binary.p", "wb") as out_file:
    pickle.dump(X_test_bi_binary, out_file)
print("Done")

print("Starting tfidf vectors...")
X_train_bi_tfidf, X_test_bi_tfidf = tfidf(X_train_bi_counted, X_test_bi_counted)
with open("X_train_bi_tfidf.p", "wb") as out_file:
    pickle.dump(X_train_bi_tfidf, out_file)
with open("X_test_bi_tfidf.p", "wb") as out_file:
    pickle.dump(X_test_bi_tfidf, out_file)
print("Done")

print("Starting feature selection using CART random forests on binary files")
开发者ID:MariaBarrett,项目名称:LPIIExam,代码行数:31,代码来源:ngram.py
示例15: print
# The test matrix gets the same number of columns as the training matrix X.
_, n_features = X.get_shape()

print('Loading test data...')
with open('data/test-svmlight.dat') as infile:
    lines = infile.readlines()

n_samples = len(lines)
test = lil_matrix((n_samples, n_features))
for n, line in enumerate(lines):
    # Each whitespace-separated token is parsed as "<feature id>:<count>".
    for word_count in line.split():
        fid, count = word_count.split(':')
        # Store the count. The original wrote ``int(fid)`` here, so every
        # cell held its own column index instead of the parsed value.
        test[n, int(fid)] = int(count)
test = test.tocsr()

if opts.binarize:
    print('Binarizing the data...')
    binar = Binarizer(copy=False)
    X = binar.transform(X)
    test = binar.transform(test)

if opts.tfidf:
    print('Transforming word occurrences into TF-IDF...')
    tranny = TfidfTransformer()
    X = tranny.fit_transform(X)
    test = tranny.transform(test)
if opts.select_features:
k_features = int(opts.k_features)
if opts.select_features == 'k-best':
print('Selecting %i best features...' % k_features)
ch2 = SelectKBest(chi2, k=k_features)
if opts.select_features == 'pct':
开发者ID:Androidized,项目名称:BabysFirstTextClassifier,代码行数:31,代码来源:extract.py
示例16: load
def load(opt='custom', x_filename=None, y_filename=None, n_samples=0,
samples_on='rows', **kwargs):
"""Load a specified dataset.
This function can be used either to load one of the standard scikit-learn
datasets or a different dataset saved as X.npy Y.npy in the working
directory.
Parameters
-----------
opt : {'iris', 'digits', 'diabetes', 'boston', 'circles', 'moons',
'custom', 'GSEXXXXX'}, default: 'custom'
Name of a predefined dataset to be loaded. 'iris', 'digits', 'diabetes'
'boston', 'circles' and 'moons' refer to the correspondent
`scikit-learn` datasets. 'custom' can be used to load a custom dataset
which name is specified in `x_filename` and `y_filename` (optional).
x_filename : string, default : None
The data matrix file name.
y_filename : string, default : None
The label vector file name.
n_samples : int
The number of samples to be loaded. This comes handy when dealing with
large datasets. When n_samples is less than the actual size of the
dataset this function performs a random subsampling that is stratified
w.r.t. the labels (if provided).
samples_on : string
This can be either in ['row', 'rows'] if the samples lie on the row of
the input data matrix, or viceversa in ['col', 'cols'] the other way
around.
data_sep : string
The data separator. For instance comma, tab, blank space, etc.
Returns
-----------
X : array of float, shape : n_samples x n_features
The input data matrix.
y : array of float, shape : n_samples
The label vector; np.nan if missing.
feature_names : array of integers (or strings), shape : n_features
The feature names; a range of number if missing.
index : list of integers (or strings)
This is the samples identifier, if provided as first column (or row) of
of the input file. Otherwise it is just an incremental range of size
n_samples.
"""
data = None
try:
if opt.lower() == 'iris':
data = datasets.load_iris()
elif opt.lower() == 'digits':
data = datasets.load_digits()
elif opt.lower() == 'diabetes':
data = datasets.load_diabetes()
b = Binarizer(threshold=np.mean(data.target))
data.target = b.fit_transform(data.data)
elif opt.lower() == 'boston':
data = datasets.load_boston()
b = Binarizer(threshold=np.mean(data.target))
data.target = b.fit_transform(data.data)
elif opt.lower() == 'gauss':
means = np.array([[-1, 1, 1, 1], [0, -1, 0, 0], [1, 1, -1, -1]])
sigmas = np.array([0.33, 0.33, 0.33])
if n_samples <= 1:
n_samples = 333
xx, yy = generate_gauss(mu=means, std=sigmas, n_sample=n_samples)
data = datasets.base.Bunch(data=xx, target=yy)
elif opt.lower() == 'circles':
if n_samples == 0:
n_samples = 400
xx, yy = datasets.make_circles(n_samples=n_samples, factor=.3,
noise=.05)
data = datasets.base.Bunch(data=xx, target=yy)
elif opt.lower() == 'moons':
if n_samples == 0:
n_samples = 400
xx, yy = datasets.make_moons(n_samples=n_samples, noise=.01)
data = datasets.base.Bunch(data=xx, target=yy)
elif opt.lower() == 'custom':
data = load_custom(x_filename, y_filename, samples_on, **kwargs)
elif opt.lower().startswith('gse'):
raise Exception("Use ade_GEO2csv.py to convert GEO DataSets"
"into csv files.")
except IOError as e:
print("I/O error({0}): {1}".format(e.errno, e.strerror))
X, y = data.data, data.target
if n_samples > 0 and X.shape[0] > n_samples:
if y is not None:
try: # Legacy for sklearn
sss = StratifiedShuffleSplit(y, test_size=n_samples, n_iter=1)
# idx = np.random.permutation(X.shape[0])[:n_samples]
except TypeError:
#.........这里部分代码省略.........
开发者ID:slipguru,项目名称:adenine,代码行数:101,代码来源:data_source.py
示例17: Binarizer
from Models import InteractionFeatures, Model, Bounder, RemoveDuplicateCols, ReturnSame, f1, lad
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
#%%
# Work from the configured workspace and set up the 'alllog' logger.
os.chdir(workspace)
logging.config.fileConfig('loggerConfig.properties')
logger = logging.getLogger('alllog')
logger.debug("Starting...")

binarizer = Binarizer(copy=True, threshold=thresh)


def _interaction(method):
    # One InteractionFeatures step; the steps differ only in the pairwise
    # operation applied to two columns.
    return InteractionFeatures(method=method, threshold=corr_thresh,
                               subsample=1, logger=logger)


# Union of pairwise interaction features: sum, difference, product and both
# quotients.
featureunion1 = FeatureUnion([
    #('duplicater',ReturnSame()),
    ('if+', _interaction(lambda x, y: (x + y))),
    ('if-', _interaction(lambda x, y: (x - y))),
    ('if*', _interaction(lambda x, y: (x * y))),
    ('if/', _interaction(lambda x, y: (x / y))),
    ('if|', _interaction(lambda x, y: (y / x))),
])

# Preprocessing: drop duplicate columns, add interaction features, then
# apply the Bounder step.
pp_pipeline = Pipeline([
    ('removedupes', RemoveDuplicateCols(logger=logger)),
    ('featureextraction', featureunion1),
    ('bounder', Bounder(inf, -inf)),
])
开发者ID:vpatanjali,项目名称:Python,代码行数:31,代码来源:model_refactored.py
示例18: test_binarizer
def test_binarizer():
    """Check Binarizer counts, sparsity preservation and copy semantics.

    Runs for dense arrays, plain lists and CSR / CSC matrices, then checks a
    negative threshold on dense input and that it raises on sparse input.
    """
    def check_counts(X_out, n_zeros, n_ones):
        # Number of entries equal to 0 and to 1 in the binarized output.
        assert_equal(np.sum(X_out == 0), n_zeros)
        assert_equal(np.sum(X_out == 1), n_ones)

    X_ = np.array([[1, 0, 5], [2, 3, -1]])

    for init in (np.array, list, sparse.csr_matrix, sparse.csc_matrix):
        X = init(X_.copy())

        # Explicit threshold: only 5 and 3 exceed 2.0.
        binarizer = Binarizer(threshold=2.0, copy=True)
        X_bin = toarray(binarizer.transform(X))
        check_counts(X_bin, 4, 2)
        X_bin = binarizer.transform(X)
        assert_equal(sparse.issparse(X), sparse.issparse(X_bin))

        # Default threshold after an explicit fit.
        binarizer = Binarizer(copy=True).fit(X)
        X_bin = toarray(binarizer.transform(X))
        assert_true(X_bin is not X)
        check_counts(X_bin, 2, 4)

        # copy=True must hand back a new object.
        binarizer = Binarizer(copy=True)
        X_bin = binarizer.transform(X)
        assert_true(X_bin is not X)
        X_bin = toarray(X_bin)
        check_counts(X_bin, 2, 4)

        # copy=False returns the input object itself, except for list input.
        # Kept last because it overwrites X.
        binarizer = Binarizer(copy=False)
        X_bin = binarizer.transform(X)
        if init is not list:
            assert_true(X_bin is X)
        X_bin = toarray(X_bin)
        check_counts(X_bin, 2, 4)

    # Negative threshold on dense input: only -1 stays at 0.
    binarizer = Binarizer(threshold=-0.5, copy=True)
    for init in (np.array, list):
        X = init(X_.copy())

        X_bin = toarray(binarizer.transform(X))
        check_counts(X_bin, 1, 5)
        X_bin = binarizer.transform(X)

    # Cannot use threshold < 0 for sparse
    assert_raises(ValueError, binarizer.transform, sparse.csc_matrix(X))
开发者ID:abouaziz,项目名称:scikit-learn,代码行数:46,代码来源:test_preprocessing.py
示例19: Binarizer
'LR': LogisticRegression,
'LSVC' : LinearSVC,
'SVC' : SVC
}
#%%
os.chdir(workspace)


def _load_clean(path):
    # Load a .npy file and pass it through numpy.nan_to_num.
    return numpy.nan_to_num(numpy.load(path))


# Independent variables (features) of the development and validation sets.
dev_idvs_all = _load_clean(dev_filename + ".npy")
val_idvs_all = _load_clean(val_filename + ".npy")

# Dependent variables (targets) of the same two sets.
dev_dvs = _load_clean(dev_filename + "_dvs.npy")
val_dvs = _load_clean(val_filename + "_dvs.npy")

binarizer = Binarizer(copy=True, threshold=thresh)
imputer = Imputer(copy = False)

# Binarize the targets and flatten each to one value per row.
dev_rows = dev_dvs.shape[0]
val_rows = val_dvs.shape[0]
dev_dvs_binary = binarizer.transform(dev_dvs).reshape((dev_rows,))
val_dvs_binary = binarizer.transform(val_dvs).reshape((val_rows,))
"""
from statsmodels.regression import quantile_regression
dev_idvs2 = dev_idvs[:10000,:]
inds = [i for i in xrange(dev_idvs2.shape[1]) if len(unique(dev_idvs2[:,i])) > 1]
dev_dvs2 = dev_dvs[:10000,:].reshape((10000,))
model = quantile_regression.QuantReg(dev_dvs2, dev_idvs2)
model.fit()
"""
开发者ID:vpatanjali,项目名称:Python,代码行数:31,代码来源:model.py
示例20: Binarizer
class Binarizer(TransformerMixin):
"""
Реализует различные стратегии бинаризации признаков,
вычисляя оптимальные пороги и производя бинаризацию с данными порогами
Аргументы:
----------
method: str('random', 'log_odds' or 'bns'), метод бинаризации признаков
divide_to_bins: bool(optional, default=True),
индикатор приведения количественных признаков к целочисленным
bins_number: int(optional, default=10),
число возможных значений целочисленных признаков при бинаризации
"""
_UNSUPERVISED_METHODS = ['random']
_SUPERVISED_METHODS = ['log_odds', 'bns']
_CONTINGENCY_METHODS = ['log_odds', 'bns']
def __init__(self, method, divide_to_bins=True, bins_number=10):
    # Store the constructor arguments unchanged. ``method`` is validated
    # later, in ``fit``.
    self.method = method
    self.divide_to_bins = divide_to_bins
    self.bins_number = bins_number
def fit(self, X, y=None):
    """
    Fit the binarizer: choose one binarization threshold per feature.

    Parameters
    ----------
    X : array-like or sparse matrix; validated with ``check_array`` and,
        when sparse, converted to CSC.
    y : labels, required when ``self.method`` is a supervised method.

    Returns
    -------
    self, with ``joint_thresholds_``, ``joint_scores_`` and
    ``binarize_transformer_`` set (and ``classes_`` for supervised methods).

    Raises
    ------
    ValueError
        If ``self.method`` is unknown, if ``y`` is missing for a supervised
        method, or if ``X`` and ``y`` have different numbers of samples.
    """
    methods = Binarizer._UNSUPERVISED_METHODS + Binarizer._SUPERVISED_METHODS
    if self.method not in methods:
        raise ValueError("Method should be one of {0}".format(", ".join(methods)))
    X = check_array(X, accept_sparse=['csr', 'csc'])
    if issparse(X):
        X = X.tocsc()
    if self.method in Binarizer._UNSUPERVISED_METHODS:
        self._fit_unsupervised(X)
        self.joint_thresholds_ = self.thresholds_
        self.joint_scores_ = self.scores_
    else:
        if y is None:
            raise ValueError("y must not be None for supervised binarizers.")
        # Turn the labels into a per-class indicator matrix.
        label_binarizer = SK_LabelBinarizer()
        Y_new = label_binarizer.fit_transform(y)
        self.classes_ = label_binarizer.classes_
        if X.shape[0] != Y_new.shape[0]:
            raise ValueError("X and y have incompatible shapes.\n"
                             "X has %s samples, but y has %s." %
                             (X.shape[0], Y_new.shape[0]))
        self._fit_supervised(X, Y_new)
        if len(self.classes_) <= 2:
            # At most two classes: use the first (only) column of thresholds.
            self.joint_thresholds_ = self.thresholds_[:, 0]
            self.joint_scores_ = self.scores_[:, 0]
        else:
            # Min-max normalise the scores within each class column so that
            # classes are comparable; constant columns get a divisor of 1.
            min_class_scores = np.min(self.scores_, axis=0)
            max_class_scores = np.max(self.scores_, axis=0)
            diffs = max_class_scores - min_class_scores
            diffs[diffs == 0] = 1
            normalized_scores = (self.scores_ - min_class_scores) / diffs
            # For every feature find the class for which it is most useful.
            # TODO: this could probably be done differently.
            optimal_indexes = np.argmax(normalized_scores, axis=1)
            nfeat = self.thresholds_.shape[0]
            # As the threshold of each feature take the value computed for
            # the class where that feature is most useful.
            self.joint_thresholds_ = self.thresholds_[np.arange(nfeat), optimal_indexes]
            self.joint_scores_ = self.scores_[np.arange(nfeat), optimal_indexes]
    # Hand the thresholds over to the wrapped SK_Binarizer. The argument is
    # passed by keyword because recent scikit-learn releases make the
    # constructor parameters keyword-only.
    self.binarize_transformer_ = SK_Binarizer(threshold=self.joint_thresholds_)
    return self
def transform(self, X):
    """
    Apply the fitted binarizer to X.

    Delegates to the transformer stored in ``binarize_transformer_``; raises
    ValueError when that attribute has not been set.
    """
    print("Transforming binarizer...")
    if not hasattr(self, 'binarize_transformer_'):
        raise ValueError("Transformer is not fitted")
    return self.binarize_transformer_.transform(X)
def _fit_unsupervised(self, X):
"""
Управляющая функция для методов подбора порога без учителя
"""
if self.method == 'random':
# случайные пороги и полезности
if issparse(X):
minimums = X.min(axis=0).toarray()
maximums = X.max(axis=0).toarray()
else:
minimums = np.min(X, axis=0)
#.........这里部分代码省略.........
开发者ID:AlexeySorokin,项目名称:pyparadigm,代码行数:101,代码来源:feature_selector.py
注：本文中的sklearn.preprocessing.Binarizer类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论