This article compiles typical usage examples of the Python class sklearn.preprocessing.StandardScaler, which standardizes each feature column by removing its mean and scaling it to unit variance (z = (x - mean) / std). If you have been wondering what exactly the StandardScaler class does, how to use it, or what real-world usage looks like, the curated class examples here may help.
Below are 20 code examples of the StandardScaler class, sorted by popularity by default. You can upvote the examples you like or find useful; your feedback helps our system recommend better Python code examples.
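Before the project excerpts, here is a minimal self-contained sketch (not taken from any of the projects below) of the fit/transform pattern that every example relies on:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 10.0],
              [2.0, 20.0],
              [3.0, 30.0]])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)   # learn per-column mean/std, then scale
print(scaler.mean_)                  # per-column means: [ 2. 20.]
print(scaler.scale_)                 # per-column standard deviations
print(X_scaled.mean(axis=0))         # ~[0. 0.] after standardization

The fitted statistics live on the estimator (mean_, scale_), which is why the projects below can fit once on training data and then reuse transform on whatever arrives later.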
Example 1: check_transformer_pickle
def check_transformer_pickle(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # fit
    if name in CROSS_DECOMPOSITION:
        random_state = np.random.RandomState(seed=12345)
        y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)
    pickled_transformer = pickle.dumps(transformer)
    unpickled_transformer = pickle.loads(pickled_transformer)
    pickled_X_pred = unpickled_transformer.transform(X)

    assert_array_almost_equal(pickled_X_pred, X_pred)
Developer: AlexMarshall011, Project: scikit-learn, Lines: 29, Source: estimator_checks.py
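Example 1 exercises this guarantee generically for all transformers; for StandardScaler specifically it means a fitted scaler survives a pickle round-trip with its learned statistics intact. A minimal sketch with assumed toy data:

import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.random.RandomState(0).randn(30, 3)
scaler = StandardScaler().fit(X)
restored = pickle.loads(pickle.dumps(scaler))
# the restored scaler carries the same mean_/scale_ and transforms identically
assert np.allclose(scaler.transform(X), restored.transform(X))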
Example 2: check_classifiers_classes
def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))
Developer: AlexMarshall011, Project: scikit-learn, Lines: 33, Source: estimator_checks.py
Example 3: clustering_approach
def clustering_approach(self):
    '''
    Cluster user data using various clustering algos
    IN: self.df_full and self.labels
    OUT: results to stdout
    '''
    print('Fitting clustering model')
    X = self.df_full.values
    y = self.labels

    # scale data
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # KMeans
    km_clf = KMeans(n_clusters=2, n_jobs=6)
    km_clf.fit(X)

    # swap labels as super-users are in cluster 0 (messy!!)
    temp = y.apply(lambda x: 0 if x == 1 else 1)
    print('\nKMeans clustering: ')
    self.analyse_preds(temp, km_clf.labels_)

    # Agglomerative clustering
    print('\nAgglomerative clustering approach: ')
    ac_clf = AgglomerativeClustering()
    ac_labels = ac_clf.fit_predict(X)
    self.analyse_preds(y, ac_labels)

    return None
Developer: wvanamstel, Project: project, Lines: 30, Source: gitproject.py
Example 4: buildCoordinationTreeRegressor
def buildCoordinationTreeRegressor(predictorColumns, element, coordinationDir='coordination/', md=None):
    """
    Build a coordination predictor for a given element from compositional structure data
    of structures containing that element. Will return a model trained on all data,
    a mean_absolute_error score, and a table of true vs. predicted values.
    """
    try:
        df = pd.read_csv(coordinationDir + element + '.csv')
    except Exception:
        print('No data for ' + element)
        return None, None, None
    df = df.dropna()
    if 'fracNobleGas' in df.columns:
        df = df[df['fracNobleGas'] <= 0]

    if len(df) < 4:
        print('Not enough data for ' + element)
        return None, None, None

    s = StandardScaler()
    X = s.fit_transform(df[predictorColumns].astype('float64'))
    y = df['avgCoordination'].values

    rfr = RandomForestRegressor(max_depth=md)
    acc = mean(cross_val_score(rfr, X, y, scoring=make_scorer(mean_absolute_error)))

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfr.fit(X_train, y_train)
    y_predict = rfr.predict(X_test)
    t = pd.DataFrame({'True': y_test, 'Predicted': y_predict})

    rfr.fit(X, y)

    return rfr, t, round(acc, 2)
Developer: rhsimplex, Project: matprojgeom, Lines: 33, Source: modelbuilder.py
Example 5: train_and_test
def train_and_test(train_books, test_books, train, scale=True):
    X_train, y_train, cands_train, features = get_pair_data(train_books, True)
    X_test, y_test, cands_test, features = get_pair_data(test_books)

    scaler = None
    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    print(sum(y_train) * 0.1 / len(y_train))
    print('Start training')
    print(X_train.shape)
    clf = train(X_train, y_train)
    print('Done training')

    y_train_pred = clf.predict(X_train)
    y_test_pred = clf.predict(X_test)
    '''
    # print performance for training books
    print "--------------Traning data-------------"
    train_perf = evaluate_books(clf, train_books, scaler, evaluate_pair)
    # print performance for testing books
    print "\n"
    print "--------------Testing data-------------"
    test_perf = evaluate_books(clf, test_books, scaler, evaluate_pair)
    '''
    print('Train Non-unique Precision:', precision(y_train_pred, y_train), 'Non-unique Recall:', recall(y_train_pred, y_train))
    print('Test Non-unique Precision:', precision(y_test_pred, y_test), 'Recall:', recall(y_test_pred, y_test))

    return clf, scaler, X_train, y_train, X_test, y_test
Developer: TheSumitGogia, Project: chara-extractor, Lines: 31, Source: train_pair.py
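Example 5 shows the canonical leakage-safe pattern: fit_transform on the training set, plain transform on the test set, so test data never influences the scaling statistics. A Pipeline packages the same discipline into a single estimator; a minimal sketch with stand-in data and a placeholder classifier, neither taken from the project above:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)          # scaler statistics come from X_train only
print(pipe.score(X_test, y_test))   # X_test is scaled with the training statistics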
Example 6: load_train_data
def load_train_data(path):
    print("Loading Train Data")
    df = pd.read_csv(path)

    # Remove line below to run locally - be careful, you need more than 8GB RAM
    rows = np.random.choice(df.index.values, 40000)
    df = df.loc[rows]  # was df.ix[rows]; .ix has been removed from pandas
    # df = df.sample(n=40000)
    # df = df.loc[df.index]

    labels = df.target

    df = df.drop('target', axis=1)
    df = df.drop('ID', axis=1)

    # Junk cols - some feature engineering needed here
    df = df.fillna(-1)

    X = df.values.copy()

    np.random.shuffle(X)

    X = X.astype(np.float32)
    encoder = LabelEncoder()
    y = encoder.fit_transform(labels).astype(np.int32)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    return X, y, encoder, scaler
Developer: ChiuYeeLau, Project: KaggleSpringleafMarketingResponse, Lines: 29, Source: Neural_Network.py
Example 7: knn
def knn(x_train, y_train, x_valid):
    x_train = np.log(x_train + 1)
    x_valid = np.log(x_valid + 1)

    where_are_nan = np.isnan(x_train)
    where_are_inf = np.isinf(x_train)
    x_train[where_are_nan] = 0
    x_train[where_are_inf] = 0
    where_are_nan = np.isnan(x_valid)
    where_are_inf = np.isinf(x_valid)
    x_valid[where_are_nan] = 0
    x_valid[where_are_inf] = 0

    scale = StandardScaler()
    scale.fit(x_train)
    x_train = scale.transform(x_train)
    x_valid = scale.transform(x_valid)

    #pca = PCA(n_components=10)
    #pca.fit(x_train)
    #x_train = pca.transform(x_train)
    #x_valid = pca.transform(x_valid)

    kneighbors = KNeighborsClassifier(n_neighbors=200, n_jobs=-1)
    knn_train, knn_test = stacking(kneighbors, x_train, y_train, x_valid, "knn")
    return knn_train, knn_test, "knn"
Developer: bifeng, Project: Rental-Listing-Inquiries, Lines: 26, Source: stacking_util_scale_magic_add.py
Example 8: normalize
def normalize(training_data, test_data):
    scaler = StandardScaler()
    values = scaler.fit_transform(training_data)
    training_data = pd.DataFrame(values, columns=training_data.columns, index=training_data.index)
    values = scaler.transform(test_data)
    test_data = pd.DataFrame(values, columns=test_data.columns, index=test_data.index)
    return training_data, test_data
Developer: divijbindlish, Project: quantify, Lines: 7, Source: preprocessing.py
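Example 8 rebuilds the DataFrames by hand because fit_transform returns a bare NumPy array and drops column names and index. On scikit-learn 1.2 or newer, the set_output API gives the same result directly; a sketch assuming that version:

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 30.0]})
scaler = StandardScaler().set_output(transform="pandas")
scaled = scaler.fit_transform(df)   # a DataFrame with the original columns and index
print(type(scaled).__name__)        # DataFrame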
Example 9: run_model
def run_model(model, model_name, X, Y, X_val):
    new_values = [[x] for x in range(len(X))]
    X = numpy.append(X, new_values, 1)
    from sklearn.preprocessing import StandardScaler  # I have a suspicion that the classifier might work better without the scaler
    scaler = StandardScaler().fit(X)
    X = scaler.transform(X)
    max_time_val = X[-1][-1] * 2 - X[-2][-1]
    Y = make_black_maps_class(Y)

    # Load validation data
    model.fit(X, Y)
    new_values = [[max_time_val] for x in range(len(X_val))]
    X_val = numpy.append(X_val, new_values, 1)

    # Now predict validation output
    Y_pred = model.predict(X_val)

    # Crop impossible values
    Y_pred[Y_pred < 0] = 0
    Y_pred[Y_pred > 600] = 600

    savetxt('final_pred_y{0}.csv'.format(model_name), Y_pred, delimiter=',')

    black_map_count = 0
    for y in Y_pred:
        if y == 600:
            black_map_count += 1
    print(black_map_count, model_name)
    sys.stdout.flush()
Developer: danielrich, Project: utahdatacomp, Lines: 32, Source: black_map_trim_history.py
Example 10: load_data_csv_advanced
def load_data_csv_advanced(datafile):
    """
    Loads data from the given CSV file. The first line in the CSV file is expected to be the column names.

    :param datafile: path of the file
    :return: a NumPy array containing a data point in each row
    """
    # Column-name constants for the CSV file. For example, setting _COLUMN_X to 'x' means that the x coordinates
    # of the geographical locations will be in the column named 'x' in the CSV file.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'

    data = pd.read_csv(datafile)

    # Normalize the coordinate columns
    scaler = StandardScaler()
    scaler.fit(data[[_COLUMN_X, _COLUMN_Y]])
    data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]])

    # Get feature vector names by removing "x" and "y"
    feature_vector_names = data.columns.difference([_COLUMN_X, _COLUMN_Y])
    data_coords = data[[_COLUMN_X, _COLUMN_Y]].values
    result = {"coordinates": data_coords}

    for feature in feature_vector_names:
        data_words = [[e.strip() for e in venue_data.split(",")]
                      for venue_data in data[feature].values.flatten().tolist()]
        result[feature] = data_words

    return sparsify_data(result, None, None), scaler  # None for both params since SVD is not used
Developer: mmathioudakis, Project: geotopics, Lines: 31, Source: io.py
Example 11: lassoRegression
def lassoRegression(X, y):
    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Lasso Regression")
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)

    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    lassoRegression = Lasso(alpha=1e-7)
    lassoRegression.fit(scaled_Xp, y)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0, 2, 0.01)
    dummyX = dummyX.reshape((dummyX.shape[0], 1))
    dummyXp = polynomialFeatures.fit_transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = lassoRegression.predict(scaled_dummyXp)

    outputFILE = 'plot-lassoRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h=6.0, w=10.0)
    ax.axis([0, 2, 0, 15])
    ax.scatter(X, y, color="black", s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)  # savefig takes the file name positionally (fname)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
Developer: paradisepilot, Project: statistics, Lines: 34, Source: lassoRegression.py
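Scaling matters in Example 11 because a degree-40 polynomial expansion produces columns of wildly different magnitudes, and Lasso's L1 penalty only compares coefficients fairly when the columns share a scale. Note also how the plotting grid is passed through myScaler.transform, not fit_transform. A Pipeline can carry that bookkeeping automatically; a sketch on assumed toy data with a smaller degree:

import numpy as np
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

rng = np.random.RandomState(0)
X = rng.uniform(0, 2, size=(50, 1))
y = np.sin(3 * X).ravel() + rng.normal(scale=0.1, size=50)

model = make_pipeline(PolynomialFeatures(degree=10, include_bias=False),
                      StandardScaler(),
                      Lasso(alpha=1e-3, max_iter=50000))
model.fit(X, y)
grid = np.linspace(0, 2, 200).reshape(-1, 1)
y_grid = model.predict(grid)  # the grid passes through the same fitted expansion and scaler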
Example 12: load_data_csv
def load_data_csv(datafile):
    """
    Loads data from the given CSV file. The first line in the CSV file is expected to be the column names.

    :param datafile: path of the file
    :return: a NumPy array containing a data point in each row
    """
    # Column-name constants for the CSV file. For example, setting _COLUMN_X to 'x' means that the x coordinates
    # of the geographical locations will be in the column named 'x' in the CSV file.
    # This will be useful later when we start adding more features.
    _COLUMN_X = 'x'
    _COLUMN_Y = 'y'
    _COLUMN_W = 'color'

    data = pd.read_csv(datafile)

    # Normalize the coordinate columns
    scaler = StandardScaler()
    scaler.fit(data[[_COLUMN_X, _COLUMN_Y]])
    data[[_COLUMN_X, _COLUMN_Y]] = scaler.transform(data[[_COLUMN_X, _COLUMN_Y]])

    data_coords = data[[_COLUMN_X, _COLUMN_Y]].values
    data_words = [[e] for e in data[[_COLUMN_W]].values.flatten().tolist()]
    data = {"coordinates": data_coords, "words": data_words}

    return sparsify_data(data, None, None), scaler  # None for both params since SVD is not used
Developer: mmathioudakis, Project: geotopics, Lines: 27, Source: io.py
Example 13: prepare_features
def prepare_features(data, enc=None, scaler=None):
    '''
    One-hot encode all boolean/string (categorical) features,
    and shift/scale integer/float features
    '''
    # X needs to contain only non-negative integers
    bfs = data['bfeatures'] + 1
    sfs = data['sfeatures'] + 1

    # Shift/scale integer and float features to have mean=0, std=1
    ifs = data['ifeatures']
    ffs = data['ffeatures']
    x2 = np.hstack((ifs, ffs))
    if scaler is None:
        scaler = StandardScaler()
        x2 = scaler.fit_transform(x2)
        print("Training features have mean: %s" % scaler.mean_)
        print("and standard deviation: %s" % scaler.scale_)  # scale_ was called std_ in very old scikit-learn
    else:
        x2 = scaler.transform(x2, copy=False)

    # One-hot encode categorical features.
    # Note: n_values/categorical_features were removed from OneHotEncoder in scikit-learn 0.22;
    # modern code would use a ColumnTransformer instead (see the sketch after this example).
    X = np.hstack((bfs, sfs, x2))
    categorical = np.arange(bfs.shape[1] + sfs.shape[1])
    if enc is None:
        enc = OneHotEncoder(n_values='auto', categorical_features=categorical)
        X = enc.fit_transform(X)
        print("One-hot encoded features have dimension %d" % X.shape[1])
    else:
        X = enc.transform(X)

    return X, enc, scaler
Developer: timpalpant, Project: KaggleTSTextClassification, Lines: 31, Source: predict.6.py
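Example 13 scales only the numeric block and one-hot encodes the categorical block, then stitches the pieces together with np.hstack. The modern equivalent expresses that split declaratively with ColumnTransformer; a sketch with a hypothetical column layout:

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# hypothetical layout: columns 0-1 are categorical codes, columns 2-3 are numeric
X = np.array([[0, 1, 3.5, 10.0],
              [1, 0, 1.2, 20.0],
              [2, 1, 7.8, 15.0]])
pre = ColumnTransformer([
    ("onehot", OneHotEncoder(handle_unknown="ignore"), [0, 1]),
    ("scale", StandardScaler(), [2, 3]),
])
X_enc = pre.fit_transform(X)
print(X_enc.shape)  # one-hot columns followed by the two scaled numeric columns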
Example 14: cross_valid
def cross_valid(data, classifier, x_cols, y_col, **kwargs):
    # Do train-test split for cross-validation
    size = len(data)
    kf = train_test_split(size)
    y_pred = np.zeros(size)
    y_pred_prob = np.zeros(size)
    y = data[y_col].to_numpy().astype(float)  # .as_matrix() was removed from pandas; .to_numpy() is the replacement
    totaltime_train = 0
    totaltime_test = 0
    for train_index, test_index in kf:
        # Fill in missing values
        df = data.copy()
        df = fill_missing_median(df, train_index)
        # Transform and normalize
        X = df[x_cols].to_numpy().astype(float)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        # Build classifier and yield predictions
        y_pred[test_index], y_pred_prob[test_index], train_time, test_time \
            = model(X, y, train_index, test_index, classifier, **kwargs)
        totaltime_train += train_time
        totaltime_test += test_time
    avgtime_train = totaltime_train / len(kf)  # average over all folds, not just the last fold
    avgtime_test = totaltime_test / len(kf)
    return y, y_pred, y_pred_prob, avgtime_train, avgtime_test
Developer: alicetang0618, Project: Project-NFP, Lines: 25, Source: xiaorui.py
Example 15: linregress
def linregress(X_train, X_test, y_train, y_test):
    coef = []
    for col in X_train.columns.tolist():
        # StandardScaler expects 2-D input, so reshape the single column before scaling
        X = StandardScaler().fit_transform(X_train[col].values.reshape(-1, 1))
        lr = LinearRegression()
        lr.fit(X, y_train)
        coef.append([col, lr.coef_])
    coef = sorted(coef, key=lambda x: x[1])[::-1]
    nos = [x[1] for x in coef]
    labs = [x[0] for x in coef]
    for lab in labs:
        if lab == 'doubles':
            labs[labs.index(lab)] = '2B'
        elif lab == 'triples':
            labs[labs.index(lab)] = '3B'
        elif lab == 'Intercept':
            idx = labs.index('Intercept')
            labs.pop(idx)
            nos.pop(idx)
    labs = [lab.upper() for lab in labs]
    x = range(len(nos))
    plt.plot(x, nos, lw=2, c='b')
    plt.xticks(x, labs)
    plt.title('Linear Regression Coefficients (Win Percentage)')
    plt.savefig('images/coefficients.png')
    plt.show()
    print(labs)
Developer: blemi4, Project: p2-baseball, Lines: 27, Source: baseball.py
Example 16: Classifier
class Classifier(BaseEstimator):

    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.scaler = StandardScaler()
        self.clf = None

    def fit(self, X, y):
        X = self.scaler.fit_transform(X.astype(np.float32))
        y = self.label_encoder.fit_transform(y).astype(np.int32)
        dtrain = xgb.DMatrix(X, label=y.astype(np.float32))
        param = {'objective': 'multi:softprob', 'eval_metric': 'mlogloss'}
        param['nthread'] = 4
        param['num_class'] = 9
        param['colsample_bytree'] = 0.55
        param['subsample'] = 0.85
        param['gamma'] = 0.95
        param['min_child_weight'] = 3.0
        param['eta'] = 0.05
        param['max_depth'] = 12
        num_round = 400  # to be faster ??
        #num_round = 820
        self.clf = xgb.train(param, dtrain, num_round)

    def predict(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        label_index_array = np.argmax(self.clf.predict(dtest), axis=1)
        return self.label_encoder.inverse_transform(label_index_array)

    def predict_proba(self, X):
        X = self.scaler.transform(X.astype(np.float32))
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
Developer: thomasschmitt, Project: otto, Lines: 35, Source: classifier.py
Example 17: test_transformers_data_not_an_array
def test_transformers_data_not_an_array():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1

    for name, Transformer in transformers:
        # XXX: some transformers are transforming the input
        # data. This is a bug that we'll fix later. Right now we copy
        # the data each time
        this_X = NotAnArray(X.copy())
        this_y = NotAnArray(np.asarray(y))
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        # and these want multivariate output
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            continue
        yield check_transformer, name, Transformer, this_X, this_y
Developer: akashaio, Project: scikit-learn, Lines: 26, Source: test_common.py
Example 18: test_scaler_1d
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
Developer: MarkyV, Project: scikit-learn, Lines: 25, Source: test_preprocessing.py
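Example 18 dates from a scikit-learn version that still accepted 1-D input; current releases raise a ValueError for it. To scale a single feature today, reshape it into a column first:

import numpy as np
from sklearn.preprocessing import StandardScaler

x = np.array([0., 1., 2., 0.4, 1.])
x_scaled = StandardScaler().fit_transform(x.reshape(-1, 1)).ravel()
print(x_scaled.mean(), x_scaled.std())  # ~0.0 and 1.0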
Example 19: main
def main():
    t0 = time.time()  # start time
    # output file paths
    TRAINX_OUTPUT = "../../New_Features/train_x_processed.csv"
    TEST_X_OUTPUT = "../../New_Features/test__x_processed.csv"
    # input file paths
    TRAIN_FILE_X1 = "../../ML_final_project/sample_train_x.csv"
    TRAIN_FILE_X2 = "../../ML_final_project/log_train.csv"
    TEST__FILE_X1 = "../../ML_final_project/sample_test_x.csv"
    TEST__FILE_X2 = "../../ML_final_project/log_test.csv"
    # load files
    TRAIN_DATA_X1 = np.loadtxt(TRAIN_FILE_X1, delimiter=',', skiprows=1, usecols=(range(1, 18)))
    TEST__DATA_X1 = np.loadtxt(TEST__FILE_X1, delimiter=',', skiprows=1, usecols=(range(1, 18)))
    TRAIN_DATA_X2 = logFileTimeCount(np.loadtxt(TRAIN_FILE_X2, delimiter=',', skiprows=1, dtype=object))
    TEST__DATA_X2 = logFileTimeCount(np.loadtxt(TEST__FILE_X2, delimiter=',', skiprows=1, dtype=object))
    # combine features
    TRAIN_DATA_X0 = np.column_stack((TRAIN_DATA_X1, TRAIN_DATA_X2))
    TEST__DATA_X0 = np.column_stack((TEST__DATA_X1, TEST__DATA_X2))
    # data preprocessing: fit the scaler on training data, reuse it on test data
    scaler = StandardScaler()
    TRAIN_DATA_X = scaler.fit_transform(TRAIN_DATA_X0)
    TEST__DATA_X = scaler.transform(TEST__DATA_X0)
    # output processed files
    outputXFile(TRAINX_OUTPUT, TRAIN_DATA_X)
    outputXFile(TEST_X_OUTPUT, TEST__DATA_X)
    t1 = time.time()  # end time
    print("...This task costs " + str(t1 - t0) + " second.")
Developer: TeamSDJ, Project: ML_2015_Final, Lines: 30, Source: outputNewFeature.py
Example 20: main
def main(trainFile, testFile, outputFile, mode, classifier):
    """
    input:
        1. trainFile: the training data features file
        2. testFile: the test data file
        3. outputFile: the file where the output for the test data has to be written
        4. mode: 1 to also predict on the test data
        5. classifier: the classifier to be used
    """
    # scale the input data
    scaler = StandardScaler()
    trainingData = getData(trainFile)
    trainX = trainingData[0]
    trainY = trainingData[1]
    trainX = scaler.fit_transform(trainX)
    testX = []
    testY = []

    # train the classifier
    clf = trainClassifier(trainX, trainY, classifier, mode)

    # in test mode, get the test data and predict the output classes
    if mode == 1:
        testData = getData(testFile)
        testX = testData[0]
        testY = testData[1]
        testX = scaler.transform(testX)
        actY = test(testX, clf)
        testY = testY.reshape(len(testY), 1)
        # write the predicted class probabilities
        output = np.concatenate((testY, actY), axis=1)
        np.savetxt(outputFile, output, fmt='%s', delimiter=',')
Developer: hpam1, Project: Machine-Learning, Lines: 29, Source: classifier.py
Note: The sklearn.preprocessing.StandardScaler class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The code snippets are selected from open-source projects contributed by many developers, and copyright remains with the original authors; consult the corresponding project's license before distributing or reusing the code. Do not reproduce without permission.