本文整理汇总了Python中sklearn.preprocessing.LabelEncoder类的典型用法代码示例。如果您正苦于以下问题：Python LabelEncoder类的具体用法？Python LabelEncoder怎么用？Python LabelEncoder使用的例子？那么恭喜您，这里精选的类代码示例或许可以为您提供帮助。
在下文中一共展示了LabelEncoder类的20个代码示例，这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞，您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: train
def train(cls, X, y, word_sim_metric, classifier=LinearSVC,
          feature_num=10, feature_type='sim', verbose=True):
    """Fit a preprocessing/vectorizing/classifying pipeline and wrap it in ``cls``.

    ``classifier`` may be an estimator class (it is then instantiated without
    arguments) or an already constructed estimator. The targets ``y`` are
    integer-encoded with a ``LabelEncoder``; that encoder and the fitted
    pipeline are passed to ``cls`` to build the return value.
    """
    estimator = classifier() if isinstance(classifier, type) else classifier

    labels = LabelEncoder()
    encoded_targets = labels.fit_transform(y)

    @timeit
    def build():
        # The preprocessor is given the (sample, raw label) pairs, while the
        # pipeline itself is fitted against the encoded targets.
        steps = [
            ('preprocessor', TextPreprocessor(zip(X, y), word_sim_metric,
                                              feature_num, feature_type)),
            ('vectorizer', DictVectorizer()),
            ('classifier', estimator),
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(X, encoded_targets)
        return pipeline

    if verbose:
        print("Building the model")
    # The decorated builder is unpacked as (fitted model, elapsed seconds).
    model, secs = build()
    if verbose:
        print("Complete model building in {:0.3f} seconds".format(secs))
    return cls(labels, model)
开发者ID:gsi-upm,项目名称:sematch,代码行数:27,代码来源:application.py
示例2: load_otto_group
def load_otto_group():
    """
    Loads and returns several variables for the data set from Kaggle's Otto
    Group Product Classification competition.

    Link: https://www.kaggle.com/c/otto-group-product-classification-challenge

    Returns
    -------
    data : array-like
        Pandas data frame containing the entire data set.
    X : array-like
        Training input samples.
    y : array-like
        Target values.
    """
    file_location = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data', 'otto_group.zip')

    # Close both the archive and the member stream once the CSV is parsed.
    with ZipFile(file_location) as archive:
        with archive.open('train.csv') as handle:
            data = pd.read_csv(handle)
    data = data.set_index('id')

    # move the label to the first position
    cols = data.columns.tolist()
    cols = cols[-1:] + cols[0:-1]
    data = data[cols]

    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values

    # transform the labels from strings to integers
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    return data, X, y
开发者ID:jdwittenauer,项目名称:ionyx,代码行数:35,代码来源:datasets.py
示例3: buildTreeClassifier
def buildTreeClassifier(predictorColumns, structurestable='structures.csv', targetcolumn='pointGroup', md=None):
    """
    Build a random-forest classifier that predicts ``targetcolumn`` from
    ``predictorColumns`` of the CSV table ``structurestable``.

    Rows with missing values are dropped, and when a ``fracNobleGas`` column
    exists only rows where it is <= 0 are kept. Returns the model refit on all
    data, a confusion matrix (as a DataFrame labelled with the class names)
    computed on a train/test split, the mean cross-validation score rounded to
    two decimals, and the fitted label encoder.
    """
    table = pd.read_csv(structurestable).dropna()
    if 'fracNobleGas' in table.columns:
        table = table[table['fracNobleGas'] <= 0]

    scaler = StandardScaler()
    le = LabelEncoder()
    X = scaler.fit_transform(table[predictorColumns].astype('float64'))
    y = le.fit_transform(table[targetcolumn].values)

    rfc = RandomForestClassifier(max_depth=md)
    acc = mean(cross_val_score(rfc, X, y))

    # Hold-out evaluation used only for the confusion matrix.
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    rfc.fit(X_train, y_train)
    cm = pd.DataFrame(confusion_matrix(y_test, rfc.predict(X_test)),
                      columns=le.classes_, index=le.classes_)

    # The returned model is trained on the complete data set.
    rfc.fit(X, y)
    return rfc, cm, round(acc, 2), le
开发者ID:rhsimplex,项目名称:matprojgeom,代码行数:28,代码来源:modelbuilder.py
示例4: test_multiclass_classifier_class_weight
def test_multiclass_classifier_class_weight():
    """tests multiclass with classweights for each class"""
    alpha = .1
    n_samples = 20
    tol = .00001
    max_iter = 50
    class_weight = {0: .45, 1: .55, 2: .75}
    fit_intercept = True

    X, y = make_blobs(n_samples=n_samples, centers=3, random_state=0,
                      cluster_std=0.1)
    step_size = get_step_size(X, alpha, fit_intercept, classification=True)
    classes = np.unique(y)

    # Same estimator fitted once on dense and once on sparse input.
    clf1 = LogisticRegression(solver='sag', C=1. / alpha / n_samples,
                              max_iter=max_iter, tol=tol, random_state=77,
                              fit_intercept=fit_intercept,
                              class_weight=class_weight)
    clf2 = clone(clf1)
    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    # Expand the per-class weights into one weight per sample.
    class_weight_ = compute_class_weight(class_weight, np.unique(y), y)
    sample_weight = class_weight_[LabelEncoder().fit_transform(y)]

    # One-vs-rest reference fits: (weights, intercept) per class.
    dense_fits = []
    sparse_fits = []
    for cl in classes:
        y_encoded = np.where(y != cl, -1.0, 1.0)
        dense_fits.append(sag_sparse(X, y_encoded, step_size, alpha,
                                     n_iter=max_iter, dloss=log_dloss,
                                     sample_weight=sample_weight))
        sparse_fits.append(sag_sparse(X, y_encoded, step_size, alpha,
                                      n_iter=max_iter, dloss=log_dloss,
                                      sample_weight=sample_weight,
                                      sparse=True))

    coef1 = np.vstack([weights for weights, _ in dense_fits])
    intercept1 = np.array([bias for _, bias in dense_fits])
    coef2 = np.vstack([weights for weights, _ in sparse_fits])
    intercept2 = np.array([bias for _, bias in sparse_fits])

    for i in range(len(classes)):
        assert_array_almost_equal(clf1.coef_[i].ravel(),
                                  coef1[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf1.intercept_[i], intercept1[i], decimal=1)

        assert_array_almost_equal(clf2.coef_[i].ravel(),
                                  coef2[i].ravel(),
                                  decimal=2)
        assert_almost_equal(clf2.intercept_[i], intercept2[i], decimal=1)
开发者ID:AlexisMignon,项目名称:scikit-learn,代码行数:60,代码来源:test_sag.py
示例5: process_one_cell
def process_one_cell(df_cell_train, df_cell_test):
    """Train a KNN classifier on one grid cell and predict the top 3 places.

    Training rows whose ``place_id`` occurs fewer than 8 times in the cell
    are discarded. The ``x`` and ``y`` columns are rescaled (by 500 and
    1000) on private copies, so the caller's frames are left untouched.

    Returns
    -------
    pred_labels : array of shape (n_test_rows, 3)
        The three most probable ``place_id`` values per test row, most
        probable first.
    row_ids : index of ``df_cell_test``
    """
    # Working on df_train: keep only places seen at least 8 times.
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= 8).values
    df_cell_train = df_cell_train.loc[mask].copy()

    # Working on df_test: copy so the in-place scaling below does not leak
    # back into the caller's DataFrame.
    row_ids = df_cell_test.index
    df_cell_test = df_cell_test.copy()

    # Feature engineering on x and y
    for frame in (df_cell_train, df_cell_test):
        frame.loc[:, 'x'] *= 500.0
        frame.loc[:, 'y'] *= 1000.0

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values
    X_test = df_cell_test.values

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance,
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)

    # Column indices of the three highest probabilities, best first.
    top3 = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
    # inverse_transform is defined for 1-D input, so flatten and restore the
    # (n_rows, 3) shape afterwards.
    pred_labels = le.inverse_transform(top3.ravel()).reshape(top3.shape)
    return pred_labels, row_ids
开发者ID:kaustubh0mani,项目名称:Facebook-Predicting-Check-Ins,代码行数:30,代码来源:facebook.py
示例6: plot_model_decision_surface
def plot_model_decision_surface(clf, train_features, train_labels,
                                plot_step=0.02, cmap=plt.cm.RdYlBu,
                                markers=None, alphas=None, colors=None):
    """Plot the decision surface of ``clf`` over a two-feature data set.

    A clone of ``clf`` is fitted on the training data and evaluated on a
    mesh grid covering the feature range (padded by ``plot_step``). The
    result is drawn as filled contours and the training points are
    scattered on top, one series per class.

    ``markers``, ``alphas`` and ``colors`` are optional per-class sequences;
    ``colors`` may hold single-letter codes or full colour names.

    Raises
    ------
    ValueError
        If ``train_features`` does not have exactly two columns.
    """
    if train_features.shape[1] != 2:
        raise ValueError("X_train should have exactly 2 columnns!")

    x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
    y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    # Fit a clone so the estimator passed in by the caller is not modified.
    clf_est = clone(clf)
    clf_est.fit(train_features, train_labels)
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    if hasattr(clf_est, 'predict_proba'):
        # Uses the probability of the second class only.
        Z = clf_est.predict_proba(grid_points)[:, 1]
    else:
        Z = clf_est.predict(grid_points)
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=cmap)

    le = LabelEncoder()
    y_enc = le.fit_transform(train_labels)
    n_classes = len(le.classes_)
    # Keep each colour as one item; joining them into a string would split
    # multi-character names such as 'red' into single letters.
    plot_colors = list(colors) if colors else [None] * n_classes
    label_names = le.classes_
    markers = markers if markers else [None] * n_classes
    alphas = alphas if alphas else [None] * n_classes
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y_enc == i)
        plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
                    label=label_names[i], cmap=cmap, edgecolors='black',
                    marker=markers[i], alpha=alphas[i])
    plt.legend()
    plt.show()
开发者ID:Zoery,项目名称:practical-machine-learning-with-python,代码行数:35,代码来源:model_evaluation_utils.py
示例7: main
def main():
    """Train an XGBoost regressor and write a submission CSV.

    Reads ``data/train.csv`` and ``data/test.csv``, label-encodes the
    ``Product_Info_2`` column with an encoder fitted on both files together,
    fits the regressor on the training data and writes the rounded
    predictions, clamped to the range 1..8, to ``submissions/xgb.csv``.
    """
    train = pd.read_csv('data/train.csv')
    test = pd.read_csv('data/test.csv')

    # Fit on the union of both files so every category seen in either one
    # has a code.
    column = 'Product_Info_2'
    enc = LabelEncoder()
    enc.fit(pd.concat((train[column], test[column]), axis=0))
    for frame in (train, test):
        frame[column] = enc.transform(frame[column])

    X_train = train.drop('Response', axis=1).values
    y_train = train['Response'].values
    X_test = test.values

    mdl = xgb.XGBRegressor(learning_rate=0.05,
                           n_estimators=200,
                           subsample=0.5,
                           max_depth=6,
                           silent=False)
    mdl.fit(X_train, y_train)

    def to_response(value):
        # Round to the nearest integer and clamp into [1, 8].
        return min(max(1, int(round(value))), 8)

    preds = [to_response(raw) for raw in mdl.predict(X_test)]

    sub = pd.DataFrame({'Id': test['Id'], 'Response': preds})
    sub.to_csv('submissions/xgb.csv', index=False)
开发者ID:xiaoyubai,项目名称:kaggle-prudential,代码行数:28,代码来源:base_mdl.py
示例8: test_vote_soft
def test_vote_soft():
    """Combine two classifiers by summing their outputs and print the accuracy.

    The test-side outputs of the bag-of-words and direct-attribute
    classifiers are added together, the highest-scoring column is taken per
    row, and the result is mapped back to labels with a ``LabelEncoder``
    fitted on the training labels.
    """
    # reduce is a builtin only on Python 2; functools provides it on both.
    from functools import reduce
    from sklearn.preprocessing import LabelEncoder

    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")

    # Materialise the zip: on Python 3 it is an iterator and cannot be
    # indexed.
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))

    test_attr = reduce(lambda a, b: a + b, test_probs)
    pred = test_attr.idxmax(1)

    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
开发者ID:ZaydH,项目名称:recipe_cuisine_type_classifier,代码行数:28,代码来源:predict.py
示例9: test_hard_vote
def test_hard_vote():
    """Combine two classifiers by majority vote and print the accuracy.

    Each classifier's test-side output is reduced to its highest-scoring
    column per row; the most frequent vote per row is kept and mapped back
    to labels with a ``LabelEncoder`` fitted on the training labels.
    """
    from sklearn.preprocessing import LabelEncoder

    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")

    # Materialise the zip: on Python 3 it is an iterator and cannot be
    # indexed.
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    test_probs = probs[1]
    print(len(test_probs))

    preds = [x.idxmax(1) for x in test_probs]
    pred = np.zeros(len(preds[0]), dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        # Most common vote for this row.
        pred[i] = max(set(votes), key=votes.count)
        print(pred[i])

    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
开发者ID:ZaydH,项目名称:recipe_cuisine_type_classifier,代码行数:31,代码来源:predict.py
示例10: train
def train(self):
    """Train a classifier on stored feature vectors and pickle it.

    Reads ``labels.csv`` and ``reps.csv`` from the directory configured as
    ``Classification/TrainingInputPath``. Each label is the name of the
    directory that contains the path found in the second column of
    ``labels.csv``. A linear SVC is trained, preceded by an LDA step when
    ``Classification/LDADim`` holds a non-zero integer, and the tuple
    ``(label_encoder, classifier)`` is pickled to ``classifier.pkl`` in the
    same directory.
    """
    input_dir = get_config().get('Classification', 'TrainingInputPath')
    self.logger.info("Loading features")

    file_name = os.path.join(input_dir, 'labels.csv')
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    paths = pd.read_csv(file_name, header=None).values[:, 1]
    # Build a real list: the labels are consumed twice below (fit and
    # transform), which a one-shot ``map`` iterator cannot support.
    labels = [os.path.split(os.path.dirname(path))[1] for path in paths]

    label_encoder = LabelEncoder().fit(labels)
    labels_encoded = label_encoder.transform(labels)
    num_classes = len(label_encoder.classes_)

    file_name = os.path.join(input_dir, 'reps.csv')
    features = pd.read_csv(file_name, header=None).values

    self.logger.info("Training for {} classes.".format(num_classes))

    clf = SVC(C=1, kernel='linear', probability=True)
    # TODO: Try a previous LDA
    try:
        lda = int(get_config().get("Classification", "LDADim"))
    except ValueError:
        lda = None
    if lda:
        clf_final = clf
        clf = Pipeline([('lda', LDA(n_components=lda)),
                        ('clf', clf_final)])
    clf.fit(features, labels_encoded)

    file_name = os.path.join(input_dir, 'classifier.pkl')
    self.logger.info("Saving classifier to '{}'".format(file_name))
    # Pickle output is bytes, so the file must be opened in binary mode.
    with open(file_name, 'wb') as f:
        pickle.dump((label_encoder, clf), f)
开发者ID:albertorepo,项目名称:jarvis,代码行数:34,代码来源:training.py
示例11: loadData
def loadData(path="../data/",k=5,log='add',pca_n=0,SEED=34):
    """Load, transform and standardise the train/test CSV files under ``path``.

    ``log='add'`` appends a ``log(x + 1)`` copy of every feature column,
    ``log='replace'`` overwrites the columns with it, and any other value
    leaves them alone. When ``pca_n > 0`` a PCA with that many components is
    fitted on the training features. Both sets are then standardised with a
    scaler fitted on the training set.

    Returns ``(train, test, target, target_nnet, id, cv, encoder)``: the two
    scaled frames, the raw and the int32-encoded target, the test ids, a
    shuffled ``KFold`` with ``k`` folds, and the fitted label encoder.
    """
    from pandas import DataFrame, read_csv
    from numpy import log as ln
    from sklearn.cross_validation import KFold
    from sklearn.preprocessing import LabelEncoder
    from sklearn.preprocessing import StandardScaler

    train = read_csv(path + "train.csv")
    test = read_csv(path + "test.csv")
    test_ids = test.id
    target = train.target

    encoder = LabelEncoder()
    target_nnet = encoder.fit_transform(target).astype('int32')

    feat_names = [name for name in train.columns if name.startswith('feat')]
    train = train[feat_names].astype(float)
    test = test[feat_names]

    if log in ('add', 'replace'):
        # 'add' writes to a new "<name>_log" column, 'replace' overwrites.
        suffix = '_log' if log == 'add' else ''
        for name in train.columns:
            train[name + suffix] = ln(train[name] + 1)
            test[name + suffix] = ln(test[name] + 1)

    if pca_n > 0:
        from sklearn.decomposition import PCA
        pca = PCA(pca_n)
        train = pca.fit_transform(train)
        test = pca.transform(test)

    scaler = StandardScaler()
    scaler.fit(train)
    scaled_names = ['feat_' + str(i) for i in range(train.shape[1])]
    train = DataFrame(scaler.transform(train), columns=scaled_names)
    test = DataFrame(scaler.transform(test), columns=scaled_names)

    cv = KFold(len(train), n_folds=k, shuffle=True, random_state=SEED)
    return train, test, target, target_nnet, test_ids, cv, encoder
开发者ID:dmcgarry,项目名称:kaggle_otto_group,代码行数:34,代码来源:helperFunctions.py
示例12: to_numeric
def to_numeric(self, columns=()):
    """Label-encode the given columns of ``self.M`` and cast it to float.

    Parameters
    ----------
    columns : iterable of column indices
        Columns of ``self.M`` whose values are replaced by integer codes.

    Returns
    -------
    self, to allow call chaining.
    """
    le = LabelEncoder()
    for c in columns:
        self.M[:, c] = le.fit_transform(self.M[:, c])
    # ``np.float`` was only an alias of the builtin and no longer exists in
    # current NumPy releases.
    self.M = self.M.astype(float)
    return self
开发者ID:makgyver,项目名称:pyros,代码行数:7,代码来源:binarizer.py
示例13: prepare_labels
def prepare_labels(y):
    """Integer-encode ``y`` and return ``(encoded_labels, fitted_encoder)``."""
    # From here: https://www.kaggle.com/pestipeti/keras-cnn-starter
    encoder = LabelEncoder()
    return encoder.fit_transform(np.array(y)), encoder
开发者ID:melgor,项目名称:kaggle-whale-tail,代码行数:7,代码来源:training.py
示例14: labele
def labele(tbl, cols='all'):
    """Label-encode columns of ``tbl`` in place and return ``tbl``.

    Parameters
    ----------
    tbl : DataFrame
        Table whose columns are overwritten with integer codes.
    cols : 'all', a column name, or an iterable of column names
        Columns to encode; ``'all'`` (the default) encodes every column.
    """
    from sklearn.preprocessing import LabelEncoder as LE
    # Compare against 'all' only for strings; ``==`` on an Index or array is
    # element-wise and has no single truth value.
    if isinstance(cols, str):
        cols = tbl.columns if cols == 'all' else [cols]
    le = LE()
    # Iterate the requested columns rather than every column of the table.
    for ac in cols:
        tbl.loc[:, ac] = le.fit(tbl[ac]).transform(tbl[ac])  # might have to return le
    return tbl
开发者ID:DistrictDataLabs,项目名称:transportation-project-1,代码行数:7,代码来源:analysis.py
示例15: ml_target
def ml_target(dataset):
    """ Takes a dataset and returns the target in a numpy.array ready for
    machine learning.
    Mainly transforms non-numerical variables(columns) to numbers.

    Parameters
    ----------
        copper.Dataset

    Returns
    -------
        (label_encoder, np.array)
        ``label_encoder`` is None when the target is already numeric.

    Notes
    -----
        If dataset has more than one variable with role=TARGET then the first one
        is selected.
    """
    cols = dataset.filter_cols(role=dataset.TARGET)
    assert len(cols) > 0, 'No target variables on Dataset'
    if len(cols) > 1:
        import warnings
        warnings.warn("Dataset contains more than one target, %s was choosed" % cols[0])

    target = dataset[cols[0]]
    # ``np.int`` / ``np.float`` were aliases of the builtins and have been
    # removed from current NumPy; the builtins compare the same way.
    if target.dtype in (int, float):
        return None, target.values
    le = LabelEncoder()
    encoded = le.fit_transform(target.values)
    return le, encoded
开发者ID:GeorgeMcIntire,项目名称:copper,代码行数:30,代码来源:transform.py
示例16: prep_data
def prep_data(df_train, df_test, test_size=0.2):
    """Clean the raw frames and split the training data for validation.

    Rows without an ``X1`` value are dropped from ``df_train``. Currency and
    percentage columns are stripped of their symbols, with ``(`` turned into
    a minus sign, and cast to float. Train and test rows are concatenated
    and passed through ``feature_engineering``, then separated again.

    Parameters
    ----------
    df_train, df_test : DataFrame
    test_size : float
        Fraction of the training rows held out for validation.

    Returns
    -------
    X_train, X_valid, y_train, y_valid, X_test, id_test
        ``y_*`` are the numeric ``X1`` values; ``id_test`` is the ``X2``
        column of ``df_test``.
    """
    print(" ---- Start data prep")

    def to_number(series, strip_pattern):
        # Remove the symbols matched by strip_pattern, turn '(' into '-',
        # then cast to float.
        return (series.replace(strip_pattern, '', regex=True)
                      .replace('[(]', '-', regex=True)
                      .astype(float))

    df_train = df_train.dropna(subset=['X1'])
    df_train['X1'] = to_number(df_train['X1'], r'[\%,)]')
    labels = df_train['X1'].values
    id_test = df_test['X2']
    piv_train = df_train.shape[0]

    df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
    del df_all['X1'], df_all['X2'], df_all['X3'], df_all['X10'], df_all['X16'], df_all['X18']
    # Drop the last three characters of the string form of these columns.
    df_all['X23'] = df_all['X23'].map(lambda x: str(x)[:-3])
    df_all['X15'] = df_all['X15'].map(lambda x: str(x)[:-3])
    for column in ('X4', 'X5', 'X6'):
        df_all[column] = to_number(df_all[column], r'[\$,)]')
    df_all['X30'] = to_number(df_all['X30'], r'[\%,)]')

    df_f = feature_engineering(df_all)
    vals = df_f.values
    X = vals[:piv_train]
    # The target stays the raw numeric X1 value. The original encoded it
    # with a LabelEncoder and then immediately overwrote the result, so that
    # dead step is omitted.
    y = labels
    X_test = vals[piv_train:]

    # Honour the caller's test_size instead of a hard-coded 0.2.
    X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(X, y, test_size=test_size)
    print(" ---- end data prep")
    return X_train, X_valid, y_train, y_valid, X_test, id_test
开发者ID:Sandy4321,项目名称:Python-Code-001,代码行数:25,代码来源:Data_Prep_Feature_engineering.py
示例17: main
def main(X_fname, Y_fname, result_fname=None):
    """Train a multi-class XGBoost model and save it to ``<result_fname>.bst``.

    Parameters
    ----------
    X_fname : str
        Matrix Market file holding the feature matrix.
    Y_fname : str
        CSV file (first column used as index) holding the labels.
    result_fname : str, optional
        Output file stem; defaults to the current timestamp.
    """
    le = LabelEncoder()
    moves = pandas.read_csv(Y_fname, index_col=0)
    Y = moves.values.ravel()
    Y = le.fit_transform(Y)
    X = io.mmread(X_fname)
    # Single-argument print calls produce the same output on Python 2 and 3.
    print("%s %s %s" % (X.shape, Y.shape, len(le.classes_)))

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33)

    xg_train = xgboost.DMatrix(X_train, label=y_train)
    xg_test = xgboost.DMatrix(X_test, label=y_test)

    param = {
        # use softmax multi-class classification
        'objective': 'multi:softprob',
        'eta': 0.002,
        'max_depth': 7,
        'nthread': 7,
        'num_class': len(le.classes_),
        'eval_metric': 'merror',
    }
    evals = [(xg_train, 'train'), (xg_test, 'eval')]

    # Train xgboost
    print("Training")
    t1 = time.time()
    bst = xgboost.train(param, xg_train, 500, evals, early_stopping_rounds=3)
    t2 = time.time()
    print(t2 - t1)

    if result_fname is None:
        result_fname = str(datetime.now())
    bst.save_model("%s.bst" % result_fname)
开发者ID:smurching,项目名称:pokemon_ai,代码行数:34,代码来源:model_xgb.py
示例18: customEncode
def customEncode(df):
    """Replace ``df['OutcomeType']`` with integer codes and return ``df``.

    The fitted encoder is stored in the module-level ``labelencoder`` so the
    codes can be mapped back later.
    """
    global labelencoder
    column = 'OutcomeType'
    encoder = LabelEncoder().fit(df[column])
    df[column] = encoder.transform(df[column])
    # Publish the encoder only after the column was converted successfully.
    labelencoder = encoder
    return df
开发者ID:Waffleboy,项目名称:Kaggle,代码行数:7,代码来源:animalShelter.py
示例19: load_kernel_matrix
def load_kernel_matrix(data_path='data', study='wl_kernel', verbose=True):
    """Loading already computed kernel matrix.

    Parameters:
    ---------
    data_path: string
        Path to the data folder.
    study: string
        Name of the folder containing the study, e.g. 'wl_kernel', which
        contains the WL kernel matrix.
    verbose: bool
        When true, print the sample count and the sizes of the classes
        encoded as 0 and 1.

    Returns:
    -------
    K: array
        Kernel matrix read from 'k_matrix.csv'.
    y: array
        Integer-encoded class labels read from 'class_labels.csv'.
    """
    kernel_dir = os.path.join(data_path, 'precomputed_kernels', study)
    path_k_matrix = os.path.join(kernel_dir, 'k_matrix.csv')
    path_cls = os.path.join(kernel_dir, 'class_labels.csv')

    K = np.loadtxt(path_k_matrix)
    y = np.loadtxt(path_cls)
    le = LabelEncoder()
    y = le.fit_transform(y)

    if verbose:
        # Single-argument print call: valid on both Python 2 and Python 3.
        print('n_samples: %s, n_samples_by_class: (%s - %s)' % (len(y),
                                                                len(y[y == 0]),
                                                                len(y[y == 1])))
    return K, y
开发者ID:MLDroid,项目名称:jstsp2015,代码行数:28,代码来源:load_data.py
示例20: train
def train(args):
    """Grid-search an SVM over stored embeddings and pickle the result.

    Reads ``labels.csv`` and ``reps.csv`` from ``args.workDir``. Each label
    is the name of the directory containing the path found in the second
    column of ``labels.csv``. The tuple ``(label_encoder, fitted_search)``
    is pickled to ``classifier.pkl`` in the same directory.
    """
    print("Loading embeddings.")
    fname = "{}/labels.csv".format(args.workDir)
    # ``.values`` replaces ``as_matrix()``, which newer pandas no longer has.
    paths = pd.read_csv(fname, header=None).values[:, 1]
    # Get the directory. A real list is required because the labels are
    # consumed twice (fit and transform); a ``map`` iterator is single-use.
    labels = [os.path.split(os.path.dirname(path))[1] for path in paths]
    fname = "{}/reps.csv".format(args.workDir)
    embeddings = pd.read_csv(fname, header=None).values

    le = LabelEncoder().fit(labels)
    labelsNum = le.transform(labels)

    param_grid = [
        {'C': [1, 10, 100, 1000],
         'kernel': ['linear']},
        {'C': [1, 10, 100, 1000],
         'gamma': [0.001, 0.0001],
         'kernel': ['rbf']}
    ]
    svm = GridSearchCV(
        SVC(probability=True),
        param_grid, verbose=4, cv=5, n_jobs=16
    ).fit(embeddings, labelsNum)
    print("Best estimator: {}".format(svm.best_estimator_))
    print("Best score on left out data: {:.2f}".format(svm.best_score_))

    # Pickle output is bytes, so the file must be opened in binary mode.
    with open("{}/classifier.pkl".format(args.workDir), 'wb') as f:
        pickle.dump((le, svm), f)
开发者ID:luan-tran-michel,项目名称:openface,代码行数:28,代码来源:classifier.py
注：本文中的sklearn.preprocessing.LabelEncoder类示例由纯净天空整理自Github/MSDocs等源码及文档管理平台，相关代码片段筛选自各路编程大神贡献的开源项目，源码版权归原作者所有，传播和使用请参考对应项目的License；未经允许，请勿转载。
请发表评论