This article collects typical usage examples of the sklearn.preprocessing.OneHotEncoder class in Python. If you have been wondering what the OneHotEncoder class does, how to use it, or what real usage looks like, the curated class examples below may help.
Twenty code examples of the OneHotEncoder class are shown below, sorted by popularity by default.
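Before diving into the project snippets, here is a minimal, self-contained sketch of typical OneHotEncoder usage, written against the current scikit-learn API. Note that several of the examples below use older constructor arguments such as n_values and categorical_features, which newer scikit-learn releases have removed; treat those snippets as historical usage.

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Toy data: one string column and one integer-coded column
X = np.array([["red", 0], ["green", 1], ["blue", 0], ["red", 1]], dtype=object)

# handle_unknown="ignore" maps categories unseen at fit time to all-zero rows
enc = OneHotEncoder(handle_unknown="ignore")
X_onehot = enc.fit_transform(X)  # returns a scipy sparse matrix by default

print(enc.categories_)     # the categories learned for each input column
print(X_onehot.toarray())  # dense view: one indicator column per category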
Example 1: get_toy_classification_data

def get_toy_classification_data(n_samples=100, centers=3, n_features=2, type_data="blobs"):
    # generate a 2d classification dataset
    if (type_data == "blobs"):
        X, y = make_blobs(n_samples=n_samples, centers=centers, n_features=n_features)
    elif (type_data == "moons"):
        X, y = make_moons(n_samples=n_samples, noise=0.1)
    elif (type_data == "circles"):
        X, y = make_circles(n_samples=n_samples, noise=0.05)
    # scatter plot, dots colored by class value
    # df = DataFrame(dict(x=X[:,0], y=X[:,1], label=y))
    # colors = {0:'red', 1:'blue', 2:'green'}
    # fig, ax = pyplot.subplots()
    # grouped = df.groupby('label')
    # for key, group in grouped:
    #     group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
    # pyplot.show()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=None)
    classes = np.unique(y_train)
    if (0):  # dead code path from the original: OneHotEncoder-based label encoding
        enc = OneHotEncoder().fit(classes.reshape(-1, 1))
        y_train = enc.transform(y_train.reshape(-1, 1))
        print(y_test)
        y_test = enc.transform(y_test.reshape(-1, 1))
        print(y_test)
    y_train = one_hot_encode(y_train, classes)
    y_test = one_hot_encode(y_test, classes)
    return X_train, y_train, X_test, y_test, classes

Author: manuwhs, Project: Trapyng, Lines: 33, Source: data_loaders.py
Example 2: load_bees

def load_bees():
    '''
    helper function to load our data
    '''
    train_fp = "/home/ubuntu/bee_images/train"
    labels = "/home/ubuntu/bee_images"
    train_labels = pd.read_csv(labels + '/' + "train_labels.csv")
    train_labels.set_index('id', inplace=True)
    bee_images = os.listdir(train_fp)
    bee_images = filter(lambda f: f[-3:] == 'jpg', bee_images)
    bee_images = filter(lambda f: f != '1974.jpg', bee_images)
    bees = []
    for i in bee_images:
        im = imread(train_fp + "/" + i, as_grey=False)
        im = resize(im, (48, 48))
        bees.append(im)
    # divide bees by 255 to give it a 0 - 1 scale
    # (255 is the current max val and zero is the min)
    bees = np.array(bees) / 255.0
    Y = train_labels.ix[[int(x.split('.')[0]) for x in bee_images]].values
    onehot = OneHotEncoder(sparse=False, n_values=2)
    Y = onehot.fit_transform(Y)
    bees, Y = gen_data(bees, Y)
    return balance(bees, Y)

Author: jayswinney, Project: naive_bees, Lines: 30, Source: load_data.py
Example 3: get_coded_data

def get_coded_data(cases_df, case_ids, coded_feature_names):
    """
    Retrieves the valences corresponding to case_ids,
    along with coded features, if any.
    Recodes unknown valences to neutral.
    args:
        cases_df: A dataframe containing the case variables.
        case_ids: list of sorted case_ids
        coded_feature_names: list of column names to pull from cases_df (e.g. 'geniss' or ['geniss', 'casetyp1'])
    returns:
        valences: np array of valences
        coded_feature_array: np array of coded features
        filtered_cases_df: Dataframe containing the sorted, filtered case variables
    """
    UNKNOWN_VALENCE = 0
    NEUTRAL_VALENCE = 2
    if isinstance(coded_feature_names, str):
        coded_feature_names = [coded_feature_names]
    print "coded_feature_names: ", coded_feature_names
    valences = []
    coded_feature_list = []
    for case_id in case_ids:
        valence = cases_df[cases_df['caseid'] == case_id]['direct1'].values[0]
        if np.isnan(valence) == False:
            valence = int(valence)
        else:
            valence = 2
        if coded_feature_names is not None:
            coded_feature_row = cases_df[cases_df['caseid'] == case_id][coded_feature_names].values[0]
            clean_row = []
            # clean row
            for val in coded_feature_row:
                if val and np.isnan(val) == False:
                    clean_row.append(int(val))
                else:
                    clean_row.append(0)
            assert clean_row[0] >= 0, ""
            coded_feature_list.append(clean_row)
        # Replace unknown valence variables with neutral scores.
        if valence == UNKNOWN_VALENCE:
            valence = NEUTRAL_VALENCE
        valences.append(valence)
    # one-hot encoding
    if coded_feature_names is not None:
        enc = OneHotEncoder()
        coded_feature_array = enc.fit_transform(np.array(coded_feature_list))
        print "Coded Feature Array shape: ", coded_feature_array.shape
    else:
        coded_feature_array = np.array([])
    # Filter case df
    filtered_case_df = filter_cases_df(cases_df, case_ids)
    return np.array(valences), coded_feature_array, filtered_case_df

Author: pinesol, Project: appeals, Lines: 60, Source: join_data.py
Example 4: modelselect

def modelselect(input_filename, num_test_examples, block_size, n_estimators=100):
    # Perform some model selection to determine good parameters
    # Load data
    X_train, y_train, X_test, y_test, scaler = loaddata(input_filename, num_test_examples, block_size)
    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_train = encoder.transform(forest.apply(X_train))
    learner = SGDClassifier(
        loss="hinge",
        penalty="l2",
        learning_rate="invscaling",
        alpha=0.001,
        average=10 ** 4,
        eta0=0.5,
        class_weight="balanced",
    )
    metric = "f1"
    losses = ["log", "hinge", "modified_huber", "squared_hinge", "perceptron"]
    penalties = ["l2", "l1", "elasticnet"]
    alphas = 10.0 ** numpy.arange(-5, 0)
    learning_rates = ["constant", "optimal", "invscaling"]
    param_grid = [{"alpha": alphas, "loss": losses, "penalty": penalties, "learning_rate": learning_rates}]
    grid_search = GridSearchCV(learner, param_grid, n_jobs=-1, verbose=2, scoring=metric, refit=True)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_, grid_search.best_score_)
    return grid_search

Author: charanpald, Project: tyre-hug, Lines: 32, Source: outofcore.py
Example 5: transform_with_gbm_to_categorical

def transform_with_gbm_to_categorical(header, tr_x, tr_y, ts_x, n_est=100, learning_rate=0.1, max_depth=5):
    clf = GradientBoostingClassifier(n_estimators=n_est, learning_rate=learning_rate, max_depth=max_depth)
    clf = clf.fit(tr_x, tr_y)
    """ # Node count
    estimators = clf.estimators_
    for row in estimators:
        for e in row:
            print(e.tree_.node_count)"""
    leaf_indices = clf.apply(tr_x)
    leaf_indices = leaf_indices.reshape(leaf_indices.shape[0], -1)
    ts_leaf_indices = clf.apply(ts_x)
    ts_leaf_indices = ts_leaf_indices.reshape(ts_leaf_indices.shape[0], -1)
    enc = OneHotEncoder()
    enc.fit(np.append(leaf_indices, ts_leaf_indices, axis=0))
    tr_cat_features = enc.transform(leaf_indices).toarray()
    ts_cat_features = enc.transform(ts_leaf_indices).toarray()
    header = ["cat_" + str(i) for i in range(ts_cat_features.shape[1])]
    print("[gbm_cat] Features size: ", len(header))
    return header, tr_cat_features, ts_cat_features

Author: thushv89, Project: kaggle_tel, Lines: 25, Source: analyse_features.py
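Examples 4 and 5 apply the same trick: each sample's leaf index in every fitted tree (obtained via clf.apply) is treated as a categorical feature and one-hot encoded, yielding a high-dimensional sparse representation that a linear model can then fit. A minimal end-to-end sketch of the pattern follows; the dataset and the downstream LogisticRegression are illustrative choices, not part of either original project.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=500, random_state=0)

# Fit the GBM and extract per-tree leaf indices for every sample
gbm = GradientBoostingClassifier(n_estimators=50, random_state=0).fit(X, y)
leaves = gbm.apply(X).reshape(X.shape[0], -1)  # shape: (n_samples, n_trees)

# One-hot encode leaf ids; leaves unseen at fit time become all-zero columns
enc = OneHotEncoder(handle_unknown="ignore")
X_leaves = enc.fit_transform(leaves)

# A linear model on the sparse leaf features (the classic GBDT + LR setup)
lr = LogisticRegression(max_iter=1000).fit(X_leaves, y)
print(lr.score(X_leaves, y))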
Example 6: cost

def cost(all_thetas, weights, X, y, lamb):
    thetas = unpack_thetas(all_thetas, weights)
    # add column of 1's
    X = X / 255
    a1 = np.insert(X, 0, 1, 1)
    # create a binary index matrix of the y data and initialize activation layers
    encoder = OneHotEncoder(sparse=False)
    y_matrix = encoder.fit_transform(y.T)
    act_layers = activation_layers(a1, thetas)
    # cost function built in separate parts
    first = np.multiply(-y_matrix, np.log(act_layers[-1]))
    second = np.multiply(1 - y_matrix, np.log(1 - act_layers[-1]))
    # regularization
    reg_1 = lamb / (2 * len(X))
    reg_2 = 0
    for i in range(len(thetas)):
        reg_2 += np.power(thetas[i][..., 1:], 2).sum()
    J = 1 / len(X) * (first - second).sum() + (reg_1 * reg_2)
    print('Current Cost')
    print(J)
    print('*' * 20)
    return J

Author: miketibb, Project: DigitRecognizer, Lines: 27, Source: functions.py
Example 7: ExpandCategorical

class ExpandCategorical(BaseEstimator, TransformerMixin):
    def __init__(self, columns, append=False, only_new=False):
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.append = append
        self.only_new = only_new

    def fit(self, X=None, y=None):
        self.encoder_ = OneHotEncoder()
        self.encoder_.fit(X.loc[:, self.columns])
        # Expand the column names
        new_colnames = []
        for i, c in enumerate(self.columns):
            this_map = self.encoder_.active_features_[self.encoder_.feature_indices_[i]:self.encoder_.feature_indices_[i + 1]]
            for n in this_map:
                new_colnames.append("{}_{}".format(c, str(n)))
        self.new_colnames_ = new_colnames
        return self

    def transform(self, X):
        new_data = pd.DataFrame(self.encoder_.transform(X.loc[:, self.columns]).toarray(), index=X.index, columns=self.new_colnames_)
        assert new_data.shape[0] == X.shape[0], "Row lengths do not match"
        if self.only_new:
            return new_data
        res = X.copy()
        if not self.append:
            # Remove the unexpanded columns from the data frame
            for c in self.columns:
                res.drop(c, 1, inplace=True)
        return res.join(new_data)

Author: arackal5, Project: kaggle-loan-default, Lines: 32, Source: classes.py
Example 8: load_dataset_from_file

def load_dataset_from_file(filename, examples_count, is_labeled=True, expand_categorical=True):
    data = open(filename, 'r').readlines()
    # The next two lines verify that the parsed header is what we expect.
    header, _unused = parse_line(data[0], is_labeled, is_header=True)
    assert header == EXPECTED_HEADER
    data_X = []
    data_y = []
    cnt = 0
    for line in data[1:]:
        cnt += 1
        if len(data_X) == examples_count:
            break
        parse_result = get_features(line, is_labeled)
        if parse_result == None:
            continue
        (features, label) = parse_result
        data_X.append(np.array(features))
        data_y.append(label)
        if len(data_X) % 100000 == 0:
            print "Processed %d rows, loaded %d examples." % (
                cnt, len(data_X))
    cat_X = data_X
    if expand_categorical:
        encoder = OneHotEncoder(categorical_features=list(CATEGORICAL_FEATURES), sparse=False)
        cat_X = encoder.fit_transform(cat_X)
        cat_X = MaxAbsScaler().fit_transform(cat_X)
        print "Feature indices: ", encoder.feature_indices_
        print "Cat_X shape: ", cat_X.shape
    return (data_X, cat_X, np.array(data_y) if is_labeled else None)

Author: romanie, Project: Diploma, Lines: 30, Source: common.py
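Example 8's categorical_features argument no longer exists in current scikit-learn; selective encoding is now done with ColumnTransformer. The following sketch shows a rough modern equivalent under assumed column indices (cat_idx here is hypothetical, standing in for the project's CATEGORICAL_FEATURES):

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder

X = np.array([[0, 1.5, 2], [1, 0.5, 0], [0, 2.5, 1]])
cat_idx = [0, 2]  # hypothetical indices of the categorical columns

# One-hot encode only the selected columns; pass the rest through unchanged
ct = ColumnTransformer(
    [("onehot", OneHotEncoder(), cat_idx)],
    remainder="passthrough",
)
cat_X = MaxAbsScaler().fit_transform(ct.fit_transform(X))
print(cat_X)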
Example 9: convert_categorical_to_numeric

def convert_categorical_to_numeric(state_holiday):
    enc = OneHotEncoder()
    state_holiday[state_holiday == 'a'] = 1
    state_holiday[state_holiday == 'b'] = 2
    state_holiday[state_holiday == 'c'] = 3
    enc.fit(state_holiday)
    return enc.transform(state_holiday).toarray()

Author: akshayk0406, Project: Kaggle, Lines: 7, Source: sales.py
Example 10: pywfmLocalModel

def pywfmLocalModel(trainFeature, testFeature, trainLabel, testLabel, trainIndex, testIndex, fm, cvIndex):
    print 'run local: folds: ' + str(cvIndex)
    trainIndex, testIndex, value1, value2 = getIntId(trainIndex, testIndex)
    encoder = OneHotEncoder(n_values=[value1, value2])
    trainIndex_encode = encoder.fit_transform(trainIndex)
    testIndex_encode = encoder.transform(testIndex)
    trainFeature = hstack((trainIndex_encode, trainFeature))
    testFeature = hstack((testIndex_encode, testFeature))
    '''
    for i in range(len(trainLabel)):
        if i == 0:
            trainLabel[i] = -1
    for i in range(len(testLabel)):
        if i == 0:
            testLabel[i] = -1
    '''
    model = fm.run(trainIndex_encode, trainLabel, testIndex_encode, testLabel)
    predict = model.predictions
    predict = np.array(predict, np.float)
    predict = (predict - np.min(predict)) / (np.max(predict) - np.min(predict))
    return predict

Author: yinjiakang, Project: DataMining, Lines: 31, Source: model.py
Example 11: convert_network

def convert_network(filename, final_filename, var_flag=0):
    '''
    filename : input filename of the csv file
    final_filename : output filename of the .pickle file
    '''
    res = {'x': [], 'y': []}
    with open(filename, 'rb') as csvfile:
        f = csv.reader(csvfile)
        count = 0
        for line in f:
            if count != 0:
                if var_flag == 0:
                    res['x'].append(line[:-2] + [line[-1]])
                    res['y'].append(float(line[-2]))
                else:
                    res['x'].append(line[:-1])
                    res['y'].append(float(line[-1]))
            count += 1
    res['x'] = get_num(res['x'])
    m = len(res['x'][0]) - 1
    enc = OneHotEncoder(categorical_features=range(m), sparse=False)
    enc.fit(res['x'])
    res['x'] = enc.transform(res['x'])
    with open(final_filename, 'wb') as f:
        pickle.dump(res, f)

Author: gtyopal, Project: EE239-Big-Data-Analysis, Lines: 27, Source: create_data.py
Example 12: pywfmPredictModel

def pywfmPredictModel(trainFeature, testFeature, trainLabel, trainIndex, testIndex, fm):
    print 'run online!'
    trainIndex, testIndex, value1, value2 = getIntId(trainIndex, testIndex)
    encoder = OneHotEncoder(n_values=[value1, value2])
    trainIndex_encode = encoder.fit_transform(trainIndex)
    testIndex_encode = encoder.transform(testIndex)
    trainFeature = hstack((trainIndex_encode, trainFeature))
    testFeature = hstack((testIndex_encode, testFeature))
    # print trainFeature
    '''
    for i in range(len(trainLabel)):
        if i == 0:
            trainLabel[i] = -1
    for i in range(len(testLabel)):
        if i == 0:
            testLabel[i] = -1
    '''
    testLabel = np.zeros((testFeature.shape[0]))
    model = fm.run(trainFeature, trainLabel, testFeature, testLabel)
    predict = model.predictions
    predict = np.array(predict, np.float)
    print np.max(predict), np.min(predict)
    # predict = (predict - np.min(predict))/(np.max(predict) - np.min(predict))
    return predict

Author: yinjiakang, Project: DataMining, Lines: 35, Source: model.py
Example 13: CategoricalColumn

class CategoricalColumn(BaseEstimator, TransformerMixin):
    '''
    Take a string or key categorical column and transform it
    to one hot encodings.
    '''

    def __init__(self):
        '''
        Set up the internal transformation.
        '''
        self._labeler = LabelEncoder()
        self._encoder = OneHotEncoder()

    def fit(self, X, y=None):
        '''
        Fit the label and encoding
        '''
        handle_none = list(map(str, X))
        encoded = self._labeler.fit_transform(handle_none)
        self._encoder.fit(encoded.reshape(-1, 1))
        return self

    def transform(self, X):
        '''
        Transform a column of data into one hot encodings.

        Parameters
        ----------
        X : pandas series or numpy array
        '''
        handle_none = list(map(str, X))
        encoded = self._labeler.transform(handle_none)
        return self._encoder.transform(encoded.reshape(-1, 1)).todense().astype(np.float32)

Author: wballard, Project: tableclassifier, Lines: 33, Source: table_model.py
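Example 13 chains LabelEncoder and OneHotEncoder because early OneHotEncoder versions accepted only integer inputs. Since scikit-learn 0.20 the encoder consumes string columns directly, so the label-encoding step can be dropped. A sketch of the slimmed-down transformer (the class name is illustrative, not from the original project):

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class SimpleCategoricalColumn(BaseEstimator, TransformerMixin):
    """One-hot encode a single column of strings or keys."""

    def __init__(self):
        # map(str, ...) below turns None into the string "None", as in Example 13
        self._encoder = OneHotEncoder(handle_unknown="ignore")

    def fit(self, X, y=None):
        column = np.array(list(map(str, X))).reshape(-1, 1)
        self._encoder.fit(column)
        return self

    def transform(self, X):
        column = np.array(list(map(str, X))).reshape(-1, 1)
        return self._encoder.transform(column).toarray().astype(np.float32)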
Example 14: loadData

def loadData(experiment):
    if experiment.has_key("size"):
        size = experiment["size"]
    else:
        size = 0
    data, label, description, reduce = experiment["dataset"]()
    if size > 0:
        initialReduceBlockSize = np.arange(size, size + 0.2, 0.1)
        testSetPercentage = 0.2
        trainDataBlocks, trainLabelBlocks, testDataBlocks, testLabelBlocks = data_factory.splitDatasetInBlocks(data, np.array(label), initialReduceBlockSize, testSetPercentage)
        data = trainDataBlocks[0][0]
        label = trainLabelBlocks[0][0]
    # if required (cancer datasets) perform binary encoding
    if experiment['binary_encode']:
        print "perform binary encode"
        analyze(data, label, "before encode")
        # encode features (one-hot-encoder / dummy coding)
        enc = OneHotEncoder()
        enc.fit(data)
        data = enc.transform(data).toarray()
        analyze(data, label, "after encode")
    return data, label, description, reduce

Author: sebastian-alfers, Project: master-thesis, Lines: 26, Source: experiment_run.py
Example 15: getdataset

def getdataset(datasetname, onehot_encode_strings=True):
    # load
    dataset = fetch_mldata(datasetname)
    # get X and y
    X = dshape(dataset.data)
    try:
        target = dshape(dataset.target)
    except:
        print("WARNING: No target found. Taking last column of data matrix as target")
        target = X[:, -1]
        X = X[:, :-1]
    if len(target.shape) > 1 and target.shape[1] > X.shape[1]:  # some mldata sets are mixed up...
        X = target
        target = dshape(dataset.data)
    if len(X.shape) == 1 or X.shape[1] <= 1:
        for k in dataset.keys():
            if k != 'data' and k != 'target' and len(dataset[k]) == X.shape[1]:
                X = np.hstack((X, dshape(dataset[k])))
    # one-hot for categorical values
    if onehot_encode_strings:
        cat_ft = [i for i in range(X.shape[1]) if 'str' in str(
            type(unpack(X[0, i]))) or 'unicode' in str(type(unpack(X[0, i])))]
        if len(cat_ft):
            for i in cat_ft:
                X[:, i] = tonumeric(X[:, i])
            X = OneHotEncoder(categorical_features=cat_ft).fit_transform(X)
    # if sparse, make dense
    try:
        X = X.toarray()
    except:
        pass
    # convert y to monotonically increasing ints
    y = tonumeric(target).astype(int)
    return np.nan_to_num(X.astype(float)), y

Author: Biodun, Project: highdimensional-decision-boundary-plot, Lines: 34, Source: uci_loader.py
Example 16: vectorize_data

def vectorize_data(df):
    cat_vars = ["UniqueCarrier",
                "OriginAirportID",
                "OriginAirportSeqID",
                "OriginCityMarketID",
                "OriginState",
                "DestAirportID",
                "DestAirportSeqID",
                "DestCityMarketID",
                "DepTimeBlk",
                "ArrTimeBlk",
                "DistanceGroup",
                "DestState"]
    con_vars = ["CRSElapsedTime",
                "Distance",
                "CRSDepTime",
                "CRSArrTime",
                "WeekDay",
                "YearDay"]
    df = df.dropna()
    Xenc = OneHotEncoder()
    X1 = Xenc.fit_transform(df[cat_vars].as_matrix())
    X2 = df[con_vars].as_matrix()
    X = sparse.hstack((X1, X2))
    X = X.tocsr()
    y = df["Cancelled"].as_matrix()
    return X, y, Xenc

Author: sharpround, Project: flight_risk, Lines: 35, Source: cancel_predict.py
Example 17: apply_onehot

def apply_onehot(self, columns=[]):
    enc = OneHotEncoder()
    enc.fit(self.M[:, columns])
    R = enc.transform(self.M[:, columns]).toarray()
    self.M = np.c_[self.M[:, [x for x in range(self.M.shape[1]) if x not in columns]], R]
    self.class_index -= len([c for c in columns if c < self.class_index])
    return self

Author: makgyver, Project: pyros, Lines: 7, Source: binarizer.py
Example 18: _to_one_hot_encoding

def _to_one_hot_encoding(labels, dtype=np.float64):
    """Creates a one-hot encoding of the labels."""
    from sklearn.preprocessing import OneHotEncoder
    labels = labels.reshape((labels.shape[0], 1))
    enc = OneHotEncoder(dtype=dtype)
    return enc.fit_transform(labels).toarray()

Author: tvandera, Project: binet, Lines: 7, Source: datasets.py
Example 19: CategoricalExpansion

class CategoricalExpansion(BaseEstimator, TransformerMixin):
    """
    Uses a one hot encoder to expand categorical columns.
    Don't use this in a pipeline.

    Arguments:
    ==========
    threshold: int
        The maximum number of unique values that a column can have
        for it to be considered categorical.

    Returns:
    ========
    Sparse matrix of the expanded columns.
    """

    def __init__(self, threshold):
        self.threshold = threshold

    def fit(self, X, y=None):
        uniques = [(len(x.unique()), x.dtype.kind) for n, x in X.iteritems()]
        self.mask_ = [(x[0] < self.threshold and x[1] == 'i') for x in uniques]
        self.encoder_ = OneHotEncoder()
        self.encoder_.fit(X.loc[:, self.mask_])
        return self

    def transform(self, X):
        return self.encoder_.transform(X.loc[:, self.mask_])

Author: arackal5, Project: kaggle-loan-default, Lines: 27, Source: classes.py
Example 20: Fileio

class Fileio(object):
    """ Fileio helper """

    def __init__(self, train='../data/train.csv', test='../data/test.csv'):
        # Create a OneHotEncoder
        self.encoder = OneHotEncoder()
        self.trainDF = pd.read_csv(train, usecols=[0])
        self.trainDF['ID'] = map(lambda x: "%s.%06i" % (x[0], x[1]), zip(['train'] * NUMTRAIN, range(NUMTRAIN)))
        self.testDF = pd.read_csv(test)
        self.testDF['ID'] = map(lambda x: "%s.%06i" % (x[0], x[1]), zip(['test'] * NUMTEST, range(NUMTEST)))

    def encode(self, usecols):
        self.encoder.fit(np.array(self.df.ix[:, usecols], dtype='float'))

    def transformTrain(self, cols, idCol=8):
        """ Transform the training set """
        x = pd.merge(self.trainDF, self.df.ix[:, [idCol] + cols], how='left', on='ID', sort=False)
        ignore = ['ID', 'ACTION']
        usecols = [c for c in x.columns if c not in ignore]
        return self.encoder.transform(np.array(x.ix[:, usecols], dtype='float')), np.array(x.ACTION)

    def transformTest(self, cols, idCol=8):
        """ Transform the testing set """
        x = pd.merge(self.testDF.ix[:, ['ID', 'ROLL_CODE']], self.df.ix[:, [idCol] + cols],
                     how='left', on='ID', sort=False)
        ignore = ['ID', 'ROLL_CODE']
        usecols = [c for c in x.columns if c not in ignore]
        return self.encoder.transform(np.array(x.ix[:, usecols], dtype='float'))

Author: nmkridler, Project: access, Lines: 30, Source: fileio.py
Note: The sklearn.preprocessing.OneHotEncoder class examples in this article were compiled by 纯净天空 from source code and documentation platforms such as GitHub and MSDocs. The snippets were selected from open-source projects contributed by many developers; copyright of the source code remains with the original authors, and any distribution or use must follow the corresponding project's license. Do not republish without permission.