This article collects and summarizes typical usage examples of the Python sklearn.preprocessing.Imputer class. If you have been wondering what the Imputer class does, how to use it, or where to find real code that uses it, the curated class code examples below may help.
Twenty code examples of the Imputer class are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code examples.
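Before the examples, here is a minimal sketch of the typical Imputer workflow (fit on data containing NaN, then transform). Note that Imputer was deprecated in scikit-learn 0.20 and removed in 0.22; the commented lines show the modern equivalent, sklearn.impute.SimpleImputer. The small array is invented purely for illustration.

import numpy as np
from sklearn.preprocessing import Imputer  # deprecated in scikit-learn 0.20, removed in 0.22

X = np.array([[1.0, 2.0],
              [np.nan, 3.0],
              [7.0, np.nan]])

# Learn per-column means from X and replace the NaN entries with them
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
X_filled = imp.fit_transform(X)

# Equivalent on scikit-learn >= 0.22, where Imputer no longer exists:
# from sklearn.impute import SimpleImputer
# X_filled = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X)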
Example 1: data_organizer
def data_organizer( instances, outcomes ):
    """
    Operations to organize data as desired
    """
    # Remove instances without GPA data
    new_instances = []
    new_outcomes = []
    for instance, outcome in zip(instances, outcomes):
        u1, u2, gpa = outcome
        if not math.isnan( gpa ):
            new_instances.append( [value for value in instance] )
            new_outcomes.append( [value for value in outcome] )
    instances = new_instances
    outcomes = new_outcomes

    # Fill in NaN values with median
    instance_list = []
    for idx, instance in enumerate(instances):
        instance_list.append( [value for value in instance] )

    bandaid = Imputer( strategy='median' )
    instances = bandaid.fit_transform( instance_list )

    return instances, outcomes
Author: doykle, Project: CMPS-142-Machine-Learning-Homework, Lines: 27, Source: processing_nb.py
Example 2: impute_and_scale
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean', axis=0)
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df
Author: carrondt, Project: Benchmarks, Lines: 31, Source: p1b3.py
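A hedged usage sketch for impute_and_scale, assuming pandas, numpy, Imputer, and the three scalers are already imported as in the source file; the toy DataFrame is invented for illustration.

import numpy as np
import pandas as pd

toy = pd.DataFrame({'a': [1.0, np.nan, 3.0],
                    'b': [0.5, 2.5, np.nan]})
# NaNs are replaced by column means, then each column is rescaled to [0, 1]
scaled = impute_and_scale(toy, scaling='minmax')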
Example 3: run_whole_video
def run_whole_video(exp_folder, lims_ID):

    # initializes video pointer for video of interest based on lims ID
    file_string = get_file_string(exp_folder, lims_ID)
    video_pointer = cv2.VideoCapture(file_string)

    # import wheel data
    wheel = joblib.load('dxds2.pkl')
    first_non_nan = next(x for x in wheel if not isnan(x))
    first_index = np.where(wheel == first_non_nan)[0]
    k = first_index[0]
    imp = Imputer(missing_values='NaN', strategy='mean')
    wheel = imp.fit_transform(wheel)
    wheel = preprocessing.MinMaxScaler((-1, 1)).fit(wheel).transform(wheel)

    # self.video_pointer.set(1, 41000)
    ret, frame = video_pointer.read()

    # crops and converts frame into desired format
    frame = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)
    prvs = frame
    nex = frame

    # initialize vectors to keep track of data
    count = 0
    mod = 0
    opticals = []
    angles = []
    frames = []

    # length of movie
    limit = int(video_pointer.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT))

    # create hdf file
    hf = h5py.File('data_' + str(lims_ID) + '.h5', 'w')
    g = hf.create_group('feature space')
    vector = np.zeros((limit, 4321))
    table = g.create_dataset('features', data=vector, shape=(limit, 4321))

    while count <= limit:
        prvs = nex
        frames = process_input(prvs)

        ret, frame = video_pointer.read()
        nex = cv2.cvtColor(frame[160:400, 100:640], cv2.COLOR_BGR2GRAY)

        optical = optical_flow(prvs, nex)
        opticals = optical['mag']
        angles = optical['ang']

        vector_data = np.concatenate((np.reshape(wheel[k], (1)), frames, opticals, angles))
        table[count, :] = vector_data

        count += 1

        if count % 1000 == 0:
            print(count)
Author: mahdiramadan, Project: AllenCode, Lines: 60, Source: image_processing.py
Example 4: preprocess
def preprocess(data):
    non_sparse_only = True
    use_all_category_only = False
    use_all_impute_mean_mode = False

    if non_sparse_only:
        nominal_samples = data.ix[:, ['var4', 'dummy']]
        onehot_samples = onehot.transform(nominal_samples, ['var4', 'dummy'])
        onehot_samples = pd.DataFrame(onehot_samples.toarray())
        numbered_samples = data.ix[:, ['var7', 'var8', 'var10', 'var11', 'var13', 'var15', 'var17']]
        numbered_samples[['var7', 'var8']] = numbered_samples[['var7', 'var8']].convert_objects(convert_numeric=True)
        # (var7 and 8 are ordinal, converting to floats which includes NaNs will allow mean imputing of missing values)
        other_samples = data.ix[:, 'crimeVar1':'weatherVar236']  # all the continuous vars
        other_samples = other_samples.drop(['weatherVar115'], axis=1)  # nothing in this feature
        samples = pd.concat([onehot_samples, numbered_samples, other_samples], axis=1)  # combine w/ the cleaned up other vars
        imp_nan = Imputer(missing_values=np.nan, strategy='mean', axis=0)
        samples_imp = imp_nan.fit_transform(samples)
    if use_all_category_only:
        raise NotImplementedError  # "todo" placeholder in the original source
    if use_all_impute_mean_mode:
        raise NotImplementedError  # "todo" placeholder in the original source
    return samples_imp
Author: kirilligum, Project: cdips-fire, Lines: 27, Source: preprocess.py
Example 5: learn
def learn():
    global classifier, INPUT
    print(1)
    data = np.genfromtxt(INPUT, delimiter=' ', dtype='f8')
    np.random.shuffle(data)
    n = len(data)
    y = data[:, 1]
    x = data[:][:, range(2, 54)]
    # test_x = []
    # test_y = []
    train_x = []
    train_y = []
    print(2)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    x = imp.fit_transform(x)
    print(3)
    for i in range(0, n):
        if y[i] == 0:
            continue
        train_x.append(x[i])
        train_y.append(y[i])
        # if i % 100 == 0:
        #     test_x.append(x[i])
        #     test_y.append(y[i])
        # else:
        #     train_x.append(x[i])
        #     train_y.append(y[i])
    print(4)
    classifier.fit(train_x, train_y)
    print(5)
Author: proneetv, Project: adin, Lines: 30, Source: activity.py
Example 6: ImputeCategorical
class ImputeCategorical(BaseEstimator, TransformerMixin):
    """
    Encodes a specified list of columns or all columns if None.
    """

    def __init__(self, columns=None):
        self.columns = columns
        self.imputer = None

    def fit(self, data, target=None):
        """
        Expects a data frame with named columns to impute.
        """
        # Encode all columns if columns is None
        if self.columns is None:
            self.columns = data.columns

        # Fit an imputer for each column in the data frame
        self.imputer = Imputer(missing_values=0, strategy='most_frequent')
        self.imputer.fit(data[self.columns])

        return self

    def transform(self, data):
        """
        Uses the encoders to transform a data frame.
        """
        output = data.copy()
        output[self.columns] = self.imputer.transform(output[self.columns])

        return output
Author: NikashS, Project: tutorial-predicting-income, Lines: 31, Source: predict.py
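A hedged usage sketch for ImputeCategorical, assuming pandas plus the BaseEstimator/TransformerMixin and Imputer imports from the source file, and assuming categories are already label-encoded with 0 marking a missing value (which is what missing_values=0 in fit implies); the toy frame is invented for illustration.

import pandas as pd

# 0 stands in for a missing category, matching missing_values=0 above
toy = pd.DataFrame({'workclass': [1, 0, 2, 1],
                    'education': [3, 3, 0, 2]})
imputer = ImputeCategorical(columns=['workclass', 'education'])
# zeros are replaced by each column's most frequent non-missing value
filled = imputer.fit(toy).transform(toy)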
Example 7: test_3_stage
def test_3_stage(self):
    from sklearn.preprocessing import Imputer

    infile_name = path_of_data('missing_vals.csv')

    p = Pipeline()

    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    impute_node = p.add(wrap_and_make_instance(Imputer))

    csv_read_node['output'] > impute_node['X_train']
    impute_node['X_new'] > csv_write_node['input']

    self.run_pipeline(p)

    ctrl_imputer = Imputer()
    ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                              names=True)
    num_type = ctrl_X_sa[0][0].dtype
    ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa)
    ctrl_X_new_nd = ctrl_imputer.fit_transform(ctrl_X_nd)
    control = ctrl_X_new_nd

    result = self._tmp_files.csv_read('out.csv', True)

    self.assertTrue(np.allclose(result, control))
Author: macressler, Project: UPSG, Lines: 27, Source: test_pipeline.py
Example 8: test
def test():
    vec = DictVectorizer()
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    for filename in glob.glob(r'../dataset/UCI/*.arff'):
        basename = re.sub(r'(\..*?)$', '', os.path.basename(filename))
        print(basename)
        if basename != DS:
            continue
        # cost_matrix = pickle.load(open('../dataset/UCI/'+basename+'_cost_matrix.pkl', 'rb'))
        data = arff.loadarff(filename)[0]
        X = vec.fit_transform(np.array([{str(i): value for i, value in enumerate(list(row)[:-1])} for row in data])).toarray()
        imp.fit(X)
        X = imp.transform(X)
        labels = np.array([row[-1] for row in data])
        y = np.array([{v: k for k, v in enumerate(list(set(labels)))}[label] for label in labels])

        random = np.random.permutation(range(len(X)))
        print('dataset ratio\t%s' % ('\t'.join([alg + " " * (12 - len(alg)) for alg in sorted(ALG.keys())])))
        for iteration in xrange(10):
            X, y, class_num, kf = X[random], y[random], set(labels), KFold(len(X), n_folds=10)
            for train, test in kf:
                length, train_size = len(train), 0.1
                X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                X_label, X_unlabel, y_label, y_unlabel = train_test_split(X_train, y_train, test_size=1.0-train_size, random_state=0)
                for R in xrange(2, 10):
                    ones_matrix, cost_matrix = np.array([[1, 1], [1, 1]]), np.array([[1, 1], [R, R]])
                    # print "%s R=%d"%(basename,R),
                    cross_validation("%s R=%d" % (basename, R), X_label, X_unlabel, y_label, y_unlabel, ones_matrix, cost_matrix)
    exit()
Author: qiangsiwei, Project: semi-supervied_learning, Lines: 28, Source: test_weight_KNN.py
Example 9: plot_ROCList
def plot_ROCList(clfList, data, labels, stringList=""):
    """
    Plot an ROC curve for each classifier in clfList, training on a single 80/20 split
    :param clfList:
    :param data:
    :param labels:
    :param stringList:
    :return:
    """
    if stringList == "":
        stringList = ["" for i in range(len(labels))]

    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # Cross-validate on the data once using each model to get a ROC curve
    AUCs, fprs, tprs, threshs = cvList(data, labels, clfList)

    # Plot a ROC for each clf in clfList
    for i in range(len(clfList)):
        fpr = fprs[i]
        tpr = tprs[i]
        plt.plot(fpr, tpr)
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(stringList[i] + " ROC Curve, AUC = " + str(AUCs[i]))
        plt.savefig(stringList[i] + "_ROC.png")
        plt.close()
        print(stringList[i] + ":" + str(AUCs[i]))
Author: danielgeng, Project: cs249_big_data_analytics, Lines: 28, Source: ml_models.py
Example 10: computePearson
def computePearson(args):
    filter(args)

    with open(args.feature_file, 'r') as fp:
        features = [line for line in fp.read().splitlines()
                    if not line.startswith('#')]
    X = loadtxt(TMP_DATA_FILE)
    y = loadtxt(TMP_LABEL_FILE)
    assert X.shape[0] == y.shape[0]
    assert X.shape[1] == len(features)

    imputer = Imputer(strategy='median', copy=False)
    X = imputer.fit_transform(X)

    if args.output_file:
        with open(args.output_file, 'w') as fp:
            print >> fp, '\t'.join(['feature', 'coeff', 'pvalue'])
            for i in range(len(features)):
                coeff, pvalue = pearsonr(X[:, i], y)
                print >> fp, '%s\t%f\t%f' % (features[i], coeff, pvalue)

    if args.group_output_file:
        groups = getGroups(features)
        index = {features[i]: i for i in range(len(features))}
        with open(args.group_output_file, 'w') as fp:
            print >> fp, '\t'.join(['prefix', 'feature1', 'feature2', 'coeff', 'pvalue'])
            for prefix, group in groups.iteritems():
                for i in range(len(group)):
                    for j in range(i+1, len(group)):
                        coeff, pvalue = pearsonr(X[:, index[group[i]]], X[:, index[group[j]]])
                        print >> fp, '%s\t%s\t%s\t%f\t%f' % (
                            prefix, group[i], group[j], coeff, pvalue)
Author: galabing, Project: qd2, Lines: 34, Source: compute_pearson.py
Example 11: gettestdata
def gettestdata(fil):
    data = np.genfromtxt(fil, delimiter=',')
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    X = imp.fit_transform(data[:, 2:])
    X = scale(X).copy()
    # spr.eliminate_zeros()
    return np.array(X)
Author: jmannhei, Project: ml-rain, Lines: 7, Source: branchtest.py
Example 12: get_some_data
def get_some_data():
    data = melbourne_data
    y = data.Price
    X = data[cols_to_use]
    my_imputer = Imputer()
    imputed_X = my_imputer.fit_transform(X)
    return imputed_X, y
Author: muxiaobai, Project: CourseExercises, Lines: 7, Source: partial_dependence_plots.py
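get_some_data relies on two module-level names, melbourne_data and cols_to_use, that are defined elsewhere in the source file. A hedged sketch of what they might look like; the CSV path and column names are invented for illustration and would need to match your own copy of the data.

import pandas as pd

# Hypothetical setup for the globals used above
melbourne_data = pd.read_csv('melb_data.csv')
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']

X, y = get_some_data()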
Example 13: calcEdges
def calcEdges(data):
    n = len(data)
    usersDic = {}
    usersId = 0
    moviesDic = {}
    moviesId = 0
    for i in range(n):
        r = data[i]
        if r[0] not in moviesDic:
            moviesDic[r[0]] = moviesId
            moviesId += 1
        if r[1] not in usersDic:
            usersDic[r[1]] = usersId
            usersId += 1

    E = np.zeros((moviesId, usersId))
    # E = np.full((moviesId, usersId), np.nan)
    for i in range(n):
        user = usersDic[data[i][1]]
        movie = moviesDic[data[i][0]]
        E[movie, user] = data[i][2]

    estimator = Imputer(0, strategy='mean')
    # estimator = SoftImpute()
    # estimator.fit(E)
    # E = estimator.predict(E)
    E = estimator.fit_transform(E)

    return E, usersDic, moviesDic
Author: chenchfort, Project: NetflixRecommender, Lines: 26, Source: recommend.py
Example 14: bnp_svm
def bnp_svm(train, test):
    print('bnpsvm')
    ## If a value is missing, set it to the average
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # print("cleaning data")
    train = train.sample(1000)

    ## set up training data
    train1 = train.select_dtypes(include=['float64'])
    imp.fit(train1)
    train1 = imp.transform(train1)
    train1 = np.array(train1).astype(float)

    ## set up real y
    target = np.array(train['target']).astype(int)

    ## set up testing data
    test1 = test.select_dtypes(include=['float64'])
    test1 = imp.transform(test1)
    test1 = np.array(test1).astype(float)

    # print("training...")
    clf = svm.SVC(gamma=0.001, C=100, probability=True)
    # print("testing")
    clf.fit(train1, target)
    # print("predicting")
    yhat = clf.predict_proba(test1)
    return yhat

# print(bnp_svm(train, test))
Author: debanjum, Project: KaggleBNP, Lines: 33, Source: svm.py
Example 15: load_datasets
def load_datasets(feature_paths, label_paths):
    '''
    Read the feature files and label files and return their contents.
    '''
    # Define the feature array with 41 columns (one per feature dimension) and an empty label array with 1 column
    feature = np.ndarray(shape=(0, 41))
    label = np.ndarray(shape=(0, 1))

    for file in feature_paths:
        # Read one feature file with pandas read_table, using ',' as the delimiter, '?' as the missing-value marker, and no header row
        # df = pd.read_table(file, delimiter=',', na_values='?', header=None)
        # pandas.read_csv(source, encoding='utf-8', parse_dates=parse column 0 as dates, index_col=column to use as the row index)
        df = pd.read_csv(file, encoding='utf-8', parse_dates=[0], index_col=0)
        # DataFrame.sort_index(axis=0 (sort by the row index), ascending=True, inplace=False (whether to overwrite the original data))
        # sort the data by time in ascending order
        # df.sort_index(0, ascending=True, inplace=True)

        # Use Imputer with strategy='mean' to fill missing values with the column mean
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        # fit() trains the preprocessor; transform() produces the preprocessed result
        imp.fit(df)
        df = imp.transform(df)

        # Append the preprocessed data to feature; repeat for every feature file
        feature = np.concatenate((feature, df))

    # Read the label files
    for file in label_paths:
        df = pd.read_table(file, header=None)
        label = np.concatenate((label, df))

    # Flatten the labels into a one-dimensional vector
    label = np.ravel(label)
    return feature, label
Author: HanKin2015, Project: ACM, Lines: 32, Source: 机器学习标准模板.py
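A hedged usage sketch for load_datasets, assuming numpy, pandas, and Imputer are imported as in the source file; the file paths are invented placeholders and would need to point at feature files with 41 columns and matching label files.

# Hypothetical feature/label file paths
feature_paths = ['A/A.feature', 'B/B.feature']
label_paths = ['A/A.label', 'B/B.label']

feature, label = load_datasets(feature_paths, label_paths)
print(feature.shape, label.shape)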
Example 16: run_importance
def run_importance(clf, data, labels, feature_labels=[""], string=""):
    """
    Fit a classifier using all the data and plot the feature importances
    :param clf: Classifier object that has feature_importances_ member
    :param feature_labels: names of the features
    :param string: classifier name
    :return: (void) plot Gini importance vs feature
    """
    num_features = data.shape[1]
    importances = [0] * num_features

    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # run the classifier 100 times and average the importance found after each fit
    for r in range(100):
        clf.fit(data, labels)
        importances = [importances[i] + clf.feature_importances_[i] for i in range(num_features)]
    importances = [importance / 100 for importance in importances]

    # Filter out the features that have 0 importance (e.g. values are all 0)
    # non_zeros are the indices in feature_importances that are not 0
    non_zeros = [i for i in range(num_features) if not importances[i] == 0]
    importances = [importances[i] for i in non_zeros]
    feature_labels = [feature_labels[i] for i in non_zeros]

    # Plot the features
    bar_width = 0.7
    plt.bar(range(len(feature_labels)), importances, bar_width)
    plt.xticks([ind + float(bar_width) / 2 for ind in range(len(feature_labels))], feature_labels, rotation="vertical")
    plt.gcf().subplots_adjust(bottom=0.35)
    plt.xlabel("Feature")
    plt.ylabel("Gini Importance")
    plt.title("Gini Importance v. Features for " + string + " Classifier")
    plt.show()
Author: danielgeng, Project: cs249_big_data_analytics, Lines: 35, Source: ml_models.py
Example 17: imputed_data
def imputed_data(df, colname, strategy="mean"):
    from sklearn.preprocessing import Imputer
    imr = Imputer(missing_values="NaN", strategy=strategy, axis=0)
    imr = imr.fit(df[colname].values.reshape(-1, 1))
    imputed_data = imr.transform(df[colname].values.reshape(-1, 1))
    df[colname] = imputed_data
    print("Data has been imputed to \"{}\"".format(colname))
Author: reinka, Project: coding, Lines: 7, Source: Titanic+2.py
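A hedged usage sketch for imputed_data, which fills a single column in place; pandas and numpy imports are assumed and the toy DataFrame is invented for illustration.

import numpy as np
import pandas as pd

toy = pd.DataFrame({'Age': [22.0, np.nan, 35.0, np.nan]})
imputed_data(toy, 'Age')                       # NaN ages become the column mean (28.5)
# imputed_data(toy, 'Age', strategy='median')  # or impute with the median instead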
Example 18: run_clfList
def run_clfList(clfList, stringList="", normalize=False):
    """
    Run 100-fold 80/20 cross-validation on each classifier in clfList
    print the average AUC for each classifier
    :param clfList: list of classifiers to run
    :param stringList: names of the classifiers
    :param normalize: whether or not to normalize the data
    :return: the average AUC for each classifier in clfList
    """
    # data, labels = six_features(force=False)
    # data, labels = six_and_time_features(force=False)
    # data, labels = five_features(force=False)
    # data, labels = five_and_rts(force=False)
    data, labels = new_features()

    if normalize:
        data = normalize_data(data)

    imp = Imputer(missing_values=np.NaN, strategy="mean")
    data = imp.fit_transform(data)

    # Cross-validate all clfs 100 times
    means = kfoldcvList(data, labels, clfList, 100)

    if stringList == "":
        stringList = ["" for i in range(len(labels))]

    # Print out the mean AUCs
    for i, mean in enumerate(means):
        print(stringList[i] + ": " + str(mean))

    for mean in means:
        sys.stdout.write(str(mean) + " & ")
    sys.stdout.write("\n")

    return means
Author: danielgeng, Project: cs249_big_data_analytics, Lines: 33, Source: ml_models.py
Example 19: run_main
def run_main(new_file, start, stop, dat):
    with open(new_file, 'a') as file:
        imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=1)
        import itertools
        with open(dat, "r") as text_file:
            for line in itertools.islice(text_file, start, stop):
                line = line.replace("NA", "NaN")
                content = line.rstrip('\n').split('\t')
                CpG = content.pop(0)
                flag, CpG_location = get_location(CpG)
                if flag == 'F':
                    continue
                genotype_matrix = get_genotypes(CpG_location)
                genotype_matrix = imp.transform(genotype_matrix)
                genotype_matrix = genotype_matrix.transpose()

                # run PCA
                try:
                    PCA_matrix = run_pca(genotype_matrix)
                except ValueError:
                    print("value error")
                    continue

                # run linear regression
                meth_values = pd.Series(content, name="meth_val", dtype=float)
                model = sm.OLS(meth_values, PCA_matrix)
                results = model.fit()
                MethValResids = results.resid
                final = pd.Series(CpG)
                final = final.append(MethValResids)
                fline = final.tolist()
                fline = '\t'.join(str(x) for x in fline)
                fline = fline + "\n"
                file.write(fline)
Author: CrystalHumphries, Project: MethylationCorrelationBlock, Lines: 34, Source: try_library.py
Example 20: avg_message_count_by_group
def avg_message_count_by_group(df_users, df_messages, df_user_features):
    columns = ["f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10"]
    features = df_user_features[list(columns)].values

    # Impute missing values to retain all sample data
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    X = imp.fit_transform(features)

    # Preprocess dataset and standardize features to have normally distributed data
    # MaxAbsScaler allows scaled features to lie between -1 and +1
    X = MaxAbsScaler().fit_transform(X)

    # Apply PCA decomposition and use first 3 components that explain 75% of variance
    reduced_data = decomposition.PCA(n_components=3).fit_transform(X)

    kmeans = KMeans(init='k-means++', n_clusters=5, n_init=10)

    # Predict which group each user belongs to
    cluster_labels = kmeans.fit_predict(reduced_data)
    df_user_features['group.id'] = cluster_labels

    # Call utility function to join the two dataframes
    df_joined_users_messages = get_merged_dataframes(df_users, df_messages)
    df_joined_users_messages_features = get_merged_dataframes(df_user_features, df_joined_users_messages)

    # Only keep messages that were received since signing up
    df_joined_users_messages_features = df_joined_users_messages_features[
        df_joined_users_messages_features['message.date'] >= df_joined_users_messages_features['signup.date']]

    # Get the average message count grouped by group.id
    avg_message_count = df_joined_users_messages_features.groupby('group.id')['message.count'].mean()

    # Return the average message count grouped by user groups and rounded to 2 decimals
    return np.round(avg_message_count.tolist(), decimals=2)
Author: anjalibshah, Project: data-science-projects, Lines: 35, Source: Classifying&Clustering_Clients.py
Note: The sklearn.preprocessing.Imputer class examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The code snippets were selected from open-source projects contributed by various developers; copyright of the source code belongs to the original authors, and distribution and use should follow the license of the corresponding project. Do not reproduce without permission.