This article collects typical usage examples of the Python function xgboost.plot_importance. If you have been wondering what exactly plot_importance does, how to call it, and what real-world usage looks like, the curated code samples below should help.
Twenty code examples of the plot_importance function are shown below, sorted by popularity by default. You can upvote the examples you like or find useful; your votes help the system recommend better Python code samples.
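Before the numbered examples, here is a minimal, self-contained sketch of the typical call pattern. The data and parameters are synthetic placeholders for illustration, not taken from any example below:

import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt

# Train a tiny booster on synthetic data, then plot feature importance.
X = np.random.randn(200, 5)
y = (X[:, 0] + X[:, 1] > 0).astype(int)
dtrain = xgb.DMatrix(X, label=y)
bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=20)

ax = xgb.plot_importance(bst)  # returns a matplotlib Axes
plt.show()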
Example 1: train_helper

def train_helper(X_train, X_test, y_train, y_test, model_name):
    xg_train = xgboost.DMatrix(X_train, label=y_train)
    xg_test = xgboost.DMatrix(X_test, label=y_test)
    le = load_label_encoder(model_name)
    param = {}
    # use softmax multi-class classification
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.002
    param['max_depth'] = 7
    param['nthread'] = 7
    param['num_class'] = len(le.classes_)
    param['eval_metric'] = 'merror'
    evals = [(xg_train, 'train'), (xg_test, 'eval')]
    # Train xgboost
    print("Training classifier...")
    t1 = time.time()
    bst = xgboost.train(param, xg_train, 500, evals, early_stopping_rounds=10)
    xgboost.plot_importance(bst)
    t2 = time.time()
    print(t2 - t1)
    bst.save_model(classifier_filename(model_name))
    return bst

Developer: smurching, Project: pokemon_ai, Lines: 26, Source: model_xgb_tiered.py
Example 2: test_importance_plot_lim
def test_importance_plot_lim(self):
    np.random.seed(1)
    dm = xgb.DMatrix(np.random.randn(100, 100), label=[0, 1] * 50)
    bst = xgb.train({}, dm)
    assert len(bst.get_fscore()) == 71
    ax = xgb.plot_importance(bst)
    assert ax.get_xlim() == (0., 11.)
    assert ax.get_ylim() == (-1., 71.)
    ax = xgb.plot_importance(bst, xlim=(0, 5), ylim=(10, 71))
    assert ax.get_xlim() == (0., 5.)
    assert ax.get_ylim() == (10., 71.)
Developer: BayronP, Project: xgboost, Lines: 12, Source: test_plotting.py
Example 3: run_xgb

def run_xgb(train, test, features, target, random_state=0):
    eta = 0.02
    max_depth = 5
    subsample = 0.75
    colsample_bytree = 0.7
    start_time = time.time()
    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "multi:softprob",
        "num_class": 12,
        "booster": "gbtree",
        "eval_metric": "mlogloss",
        "eta": eta,
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 500 * 2
    early_stopping_rounds = 50
    test_size = 0.3
    X_train, X_valid = train_test_split(train, test_size=test_size, random_state=random_state)
    print('Length train:', len(X_train.index))
    print('Length valid:', len(X_valid.index))
    y_train = X_train[target]
    y_valid = X_valid[target]
    dtrain = xgb.DMatrix(X_train[features], y_train)
    dvalid = xgb.DMatrix(X_valid[features], y_valid)
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
    print("importance of features")
    xgb.plot_importance(gbm)
    show()
    # time.sleep(60*5)
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(X_valid[features]), ntree_limit=gbm.best_iteration)
    score = log_loss(y_valid.tolist(), check)
    print("Predict test set...")
    test_prediction = gbm.predict(xgb.DMatrix(test[features]), ntree_limit=gbm.best_iteration)
    print('Training time: {} minutes'.format(round((time.time() - start_time) / 60, 2)))
    return test_prediction.tolist(), score

Developer: worldwar2008, Project: kg, Lines: 51, Source: gq.py
Example 4: run_train_validation

def run_train_validation(self):
    x_train, y_train, x_validation, y_validation = self.get_train_validationset()
    dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=x_train.columns)
    dvalidation = xgb.DMatrix(x_validation, label=y_validation, feature_names=x_validation.columns)
    self.set_xgb_parameters()
    evals = [(dtrain, 'train'), (dvalidation, 'eval')]
    model = xgb.train(self.xgb_params, dtrain, evals=evals, **self.xgb_learning_params)
    xgb.plot_importance(model)
    plt.show()
    print("features used:\n {}".format(self.get_used_features()))
    return

Developer: LevinJ, Project: Supply-demand-forecasting, Lines: 14, Source: xgbbasemodel.py
Example 5: test_sklearn_plotting
def test_sklearn_plotting():
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_iris

    iris = load_iris()
    classifier = xgb.XGBClassifier()
    classifier.fit(iris.data, iris.target)

    import matplotlib
    matplotlib.use('Agg')
    from matplotlib.axes import Axes
    from graphviz import Digraph

    ax = xgb.plot_importance(classifier)
    assert isinstance(ax, Axes)
    assert ax.get_title() == 'Feature importance'
    assert ax.get_xlabel() == 'F score'
    assert ax.get_ylabel() == 'Features'
    assert len(ax.patches) == 4

    g = xgb.to_graphviz(classifier, num_trees=0)
    assert isinstance(g, Digraph)

    ax = xgb.plot_tree(classifier, num_trees=0)
    assert isinstance(ax, Axes)
Developer: ChangXiaodong, Project: xgboost-withcomments, Lines: 27, Source: test_with_sklearn.py
Example 6: test_plotting
def test_plotting(self):
    bst2 = xgb.Booster(model_file='xgb.model')
    # plotting
    import matplotlib
    matplotlib.use('Agg')
    from matplotlib.axes import Axes
    from graphviz import Digraph

    ax = xgb.plot_importance(bst2)
    assert isinstance(ax, Axes)
    assert ax.get_title() == 'Feature importance'
    assert ax.get_xlabel() == 'F score'
    assert ax.get_ylabel() == 'Features'
    assert len(ax.patches) == 4

    ax = xgb.plot_importance(bst2, color='r',
                             title='t', xlabel='x', ylabel='y')
    assert isinstance(ax, Axes)
    assert ax.get_title() == 't'
    assert ax.get_xlabel() == 'x'
    assert ax.get_ylabel() == 'y'
    assert len(ax.patches) == 4
    for p in ax.patches:
        assert p.get_facecolor() == (1.0, 0, 0, 1.0)  # red

    ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'],
                             title=None, xlabel=None, ylabel=None)
    assert isinstance(ax, Axes)
    assert ax.get_title() == ''
    assert ax.get_xlabel() == ''
    assert ax.get_ylabel() == ''
    assert len(ax.patches) == 4
    assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0)  # red
    assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0)  # red
    assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
    assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # blue

    g = xgb.to_graphviz(bst2, num_trees=0)
    assert isinstance(g, Digraph)

    ax = xgb.plot_tree(bst2, num_trees=0)
    assert isinstance(ax, Axes)
Developer: ndingwall, Project: xgboost, Lines: 44, Source: test_basic.py
Example 7: save_topn_features
def save_topn_features(self, fname="XGBRegressor_topn_features.txt", topn=-1):
    ax = xgb.plot_importance(self.model)
    yticklabels = ax.get_yticklabels()[::-1]
    if topn == -1:
        topn = len(yticklabels)
    else:
        topn = min(topn, len(yticklabels))
    with open(fname, "w") as f:
        for i in range(topn):
            f.write("%s\n" % yticklabels[i].get_text())
Developer: Anhmike, Project: Kaggle_HomeDepot, Lines: 10, Source: xgb_utils.py
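As a side note, the same top-n ranking can be read straight from the model without scraping the plot's tick labels. A minimal sketch, assuming a fitted Booster (the function name is illustrative, not from the project above):

def topn_features_from_booster(bst, topn):
    # get_fscore() maps feature name -> split count (the "F score" shown in the plot).
    scores = bst.get_fscore()
    return sorted(scores, key=scores.get, reverse=True)[:topn]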
Example 8: plot_feat_importances

def plot_feat_importances():
    # XGBoost importances (X_train, y_train, df_xgb and the fitted
    # random forest rf come from the surrounding script)
    gbm = xgboost.XGBClassifier(silent=False, seed=8).fit(X_train, y_train)
    plot = xgboost.plot_importance(gbm)
    ticks = plot.set_yticklabels(df_xgb.columns)
    # Random-forest importances, with the spread across trees as error bars
    importances = rf.feature_importances_
    std = np.std([tree.feature_importances_ for tree in rf.estimators_],
                 axis=0)
    indices = np.argsort(importances)
    plt.barh(range(len(indices)), importances[indices], yerr=std[indices], color='lightblue')
    ticks = plt.yticks(range(len(indices)), df_xgb.columns)

Developer: Nathx, Project: ride_sharing_churn, Lines: 11, Source: churn.py
Example 9: plot_importance

def plot_importance(self, ax=None, height=0.2,
                    xlim=None, title='Feature importance',
                    xlabel='F score', ylabel='Features',
                    grid=True, **kwargs):
    """Plot importance based on fitted trees.

    Parameters
    ----------
    ax : matplotlib Axes, default None
        Target axes instance. If None, new figure and axes will be created.
    height : float, default 0.2
        Bar height, passed to ax.barh()
    xlim : tuple, default None
        Tuple passed to axes.xlim()
    title : str, default "Feature importance"
        Axes title. To disable, pass None.
    xlabel : str, default "F score"
        X axis title label. To disable, pass None.
    ylabel : str, default "Features"
        Y axis title label. To disable, pass None.
    kwargs :
        Other keywords passed to ax.barh()

    Returns
    -------
    ax : matplotlib Axes
    """
    import xgboost as xgb
    if not isinstance(self._df.estimator, xgb.XGBModel):
        raise ValueError('estimator must be XGBRegressor or XGBClassifier')
    # Note: the original passed grid=True unconditionally; forwarding the
    # grid argument keeps the parameter meaningful.
    return xgb.plot_importance(self._df.estimator.booster(),
                               ax=ax, height=height, xlim=xlim, title=title,
                               xlabel=xlabel, ylabel=ylabel, grid=grid, **kwargs)

Developer: Sandy4321, Project: pandas-ml, Lines: 36, Source: base.py
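Because the wrapper forwards its keyword arguments to xgboost.plot_importance, the usual Axes-targeting pattern applies. A hedged usage sketch (bst stands in for any fitted Booster; the figure size and limits are arbitrary):

import matplotlib.pyplot as plt
import xgboost as xgb

fig, ax = plt.subplots(figsize=(6, 8))
xgb.plot_importance(bst, ax=ax, height=0.4,
                    xlim=(0, 10), title='Feature importance',
                    xlabel='F score', ylabel='Features')
fig.tight_layout()
plt.show()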
Example 10: range

fscore_lo = np.percentile(fscore, 2.5, axis=0)
fscore_hi = np.percentile(fscore, 97.5, axis=0)
ind_sort = np.array(np.argsort(fscore_mean))
fscore_mean_sorted = fscore_mean[ind_sort]
# ci_sorted = fscore_ci[ind_sort]
fscore_lo_sorted = fscore_lo[ind_sort]
fscore_hi_sorted = fscore_hi[ind_sort]
feature_label_sorted = feature_label[ind_sort]
feature_label_short = []
for i in range(feature_label_sorted.size):
    feature_label_short.append(dic[feature_label_sorted[i]])

get_ipython().magic(u'matplotlib inline')
plt.figure(figsize=(4, 12))
axes = plt.gca()
# plt.barh(np.arange(val_sorted.size), val_sorted, xerr=ci_sorted, height=.7, color=(.4,.4,.8), align='center', ecolor=(0,0,0))
plt.barh(np.arange(fscore_mean_sorted.size), fscore_mean_sorted,
         xerr=np.array([fscore_mean_sorted - fscore_lo_sorted, fscore_hi_sorted - fscore_mean_sorted]),
         height=.7, color=(.4, .4, .8), align='center', ecolor=(0, 0, 0))
plt.yticks(np.arange(len(feature_label_short)), feature_label_short, fontsize=12, color=(0, 0, 0))
# axes.set_ylim([3.5, len(feature_label_short)-9.5])
# axes.set_xlim([0, 0.04])
plt.box(on=False)
plt.xlabel('Gini Importance', fontsize=14)
plt.grid()

# In[ ]:
np.percentile(fscore, 2.5, axis=0)
xgb.plot_importance()  # note: as written this raises; a fitted Booster must be passed, e.g. xgb.plot_importance(bst)

Developer: sosata, Project: CS120DataAnalysis, Lines: 29, Source: show_importance.py
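The fscore array percentiled above is not built in this excerpt; given the axis=0 reductions, a plausible shape is (n_runs, n_features), collected by re-training on resamples. A hedged sketch of such an assembly (n_runs, params, and resample_dmatrix are hypothetical names, not from the original project):

import numpy as np
import xgboost as xgb

fscore = np.zeros((n_runs, len(feature_label)))
for r in range(n_runs):
    bst = xgb.train(params, resample_dmatrix(r))  # hypothetical: DMatrix from the r-th bootstrap resample
    scores = bst.get_fscore()  # feature name -> split count
    for j, name in enumerate(feature_label):
        fscore[r, j] = scores.get(name, 0.0)  # features never split on score 0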
示例11: print
del xgb_train, xgb_val
gc.collect()
cv_scores.append(roc_auc_score(y_val, bst.predict(xgb.DMatrix(X_val), ntree_limit=bst.best_ntree_limit)))
print(cv_scores)
print('predicting...')
if i == 0:
pred = bst.predict(xgb.DMatrix(np.array(test_x)),
ntree_limit=bst.best_ntree_limit)
else:
pred += bst.predict(xgb.DMatrix(np.array(test_x)),
ntree_limit=bst.best_ntree_limit)
del train_x, train_y
gc.collect()
print('mean_score:', np.mean(cv_scores))
pred /= folds
df_test['is_churn'] = pred.clip(0.0000001, 0.999999)
df_test = df_test[['msno', 'is_churn']]
# df_test.to_csv(out_path + 'stack_submissions{}.csv'.format(datetime.now().strftime("%Y%m%d-%H%M%S")), index=False)
df_test = []
plt.rcParams['figure.figsize'] = (7.0, 7.0)
xgb.plot_importance(booster=bst)
plt.show()
# plt.savefig('./feature_importance.png', dpi=100)
开发者ID:zgcgreat,项目名称:WSDM,代码行数:30,代码来源:stacking_fit.py
示例12: print
cv_xgb = xgb.cv(params = our_params, dtrain = xgdmat, num_boost_round = 3000, nfold = 5,
metrics = ['error'], # Make sure you enter metrics inside a list or you may encounter issues!
early_stopping_rounds = 100) # Look for early stopping that minimizes error
print('Tail:\n')
print(cv_xgb.tail(5))
our_params = {'eta': 0.1, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8,
'objective': 'binary:logistic', 'max_depth':3, 'min_child_weight':1}
print('Final Train: \n')
final_gb = xgb.train(our_params, xgdmat, num_boost_round = 432)
xgb.plot_importance(final_gb)
plt.show()
#Predicting:
testdmat = xgb.DMatrix(X_pred)
y_pred = final_gb.predict(testdmat)
y_pred[y_pred > 0.5] = 1
y_pred[y_pred <= 0.5] = 0
y_pred = y_pred.astype(np.int64)
#Submission
submission = pd.DataFrame({
"PassengerId": test_df["PassengerId"],
"Survived": y_pred
开发者ID:grixxy,项目名称:ml_python,代码行数:30,代码来源:TitanicXGBoost.py
示例13: print
##xgboost.train(params, dtrain, num_boost_round=10, evals=(), obj=None,
## feval=None, maximize=False, early_stopping_rounds=None,
## evals_result=None, verbose_eval=True, learning_rates=None,
## xgb_model=None)
#
evallist = [(dtest,'eval'), (dtrain,'train')]
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
evals_result = {}
num_round = 10
bst = xgb.train(param,xg_train, num_round, evals_result=evals_result)
pred = bst.predict(xg_test)
print ('predicting, classification error=%f' % (sum( int(pred[i]) != test_Y[i] for i in range(len(test_Y))) / float(len(test_Y)) ))
xgb.plot_importance(bst)
xgb.plot_tree(bst, num_trees=2)
#=============Logistic Regression==============================================================
#Define sigmoid function
def sigmoid(z):
return 1 / (1 + e**(-z))
#Calcualte the cost to be minimized -- using the sigmoid function
def cost(theta, X, y, l):
m = X.shape[0] #Number of rows in the data
z = X.dot(theta)
O = (-1 / m) * (log(sigmoid(z)).T.dot(y) + log(1-sigmoid(z)).T.dot((1-y)))
# print(m)
# print(theta)
开发者ID:tijohnso,项目名称:Usyd_masters,代码行数:31,代码来源:mlass1_9.py
Example 14: format

# test = []
pred2 = model.predict(dtest)
df2 = pd.DataFrame()
df2["Orginal"] = testDelay
df2["Predicted"] = pred2
df2.to_csv('compareDelay.csv', index=False)

import matplotlib.pyplot as plt
plt.style.use("ggplot")
# Map xgboost's generated feature names (f0, f1, ...) back to the original column names.
mapper = {'f{0}'.format(i): v for i, v in enumerate(train.columns)}
mapped = {mapper[k]: v for k, v in model.get_fscore().items()}
import operator
mapped = sorted(mapped.items(), key=operator.itemgetter(1))
# plot_importance accepts a dict of importances, so convert the sorted pairs back.
xgb.plot_importance(dict(mapped))
plt.show()

df = pd.DataFrame(mapped, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()
df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(25, 15))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.gcf().savefig('feature_importance_xgb.png')

xx = np.linspace(-10, 500)
yy = xx
h0 = plt.plot(xx, yy, 'k-', label="ideal Values")
plt.scatter(df2.Orginal, df2.Predicted, c='y')
plt.legend()
plt.show()

Developer: DEK11, Project: Predicting-EOB-delay, Lines: 31, Source: linear.py
Example 15: plot_importance

def plot_importance(self):
    ax = xgb.plot_importance(self.model)
    self.save_topn_features()
    return ax

Developer: Anhmike, Project: Kaggle_HomeDepot, Lines: 4, Source: xgb_utils.py
示例16: print
model = xgb.train(params, dtrain, 200, watchlist, maximize=True, early_stopping_rounds = 25, verbose_eval=5)
del dvalid
else:
dtrain = xgb.DMatrix(train, y)
del train, y
gc.collect()
watchlist = [(dtrain, 'train')]
model = xgb.train(params, dtrain, 30, watchlist, maximize=True, verbose_eval=1)
del dtrain
gc.collect()
print('[{}] Finish XGBoost Training'.format(time.time() - start_time))
# Plot the feature importance from xgboost
plot_importance(model)
plt.gcf().savefig('feature_importance_xgb.png')
# Load the test for predict
test = pd.read_csv(path+"test.csv", usecols=test_columns, dtype=dtypes)
test = pd.merge(test, ip_count, on='ip', how='left', sort=False)
del ip_count
gc.collect()
sub['click_id'] = test['click_id'].astype('int')
test['clicks_by_ip'] = test['clicks_by_ip'].astype('uint16')
test = timeFeatures(test)
test.drop(['click_id', 'ip'], axis=1, inplace=True)
dtest = xgb.DMatrix(test)
del test
开发者ID:ashukumar27,项目名称:Kaggle,代码行数:31,代码来源:01_xgboost.py
示例17: print
reg_alpha=0.05,
reg_lambda=2,
subsample=1.0,
colsample_bytree=1.0,
max_delta_step=1,
scale_pos_weight=1,
objective='multi:softprob',
nthread=8,
seed=0 # ,
# silent = False
)
print('training...')
xgb_model.fit(training, label)
print('predicting...')
predicted = xgb_model.predict_proba(testing)
predicted = pandas.DataFrame(predicted)
predicted.columns = xgb_model.classes_
# Name index column.
predicted.index.name = 'Id'
# Write csv.
print('Saving prediction...')
predicted.to_csv('Prediction.csv')
# feature importance
feat_imp = pandas.Series(xgb_model.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
matplotlib.pyplot.show()
plot_importance(xgb_model, title='Feature importance')
matplotlib.pyplot.show()
plot_tree(xgb_model, num_trees=0)
matplotlib.pyplot.show()
开发者ID:MichaelPluemacher,项目名称:San-Francisco-crimes,代码行数:30,代码来源:XGBoost_model.py
Example 18: train

def train(param, num_round=1000, early_stopping_rounds=20):
    exec_time = time.strftime("%Y%m%d%I%p%M", time.localtime())
    os.mkdir('{0}_{1}'.format(model_path, exec_time))
    os.mkdir('{0}_{1}'.format(submission_path, exec_time))
    train_params = param.copy()
    train_params['num_boost_round'] = num_round
    train_params['early_stopping_rounds'] = early_stopping_rounds
    json.dump(train_params, open('{0}_{1}{2}'.format(model_path, exec_time, model_params), 'w+'))

    print('get training data')
    train_features = pd.read_csv(train_path + 'train_features.csv').astype(float)
    train_labels = pd.read_csv(train_path + 'labels.csv').astype(float)
    validate_features = pd.read_csv(validate_path + 'train_features.csv').astype(float)
    validate_labels = pd.read_csv(validate_path + 'labels.csv').astype(float)
    predict_features = pd.read_csv(predict_path + 'train_features.csv').astype(float)
    create_feature_map(train_features.columns.tolist(), '{0}_{1}{2}'.format(model_path, exec_time, model_fmap_file))
    train_matrix = xgboost.DMatrix(train_features.values, label=train_labels.values, feature_names=train_features.columns)
    val_matrix = xgboost.DMatrix(validate_features.values, label=validate_labels.values, feature_names=validate_features.columns)
    predict_matrix = xgboost.DMatrix(predict_features.values, feature_names=predict_features.columns)
    watchlist = [(train_matrix, 'train'), (val_matrix, 'eval')]

    print('model training')
    with open('{0}_{1}{2}'.format(model_path, exec_time, model_train_log), 'w+') as outf:
        sys.stdout = outf
        model = xgboost.train(param, train_matrix, num_boost_round=num_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds)
    sys.stdout = save_stdout
    print('model.best_score: {0}, model.best_iteration: {1}, model.best_ntree_limit: {2}'.format(model.best_score, model.best_iteration, model.best_ntree_limit))

    print('output offline model data')
    model.save_model('{0}_{1}{2}'.format(model_path, exec_time, model_file))
    model.dump_model('{0}_{1}{2}'.format(model_path, exec_time, model_dump_file))
    importance = model.get_fscore(fmap='{0}_{1}{2}'.format(model_path, exec_time, model_fmap_file))
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    df.to_csv('{0}_{1}{2}'.format(model_path, exec_time, model_feature_importance_csv), index=False)
    xgboost.plot_importance(model)
    plt.gcf().set_size_inches(20, 16)
    plt.gcf().set_tight_layout(True)
    plt.gcf().savefig('{0}_{1}{2}'.format(model_path, exec_time, model_feature_importance_file))
    plt.close()

    train_pred_labels = model.predict(train_matrix, ntree_limit=model.best_ntree_limit)
    val_pred_labels = model.predict(val_matrix, ntree_limit=model.best_ntree_limit)
    train_pred_frame = pd.Series(train_pred_labels, index=train_features.index)
    train_pred_frame.name = probability_consumed_label
    val_pred_frame = pd.Series(val_pred_labels, index=validate_features.index)
    val_pred_frame.name = probability_consumed_label
    train_true_frame = pd.read_csv(train_path + 'labels.csv')['Label']
    val_true_frame = pd.read_csv(validate_path + 'labels.csv')['Label']
    train_coupons = pd.read_csv(train_path + 'dataset.csv')
    val_coupons = pd.read_csv(validate_path + 'dataset.csv')
    train_check_matrix = train_coupons[[coupon_label]].join(train_true_frame).join(train_pred_frame)
    val_check_matrix = val_coupons[[coupon_label]].join(val_true_frame).join(val_pred_frame)
    print('Average auc of train matrix: ', check_average_auc(train_check_matrix))
    print('Average auc of validate matrix', check_average_auc(val_check_matrix))
    val_coupons = val_coupons.join(val_pred_frame).join(val_pred_frame.map(lambda x: 0. if x < 0.5 else 1.).rename('map')).join(val_true_frame)
    val_coupons.to_csv('{0}_{1}{2}'.format(model_path, exec_time, val_diff_file), index=False)
    print(confusion_matrix(val_coupons['Label'], val_coupons['map']))

    labels = model.predict(predict_matrix, ntree_limit=model.best_ntree_limit)
    frame = pd.Series(labels, index=predict_features.index)
    frame.name = probability_consumed_label

    plt.figure()
    frame.hist(figsize=(10, 8))
    plt.title('results histogram')
    plt.xlabel('predict probability')
    plt.gcf().savefig('{0}_{1}{2}'.format(submission_path, exec_time, submission_hist_file))
    plt.close()

    submission = pd.read_csv(predict_path + 'dataset.csv')
    submission = submission[[user_label, coupon_label, date_received_label]].join(frame)
    submission.to_csv('{0}_{1}{2}'.format(submission_path, exec_time, submission_file), index=False)

Developer: alex19911222, Project: O2O-Coupon-Usage-Forecast, Lines: 88, Source: xgb.py
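create_feature_map in the example above is project code that is not shown in this excerpt. A common minimal implementation, sketched here under the assumption that it writes the fmap format expected by Booster.get_fscore(fmap=...):

def create_feature_map(features, fmap_path):
    # xgboost fmap format: one "<index>\t<name>\t<type>" line per feature;
    # 'q' marks a quantitative feature.
    with open(fmap_path, 'w') as f:
        for i, name in enumerate(features):
            f.write('{0}\t{1}\tq\n'.format(i, name))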
Example 19: plot_importance_matrix

def plot_importance_matrix(self, vars_names):
    # pdb.set_trace()  # debugging leftover; halts execution if left in
    xgb.plot_importance(self.clf)

Developer: jgpavez, Project: transfer_learning, Lines: 3, Source: xgboost_wrapper.py
Example 20: XGBoost_regressor2

def XGBoost_regressor2():
    """
    Train an XGBoost model with the XGBoost lib.
    This method is mainly used to find the relative importance of
    the features.
    """
    train = xgb.DMatrix('train_libSVM.dat')
    all_train = xgb.DMatrix('all_train_libSVM.dat')
    test = xgb.DMatrix('test_libSVM.dat')
    validation = xgb.DMatrix('validate_libSVM.dat')
    param = {'max_depth': 11, 'eta': 0.002, 'silent': 1,
             'objective': 'reg:linear', 'gamma': 2.2,
             'subsample': 0.8, 'colsample_bytree': 0.7,
             'scale_pos_weight': 0.55, 'min_child_weight': 5,
             'n_jobs': 4}
    # 0.03 -> 900, 1600 without features of SVD similarity between
    # search term and other columns
    # eta      ntrees   error
    # 0.03  -> 900  ->  0.2397 * 2 = 4795
    # 0.025 -> 1900 ->  4782
    # 0.06  -> 640  ->  0.2400 * 2 = 4801
    # ######## common brand & SVD brand deleted ########
    # 0.03  -> 900  ->  0.2397 * 2 = 4794
    # ######## add KL distance ########
    # 0.03  -> 966  ->  0.2397
    # 0.03  -> 1102 ->  0.234 or so
    # ##### add spell checking
    # round = 200
    # depth = 12 -> 0.235371
    # depth = 11, min_cw = 5 -> 0.235316  SELECTED
    # depth = 10 -> 0.235840
    # depth = 9  -> 0.235912
    # depth = 8  -> 0.236202
    # min_child_weight = 6 -> 0.235679
    # min_child_weight = 4 -> 0.235478
    # ##### as of April 16
    # 3500, 0.01, 0.238908
    # 1500, 0.03, 0.238893
    # 750,  0.06, 0.239034
    watchlist = [(validation, 'eval'), (train, 'train')]
    # TODO: do data cleaning again.
    # add approximate matching
    # check KL distance
    # n = 1096
    num_round = 10000
    xgb_model = xgb.train(param, train, num_round, watchlist)
    # xgb_model = xgb.cv(param, all_train, num_round, nfold=5,
    #                    metrics={'error'})
    # print(xgb_model.head())
    # xgb_model.info()
    prediction = xgb_model.predict(test)
    importance = xgb_model.get_fscore(fmap='xgb.fmap')
    print(importance)
    sorted_importance = sorted(importance.items(),
                               key=operator.itemgetter(1))
    print(sorted_importance)
    importance_of_feature_file = open('importance_of_feature_file', 'w')
    pickle.dump(sorted_importance, importance_of_feature_file)
    importance_of_feature_file.close()
    xgb.plot_importance(xgb_model)
    test_id = pd.read_pickle('id_test')
    prediction = prediction * 2 + 1
    prediction[prediction > 3] = 3
    prediction[prediction < 1] = 1
    clean_result(prediction)
    pd.DataFrame({"id": test_id.values, "relevance": prediction})\
        .to_csv('submission.csv', index=False)

Developer: versemonger, Project: Home-Depot-Product-Search-Relevance, Lines: 71, Source: para_tuning.py
Note: the xgboost.plot_importance examples in this article were compiled by 纯净天空 from GitHub, MSDocs, and other source-code and documentation platforms. The snippets were selected from open-source projects contributed by various developers; copyright remains with the original authors, and any distribution or use should follow the corresponding project's license. Do not reproduce without permission.