
Classifier selection: an XGBoost code walkthrough with automatic parameter tuning


import xgboost as xgb
from xgboost import plot_importance
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import warnings
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
data = pd.read_excel(r'D:\csr.xlsx')   # load the raw dataset
X = data[['rank', 'K1', 'rpm', 'nm', 'min']]   # feature columns
X.head()
y = data[['tyy']]   # target column
y.head()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Build the training/test split of the feature and label arrays
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',   # multi-class classification
    'num_class': 10,                # number of classes; used together with multi:softmax
    'gamma': 0.1,       # controls post-pruning; larger is more conservative, typically around 0.1-0.2
    'max_depth': 12,    # depth of each tree; deeper trees overfit more easily
    'lambda': 2,        # L2 regularization on leaf weights; larger values resist overfitting
    'subsample': 0.7,   # fraction of training samples drawn for each tree
    'colsample_bytree': 0.7,   # fraction of features drawn for each tree (defaults to 1)
    'min_child_weight': 3,     # minimum sum of instance Hessians (h) required in a leaf.
    # For an imbalanced 0-1 problem where h is around 0.01, min_child_weight = 1
    # means a leaf must hold at least 100 samples. This parameter strongly affects
    # the result: the smaller it is, the easier the model overfits.
    'silent': 0,    # 1 suppresses run-time messages; 0 is usually better (deprecated in recent xgboost, use 'verbosity')
    'eta': 0.007,   # shrinkage step, analogous to a learning rate
    'seed': 1000,
    'nthread': 7,   # number of CPU threads
    # 'eval_metric': 'auc'
}
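The params dict above is never actually consumed later in the post; a minimal sketch of how it would drive the native xgb.train API, using the train/test split built earlier (num_boost_round=500 and the watchlist are illustrative choices, and num_class must match the real number of levels in tyy):

dtrain = xgb.DMatrix(X_train, label=y_train)   # wrap the split in DMatrix form
dtest = xgb.DMatrix(X_test, label=y_test)
watchlist = [(dtrain, 'train'), (dtest, 'eval')]
bst = xgb.train(params, dtrain, num_boost_round=500, evals=watchlist)
y_hat = bst.predict(dtest)   # multi:softmax returns class labels directly
print("native-API accuracy:", accuracy_score(y_test, y_hat))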
# xgboost
from xgboost import XGBClassifier
xgbc_model=XGBClassifier()

# Random forest
from sklearn.ensemble import RandomForestClassifier
rfc_model=RandomForestClassifier()

# ET (extra trees)
from sklearn.ensemble import ExtraTreesClassifier
et_model=ExtraTreesClassifier()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
gnb_model=GaussianNB()

# K-nearest neighbors
from sklearn.neighbors import KNeighborsClassifier
knn_model=KNeighborsClassifier()

# Logistic regression
from sklearn.linear_model import LogisticRegression
lr_model=LogisticRegression()

# Decision tree
from sklearn.tree import DecisionTreeClassifier
dt_model=DecisionTreeClassifier()

# Support vector machine
from sklearn.svm import SVC
svc_model=SVC()

# xgboost
xgbc_model.fit(X, y)

# Random forest
rfc_model.fit(X, y)

# ET (extra trees)
et_model.fit(X,y)

# Naive Bayes
gnb_model.fit(X,y)

# K-nearest neighbors
knn_model.fit(X,y)

# Logistic regression
lr_model.fit(X,y)

# Decision tree
dt_model.fit(X,y)

# Support vector machine
svc_model.fit(X,y)

from sklearn.model_selection import cross_val_score
print("\n5-fold cross-validated accuracy of each model (mean accuracy over the folds):")
print("\tXGBoost:", cross_val_score(xgbc_model, X, y, cv=5).mean())
print("\tRandom forest:", cross_val_score(rfc_model, X, y, cv=5).mean())
print("\tExtra trees:", cross_val_score(et_model, X, y, cv=5).mean())
print("\tGaussian naive Bayes:", cross_val_score(gnb_model, X, y, cv=5).mean())
print("\tK-nearest neighbors:", cross_val_score(knn_model, X, y, cv=5).mean())
print("\tLogistic regression:", cross_val_score(lr_model, X, y, cv=5).mean())
print("\tDecision tree:", cross_val_score(dt_model, X, y, cv=5).mean())
print("\tSupport vector machine:", cross_val_score(svc_model, X, y, cv=5).mean())
# Performance evaluation, using XGBoost as the example
model1 = XGBClassifier()
# Train the model on the training set
model1.fit(X_train, y_train)
# Predict on the test set
y_pred = model1.predict(X_test)
print("\nMean accuracy of the model (mean accuracy = (TP+TN)/(P+N))")
print("\tXGBoost:", model1.score(X_test, y_test))
# print('(y_test, y_pred)', y_test, y_pred)
print("\nPerformance evaluation:")
print("\tClassification report:\n", metrics.classification_report(y_test, y_pred))
print("\tConfusion matrix:\n", metrics.confusion_matrix(y_test, y_pred))
# Tuning max_depth and min_child_weight
# These two parameters have a large impact on the final result. max_depth is usually
# between 3 and 10, and the grid below scans min_child_weight over 1-5. We use grid
# search: first a coarse pass over a wide range, then a fine pass over a narrow one
# (see the sketch after the coarse results below).
# Note: scoring='roc_auc' in GridSearchCV only supports binary classification; for
# multi-class problems change scoring (the default scorer already handles multi-class).

param_test1 = {
    'max_depth': [i for i in range(3, 10, 2)],
    'min_child_weight': [i for i in range(1, 6, 2)]
}
from sklearn import model_selection
gsearch = model_selection.GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1,
                            n_estimators=140, max_depth=5,
                            min_child_weight=1,
                            gamma=0,
                            subsample=0.8,
                            colsample_bytree=0.8,
                            objective='binary:logistic',
                            nthread=4,
                            scale_pos_weight=1,
                            seed=27),
    param_grid=param_test1,
    n_jobs=4,
    cv=5)
gsearch.fit(X_train,y_train)
print('max_depth / min_child_weight search ------------------')
print('mean test scores:', gsearch.cv_results_['mean_test_score'])
print('best params:', gsearch.best_params_)
# 3D scatter of the coarse grid-search results: mean test score against
# max_depth and min_child_weight (zd holds the scores printed above)
fig = plt.figure(figsize=(10, 10))
ax1 = plt.axes(projection='3d')
zd = [0.93400966, 0.92067633, 0.92502415, 0.93400966, 0.92512077, 0.92502415,
      0.92956522, 0.92067633, 0.92502415, 0.92956522, 0.92067633, 0.92502415]
xd = [3, 3, 3, 5, 5, 5, 7, 7, 7, 9, 9, 9]
yd = [1, 3, 5, 1, 3, 5, 1, 3, 5, 1, 3, 5]
ax1.scatter3D(xd, yd, zd, c=zd, cmap='Blues', s=140, marker="s")  # draw the scatter plot
# ax1.plot3D(x, y, z, 'gray')  # would draw a 3D curve instead
plt.xlabel("max_depth")
plt.ylabel("min_child_weight")
ax1.set_zlabel("mean test score")
plt.show()
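The coarse pass above stops without the promised narrow follow-up; a hedged sketch of that fine-tuning step, assuming the coarse optimum was max_depth=3, min_child_weight=1 (read the real values off gsearch.best_params_ and shift the window accordingly):

param_test2 = {
    'max_depth': [2, 3, 4],      # +/-1 around the assumed coarse optimum
    'min_child_weight': [1, 2]
}
gsearch2 = model_selection.GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, gamma=0,
                            subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=4,
                            scale_pos_weight=1, seed=27),
    param_grid=param_test2, n_jobs=4, cv=5)
gsearch2.fit(X_train, y_train)
print('fine-grained best params:', gsearch2.best_params_)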
# Tuning gamma
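No code survives for this step beyond its label; a hedged reconstruction of the search it presumably named, holding the depth/weight optimum fixed (the 0.0-0.4 grid and the max_depth=3, min_child_weight=1 values are assumptions, not values from the post):

param_test3 = {'gamma': [i / 10.0 for i in range(0, 5)]}   # 0.0 to 0.4
gsearch3 = model_selection.GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=3,
                            min_child_weight=1, subsample=0.8, colsample_bytree=0.8,
                            objective='binary:logistic', nthread=4,
                            scale_pos_weight=1, seed=27),
    param_grid=param_test3, n_jobs=4, cv=5)
gsearch3.fit(X_train, y_train)
print('best gamma:', gsearch3.best_params_)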
# Tuning subsample and colsample_bytree
# Try different subsample and colsample_bytree values in two stages: a coarse pass
# with 0.6, 0.7, 0.8, 0.9 as starting values, then a finer pass in 0.05 steps
# (see the sketch after the coarse search below).
from sklearn.model_selection import GridSearchCV
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)]
}
gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1,
                            n_estimators=140,
                            max_depth=5,
                            min_child_weight=1,
                            gamma=0.0,
                            subsample=0.8,
                            colsample_bytree=0.8,
                            objective='binary:logistic',
                            nthread=4,
                            scale_pos_weight=1,
                            seed=27),
    param_grid=param_test4,
    n_jobs=4,
    cv=5)
gsearch.fit(X_train,y_train)
print('subsample / colsample_bytree search ------------------')
print('mean test scores:', gsearch.cv_results_['mean_test_score'])
print('best params:', gsearch.best_params_)
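The second of the two promised stages narrows the search in 0.05 steps. A sketch assuming both parameters peaked at 0.8 in the coarse pass (substitute whatever gsearch.best_params_ actually reports):

param_test5 = {
    'subsample': [i / 100.0 for i in range(75, 90, 5)],        # 0.75, 0.80, 0.85
    'colsample_bytree': [i / 100.0 for i in range(75, 90, 5)]
}
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                            min_child_weight=1, gamma=0.0,
                            objective='binary:logistic', nthread=4,
                            scale_pos_weight=1, seed=27),
    param_grid=param_test5, n_jobs=4, cv=5)
gsearch5.fit(X_train, y_train)
print('refined best params:', gsearch5.best_params_)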

# Tuning the regularization parameter reg_alpha
# Since gamma already provides an effective way to curb overfitting, most people
# rarely touch this parameter, but it is worth trying.
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1,
                            n_estimators=140,
                            max_depth=5,
                            min_child_weight=1,
                            gamma=0.0,
                            subsample=0.8,
                            colsample_bytree=0.8,
                            objective='binary:logistic',
                            nthread=4,
                            scale_pos_weight=1,
                            seed=27),
    param_grid=param_test6,
    n_jobs=4,
    cv=5)
gsearch.fit(X_train,y_train)
print('reg_alpha search ------------------')
print('mean test scores:', gsearch.cv_results_['mean_test_score'])
print('best params:', gsearch.best_params_)
# Tuning the regularization parameter reg_lambda
# The same grid of magnitudes is scanned for the L2 term, reg_lambda.
param_test7 = {
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]
}


gsearch = GridSearchCV(
    estimator=XGBClassifier(learning_rate=0.1,
                            n_estimators=140,
                            max_depth=5,
                            min_child_weight=1,
                            gamma=0.0,
                            subsample=0.8,
                            colsample_bytree=0.8,
                            objective='binary:logistic',
                            nthread=4,
                            scale_pos_weight=1,
                            seed=27),
    param_grid=param_test7,
    n_jobs=4,
    cv=5)
gsearch.fit(X_train,y_train)
print('reg_lambda search ------------------')
print('mean test scores:', gsearch.cv_results_['mean_test_score'])
print('best params:', gsearch.best_params_)
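None of the searches above fold their winners back into a single model. A closing sketch that does so on the training split and scores on the held-out test set; every hyperparameter value here is a placeholder to be replaced by the corresponding best_params_ entry:

final_model = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=3,
                            min_child_weight=1, gamma=0.0, subsample=0.8,
                            colsample_bytree=0.8, reg_alpha=0.01, reg_lambda=1,
                            objective='binary:logistic', nthread=4,
                            scale_pos_weight=1, seed=27)
final_model.fit(X_train, y_train)
print('final test accuracy:', final_model.score(X_test, y_test))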

It is recommended to run this in Jupyter, with scikit-learn updated to the latest version.
