import warnings
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier, plot_importance
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

data = pd.read_excel(r'D:\csr.xlsx')
X = data[['rank', 'K1', 'rpm', 'nm', 'min']]
X.head()
y = data[['tyy']]
y.head()

# Build the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # multi-class classification
    'num_class': 10,               # number of classes, used together with multi:softmax
    'gamma': 0.1,                  # controls post-pruning; larger is more conservative, typically around 0.1-0.2
    'max_depth': 12,               # tree depth; deeper trees overfit more easily
    'lambda': 2,                   # L2 regularization on weights; larger values make the model less prone to overfitting
    'subsample': 0.7,              # fraction of training samples drawn per tree
    'colsample_bytree': 0.7,       # fraction of features drawn per tree; must lie in (0, 1]
    'min_child_weight': 1,         # minimum sum of instance weights (second derivative h) in a leaf.
                                   # For an imbalanced 0-1 problem with h around 0.01, a value of 1 means a
                                   # leaf needs roughly 100 samples. This parameter strongly affects the
                                   # result; smaller values overfit more easily.
    'eta': 0.007,                  # acts like a learning rate
    'seed': 1000,
    'nthread': 7,                  # number of CPU threads
    # 'eval_metric': 'auc'
    # Note: 'silent' has been removed from recent XGBoost releases; use 'verbosity' instead.
}

# Instantiate the candidate models
xgbc_model = XGBClassifier()          # XGBoost
from sklearn.ensemble import RandomForestClassifier
rfc_model = RandomForestClassifier()  # random forest
from sklearn.ensemble import ExtraTreesClassifier
et_model = ExtraTreesClassifier()     # extremely randomized trees (ET)
from sklearn.naive_bayes import GaussianNB
gnb_model = GaussianNB()              # Gaussian naive Bayes
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()    # k-nearest neighbours
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()       # logistic regression
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()   # decision tree
from sklearn.svm import SVC
svc_model = SVC()                     # support vector machine

# Fit every model on the full data set. sklearn expects a 1-D label array,
# so y is flattened with values.ravel().
xgbc_model.fit(X, y.values.ravel())   # XGBoost
rfc_model.fit(X, y.values.ravel())    # random forest
et_model.fit(X, y.values.ravel())     # ET
gnb_model.fit(X, y.values.ravel())    # naive Bayes
knn_model.fit(X, y.values.ravel())    # k-nearest neighbours
lr_model.fit(X, y.values.ravel())     # logistic regression
dt_model.fit(X, y.values.ravel())     # decision tree
svc_model.fit(X, y.values.ravel())    # support vector machine

print("\nAccuracy of each model under 5-fold cross-validation (mean over the folds):")
print("\tXGBoost:", cross_val_score(xgbc_model, X, y.values.ravel(), cv=5).mean())
print("\tRandom forest:", cross_val_score(rfc_model, X, y.values.ravel(), cv=5).mean())
print("\tExtra trees:", cross_val_score(et_model, X, y.values.ravel(), cv=5).mean())
print("\tGaussian naive Bayes:", cross_val_score(gnb_model, X, y.values.ravel(), cv=5).mean())
print("\tK-nearest neighbours:", cross_val_score(knn_model, X, y.values.ravel(), cv=5).mean())
print("\tLogistic regression:", cross_val_score(lr_model, X, y.values.ravel(), cv=5).mean())
print("\tDecision tree:", cross_val_score(dt_model, X, y.values.ravel(), cv=5).mean())
print("\tSupport vector machine:", cross_val_score(svc_model, X, y.values.ravel(), cv=5).mean())

# Performance evaluation, using XGBoost as the example
model1 = XGBClassifier()
model1.fit(X_train, y_train.values.ravel())  # train on the training set
y_pred = model1.predict(X_test)              # predict on the test set

print("\nMean accuracy of the model (mean accuracy = (TP+TN)/(P+N)):")
print("\tXGBoost:", model1.score(X_test, y_test))
# print('(y_test, y_pred)', y_test, y_pred)
print("\nPerformance evaluation:")
print("\tClassification report:\n", metrics.classification_report(y_test, y_pred))
print("\tConfusion matrix:\n", metrics.confusion_matrix(y_test, y_pred))
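# Not in the original: a minimal sketch showing how the plot_importance helper
# imported at the top (but otherwise unused) could be applied to inspect the
# fitted model. It assumes model1 has been fitted as above; the figure size is
# an arbitrary choice.
fig, ax = plt.subplots(figsize=(10, 8))
plot_importance(model1, ax=ax)  # bar chart of per-feature importance scores
plt.show()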
# Tuning max_depth and min_child_weight
# These two parameters have a large effect on the final result; max_depth usually
# lies between 3 and 10. We use grid search: first a coarse sweep over a wide
# range, then a finer sweep over a narrow one.
# Note: scoring='roc_auc' only supports binary classification; for a multi-class
# problem the scoring must be changed (the default scorer handles multi-class).
param_test1 = {
    'max_depth': [i for i in range(3, 10, 2)],
    'min_child_weight': [i for i in range(1, 6, 2)],
}
gsearch = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5, min_child_weight=1,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test1, n_jobs=4, cv=5)
gsearch.fit(X_train, y_train.values.ravel())
print('max_depth / min_child_weight ------------------')
# cv_results_ replaced the old grid_scores_ attribute in current sklearn
print('mean test scores:', gsearch.cv_results_['mean_test_score'])
print('parameter grid:', gsearch.cv_results_['params'])
print('best parameters:', gsearch.best_params_)

# 3-D scatter plot of the grid-search scores
fig = plt.figure(figsize=(10, 10))
ax1 = plt.axes(projection='3d')
zd = [0.93400966, 0.92067633, 0.92502415, 0.93400966, 0.92512077, 0.92502415,
      0.92956522, 0.92067633, 0.92502415, 0.92956522, 0.92067633, 0.92502415]
xd = [3, 3, 3, 5, 5, 5, 7, 7, 7, 9, 9, 9]
yd = [1, 3, 5, 1, 3, 5, 1, 3, 5, 1, 3, 5]
ax1.scatter3D(xd, yd, zd, cmap='Blues', s=140, marker='s')  # scatter plot
# ax1.plot3D(xd, yd, zd, 'gray')  # 3-D curve
plt.xlabel('max_depth')
plt.ylabel('min_child_weight')
plt.show()

# Tuning subsample and colsample_bytree
# Try different subsample and colsample_bytree values. This is done in two
# stages, both starting from 0.6, 0.7, 0.8, 0.9; a finer second-stage sweep is
# sketched after the regularization searches below.
param_test4 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
}
gsearch = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5, min_child_weight=1,
        gamma=0.0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test4, n_jobs=4, cv=5)
gsearch.fit(X_train, y_train.values.ravel())
print('subsample / colsample_bytree ------------------')
print('mean test scores:', gsearch.cv_results_['mean_test_score'])
print('parameter grid:', gsearch.cv_results_['params'])
print('best parameters:', gsearch.best_params_)

# Tuning the regularization parameter reg_alpha
# Because gamma already provides an effective way to reduce overfitting, this
# parameter is rarely used, but it is still worth trying.
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5, min_child_weight=1,
        gamma=0.0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test6, n_jobs=4, cv=5)
gsearch.fit(X_train, y_train.values.ravel())
print('reg_alpha ------------------')
print('mean test scores:', gsearch.cv_results_['mean_test_score'])
print('parameter grid:', gsearch.cv_results_['params'])
print('best parameters:', gsearch.best_params_)

# Tuning the regularization parameter reg_lambda
param_test7 = {
    'reg_lambda': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5, min_child_weight=1,
        gamma=0.0, subsample=0.8, colsample_bytree=0.8,
        # 'binary:linear' is not a valid XGBoost objective; logistic is used here
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
    param_grid=param_test7, n_jobs=4, cv=5)
gsearch.fit(X_train, y_train.values.ravel())
print('reg_lambda ------------------')
print('mean test scores:', gsearch.cv_results_['mean_test_score'])
print('parameter grid:', gsearch.cv_results_['params'])
print('best parameters:', gsearch.best_params_)
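# Not in the original: a sketch of the finer second-stage subsample /
# colsample_bytree sweep mentioned above, searching in 0.05 steps around the
# coarse-search winners. The centre values 0.8 are assumed placeholders and
# should be replaced by the best values the coarse sweep actually found.
best_sub, best_col = 0.8, 0.8  # placeholders for the coarse-search winners
param_test5 = {
    'subsample': [best_sub - 0.05, best_sub, best_sub + 0.05],
    'colsample_bytree': [best_col - 0.05, best_col, best_col + 0.05],
}
gsearch_fine = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5, min_child_weight=1,
        gamma=0.0, objective='binary:logistic', nthread=4,
        scale_pos_weight=1, seed=27),
    param_grid=param_test5, n_jobs=4, cv=5)
gsearch_fine.fit(X_train, y_train.values.ravel())
print('fine-stage subsample / colsample_bytree best parameters:', gsearch_fine.best_params_)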
It is recommended to run this code in Jupyter, with scikit-learn updated to the latest version.
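# A minimal way to check the installed versions before running the code above
# (upgrade with `pip install -U scikit-learn xgboost` if needed):
import sklearn
import xgboost
print('scikit-learn:', sklearn.__version__)
print('xgboost:', xgboost.__version__)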