sklearn机器学习-泰坦尼克号-白红宇

强烈建议你试试无所不能的chatGPT，快点击我

sklearn机器学习-泰坦尼克号

阅读量：5161 次

发布时间：2019-06-13

本文共 11138 字，大约阅读时间需要 37 分钟。

（博主亲自录制视频）

医药统计项目可联系

QQ：231469242

randomForest.py

调参后，预测最高准确性也达到了89%

随机森林的参数

# -*- coding: utf-8 -*-
"""Random forest classifier for the Titanic survival data.

Created on Sat Mar 31 09:30:24 2018
@author: Administrator

Reads the feature workbook and the target workbook, one-hot encodes the
features, imputes missing values with the most frequent value, trains a
RandomForestClassifier, prints the train/test accuracy and plots the
feature importances.

Author's note: random forests do not need the data to be preprocessed.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.font_manager import FontProperties
from sklearn import metrics
# Data preprocessing helpers (standardisation / normalisation).
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
# Imputer was removed from sklearn.preprocessing; SimpleImputer replaces it.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Chinese font setup.
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)

# Workbook holding the feature variables.
varibleFileName = "titantic.xlsx"
# Workbook holding the target.
targetFileName = "target.xlsx"

# Read the Excel sheet and one-hot encode the categorical columns.
data = pd.read_excel(varibleFileName)
data_dummies = pd.get_dummies(data)
print('features after one-hot encoding:\n', list(data_dummies.columns))

# DataFrame.ix no longer exists; .loc does the same label-based,
# end-inclusive column slice.
features = data_dummies.loc[:, "Pclass":'Embarked_S']
names = features.columns
# Cast to float so numeric and boolean dummy columns end up in one numeric
# array (recent pandas emits bool dummies, which would give an object array).
x = features.values.astype(float)

# Fill missing values column-wise with the most frequent value.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x = imp.fit_transform(x)

target = pd.read_excel(targetFileName)
# Flatten to 1-D: the estimator expects a label vector, not an (n, 1) column.
# NOTE(review): assumes target.xlsx holds a single label column - confirm.
y = target.values.ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

trees = 1000
max_depth = 10
# n_estimators is the number of trees; about 100 trees were enough in testing.
forest = RandomForestClassifier(n_estimators=trees, random_state=0,
                                max_depth=max_depth)
forest.fit(x_train, y_train)

print("random forest with %d trees:" % trees)
print("accuracy on the training subset:{:.3f}".format(forest.score(x_train, y_train)))
print("accuracy on the test subset:{:.3f}".format(forest.score(x_test, y_test)))

# One bar per *selected* feature.  Using data_dummies.shape[1] here would
# mismatch feature_importances_ whenever the slice above drops columns.
n_features = len(names)
plt.barh(range(n_features), forest.feature_importances_, align='center')
plt.yticks(np.arange(n_features), names)
plt.title("random forest with %d trees,%dmax_depth:" % (trees, max_depth))
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

# Results recorded by the original author:
#
# random forest with 1000 trees:
# accuracy on the training subset:0.983
# accuracy on the test subset:0.878
#
# random forest with 1000 trees,max_depth=4:
# accuracy on the training subset:0.854
# accuracy on the test subset:0.884
#
# random forest with 1000 trees,max_depth=5:
# accuracy on the training subset:0.853
# accuracy on the test subset:0.887
#
# random forest with 1000 trees,max_depth=9
# accuracy on the training subset:0.871
# accuracy on the test subset:0.890

　　

　　

去掉覆盖率低的变量后，随机森林准确性反而下降，看来随机森林不需要去计算变量覆盖率

训练数据准确性0.983

测试数据准确性0.878

'''

random forest with 1000 trees:

accuracy on the training subset:0.983

accuracy on the test subset:0.878

'''

重要因子来看，性别第一，占据40%重要性，

年龄重要性18%左右，

票价重要性17%左右

logistic.py

# -*- coding: utf-8 -*-
"""Logistic regression classifier for the Titanic survival data.

Created on Sun Apr 29 22:39:35 2018
@author: Administrator

Reads the feature workbook and the target workbook, one-hot encodes the
features, imputes missing values with the most frequent value, trains a
LogisticRegression model and prints the train/test accuracy.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.font_manager import FontProperties
from sklearn import metrics
# Data preprocessing helpers (standardisation / normalisation).
from sklearn import preprocessing
# Imputer was removed from sklearn.preprocessing; SimpleImputer replaces it.
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Chinese font setup.
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)

# Workbook holding the feature variables.
varibleFileName = "titantic.xlsx"
# Workbook holding the target.
targetFileName = "target.xlsx"

# Read the Excel sheet and one-hot encode the categorical columns.
data = pd.read_excel(varibleFileName)
data_dummies = pd.get_dummies(data)
print('features after one-hot encoding:\n', list(data_dummies.columns))

# DataFrame.ix no longer exists; .loc does the same label-based,
# end-inclusive column slice.
features = data_dummies.loc[:, "Pclass":'Embarked_S']
names = features.columns
# Cast to float so numeric and boolean dummy columns end up in one numeric
# array (recent pandas emits bool dummies, which would give an object array).
x = features.values.astype(float)

# Fill missing values column-wise with the most frequent value.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x = imp.fit_transform(x)

target = pd.read_excel(targetFileName)
# Flatten to 1-D: the estimator expects a label vector, not an (n, 1) column.
# NOTE(review): assumes target.xlsx holds a single label column - confirm.
y = target.values.ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

logistic = LogisticRegression()
logistic.fit(x_train, y_train)

print("logistic:")
print("accuracy on the training subset:{:.3f}".format(logistic.score(x_train, y_train)))
print("accuracy on the test subset:{:.3f}".format(logistic.score(x_test, y_test)))

# Results recorded by the original author:
#
# logistic:
# accuracy on the training subset:0.850
# accuracy on the test subset:0.875

　　

目前效果最好的是去掉低覆盖率的变量后，SVM准确率最高0.89

# -*- coding: utf-8 -*-
"""Support vector classifier for the Titanic survival data.

Created on Sat Mar 31 09:30:24 2018
@author: Administrator

Reads the feature workbook and the target workbook, one-hot encodes the
features, imputes missing values with the most frequent value, and compares
three SVC fits: raw features, standardised features, and standardised
features with C=10.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.font_manager import FontProperties
from sklearn import metrics
# Data preprocessing helpers (standardisation / normalisation).
from sklearn import preprocessing
# Imputer was removed from sklearn.preprocessing; SimpleImputer replaces it.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Chinese font setup.
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)

# Workbook holding the feature variables.
varibleFileName = "titantic.xlsx"
# Workbook holding the target.
targetFileName = "target.xlsx"

# Read the Excel sheet and one-hot encode the categorical columns.
data = pd.read_excel(varibleFileName)
data_dummies = pd.get_dummies(data)
print('features after one-hot encoding:\n', list(data_dummies.columns))

# DataFrame.ix no longer exists; .loc does the same label-based,
# end-inclusive column slice.
features = data_dummies.loc[:, "Pclass":'Embarked_S']
names = features.columns
# Cast to float so numeric and boolean dummy columns end up in one numeric
# array (recent pandas emits bool dummies, which would give an object array).
x = features.values.astype(float)

# Fill missing values column-wise with the most frequent value.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x = imp.fit_transform(x)

target = pd.read_excel(targetFileName)
# Flatten to 1-D: the estimator expects a label vector, not an (n, 1) column.
# NOTE(review): assumes target.xlsx holds a single label column - confirm.
y = target.values.ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# --- 1. SVC on the raw (unscaled) features ---------------------------------
svm = SVC()
svm.fit(x_train, y_train)
print("svc:")
print("accuracy on the training subset:{:.3f}".format(svm.score(x_train, y_train)))
print("accuracy on the test subset:{:.3f}".format(svm.score(x_test, y_test)))
# Recorded by the original author:
# svc:
# accuracy on the training subset:0.900
# accuracy on the test subset:0.726

# --- 2. SVC on standardised features ---------------------------------------
# Fit the scaler on the training set only and reuse it for the test set.
# Scaling each subset with its own mean/std (two separate
# preprocessing.scale calls) puts train and test on different scales.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

svm1 = SVC()
svm1.fit(X_train_scaled, y_train)
print("accuracy on the scaled training subset:{:.3f}".format(svm1.score(X_train_scaled, y_train)))
print("accuracy on the scaled test subset:{:.3f}".format(svm1.score(x_test_scaled, y_test)))
# Recorded by the original author, who scaled the test subset independently
# (results with the shared scaler above may differ):
# accuracy on the scaled training subset:0.866
# accuracy on the scaled test subset:0.881

# --- 3. Tuned SVC -----------------------------------------------------------
# Tune by changing C; kernel selects the kernel function used to transform
# the feature space; probability toggles probability estimates.
svm2 = SVC(C=10, gamma="auto", kernel='rbf', probability=True)
svm2.fit(X_train_scaled, y_train)
print("after c parameter=10,accuracy on the scaled training subset:{:.3f}".format(svm2.score(X_train_scaled, y_train)))
print("after c parameter=10,accuracy on the scaled test subset:{:.3f}".format(svm2.score(x_test_scaled, y_test)))
# Recorded by the original author, who scaled the test subset independently
# (results with the shared scaler above may differ):
# after c parameter=10,accuracy on the scaled training subset:0.878
# after c parameter=10,accuracy on the scaled test subset:0.890

xgboost1.py

效果也相当好

AUC: 0.9464，ACC: 0.8841，Recall: 0.8716，F1-score: 0.8716，Precesion: 0.8716

# -*- coding: utf-8 -*-
"""XGBoost classifier for the Titanic survival data.

Created on Sat Mar 31 09:30:24 2018
@author: Administrator

Reads the feature workbook and the target workbook, one-hot encodes the
features, imputes missing values with the most frequent value, trains a
gradient-boosted tree model with xgboost and prints AUC, accuracy, recall,
F1, precision, the confusion matrix and the feature importances.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from matplotlib.font_manager import FontProperties
from sklearn import metrics
# Data preprocessing helpers (standardisation / normalisation).
from sklearn import preprocessing
# Imputer was removed from sklearn.preprocessing; SimpleImputer replaces it.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Chinese font setup.
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)

# Workbook holding the feature variables.
varibleFileName = "titantic.xlsx"
# Workbook holding the target.
targetFileName = "target.xlsx"

# Read the Excel sheet and one-hot encode the categorical columns.
data = pd.read_excel(varibleFileName)
data_dummies = pd.get_dummies(data)
print('features after one-hot encoding:\n', list(data_dummies.columns))

# DataFrame.ix no longer exists; .loc does the same label-based,
# end-inclusive column slice.
features = data_dummies.loc[:, "Pclass":'Embarked_S']
names = features.columns
# Cast to float so numeric and boolean dummy columns end up in one numeric
# array (recent pandas emits bool dummies, which would give an object array).
x = features.values.astype(float)

# Fill missing values column-wise with the most frequent value.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x = imp.fit_transform(x)

target = pd.read_excel(targetFileName)
# Flatten to 1-D so the labels and the metric functions see a plain vector.
# NOTE(review): assumes target.xlsx holds a single label column - confirm.
y = target.values.ravel()

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

params = {
    'booster': 'gbtree',
    # 'objective': 'reg:linear',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 4,
    'lambda': 10,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'min_child_weight': 2,
    'eta': 0.025,
    'seed': 0,
    'nthread': 8,
    # 'silent' is no longer a recognised parameter; verbosity=0 is the
    # equivalent "print nothing" setting.
    'verbosity': 0,
}

watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)

# Predicted probabilities for the positive class.
ypred = bst.predict(dtest)
# Apply a 0.5 threshold to obtain hard class labels.
y_pred = (ypred >= 0.5).astype(int)

# Model evaluation.
print('AUC: %.4f' % metrics.roc_auc_score(y_test, ypred))
print('ACC: %.4f' % metrics.accuracy_score(y_test, y_pred))
print('Recall: %.4f' % metrics.recall_score(y_test, y_pred))
print('F1-score: %.4f' % metrics.f1_score(y_test, y_pred))
print('Precesion: %.4f' % metrics.precision_score(y_test, y_pred))
# The matrix was previously computed and thrown away; print it so the
# script actually reports it.
print('Confusion matrix:\n', metrics.confusion_matrix(y_test, y_pred))

print("xgboost:")
print('Feature importances:{}'.format(bst.get_fscore()))

# Results recorded by the original author:
#
# AUC: 0.9464
# ACC: 0.8841
# Recall: 0.8716
# F1-score: 0.8716
# Precesion: 0.8716
# xgboost:
# Feature importances:{'f5': 69, 'f1': 178, 'f2': 68, 'f4': 245, 'f6': 25,
#                      'f0': 88, 'f3': 25, 'f194': 4, 'f193': 21, 'f195': 9}

　　

决策树

decisionTree.py

# -*- coding: utf-8 -*-
"""Decision tree classifier for the Titanic survival data.

Created on Mon Apr 30 19:04:10 2018
@author: Administrator

Reads the feature workbook and the target workbook, one-hot encodes the
features, imputes missing values with the most frequent value, scans
max_depth from 1 to 29, refits the tree at the best depth, prints its
train/test accuracy, plots the feature importances and exports the tree to
a Graphviz dot file.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Imputer was removed from sklearn.preprocessing; SimpleImputer replaces it.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

# Workbook holding the feature variables.
varibleFileName = "titantic.xlsx"
# Workbook holding the target.
targetFileName = "target.xlsx"

# Read the Excel sheet and one-hot encode the categorical columns.
data = pd.read_excel(varibleFileName)
data_dummies = pd.get_dummies(data)
print('features after one-hot encoding:\n', list(data_dummies.columns))

# DataFrame.ix no longer exists; .loc does the same label-based,
# end-inclusive column slice.
features = data_dummies.loc[:, "Pclass":'Embarked_S']
# Cast to float so numeric and boolean dummy columns end up in one numeric
# array (recent pandas emits bool dummies, which would give an object array).
x = features.values.astype(float)

# Fill missing values column-wise with the most frequent value.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x = imp.fit_transform(x)

target = pd.read_excel(targetFileName)
# Flatten to 1-D: the estimator expects a label vector, not an (n, 1) column.
# NOTE(review): assumes target.xlsx holds a single label column - confirm.
y = target.values.ravel()

X_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# Variable names.
names = features.columns

# --- Hyper-parameter scan ---------------------------------------------------
# NOTE(review): the depth is chosen from the mean of training and *test*
# accuracy, so the test score reported below is not an unbiased estimate.
# Cross-validation on the training subset would avoid that.
list_average_accuracy = []
depth = range(1, 30)
for i in depth:
    # Author's note: limiting the tree depth (e.g. max_depth=4) reduces the
    # complexity of the model and can give more accurate results.
    tree = DecisionTreeClassifier(max_depth=i, random_state=0)
    tree.fit(X_train, y_train)
    accuracy_training = tree.score(X_train, y_train)
    accuracy_test = tree.score(x_test, y_test)
    average_accuracy = (accuracy_training + accuracy_test) / 2.0
    list_average_accuracy.append(average_accuracy)

# Take the maximum once, after the scan, and map its position back through
# the depth range (the first maximum wins on ties).
max_value = max(list_average_accuracy)
best_depth = depth[list_average_accuracy.index(max_value)]
print("best_depth:", best_depth)

best_tree = DecisionTreeClassifier(max_depth=best_depth, random_state=0)
best_tree.fit(X_train, y_train)
accuracy_training = best_tree.score(X_train, y_train)
accuracy_test = best_tree.score(x_test, y_test)
print("decision tree:")
print("accuracy on the training subset:{:.3f}".format(accuracy_training))
print("accuracy on the test subset:{:.3f}".format(accuracy_test))

# Results recorded by the original author:
#
# best_depth: 19
# decision tree:
# accuracy on the training subset:0.976
# accuracy on the test subset:0.860

# Plot the importance of each feature.
n_features = x.shape[1]
plt.barh(range(n_features), best_tree.feature_importances_, align='center')
plt.yticks(np.arange(n_features), names)
plt.title("Decision Tree:")
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.show()

# Write a dot file; it can be rendered to an image later from the command
# line.
export_graphviz(best_tree, out_file="Titanic.dot",
                class_names=['death', 'live'], feature_names=list(names),
                impurity=False, filled=True)

　　

转载于:https://www.cnblogs.com/webRobot/p/8972030.html

你可能感兴趣的文章

Lintcode: Longest Common Substring

Groovy 学习手册（2）

AngularJs中directive的延迟加载

JGUI源码：响应式布局简单实现(13)

Django中间件登录后可以阅读查看

C# 网络编程之基于SMTP发送电子邮件

Ef Core增加Sql方法

java 几种常见的定时器

WIN10 计算器计算表数范围小的可以向表数范围大数据丢失

Judy alpha 第一天

Json序列化之.NET开源类库Newtonsoft.Json

xml序列化及反序列化.net对象

需要我们了解的SQL Server阻塞原因与解决方法

customize Vimium

mysql的Event 及时间操作

python中的魔法属性

心急的C小加

喝酒易醉，品茶养心，人生如梦，品茶悟道，何以解忧？唯有杜康！-- 愿君每日到此一游！

当前时间: 2024-11-20 09:27:20 当前IP: 3.15.29.209 联系邮箱:javaeecc@qq.com Copyright © 2020 - 2022 baihongyu.com 京ICP备2021015314号-2

强烈建议你试试无所不能的CHAT-GPT，快点击我

强烈建议你试试无所不能的CHAT-GPT，快点击我