数据科学与机器学习案例之Stacking集成方法对鸢尾花进行分类
数据介绍
鸢尾花数据是
R语言
中自带的数据,这里对鸢尾花数据进行Stacking模型集成
对鸢尾花数据进行分类。
> str(iris)
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
对于鸢尾花中的数据进行高维可视化,我们使用
R语言FactoMineR包
中的函数对鸢尾花数据进行降维展示。
> library(ggplot2)
> library(factoextra)
> library(FactoMineR)
> df <- iris[c(1, 2, 3, 4)]
> iris.pca<- PCA(df, graph = FALSE)
> fviz_pca_ind(iris.pca,
+ geom.ind = "point",
+ pointsize =3,pointshape = 21,fill.ind = iris$Species,
+ palette = c("#E41A1C", "#4DAF4A", "#999999"),
+ addEllipses = TRUE, legend.title = "Groups",
+ title="")+theme_grey()
Stacking模型集成
数据处理
> library(ggplot2)
> library(data.table)
> library(mltools)
> library(class) # k均值建模
> library(LiblineaR)
> setwd('E:\\RClass\\建模\\模型集成对鸢尾花数据进行分类')
> iris1 = data.table(iris)
> iris1[,Sepal.Length := scale(Sepal.Length,T,T)]
> iris1[,Sepal.Width := scale(Sepal.Width,T,T)]
> iris1[,Petal.Length := scale(Petal.Length,T,T)]
> iris1[,Petal.Width := scale(Petal.Width,T,T)]
>
> iris1[,Distance := sqrt(Sepal.Length^2 + Sepal.Width^2 +
+ Petal.Length^2 + Petal.Width^2)]
>
> iris1[,ID := 1:nrow(iris1)]
> colnames(iris1)
> p1 = caret::createDataPartition(iris$Species,p = .75,list = TRUE)[[1]]
> data.train = iris1[p1,]
> data.test = iris1[-p1,]
>
> data.train[,FoldID := folds(Species,nfolds = 5,stratified = T,seed = 2022)]
knn
> knncv <- list()
> knncv[['features']] <- c('Sepal.Length','Sepal.Width','Petal.Length','Petal.Width')
> knncv[['para']] <- CJ(k = seq(1,30))
> knncv[['bestscore']] <- 0
>
> for(i in seq_len(nrow(knncv[['para']]))){
+ scores <- numeric()
+ predlist <- list()
+ para <- knncv[['para']][i]
+ for(folds in 1:5){
+
+ testFold <- data.train[J(FoldID = folds),on = 'FoldID']
+ trainFold <- data.train[!J(FoldID = folds),on = 'FoldID']
+
+ testFold[,Pred:=knn(train = trainFold[,knncv[['features']],with = F],
+ test = testFold[,knncv[['features']],with = F],
+ cl = trainFold$Species,k = para$k)]
+ predlist <- c(predlist,list(testFold[,list(ID,FoldID,Pred)]))
+
+ score <- mean(testFold$Species == testFold$Pred)
+ scores <- c(scores,score) }
+
+ score <- mean(scores,na.rm = T)
+ knncv[['para']][i,Score:=score] # 打印每个参数的值
+ print(paste('Para:',paste(colnames(knncv[['para']])),knncv[['para']][i],collapse = '|'))
+
+ if(score > knncv[['bestscore']]){
+ knncv[['bestscore']] <- score
+ knncv[['bestscores']] <- scores
+ knncv[['bestpara']] <- knncv[['para']][i]
+ knncv[['bestresults']] <- rbindlist(predlist)
+ }
+ }
[1] "Para: k 1|Para: Score 0.930952380952381"
[1] "Para: k 2|Para: Score 0.929761904761905"
[1] "Para: k 3|Para: Score 0.929761904761905"
[1] "Para: k 4|Para: Score 0.920238095238095"
[1] "Para: k 5|Para: Score 0.948809523809524"
[1] "Para: k 6|Para: Score 0.947619047619048"
[1] "Para: k 7|Para: Score 0.957142857142857"
[1] "Para: k 8|Para: Score 0.957142857142857"
[1] "Para: k 9|Para: Score 0.957142857142857"
[1] "Para: k 10|Para: Score 0.957142857142857"
[1] "Para: k 11|Para: Score 0.957142857142857"
[1] "Para: k 12|Para: Score 0.939285714285714"
[1] "Para: k 13|Para: Score 0.957142857142857"
[1] "Para: k 14|Para: Score 0.957142857142857"
[1] "Para: k 15|Para: Score 0.957142857142857"
[1] "Para: k 16|Para: Score 0.957142857142857"
[1] "Para: k 17|Para: Score 0.957142857142857"
[1] "Para: k 18|Para: Score 0.948809523809524"
[1] "Para: k 19|Para: Score 0.932142857142857"
[1] "Para: k 20|Para: Score 0.94047619047619"
[1] "Para: k 21|Para: Score 0.948809523809524"
[1] "Para: k 22|Para: Score 0.94047619047619"
[1] "Para: k 23|Para: Score 0.923809523809524"
[1] "Para: k 24|Para: Score 0.923809523809524"
[1] "Para: k 25|Para: Score 0.91547619047619"
[1] "Para: k 26|Para: Score 0.91547619047619"
[1] "Para: k 27|Para: Score 0.91547619047619"
[1] "Para: k 28|Para: Score 0.886904761904762"
[1] "Para: k 29|Para: Score 0.914285714285714"
[1] "Para: k 30|Para: Score 0.923809523809524"
> knncv[["bestpara"]] # 小样本情形下的准确率很差
k Score
1: 7 0.9571429
>
我们将鸢尾花数据分为了训练集与测试集,使用k近邻模型进行了5折交叉验证,选择出的最优参数为
k=7,模型准确率为95.7%
,理论上讲Stacking模型集成
能提高鸢尾花数据分类的准确率,但是k近邻模型
的准确率已经达到95.7%
,在使用Stacking模型集成
上并不能保证会提高准确率。
> ggplot(knncv[['para']],aes(x=k,y = Score))+
+ geom_line()+
+ geom_point()+
+ theme_grey()

svm
> svmcv <- list()
> svmcv[["features"]] <- c("Sepal.Length", "Sepal.Width", "Petal.Length",
+ 'Petal.Width','Distance')
> svmcv[["para"]] <- CJ(type=1:5, cost=c(.01, .1, 1, 10, 100, 1000, 2000), Score=NA_real_)
> svmcv[["BestScore"]] <- 0
>
> for(i in seq_len(nrow(svmcv[['para']]))){
+
+ scores <- numeric() # 储存每折交叉验证得到的准确率
+ predlist <- list() # 储存五折交叉验证的结果
+ para <- svmcv[['para']][i] # 参数
+
+ for(folds in 1:5){
+ testfold <- data.train[J(FoldID = folds),on = 'FoldID']
+ trainfold <- data.train[!J(FoldID = folds),on = 'FoldID']
+
+ # 对每折的数据进行预测
+ svm <- LiblineaR(data = trainfold[,svmcv$features,with = F],
+ target = trainfold$Species,
+ type = para$type,
+ cost = para$cost ) # 对交叉验证中的数据进行预测
+ testfold[,Pred:=predict(svm,testfold[,svmcv$features,with = F])$predictions]
+
+ scores <- c(scores,mean(testfold$Species == testfold$Pred,na.rm = T))
+
+ predlist <-c(predlist,list(testfold[,list(ID,FoldID,Pred)]))
+ }
+ score <- mean(scores) # 对参数下的5折交叉验证进行平均
+ svmcv[['para']][i,Score:=score]
+ print(paste('Para:',paste(colnames(svmcv[['para']])),
+ svmcv[['para']][i],collapse = '|')) # 输出每个参数的汇总值
+ if(score > svmcv[['BestScore']]){
+ svmcv[['BestScore']] <- score
+ svmcv[['BestScores']] <- scores
+ svmcv[['BestPara']] <- svmcv[['para']][i]
+ svmcv[['BestResults']] <- rbindlist(predlist) }
+ }
[1] "Para: type 1|Para: cost 0.01|Para: Score 0.853571428571429"
[1] "Para: type 1|Para: cost 0.1|Para: Score 0.939285714285714"
[1] "Para: type 1|Para: cost 1|Para: Score 0.955952380952381"
[1] "Para: type 1|Para: cost 10|Para: Score 0.975"
[1] "Para: type 1|Para: cost 100|Para: Score 0.964285714285714"
[1] "Para: type 1|Para: cost 1000|Para: Score 0.947619047619048"
[1] "Para: type 1|Para: cost 2000|Para: Score 0.938095238095238"
[1] "Para: type 2|Para: cost 0.01|Para: Score 0.853571428571429"
[1] "Para: type 2|Para: cost 0.1|Para: Score 0.939285714285714"
[1] "Para: type 2|Para: cost 1|Para: Score 0.955952380952381"
[1] "Para: type 2|Para: cost 10|Para: Score 0.975"
[1] "Para: type 2|Para: cost 100|Para: Score 0.973809523809524"
[1] "Para: type 2|Para: cost 1000|Para: Score 0.964285714285714"
[1] "Para: type 2|Para: cost 2000|Para: Score 0.964285714285714"
[1] "Para: type 3|Para: cost 0.01|Para: Score 0.666666666666667"
[1] "Para: type 3|Para: cost 0.1|Para: Score 0.913095238095238"
[1] "Para: type 3|Para: cost 1|Para: Score 0.947619047619048"
[1] "Para: type 3|Para: cost 10|Para: Score 0.957142857142857"
[1] "Para: type 3|Para: cost 100|Para: Score 0.955952380952381"
[1] "Para: type 3|Para: cost 1000|Para: Score 0.946428571428571"
[1] "Para: type 3|Para: cost 2000|Para: Score 0.946428571428571"
[1] "Para: type 4|Para: cost 0.01|Para: Score 0.666666666666667"
[1] "Para: type 4|Para: cost 0.1|Para: Score 0.939285714285714"
[1] "Para: type 4|Para: cost 1|Para: Score 0.975"
[1] "Para: type 4|Para: cost 10|Para: Score 0.96547619047619"
[1] "Para: type 4|Para: cost 100|Para: Score 0.955952380952381"
[1] "Para: type 4|Para: cost 1000|Para: Score 0.947619047619048"
[1] "Para: type 4|Para: cost 2000|Para: Score 0.947619047619048"
[1] "Para: type 5|Para: cost 0.01|Para: Score 0.666666666666667"
[1] "Para: type 5|Para: cost 0.1|Para: Score 0.921428571428571"
[1] "Para: type 5|Para: cost 1|Para: Score 0.957142857142857"
[1] "Para: type 5|Para: cost 10|Para: Score 0.975"
[1] "Para: type 5|Para: cost 100|Para: Score 0.975"
[1] "Para: type 5|Para: cost 1000|Para: Score 0.975"
[1] "Para: type 5|Para: cost 2000|Para: Score 0.975"
>
> svmcv$BestPara
type cost Score
1: 1 10 0.975
> ggplot(svmcv[['para']],aes(x = cost,y = Score,color = factor(type)))+
+ geom_line()+
+ geom_point()+
+ scale_x_log10()
>

Stacking模型集成
> dim(knncv[['bestresults']])
[1] 114 3
> dim(svmcv[['BestResults']])
[1] 114 3
>
> metas.knn <- knncv[['bestresults']]
> metas.svm <- svmcv[['BestResults']]
>
> class(metas.knn)
[1] "data.table" "data.frame"
> dim(metas.knn)
[1] 114 3
> head(metas.knn)
ID FoldID Pred
1: 1 1 setosa
2: 3 1 setosa
3: 5 1 setosa
4: 19 1 setosa
5: 24 1 setosa
6: 26 1 setosa
>
> data.train[metas.knn,Metas.knn := Pred,on = 'ID']
> data.train[metas.svm,Metas.svm:= Pred,on = 'ID']
>
> data.train <- one_hot(data.train, cols=c("Metas.knn", "Metas.svm"), dropCols=FALSE) # 进行one_hot编码
> lrcv <- list()
> lrcv[['features']] <- setdiff(colnames(data.train),c('ID','Species','FoldID','Metas.knn','Metas.svm'))
> lrcv[['para']] <- CJ(type = c(0,6,7),cost = c(.001, .01, .1, 1, 10, 100))
> lrcv[['bestscore']] <- 0
>
> for(i in seq_len(nrow(lrcv[['para']]))){
+
+ scores <- numeric()
+ predlist <- list()
+ para <- lrcv[['para']][i]
+ for(folds in 1:5){
+ testfold <- data.train[J(FoldID =folds),on = 'FoldID']
+ trainfold <- data.train[!J(FoldID =folds),on = 'FoldID']
+
+ logreg <- LiblineaR(data = trainfold[,lrcv$features,with = F],
+ target = trainfold$Species,type = para$type,cost = para$cost)
+ testfold[,Pred := predict(logreg,testfold[,lrcv[['features']],with = F])$predictions]
+
+ scores <- c(scores,mean(testfold$Species == testfold$Pred))
+ predlist <- c(predlist,list(testfold[,list(ID,FoldID,Pred)]))
+ }
+ score <- mean(scores)
+ lrcv[['para']][i,Score:=score]
+ print(paste('Para :',paste(colnames(lrcv[['para']])),
+ lrcv[['para']][i],collapse = '|'))
+
+ if(score > lrcv[['bestscore']]){
+ lrcv[['bestscore']] <- score
+ lrcv[['bestscores']] <- scores
+ lrcv[['bestpara']] <- lrcv[['para']][i]
+ lrcv[['bestresult']] <- rbindlist(predlist)
+ }}
[1] "Para : type 0|Para : cost 0.001|Para : Score 0.921428571428571"
[1] "Para : type 0|Para : cost 0.01|Para : Score 0.96547619047619"
[1] "Para : type 0|Para : cost 0.1|Para : Score 0.948809523809524"
[1] "Para : type 0|Para : cost 1|Para : Score 0.96547619047619"
[1] "Para : type 0|Para : cost 10|Para : Score 0.96547619047619"
[1] "Para : type 0|Para : cost 100|Para : Score 0.96547619047619"
[1] "Para : type 6|Para : cost 0.001|Para : Score 0.333333333333333"
[1] "Para : type 6|Para : cost 0.01|Para : Score 0.333333333333333"
[1] "Para : type 6|Para : cost 0.1|Para : Score 0.955952380952381"
[1] "Para : type 6|Para : cost 1|Para : Score 0.975"
[1] "Para : type 6|Para : cost 10|Para : Score 0.96547619047619"
[1] "Para : type 6|Para : cost 100|Para : Score 0.96547619047619"
[1] "Para : type 7|Para : cost 0.001|Para : Score 0.929761904761905"
[1] "Para : type 7|Para : cost 0.01|Para : Score 0.96547619047619"
[1] "Para : type 7|Para : cost 0.1|Para : Score 0.948809523809524"
[1] "Para : type 7|Para : cost 1|Para : Score 0.96547619047619"
[1] "Para : type 7|Para : cost 10|Para : Score 0.96547619047619"
[1] "Para : type 7|Para : cost 100|Para : Score 0.96547619047619"
>
> data.test[,Metas.knn := knn(train = data.train[,knncv[['features']],with = F],
+ test = data.test[,knncv[['features']],with = F],
+ cl = data.train$Species,
+ k = knncv$bestpara$k)]
>
> svm <- LiblineaR(data = data.train[,svmcv[['features']],with = F],
+ target = data.train$Species,
+ type = svmcv[['BestPara']]$type,
+ cost = svmcv[['BestPara']]$cost)
> data.test[,Metas.svm := predict(svm,data.test[,svmcv[['features']],with = F])$predictions]
>
>
> data.test <- one_hot(data.test,cols=c("Metas.knn", "Metas.svm"),dropCols=FALSE)
> lr <- LiblineaR(data = data.train[,lrcv[['features']],with = F],
+ target = data.train$Species,
+ type = lrcv[['bestpara']]$type,
+ cost = lrcv[['bestpara']]$cost)
>
> data.test[,Metas.lr := predict(lr,data.test[,lrcv[['features']],with = F])$predictions]
> head(data.test)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species Distance ID Metas.knn
1: -0.5353840 1.9333146 -1.165809 -1.048667 setosa 2.546204 6 setosa
2: -1.0184372 0.7861738 -1.279104 -1.311052 setosa 2.238358 8 setosa
3: -0.5353840 1.4744583 -1.279104 -1.311052 setosa 2.411561 11 setosa
4: -1.2599638 -0.1315388 -1.335752 -1.442245 setosa 2.338614 13 setosa
5: -0.5353840 0.7861738 -1.165809 -1.311052 setosa 1.995664 21 setosa
6: -0.8976739 1.4744583 -1.279104 -1.048667 setosa 2.390744 22 setosa
Metas.knn_setosa Metas.knn_versicolor Metas.knn_virginica Metas.svm Metas.svm_setosa
1: 1 0 0 setosa 1
2: 1 0 0 setosa 1
3: 1 0 0 setosa 1
4: 1 0 0 setosa 1
5: 1 0 0 setosa 1
6: 1 0 0 setosa 1
Metas.svm_versicolor Metas.svm_virginica Metas.lr
1: 0 0 setosa
2: 0 0 setosa
3: 0 0 setosa
4: 0 0 setosa
5: 0 0 setosa
6: 0 0 setosa
>
> mean(data.test$Species == data.test$Metas.knn)
[1] 0.9722222
> mean(data.test$Species == data.test$Metas.svm)
[1] 0.9722222
> mean(data.test$Species == data.test$Metas.lr)
[1] 0.9722222
>
总结:本篇博客是介绍如何使用
Stacking
模型对鸢尾花数据进行分类,由于样本量较小的问题,在对测试集进行分类识别中三种分类方法预测的结果相同,不过没有关系相信大家在本篇博客中已经学到了Stacking
模型的精髓。