数据科学与机器学习案例之Stacking集成方法对鸢尾花进行分类

发布于:2023-01-19 ⋅ 阅读:(681) ⋅ 点赞:(0)

数据科学与机器学习案例之客户的信用风险与预测

数据科学与机器学习之信用卡欺诈识别(严重类失衡数据建模)

数据科学与机器学习案例之汽车目标客户销售策略研究

数据科学与机器学习案例之WiFi定位系统的位置预测

数据科学与机器学习案例之Stacking集成方法对鸢尾花进行分类

数据介绍

鸢尾花数据是R语言中自带的数据,这里对鸢尾花数据进行Stacking模型集成对鸢尾花数据进行分类。

> str(iris)
'data.frame':   150 obs. of  5 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

对于鸢尾花中的数据进行高维可视化,我们使用R语言FactoMineR中的函数对鸢尾花数据进行降维展示。

> library(ggplot2)
> library(factoextra)
> library(FactoMineR)
> df <- iris[c(1, 2, 3, 4)]
> iris.pca<- PCA(df, graph = FALSE)
> fviz_pca_ind(iris.pca,
+  geom.ind = "point", 
+  pointsize =3,pointshape = 21,fill.ind = iris$Species, 
+  palette = c("#E41A1C", "#4DAF4A", "#999999"),
+  addEllipses = TRUE,  legend.title = "Groups",
+  title="")+theme_grey()

在这里插入图片描述

Stacking模型集成

数据处理

> library(ggplot2)
> library(data.table)
> library(mltools)
> library(class) # k均值建模
> library(LiblineaR)
> setwd('E:\\RClass\\建模\\模型集成对鸢尾花数据进行分类')
> iris1 = data.table(iris)
> iris1[,Sepal.Length := scale(Sepal.Length,T,T)]
> iris1[,Sepal.Width := scale(Sepal.Width,T,T)]
> iris1[,Petal.Length := scale(Petal.Length,T,T)]
> iris1[,Petal.Width := scale(Petal.Width,T,T)]
> 
> iris1[,Distance := sqrt(Sepal.Length^2 + Sepal.Width^2 +
+                        Petal.Length^2 + Petal.Width^2)]
> 
> iris1[,ID := 1:nrow(iris1)]
> colnames(iris1)
> p1 = caret::createDataPartition(iris$Species,p = .75,list = TRUE)[[1]]
> data.train = iris1[p1,]
> data.test = iris1[-p1,]
> 
> data.train[,FoldID := folds(Species,nfolds = 5,stratified = T,seed = 2022)]


knn

> knncv <- list()
> knncv[['features']] <- c('Sepal.Length','Sepal.Width','Petal.Length','Petal.Width')
> knncv[['para']] <- CJ(k = seq(1,30))
> knncv[['bestscore']] <- 0
> 
> for(i in seq_len(nrow(knncv[['para']]))){
+         scores <- numeric()
+         predlist <- list()
+         para <- knncv[['para']][i]
+       for(folds in 1:5){
+ 
+           testFold <- data.train[J(FoldID = folds),on = 'FoldID']
+           trainFold <- data.train[!J(FoldID = folds),on = 'FoldID']
+           
+           testFold[,Pred:=knn(train = trainFold[,knncv[['features']],with = F],
+                           test = testFold[,knncv[['features']],with = F],
+                           cl = trainFold$Species,k = para$k)]  
+           predlist <- c(predlist,list(testFold[,list(ID,FoldID,Pred)]))
+           
+           score <- mean(testFold$Species == testFold$Pred)
+           scores <- c(scores,score) }
+ 
+           score <- mean(scores,na.rm = T)
+           knncv[['para']][i,Score:=score] # 打印每个参数的值
+         print(paste('Para:',paste(colnames(knncv[['para']])),knncv[['para']][i],collapse = '|'))
+ 
+        if(score > knncv[['bestscore']]){
+              knncv[['bestscore']] <- score
+              knncv[['bestscores']] <- scores
+              knncv[['bestpara']] <- knncv[['para']][i] 
+              knncv[['bestresults']] <- rbindlist(predlist)
+                                         }
+                            }
[1] "Para: k 1|Para: Score 0.930952380952381"
[1] "Para: k 2|Para: Score 0.929761904761905"
[1] "Para: k 3|Para: Score 0.929761904761905"
[1] "Para: k 4|Para: Score 0.920238095238095"
[1] "Para: k 5|Para: Score 0.948809523809524"
[1] "Para: k 6|Para: Score 0.947619047619048"
[1] "Para: k 7|Para: Score 0.957142857142857"
[1] "Para: k 8|Para: Score 0.957142857142857"
[1] "Para: k 9|Para: Score 0.957142857142857"
[1] "Para: k 10|Para: Score 0.957142857142857"
[1] "Para: k 11|Para: Score 0.957142857142857"
[1] "Para: k 12|Para: Score 0.939285714285714"
[1] "Para: k 13|Para: Score 0.957142857142857"
[1] "Para: k 14|Para: Score 0.957142857142857"
[1] "Para: k 15|Para: Score 0.957142857142857"
[1] "Para: k 16|Para: Score 0.957142857142857"
[1] "Para: k 17|Para: Score 0.957142857142857"
[1] "Para: k 18|Para: Score 0.948809523809524"
[1] "Para: k 19|Para: Score 0.932142857142857"
[1] "Para: k 20|Para: Score 0.94047619047619"
[1] "Para: k 21|Para: Score 0.948809523809524"
[1] "Para: k 22|Para: Score 0.94047619047619"
[1] "Para: k 23|Para: Score 0.923809523809524"
[1] "Para: k 24|Para: Score 0.923809523809524"
[1] "Para: k 25|Para: Score 0.91547619047619"
[1] "Para: k 26|Para: Score 0.91547619047619"
[1] "Para: k 27|Para: Score 0.91547619047619"
[1] "Para: k 28|Para: Score 0.886904761904762"
[1] "Para: k 29|Para: Score 0.914285714285714"
[1] "Para: k 30|Para: Score 0.923809523809524"
> knncv[["bestpara"]] # 小样本情形下的准确率很差 
   k     Score
1: 7 0.9571429
> 

我们将鸢尾花数据分为了训练集与测试集,使用k近邻模型进行了5折交叉验证,选择出的最优参数为k=7,模型准确率为95.7%,理论上讲Stacking模型集成能提高鸢尾花数据分类的准确率,但是k近邻模型的准确率已经达到95.7%,在使用Stacking模型集成上并不能保证会提高准确率。

> ggplot(knncv[['para']],aes(x=k,y = Score))+
+ geom_line()+
+ geom_point()+
+ theme_grey()

svm

> svmcv <- list()
> svmcv[["features"]] <- c("Sepal.Length", "Sepal.Width", "Petal.Length",
+                          'Petal.Width','Distance')
> svmcv[["para"]] <- CJ(type=1:5, cost=c(.01, .1, 1, 10, 100, 1000, 2000), Score=NA_real_)
> svmcv[["BestScore"]] <- 0
> 
> for(i in seq_len(nrow(svmcv[['para']]))){
+ 
+        scores <- numeric() # 储存每折交叉验证得到的准确率
+        predlist <- list() # 储存五折交叉验证的结果
+        para <- svmcv[['para']][i] # 参数
+        
+      for(folds in 1:5){
+        testfold <- data.train[J(FoldID = folds),on = 'FoldID'] 
+        trainfold <- data.train[!J(FoldID = folds),on = 'FoldID']
+        
+        # 对每折的数据进行预测
+        svm <- LiblineaR(data = trainfold[,svmcv$features,with = F],
+                         target = trainfold$Species,
+                         type = para$type,
+                         cost = para$cost )  # 对交叉验证中的数据进行预测
+        testfold[,Pred:=predict(svm,testfold[,svmcv$features,with = F])$predictions]  
+        
+        scores <- c(scores,mean(testfold$Species == testfold$Pred,na.rm = T))
+        
+        predlist <-c(predlist,list(testfold[,list(ID,FoldID,Pred)]))
+                } 
+        score <- mean(scores) # 对参数下的5折交叉验证进行平均
+        svmcv[['para']][i,Score:=score]
+        print(paste('Para:',paste(colnames(svmcv[['para']])),
+               svmcv[['para']][i],collapse = '|')) # 输出每个参数的汇总值
+        if(score > svmcv[['BestScore']]){
+               svmcv[['BestScore']] <- score
+               svmcv[['BestScores']] <- scores
+               svmcv[['BestPara']] <- svmcv[['para']][i]
+               svmcv[['BestResults']] <- rbindlist(predlist) }
+                                   }
[1] "Para: type 1|Para: cost 0.01|Para: Score 0.853571428571429"
[1] "Para: type 1|Para: cost 0.1|Para: Score 0.939285714285714"
[1] "Para: type 1|Para: cost 1|Para: Score 0.955952380952381"
[1] "Para: type 1|Para: cost 10|Para: Score 0.975"
[1] "Para: type 1|Para: cost 100|Para: Score 0.964285714285714"
[1] "Para: type 1|Para: cost 1000|Para: Score 0.947619047619048"
[1] "Para: type 1|Para: cost 2000|Para: Score 0.938095238095238"
[1] "Para: type 2|Para: cost 0.01|Para: Score 0.853571428571429"
[1] "Para: type 2|Para: cost 0.1|Para: Score 0.939285714285714"
[1] "Para: type 2|Para: cost 1|Para: Score 0.955952380952381"
[1] "Para: type 2|Para: cost 10|Para: Score 0.975"
[1] "Para: type 2|Para: cost 100|Para: Score 0.973809523809524"
[1] "Para: type 2|Para: cost 1000|Para: Score 0.964285714285714"
[1] "Para: type 2|Para: cost 2000|Para: Score 0.964285714285714"
[1] "Para: type 3|Para: cost 0.01|Para: Score 0.666666666666667"
[1] "Para: type 3|Para: cost 0.1|Para: Score 0.913095238095238"
[1] "Para: type 3|Para: cost 1|Para: Score 0.947619047619048"
[1] "Para: type 3|Para: cost 10|Para: Score 0.957142857142857"
[1] "Para: type 3|Para: cost 100|Para: Score 0.955952380952381"
[1] "Para: type 3|Para: cost 1000|Para: Score 0.946428571428571"
[1] "Para: type 3|Para: cost 2000|Para: Score 0.946428571428571"
[1] "Para: type 4|Para: cost 0.01|Para: Score 0.666666666666667"
[1] "Para: type 4|Para: cost 0.1|Para: Score 0.939285714285714"
[1] "Para: type 4|Para: cost 1|Para: Score 0.975"
[1] "Para: type 4|Para: cost 10|Para: Score 0.96547619047619"
[1] "Para: type 4|Para: cost 100|Para: Score 0.955952380952381"
[1] "Para: type 4|Para: cost 1000|Para: Score 0.947619047619048"
[1] "Para: type 4|Para: cost 2000|Para: Score 0.947619047619048"
[1] "Para: type 5|Para: cost 0.01|Para: Score 0.666666666666667"
[1] "Para: type 5|Para: cost 0.1|Para: Score 0.921428571428571"
[1] "Para: type 5|Para: cost 1|Para: Score 0.957142857142857"
[1] "Para: type 5|Para: cost 10|Para: Score 0.975"
[1] "Para: type 5|Para: cost 100|Para: Score 0.975"
[1] "Para: type 5|Para: cost 1000|Para: Score 0.975"
[1] "Para: type 5|Para: cost 2000|Para: Score 0.975"
> 
> svmcv$BestPara
   type cost Score
1:    1   10 0.975
> ggplot(svmcv[['para']],aes(x = cost,y = Score,color = factor(type)))+
+ geom_line()+
+ geom_point()+
+ scale_x_log10()
> 

Stacking模型集成

> dim(knncv[['bestresults']])
[1] 114   3
> dim(svmcv[['BestResults']])
[1] 114   3
> 
> metas.knn <- knncv[['bestresults']]
> metas.svm <- svmcv[['BestResults']]
> 
> class(metas.knn)
[1] "data.table" "data.frame"
> dim(metas.knn)
[1] 114   3
> head(metas.knn)
   ID FoldID   Pred
1:  1      1 setosa
2:  3      1 setosa
3:  5      1 setosa
4: 19      1 setosa
5: 24      1 setosa
6: 26      1 setosa
> 
> data.train[metas.knn,Metas.knn := Pred,on = 'ID']
> data.train[metas.svm,Metas.svm:= Pred,on = 'ID']
> 
> data.train <- one_hot(data.train, cols=c("Metas.knn", "Metas.svm"), dropCols=FALSE) # 进行one_hot编码
> lrcv <- list()
> lrcv[['features']] <- setdiff(colnames(data.train),c('ID','Species','FoldID','Metas.knn','Metas.svm'))
> lrcv[['para']] <- CJ(type = c(0,6,7),cost = c(.001, .01, .1, 1, 10, 100))
> lrcv[['bestscore']] <- 0
> 
> for(i in seq_len(nrow(lrcv[['para']]))){
+  
+        scores <- numeric()
+        predlist <- list()
+        para <- lrcv[['para']][i]
+     for(folds in 1:5){
+         testfold <- data.train[J(FoldID =folds),on = 'FoldID']
+         trainfold <- data.train[!J(FoldID =folds),on = 'FoldID']
+         
+         logreg <- LiblineaR(data = trainfold[,lrcv$features,with = F],
+                      target = trainfold$Species,type = para$type,cost = para$cost)
+         testfold[,Pred := predict(logreg,testfold[,lrcv[['features']],with = F])$predictions]
+ 
+         scores <- c(scores,mean(testfold$Species == testfold$Pred))
+         predlist <- c(predlist,list(testfold[,list(ID,FoldID,Pred)]))
+                        }
+         score <- mean(scores)
+         lrcv[['para']][i,Score:=score]
+         print(paste('Para :',paste(colnames(lrcv[['para']])),
+                   lrcv[['para']][i],collapse = '|'))
+         
+         if(score > lrcv[['bestscore']]){
+                    lrcv[['bestscore']] <- score
+                    lrcv[['bestscores']] <- scores
+                    lrcv[['bestpara']] <- lrcv[['para']][i]
+                    lrcv[['bestresult']] <- rbindlist(predlist)
+                                     }}
[1] "Para : type 0|Para : cost 0.001|Para : Score 0.921428571428571"
[1] "Para : type 0|Para : cost 0.01|Para : Score 0.96547619047619"
[1] "Para : type 0|Para : cost 0.1|Para : Score 0.948809523809524"
[1] "Para : type 0|Para : cost 1|Para : Score 0.96547619047619"
[1] "Para : type 0|Para : cost 10|Para : Score 0.96547619047619"
[1] "Para : type 0|Para : cost 100|Para : Score 0.96547619047619"
[1] "Para : type 6|Para : cost 0.001|Para : Score 0.333333333333333"
[1] "Para : type 6|Para : cost 0.01|Para : Score 0.333333333333333"
[1] "Para : type 6|Para : cost 0.1|Para : Score 0.955952380952381"
[1] "Para : type 6|Para : cost 1|Para : Score 0.975"
[1] "Para : type 6|Para : cost 10|Para : Score 0.96547619047619"
[1] "Para : type 6|Para : cost 100|Para : Score 0.96547619047619"
[1] "Para : type 7|Para : cost 0.001|Para : Score 0.929761904761905"
[1] "Para : type 7|Para : cost 0.01|Para : Score 0.96547619047619"
[1] "Para : type 7|Para : cost 0.1|Para : Score 0.948809523809524"
[1] "Para : type 7|Para : cost 1|Para : Score 0.96547619047619"
[1] "Para : type 7|Para : cost 10|Para : Score 0.96547619047619"
[1] "Para : type 7|Para : cost 100|Para : Score 0.96547619047619"
> 
> data.test[,Metas.knn := knn(train = data.train[,knncv[['features']],with = F],
+                  test = data.test[,knncv[['features']],with = F],
+                  cl = data.train$Species,
+                  k = knncv$bestpara$k)]
> 
> svm <- LiblineaR(data = data.train[,svmcv[['features']],with = F],
+                  target = data.train$Species,
+                  type = svmcv[['BestPara']]$type,
+                  cost = svmcv[['BestPara']]$cost)
> data.test[,Metas.svm := predict(svm,data.test[,svmcv[['features']],with = F])$predictions]
> 
> 
> data.test <- one_hot(data.test,cols=c("Metas.knn", "Metas.svm"),dropCols=FALSE)
> lr <- LiblineaR(data = data.train[,lrcv[['features']],with = F],
+                 target = data.train$Species,
+                 type = lrcv[['bestpara']]$type,
+                 cost = lrcv[['bestpara']]$cost)
> 
> data.test[,Metas.lr := predict(lr,data.test[,lrcv[['features']],with = F])$predictions]
> head(data.test)
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species Distance ID Metas.knn
1:   -0.5353840   1.9333146    -1.165809   -1.048667  setosa 2.546204  6    setosa
2:   -1.0184372   0.7861738    -1.279104   -1.311052  setosa 2.238358  8    setosa
3:   -0.5353840   1.4744583    -1.279104   -1.311052  setosa 2.411561 11    setosa
4:   -1.2599638  -0.1315388    -1.335752   -1.442245  setosa 2.338614 13    setosa
5:   -0.5353840   0.7861738    -1.165809   -1.311052  setosa 1.995664 21    setosa
6:   -0.8976739   1.4744583    -1.279104   -1.048667  setosa 2.390744 22    setosa
   Metas.knn_setosa Metas.knn_versicolor Metas.knn_virginica Metas.svm Metas.svm_setosa
1:                1                    0                   0    setosa                1
2:                1                    0                   0    setosa                1
3:                1                    0                   0    setosa                1
4:                1                    0                   0    setosa                1
5:                1                    0                   0    setosa                1
6:                1                    0                   0    setosa                1
   Metas.svm_versicolor Metas.svm_virginica Metas.lr
1:                    0                   0   setosa
2:                    0                   0   setosa
3:                    0                   0   setosa
4:                    0                   0   setosa
5:                    0                   0   setosa
6:                    0                   0   setosa
> 
> mean(data.test$Species == data.test$Metas.knn)
[1] 0.9722222
> mean(data.test$Species == data.test$Metas.svm)
[1] 0.9722222
> mean(data.test$Species == data.test$Metas.lr)
[1] 0.9722222
> 

总结:本篇博客是介绍如何使用Stacking模型对鸢尾花数据进行分类,由于样本量较小的问题,在对测试集进行分类识别中三种分类方法预测的结果相同,不过没有关系相信大家在本篇博客中已经学到了Stacking模型的精髓。


网站公告

今日签到

点亮在社区的每一天
去签到