14  Workflow Sets for Screening Many Models

Section 10.1 demonstrated how to use a workflowset object together with resampled data sets. That was only a simple application; this chapter discusses how to use workflow sets in detail.

In this chapter we demonstrate how to use workflow sets with the concrete compressive strength data set, concrete.

14.1 Loading and Splitting the Data
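The glimpse() output below was presumably produced along these lines (a minimal sketch; it assumes the concrete data shipped with the modeldata package, which is installed with tidymodels):

```r
library(tidymodels)
tidymodels_prefer()

# the concrete compressive-strength data from the modeldata package
data(concrete, package = "modeldata")
glimpse(concrete)
```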

Rows: 1,030
Columns: 9
$ cement               <dbl> 540.0, 540.0, 332.5, 332.5, 198.6, 266.0, 380.0, …
$ blast_furnace_slag   <dbl> 0.0, 0.0, 142.5, 142.5, 132.4, 114.0, 95.0, 95.0,…
$ fly_ash              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ water                <dbl> 162, 162, 228, 228, 192, 228, 228, 228, 228, 228,…
$ superplasticizer     <dbl> 2.5, 2.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,…
$ coarse_aggregate     <dbl> 1040.0, 1055.0, 932.0, 932.0, 978.4, 932.0, 932.0…
$ fine_aggregate       <dbl> 676.0, 676.0, 594.0, 594.0, 825.5, 670.0, 594.0, …
$ age                  <int> 28, 28, 270, 365, 360, 90, 365, 28, 28, 28, 90, 2…
$ compressive_strength <dbl> 79.99, 61.89, 40.27, 41.05, 44.30, 47.03, 43.70, …

The data set contains 9 variables; compressive_strength is the response and the rest are predictors:

  • age is measured in days and records how long each concrete sample has cured; concrete gains strength as it ages.
  • cement, water, and the other ingredient columns give the amounts in the mixture, in \(kg/m^3\).
  • The data contain some duplicated concrete formulas (identical ingredient combinations). The same formula could appear in both the training and test sets, causing data leakage and an overly optimistic evaluation of the models. To avoid this, we model the mean compressive strength of each unique formula.
concrete <- concrete %>%
  group_by(across(-compressive_strength)) %>%
  summarise(
    compressive_strength = mean(compressive_strength),
    .groups = "drop"
  )

# data split
set.seed(1501)
concrete_split <- initial_split(concrete, strata = compressive_strength)
contrete_train <- training(concrete_split)
contrete_test <- testing(concrete_split)

# resample
set.seed(1502)
concrete_folds <- vfold_cv(
  contrete_train,
  strata = compressive_strength,
  v = 10, repeats = 5
)

14.2 Building the Models

14.2.1 Creating the recipes

Some models (e.g. neural networks, KNN, and SVMs) require the predictors to be centered and scaled, while others do not, so we create two different recipe objects.

normalized_recipe <-
  recipe(compressive_strength ~ ., data = contrete_train) %>%
  step_normalize(all_predictors())

poly_recipe <-
  recipe(compressive_strength ~ ., data = contrete_train) %>%
  step_poly(all_predictors()) %>% # orthogonal polynomials (quadratic terms)
  step_interact(~ all_predictors():all_predictors()) # interaction terms

14.2.2 Defining the models with parsnip and its extension packages

library(rules) # rule-based models such as Cubist
library(baguette) # bagging (bootstrap aggregation) models

linear_reg_spec <-
  linear_reg(penalty = tune(), mixture = tune()) %>%
  set_mode("regression") %>%
  set_engine("glmnet")

nnet_spec <-
  mlp(hidden_units = tune(), epochs = tune(), penalty = tune()) %>%
  set_mode("regression") %>%
  set_engine("nnet", MaxNWts = 2600)

mars_spec <-
  mars(prod_degree = tune()) %>%
  set_engine("earth") %>%
  set_mode("regression")

svm_r_spec <-
  svm_rbf(cost = tune(), rbf_sigma = tune()) %>%
  set_engine("kernlab") %>%
  set_mode("regression")

svm_p_spec <-
  svm_poly(cost = tune(), degree = tune()) %>%
  set_engine("kernlab") %>%
  set_mode("regression")

knn_spec <-
  nearest_neighbor(
    neighbors = tune(),
    dist_power = tune(),
    weight_func = tune()
  ) %>%
  set_engine("kknn") %>%
  set_mode("regression")

cart_spec <-
  decision_tree(cost_complexity = tune(), min_n = tune()) %>%
  set_engine("rpart") %>%
  set_mode("regression")

bag_cart_spec <-
  bag_tree() %>%
  set_engine("rpart", times = 50L) %>%
  set_mode("regression")

rf_spec <-
  rand_forest(trees = tune(), min_n = tune(), mtry = tune()) %>%
  set_engine("ranger") %>%
  set_mode("regression")

xgb_spec <-
  boost_tree(
    tree_depth = tune(), learn_rate = tune(),
    loss_reduction = tune(), min_n = tune(),
    sample_size = tune(), trees = tune()
  ) %>%
  set_engine("xgboost") %>%
  set_mode("regression")

cubist_spec <-
  cubist_rules(committees = tune(), neighbors = tune()) %>%
  set_engine("Cubist") %>%
  set_mode("regression")

# Narrow the neural network's default hidden_units range, based on previous analyses
nnet_param <-
  nnet_spec %>%
  extract_parameter_set_dials() %>%
  update(hidden_units = hidden_units(c(1, 27)))

14.3 Creating the Workflow Sets

A workflow set is created from two lists:

  • a models list of model specifications;
  • a preproc list of preprocessing steps: formulas, recipe objects, or dplyr-style variable selectors.

In the resulting object:

  • each workflow can be extracted by its wflow_id;
  • the option column is a placeholder for any arguments to use when the workflows are evaluated;
  • the result column is a placeholder for the output of the tuning or resampling functions.
normalized <-
  workflow_set(
    preproc = list(normalized = normalized_recipe),
    models = list(
      SVM_radial = svm_r_spec,
      SVM_poly = svm_p_spec,
      KNN = knn_spec,
      neural_network = nnet_spec
    )
  )

# Add the modified neural-network parameter set; used when tuning with tune or finetune
normalized <-  
  normalized %>% 
  option_add(param_info = nnet_param, id = "normalized_neural_network")

normalized
# A workflow set/tibble: 4 × 4
  wflow_id                  info             option    result    
  <chr>                     <list>           <list>    <list>    
1 normalized_SVM_radial     <tibble [1 × 4]> <opts[0]> <list [0]>
2 normalized_SVM_poly       <tibble [1 × 4]> <opts[0]> <list [0]>
3 normalized_KNN            <tibble [1 × 4]> <opts[0]> <list [0]>
4 normalized_neural_network <tibble [1 × 4]> <opts[1]> <list [0]>
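As noted above, a single workflow can be pulled out of the set by its wflow_id; for example:

```r
# extract one workflow (preprocessor + model) from the set by id
normalized %>% extract_workflow(id = "normalized_KNN")
```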
model_vars <- workflow_variables(
  outcomes = compressive_strength,
  predictors = everything()
)

no_pre_proc <- 
  workflow_set(
    preproc = list(simple = model_vars),
    models = list(
      MARS = mars_spec,
      CART = cart_spec,
      CATR_bagged = bag_cart_spec,
      RF = rf_spec,
      boosting = xgb_spec,
      Cubist = cubist_spec
    )
  )
no_pre_proc
# A workflow set/tibble: 6 × 4
  wflow_id           info             option    result    
  <chr>              <list>           <list>    <list>    
1 simple_MARS        <tibble [1 × 4]> <opts[0]> <list [0]>
2 simple_CART        <tibble [1 × 4]> <opts[0]> <list [0]>
3 simple_CATR_bagged <tibble [1 × 4]> <opts[0]> <list [0]>
4 simple_RF          <tibble [1 × 4]> <opts[0]> <list [0]>
5 simple_boosting    <tibble [1 × 4]> <opts[0]> <list [0]>
6 simple_Cubist      <tibble [1 × 4]> <opts[0]> <list [0]>
with_features <- 
  workflow_set(
    preproc = list(full_quad = poly_recipe),
    models = list(linear_reg = linear_reg_spec,
                  KNN = knn_spec)
  )
with_features
# A workflow set/tibble: 2 × 4
  wflow_id             info             option    result    
  <chr>                <list>           <list>    <list>    
1 full_quad_linear_reg <tibble [1 × 4]> <opts[0]> <list [0]>
2 full_quad_KNN        <tibble [1 × 4]> <opts[0]> <list [0]>

All of these objects are tibbles that carry an additional workflow_set class, so the workflow sets can be combined by row-binding.

This can be verified with the class() function:
all_workflow <- 
  bind_rows(no_pre_proc, normalized, with_features) %>% 
  # simplify the wflow_id values
  mutate(wflow_id = gsub("(simple_)|(normalized_)", "", wflow_id))
all_workflow
# A workflow set/tibble: 12 × 4
   wflow_id             info             option    result    
   <chr>                <list>           <list>    <list>    
 1 MARS                 <tibble [1 × 4]> <opts[0]> <list [0]>
 2 CART                 <tibble [1 × 4]> <opts[0]> <list [0]>
 3 CATR_bagged          <tibble [1 × 4]> <opts[0]> <list [0]>
 4 RF                   <tibble [1 × 4]> <opts[0]> <list [0]>
 5 boosting             <tibble [1 × 4]> <opts[0]> <list [0]>
 6 Cubist               <tibble [1 × 4]> <opts[0]> <list [0]>
 7 SVM_radial           <tibble [1 × 4]> <opts[0]> <list [0]>
 8 SVM_poly             <tibble [1 × 4]> <opts[0]> <list [0]>
 9 KNN                  <tibble [1 × 4]> <opts[0]> <list [0]>
10 neural_network       <tibble [1 × 4]> <opts[1]> <list [0]>
11 full_quad_linear_reg <tibble [1 × 4]> <opts[0]> <list [0]>
12 full_quad_KNN        <tibble [1 × 4]> <opts[0]> <list [0]>
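The extra class mentioned above can be confirmed directly (the inherited tibble classes may differ slightly across package versions):

```r
# a workflow set is a tibble with an additional "workflow_set" class
class(all_workflow)
```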

14.4 Tuning and Evaluating the Models

  • Almost all of the members of all_workflow have tuning parameters.
  • workflow_map() applies a tuning or resampling function to every individual workflow in the workflow set (tune_grid() by default), so there is no need to call tune_grid() and friends one workflow at a time.
# set up parallel processing
library(doParallel)
library(finetune)
registerDoParallel(cores = round(parallel::detectCores() * 0.8))
# tune over a regular grid of size 25 (grid = 25 below)
## control options for the race
race_ctrl <- 
  control_race(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = TRUE,
    verbose = TRUE
  )

## grid search via racing (ANOVA)
race_results <- 
  all_workflow %>% 
  workflow_map(
    "tune_race_anova",
    seed = 1503,
    resamples = concrete_folds,
    grid = 25,
    control = race_ctrl
  )
race_results %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  arrange(mean)
# A tibble: 12 × 9
   wflow_id         .config preproc model .metric .estimator  mean     n std_err
   <chr>            <chr>   <chr>   <chr> <chr>   <chr>      <dbl> <int>   <dbl>
 1 boosting         pre0_m… workfl… boos… rmse    standard    4.00    50  0.0889
 2 Cubist           pre0_m… workfl… cubi… rmse    standard    4.71    50  0.0913
 3 RF               pre0_m… workfl… rand… rmse    standard    5.20    50  0.0929
 4 CATR_bagged      pre0_m… workfl… bag_… rmse    standard    5.32    50  0.0968
 5 neural_network   pre0_m… recipe  mlp   rmse    standard    5.56    50  0.0778
 6 full_quad_linea… pre0_m… recipe  line… rmse    standard    6.22    50  0.0867
 7 SVM_radial       pre0_m… recipe  svm_… rmse    standard    6.22    50  0.114 
 8 MARS             pre0_m… workfl… mars  rmse    standard    6.28    50  0.0897
 9 SVM_poly         pre0_m… recipe  svm_… rmse    standard    6.89    50  0.111 
10 CART             pre0_m… workfl… deci… rmse    standard    7.15    50  0.165 
11 full_quad_KNN    pre0_m… recipe  near… rmse    standard    8.06    50  0.148 
12 KNN              pre0_m… recipe  near… rmse    standard    8.71    50  0.136 
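An alternative to collect_metrics() for comparing workflows is rank_results() from workflowsets, which orders them by a chosen metric; a sketch using the objects above:

```r
# rank the workflows by their best RMSE configuration
race_results %>%
  rank_results(rank_metric = "rmse", select_best = TRUE) %>%
  filter(.metric == "rmse") %>%
  select(wflow_id, mean, std_err, rank)
```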
# explore the race results
autoplot(
  race_results,
  rank_metric = "rmse",
  metric = "rmse",
  select_best = TRUE
) +
  geom_text(
    # mean is the mean metric column from collect_metrics(race_results)
    aes(y = mean - 1/2, label = wflow_id), 
    angle = 90, hjust = 1
  ) +
  lims(y = c(3.0, 9.5)) +
  theme(legend.position = "none")

collect_metrics(race_results)
# A tibble: 24 × 9
   wflow_id    .config      preproc model .metric .estimator  mean     n std_err
   <chr>       <chr>        <chr>   <chr> <chr>   <chr>      <dbl> <int>   <dbl>
 1 MARS        pre0_mod1_p… workfl… mars  rmse    standard   6.28     50 0.0897 
 2 MARS        pre0_mod1_p… workfl… mars  rsq     standard   0.855    50 0.00452
 3 CART        pre0_mod09_… workfl… deci… rmse    standard   7.15     50 0.165  
 4 CART        pre0_mod09_… workfl… deci… rsq     standard   0.818    50 0.00786
 5 CATR_bagged pre0_mod0_p… workfl… bag_… rmse    standard   5.32     50 0.0968 
 6 CATR_bagged pre0_mod0_p… workfl… bag_… rsq     standard   0.899    50 0.00353
 7 RF          pre0_mod18_… workfl… rand… rmse    standard   5.20     50 0.0929 
 8 RF          pre0_mod18_… workfl… rand… rsq     standard   0.904    50 0.00323
 9 boosting    pre0_mod20_… workfl… boos… rmse    standard   4.00     50 0.0889 
10 boosting    pre0_mod20_… workfl… boos… rsq     standard   0.941    50 0.00271
# ℹ 14 more rows
collect_predictions(race_results)
# A tibble: 8,916 × 7
   wflow_id .config         preproc      model  .pred compressive_strength  .row
   <chr>    <chr>           <chr>        <chr>  <dbl>                <dbl> <int>
 1 MARS     pre0_mod1_post0 workflow_va… mars  -1.77                  2.33     2
 2 MARS     pre0_mod1_post0 workflow_va… mars   1.53                  3.32     5
 3 MARS     pre0_mod1_post0 workflow_va… mars   1.48                  4.78    27
 4 MARS     pre0_mod1_post0 workflow_va… mars   4.38                  4.9     65
 5 MARS     pre0_mod1_post0 workflow_va… mars  -0.772                 6.47   112
 6 MARS     pre0_mod1_post0 workflow_va… mars   9.89                  6.81    90
 7 MARS     pre0_mod1_post0 workflow_va… mars   4.88                  6.88     6
 8 MARS     pre0_mod1_post0 workflow_va… mars  10.4                   6.9     47
 9 MARS     pre0_mod1_post0 workflow_va… mars   3.84                  6.94    56
10 MARS     pre0_mod1_post0 workflow_va… mars  13.4                   7.32    63
# ℹ 8,906 more rows

14.5 Finalizing the Model

  1. Extract the best model's workflow.
  2. Update that workflow's tuning parameters with the best values found during tuning.
  3. Fit the model and carry out any further evaluation or analysis.
# extract the tuning-parameter values of the best model
best_result <-
  race_results %>%
  extract_workflow_set_result("boosting") %>% 
  select_best(metric = "rmse")
best_result
# A tibble: 1 × 7
  trees min_n tree_depth learn_rate loss_reduction sample_size .config         
  <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>           
1  1583     2          9     0.0365         0.0422        0.25 pre0_mod20_post0
# update the best workflow with those parameter values
boosting_test_results <- 
  race_results %>%
  extract_workflow("boosting") %>% 
  finalize_workflow(best_result) %>% 
  last_fit(concrete_split)

# inspect the test-set metrics
collect_metrics(boosting_test_results)
# A tibble: 2 × 4
  .metric .estimator .estimate .config        
  <chr>   <chr>          <dbl> <chr>          
1 rmse    standard       3.45  pre0_mod0_post0
2 rsq     standard       0.953 pre0_mod0_post0
# visualize the predictions
boosting_test_results %>% 
  collect_predictions() %>% 
  ggplot(aes(x = compressive_strength, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(color = "gray50", lty = 2) +
  coord_obs_pred() +  # use the same scale for observed and predicted axes
  labs(x = "observed", y = "predicted")

  • The results show that the boosting model has the best test-set performance (RMSE 3.45, rsq 0.95), and its predictions agree closely with the observed values.
  • If what you end up with is not a workflow but a recipe and a model, use finalize_recipe() and finalize_model() to obtain the final recipe and model spec objects, and then run last_fit().
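A hypothetical sketch of that alternative route, reusing xgb_spec and best_result from above (wrapping the finalized spec with add_variables() is one choice among formulas and recipes):

```r
# finalize a bare model spec instead of a whole workflow
final_boost_spec <- finalize_model(xgb_spec, best_result)

# wrap it in a fresh workflow and evaluate on the test set
final_fit <-
  workflow() %>%
  add_variables(
    outcomes = compressive_strength,
    predictors = everything()
  ) %>%
  add_model(final_boost_spec) %>%
  last_fit(concrete_split)

collect_metrics(final_fit)
```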