Model Step 1 - Train and Deploy Model

Published

January 24, 2025

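This notebook trains a model to predict the number of bikes at a given bike docking station. The model is trained using the bike_model_data table from Content DB. The trained model is then saved as a pin on Posit Connect (using vetiver and pins) and deployed to Posit Connect as a plumber API.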

Get data

Connect to the database:

# Open an ODBC connection to the "soleng" PostgreSQL database.
# The server address and the credentials are read from the environment
# variables DB_SERVER, DB_USER and DB_PASSWORD, so no secrets are stored in
# this notebook.
con <- DBI::dbConnect(
  odbc::odbc(),
  Driver      = "postgresql",
  Server      = Sys.getenv("DB_SERVER"),
  Port        = "5432",
  Database    = "soleng",
  UID         = Sys.getenv("DB_USER"),
  PWD         = Sys.getenv("DB_PASSWORD"),
  # NOTE(review): an empty string is handed to the driver for this option;
  # presumably it switches off the driver's bools-as-char handling -- confirm.
  BoolsAsChar = "",
  # Presumably the connection timeout in seconds -- confirm against odbc docs.
  timeout     = 10
)

Split the data into training and test sets:

# Reference to the modelling table in the database; rows are only pulled into
# R by the collect() calls.
all_days <- tbl(con, DBI::Id(schema = "content", name = "bike_model_data"))

# Every distinct date in the table, newest first, as a Date vector.
distinct_dates <- collect(distinct(all_days, date))
dates <- as.Date(pull(arrange(distinct_dates, desc(date)), date))

# Split the data into test and train.
# `dates` is sorted newest-first, so the `n_days_test` most recent dates are
# held out for testing and training ends on the next-newest date.
n_days_test <- 2
n_days_to_train <- 10

# Fail fast when the table holds too few distinct dates. Without this guard
# `dates[n_days_test + 1]` is NA, and that NA propagates into the date filters
# used to build the train/test split.
if (length(dates) <= n_days_test) {
  stop(
    "Need more than ", n_days_test, " distinct dates to build a train/test ",
    "split, but found ", length(dates), ".",
    call. = FALSE
  )
}

train_end_date <- dates[n_days_test + 1]
# NOTE(review): the training filter is inclusive at both ends, so the window
# spans n_days_to_train + 1 calendar days -- confirm this is intended.
train_start_date <- train_end_date - n_days_to_train

# Training split: every distinct row dated within
# [train_start_date, train_end_date], pulled into local memory.
train_data <- all_days %>%
  filter(date >= train_start_date & date <= train_end_date) %>%
  distinct() %>%
  collect()

# Summary values for the log message. The names `start`, `end` and `num_obs`
# are interpolated by name in the glue template below.
start <- min(train_data$date)
end <- max(train_data$date)
num_obs <- scales::comma(nrow(train_data))

print(
  glue::glue(
    "The model will be trained on data from {start} to {end} ",
    "({num_obs} observations). "
  )
)
## The model will be trained on data from 2025-01-11 to 2025-01-21 (8,501 observations).

# Test split: every distinct row dated after the end of the training window,
# pulled into local memory.
test_data <- collect(distinct(filter(all_days, date > train_end_date)))

# Summary values for the log message. The names `start`, `end` and `num_obs`
# are interpolated by name in the glue template below.
start <- min(test_data$date)
end <- max(test_data$date)
num_obs <- scales::comma(nrow(test_data))

print(
  glue::glue(
    "The model will be tested on data from {start} to {end} ",
    "({num_obs} observations). "
  )
)
## The model will be tested on data from 2025-01-22 to 2025-01-23 (1,541 observations).

Train the model

Data preprocessing

Define a recipe to clean the data.

# Preprocessing recipe: model n_bikes from every other column, dummy-encode
# the day of week, and replace id and date with integer codes.
recipe_spec <- recipe(n_bikes ~ ., data = train_data)
recipe_spec <- step_dummy(recipe_spec, dow)
# NOTE(review): step_integer() learns its codes from the training values; per
# the recipes docs, values not seen in training (e.g. later dates in the test
# set) are presumably coded as 0 -- confirm this is acceptable for `date`.
recipe_spec <- step_integer(recipe_spec, id, date)

# Estimate the recipe on the training data and show what the first few
# processed rows look like.
recipe_spec %>%
  prep(training = train_data) %>%
  bake(new_data = head(train_data)) %>%
  glimpse()
## Rows: 6
## Columns: 13
## $ id            <int> 1, 1, 1, 1, 1, 1
## $ hour          <dbl> 0, 0, 0, 0, 0, 0
## $ date          <int> 1, 2, 3, 4, 5, 6
## $ month         <dbl> 1, 1, 1, 1, 1, 1
## $ lat           <dbl> 38.87035, 38.87035, 38.87035, 38.87035, 38.87035, 38.870…
## $ lon           <dbl> -76.94528, -76.94528, -76.94528, -76.94528, -76.94528, -…
## $ n_bikes       <dbl> 0, 0, 0, 0, 0, 0
## $ dow_Monday    <dbl> 0, 0, 1, 0, 0, 0
## $ dow_Saturday  <dbl> 1, 0, 0, 0, 0, 0
## $ dow_Sunday    <dbl> 0, 1, 0, 0, 0, 0
## $ dow_Thursday  <dbl> 0, 0, 0, 0, 0, 1
## $ dow_Tuesday   <dbl> 0, 0, 0, 1, 0, 0
## $ dow_Wednesday <dbl> 0, 0, 0, 0, 1, 0

Fit model

Fit a random forest model:

# Model specification: a random forest for regression, fitted with ranger.
model_spec <- rand_forest()
model_spec <- set_mode(model_spec, "regression")
model_spec <- set_engine(model_spec, "ranger")

# Workflow that bundles the preprocessing recipe with the model specification.
model_workflow <- workflow()
model_workflow <- add_recipe(model_workflow, recipe_spec)
model_workflow <- add_model(model_workflow, model_spec)

# Fit on the training split and print the fitted workflow.
# NOTE(review): no seed is set before fitting, so results will vary from run
# to run.
model_fit <- model_workflow %>%
  fit(data = train_data)
model_fit
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_dummy()
## • step_integer()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
## 
## Call:
##  ranger::ranger(x = maybe_data_frame(x), y = y, num.threads = 1,      verbose = FALSE, seed = sample.int(10^5, 1)) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      8501 
## Number of independent variables:  12 
## Mtry:                             3 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       17.24373 
## R squared (OOB):                  0.2657149

Model evaluation

# Predict on the held-out test split and attach the predictions to its rows.
predictions <- predict(model_fit, new_data = test_data)
results <- mutate(test_data, preds = predictions[[".pred"]])

# Compare observed and predicted bike counts. `oos_metrics()` is defined
# outside this section.
oos_metrics(results$n_bikes, results$preds)
## # A tibble: 1 × 4
##    rmse   mae   ccc    r2
##   <dbl> <dbl> <dbl> <dbl>
## 1  4.16  3.35 0.288 0.226

Model deployment

vetiver

Create a vetiver model object.

# Name of the model, and the fully qualified pin name (owner/name) under which
# it is referenced on Posit Connect.
model_name <- "bike_predict_model_r"
pin_name <- glue::glue("katie.masiello@posit.co/{model_name}")

# First and last date of each split, stored as character vectors. These go
# into the pin metadata so that other scripts can look them up.
date_metadata <- lapply(
  list(train_dates = train_data, test_dates = test_data),
  function(split) {
    c(as.character(min(split$date)), as.character(max(split$date)))
  }
)

print(date_metadata)
## $train_dates
## [1] "2025-01-11" "2025-01-21"
## 
## $test_dates
## [1] "2025-01-22" "2025-01-23"

# Create the vetiver model: the fitted workflow bundled with its name, an
# input prototype and the train/test date ranges as metadata.
#
# The input prototype is one training row without the outcome column. It is
# passed as `save_prototype`, which supersedes the deprecated `save_ptype`
# argument of vetiver_model().
v <- vetiver_model(
  model_fit,
  model_name,
  versioned = TRUE,
  save_prototype = train_data %>%
    head(1) %>%
    select(-n_bikes),
  metadata = date_metadata
)

v
## 
## ── bike_predict_model_r ─ <bundled_workflow> model for deployment 
## A ranger regression modeling workflow using 7 features

pins

Save the model as a pin to Posit Connect:

# Posit Connect serves as the pin board. The server address and API key are
# read from the environment, and pin versioning is switched on.
board <- pins::board_connect(
  versioned = TRUE,
  server = Sys.getenv("CONNECT_SERVER"),
  key = Sys.getenv("CONNECT_API_KEY")
)
# Publish the vetiver model to the board as a pin.
vetiver_pin_write(board, v)

plumber

Then, deploy the model as a plumber API to Posit Connect.

# Register the Connect server and account with rsconnect, then deploy the
# pinned model to Connect as a plumber API.

# Account and server nickname are each used by both the registration and the
# deployment, so they are defined once here.
connect_account <- "katie.masiello@posit.co"
connect_server_name <- "pub.current"

# Add server
# NOTE(review): the URL is hard-coded here while the pin board reads
# CONNECT_SERVER from the environment -- confirm both point at the same server.
rsconnect::addServer(
  url = "https://pub.current.posit.team/__api__",
  name = connect_server_name
)

# Add account (authenticated with the API key from the environment)
rsconnect::connectApiUser(
  account = connect_account,
  server = connect_server_name,
  apiKey = Sys.getenv("CONNECT_API_KEY")
)

# Deploy to Connect. `appId` targets existing content, so this presumably
# redeploys over it rather than creating new content -- confirm.
vetiver_deploy_rsconnect(
  board = board,
  name = pin_name,
  appId = "442",
  launch.browser = FALSE,
  appTitle = "Bikeshare Prediction: 03b - Model - API",
  predict_args = list(debug = FALSE),
  account = connect_account,
  server = connect_server_name
)
## Building Plumber API...
## Bundle created with R version 4.4.1 is compatible with environment Kubernetes::654654567442.dkr.ecr.us-east-2.amazonaws.com/ptd-adhoc-pct:content-r4.4.1-py3.10.14-quarto1.4.557 with R version 4.4.1 from /opt/R/4.4.1/bin/R 
## Bundle requested R version 4.4.1; using /opt/R/4.4.1/bin/R from Kubernetes::654654567442.dkr.ecr.us-east-2.amazonaws.com/ptd-adhoc-pct:content-r4.4.1-py3.10.14-quarto1.4.557 which has version 4.4.1
## Performing manifest.json to packrat transformation.
## Determining session server location ...
## [rsc-session] Content GUID: 6570e768-2118-4e5c-aee5-97b7027ab1b0
## 2025/01/24 16:55:48.039330697 [rsc-session] Content ID: 442
## 2025/01/24 16:55:48.039333737 [rsc-session] Bundle ID: 2053
## 2025/01/24 16:55:48.039335413 [rsc-session] Job Key: ZVfn4cN4ZUMncnSp
## Connecting to session server http://service-4841e2c0-1d8e-4a60-9013-8ee09edacc3f.posit-team:50734 ...
## Connected to session server http://service-4841e2c0-1d8e-4a60-9013-8ee09edacc3f.posit-team:50734
## Running on host: packrat-restore-4pzb5-pvqzt
## 2025/01/24 16:55:48.609775343 Process ID: 39
## Linux distribution: Ubuntu 22.04.5 LTS (jammy)
## Running as user: uid=999 gid=999 groups=999
## 2025/01/24 16:55:48.617271857 Connect version: 2024.12.0
## 2025/01/24 16:55:48.617278652 LANG: en_US.UTF-8
## 2025/01/24 16:55:48.617302263 Working directory: /opt/rstudio-connect/mnt/app
## Using R 4.4.1
## 2025/01/24 16:55:48.617547198 R.home(): /opt/R/4.4.1/lib/R
## Using user agent string: 'RStudio R (4.4.1 x86_64-pc-linux-gnu x86_64 linux-gnu)' 
## Configuring packrat to use available credentials for private repository access.
## 2025/01/24 16:55:48.618319899 # Validating R library read / write permissions --------------------------------
## Using R library for packrat bootstrap: /opt/rstudio-connect/mnt/R/654654567442.dkr.ecr.us-east-2.amazonaws.com_ptd-adhoc-pct__content-r4.4.1-py3.10.14-quarto1.4.557/4.4.1
## # Validating managed packrat installation --------------------------------------
## Vendored packrat archive: /opt/rstudio-connect/ext/R/packrat_0.9.2.9000_70625806c44bda42a7f3aeaa92ee65542cc590be.tar.gz
## Vendored packrat SHA: 70625806c44bda42a7f3aeaa92ee65542cc590be
## Managed packrat SHA:  70625806c44bda42a7f3aeaa92ee65542cc590be
## Managed packrat version: 0.9.2.9000
## 2025/01/24 16:55:48.643649263 Managed packrat is up-to-date.
## # Validating packrat cache read / write permissions ----------------------------
## Using packrat cache directory: /opt/rstudio-connect/mnt/packrat/654654567442.dkr.ecr.us-east-2.amazonaws.com_ptd-adhoc-pct__content-r4.4.1-py3.10.14-quarto1.4.557/4.4.1
## # Setting packrat options and preparing lockfile -------------------------------
## Audited package hashes with local packrat installation.
## # Resolving R package repositories ---------------------------------------------
## Received repositories from Connect's configuration:
## 2025/01/24 16:55:49.363987851 - CRAN = "https://pkg.current.posit.team/cran/latest"
## 2025/01/24 16:55:49.363995583 - RSPM = "https://pkg.current.posit.team/cran/latest"
## Rewrote Posit Package Manager URLs to install binary packages:
## 2025/01/24 16:55:49.602624201 - Rewrote "CRAN" from "https://pkg.current.posit.team/cran/latest" to "https://pkg.current.posit.team/cran/__linux__/jammy/latest".
## - Rewrote "RSPM" from "https://pkg.current.posit.team/cran/latest" to "https://pkg.current.posit.team/cran/__linux__/jammy/latest".
## 2025/01/24 16:55:49.602882656 Received repositories from published content:
## 2025/01/24 16:55:49.602997510 - CRAN = "https://cloud.r-project.org"
## Combining repositories from configuration and content.
## Packages will be installed using the following repositories:
## 2025/01/24 16:55:49.604749822 - CRAN = "https://pkg.current.posit.team/cran/__linux__/jammy/latest"
## 2025/01/24 16:55:49.604758295 - RSPM = "https://pkg.current.posit.team/cran/__linux__/jammy/latest"
## - CRAN.1 = "https://cloud.r-project.org"
## # Installing required R packages with `packrat::restore()` ---------------------
## Installing KernSmooth (2.23-22) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.814678328 Installing MASS (7.3-61) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.822295318 Installing R6 (2.5.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.829277858 Installing RColorBrewer (1.1-3) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.835492493 Installing Rcpp (1.0.13) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.841167768 Installing SQUAREM (2021.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.846850940 Installing bit (4.5.0) ... 
##  OK (symlinked cache)
## Installing cli (3.6.3) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.858241646 Installing clipr (0.8.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.863906085 Installing codetools (0.2-20) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.869646187 Installing colorspace (2.1-1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.875317207 Installing cpp11 (0.5.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.881031639 Installing crayon (1.5.3) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.886542900 Installing curl (5.2.3) ... 
##  OK (symlinked cache)
## Installing data.table (1.16.0) ... 
##  OK (symlinked cache)
## Installing digest (0.6.36) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.903687961 Installing fansi (1.0.6) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.909566972 Installing farver (2.1.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.915764628 Installing fastmap (1.2.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.922165970 Installing fs (1.6.4) ... 
##  OK (symlinked cache)
## Installing generics (0.1.3) ... 
##  OK (symlinked cache)
## Installing glue (1.7.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.941316932 Installing gower (1.0.1) ... 
##  OK (symlinked cache)
## Installing isoband (0.2.7) ... 
##  OK (symlinked cache)
## Installing jsonlite (1.8.9) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.961164826 Installing labeling (0.4.3) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.967919243 Installing lattice (0.22-6) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.974899676 Installing listenv (0.9.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.981815000 Installing magrittr (2.0.3) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.988761724 Installing mime (0.12) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:49.995295748 Installing nnet (7.3-19) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.001969562 Installing numDeriv (2016.8-1.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.008803713 Installing parallelly (1.38.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.018284581 Installing pkgconfig (2.0.3) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.025259345 Installing prettyunits (1.2.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.032215158 Installing rapidoc (9.3.4) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.039301138 Installing rappdirs (0.3.3) ... 
##  OK (symlinked cache)
## Installing rlang (1.1.4) ... 
##  OK (symlinked cache)
## Installing rpart (4.1.23) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.061238526 Installing shape (1.4.6.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.068391318 Installing sodium (1.3.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.075591734 Installing stringi (1.8.4) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.083080031 Installing swagger (5.17.14.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.090431089 Installing sys (3.4.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.097631755 Installing timeDate (4041.110) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.105158736 Installing utf8 (1.2.4) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.112367407 Installing viridisLite (0.4.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.119812018 Installing whisker (0.4.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.127087308 Installing withr (3.0.1) ... 
##  OK (symlinked cache)
## Installing yaml (2.3.10) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.142249137 Installing class (7.3-22) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.149368404 Installing RcppEigen (0.3.4.0.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.156276157 Installing bit64 (4.5.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.163323585 Installing globals (0.16.3) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.170431408 Installing munsell (0.5.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.177376648 Installing timechange (0.3.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.184312253 Installing tzdb (0.4.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.191362693 Installing progressr (0.14.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.198627841 Installing webutils (1.2.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.205534548 Installing Matrix (1.7-0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.212655194 Installing nlme (3.1-166) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.219554966 Installing ellipsis (0.3.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.226849108 Installing later (1.3.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.233538015 Installing lifecycle (1.0.4) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.240510364 Installing lobstr (1.1.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.247412001 Installing diagram (1.6.5) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.254631488 Installing askpass (1.2.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.261788629 Installing future (1.34.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.269205180 Installing lubridate (1.9.3) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.276380007 Installing ranger (0.16.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.283373012 Installing survival (3.7-0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.290396738 Installing mgcv (1.9-1) ... 
## 2025/01/24 16:55:50.297261942    OK (symlinked cache)
## 2025/01/24 16:55:50.297319544 Installing promises (1.3.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.304291461 Installing gtable (0.3.5) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.311178081 Installing scales (1.3.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.318494605 Installing vctrs (0.6.5) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.326169944 Installing openssl (2.2.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.333389515 Installing future.apply (1.11.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.340229078 Installing httpuv (1.6.15) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.347363099 Installing clock (0.7.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.354731438 Installing hms (1.1.3) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.361629867 Installing pillar (1.9.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.368890210 Installing purrr (1.0.2) ... 
##  OK (symlinked cache)
## Installing stringr (1.5.1) ... 
##  OK (symlinked cache)
## Installing tidyselect (1.2.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.389631816 Installing httr (1.4.7) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.396609170 Installing lava (1.8.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.403480941 Installing plumber (1.2.2) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.410823329 Installing progress (1.2.3) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.418086232 Installing tibble (3.2.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.425339004 Installing bundle (0.1.1) ... 
##  OK (symlinked cache)
## Installing prodlim (2024.06.25) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.439431370 Installing butcher (0.3.4) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.446698442 Installing cereal (0.1.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.453718164 Installing dplyr (1.1.4) ... 
##  OK (symlinked cache)
## Installing ggplot2 (3.5.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.467966860 Installing hardhat (1.4.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.475304748 Installing modelenv (0.1.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.482369106 Installing pins (1.3.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.489665558 Installing vroom (1.6.5) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.496852802 Installing ipred (0.9-15) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.503995631 Installing tidyr (1.3.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.510966066 Installing readr (2.1.5) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.518065092 Installing parsnip (1.2.1) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.525731966 Installing recipes (1.1.0) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.532631838 Installing vetiver (0.2.5) ... 
##  OK (symlinked cache)
## 2025/01/24 16:55:50.539688815 Installing workflows (1.1.4) ... 
##  OK (symlinked cache)
## Completed packrat build using Kubernetes::654654567442.dkr.ecr.us-east-2.amazonaws.com/ptd-adhoc-pct:content-r4.4.1-py3.10.14-quarto1.4.557 against R version: '4.4.1'
## Stopped session pings to http://service-4841e2c0-1d8e-4a60-9013-8ee09edacc3f.posit-team:50734
## Launching Plumber API...
# Close the database connection opened at the start of the notebook.
DBI::dbDisconnect(con)