ETL Step 1 - Raw Data Refresh

Published

January 25, 2025

This notebook uses the bikeHelpR package to update a database with the latest bike share data. The package pulls data from https://capitalbikeshare.com which provides an API to access bike share data. The raw data is written to the Content DB database to the bike_raw_data table.

Get data from API

Use the the bikeHelpR package to get the latest data from https://capitalbikeshare.com.

Station status

feeds_station_status <- 
  bikeHelpR::feeds_urls() %>% 
  filter(name == "station_status") %>% 
  pull(url) %>% 
  bikeHelpR::get_data() 

station_status <- 
  feeds_station_status %>%
  magrittr::extract2("data") %>%
  dplyr::mutate(time = feeds_station_status$last_updated) %>%
  dplyr::select(
    is_installed, 
    num_bikes_available, 
    last_reported, 
    is_renting, 
    eightd_has_available_keys, 
    num_docks_available, 
    num_docks_disabled, 
    is_returning, 
    station_id, num_ebikes_available, 
    num_bikes_disabled, 
    time
  )

glimpse(station_status)
Rows: 783
Columns: 12
$ is_installed              <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ num_bikes_available       <int> 10, 4, 11, 7, 3, 7, 9, 7, 3, 7, 6, 3, 0, 7, …
$ last_reported             <int> 1737766215, 1737766216, 1737766216, 17377662…
$ is_renting                <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ eightd_has_available_keys <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
$ num_docks_available       <int> 2, 8, 0, 8, 12, 8, 9, 4, 16, 10, 8, 12, 19, …
$ num_docks_disabled        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ is_returning              <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ station_id                <chr> "3d8b6392-9572-4c65-b71c-fd8f03af9ab2", "8d6…
$ num_ebikes_available      <int> 0, 0, 2, 4, 0, 3, 2, 0, 0, 1, 0, 0, 0, 5, 13…
$ num_bikes_disabled        <int> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,…
$ time                      <dttm> 2025-01-25 00:52:29, 2025-01-25 00:52:29, 2…

Station info

# The station information endpoint.
station_information_url <- 
  bikeHelpR::feeds_urls() %>% 
  filter(name == "station_information") %>% 
  pull(url)

# Call the endpoint to obtain the JSON data.
request <- httr2::request(station_information_url)
response <- httr2::req_perform(request)
json_data <- httr2::resp_body_json(response)

# Convert the JSON data into a tibble.
station_info <- 
  json_data$data %>%
  as_tibble() %>%
  tidyr::unnest_wider(stations) %>%
  select(station_id, name, lat, lon) %>%
  distinct() %>%
  mutate(
    last_updated = as.POSIXct(
      json_data$last_updated,
      origin = "1970-01-01 00:00:00 UTC"
    )
  )

glimpse(station_info)
Rows: 783
Columns: 5
$ station_id   <chr> "6d5ad96d-a704-4fa6-8b65-3ac643c5aa93", "1890204129337198…
$ name         <chr> "Lincoln Rd & Seaton Pl NE/Harry Thomas Rec Center", "Dor…
$ lat          <dbl> 38.91500, 38.87926, 38.92570, 38.86911, 38.92926, 38.8849…
$ lon          <dbl> -77.00780, -77.23358, -77.07912, -77.27154, -77.24065, -7…
$ last_updated <dttm> 2025-01-25 00:53:29, 2025-01-25 00:53:29, 2025-01-25 00:…

Update database

Write the new data from the API to the database.

con <- dbConnect(
  odbc::odbc(),
  Driver      = "postgresql",
  Server      = Sys.getenv("DB_SERVER"),
  Port        = "5432",
  Database    = "soleng",
  UID         = Sys.getenv("DB_USER"),
  PWD         = Sys.getenv("DB_PASSWORD"),
  BoolsAsChar = "",
  timeout     = 10
)
dbWriteTable(con, 
             name = DBI::Id(schema="content", name="bike_raw_data"), 
             station_status, append = TRUE)
dbWriteTable(con, 
             name = DBI::Id(schema="content", name="bike_station_info"), 
             station_info, overwrite = TRUE)
dbDisconnect(con)
print("Raw data updated 🎉")
[1] "Raw data updated 🎉"

Update the pin

Station info will also be written to a pin. This pin will be accessed by the shiny app so that it can easily get the bike station info without connecting to the database.

board <- pins::board_connect()

# Write the model to the board.
pins::pin_write(
  board,
  x = station_info,
  type = "csv",
  name = "bike-predict-r-station-info-pin",
  title = "Bikeshare Prediction - ETL output - Pinned Station Info",
  description = "Bike station info from https://capitalbikeshare.com."
)

# prune to save only the last 10 versions
pins::pin_versions_prune(board, "bike-predict-r-station-info-pin", 10)

print("Pin 🎉")
[1] "Pin 🎉"