ETL Step 2 - Tidy data

Published

December 7, 2024

This notebook tidies the raw raw_data_table from the Content DB database. The tidy data is written back to the Content DB database to the bike_model_data table.

Get data from database

Connect to Content DB to get the bike_raw_data and bike_station_info table.

con <- DBI::dbConnect(
  odbc::odbc(),
  Driver      = "postgresql",
  Server      = Sys.getenv("DB_SERVER"),
  Port        = "5432",
  Database    = "soleng",
  UID         = Sys.getenv("DB_USER"),
  PWD         = Sys.getenv("DB_PASSWORD"),
  BoolsAsChar = "",
  timeout     = 10
)

bike_raw_data <- tbl(con, 
                     DBI::Id(schema="content", name="bike_raw_data"))
glimpse(bike_raw_data)
Rows: ??
Columns: 12
Database: postgres  [soleng@localhost:/soleng]
$ station_id                <chr> "53", "72", "91", "106", "349", "1", "2", "3…
$ num_bikes_available       <dbl> 12, 9, 4, 15, 22, 13, 10, 5, 11, 6, 16, 9, 5…
$ num_ebikes_available      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ num_bikes_disabled        <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,…
$ num_docks_available       <dbl> 6, 10, 18, 20, 9, 2, 1, 12, 0, 5, 3, 5, 14, …
$ num_docks_disabled        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ is_installed              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ is_renting                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ is_returning              <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ last_reported             <dbl> 1568996717, 1568996326, 1568996591, 15689965…
$ eightd_has_available_keys <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FA…
$ time                      <dttm> 2019-09-20 16:25:54, 2019-09-20 16:25:54, 2…
bike_station_info <- tbl(con, DBI::Id(schema="content", name="bike_station_info"))
glimpse(bike_station_info)
Rows: ??
Columns: 5
Database: postgres  [soleng@localhost:/soleng]
$ station_id   <chr> "6edca550-d78f-4c5d-ad2c-79d1ce88c48d", "84a0159b-5f00-41…
$ name         <chr> "White Oak Transit Center", "1st & Q St SW", "Shady Grove…
$ lat          <dbl> 39.04104, 38.87082, 39.11977, 38.79306, 38.88332, 38.9586…
$ lon          <dbl> -76.98713, -77.01192, -77.16609, -77.04944, -76.92531, -7…
$ last_updated <dttm> 2024-12-07 00:52:55, 2024-12-07 00:52:55, 2024-12-07 00:…

Tidy the data

Apply the data tidying steps. All of the tidying steps are performed in SQL. The results are written back to Content DB to the bike_model_data table.

if (odbc::dbExistsTable(con, DBI::Id(schema="content", name="bike_model_data"))) {
  odbc::dbRemoveTable(con, DBI::Id(schema="content", name="bike_model_data"))
}

# Build a SQL query to tidy the data.
query <- bike_raw_data %>% 
    group_by(
        id = station_id, 
        hour = hour(time), 
        date = date(time), 
        month = month(time), 
        dow = TRIM(to_char(time, "Day"))
    ) %>%
    summarize(
        n_bikes = mean(num_bikes_available, na.rm = TRUE),
        .groups = "drop"
    ) %>%
    inner_join(
        select(bike_station_info, id = station_id, lat, lon)
    ) %>%
    dbplyr::sql_render() %>%
    stringr::str_replace("SELECT", "CREATE TABLE content.bike_model_data AS SELECT")

# Execute the SQL query.
odbc::dbSendQuery(con, query)
<OdbcResult>
  SQL  CREATE TABLE content.bike_model_data AS SELECT "LHS".*, "lat", "lon"
FROM (
  SELECT
    "id",
    "hour",
    "date",
    "month",
    "dow",
    AVG("num_bikes_available") AS "n_bikes"
  FROM (
    SELECT
      "bike_raw_data".*,
      "station_id" AS "id",
      EXTRACT(hour FROM "time") AS "hour",
      date("time") AS "date",
      EXTRACT(MONTH FROM "time") AS "month",
      TRIM(to_char("time", 'Day')) AS "dow"
    FROM "content"."bike_raw_data"
  ) AS "q01"
  GROUP BY "id", "hour", "date", "month", "dow"
) AS "LHS"
INNER JOIN "content"."bike_station_info"
  ON ("LHS"."id" = "bike_station_info"."station_id")
  ROWS Fetched: 0 [complete]
       Changed: 3899060
# Preview the table.
tbl(con, DBI::Id(schema="content", name="bike_model_data"))
# Source:   table<"content"."bike_model_data"> [?? x 8]
# Database: postgres  [soleng@localhost:/soleng]
   id                            hour date       month dow   n_bikes   lat   lon
   <chr>                        <dbl> <date>     <dbl> <chr>   <dbl> <dbl> <dbl>
 1 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-22     3 Wedn…       2  38.9 -76.9
 2 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-23     3 Thur…       2  38.9 -76.9
 3 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-24     3 Frid…       2  38.9 -76.9
 4 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-25     3 Satu…       2  38.9 -76.9
 5 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-26     3 Sund…       2  38.9 -76.9
 6 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-27     3 Mond…       2  38.9 -76.9
 7 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-28     3 Tues…       2  38.9 -76.9
 8 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-29     3 Wedn…       2  38.9 -76.9
 9 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-30     3 Thur…       2  38.9 -76.9
10 0099b016-32c9-4536-ac4c-dcc…     0 2023-03-31     3 Frid…       2  38.9 -76.9
# ℹ more rows
odbc::dbDisconnect(con)
print("Complete 🎉")
[1] "Complete 🎉"