Application 2: Choropleths

Author

Erik Westlund

Published

June 12, 2025

# List of required packages
required_packages <- c(
  "readr",
  "dplyr",
  "ggplot2",
  "forcats",
  "tidyr",
  "kableExtra",
  "stringr",
  "ggrepel",
  "maps",
  "usmap",
  "sf"
)

# Install missing packages
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load all packages
for (package in required_packages) {
  library(package, character.only = TRUE)
}
source(here::here("examples", "colors.R"))

Load and Prepare

As usual, we start by loading and preparing the data.

data <- readr::read_csv(here::here("data", "processed", "simulated_data.csv"))
state_data <- readr::read_csv(here::here("data", "processed", "state_data.csv"))

data <- data |>
  mutate(
    state = as.factor(state),
    received_comprehensive_postnatal_care = as.numeric(received_comprehensive_postnatal_care),
    insurance = fct_relevel(insurance, "no_insurance"),
    race_ethnicity = fct_relevel(race_ethnicity, "white"),
    edu = fct_relevel(edu, "hs"),
    job_type = fct_relevel(job_type, "unemployed"),
  )

data$self_report_income <- factor(data$self_report_income, levels = c(
  "$0–$24,999",
  "$25,000–$49,999",
  "$50,000–$74,999",
  "$75,000–$99,999",
  "$100,000–$124,999",
  "$125,000–$149,999",
  "$150,000–$174,999",
  "$175,000+"
))

# Set "$50,000–$75,000" as the reference category
data$self_report_income <- fct_relevel(data$self_report_income, "$50,000–$75,000")

Warning: 1 unknown level in `f`: $50,000–$75,000

race_ethnicity_labels <- c("American Indian or Alaska Native" = "aian",
          "White" = "white",
          "Black" = "black",
          "Asian" = "asian",
          "Hispanic" = "hispanic",
          "Native Hawaiian or Pacific Islander" = "nhpi",
          "Other" = "other")

data |> head() |> kable()

id	provider_id	state	received_comprehensive_postnatal_care	self_report_income	age	edu	race_ethnicity	insurance	job_type	dependents	distance_to_provider	obesity	heart_disease	gest_hypertension
1	1	AK	0	$75,000–$99,999	34	some_college	aian	no_insurance	unskilled	4	35.146163	0	1	0
2	1	AK	1	$75,000–$99,999	23	some_college	white	private	trade	2	11.901259	0	0	1
3	1	AK	0	$25,000–$49,999	26	some_college	white	no_insurance	unskilled	3	19.356086	1	0	0
4	1	AK	0	$25,000–$49,999	20	hs	white	no_insurance	unskilled	4	6.217129	1	0	0
5	1	AK	0	$50,000–$74,999	25	post_grad	white	no_insurance	professional	3	29.182420	0	0	0
6	1	AK	0	$100,000–$124,999	25	less_than_hs	white	state_provided	unemployed	1	0.015886	0	0	0

Basic Visualizations

Provider Trust

From our causal model, we theorize three causes of provider trust:

Provider quality
Race/ethnicity
Cultural orientation (namely, trust in institutions)

We observe race, but we do not observe provider quality or cultural orientation.

Provider quality

For provider quality, we hypothesize the political and economic conditions with respect to healthcare will influence provider quality. We do not observe provider quality directly, but we do observe what state the provider is in.

Given all this, it may help us to see:

A breakdown of race/ethnicity
A breakdown of receipt of comprehensive care by race/ethnicity
A breakdown of receipt of comprehensive care by state
A breakdown of receipt of comprehensive care by race by state

Race/ethnicity

For categorical variables, we typically first want to see a bar chart of the distribution of the variable.

# Bar chart
ggplot(data, aes(race_ethnicity)) +
  geom_bar() + 
  theme_minimal()

  ggplot(
    data |>
      count(race_ethnicity) |>
      mutate(
        proportion = n / sum(n),
        race_ethnicity = fct_recode(
          factor(race_ethnicity),
          !!! race_ethnicity_labels
        )
      ) |>
      arrange(desc(proportion)) |>
      mutate(race_ethnicity = fct_inorder(race_ethnicity) |> fct_rev())
    ,
    aes(x = proportion, y = race_ethnicity)
  ) +
  geom_bar(stat = "identity",
           fill = colors$blue$`100`,
           color = colors$blue$`400`) +
  scale_x_continuous(
    labels = scales::percent_format(accuracy = 1),
    expand = expansion(mult = c(0, 0.2))  # Add 20% padding on the right
  ) +
  labs(x = "", y = "", title = "Patients by Race/Ethnicity", caption = "Source: Simulated Data") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(hjust = 0.5, size = 12, face = "bold")
  )

# RCP vs. Race/Ethnicity
plot_data <- data |>
  group_by(race_ethnicity) |>
  summarize(
    proportion = mean(received_comprehensive_postnatal_care),
    n = n()
  ) |>
  ungroup() |>
  mutate(
    race_ethnicity = fct_recode(factor(race_ethnicity), !!!race_ethnicity_labels),
    race_ethnicity = fct_reorder(race_ethnicity, proportion)
  )

# Plot proportions by race
ggplot(plot_data, aes(x = proportion, y = race_ethnicity)) +
  geom_bar(stat = "identity", fill = colors$blue$`100`, color = colors$blue$`400`) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    x = "",
    y = "",
    title = "Receipt of Comprehensive Postnatal Care by Race/Ethnicity",
    caption = "Source: Simulated Data"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12),
    axis.title = element_text(size = 14, face = "bold"),
    plot.title = element_text(hjust = 0.5, size = 12, face = "bold")
  )

Choropleth

We will now map the receipt of comprehensive postnatal care by state. This is a choropleth map.

Please note that you must think hard about whether it makes sense to map the data you are mapping. For example, if you are mapping the number of people who have received comprehensive postnatal care, you must think about whether it makes sense to compare states with different populations. In our case, we know that every state is evenly represented and every person has the same probability of being in the sample. However, these assumptions rarely hold with true to life research designs.

us_map <- us_map(regions = "states")

rcp_by_state_data <- data |>
  group_by(state) |>
  summarize(
    proportion = mean(received_comprehensive_postnatal_care),
    n = n()
  ) |>
  ungroup()

# Merge summarized data with map data
rcp_by_state_data_mappable <- us_map |>
  left_join(
    rcp_by_state_data, 
    by = c("abbr"="state")
  )

ggplot(data = rcp_by_state_data_mappable) +
  geom_sf(aes(fill = proportion), color = "white") +
  scale_fill_continuous(
    low = colors$blue$`50`,
    high = colors$blue$`900`,
    name = "Proportion",
    labels = scales::percent,
  ) +
  labs(
    title = "Proportion Receiving Comprehensive Postnatal Care By State",
    caption = "Source: Simulated Data"
  ) +
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 12, face = "bold")
  )

Small Multiples

Even choropleths benefit from faceting by small multiples.

# Will it work? Maps  by state
race_ethnicity_care_data <- data |>
  group_by(state, race_ethnicity) |>
  summarize(
    proportion = mean(received_comprehensive_postnatal_care, na.rm = TRUE),
    n = n(),
        .groups = "drop"
  ) |>
  ungroup() |>
  mutate(
    race_ethnicity = fct_recode(race_ethnicity, !!!race_ethnicity_labels)
  )

race_ethnicity_care_sf <- us_map |>
  left_join(
    race_ethnicity_care_data,
    by = c("abbr" = "state")
  )

ggplot(race_ethnicity_care_sf) +
  geom_sf(aes(geometry = geom, fill = proportion), color = "white", size = 0.1) +
  scale_fill_continuous(
    low = colors$blue$`100`,
    high = colors$blue$`900`,
    name = "Proportion",
    labels = scales::percent,
    na.value = "grey90"
  ) +
  facet_wrap(~ race_ethnicity, ncol = 3, nrow = 3) +
  labs(
    title = "Proportion Receiving Comprehensive Postnatal Care by Race",
    fill = "Proportion",
    caption = "Source: Simulated Data"
  ) +
  theme_minimal() +
  theme(
    panel.grid = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 12, face = "bold"),
    strip.text = element_text(size = 10)
  )

Choropleths can be useful, but they can be misleading. In fact, they often end up encoding things like population density as much as the spatial variation we are trying to visualize.