# List of required packages
<- c(
required_packages "readr",
"dplyr",
"ggplot2",
"forcats",
"tidyr",
"kableExtra",
"stringr",
"ggrepel",
"maps",
"usmap",
"sf"
)
# Install missing packages
<- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
new_packages if(length(new_packages)) install.packages(new_packages)
# Load all packages
for (package in required_packages) {
library(package, character.only = TRUE)
}source(here::here("examples", "colors.R"))
Application 2: Choropleths
Load and Prepare
As usual, we start by loading and preparing the data.
<- readr::read_csv(here::here("data", "processed", "simulated_data.csv"))
data <- readr::read_csv(here::here("data", "processed", "state_data.csv"))
state_data
<- data |>
data mutate(
state = as.factor(state),
received_comprehensive_postnatal_care = as.numeric(received_comprehensive_postnatal_care),
insurance = fct_relevel(insurance, "no_insurance"),
race_ethnicity = fct_relevel(race_ethnicity, "white"),
edu = fct_relevel(edu, "hs"),
job_type = fct_relevel(job_type, "unemployed"),
)
$self_report_income <- factor(data$self_report_income, levels = c(
data"$0–$24,999",
"$25,000–$49,999",
"$50,000–$74,999",
"$75,000–$99,999",
"$100,000–$124,999",
"$125,000–$149,999",
"$150,000–$174,999",
"$175,000+"
))
# Set "$50,000–$75,000" as the reference category
$self_report_income <- fct_relevel(data$self_report_income, "$50,000–$75,000") data
Warning: 1 unknown level in `f`: $50,000–$75,000
<- c("American Indian or Alaska Native" = "aian",
race_ethnicity_labels "White" = "white",
"Black" = "black",
"Asian" = "asian",
"Hispanic" = "hispanic",
"Native Hawaiian or Pacific Islander" = "nhpi",
"Other" = "other")
|> head() |> kable() data
id | provider_id | state | received_comprehensive_postnatal_care | self_report_income | age | edu | race_ethnicity | insurance | job_type | dependents | distance_to_provider | obesity | multiple_gestation | diabetes | heart_disease | placenta_previa | hypertension | gest_hypertension | preeclampsia |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 1 | AK | 0 | $75,000–$99,999 | 34 | some_college | aian | no_insurance | unskilled | 4 | 35.146163 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
2 | 1 | AK | 1 | $75,000–$99,999 | 23 | some_college | white | private | trade | 2 | 11.901259 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
3 | 1 | AK | 0 | $25,000–$49,999 | 26 | some_college | white | no_insurance | unskilled | 3 | 19.356086 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 1 | AK | 0 | $25,000–$49,999 | 20 | hs | white | no_insurance | unskilled | 4 | 6.217129 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 1 | AK | 0 | $50,000–$74,999 | 25 | post_grad | white | no_insurance | professional | 3 | 29.182420 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 1 | AK | 0 | $100,000–$124,999 | 25 | less_than_hs | white | state_provided | unemployed | 1 | 0.015886 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
Basic Visualizations
Provider Trust
From our causal model, we theorize three causes of provider trust:
- Provider quality
- Race/ethnicity
- Cultural orientation (namely, trust in institutions)
We observe race, but we do not observe provider quality or cultural orientation.
Provider quality
For provider quality, we hypothesize the political and economic conditions with respect to healthcare will influence provider quality. We do not observe provider quality directly, but we do observe what state the provider is in.
Given all this, it may help us to see:
- A breakdown of race/ethnicity
- A breakdown of receipt of comprehensive care by race/ethnicity
- A breakdown of receipt of comprehensive care by state
- A breakdown of receipt of comprehensive care by race by state
Race/ethnicity
For categorical variables, we typically first want to see a bar chart of the distribution of the variable.
# Bar chart
ggplot(data, aes(race_ethnicity)) +
geom_bar() +
theme_minimal()
ggplot(
|>
data count(race_ethnicity) |>
mutate(
proportion = n / sum(n),
race_ethnicity = fct_recode(
factor(race_ethnicity),
!!! race_ethnicity_labels
)|>
) arrange(desc(proportion)) |>
mutate(race_ethnicity = fct_inorder(race_ethnicity) |> fct_rev())
,aes(x = proportion, y = race_ethnicity)
+
) geom_bar(stat = "identity",
fill = colors$blue$`100`,
color = colors$blue$`400`) +
scale_x_continuous(
labels = scales::percent_format(accuracy = 1),
expand = expansion(mult = c(0, 0.2)) # Add 20% padding on the right
+
) labs(x = "", y = "", title = "Patients by Race/Ethnicity", caption = "Source: Simulated Data") +
theme_minimal() +
theme(
axis.text = element_text(size = 12),
axis.title = element_text(size = 14, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold")
)
# RCP vs. Race/Ethnicity
<- data |>
plot_data group_by(race_ethnicity) |>
summarize(
proportion = mean(received_comprehensive_postnatal_care),
n = n()
|>
) ungroup() |>
mutate(
race_ethnicity = fct_recode(factor(race_ethnicity), !!!race_ethnicity_labels),
race_ethnicity = fct_reorder(race_ethnicity, proportion)
)
# Plot proportions by race
ggplot(plot_data, aes(x = proportion, y = race_ethnicity)) +
geom_bar(stat = "identity", fill = colors$blue$`100`, color = colors$blue$`400`) +
scale_x_continuous(labels = scales::percent_format(accuracy = 1)) +
labs(
x = "",
y = "",
title = "Receipt of Comprehensive Postnatal Care by Race/Ethnicity",
caption = "Source: Simulated Data"
+
) theme_minimal() +
theme(
axis.text = element_text(size = 12),
axis.title = element_text(size = 14, face = "bold"),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold")
)
Choropleth
We will now map the receipt of comprehensive postnatal care by state. This is a choropleth map.
Please note that you must think hard about whether it makes sense to map the data you are mapping. For example, if you are mapping the number of people who have received comprehensive postnatal care, you must think about whether it makes sense to compare states with different populations. In our case, we know that every state is evenly represented and every person has the same probability of being in the sample. However, these assumptions rarely hold with true to life research designs.
<- us_map(regions = "states")
us_map
<- data |>
rcp_by_state_data group_by(state) |>
summarize(
proportion = mean(received_comprehensive_postnatal_care),
n = n()
|>
) ungroup()
# Merge summarized data with map data
<- us_map |>
rcp_by_state_data_mappable left_join(
rcp_by_state_data, by = c("abbr"="state")
)
ggplot(data = rcp_by_state_data_mappable) +
geom_sf(aes(fill = proportion), color = "white") +
scale_fill_continuous(
low = colors$blue$`50`,
high = colors$blue$`900`,
name = "Proportion",
labels = scales::percent,
+
) labs(
title = "Proportion Receiving Comprehensive Postnatal Care By State",
caption = "Source: Simulated Data"
+
) theme_minimal() +
theme(
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank(),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold")
)
Small Multiples
Even choropleths benefit from faceting by small multiples.
# Will it work? Maps by state
<- data |>
race_ethnicity_care_data group_by(state, race_ethnicity) |>
summarize(
proportion = mean(received_comprehensive_postnatal_care, na.rm = TRUE),
n = n(),
.groups = "drop"
|>
) ungroup() |>
mutate(
race_ethnicity = fct_recode(race_ethnicity, !!!race_ethnicity_labels)
)
<- us_map |>
race_ethnicity_care_sf left_join(
race_ethnicity_care_data,by = c("abbr" = "state")
)
ggplot(race_ethnicity_care_sf) +
geom_sf(aes(geometry = geom, fill = proportion), color = "white", size = 0.1) +
scale_fill_continuous(
low = colors$blue$`100`,
high = colors$blue$`900`,
name = "Proportion",
labels = scales::percent,
na.value = "grey90"
+
) facet_wrap(~ race_ethnicity, ncol = 3, nrow = 3) +
labs(
title = "Proportion Receiving Comprehensive Postnatal Care by Race",
fill = "Proportion",
caption = "Source: Simulated Data"
+
) theme_minimal() +
theme(
panel.grid = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold"),
strip.text = element_text(size = 10)
)
Choropleths can be useful, but they can be misleading. In fact, they often end up encoding things like population density as much as the spatial variation we are trying to visualize.