# List of required packages
<- c(
required_packages "dplyr",
"ggplot2",
"ggridges",
"kableExtra",
"purrr"
)
# Install missing packages
<- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
new_packages if(length(new_packages)) install.packages(new_packages)
# Load all packages
for (package in required_packages) {
library(package, character.only = TRUE)
}
Application 4: Distribution Plots
Distribution
The distribution or spread of data is often of comparable importance to measures of central tendency. There are numerous visualizations that can be used to understand distributions. Below we will explore box plots, violin plots, and ridgeline plots.
Simulate Physical Activity Data
Let’s simulate data on weekly minutes of moderate-to-vigorous physical activity (MVPA) across different job types in New England states.
set.seed(123)
# Number of observations
<- 5000
n
# Function to generate data with outliers
<- function(n, mean, sd, outlier_prob = 0.1, outlier_sd = 3) {
generate_with_outliers # Generate main distribution
<- rnorm(n, mean, sd)
main_data
# Add outliers
<- runif(n) < outlier_prob
is_outlier <- rnorm(n, mean, sd * outlier_sd)
outliers
# Combine
ifelse(is_outlier, outliers, main_data)
}
# Function to generate bimodal distribution
<- function(n, mean1, mean2, sd1, sd2, prob1 = 0.5) {
generate_bimodal # Generate two normal distributions
<- rnorm(n, mean1, sd1)
dist1 <- rnorm(n, mean2, sd2)
dist2
# Randomly choose between the two distributions
<- runif(n) < prob1
is_dist1
# Combine
ifelse(is_dist1, dist1, dist2)
}
# Simulate data
<- tibble(
activity_data # Job types
job_type = sample(c("Office Work", "Service Industry", "Healthcare", "Construction", "Education", "Professional Athlete"),
size = n,
replace = TRUE,
prob = c(0.25, 0.25, 0.2, 0.15, 0.1, 0.05))
|>
) # Generate MVPA minutes based on job type
mutate(
# Base MVPA with job-specific variance and outliers
mvpa = case_when(
== "Office Work" ~ rnorm(n, mean = 80, sd = 30),
job_type == "Service Industry" ~ generate_with_outliers(n, mean = 130, sd = 35, outlier_prob = 0.15),
job_type == "Healthcare" ~ generate_bimodal(n,
job_type mean1 = 120, sd1 = 30, # First mode (more sedentary)
mean2 = 280, sd2 = 40, # Second mode (more active)
prob1 = 0.6), # 60% in first mode
== "Education" ~ rnorm(n, mean = 160, sd = 40),
job_type == "Construction" ~ rnorm(n, mean = 250, sd = 20),
job_type == "Professional Athlete" ~ rnorm(n, mean = 450, sd = 30)
job_type
),# Ensure non-negative values
mvpa = pmax(0, mvpa)
)
# Show first few rows
|> head() |> kable() activity_data
job_type | mvpa |
---|---|
Office Work | 59.63577 |
Construction | 274.26397 |
Office Work | 58.86456 |
Education | 177.27274 |
Education | 194.10753 |
Service Industry | 86.05837 |
Box Plots
Box plots are great for showing the distribution of a continuous variable across different groups. They show: - Median (middle line) - Interquartile range (box) - Range (whiskers) - Outliers (points)
# Create box plots by job type
<- ggplot(activity_data, aes(x = job_type, y = mvpa, fill = job_type)) +
p_box geom_boxplot(alpha = 0.7) +
scale_fill_viridis_d(option = "plasma") +
labs(
title = "Weekly Minutes of Moderate-to-Vigorous Physical Activity by Job Type",
x = "Job Type",
y = "Minutes per Week",
fill = "Job Type"
+
) theme_minimal() +
theme(
plot.title = element_text(size = 12, face = "bold", hjust = 0.5, margin = margin(b = 20)),
plot.title.position = "plot",
axis.title = element_text(size = 10, face = "bold"),
axis.text = element_text(size = 8),
legend.position = "bottom",
legend.title = element_text(size = 8, face = "bold"),
legend.text = element_text(size = 8)
)
p_box
Violin Plots
Violin plots show the full distribution of the data, not just the summary statistics. They’re particularly useful for: - Seeing the shape of the distribution - Identifying multiple modes - Comparing distributions across groups
# Create violin plots by job type
<- ggplot(activity_data, aes(x = job_type, y = mvpa, fill = job_type)) +
p_violin geom_violin(alpha = 0.7) +
scale_fill_viridis_d(option = "plasma") +
labs(
title = "Weekly Minutes of Moderate-to-Vigorous Physical Activity by Job Type",
x = "Job Type",
y = "Minutes per Week",
fill = "Job Type"
+
) theme_minimal() +
theme(
plot.title = element_text(size = 12, face = "bold", hjust = 0.5, margin = margin(b = 20)),
plot.title.position = "plot",
axis.title = element_text(size = 10, face = "bold"),
axis.text = element_text(size = 8),
legend.position = "bottom",
legend.title = element_text(size = 8, face = "bold"),
legend.text = element_text(size = 8)
)
p_violin
Violin-Box Plot
# Create violin-box plot
<- ggplot(activity_data, aes(x = mvpa, y = reorder(job_type, mvpa, FUN = median), fill = job_type)) +
p_violin_box # Add violin plot
geom_violin(
alpha = 0.7,
scale = "width"
+
) # Add boxplot
geom_boxplot(
width = 0.2,
alpha = 0.7,
fill = "white",
outlier.shape = NA
+
) # Customize scales
scale_x_continuous(
name = "Weekly MVPA Minutes",
limits = c(0, 600),
breaks = seq(0, 600, 100)
+
) scale_fill_viridis_d(option = "magma") +
# Add labels
labs(
title = "Distribution of Weekly Physical Activity by Job Type",
caption = "Box shows median and quartiles, violin shows full distribution",
y = "Job Type"
+
) # Customize theme
theme_minimal() +
theme(
plot.title = element_text(size = 12, face = "bold", hjust = 0.5, margin = margin(b = 10)),
plot.caption = element_text(size = 10, hjust = 0.5, margin = margin(t = 20)),
plot.title.position = "plot",
plot.caption.position = "plot",
axis.title = element_text(size = 10, face = "bold"),
axis.text = element_text(size = 8),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none"
)
p_violin_box
Ridgeline Plots
Ridgeline plots are great for comparing distributions across many groups. They’re particularly useful when: - You have many groups to compare - You want to see the full distribution for each group - You want to identify patterns across groups
# Create ridgeline plot
<- ggplot(activity_data, aes(x = mvpa, y = reorder(job_type, mvpa, FUN = median), fill = after_stat(x))) +
p_ridge # Add density ridges
geom_density_ridges_gradient(
scale = 3,
alpha = 0.7,
quantile_lines = TRUE,
quantiles = 2,
show.legend = TRUE
+
) # Customize scales
scale_x_continuous(
name = "Weekly MVPA Minutes",
limits = c(0, 600),
breaks = seq(0, 600, 100)
+
) scale_fill_viridis_c(option = "magma", name = "Minutes") +
# Add labels
labs(
title = "Distribution of Weekly Physical Activity by Job Type",
caption = "Vertical lines show median minutes of activity for each job type",
y = "Job Type"
+
) # Customize theme
theme_minimal() +
theme(
plot.title = element_text(size = 12, face = "bold", hjust = 0.5, margin = margin(b = 10)),
plot.caption = element_text(size = 10, hjust = 0.5, margin = margin(t = 20)),
plot.title.position = "plot",
plot.caption.position = "plot",
axis.title = element_text(size = 10, face = "bold"),
axis.text = element_text(size = 8),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "right",
legend.title = element_text(size = 8, face = "bold"),
legend.text = element_text(size = 8)
)
p_ridge