Application 4: Distribution Plots

Author

Erik Westlund

Published

June 12, 2025

# List of required packages
required_packages <- c(
  "dplyr",
  "ggplot2",
  "ggridges",
  "kableExtra",
  "purrr"
)

# Install missing packages
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load all packages
for (package in required_packages) {
  library(package, character.only = TRUE)
}

Distribution

The distribution or spread of data is often of comparable importance to measures of central tendency. There are numerous visualizations that can be used to understand distributions. Below we will explore box plots, violin plots, and ridgeline plots.

Simulate Physical Activity Data

Let’s simulate data on weekly minutes of moderate-to-vigorous physical activity (MVPA) across different job types in New England states.

set.seed(123)

# Number of observations
n <- 5000

# Function to generate data with outliers
generate_with_outliers <- function(n, mean, sd, outlier_prob = 0.1, outlier_sd = 3) {
  # Generate main distribution
  main_data <- rnorm(n, mean, sd)
  
  # Add outliers
  is_outlier <- runif(n) < outlier_prob
  outliers <- rnorm(n, mean, sd * outlier_sd)
  
  # Combine
  ifelse(is_outlier, outliers, main_data)
}

# Function to generate bimodal distribution
generate_bimodal <- function(n, mean1, mean2, sd1, sd2, prob1 = 0.5) {
  # Generate two normal distributions
  dist1 <- rnorm(n, mean1, sd1)
  dist2 <- rnorm(n, mean2, sd2)
  
  # Randomly choose between the two distributions
  is_dist1 <- runif(n) < prob1
  
  # Combine
  ifelse(is_dist1, dist1, dist2)
}

# Simulate data
activity_data <- tibble(
  # Job types
  job_type = sample(c("Office Work", "Service Industry", "Healthcare", "Construction", "Education", "Professional Athlete"), 
                   size = n, 
                   replace = TRUE,
                   prob = c(0.25, 0.25, 0.2, 0.15, 0.1, 0.05))
) |>
  # Generate MVPA minutes based on job type
  mutate(
    # Base MVPA with job-specific variance and outliers
    mvpa = case_when(
      job_type == "Office Work" ~ rnorm(n, mean = 80, sd = 30),
      job_type == "Service Industry" ~ generate_with_outliers(n, mean = 130, sd = 35, outlier_prob = 0.15),
      job_type == "Healthcare" ~ generate_bimodal(n, 
                                                 mean1 = 120, sd1 = 30,  # First mode (more sedentary)
                                                 mean2 = 280, sd2 = 40,  # Second mode (more active)
                                                 prob1 = 0.6),           # 60% in first mode
      job_type == "Education" ~ rnorm(n, mean = 160, sd = 40),
      job_type == "Construction" ~ rnorm(n, mean = 250, sd = 20),
      job_type == "Professional Athlete" ~ rnorm(n, mean = 450, sd = 30)
    ),
    # Ensure non-negative values
    mvpa = pmax(0, mvpa)
  )

# Show first few rows
activity_data |> head() |> kable()
job_type mvpa
Office Work 59.63577
Construction 274.26397
Office Work 58.86456
Education 177.27274
Education 194.10753
Service Industry 86.05837

Box Plots

Box plots are great for showing the distribution of a continuous variable across different groups. They show: - Median (middle line) - Interquartile range (box) - Range (whiskers) - Outliers (points)

# Create box plots by job type
p_box <- ggplot(activity_data, aes(x = job_type, y = mvpa, fill = job_type)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_viridis_d(option = "plasma") +
  labs(
    title = "Weekly Minutes of Moderate-to-Vigorous Physical Activity by Job Type",
    x = "Job Type",
    y = "Minutes per Week",
    fill = "Job Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5, margin = margin(b = 20)),
    plot.title.position = "plot",
    axis.title = element_text(size = 10, face = "bold"),
    axis.text = element_text(size = 8),
    legend.position = "bottom",
    legend.title = element_text(size = 8, face = "bold"),
    legend.text = element_text(size = 8)
  )

p_box

Violin Plots

Violin plots show the full distribution of the data, not just the summary statistics. They’re particularly useful for: - Seeing the shape of the distribution - Identifying multiple modes - Comparing distributions across groups

# Create violin plots by job type
p_violin <- ggplot(activity_data, aes(x = job_type, y = mvpa, fill = job_type)) +
  geom_violin(alpha = 0.7) +
  scale_fill_viridis_d(option = "plasma") +
  labs(
    title = "Weekly Minutes of Moderate-to-Vigorous Physical Activity by Job Type",
    x = "Job Type",
    y = "Minutes per Week",
    fill = "Job Type"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5, margin = margin(b = 20)),
    plot.title.position = "plot",
    axis.title = element_text(size = 10, face = "bold"),
    axis.text = element_text(size = 8),
    legend.position = "bottom",
    legend.title = element_text(size = 8, face = "bold"),
    legend.text = element_text(size = 8)
  )

p_violin

Violin-Box Plot

# Create violin-box plot
p_violin_box <- ggplot(activity_data, aes(x = mvpa, y = reorder(job_type, mvpa, FUN = median), fill = job_type)) +
  # Add violin plot
  geom_violin(
    alpha = 0.7,
    scale = "width"
  ) +
  # Add boxplot
  geom_boxplot(
    width = 0.2,
    alpha = 0.7,
    fill = "white",
    outlier.shape = NA
  ) +
  # Customize scales
  scale_x_continuous(
    name = "Weekly MVPA Minutes",
    limits = c(0, 600),
    breaks = seq(0, 600, 100)
  ) +
  scale_fill_viridis_d(option = "magma") +
  # Add labels
  labs(
    title = "Distribution of Weekly Physical Activity by Job Type",
    caption = "Box shows median and quartiles, violin shows full distribution",
    y = "Job Type"
  ) +
  # Customize theme
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5, margin = margin(b = 10)),
    plot.caption = element_text(size = 10, hjust = 0.5, margin = margin(t = 20)),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title = element_text(size = 10, face = "bold"),
    axis.text = element_text(size = 8),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "none"
  )

p_violin_box

Ridgeline Plots

Ridgeline plots are great for comparing distributions across many groups. They’re particularly useful when: - You have many groups to compare - You want to see the full distribution for each group - You want to identify patterns across groups

# Create ridgeline plot
p_ridge <- ggplot(activity_data, aes(x = mvpa, y = reorder(job_type, mvpa, FUN = median), fill = after_stat(x))) +
  # Add density ridges
  geom_density_ridges_gradient(
    scale = 3,
    alpha = 0.7,
    quantile_lines = TRUE,
    quantiles = 2,
    show.legend = TRUE
  ) +
  # Customize scales
  scale_x_continuous(
    name = "Weekly MVPA Minutes",
    limits = c(0, 600),
    breaks = seq(0, 600, 100)
  ) +
  scale_fill_viridis_c(option = "magma", name = "Minutes") +
  # Add labels
  labs(
    title = "Distribution of Weekly Physical Activity by Job Type",
    caption = "Vertical lines show median minutes of activity for each job type",
    y = "Job Type"
  ) +
  # Customize theme
  theme_minimal() +
  theme(
    plot.title = element_text(size = 12, face = "bold", hjust = 0.5, margin = margin(b = 10)),
    plot.caption = element_text(size = 10, hjust = 0.5, margin = margin(t = 20)),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title = element_text(size = 10, face = "bold"),
    axis.text = element_text(size = 8),
    panel.grid.major.y = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = "right",
    legend.title = element_text(size = 8, face = "bold"),
    legend.text = element_text(size = 8)
  )

p_ridge