# Install required packages if not already installed
<- c("dplyr", "DT", "kableExtra", "readr", "tidyr")
required_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
new_packages if(length(new_packages)) install.packages(new_packages)
# Load required packages
library(dplyr)
library(DT)
library(kableExtra)
library(readr)
library(tidyr)
Data Preparation Example
Data Source
We use CDC’s PRAMStat data for 2011.
Note
In this file, I use exposition to explain what I am doing and what I am thinking. In this context, this is an unknown data set we’re working with and I am trying to teach others.
For a project orreport, you would want to first read the data file documentation and codebook first to understand the data. Your commentary in the notebook should be targeted to the audience, conveying everything required for people to understand the steps you took and why you took them.
This data is aggregated at the location level and comes to us in “long” format.
Loading Required Libraries
We first need to load in the libraries we’ll use.
We will use:
readr
to load in the data ( I preferreadr
over the built-inread.csv
function because it is faster and has better default settings for avoiding common issues with CSVs.)dplyr
to clean data.
kableExtra
is used to make tables look nice.DT
to make tables interactive and get nice pagination on long tables.tidyr
to pivot the data to wide format.
To hide this code from the output, I typically prepend the above code with.
#| echo: false
#| message: false
Here, for illustration, I hide messages but not the code block.
Read and Explore the Data
Here we read the data using the readr::read_csv
function. Note that we use here::here
to ensure the path to the data is correct relative to the project root.
We then use the glimpse
function to take a look at the data. Glimpse tells us the number of rows and columns in the data, the names of the columns, the type of each column, and the first few rows of the data.
I create a new object df_clean
to store the cleaned data. For now, we’ll just copy the original data to this object. This is a good habit to get into, as it allows you to keep the original data for reference and to easily revert to it if needed.
# Load the data using here::here to ensure correct path resolution
<- read_csv(here::here("data", "raw", "cdc_PRAMStat_Data_for_2011_20250610.csv"))
df
# Take a look at the data structure
|> glimpse() df
Rows: 520,381
Columns: 27
$ Year <dbl> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 2…
$ LocationAbbr <chr> "AR", "AR", "CO", "CO", "DE", "DE", "GA", "…
$ LocationDesc <chr> "Arkansas", "Arkansas", "Colorado", "Colora…
$ Class <chr> "Prenatal Care", "Prenatal Care", "Prenatal…
$ Topic <chr> "Prenatal Care - Content", "Prenatal Care -…
$ Question <chr> "During any of your prenatal care visits d…
$ DataSource <chr> "PRAMS", "PRAMS", "PRAMS", "PRAMS", "PRAMS"…
$ Response <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Data_Value_Unit <chr> "%", "%", "%", "%", "%", "%", "%", "%", "%"…
$ Data_Value_Type <chr> "Percentage", "Percentage", "Percentage", "…
$ Data_Value <dbl> 52.0, 52.0, 39.0, 35.0, 22.0, 19.0, 71.0, 3…
$ Data_Value_Footnote_Symbol <chr> "1", "1", "1", "1", "1", "1", "1", "1", "1"…
$ Data_Value_Footnote <chr> "Missing includes not applicable, don't kno…
$ Data_Value_Std_Err <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Low_Confidence_Limit <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ High_Confidence_Limit <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Sample_Size <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Break_Out <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Break_Out_Category <chr> "Maternal Age - 18 to 44 years in groupings…
$ Geolocation <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ ClassId <chr> "CLA11", "CLA11", "CLA11", "CLA11", "CLA11"…
$ TopicId <chr> "TOP19", "TOP19", "TOP19", "TOP19", "TOP19"…
$ QuestionId <chr> "QUO333", "QUO67", "QUO333", "QUO67", "QUO3…
$ LocationId <dbl> 5, 5, 8, 8, 10, 10, 13, 15, 15, 25, 25, 24,…
$ BreakOutId <chr> "BOC17", "BOC17", "BOC17", "BOC17", "BOC17"…
$ BreakOutCategoryid <chr> "BOC17", "BOC17", "BOC17", "BOC17", "BOC17"…
$ ResponseId <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
<- df df_clean
Tabbing Out Data
Always read the codebook first to understand the data. Nevertheless, it’s still helpful to tab out the data to get a sense of the variables and their values.
Let’s first look at the topics availble
table(df$Topic) |> kable()
Var1 | Freq |
---|---|
Abuse - Physical | 6799 |
Alcohol Use | 17875 |
Assisted Reproduction | 1964 |
Breastfeeding | 8829 |
Contraception - Conception | 32460 |
Contraception - Postpartum | 26716 |
Delivery - Method | 999 |
Delivery - Payment | 14802 |
HIV Test | 9975 |
Hospital Length of Stay | 9564 |
Household Characteristics | 3469 |
Income | 8143 |
Infant Health Care | 6377 |
Injury Prevention | 1692 |
Insurance Coverage | 2997 |
Maternal Health Care | 2220 |
Medicaid | 11770 |
Mental Health | 5994 |
Morbidity - Infant | 5697 |
Morbidity - Maternal | 35790 |
Multivitamin Use | 12185 |
Obesity | 11517 |
Oral Health | 10543 |
Preconception Health | 21977 |
Preconception Morbidity | 7142 |
Pregnancy History | 12875 |
Pregnancy Intention | 14249 |
Pregnancy Outcome | 2928 |
Pregnancy Recognition | 2996 |
Prenatal Care Provider | 999 |
Prenatal Care - Content | 36688 |
Prenatal Care - Initiation | 8991 |
Prenatal Care - Location | 1692 |
Prenatal Care - Payment | 14846 |
Prenatal Care - Visits | 7930 |
Sleep Behaviors | 19536 |
Smoke Exposure | 2986 |
Stress | 56867 |
Tobacco Use | 56440 |
WIC | 2862 |
Suppose we’re intersted in the relationship between Alcohol Use and Mental Health.
Let’s filter the data to include only those topics, and then look at the questions available.
|>
df filter(Topic %in% c("Alcohol Use", "Mental Health")) |>
select(QuestionId, Topic, Question) |>
distinct() |>
kable()
QuestionId | Topic | Question |
---|---|---|
QUO100 | Alcohol Use | Indicator of drinking any alcohol during the past two years |
QUO271 | Alcohol Use | (*PCH) Indicator of binge drinking (4+ drinks) during 3 months before pregnancy |
QUO8 | Alcohol Use | (*PCH) Indicator of drinking alcohol during the three months before pregnancy |
QUO9 | Alcohol Use | Indicator of whether mother reported having any alcoholic drinks during the last 3 months of pregnancy |
QUO99 | Alcohol Use | Change in drinking between three months before pregnancy and last three months of pregnancy |
QUO133 | Mental Health | During the 3 months before you got pregnant with your new baby did you have depression? |
QUO219 | Mental Health | (*PCH) Indicator of whether mother reported frequent postpartum depressive symptoms (years 2009 - 2011) |
QUO232 | Mental Health | During the 3 months before you got pregnant with your new baby did you have anxiety? |
QUO97 | Mental Health | Did a doctor nurse or other health care worker talk with you about baby blues or postpartum depression during pregnancy or after your delivery? |
QUO133 | Mental Health | During the 3 months before you got pregnant with your new baby, did you have depression? |
QUO232 | Mental Health | During the 3 months before you got pregnant with your new baby, did you have anxiety? |
QUO97 | Mental Health | Did a doctor, nurse, or other health care worker talk with you about baby blues or postpartum depression during pregnancy or after your delivery? |
Filtering Data
Suppose we’re interested in the relationship between drinking in the first three months before pregnancy and reporting depression and anxiety. H
<- c(
question_ids_of_interest "QUO271", # Binge drinking in the first three months before pregnancy
"QUO8", # Any alcohol use in the first three months before pregnancy
"QUO133", # Depression reported in the first three months before pregnancy
"QUO232" # Anxiety reported in the first three months before pregnancy
)
<- df_clean |>
df_clean filter(QuestionId %in% question_ids_of_interest)
|>
df_clean head() |>
kable()
Year | LocationAbbr | LocationDesc | Class | Topic | Question | DataSource | Response | Data_Value_Unit | Data_Value_Type | Data_Value | Data_Value_Footnote_Symbol | Data_Value_Footnote | Data_Value_Std_Err | Low_Confidence_Limit | High_Confidence_Limit | Sample_Size | Break_Out | Break_Out_Category | Geolocation | ClassId | TopicId | QuestionId | LocationId | BreakOutId | BreakOutCategoryid | ResponseId |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2011 | PRAMS Total | PRAMS Total | Maternal Behavior/Health | Alcohol Use | (*PCH) Indicator of binge drinking (4+ drinks) during 3 months before pregnancy | PRAMS | NO | % | Percentage | 78.4 | NA | NA | NA | 76.7 | 79.9 | 7089 | LBW (<=2500g) | Birth Weight | NA | CLA9 | TOP2 | QUO271 | 59 | BWT1 | BOC1 | RES23 |
2011 | PRAMS Total | PRAMS Total | Maternal Behavior/Health | Alcohol Use | (*PCH) Indicator of binge drinking (4+ drinks) during 3 months before pregnancy | PRAMS | YES | % | Percentage | 21.6 | NA | NA | NA | 20.1 | 23.3 | 1832 | LBW (<=2500g) | Birth Weight | NA | CLA9 | TOP2 | QUO271 | 59 | BWT1 | BOC1 | RES40 |
2011 | PRAMS Total | PRAMS Total | Maternal Behavior/Health | Alcohol Use | (*PCH) Indicator of binge drinking (4+ drinks) during 3 months before pregnancy | PRAMS | NO | % | Percentage | 76.8 | NA | NA | NA | 76.0 | 77.5 | 20588 | NBW (>2500g) | Birth Weight | NA | CLA9 | TOP2 | QUO271 | 59 | BWT2 | BOC1 | RES23 |
2011 | PRAMS Total | PRAMS Total | Maternal Behavior/Health | Alcohol Use | (*PCH) Indicator of binge drinking (4+ drinks) during 3 months before pregnancy | PRAMS | YES | % | Percentage | 23.2 | NA | NA | NA | 22.5 | 24.0 | 6099 | NBW (>2500g) | Birth Weight | NA | CLA9 | TOP2 | QUO271 | 59 | BWT2 | BOC1 | RES40 |
2011 | PRAMS Total | PRAMS Total | Maternal Behavior/Health | Alcohol Use | (*PCH) Indicator of binge drinking (4+ drinks) during 3 months before pregnancy | PRAMS | NA | % | Percentage | 877.0 | 1 | Missing includes not applicable, don’t know, not recorded, no responses, and legitimate skips | NA | NA | NA | NA | NA | On WIC during Pregnancy | NA | CLA9 | TOP2 | QUO271 | 59 | BOC10 | BOC10 | NA |
2011 | PRAMS Total | PRAMS Total | Maternal Behavior/Health | Alcohol Use | (*PCH) Indicator of binge drinking (4+ drinks) during 3 months before pregnancy | PRAMS | NO | % | Percentage | 74.5 | NA | NA | NA | 73.5 | 75.4 | 13716 | Non-WIC | On WIC during Pregnancy | NA | CLA9 | TOP2 | QUO271 | 59 | WIC1 | BOC10 | RES23 |
I personally prefer to remove values from the data set we do not need. We can always add them back in. By using select we also reorder the columns to make it easier to take in.
<- df_clean |>
df_clean select(
QuestionId,
Topic,
Question,
Response,
Data_Value,
Data_Value_Unit,
Data_Value_Type,
LocationAbbr,
LocationDesc,
Break_Out,
Break_Out_Category )
Renaming Variables
I prefer to rename variables to follow a consistent style and naming convention (e.g., lowercase, snake_case, noun first). Here I also rename variables to be more descriptive, such as with subgroups. Some would prefer keeping the original names, but I prefer the ergonomics of this approach.
We can use the DT::datatable
function to view the data in our document in an interactive, filterable way. You’ll want to be careful with inline this data, but with public data, this is a good way to share data (or a selection of it) with others.
<- df_clean |>
df_clean rename_with(tolower) |>
rename(
question_id = questionid,
value = data_value,
unit = data_value_unit,
type = data_value_type,
location_abbr = locationabbr,
location = locationdesc,
subgroup = break_out,
subgroup_cat = break_out_category
)
::datatable(df_clean) DT
Understanding The Data
First note that this data still needs work. Even without the codebook, we can seee:
- The response/value has “yes,” “no”, and
NA
values. - The
location_abbr
has both aggregations and location values. - The meaning of each row depends on the
subgroup
andsubgroup_cat
variables.
First, let’s get a sense of the locations in the data
|>
df_clean group_by(location) |>
summarise(
location_abbr = first(location_abbr),
n = n()
|>
) kable()
location | location_abbr | n |
---|---|---|
Arkansas | AR | 210 |
Colorado | CO | 216 |
Delaware | DE | 438 |
Georgia | GA | 210 |
Hawaii | HI | 438 |
Maine | ME | 216 |
Maryland | MD | 438 |
Massachusetts | MA | 216 |
Michigan | MI | 438 |
Minnesota | MN | 438 |
Missouri | MO | 432 |
Nebraska | NE | 210 |
New Jersey | NJ | 216 |
New Mexico | NM | 210 |
New York (excluding NYC) | NY | 210 |
New York City | YC | 216 |
Oklahoma | OK | 216 |
Oregon | OR | 210 |
PRAMS Total | PRAMS Total | 432 |
Pennsylvania | PA | 210 |
Rhode Island | RI | 216 |
Utah | UT | 438 |
Vermont | VT | 216 |
Washington | WA | 216 |
West Virginia | WV | 438 |
Wisconsin | WI | 438 |
Wyoming | WY | 438 |
We see that some non-standard choices are made here: New York and New York City are both listed as locations. Also, aggregations are included under PRAMS Total.
Lets get an overview of the subgroup categories:
|>
df_clean group_by(subgroup_cat, subgroup) |>
summarise(n = n()) |>
kable()
subgroup_cat | subgroup | n |
---|---|---|
Adequacy of Prenatal care | ADEQUATE PNC | 152 |
Adequacy of Prenatal care | INADEQUATE PNC | 152 |
Adequacy of Prenatal care | INTERMEDIATE PNC | 152 |
Adequacy of Prenatal care | UNKNOWN PNC | 152 |
Adequacy of Prenatal care | NA | 49 |
Birth Weight | LBW (<=2500g) | 152 |
Birth Weight | NBW (>2500g) | 152 |
Birth Weight | NA | 67 |
Income (years 2004 and beyond) | $10,000 to $24,999 | 152 |
Income (years 2004 and beyond) | $25,000 to $49,999 | 152 |
Income (years 2004 and beyond) | $50,000 or more | 152 |
Income (years 2004 and beyond) | Less than $10,000 | 152 |
Income (years 2004 and beyond) | NA | 49 |
Marital Status | MARRIED | 152 |
Marital Status | OTHER | 152 |
Marital Status | NA | 67 |
Maternal Age (3 Levels) | 20-29 yrs | 152 |
Maternal Age (3 Levels) | 30+ yrs | 152 |
Maternal Age (3 Levels) | <20 yrs | 152 |
Maternal Age (3 Levels) | NA | 49 |
Maternal Age (4 Levels) | 20-24 yrs | 152 |
Maternal Age (4 Levels) | 25-34 yrs | 152 |
Maternal Age (4 Levels) | 35+ yrs | 152 |
Maternal Age (4 Levels) | <20 yrs | 152 |
Maternal Age (4 Levels) | NA | 67 |
Maternal Age - 18 to 44 years in groupings | Age 18 - 24 | 152 |
Maternal Age - 18 to 44 years in groupings | Age 25 - 29 | 152 |
Maternal Age - 18 to 44 years in groupings | Age 30 - 44 | 152 |
Maternal Age - 18 to 44 years in groupings | Age 45+ | 152 |
Maternal Age - 18 to 44 years in groupings | Age < 18 | 152 |
Maternal Age - 18 to 44 years in groupings | NA | 76 |
Maternal Age - 18 to 44 years only | Age 18 - 44 | 152 |
Maternal Age - 18 to 44 years only | NA | 49 |
Maternal Education | 12 yrs | 152 |
Maternal Education | < 12 yrs | 152 |
Maternal Education | >12 yrs | 152 |
Maternal Education | NA | 76 |
Maternal Race/Ethnicity | Black, non-Hispanic | 152 |
Maternal Race/Ethnicity | Hispanic | 152 |
Maternal Race/Ethnicity | Other non-Hispanic | 152 |
Maternal Race/Ethnicity | White, non-Hispanic | 152 |
Maternal Race/Ethnicity | NA | 67 |
Medicaid Recipient | Medicaid | 152 |
Medicaid Recipient | Non-Medicaid | 152 |
Medicaid Recipient | NA | 67 |
Mother Hispanic | Hispanic | 152 |
Mother Hispanic | Non-Hispanic | 152 |
Mother Hispanic | NA | 67 |
None | None | 152 |
Number of Previous Live Births | 0 | 152 |
Number of Previous Live Births | 1 or more | 152 |
Number of Previous Live Births | NA | 76 |
On WIC during Pregnancy | Non-WIC | 152 |
On WIC during Pregnancy | WIC | 152 |
On WIC during Pregnancy | NA | 76 |
Pregnancy Intendedness | Intended | 152 |
Pregnancy Intendedness | Unintended | 152 |
Pregnancy Intendedness | NA | 49 |
Smoked 3 months before Pregnancy | Non-Smoker | 152 |
Smoked 3 months before Pregnancy | Smoker | 152 |
Smoked 3 months before Pregnancy | NA | 76 |
Smoked last 3 months of Pregnancy | Non-Smoker | 152 |
Smoked last 3 months of Pregnancy | Smoker | 152 |
Smoked last 3 months of Pregnancy | NA | 49 |
We now have a sense of the structure of the entire data file.
Let’s finally use the group_by
and summarise
functions to try narrow down groups to Ns of 1.
<- df_clean |>
df_summary group_by(question_id, location_abbr, subgroup_cat, subgroup, response) |>
summarise(
n = n(),
question = first(question),
mean_value = mean(value)
)
::datatable(df_summary) DT
We now know that the following data combine to form a “base” row:
- question_id
- location_abbr
- subgroup_cat
- subgroup
- response
With this in mind, we can now see that the following data combine to form a “base” row.
Preparing The Data For Visualizaiton
You’ll notice how the data is in a kind of long
format: there are yes, no, and NA values for each question. We want to get all values per question per location/subgroup/subgroup_cat.
Steps:
- Filter out the aggregate (non-location specific) values
- Filter out NAs on the subroup (break out) category: of course, you would need to identify WHY these are NAs and address this issue if there is a reason.
We can then use the tidyr
package to pivot the data to wide format.
<- df_clean |>
df_final filter(location_abbr != "PRAMS Total") |>
filter(!is.na(subgroup)) |>
filter(response == "YES") |> # Only keep YES responses
pivot_wider(
id_cols = c(location_abbr, subgroup_cat, subgroup),
names_from = question_id,
values_from = value,
names_prefix = "q_" # Add prefix to question ID columns
|>
) rename(
depression_within_3_months_birth = q_QUO133,
anxiety_within_3_months_birth = q_QUO232,
alcohol_use_within_3_months_birth = q_QUO8,
binge_drinking_within_3_months_birth = q_QUO271
|>
) arrange(location_abbr, subgroup_cat, subgroup)
::datatable(df_final) DT
Data Diagnostics
Let’s do a quick analysis of missing values.
|>
df_final summarise(
across(everything(), ~ 100 * sum(is.na(.)) / n())
|>
) kable()
location_abbr | subgroup_cat | subgroup | depression_within_3_months_birth | anxiety_within_3_months_birth | binge_drinking_within_3_months_birth | alcohol_use_within_3_months_birth |
---|---|---|---|---|---|---|
0 | 0 | 0 | 63.42062 | 63.42062 | 5.07365 | 5.07365 |
Let’s examine missing data across all measures, by location.
|>
df_final group_by(location_abbr) |>
summarise(
depression_missing = 100 * mean(is.na(depression_within_3_months_birth)),
anxiety_missing = 100 * mean(is.na(anxiety_within_3_months_birth)),
alcohol_missing = 100 * mean(is.na(alcohol_use_within_3_months_birth)),
binge_missing = 100 * mean(is.na(binge_drinking_within_3_months_birth))
|>
) arrange(location_abbr) |>
kable(digits = 1)
location_abbr | depression_missing | anxiety_missing | alcohol_missing | binge_missing |
---|---|---|---|---|
AR | 100.0 | 100.0 | 4.3 | 4.3 |
CO | 100.0 | 100.0 | 2.1 | 2.1 |
DE | 4.3 | 4.3 | 4.3 | 4.3 |
GA | 100.0 | 100.0 | 2.1 | 2.1 |
HI | 4.3 | 4.3 | 4.3 | 4.3 |
MA | 100.0 | 100.0 | 4.3 | 4.3 |
MD | 4.3 | 4.3 | 4.3 | 4.3 |
ME | 100.0 | 100.0 | 14.9 | 14.9 |
MI | 2.1 | 2.1 | 2.1 | 2.1 |
MN | 4.3 | 4.3 | 4.3 | 4.3 |
MO | 4.3 | 4.3 | 4.3 | 4.3 |
NE | 100.0 | 100.0 | 2.1 | 2.1 |
NJ | 100.0 | 100.0 | 6.4 | 6.4 |
NM | 100.0 | 100.0 | 4.3 | 4.3 |
NY | 100.0 | 100.0 | 4.3 | 4.3 |
OK | 100.0 | 100.0 | 2.1 | 2.1 |
OR | 100.0 | 100.0 | 2.1 | 2.1 |
PA | 100.0 | 100.0 | 4.3 | 4.3 |
RI | 100.0 | 100.0 | 6.4 | 6.4 |
UT | 6.4 | 6.4 | 6.4 | 6.4 |
VT | 100.0 | 100.0 | 12.8 | 12.8 |
WA | 100.0 | 100.0 | 4.3 | 4.3 |
WI | 2.1 | 2.1 | 2.1 | 2.1 |
WV | 10.6 | 10.6 | 10.6 | 10.6 |
WY | 6.4 | 6.4 | 6.4 | 6.4 |
YC | 100.0 | 100.0 | 6.4 | 6.4 |
Data on alcohol use is more comprehensive than on anxiety and depression. Please note that just because we’re focused on data visualization does not mean our concerns over missing data are any less important than, say, when we are doing staitstical modeling.
Save The Data
Now, let’s save this processed data to an RDS file. Before we do, let’s merge back in our full location for reference and turn our location variables into factors.
<- df_final |>
df_final left_join(
|>
df_clean select(location_abbr, location) |>
distinct(),
by = "location_abbr"
|>
) mutate(
location_abbr = factor(location_abbr),
location = factor(location)
)
saveRDS(df_final, here::here("data", "processed", "cdc_prams_df_final.rds"))