Data Simulation

Author

Erik Westlund

Published

June 12, 2025

Data Simulation

To simulate data, we implement our causal model. We will start by simulating data for root nodes or exogenous nodes (i.e., those without arrows going into them).

States/Geography

Using data from the Census and the Commonwealth Fund health grades, we’ll generate a data frame with some state-level data. This data will be used to generate state-level variables for patients, such as the political/economic conditions related to health. It will also be used to structure the distribution of race/ethnicity in the population per state.

## Population Data from US Census 2020. Combined with health scores from Commonwealth Fund
## Health Scorecard data.  The weights measure is based upon population data from the Census.
## `pec` is a z-score of the rank of the state in terms of population and used as a 
## measure of state political & economic conditions with respect to health.
## This data is for illustration purposes only.
state_population_data <- read.csv(here::here("data/raw/states_census2020_ranks.csv")) |> 
  mutate(state_weight = population/sum(population)) |> 
   mutate(
    inverted_rank = max(rank) + 1 - rank,  # Invert ranks (lower is better)
    pec = scale(inverted_rank, center = TRUE, scale = TRUE)[, 1]  # Z-score
  ) |> 
  select(-inverted_rank) 


# Load in data on race/ethnicity population from states. This is from the Census2020 file.
# It uses their one-race only and separates  Hispanic from non-Hispanic. The weights
# are just the proportion of the population of one category vs the sum of the population in
# all race/ethnicity categories. 
# This data is meant for illustration purposes only.
state_data <- readr::read_csv(here::here("data/raw/race_by_state_census2020.csv")) |> 
  pivot_longer(cols = -"Label",
               names_to = "State",
               values_to = "Population") |>
  pivot_wider(names_from = "Label", values_from = Population) |>
  mutate(across(-State, ~ as.numeric(gsub(",", "", .)))) |>
  rename(
    state = State,
    white = White,
    black = Black,
    hispanic = "Hispanic or Latino",
    asian = Asian,
    aian = "American Indian and Alaska Native",
    nhpi = "Native Hawaiian and Other Pacific Islander",
    other = Other,
  ) |> mutate(
    sum = white + hispanic + black + asian + aian + other + nhpi,
    white = white / sum,
    black = black / sum,
    hispanic = hispanic / sum,
    asian = asian / sum,
    aian = aian / sum,
    other = other / sum,
    nhpi = nhpi / sum,
  ) |>
  rename(state_name = state) |>
  select(-sum) |>
  left_join(state_population_data |> select(state_name, state, state_weight, pec)) |>
  filter(!is.na(state))

Rows: 7 Columns: 53
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (1): Label
num (52): Alabama, Alaska, Arizona, Arkansas, California, Colorado, Connecti...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Joining with `by = join_by(state_name)`

state_data |> kable()

state_name	hispanic	white	black	aian	asian	nhpi	other	state	state_weight	pec
Alabama	0.0545590	0.6552837	0.2661672	0.0047770	0.0156866	0.0005397	0.0029868	AL	0.0152792	-0.8744746
Alaska	0.0753049	0.6374530	0.0313332	0.1644998	0.0656696	0.0188247	0.0069147	AK	0.0022303	-1.6144147
Arizona	0.3184257	0.5543550	0.0460678	0.0383359	0.0361437	0.0020804	0.0045915	AZ	0.0217483	-0.6726728
Arkansas	0.0896697	0.7204210	0.1570623	0.0071740	0.0178783	0.0049854	0.0028093	AR	0.0141646	-1.0762765
California	0.4109587	0.3617622	0.0559023	0.0041172	0.1577082	0.0036446	0.0059068	CA	0.1202389	0.8072074
Colorado	0.2291691	0.6821550	0.0401439	0.0061253	0.0354114	0.0016334	0.0053620	CO	0.0175583	0.8744746
Connecticut	0.1797075	0.6571469	0.1040652	0.0018464	0.0491466	0.0002808	0.0078065	CT	0.0108044	1.3453456
Delaware	0.1101354	0.6123516	0.2248964	0.0026623	0.0447744	0.0003210	0.0048589	DE	0.0030105	0.6054055
District of Columbia	0.1176439	0.3965867	0.4273339	0.0019347	0.0502863	0.0005287	0.0056858	DC	0.0020970	-1.6816820
Florida	0.2746181	0.5350660	0.1507300	0.0020326	0.0303492	0.0005553	0.0066486	FL	0.0654994	-0.4036037
Georgia	0.1088434	0.5194994	0.3175926	0.0019740	0.0460851	0.0005911	0.0054145	GA	0.0325758	-0.6054055
Hawaii	0.1194132	0.2702167	0.0188047	0.0019950	0.4569079	0.1281214	0.0045411	HI	0.0043058	1.6144147
Idaho	0.1359265	0.8235534	0.0083944	0.0107324	0.0147823	0.0019310	0.0046801	ID	0.0017543	-1.2108110
Illinois	0.1885365	0.6027553	0.1432216	0.0013358	0.0602759	0.0002387	0.0036362	IL	0.0389639	0.2018018
Indiana	0.0849962	0.7854079	0.0977733	0.0019843	0.0255592	0.0004235	0.0038556	IN	0.0206353	-0.4708710
Iowa	0.0700867	0.8560874	0.0419642	0.0029461	0.0243428	0.0018188	0.0027540	IA	0.0097022	0.4036037
Kansas	0.1371900	0.7610919	0.0585731	0.0078602	0.0305591	0.0011169	0.0036086	KS	0.0886338	-0.9417419
Kentucky	0.0479980	0.8462734	0.0826155	0.0018658	0.0170519	0.0007995	0.0033959	KY	0.0137026	-0.2690691
Louisiana	0.0716511	0.5768320	0.3226409	0.0057743	0.0189566	0.0003790	0.0037662	LA	0.0055929	-1.1435437
Maine	0.0203311	0.9384758	0.0191895	0.0055723	0.0127355	0.0003110	0.0033848	ME	0.0041430	1.2780783
Maryland	0.1235503	0.4933212	0.3039091	0.0020410	0.0707635	0.0004360	0.0059789	MD	0.0187854	1.1435437
Massachusetts	0.1324579	0.7086173	0.0682005	0.0014007	0.0753398	0.0002398	0.0137441	MA	0.0209607	1.6816820
Michigan	0.0585621	0.7569665	0.1409480	0.0049186	0.0344768	0.0002701	0.0038580	MI	0.0303708	0.4708710
Minnesota	0.0631830	0.7958892	0.0718130	0.0104280	0.0543757	0.0004791	0.0038320	MN	0.0173539	1.0090092
Mississippi	0.0365622	0.5695525	0.3749352	0.0048714	0.0112255	0.0003603	0.0024928	MS	0.0022303	-1.5471474
Missouri	0.0518324	0.7976482	0.1184822	0.0040184	0.0226024	0.0015893	0.0038270	MO	0.0187176	-0.3363364
Montana	0.0439049	0.8755114	0.0049316	0.0627426	0.0078457	0.0008150	0.0042488	MT	0.0032972	0.0000000
Nebraska	0.1242621	0.7860186	0.0499796	0.0079683	0.0277197	0.0006978	0.0033539	NE	0.0059651	0.1345346
Nevada	0.3030463	0.4853986	0.0993841	0.0079627	0.0905442	0.0078191	0.0058451	NV	0.0094414	-0.7399401
New Hampshire	0.0449400	0.9075440	0.0141009	0.0017378	0.0269123	0.0002933	0.0044718	NH	0.0041892	1.5471474
New Jersey	0.2225201	0.5351818	0.1282448	0.0012452	0.1047746	0.0002160	0.0078175	NJ	0.0282486	0.5381382
New Mexico	0.4912203	0.3756288	0.0186271	0.0916581	0.0171357	0.0007051	0.0050249	NM	0.0120407	-1.3453456
New York	0.2026669	0.5440805	0.1416307	0.0028186	0.0983721	0.0003130	0.0101182	NY	0.0614777	1.0762765
North Carolina	0.1114968	0.6291678	0.2100691	0.0100559	0.0338956	0.0006957	0.0046190	NC	0.0317470	0.2690691
North Dakota	0.0446180	0.8495205	0.0349231	0.0498767	0.0174268	0.0011605	0.0024745	ND	0.0023693	-0.2018018
Ohio	0.0461421	0.7925496	0.1289781	0.0016772	0.0262531	0.0003977	0.0040023	OH	0.0358831	-0.0672673
Oklahoma	0.1316157	0.6713349	0.0789927	0.0869823	0.0250031	0.0022780	0.0037934	OK	0.0054548	-1.4126129
Oregon	0.1479820	0.7631278	0.0197704	0.0105671	0.0482075	0.0045738	0.0057714	OR	0.0128858	0.9417419
Pennsylvania	0.0836252	0.7611426	0.1090696	0.0011973	0.0403679	0.0002519	0.0043454	PA	0.0389319	0.7399401
Rhode Island	0.1742378	0.7214899	0.0529944	0.0033613	0.0367103	0.0003062	0.0109001	RI	0.0033372	1.4798801
South Carolina	0.0715863	0.6448878	0.2574703	0.0033661	0.0181369	0.0006259	0.0039267	SC	0.0155655	-0.8072074
South Dakota	0.0454581	0.8279207	0.0204650	0.0875287	0.0156436	0.0005785	0.0024054	SD	0.0026964	0.0672673
Tennessee	0.0721600	0.7379203	0.1632035	0.0023400	0.0202243	0.0005412	0.0036107	TN	0.0210164	-0.5381382
Texas	0.4048817	0.4099377	0.1218961	0.0030229	0.0552566	0.0009858	0.0040193	TX	0.0091583	-1.0090092
Utah	0.1564222	0.7823633	0.0118026	0.0091046	0.0249489	0.0113707	0.0039877	UT	0.0099492	-0.1345346
Vermont	0.0252702	0.9342703	0.0140972	0.0032370	0.0186740	0.0002771	0.0041742	VT	0.0019569	1.4126129
Virginia	0.1104663	0.6148877	0.1918305	0.0023193	0.0742252	0.0007531	0.0055180	VA	0.0262488	0.3363364
Washington	0.1472322	0.6837234	0.0411681	0.0126757	0.1005067	0.0086862	0.0060078	WA	0.0237116	1.2108110
West Virginia	0.0202297	0.9287010	0.0376102	0.0018512	0.0086566	0.0002492	0.0027022	WV	0.0090055	-1.4798801
Wisconsin	0.0786102	0.8144184	0.0644130	0.0085034	0.0306270	0.0003325	0.0030954	WI	0.0179233	0.6726728
Wyoming	0.1067398	0.8490302	0.0085596	0.0212970	0.0091056	0.0008840	0.0043838	WY	0.0064396	-1.2780783

readr::write_csv(state_data, here::here("data/processed/state_data.csv"))

Providers

We need to generate providers. We will have a provider for roughly every 100 patients. We will generate a provider quality measure which is mostly random, but determined partially by state political and economic considerations with respect to health.

Providers will vary by state population. We below show a random sample of 50 providers.

providers <- state_data |>
  slice(rep(1:n(), each = 1)) |>
  arrange(state) |>
  mutate(
    id = 1:n(),
  ) |> 
  select(id, state, pec)

remaining_providers <- n_providers - nrow(providers)

additional_providers <- data.frame(
  id = (nrow(providers) + 1):n_providers,
  state = sample(state_data$state, remaining_providers, replace = TRUE, prob = state_data$state_weight)
) |> 
  left_join(state_data |> select(state, pec), by = "state")

providers <- bind_rows(providers, additional_providers) |> 
  mutate(
    quality = gen_provider_quality(n_providers, pec)
  ) |> 
  arrange(id) |> 
  select(id, state, quality)

providers |> sample_n(50) |>  kable()

id	state	quality
302	WA	0.5584503
188	WI	-0.0484019
293	DC	-0.1116249
386	IA	1.4081174
317	CA	0.4050019
427	FL	-0.0005722
349	CA	-0.2516285
325	CA	1.1807350
227	WI	1.0043082
166	NC	0.1900311
463	NY	-0.1232404
495	CA	0.1136721
208	NY	2.1331043
342	OK	0.0312325
356	NC	0.9456419
395	KY	0.2885455
488	AR	0.8170037
164	OH	-0.6728434
307	WA	1.6804351
150	WA	0.1608852
110	MD	1.7158025
226	WA	0.8153007
142	GA	0.0019453
75	NM	0.5415809
172	CA	0.8553539
167	NM	1.4933018
251	CA	0.9145226
274	FL	0.5157689
70	UT	1.5896805
178	FL	0.4011250
105	CA	1.5920369
182	UT	1.0789726
306	CT	-0.8834033
398	KS	0.3625542
187	MO	-0.0047936
287	KS	0.3148554
123	KS	0.0998749
137	MO	0.6180418
233	MD	1.2898827
200	AL	0.9057118
214	CA	-0.4118376
103	OK	0.1479559
254	OH	0.2450121
354	CO	1.4982565
443	NJ	1.2863920
352	CA	0.4486037
80	NC	1.0352676
156	FL	1.1929958
437	CA	0.2827005
173	CA	1.7380359

Patients

We’ll start by generating a date frame with patients. We will generate a state for each patient based on the population of each state. We will randomly assign a race/ethnicity based upon state-level proportions. We will then assign that person to a provider in their state.

# Start by creating a dataframe with patients having state IDs proportional to
# the population of each state. Then row-wise generate race/ethnicity categories.
data <- data.frame(state = sample(
  state_data$state,
  n,
  replace = TRUE,
  prob = state_data$state_weight
)) |>
  merge(state_data |> select(state, pec, re_cats), by = "state") |>
  rowwise() |>
  mutate(race = sample(re_cats, size = 1, prob = c_across(all_of(re_cats)))) |>
  ungroup() |>
  select(-re_cats) |> 
  mutate(id = row_number())

Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(re_cats)

  # Now:
  data %>% select(all_of(re_cats))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

data |> sample_n(10) |> kable()

state	pec	race	id
TN	-0.5381382	hispanic	44601
NC	0.2690691	white	31863
NV	-0.7399401	black	34578
NY	1.0762765	white	36326
KS	-0.9417419	white	21635
NC	0.2690691	white	31121
MD	1.1435437	white	26242
MI	0.4708710	white	27201
MO	-0.3363364	white	29266
IN	-0.4708710	white	18294

Assign Patient to Provider

data <- data |> 
  rowwise() |>
  mutate(provider_id = sample(providers$id[providers$state == state], 1)) |>
  ungroup()  |> 
  left_join(providers |> select(id, quality), by = c("provider_id" = "id")) |> 
  select(id, provider_id, quality, state, pec, race) |> 
  rename(
    provider_quality = quality
  )
  

data |> sample_n(10) |> kable()

id	provider_id	provider_quality	state	pec	race
46480	46	0.9183326	VA	0.3363364	white
9219	314	-1.2354202	CO	0.8744746	white
23482	176	0.2513363	KS	-0.9417419	white
9035	100	1.8010039	CO	0.8744746	hispanic
12261	323	0.0338907	FL	-0.4036037	hispanic
16551	368	0.9353468	IL	0.2018018	white
27001	421	0.8893215	MI	0.4708710	white
48287	150	0.1608852	WA	1.2108110	asian
29880	25	0.6985157	MO	-0.3363364	white
12666	178	0.4011250	FL	-0.4036037	white

Exogenous Nodes

Now every patient has a state, a provider, a race/ethnicity, and some associated data by state.

In addition to race/ethnicity and state, the following variables have no prior causes in our model:

AGE: Age of the patient.
PCE: Parent community connections
PED: Parent education
PI: Parent intelligence
PM: Parent motivation
PSR: Parent resilience

We will generate n observations for each of these variables.

## Next generate all the other exogenous variables
# We need to assign a provider to each patient. This provider needs a quality
data <- data |>
  mutate(
    age = gen_mother_ages(n),
    parent_income = gen_incomes(
      n,
      median_income = 60000,
      sd = 25000,
      min_income = 0,
      max_income = 1000000
    ),
    parent_intelligence = rnorm(n, 1, 1),
    parent_resilience = rnorm(n, 1, 1),
    parent_motivation = rnorm(n, 1, 1),
    parent_community_connections = rnorm(n, 0, 1),
    parent_edu = gen_education(n, ed_cats, parent_intelligence, parent_resilience, parent_motivation, parent_community_connections, parent_income),
  )

data |> sample_n(10) |> kable()

id	provider_id	provider_quality	state	pec	race	age	parent_income	parent_intelligence	parent_resilience	parent_motivation	parent_community_connections	parent_edu
5285	493	1.2630188	CA	0.8072074	white	26	46351	3.2436271	0.2525813	1.6954949	0.2546516	hs
45413	70	1.5896805	UT	-0.1345346	hispanic	23	55997	1.8129310	0.8719194	1.3830863	1.4434692	some_college
7687	230	0.4252149	CA	0.8072074	hispanic	39	65023	0.9022916	0.1343653	0.2312583	-0.9743949	hs
32576	449	0.9623769	NJ	0.5381382	white	22	60420	2.3778012	2.5190224	1.0760966	-2.5790280	college
1293	204	-0.0219117	AR	-1.0762765	white	34	56641	0.3425865	0.5454315	2.8228031	-0.1848488	some_college
17874	266	2.3467958	IL	0.2018018	hispanic	23	66054	0.7721575	-0.5256282	1.0021947	0.9933364	hs
46442	66	2.5322108	VA	0.3363364	black	23	62409	1.5751782	-0.1781548	1.4913066	0.7047459	some_college
23954	92	0.1438871	KY	-0.2690691	white	27	47060	1.1336424	1.1333922	2.2675736	-0.0986224	college
10513	56	0.4856040	FL	-0.4036037	white	17	62376	1.0374659	2.9382283	1.9864772	-2.6375022	less_than_hs
14780	198	0.9845173	GA	-0.6054055	black	28	100030	1.3601545	0.4665285	1.5547804	0.6022043	hs

Working our way up from the root nodes

Religion is patterned by race but has no other determinants.

data <- data |> mutate(
  religion = gen_religion(n, rel_cats, race),
)

data |> select(id, race, religion) |> sample_n(10) |> kable()

id	race	religion
41739	white	other
30510	white	other
47129	white	christian
14477	white	christian
16091	black	other
23753	white	christian
45524	white	christian
36187	asian	hindu
20614	hispanic	christian
24763	white	other

The following variables are determined by parents’ values:

I: Intelligence
SR: Resilience
M: Motivation
CE: Community connections

data <- data |> mutate(
  intelligence = gen_correlated(parent_intelligence, target_r = 0.5),
  resilience = gen_correlated(parent_resilience, target_r = 0.5),
  motivation = gen_correlated(parent_motivation, target_r = 0.5),
  community_connections = gen_correlated(parent_community_connections, target_r = 0.5),
)

data |> select(id, parent_intelligence, intelligence, parent_resilience, resilience, parent_motivation, motivation, parent_community_connections, community_connections) |> sample_n(10) |> kable()

id	parent_intelligence	intelligence	parent_resilience	resilience	parent_motivation	motivation	parent_community_connections	community_connections
25876	0.8162152	-0.0233094	1.4151382	0.9532587	-0.5650720	-0.3885897	1.4340931	0.7340203
14227	0.5367401	0.5437273	0.1321767	0.6611476	2.6624444	2.0114176	0.4870163	0.3318152
4686	1.2870244	-0.0867434	0.9697317	-1.2280097	0.6601827	0.2003158	0.2238436	0.0487312
37029	0.0126591	0.9261652	2.2590969	1.2338338	0.9327173	0.6468705	0.9934896	0.6896500
38242	-0.4226082	-0.9823248	0.8806784	0.2909840	1.0649707	0.6756294	-2.5803932	-0.3124163
39867	0.6555069	-0.0669607	0.5228141	-1.0866989	-0.2684624	-0.1816550	-0.3244369	0.5392087
2272	0.4396705	-0.1777916	-0.9879631	-0.9986611	-0.7904935	-1.2463709	-1.5127369	-1.8910560
6760	1.3337479	0.1012518	2.4220336	0.8947314	2.3017777	0.2140353	1.6707951	1.2177864
9861	0.1687158	-0.3712220	1.7133447	0.9102988	1.2229167	-0.2483762	-0.4536596	-0.5375337
14776	2.1533717	2.3308366	0.8096811	-1.4655991	1.0262741	-0.2435721	-0.3430252	-0.3251550

The following variables are determined by a mixture personality, geography, and parents’ values:

Education (varies by personal traits and parents’ class position)
Cultural orientation (namely, trust of institutions; varies by parents class position, religion, geography, and commu nity connections)
Job type (varies by educational attainment and parental income)
Dependents (varies by income, job type, and age)
Insurance (varies by job type, state conditions, age)
Distance to provider (varies by state conditions)

data <- data |> mutate(
  edu = gen_education(n, ed_cats, intelligence, resilience, motivation, community_connections, parent_income, parent_edu),
  income = gen_incomes(
    n,
    median_income = 60000,
    sd = 25000,
    min_income = 0,
    max_income = 1000000
  ),
  cultural_orientation = gen_cultural_orientation(n, parent_income, parent_edu, pec, religion, community_connections),
  job_type = gen_job_type(n, job_type_cats, ed_cats, edu, parent_income),
  dependents = gen_dependents(n, income, job_type, age),
  insurance = gen_insurance(n, job_type_cats, job_type, pec, age),
  distance_to_provider = gen_distance(n, pec),
)

data |> select(id, edu, income, cultural_orientation, job_type, dependents, insurance, distance_to_provider) |> sample_n(10) |> kable()

id	edu	income	cultural_orientation	job_type	dependents	insurance	distance_to_provider
48841	hs	48747	0.4125719	trade	1	private	2.697340
31574	less_than_hs	51999	0.1853598	unemployed	2	private	24.720962
46429	some_college	24904	0.5749469	trade	1	state_provided	8.767639
38670	college	38613	0.6366736	professional	1	private	3.165834
42931	some_college	39275	0.3642163	unskilled	2	no_insurance	1.708844
16093	less_than_hs	39069	0.5471840	unemployed	4	no_insurance	32.856579
9144	hs	40162	0.2125100	unskilled	2	private	1.439326
31796	college	75072	0.7164579	professional	2	private	7.441291
32754	hs	22815	0.3935348	unskilled	2	no_insurance	27.329120
19168	some_college	52135	0.2906098	trade	2	no_insurance	21.536786

Comorbidity is determined by age, SES, and other comorbidities.

Obesity (varies by state conditions, age)
Multiple gestation (varies by age, obesity)
Diabetes (varies by age, obesity, income)
Heart disease (varies by age, obesity, diabetes)
Placenta previa (varies by multiple gestation)
Hypertension (varies by age, obesity)
Gestational hypertension (varies by hypertension, multiple gestation)
Preeclampsia (varies by age, hypertension, gestational hypertension, multiple gestation)

data <- data |> mutate(
  obesity = gen_obesity(n, income, edu, pec, age, target_prevalence=0.35),
  multiple_gestation = gen_multiple_gestation(n, age, obesity, target_prevalence = 0.03),
  diabetes = gen_diabetes(n, age, obesity, income, target_prevalence = 0.1),
  heart_disease = gen_heart_disease(n, age, obesity, diabetes, target_prevalence = 0.15),
  placenta_previa = gen_placenta_previa(n, age, multiple_gestation, target_prevalence = 0.01),
  hypertension = gen_hypertension(n, age, obesity, target_prevalence = 0.2),
  gest_hypertension = gen_gest_hypertension(n, hypertension, multiple_gestation, target_prevalence = 0.05),
  preeclampsia = gen_preeclampsia(n, age, hypertension, gest_hypertension, multiple_gestation, target_prevalence = 0.02),
)

data |> select(id, obesity, multiple_gestation, diabetes, heart_disease, placenta_previa, hypertension, gest_hypertension, preeclampsia) |> sample_n(10) |> kable()

id	obesity	hypertension
46910	1	0
3914	0	0
12289	1	0
40707	0	0
10945	0	0
33878	0	0
29917	0	1
46877	0	0
28348	0	0
45953	0	0

Immediate causes of receipt of comprehensive postnatal care

Provider quality is determined by state conditions and already calculated.

Personal capacity (to attend visits) is determined by:

dependents
job type
income
distance to provider

data <- data |> mutate(
  personal_capacity = gen_personal_capacity(n, dependents, job_type, income, distance_to_provider)
)

Now generate the risk profile, which is a function of:

provider quality (negatively correlated)
age
obesity
multiple gestation
diabetes
heart disease
placenta previa
hypertension
gestational hypertension
preeclampsia

data <- data |> mutate(
  risk_profile = gen_risk_profile(n, provider_quality = providers$quality[match(data$provider_id, providers$id)], 
                                 age, obesity, multiple_gestation, diabetes, heart_disease, placenta_previa, hypertension, gest_hypertension, preeclampsia)
)

Now generate risk aversion, which we see as a function of the negative conseuqences from getting really sick and how much people want to avoid them.

insurance (people without insurance will be more risk averse because of cost; positively correlated)
provider quality (people with better providers will be less risk averse since they trust the care they can get; negatively correlated)
risk profile (people with higher risk will be more risk averse since they will have more consequences if getting ill; positively correlated)

data <- data |> mutate(
  risk_aversion = gen_risk_aversion(n, insurance, provider_quality = providers$quality[match(data$provider_id, providers$id)], risk_profile)
)

Provider trust is a function of:

race/ethnicity (racial minorities are less likely to trust providers; negatively correlated)
provider quality (people with better providers will be more likely to trust them; positively correlated)
cultural orientation (people trusting institutions will be more likely to trust providers; positively correlated)

data <- data |> mutate(
  provider_trust = gen_provider_trust(n, re_cats, race, provider_quality, cultural_orientation)
)

And finally, willingness to pay, which we see as a function of:

provider quality (people with better providers will be willing to pay more; positively correlated)
income (people with higher income will be willing to pay more; positively correlated)
insurance (people with insurance will be willing to pay more since they don’t have to cover it out of pocket; positively correlated)
risk aversion (people who are more risk averse will be willing to pay more; positively correlated)
cultural orientation (people who trust institutions will be willing to pay more; positively correlated)

data <- data |> mutate(
  willingness_to_pay = gen_willingness_to_pay(n, provider_quality = providers$quality[match(data$provider_id, providers$id)], income, insurance, risk_aversion, cultural_orientation)
)

Finally, we generate the outcome of interest, receipt of comprehensive postnatal care, which is a function of:

personal capacity (people with more capacity will be more likely to attend visits; positively correlated)
willingness to pay (people who are willing to pay more will be more likely to attend visits; positively correlated)
provider quality (people with better providers will be more likely to attend visits; positively correlated)
provider trust (people who trust their providers will be more likely to attend visits; positively correlated)
risk aversion (people who are more risk averse will be more likely to attend visits; negatively correlated)
risk profile (people with higher risk will be more likely to attend visits; positively correlated)

data <- data |> 
  mutate(
    received_comprehensive_postnatal_care = gen_received_comprehensive_postnatal_care(n, personal_capacity, willingness_to_pay, provider_quality, provider_trust, risk_aversion, risk_profile)
  )

We’re going to include an income variable, but it’s going to be censored and have measurement error.

sri_labels <- c("$0–$24,999", "$25,000–$49,999", "$50,000–$74,999", "$75,000–$99,999", "$100,000–$124,999", "$125,000–$149,999", "$150,000–$174,999", "$175,000+")

data <- data |> mutate(
  self_report_income = cut(
      pmin(pmax(income + rnorm(n, mean = 0, sd = 5000), 0), 200000), # Add noise and cap
      breaks = seq(0, 200000, by = 25000),
      include.lowest = TRUE,
      right = FALSE,
      labels = sri_labels
    )
)

table(data$self_report_income)


       $0–$24,999   $25,000–$49,999   $50,000–$74,999   $75,000–$99,999 
              608             14637             21347              9666 
$100,000–$124,999 $125,000–$149,999 $150,000–$174,999         $175,000+ 
             2832               696               158                56

analysis_data <- data |> select(
  id,
  provider_id,
  state,
  received_comprehensive_postnatal_care,
  self_report_income,
  age,
  edu,
  race,
  insurance,
  job_type,
  dependents,
  distance_to_provider,
  obesity,
  multiple_gestation,
  diabetes,
  heart_disease,
  placenta_previa,
  hypertension,
  gest_hypertension,
  preeclampsia
) |>
  mutate(
    self_report_income = as.character(self_report_income),
    job_type = as.character(job_type),
    dependents = as.character(dependents),
  ) |> 
  rename(
    race_ethnicity = race
  )

readr::write_csv(analysis_data, here::here("data/processed/simulated_data.csv"))