Appendix D — Sample Balance
D.1 Overview
This document outlines the methodology employed to balance the sample data, ensuring representativeness and reliability for hypothesis testing and subsequent analysis.
The balancing process accounts for the population distribution across the variables state
, sex
, and age
.
Please note that the values presented here are specific to the group of states in the Brazilian time zone UTC-3 and do not represent the entire country.
D.2 Setting the Environment
D.3 Loading the Sample Data
Code
data <-
targets::tar_read("filtered_data", store = here::here("_targets")) |>
dplyr::filter(state %in% orbis::get_brazil_state_by_utc(-3, "state")) |>
dplyr::select(country, region, state, sex, age, msf_sc) |>
tidyr::drop_na(state, sex, age) |>
dplyr::mutate(
age_group = dplyr::case_when(
age >= 18 & age < 20 ~ "18-19",
age >= 20 & age < 25 ~ "20-24",
age >= 25 & age < 30 ~ "25-29",
age >= 30 & age < 40 ~ "30-39",
age >= 40 & age < 50 ~ "40-49",
age >= 50 & age < 60 ~ "50-59",
age >= 60 & age < 65 ~ "60-64",
age >= 65 ~ "65+"
),
age_group = factor(
age_group,
levels = c(
"18-19", "20-24", "25-29", "30-39", "40-49", "50-59", "60-64",
"65+"
),
ordered = TRUE
)
) |>
dplyr::relocate(age_group, .after = age)
data
D.4 Getting the Population Data
Source: Instituto Brasileiro de Geografia e Estatística. (n.d.). Tabela 6407: População residente, por sexo e grupos de idade [Table 6407: Resident population, by sex and age groups] [Dataset]. SIDRA. https://sidra.ibge.gov.br/tabela/6407
Code
prettycheck:::assert_internet()
ibge_6407_data <-
sidrar::get_sidra(
api = paste0(
"/t/6407/n1/all/n3/all/v/606/p/2017/c2/all/c58/1144,1145,",
"1152,2793,3299,3300,3301,3302,6798"
)
) |>
dplyr::as_tibble() |>
dplyr::select(
dplyr::all_of(
c(
"Valor", "Brasil e Unidade da Federação", "Ano", "Sexo",
"Grupo de idade"
)
)
) |>
dplyr::rename(
n = Valor,
state = `Brasil e Unidade da Federação`,
year = Ano,
sex = Sexo,
age_group = `Grupo de idade`
) |>
dplyr::filter(
state != "Brasil",
sex != "Total",
age_group != "60 anos ou mais"
) |>
dplyr::arrange(state, sex, age_group) |>
dplyr::mutate(
year = as.integer(year),
country = "Brazil",
region = orbis::get_brazil_region(state),
sex = dplyr::case_when(
sex == "Homens" ~ "Male",
sex == "Mulheres" ~ "Female"
),
sex = factor(sex, ordered = FALSE),
age_group = dplyr::case_when(
age_group == "18 a 19 anos" ~ "18-19",
age_group == "20 a 24 anos" ~ "20-24",
age_group == "25 a 29 anos" ~ "25-29",
age_group == "30 a 39 anos" ~ "30-39",
age_group == "40 a 49 anos" ~ "40-49",
age_group == "50 a 59 anos" ~ "50-59",
age_group == "60 a 64 anos" ~ "60-64",
age_group == "65 anos ou mais" ~ "65+"
),
age_group = factor(age_group, ordered = TRUE),
n = as.integer(n * 1000)
) |>
dplyr::relocate(year, country, region, state, sex, age_group, n) |>
dplyr::filter(state %in% orbis::get_brazil_state_by_utc(-3, "state")) |>
dplyr::mutate(
n_rel = n / sum(n),
n_per = (n / sum(n)) * 100
)
ibge_6407_data
Code
ibge_6407_data |> rutils:::summarize_data("age_group")
Code
ibge_6407_data |> rutils:::summarize_data("sex")
Code
ibge_6407_data |> rutils:::summarize_data("state")
Code
ibge_6407_data |> rutils:::summarize_data("region")
Code
ibge_6407_data |> rutils:::summarize_data("country")
D.5 Comparing the Sample Data with the Population Data
Code
data |> rutils:::compare_sample(ibge_6407_data, "age_group")
Code
data |> rutils:::compare_sample(ibge_6407_data, "sex")
Code
data |> rutils:::compare_sample(ibge_6407_data, "state")
Code
data |> rutils:::compare_sample(ibge_6407_data, "region")
Code
data |> rutils:::compare_sample(ibge_6407_data, "country")
D.6 Adjusting Sample Data with the Population Data
The tables above reveal an overrepresentation of states such as São Paulo and Rio de Janeiro, while states like Amapá and Tocantins are underrepresented. To address this imbalance, a cell weighting procedure was applied, using the variables state
, sex
, and age group
to ensure a more representative sample.
See Kalton & Flores-Cervantes (2003) and Kuhn (2022) to learn more about the methodology.
\[ \text{Cell weighting}: \cfrac{\% \ \text{2017 IBGE's PNAD Survey}}{\% \ \text{Sample}} \]
Code
ibge_6407_data |>
dplyr::select(state, sex, age_group, n) |>
tidyr::pivot_wider(names_from = state, values_from = n) |>
janitor::adorn_totals(c("row", "col")) |>
janitor::adorn_percentages("col") |>
janitor::adorn_pct_formatting(digits = 3) |>
janitor::adorn_ns() |>
dplyr::as_tibble()
Code
weights_data <-
ibge_6407_data |>
dplyr::left_join(
data |>
dplyr::summarise(
n = dplyr::n(),
.by = c("country", "region", "state", "sex", "age_group")
) |>
dplyr::mutate(
n_rel = n / sum(n),
n_per = (n / sum(n)) * 100
),
by = c("country", "region", "state", "sex", "age_group"),
suffix = c("_pnad", "_sample")
) |>
dplyr::relocate(year, .before = country) |>
dplyr::mutate(
cell_weight = parsnip::importance_weights(n_per_pnad / n_per_sample)
# inv_prob_weight = parsnip::importance_weights(1 / (n / sum(n)))
) |>
dplyr::select(
country, region, state, sex, age_group, cell_weight
) |>
dplyr::arrange(state, sex, age_group)
weights_data
Code
test_data <-
data |>
dplyr::mutate(
msf_sc =
msf_sc |>
lubritime:::link_to_timeline(
threshold = hms::parse_hms("12:00:00")
) |>
as.numeric()
) |>
dplyr::left_join(
weights_data,
by = c("country", "region", "state", "sex", "age_group")
)
D.7 Testing Fitting a Model with the Weights
D.7.1 Without Weights
model <-
parsnip::linear_reg() |>
parsnip::set_engine("lm") |>
parsnip::set_mode("regression")
workflow <-
workflows::workflow() |>
workflows::add_formula(msf_sc ~ sex + age) |>
workflows::add_model(model)
workflow
#> ══ Workflow ═════════════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#>
#> ── Preprocessor ─────────────────────────────────────────────────────────────
#> msf_sc ~ sex + age
#>
#> ── Model ────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#>
#> Computational engine: lm
fit <- workflow |> parsnip::fit(test_data)
Code
fit |>
broom::tidy() |>
janitor::adorn_rounding(5)
Code
fit |>
broom::glance() |>
tidyr::pivot_longer(cols = dplyr::everything()) |>
janitor::adorn_rounding(10)
D.7.2 With Weights
model <-
parsnip::linear_reg() |>
parsnip::set_engine("lm") |>
parsnip::set_mode("regression")
workflow <-
workflows::workflow() |>
workflows::add_case_weights(cell_weight) |>
workflows::add_formula(msf_sc ~ sex + age) |>
workflows::add_model(model)
workflow
#> ══ Workflow ═════════════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#>
#> ── Preprocessor ─────────────────────────────────────────────────────────────
#> msf_sc ~ sex + age
#>
#> ── Case Weights ─────────────────────────────────────────────────────────────
#> cell_weight
#>
#> ── Model ────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#>
#> Computational engine: lm
fit <- workflow |> parsnip::fit(test_data)
Code
fit |>
broom::tidy() |>
janitor::adorn_rounding(5)
Code
fit |>
broom::glance() |>
tidyr::pivot_longer(cols = dplyr::everything()) |>
janitor::adorn_rounding(10)