[Appendix D — Sample Balance]{#sec-sm-sample-balance .quarto-section-identifier}

doi:10.17605/OSF.IO/YGKTS

Appendix D — Sample Balance

D.1 Overview

This document outlines the methodology employed to balance the sample data, ensuring representativeness and reliability for hypothesis testing and subsequent analysis.

The balancing process accounts for the population distribution across the variables state, sex, and age.

Please note that the values presented here are specific to the group of states in the Brazilian time zone UTC-3 and do not represent the entire country.

D.2 Setting the Environment

Code

library(broom)
library(dplyr)
library(here)
library(janitor)
library(magrittr)
library(parsnip)
library(report)
library(rlang)
library(sidrar)
library(targets)
library(tidyr)
library(workflows)

D.3 Loading the Sample Data

Code

targets::tar_make(script = here::here("_targets.R"))

Code

data <-
  targets::tar_read("filtered_data", store = here::here("_targets")) |>
  dplyr::filter(state %in% orbis::get_brazil_state_by_utc(-3, "state")) |>
  dplyr::select(country, region, state, sex, age, msf_sc) |>
  tidyr::drop_na(state, sex, age) |>
  dplyr::mutate(
    age_group = dplyr::case_when(
      age >= 18 & age < 20 ~ "18-19",
      age >= 20 & age < 25 ~ "20-24",
      age >= 25 & age < 30 ~ "25-29",
      age >= 30 & age < 40 ~ "30-39",
      age >= 40 & age < 50 ~ "40-49",
      age >= 50 & age < 60 ~ "50-59",
      age >= 60 & age < 65 ~ "60-64",
      age >= 65 ~ "65+"
    ),
    age_group = factor(
      age_group,
      levels = c(
        "18-19", "20-24", "25-29", "30-39", "40-49", "50-59", "60-64",
        "65+"
      ),
      ordered = TRUE
    )
  ) |>
  dplyr::relocate(age_group, .after = age)

data

D.4 Getting the Population Data

Source: Instituto Brasileiro de Geografia e Estatística. (n.d.). Tabela 6407: População residente, por sexo e grupos de idade [Table 6407: Resident population, by sex and age groups] [Dataset]. SIDRA. https://sidra.ibge.gov.br/tabela/6407

Code

prettycheck:::assert_internet()

ibge_6407_data <-
  sidrar::get_sidra(
    api = paste0(
      "/t/6407/n1/all/n3/all/v/606/p/2017/c2/all/c58/1144,1145,",
      "1152,2793,3299,3300,3301,3302,6798"
    )
  ) |>
  dplyr::as_tibble() |>
  dplyr::select(
    dplyr::all_of(
      c(
        "Valor", "Brasil e Unidade da Federação", "Ano", "Sexo",
        "Grupo de idade"
      )
    )
  ) |>
  dplyr::rename(
    n = Valor,
    state = `Brasil e Unidade da Federação`,
    year = Ano,
    sex = Sexo,
    age_group = `Grupo de idade`
  ) |>
  dplyr::filter(
    state != "Brasil",
    sex != "Total",
    age_group != "60 anos ou mais"
  ) |>
  dplyr::arrange(state, sex, age_group) |>
  dplyr::mutate(
    year = as.integer(year),
    country = "Brazil",
    region = orbis::get_brazil_region(state),
    sex = dplyr::case_when(
      sex == "Homens" ~ "Male",
      sex == "Mulheres" ~ "Female"
    ),
    sex = factor(sex, ordered = FALSE),
    age_group = dplyr::case_when(
      age_group == "18 a 19 anos" ~ "18-19",
      age_group == "20 a 24 anos" ~ "20-24",
      age_group == "25 a 29 anos" ~ "25-29",
      age_group == "30 a 39 anos" ~ "30-39",
      age_group == "40 a 49 anos" ~ "40-49",
      age_group == "50 a 59 anos" ~ "50-59",
      age_group == "60 a 64 anos" ~ "60-64",
      age_group == "65 anos ou mais" ~ "65+"
    ),
    age_group = factor(age_group, ordered = TRUE),
    n = as.integer(n * 1000)
  ) |>
  dplyr::relocate(year, country, region, state, sex, age_group, n) |>
  dplyr::filter(state %in% orbis::get_brazil_state_by_utc(-3, "state")) |>
  dplyr::mutate(
    n_rel = n / sum(n),
    n_per = (n / sum(n)) * 100
  )

ibge_6407_data

Code

ibge_6407_data |> rutils:::summarize_data("age_group")

Code

ibge_6407_data |> rutils:::summarize_data("sex")

Code

ibge_6407_data |> rutils:::summarize_data("state")

Code

ibge_6407_data |> rutils:::summarize_data("region")

Code

ibge_6407_data |> rutils:::summarize_data("country")

D.5 Comparing the Sample Data with the Population Data

Code

data |> rutils:::compare_sample(ibge_6407_data, "age_group")

Code

data |> rutils:::compare_sample(ibge_6407_data, "sex")

Code

data |> rutils:::compare_sample(ibge_6407_data, "state")

Code

data |> rutils:::compare_sample(ibge_6407_data, "region")

Code

data |> rutils:::compare_sample(ibge_6407_data, "country")

D.6 Adjusting Sample Data with the Population Data

The tables above reveal an overrepresentation of states such as São Paulo and Rio de Janeiro, while states like Amapá and Tocantins are underrepresented. To address this imbalance, a cell weighting procedure was applied, using the variables state, sex, and age group to ensure a more representative sample.

See Kalton & Flores-Cervantes (2003) and Kuhn (2022) to learn more about the methodology.

\[ \text{Cell weighting}: \cfrac{\% \ \text{2017 IBGE's PNAD Survey}}{\% \ \text{Sample}} \]

Code

ibge_6407_data |>
  dplyr::select(state, sex, age_group, n) |>
  tidyr::pivot_wider(names_from = state, values_from = n) |>
  janitor::adorn_totals(c("row", "col")) |>
  janitor::adorn_percentages("col") |>
  janitor::adorn_pct_formatting(digits = 3) |>
  janitor::adorn_ns() |>
  dplyr::as_tibble()

Code

weights_data <-
  ibge_6407_data |>
  dplyr::left_join(
    data |>
      dplyr::summarise(
        n = dplyr::n(),
        .by = c("country", "region", "state", "sex", "age_group")
      ) |>
      dplyr::mutate(
        n_rel = n / sum(n),
        n_per = (n / sum(n)) * 100
      ),
    by = c("country", "region", "state", "sex", "age_group"),
    suffix = c("_pnad", "_sample")
  ) |>
  dplyr::relocate(year, .before = country) |>
  dplyr::mutate(
    cell_weight = parsnip::importance_weights(n_per_pnad / n_per_sample)
    # inv_prob_weight = parsnip::importance_weights(1 / (n / sum(n)))
  ) |>
  dplyr::select(
    country, region, state, sex, age_group, cell_weight
  ) |>
  dplyr::arrange(state, sex, age_group)

weights_data

Code

test_data <-
  data |>
  dplyr::mutate(
    msf_sc =
      msf_sc |>
      lubritime:::link_to_timeline(
        threshold = hms::parse_hms("12:00:00")
      ) |>
      as.numeric()
  ) |>
  dplyr::left_join(
    weights_data,
    by = c("country", "region", "state", "sex", "age_group")
  )

D.7 Testing Fitting a Model with the Weights

D.7.1 Without Weights

model <-
  parsnip::linear_reg() |>
  parsnip::set_engine("lm") |>
  parsnip::set_mode("regression")

workflow <-
  workflows::workflow() |>
  workflows::add_formula(msf_sc ~ sex + age) |>
  workflows::add_model(model)

workflow
#> ══ Workflow ═════════════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ─────────────────────────────────────────────────────────────
#> msf_sc ~ sex + age
#> 
#> ── Model ────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm

fit <- workflow |> parsnip::fit(test_data)

Code

fit |>
  broom::tidy() |>
  janitor::adorn_rounding(5)

Code

fit |>
  broom::glance() |>
  tidyr::pivot_longer(cols = dplyr::everything()) |>
  janitor::adorn_rounding(10)

D.7.2 With Weights

model <-
  parsnip::linear_reg() |>
  parsnip::set_engine("lm") |>
  parsnip::set_mode("regression")

workflow <-
  workflows::workflow() |>
  workflows::add_case_weights(cell_weight) |>
  workflows::add_formula(msf_sc ~ sex + age) |>
  workflows::add_model(model)

workflow
#> ══ Workflow ═════════════════════════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: linear_reg()
#> 
#> ── Preprocessor ─────────────────────────────────────────────────────────────
#> msf_sc ~ sex + age
#> 
#> ── Case Weights ─────────────────────────────────────────────────────────────
#> cell_weight
#> 
#> ── Model ────────────────────────────────────────────────────────────────────
#> Linear Regression Model Specification (regression)
#> 
#> Computational engine: lm

fit <- workflow |> parsnip::fit(test_data)

Code

fit |>
  broom::tidy() |>
  janitor::adorn_rounding(5)

Code

fit |>
  broom::glance() |>
  tidyr::pivot_longer(cols = dplyr::everything()) |>
  janitor::adorn_rounding(10)