Appendix A — Chapter 2 supplemental material

You are reading the work-in-progress of this thesis.

This chapter should be readable but is currently undergoing final polishing.

A.1 Base texts

See Vartanian & Pedrazzoli (2017) to view the data questionnaire.

See Roenneberg & Merrow (2006) to view the EUCLOCK Portuguese questionnaire.

See Reis (2020) to learn more about the MCTQPT questionnaire. Note that the text of the MCTQPT was not published in the validation article. To obtain full access to the questionnaire statements, you should contact the main author of the article.

Two control texts were used, one from Andrade (2023) and another from Brecht (2000).

# Statements of the data questionnaire, one element per line. The vector has
# 12 elements, the same length as the other texts defined in this section.
data_text <- c(
  "Você vai para a cama às ___ horas.",
  "Algumas pessoas permanecem um tempo acordadas depois que vão se deitar.",
  "Depois de ir para a cama, você decide dormir às ___ horas.",
  "Você precisa de ___ para dormir.",
  "Você acorda às ___ horas.",
  "Você se levanta ___ depois de despertar.",
  # Elements 7-12 repeat elements 1-6, except for element 8 (see note below).
  "Você vai para a cama às ___ horas.",
  # NOTE(review): empty placeholder at the position where the other
  # questionnaires repeat the "Algumas pessoas..." statement. Presumably it
  # keeps the texts aligned line by line; it is the likely cause of the
  # outlier at position 8 in every line-by-line result. Confirm that the
  # statement is really absent from the source questionnaire.
  "",
  "Depois de ir para a cama, você decide dormir às ___ horas.",
  "Você precisa de ___ para dormir.",
  "Você acorda às ___ horas.",
  "Você se levanta ___ depois de despertar."
)

# Statements of the EUCLOCK Portuguese questionnaire, one element per line
# (12 elements). Capitalization and trailing punctuation are kept exactly as
# written here, because the strings are the input of the comparisons.
euclock_text <- c(
  "vou para a cama às ___ horas.",
  "Algumas pessoas permanecem um tempo acordadas depois que vão se deitar.",
  "às ___ horas, decido dormir.",
  "Eu necessito ___ minutos para adormecer.",
  "acordo às ___ horas,",
  "passados ___ minutos, me levanto.",
  # Elements 7-11 repeat elements 1-5.
  "vou para a cama às ___ horas.",
  "Algumas pessoas permanecem um tempo acordadas depois que vão se deitar.",
  "às ___ horas, decido dormir.",
  "Eu necessito ___ minutos para adormecer.",
  "acordo às ___ horas,",
  # NOTE(review): element 6 ends with "me levanto." while this one ends with
  # "me acordo." -- verify against the source questionnaire whether the
  # difference is intended or a transcription slip.
  "passados ___ minutos, me acordo."
)

# Statements of the MCTQPT questionnaire, one element per line (12 elements).
# The same six statements appear twice in a row, so the vector is built by
# repeating the six-statement section.
mctq_pt_text <- rep(
  c(
    "Vou para a cama às ___ horas.",
    "Algumas pessoas permanecem algum tempo acordadas depois de estarem na cama.",
    "Às ___ horas estou pronto para adormecer.",
    "Necessito de ___ minutos para adormecer.",
    "Acordo às ___ horas.",
    "Após ___ minutos, levanto-me."
  ),
  times = 2L
)

# Control text 1: a 12-line excerpt, the same number of lines as the
# questionnaire texts, so it can be compared with them line by line.
#
# Source: Andrade, T. (2023). Acronomia. In T. Andrade, Tau (chapter 1). Flyve.
control_text_1 <- c(
  "Eles eliminaram o tempo, definitivamente.",
  "Removeram todos os relógios, de parede, de pulso, de bolso...",
  "Talvez esses objetos fossem realmente obsoletos àquela altura",
  "mas sim, foi deliberado: era um projeto mundial.",
  "Mas a situação é bem pior do que parece a princípio.",
  "Não foi apenas qualquer possibilidade de aferição do tempo",
  "exterminaram a própria capacidade de produzi-lo.",
  "Primeiro marcaram o 'Grande dia da entrega'.",
  "Um comboio de carros de lixo passou pelas ruas",
  "recolhendo todos os tipos de relógio",
  "e cronômetro que estavam de posse das pessoas.",
  "De mecanismos empoeirados e engrenagens enferrujadas a dispositivos modernos"
)

# Control text 2: a 12-line excerpt, the same number of lines as the
# questionnaire texts, so it can be compared with them line by line.
#
# Source: Brecht, B. (2000). Quem se defende. In B. Brecht, Poemas 1913-1956
#      (P. C. Souza, Trans.; 5th ed., p. 73). Editora 34.
#
# NOTE(review): some strings end with a trailing space. They are kept as they
# are because the strings are the input of the comparisons; confirm whether
# the trailing spaces are intended.
control_text_2 <- c(
  "Quem se defende porque lhe tiram o ar",
  "Ao lhe apertar a garganta, ",
  "para este há um parágrafo",
  "Que diz: ele agiu em legítima defesa. ", 
  "Mas",
  "O mesmo parágrafo silencia",
  "Quando vocês se defendem porque lhes tiram o pão.",
  "E no entanto morre quem não come, ",
  "e quem não come o suficiente",
  "Morre lentamente. ",
  "Durante os anos todos em que morre",
  "Não lhe é permitido se defender."
)
# Wrap each text in a `TextReuseTextDocument`, the class that
# `text_representation()` requires for its arguments. Each document carries
# an `id` in its metadata.
data_text_textreuse <- data_text |>
  textreuse::TextReuseTextDocument(meta = list(id = "data"))

euclock_text_textreuse <- euclock_text |>
  textreuse::TextReuseTextDocument(meta = list(id = "euclock"))

mctq_pt_text_textreuse <- mctq_pt_text |>
  textreuse::TextReuseTextDocument(meta = list(id = "mctq_pt"))

control_text_1_textreuse <- control_text_1 |>
  textreuse::TextReuseTextDocument(meta = list(id = "control_1"))

control_text_2_textreuse <- control_text_2 |>
  textreuse::TextReuseTextDocument(meta = list(id = "control_2"))
# The embeddings computed by `text_embed()` below use the pretrained model
# `neuralmind/bert-base-portuguese-cased`. See
# <https://huggingface.co/neuralmind/bert-base-portuguese-cased>
# to learn more.

# NOTE(review): presumably stops the script when there is no internet
# connection (the model is referenced by its Hugging Face identifier) --
# confirm. `:::` reaches an internal, non-exported function of `rutils`.
rutils:::assert_internet()

# Compute embeddings for a character vector with `text::textEmbed()`.
#
# Arguments:
#   text:   A character vector; each element is embedded as one text.
#   model:  A string with the identifier of the pretrained model to use.
#           Defaults to the Portuguese BERT model used throughout this
#           appendix.
#   device: A string with the device passed on to `text::textEmbed()`.
#           Defaults to "gpu", as before; pass "cpu" to run on a machine
#           without a GPU.
#
# Returns the value of `text::textEmbed()`. Token embeddings are kept
# (`keep_token_embeddings = TRUE`) and token embeddings are averaged into one
# embedding per text (`aggregation_from_tokens_to_texts = "mean"`).
text_embed <- function(
    text,
    model = "neuralmind/bert-base-portuguese-cased",
    device = "gpu"
) {
  checkmate::assert_character(text)
  checkmate::assert_string(model)
  checkmate::assert_string(device)
  
  text |>
    text::textEmbed(
      model = model,
      layers = -2,
      dim_name = TRUE,
      aggregation_from_layers_to_tokens = "concatenate",
      aggregation_from_tokens_to_texts = "mean",
      aggregation_from_tokens_to_word_types = NULL,
      keep_token_embeddings = TRUE,
      tokens_select = NULL,
      tokens_deselect = NULL,
      decontextualize = FALSE,
      model_max_length = NULL,
      max_token_to_sentence = 4,
      tokenizer_parallelism = FALSE,
      device = device,
      logging_level = "error"
    )
}

# Embed every text with the same settings, so that the resulting embeddings
# can be compared with each other in `text_distance()`.
data_text_textembed <- data_text |> text_embed()
euclock_text_textembed <- euclock_text |> text_embed()
mctq_pt_text_textembed <- mctq_pt_text |> text_embed()
control_text_1_textembed <- control_text_1 |> text_embed()
control_text_2_textembed <- control_text_2 |> text_embed()

A.2 Text similarity

See Wang & Dong (2020) to learn more.

For a quick explanation, see https://youtu.be/e9U0QAFbfLI.

# Print embedding-based similarity scores between two embedded texts.
#
# For each method, the text-level embeddings of `x` and `y` are compared line
# by line with `text::textSimilarity()`. The per-line scores and their mean
# are printed to the console.
#
# Arguments:
#   x, y: Lists of length 2 holding the text-level embeddings at
#         `$texts$texts`, such as the objects returned by `text_embed()`.
#
# Returns (invisibly) a named list with one element per method, each holding
# the per-line scores, so the values can be reused instead of only read from
# the console.
#
# NOTE(review): in the recorded output, "minkowski" reproduces the "euclidean"
# values and "binary" is constant at 1, so these two add no information here.
# The distance-based methods also print negative values; presumably
# `text::textSimilarity()` reports them as 1 - distance -- confirm in its
# documentation.
text_distance <- function(x, y) {
  checkmate::assert_list(x, len = 2)
  checkmate::assert_list(y, len = 2)
  # The body reads `$texts$texts`; fail early with a clear message otherwise.
  checkmate::assert_names(names(x), must.include = "texts")
  checkmate::assert_names(names(y), must.include = "texts")
  
  similarity_methods <- c(
    "binary", "cosine", "canberra", "euclidean", "manhattan", "maximum", 
    "minkowski", "pearson"
  )
  
  scores <- vector("list", length(similarity_methods))
  names(scores) <- similarity_methods
  
  for (method in similarity_methods) {
    cli::cli_alert_info("Method: {.strong {stringr::str_to_title(method)}}")
    
    scores[[method]] <- 
      text::textSimilarity(
        x$texts$texts, 
        y$texts$texts, 
        method = method, 
        center = TRUE, 
        scale = FALSE
      )
    
    cli::cli_bullets(c(">" = "Line by line"))
    print(scores[[method]])
    
    cli::cli_bullets(c(">" = "Overall mean"))
    print(mean(scores[[method]]))
    
    cli::cat_line()
  }
  
  invisible(scores)
}
# Print the Jaccard similarity and the Jaccard bag similarity between two
# documents.
#
# Arguments:
#   x, y: Objects of class `TextReuseTextDocument`.
#
# Each score is printed under a header naming the method, followed by a
# blank line.
text_representation <- function(x, y) {
  checkmate::assert_class(x, "TextReuseTextDocument")
  checkmate::assert_class(y, "TextReuseTextDocument")
  
  cli::cli_alert_info("Method: {.strong Jaccard similarity}")
  jaccard <- textreuse::jaccard_similarity(x, y)
  print(jaccard)
  cli::cat_line()
  
  cli::cli_alert_info("Method: {.strong Jaccard bag similarity}")
  jaccard_bag <- textreuse::jaccard_bag_similarity(x, y)
  print(jaccard_bag)
  cli::cat_line()
}

A.2.1 How similar is the data questionnaire when compared to the EUCLOCK questionnaire?

A.2.1.1 Text distance

# Embedding-based comparison of the data questionnaire with the EUCLOCK
# questionnaire, using every method listed in `text_distance()`.
text_distance(data_text_textembed, euclock_text_textembed)
#> ℹ Method: Binary
#> → Line by line
#>  [1] 1 1 1 1 1 1 1 1 1 1 1 1
#> → Overall mean
#> [1] 1
#> ℹ Method: Cosine
#> → Line by line
#>  [1] 0.9911730 1.0000000 0.9639984 0.9662432 0.9604119 0.9557896 0.9911730
#>  [8] 0.1559853 0.9639984 0.9662432 0.9604119 0.9497428
#> → Overall mean
#> [1] 0.9020976
#> ℹ Method: Canberra
#> → Line by line
#>  [1] -218.3367    1.0000 -335.1895 -318.9419 -335.9000 -373.7989 -218.3367
#>  [8] -642.5522 -335.1895 -318.9419 -335.9000 -381.4067
#> → Overall mean
#> [1] -317.7912
#> ℹ Method: Euclidean
#> → Line by line
#>  [1]  -1.504474   1.000000  -4.058168  -3.976768  -4.144779  -4.705653
#>  [7]  -1.504474 -19.453535  -4.058168  -3.976768  -4.144779  -5.071087
#> → Overall mean
#> [1] -4.633221
#> ℹ Method: Manhattan
#> → Line by line
#>  [1]  -53.58509    1.00000 -108.96057 -105.53439 -111.46668 -124.16400
#>  [7]  -53.58509 -198.94244 -108.96057 -105.53439 -111.46668 -131.16054
#> → Overall mean
#> [1] -101.03
#> ℹ Method: Maximum
#> → Line by line
#>  [1]   0.52944993   1.00000000   0.37664773   0.07405534   0.11978415
#>  [6]   0.31058226   0.52944993 -14.91353795   0.37664773   0.07405534
#> [11]   0.11978415   0.16830817
#> → Overall mean
#> [1] -0.9362311
#> ℹ Method: Minkowski
#> → Line by line
#>  [1]  -1.504474   1.000000  -4.058168  -3.976768  -4.144779  -4.705653
#>  [7]  -1.504474 -19.453535  -4.058168  -3.976768  -4.144779  -5.071087
#> → Overall mean
#> [1] -4.633221
#> ℹ Method: Pearson
#> → Line by line
#>  [1] 0.9911730 1.0000000 0.9639984 0.9662432 0.9604119 0.9557896 0.9911730
#>  [8] 0.1559853 0.9639984 0.9662432 0.9604119 0.9497428
#> → Overall mean
#> [1] 0.9020976

A.2.1.2 Text representation

Note: The maximum value for the Jaccard bag similarity is 0.5.

# Jaccard comparison of the EUCLOCK questionnaire with the data questionnaire.
text_representation(euclock_text_textreuse, data_text_textreuse)
#> ℹ Method: Jaccard similarity
#> [1] 0.2173913
#> ℹ Method: Jaccard bag similarity
#> [1] 0.1446541

A.2.2 How similar is the data questionnaire when compared to the MCTQPT questionnaire?

A.2.2.1 Text distance

# Embedding-based comparison of the data questionnaire with the MCTQPT
# questionnaire, using every method listed in `text_distance()`.
text_distance(data_text_textembed, mctq_pt_text_textembed)
#> ℹ Method: Binary
#> → Line by line
#>  [1] 1 1 1 1 1 1 1 1 1 1 1 1
#> → Overall mean
#> [1] 1
#> ℹ Method: Cosine
#> → Line by line
#>  [1] 0.9901437 0.9898982 0.9687513 0.9575260 0.9882873 0.9598702 0.9901437
#>  [8] 0.1601500 0.9687513 0.9575260 0.9882873 0.9598702
#> → Overall mean
#> [1] 0.9066005
#> ℹ Method: Canberra
#> → Line by line
#>  [1] -227.9938 -247.6044 -335.9297 -349.9493 -225.4809 -353.5263 -227.9938
#>  [8] -631.6228 -335.9297 -349.9493 -225.4809 -353.5263
#> → Overall mean
#> [1] -322.0823
#> ℹ Method: Euclidean
#> → Line by line
#>  [1]  -1.662187  -1.729814  -3.807963  -4.603019  -1.810249  -4.458696
#>  [7]  -1.662187 -19.380367  -3.807963  -4.603019  -1.810249  -4.458696
#> → Overall mean
#> [1] -4.482867
#> ℹ Method: Manhattan
#> → Line by line
#>  [1]  -57.28537  -58.97985 -102.26048 -119.55311  -59.76706 -117.55850
#>  [7]  -57.28537 -193.79964 -102.26048 -119.55311  -59.76706 -117.55850
#> → Overall mean
#> [1] -97.13571
#> ℹ Method: Maximum
#> → Line by line
#>  [1]   0.60554241   0.60396075   0.41795957   0.01856209   0.39220500
#>  [6]   0.28932291   0.60554241 -14.95654231   0.41795957   0.01856209
#> [11]   0.39220500   0.28932291
#> → Overall mean
#> [1] -0.9087831
#> ℹ Method: Minkowski
#> → Line by line
#>  [1]  -1.662187  -1.729814  -3.807963  -4.603019  -1.810249  -4.458696
#>  [7]  -1.662187 -19.380367  -3.807963  -4.603019  -1.810249  -4.458696
#> → Overall mean
#> [1] -4.482867
#> ℹ Method: Pearson
#> → Line by line
#>  [1] 0.9901437 0.9898982 0.9687513 0.9575260 0.9882873 0.9598702 0.9901437
#>  [8] 0.1601500 0.9687513 0.9575260 0.9882873 0.9598702
#> → Overall mean
#> [1] 0.9066005

A.2.2.2 Text representation

Note: The maximum value for the Jaccard bag similarity is 0.5.

# Jaccard comparison of the MCTQPT questionnaire with the data questionnaire.
text_representation(mctq_pt_text_textreuse, data_text_textreuse)
#> ℹ Method: Jaccard similarity
#> [1] 0.1052632
#> ℹ Method: Jaccard bag similarity
#> [1] 0.09815951

A.2.3 How similar is the data questionnaire when compared to the Control Text 1?

A.2.3.1 Text distance

# Embedding-based comparison of the data questionnaire with control text 1,
# using every method listed in `text_distance()`.
text_distance(data_text_textembed, control_text_1_textembed)
#> ℹ Method: Binary
#> → Line by line
#>  [1] 1 1 1 1 1 1 1 1 1 1 1 1
#> → Overall mean
#> [1] 1
#> ℹ Method: Cosine
#> → Line by line
#>  [1] 0.9050224 0.8904954 0.8996243 0.8864538 0.8587490 0.8921112 0.8880434
#>  [8] 0.1887362 0.8882691 0.8727013 0.8732868 0.8433085
#> → Overall mean
#> [1] 0.8239001
#> ℹ Method: Canberra
#> → Line by line
#>  [1] -469.3314 -485.1504 -483.8301 -506.1996 -504.6484 -490.3147 -478.4438
#>  [8] -632.5119 -492.0333 -490.9512 -496.6430 -517.2764
#> → Overall mean
#> [1] -503.9445
#> ℹ Method: Euclidean
#> → Line by line
#>  [1]  -7.299638  -7.859283  -7.547236  -8.207869  -8.889088  -8.131165
#>  [7]  -8.090413 -18.891223  -7.834767  -8.892308  -8.472040  -9.696238
#> → Overall mean
#> [1] -9.150939
#> ℹ Method: Manhattan
#> → Line by line
#>  [1] -174.9785 -191.0842 -182.5955 -197.5195 -212.8954 -196.0854 -196.2141
#>  [8] -181.0755 -192.3436 -212.1597 -198.4904 -233.2716
#> → Overall mean
#> [1] -197.3928
#> ℹ Method: Maximum
#> → Line by line
#>  [1]  -0.1064962  -0.1808311  -0.2100441  -0.3340142  -0.3991810  -0.4304202
#>  [7]  -0.1888392 -15.1616240  -0.4010140  -0.9773587  -0.4897276  -0.6558742
#> → Overall mean
#> [1] -1.627952
#> ℹ Method: Minkowski
#> → Line by line
#>  [1]  -7.299638  -7.859283  -7.547236  -8.207869  -8.889088  -8.131165
#>  [7]  -8.090413 -18.891223  -7.834767  -8.892308  -8.472040  -9.696238
#> → Overall mean
#> [1] -9.150939
#> ℹ Method: Pearson
#> → Line by line
#>  [1] 0.9050224 0.8904954 0.8996243 0.8864538 0.8587490 0.8921112 0.8880434
#>  [8] 0.1887362 0.8882691 0.8727013 0.8732868 0.8433085
#> → Overall mean
#> [1] 0.8239001

A.2.3.2 Text representation

# Jaccard comparison of control text 1 with the data questionnaire.
text_representation(control_text_1_textreuse, data_text_textreuse)
#> ℹ Method: Jaccard similarity
#> [1] 0
#> ℹ Method: Jaccard bag similarity
#> [1] 0

A.2.4 How similar is the data questionnaire when compared to the Control Text 2?

A.2.4.1 Text distance

# Embedding-based comparison of the data questionnaire with control text 2,
# using every method listed in `text_distance()`.
text_distance(data_text_textembed, control_text_2_textembed)
#> ℹ Method: Binary
#> → Line by line
#>  [1] 1 1 1 1 1 1 1 1 1 1 1 1
#> → Overall mean
#> [1] 1
#> ℹ Method: Cosine
#> → Line by line
#>  [1] 0.9060095 0.9037570 0.8737437 0.8856842 0.7641104 0.9035066 0.9119419
#>  [8] 0.2057337 0.8851267 0.9036579 0.8678450 0.9198303
#> → Overall mean
#> [1] 0.8275789
#> ℹ Method: Canberra
#> → Line by line
#>  [1] -464.0989 -465.0249 -497.9481 -493.0060 -520.5817 -473.6470 -454.5072
#>  [8] -627.1888 -485.4858 -469.6126 -488.5316 -461.6104
#> → Overall mean
#> [1] -491.7702
#> ℹ Method: Euclidean
#> → Line by line
#>  [1]  -7.178356  -7.285667  -9.025508  -8.001721 -10.985822  -7.356390
#>  [7]  -6.953186 -18.747793  -7.828154  -7.079656  -8.591087  -6.716923
#> → Overall mean
#> [1] -8.812522
#> ℹ Method: Manhattan
#> → Line by line
#>  [1] -173.8983 -180.4570 -213.3055 -192.0396 -188.6964 -182.8793 -171.7383
#>  [8] -174.9841 -188.5230 -170.6919 -210.6607 -166.2871
#> → Overall mean
#> [1] -184.5134
#> ℹ Method: Maximum
#> → Line by line
#>  [1]  -0.39784696  -0.04475208  -0.72665224  -0.28434217  -5.73594077
#>  [6]  -0.20936910   0.05190219 -15.30314612  -0.55310996  -0.52536512
#> [11]  -0.10743206  -0.14595067
#> → Overall mean
#> [1] -1.9985
#> ℹ Method: Minkowski
#> → Line by line
#>  [1]  -7.178356  -7.285667  -9.025508  -8.001721 -10.985822  -7.356390
#>  [7]  -6.953186 -18.747793  -7.828154  -7.079656  -8.591087  -6.716923
#> → Overall mean
#> [1] -8.812522
#> ℹ Method: Pearson
#> → Line by line
#>  [1] 0.9060095 0.9037570 0.8737437 0.8856842 0.7641104 0.9035066 0.9119419
#>  [8] 0.2057337 0.8851267 0.9036579 0.8678450 0.9198303
#> → Overall mean
#> [1] 0.8275789

A.2.4.2 Text representation

# Jaccard comparison of control text 2 with the data questionnaire.
text_representation(control_text_2_textreuse, data_text_textreuse)
#> ℹ Method: Jaccard similarity
#> [1] 0
#> ℹ Method: Jaccard bag similarity
#> [1] 0