From 75730d33106af885720d031456587ee95aa9493d Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Fri, 19 Sep 2025 16:09:41 +0200 Subject: [PATCH 01/17] =?UTF-8?q?fix:=20pbm=20pas=20li=C3=A9=20aux=20indic?= =?UTF-8?q?ateurs,=20ajout=20des=20tableaux=20qui=20ne=20sont=20pas=20incl?= =?UTF-8?q?us=20les=20uns=20dans=20les=20autres=20dans=20le=20cluster,=20a?= =?UTF-8?q?vant=20ils=20disparaissaient?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/grp_tab_in_cluster.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/grp_tab_in_cluster.R b/R/grp_tab_in_cluster.R index 16bd9e8..77b053b 100644 --- a/R/grp_tab_in_cluster.R +++ b/R/grp_tab_in_cluster.R @@ -53,7 +53,7 @@ grp_tab_in_cluster <- function(list_split, list_translation_tables) { # Nest each cluster of tables by `table_name` nested_crois <- list_split %>% - purrr::map(function(tab) { tab %>% dplyr::group_by(table_name) %>% tidyr::nest() }) + purrr::map(function(tab) {tab %>% dplyr::group_by(table_name) %>% tidyr::nest() }) # Process each cluster using inclusion relationships purrr::map2(list_translation_tables, nested_crois, function(tab_to_keep, big_tibble) { @@ -84,6 +84,12 @@ grp_tab_in_cluster <- function(list_split, list_translation_tables) { ) }) ) %>% dplyr::rename(table_name = table_eg) + # adding the tables that are not included in each other in the cluster + if(length(big_tibble$table_name) != length(tab_to_keep$passage_nom_tab$Original)){ + tables_no_inclusion <- big_tibble %>% + filter(table_name %in% setdiff(big_tibble$table_name,tab_to_keep$passage_nom_tab$Original)) + big_tibble_eg <- bind_rows(tables_no_inclusion,big_tibble_eg) + } return(big_tibble_eg) } else { From 5d374492fea41301280dd564d6a5279cd2323e52 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Mon, 22 Sep 2025 09:45:11 +0200 Subject: [PATCH 02/17] fix: tab non incluses ajout spanning --- R/grp_tab_in_cluster.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/grp_tab_in_cluster.R b/R/grp_tab_in_cluster.R index 77b053b..92cc552 100644 --- a/R/grp_tab_in_cluster.R +++ b/R/grp_tab_in_cluster.R @@ -66,7 +66,7 @@ grp_tab_in_cluster <- function(list_split, list_translation_tables) { # Process the cluster big_tibble_eg <- big_tibble %>% mutate( - spanning = map(data, function(small_tibble) { small_tibble$spanning }) + spanning = map(data, function(small_tibble) {small_tibble$spanning}) ) %>% dplyr::left_join(tab_to_keep[[2]], by = c("table_name" = "Original")) %>% mutate(table_eg = ifelse(is.na(Group), table_name, Group)) %>% @@ -87,7 +87,8 @@ grp_tab_in_cluster <- function(list_split, list_translation_tables) { # adding the tables that are not included in each other in the cluster if(length(big_tibble$table_name) != length(tab_to_keep$passage_nom_tab$Original)){ tables_no_inclusion <- big_tibble %>% - filter(table_name %in% setdiff(big_tibble$table_name,tab_to_keep$passage_nom_tab$Original)) + filter(table_name %in% setdiff(big_tibble$table_name,tab_to_keep$passage_nom_tab$Original)) %>% + mutate(spanning = map(data, function(small_tibble) {small_tibble$spanning})) big_tibble_eg <- bind_rows(tables_no_inclusion,big_tibble_eg) } From 276becc16c7ccfb50316d2150083cda1883b61c5 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Mon, 22 Sep 2025 09:47:31 +0200 Subject: [PATCH 03/17] =?UTF-8?q?feat:=20traitement=20=C3=A9quations=20sur?= =?UTF-8?q?=20les=20indicateurs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/analyse_metadata.R | 11 +++- R/create_edges.R | 21 +++++++- R/identify_hrc.R | 2 +- R/identify_hrc_with_eq.R | 107 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 136 insertions(+), 5 deletions(-) create mode 100644 R/identify_hrc_with_eq.R diff --git a/R/analyse_metadata.R b/R/analyse_metadata.R index 46b52c1..af2a7c9 100644 --- a/R/analyse_metadata.R +++ b/R/analyse_metadata.R @@ -53,7 +53,7 @@ #' `r lifecycle::badge("experimental")` #' #' @export -analyse_metadata <- function(df_metadata,verbose = FALSE){ +analyse_metadata <- function(df_metadata,df_eq_indicator = NULL,verbose = FALSE){ # check that the input is in the right format: right column names check_column_names <- function(df) { # Expected fixed column names @@ -102,7 +102,14 @@ analyse_metadata <- function(df_metadata,verbose = FALSE){ # start of the actual analysis df_metadata_long <- wide_to_long(df_metadata) - list_hrc_identified <- identify_hrc(df_metadata_long) + if(is.null(df_eq_indicator)){ + list_hrc_identified <- identify_hrc(df_metadata_long) + }else{ + warning("The hrc_indicator column will be ignored. All links between indicators + must be specified in a dataframe (df_eq_indicator).") + list_hrc_identified <- identify_hrc_with_eq(df_metadata_long,df_eq_indicator) + } + list_split <- split_in_clusters(list_hrc_identified) list_desc_links <- create_edges(list_split) list_translation_tables <- grp_tab_names(list_desc_links) diff --git a/R/create_edges.R b/R/create_edges.R index 1464e7e..3a0a3ef 100644 --- a/R/create_edges.R +++ b/R/create_edges.R @@ -60,23 +60,40 @@ create_edges <- function(list_split){ # Condition for clusters that only have one table if(length(big_tibble$table_name) > 1) { spannings_nom_tab <- combn(big_tibble$table_name, 2, FUN = list) - tab_to_keep <- map(spannings_nom_tab, function(crois) { + # Extract the first table (Ta) corresponding to the first table of the + # pair of tables we are studying Ta <- big_tibble %>% filter(table_name == crois[[1]]) + # Get the 'spanning' column for Ta crois_Ta <- Ta$spanning[[1]] + + # Extract the first table (Tb) corresponding to the second table of the + # pair of tables we are studying Tb <- big_tibble %>% filter(table_name == crois[[2]]) + # Get the 'spanning' column for Tb crois_Tb <- Tb$spanning[[1]] + + # Create an empty data frame to store origin → destination relationships df_origin_dest <- data.frame(from = character(), to = character(), stringsAsFactors = FALSE) + + # Check if all elements of Ta are included in Tb if(all(crois_Ta %in% crois_Tb)) { + # Add a row indicating that Ta can go to Tb df_origin_dest <- rbind(df_origin_dest, - data.frame(from = crois[[1]], to = crois[[2]],stringsAsFactors = FALSE)) + data.frame(from = crois[[1]], to = crois[[2]], stringsAsFactors = FALSE)) } + + # Check if all elements of Tb are included in Ta if(all(crois_Tb %in% crois_Ta)) { + # Add a row indicating that Tb can go to Ta df_origin_dest <- rbind(df_origin_dest, data.frame(from = crois[[2]], to = crois[[1]], stringsAsFactors = FALSE)) } + + # Return the data frame containing possible links return(df_origin_dest) }) + # Filter non empty tables and combine them tab_to_keep_compact <- tab_to_keep %>% purrr::discard(is.null) %>% diff --git a/R/identify_hrc.R b/R/identify_hrc.R index 2ecd0dd..5638ebc 100644 --- a/R/identify_hrc.R +++ b/R/identify_hrc.R @@ -48,7 +48,7 @@ identify_hrc <- function(df_metadata_long){ table_name = df_spannings$table_name ) %>% unique() df_spannings <- df_spannings %>% select(-spanning_old) - if(all(is.na(df_spannings$hrc_indicator))){ # condition pour les hiérarchies sur les indicateurs + if(all(is.na(df_spannings$hrc_indicator))){ # condition for hierarchies on indicators df_indicators <- df_spannings return(list(df_indicators,df_variable_info)) } else { diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R new file mode 100644 index 0000000..b300031 --- /dev/null +++ b/R/identify_hrc_with_eq.R @@ -0,0 +1,107 @@ +#' Rename variables based on their hierarchies and their equations +#' +#' This function renames variables in a long-format metadata data frame based on +#' their hierarchical groupings. Spanning variables are renamed using the name +#' of their hierarchy in uppercase, while response variables linked by an +#' equation (specified in the `hrc_indicator` column) are grouped together, and +#' a new grouping variable is created. +#' +#' @param df_metadata_long A data frame in long format with the following +#' required columns: +#' - `table_name`: Identifies the table. +#' - `field` : name of the field of the table. +#' - `indicator`: name of the indicator of the table. +#' - `hrc_indicator`: Specifies linked response variables. +#' - `spanning_*`, `hrc_spanning_*`: Spanning variables and their hierarchies. +#' +#' @return `list_hrc_identified`, a list with two elements: +#' - `df_indicators`: The updated data frame with renamed variables and grouped +#' response variables. +#' - `df_variable_info`: A data frame mapping original variable names +#' (`spanning_old`) to their renamed counterparts (`spanning`). +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' data(metadata_pizza_lettuce) +#' +#' metadata_pizza_lettuce_long <- wide_to_long(metadata_pizza_lettuce) +#' +#' list_hrc_identified <- identify_hrc(metadata_pizza_lettuce_long) +#' +#' str(list_hrc_identified) +#' } +#' +identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ + # check that the input is in the right format: right column names + check_column_names <- function(df) { + # Expected fixed column names + fixed_columns <- c("eq_name", "eq_indicator", "unit") + + # Check that the fixed columns exist + if (!all(fixed_columns %in% names(df))) { + stop("Error: The dataframe describing the equations between indicators is + missing one or more required columns: eq_name, eq_indicator, unit.") + }} + check_column_names(df_eq_indicator) + + # parse the equations + parsed_equations <- df_eq_indicator %>% + tidyr::separate(eq_indicator, into = c("total", "rhs"), sep = "=", extra = "merge") %>% + dplyr::mutate(rhs = stringr::str_trim(rhs)) %>% + tidyr::separate_rows(rhs, sep = "\\+") %>% + dplyr::mutate(rhs = stringr::str_trim(rhs)) %>% + dplyr::group_by(dplyr::across(-rhs)) %>% + dplyr::mutate(term_number = paste0("rhs", dplyr::row_number())) %>% + tidyr::pivot_wider(names_from = term_number, values_from = rhs) %>% + dplyr::ungroup() %>% + dplyr::select(eq_name,unit,total,everything()) + + # change to long format in order to join with df_metadata_long + equations_long <- parsed_equations %>% + mutate(across(c(total, starts_with("rhs")), trimws)) %>% + tidyr::pivot_longer( + cols = c(total, starts_with("rhs")), + names_to = "side", # côté équation (total / rhs1 / rhs2...) + values_to = "var" + ) %>% + filter(!is.na(var)) + + df_spannings <- df_metadata_long %>% + mutate(spanning_old = spanning) %>% + mutate(spanning = ifelse(is.na(hrc_spanning), + spanning, + toupper(hrc_spanning))) %>% + mutate(indicator = ifelse(is.na(hrc_indicator), + indicator, + toupper(sub("hrc_","",hrc_indicator)))) + df_variable_info <- data.frame( + var_start_name = df_spannings$spanning_old, + var_end_name = df_spannings$spanning, + table_name = df_spannings$table_name + ) %>% unique() + df_spannings <- df_spannings %>% select(-spanning_old) + + df_indicators <- df_spannings %>% + # delete all the non-word elements, specifically for the white spaces + mutate(across(where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% + left_join(equations_long, by = c("indicator" = "var")) %>% + filter(!is.na(eq_name)) %>% + dplyr::group_by(table_name) %>% + summarise( + field = last(field), + hrc_field = last(hrc_field), + spanning = paste0(toupper(last(eq_name)),"^h"), + hrc_spanning = paste0("hrc_",last(eq_name)), + indicator = last(unit), + hrc_indicator = last(hrc_indicator) + ) %>% + bind_rows(df_spannings, .) %>% + group_by(table_name) %>% + mutate(indicator = last(indicator)) %>% + ungroup() %>% + arrange(table_name) + list_hrc_identified = list(df_indicators,df_variable_info) + return(list_hrc_identified) +} From d12a8e1bcdeab7887bb4d6caf216a03c1d5255b4 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Wed, 24 Sep 2025 16:46:09 +0200 Subject: [PATCH 04/17] =?UTF-8?q?feat:=20modif=20traitement=20eq=20sur=20i?= =?UTF-8?q?ndicateurs=20pour=20regrouper=20plus=20t=C3=B4t=20les=20tableau?= =?UTF-8?q?x=20afin=20d'avoir=20les=20bons=20tab=5Fname=20=3D>=20=C3=A0=20?= =?UTF-8?q?=C3=A9prouver=20avec=20plusieurs=20exemples=20(attention=20pas?= =?UTF-8?q?=20cas=20o=C3=B9=20indicator=20m=C3=AAme=20eq=20diff=C3=A9rente?= =?UTF-8?q?s=20variables=20de=20croisement)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 62 +++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 10 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index b300031..a68680d 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -46,6 +46,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ }} check_column_names(df_eq_indicator) + # browser() # parse the equations parsed_equations <- df_eq_indicator %>% tidyr::separate(eq_indicator, into = c("total", "rhs"), sep = "=", extra = "merge") %>% @@ -83,25 +84,66 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% unique() df_spannings <- df_spannings %>% select(-spanning_old) - df_indicators <- df_spannings %>% + # df_indicators <- df_spannings %>% + # # delete all the non-word elements, specifically for the white spaces + # mutate(across(where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% + # left_join(equations_long, by = c("indicator" = "var")) %>% + # filter(!is.na(eq_name)) %>% + # dplyr::group_by(table_name) %>% + # summarise( + # field = last(field), + # hrc_field = last(hrc_field), + # spanning = paste0(toupper(last(eq_name)),"^h"), + # hrc_spanning = paste0("hrc_",last(eq_name)), + # indicator = last(unit), + # hrc_indicator = last(hrc_indicator) + # ) %>% + # bind_rows(df_spannings, .) %>% + # group_by(table_name) %>% + # mutate(indicator = last(indicator)) %>% + # ungroup() %>% + # arrange(table_name) + + df_spannings_eq <- df_spannings %>% # delete all the non-word elements, specifically for the white spaces mutate(across(where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% - left_join(equations_long, by = c("indicator" = "var")) %>% + left_join(equations_long, by = c("indicator" = "var")) + + df_initial_spannings <- df_spannings_eq %>% filter(!is.na(eq_name)) %>% - dplyr::group_by(table_name) %>% + group_by(eq_name) %>% summarise( + table_name = paste(table_name, collapse = "."), field = last(field), hrc_field = last(hrc_field), - spanning = paste0(toupper(last(eq_name)),"^h"), - hrc_spanning = paste0("hrc_",last(eq_name)), + spanning = last(spanning), + hrc_spanning = last(hrc_spanning), indicator = last(unit), - hrc_indicator = last(hrc_indicator) + hrc_indicator = last(hrc_indicator), + .groups = "drop" ) %>% - bind_rows(df_spannings, .) %>% - group_by(table_name) %>% - mutate(indicator = last(indicator)) %>% - ungroup() %>% + select(-eq_name) + + df_indicator_spannings <- df_spannings_eq %>% + filter(!is.na(eq_name)) %>% + group_by(eq_name) %>% + summarise( + table_name = paste(table_name, collapse = "."), + field = last(field), + hrc_field = last(hrc_field), + spanning = paste0(toupper(last(eq_name)), "^h"), + hrc_spanning = paste0("hrc_", last(eq_name)), + indicator = last(unit), + hrc_indicator = last(hrc_indicator), + .groups = "drop" + ) %>% + select(-eq_name) + + # browser() + df_indicators <- bind_rows(df_initial_spannings,df_indicator_spannings) %>% + select(table_name,field,hrc_field,indicator,hrc_indicator,everything()) %>% arrange(table_name) + list_hrc_identified = list(df_indicators,df_variable_info) return(list_hrc_identified) } From c9d300c5929abf1cdabcd65382344e06a0d5b3b4 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Wed, 24 Sep 2025 16:48:02 +0200 Subject: [PATCH 05/17] test: warnings et noms colonnes df_eq_indicator --- tests/testthat/test_analyse_metadata.R | 41 +++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test_analyse_metadata.R b/tests/testthat/test_analyse_metadata.R index c699ae2..3da0d29 100644 --- a/tests/testthat/test_analyse_metadata.R +++ b/tests/testthat/test_analyse_metadata.R @@ -1,16 +1,31 @@ # test data -------------------------------------------------------------------- data(metadata_pizza_lettuce) - baseline_res <- analyse_metadata(metadata_pizza_lettuce) +df_eq_ex <- data.frame( + eq_name = c("eq1", "eq2", "eq3", "eq4"), + eq_indicator = c("A = B + C", "A = D + E", "D = F + G", "Z = X + Y + S"), + unit = c("EUR", "EUR", "EUR", "EUR"), + stringsAsFactors = FALSE +) +df_meta_for_eq <- data.frame( + table_name = c("Ta","Tb","Tc","Td","Te","Tf","Tg","Tz","Tx","Ty","Ts"), + field = NA, + hrc_field = NA, + indicator = c("A","B","C","D","E","F","G","Z","X","Y","S"), + hrc_indicator = NA, + spanning_1 = "age_class", + hrc_spanning_1 = NA +) + ################################################################### INPUT CHECKS # check that an error is returned if the names of the columns don't respect the # expected format -------------------------------------------------------------- test_that("error message for wrong column name - fixed columns",{ meta <- metadata_pizza_lettuce %>% rename(table = table_name) - expect_error(analyse_metadata(meta),"The dataframe is missing one or more required columns") + expect_error(analyse_metadata(meta),"one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator") }) test_that("error message for wrong column name - dynamic columns",{ @@ -28,7 +43,6 @@ test_that("unique table name", { expect_error(analyse_metadata(meta),"Duplicate values found in 'table_name'") }) - # check that each table is named (table_name column) --------------------------- test_that("error message when some tables are not named", { @@ -38,6 +52,25 @@ test_that("error message when some tables are not named", { } ) +# check column names of df_eq_indicator +test_that("error message for wrong column name - df_eq_indicator", { + df_eq_ex_modif <- df_eq_ex %>% rename(eq_indicators = eq_indicator) + + expect_error( + suppressWarnings(analyse_metadata(df_meta_for_eq, df_eq_ex_modif)), + "one or more required columns: eq_name, eq_indicator, unit" + ) +}) + +# check that the user will be warned the hrc_indicator column will be ignored if +# df_eq_indicator is used ----------------------------------------------------- +test_that("warning when hrc_indicator is ignored because df_eq_indicator used", { + expect_warning( + analyse_metadata(df_meta_for_eq, df_eq_ex), + "The hrc_indicator column will be ignored" + ) +}) + ##################################################################### HRC CHECKS # check that hierarchies on indicators are handled properly -------------------- answer <- data.frame( @@ -97,7 +130,7 @@ answer <- data.frame( test_that("one-way table inclusion", { meta <- metadata_pizza_lettuce %>% filter(table_name == "T1") %>% - mutate(hrc_spanning_1 = as.character(NA)) %>% + mutate(hrc_spanning_1 = as.character(NA)) %>% bind_rows(.,.) %>% mutate( table_name = c("T1","T1_bis"), From 9e97f6e36e0918018d16dff73b54e899755446d1 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Thu, 25 Sep 2025 16:33:33 +0200 Subject: [PATCH 06/17] =?UTF-8?q?feat:=20gestion=20des=20indicateurs=20qui?= =?UTF-8?q?=20ne=20sont=20pas=20dans=20les=20=C3=A9quations=20m=C3=AAme=20?= =?UTF-8?q?quand=20certaines=20=C3=A9quations=20sont=20pr=C3=A9cis=C3=A9es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/analyse_metadata.R | 4 ++-- R/create_edges.R | 2 +- R/identify_hrc_with_eq.R | 32 +++++++++++++++++++++++++------- 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/R/analyse_metadata.R b/R/analyse_metadata.R index af2a7c9..137ac63 100644 --- a/R/analyse_metadata.R +++ b/R/analyse_metadata.R @@ -105,8 +105,8 @@ analyse_metadata <- function(df_metadata,df_eq_indicator = NULL,verbose = FALSE) if(is.null(df_eq_indicator)){ list_hrc_identified <- identify_hrc(df_metadata_long) }else{ - warning("The hrc_indicator column will be ignored. All links between indicators - must be specified in a dataframe (df_eq_indicator).") + warning("For the variables part of equations specified in df_eq_indicator, + the hrc_indicator column will be ignored.") list_hrc_identified <- identify_hrc_with_eq(df_metadata_long,df_eq_indicator) } diff --git a/R/create_edges.R b/R/create_edges.R index 3a0a3ef..acbbb26 100644 --- a/R/create_edges.R +++ b/R/create_edges.R @@ -57,7 +57,7 @@ create_edges <- function(list_split){ small_tibble$spanning})) }) list_desc_links <- nested_crois %>% map(function(big_tibble) { - # Condition for clusters that only have one table + # Only compare the spanning variables for clusters that have more than 1 table if(length(big_tibble$table_name) > 1) { spannings_nom_tab <- combn(big_tibble$table_name, 2, FUN = list) tab_to_keep <- map(spannings_nom_tab, function(crois) { diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index a68680d..15949b4 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -46,7 +46,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ }} check_column_names(df_eq_indicator) - # browser() # parse the equations parsed_equations <- df_eq_indicator %>% tidyr::separate(eq_indicator, into = c("total", "rhs"), sep = "=", extra = "merge") %>% @@ -109,7 +108,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ mutate(across(where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% left_join(equations_long, by = c("indicator" = "var")) - df_initial_spannings <- df_spannings_eq %>% + df_eq_initial_spannings <- df_spannings_eq %>% filter(!is.na(eq_name)) %>% group_by(eq_name) %>% summarise( @@ -124,7 +123,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% select(-eq_name) - df_indicator_spannings <- df_spannings_eq %>% + df_eq_indicator_spannings <- df_spannings_eq %>% filter(!is.na(eq_name)) %>% group_by(eq_name) %>% summarise( @@ -139,11 +138,30 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% select(-eq_name) - # browser() - df_indicators <- bind_rows(df_initial_spannings,df_indicator_spannings) %>% + df_indicators <- bind_rows(df_eq_initial_spannings,df_eq_indicator_spannings) %>% select(table_name,field,hrc_field,indicator,hrc_indicator,everything()) %>% arrange(table_name) - list_hrc_identified = list(df_indicators,df_variable_info) - return(list_hrc_identified) + df_no_eq_spannings <- df_spannings_eq %>% filter(is.na(eq_name)) + if(all(is.na(df_no_eq_spannings$hrc_indicator)) & nrow(df_no_eq_spannings) > 0){ + df_indicators <- bind_rows(df_indicators,df_no_eq_spannings) %>% arrange(table_name) + return(list(df_indicators,df_variable_info)) + } else { + df_no_eq_indicators <- df_no_eq_spannings %>% + filter(!is.na(hrc_indicator)) %>% + dplyr::group_by(table_name) %>% + summarise( + field = last(field), + hrc_field = last(hrc_field), + spanning = paste0(toupper(last(hrc_indicator)),"^h"), + hrc_spanning = last(hrc_indicator), + indicator = last(indicator), + hrc_indicator = last(hrc_indicator) + ) %>% + bind_rows(df_spannings, .) %>% + arrange(table_name) + df_indicators <- bind_rows(df_indicators,df_no_eq_indicators) %>% arrange(table_name) + list_hrc_identified = list(df_indicators,df_variable_info) + return(list_hrc_identified) + } } From 29158b9405467f1cb16be5b676161b961f662b4e Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Tue, 21 Oct 2025 11:12:06 +0200 Subject: [PATCH 07/17] docs: correction faute orthographe --- R/format_template.R | 2 +- R/identify_hrc_with_eq.R | 41 +++++++++++++++++++++++----------------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/R/format_template.R b/R/format_template.R index 06f73f1..d41bbf5 100644 --- a/R/format_template.R +++ b/R/format_template.R @@ -94,7 +94,7 @@ filter_on_marginal_of_spanning_var <- function(data, criteria, subset_keys) { #' @param data template gathering all the published cells #' @param indicator_column name of the column in which the indicators are #' @param spanning_var_tot a named list of the spanning variables and their totals -#' @param field_columns vecotr of all the columns that are fields (ex: year of collect) +#' @param field_columns vector of all the columns that are fields (ex: year of collect) #' #' @return named list of a dataframe describing the tables (metadata) and a list of #' the modalities of each hierarchical variable (modalities) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 15949b4..d716a68 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -142,25 +142,32 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ select(table_name,field,hrc_field,indicator,hrc_indicator,everything()) %>% arrange(table_name) + browser() df_no_eq_spannings <- df_spannings_eq %>% filter(is.na(eq_name)) - if(all(is.na(df_no_eq_spannings$hrc_indicator)) & nrow(df_no_eq_spannings) > 0){ - df_indicators <- bind_rows(df_indicators,df_no_eq_spannings) %>% arrange(table_name) - return(list(df_indicators,df_variable_info)) + + if(nrow(df_no_eq_spannings) > 0){ + if(all(is.na(df_no_eq_spannings$hrc_indicator))){ + df_indicators <- bind_rows(df_indicators,df_no_eq_spannings) %>% arrange(table_name) + return(list(df_indicators,df_variable_info)) + } else { + df_no_eq_indicators <- df_no_eq_spannings %>% + filter(!is.na(hrc_indicator)) %>% + dplyr::group_by(table_name) %>% + summarise( + field = last(field), + hrc_field = last(hrc_field), + spanning = paste0(toupper(last(hrc_indicator)),"^h"), + hrc_spanning = last(hrc_indicator), + indicator = last(indicator), + hrc_indicator = last(hrc_indicator) + ) %>% + bind_rows(df_spannings, .) %>% + arrange(table_name) + df_indicators <- bind_rows(df_indicators,df_no_eq_indicators) %>% arrange(table_name) + list_hrc_identified = list(df_indicators,df_variable_info) + return(list_hrc_identified) + } } else { - df_no_eq_indicators <- df_no_eq_spannings %>% - filter(!is.na(hrc_indicator)) %>% - dplyr::group_by(table_name) %>% - summarise( - field = last(field), - hrc_field = last(hrc_field), - spanning = paste0(toupper(last(hrc_indicator)),"^h"), - hrc_spanning = last(hrc_indicator), - indicator = last(indicator), - hrc_indicator = last(hrc_indicator) - ) %>% - bind_rows(df_spannings, .) %>% - arrange(table_name) - df_indicators <- bind_rows(df_indicators,df_no_eq_indicators) %>% arrange(table_name) list_hrc_identified = list(df_indicators,df_variable_info) return(list_hrc_identified) } From fcca6cc19beb441f99380f2cb1dcdbd78d43364f Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Mon, 27 Oct 2025 15:09:51 +0100 Subject: [PATCH 08/17] =?UTF-8?q?feat:=20format=5Ftemplate()=20prend=20en?= =?UTF-8?q?=20compte=20la=20possibilit=C3=A9=20que=20toutes=20les=20variab?= =?UTF-8?q?les=20de=20croisements=20soient=20crois=C3=A9es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/format_template.R | 24 ++++++++++++++++++------ tests/testthat/test_format_template.R | 26 ++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/R/format_template.R b/R/format_template.R index d41bbf5..ba48d0c 100644 --- a/R/format_template.R +++ b/R/format_template.R @@ -78,15 +78,27 @@ filter_on_marginal_of_spanning_var <- function(data, criteria, subset_keys) { ) # Create filter expressions for all other keys with != other_keys <- setdiff(names(criteria), subset_keys) - filter_expr_not_in <- purrr::map2( - other_keys, - criteria[other_keys], - ~ rlang::expr(!!rlang::sym(.x) != !!.y) - ) + if(all(other_keys == "")){ + # treat the eventuality of all spanning variables being crossed on their + # non total values + not_totals_all_spannings <- purrr::map2( + subset_keys, + criteria[subset_keys], + ~ rlang::expr(!!rlang::sym(.x) != !!.y) + ) + return(data %>% filter(!!!not_totals_all_spannings)) + } else { + filter_expr_not_in <- purrr::map2( + other_keys, + criteria[other_keys], + ~ rlang::expr(!!rlang::sym(.x) != !!.y) + ) + } + # Combine the two sets of expressions combined_filter_expr <- c(filter_expr_in, filter_expr_not_in) # Apply the combined filter - data %>% filter(!!!combined_filter_expr) %>% select(-!!subset_keys) + return(data %>% filter(!!!combined_filter_expr) %>% select(-!!subset_keys)) } #' Determines the tables described in a template gathering all the published cells diff --git a/tests/testthat/test_format_template.R b/tests/testthat/test_format_template.R index 8d0b792..86b40aa 100644 --- a/tests/testthat/test_format_template.R +++ b/tests/testthat/test_format_template.R @@ -47,3 +47,29 @@ test_that("only one spanning variable", { expect_equal(length(spanning_vars), 1) }) +# check that it takes into acocunt tables that cross all spanning variables +test_that("only one spanning variable", { + # add lines where all spanning variables are crossed + template <- enterprise_template %>% + # We're only doing it for the "B" and NUMBER_EMPL crossings + filter(ACTIVITY == "B", NUMBER_EMPL != "_T") %>% + select(-LEGAL_FORM) %>% + tidyr::crossing(LEGAL_FORM = c("LL", "PA", "SP")) %>% + bind_rows(enterprise_template) + + result <- format_template( + data = template, + indicator_column = "INDICATOR", + spanning_var_tot = list(ACTIVITY = "BTSXO_S94", NUMBER_EMPL = "_T", LEGAL_FORM = "_T"), + field_columns = c("TIME_PERIOD") + )$metadata + + # select all the columns starting with "spanning_" + spanning_vars <- grep("^spanning_", names(result), value = TRUE) + + # check there is three variables + expect_equal(length(spanning_vars), 3) +}) + + + From 8aaf439ce9fdf43dd8838ace6dc11187789e7cb3 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Wed, 5 Nov 2025 11:43:43 +0100 Subject: [PATCH 09/17] =?UTF-8?q?doc:=20identify=5Fhrc=5Fwith=5Feq()=20pou?= =?UTF-8?q?r=20meilleur=20compr=C3=A9hension?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 49 +++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index d716a68..2214d16 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -46,7 +46,9 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ }} check_column_names(df_eq_indicator) - # parse the equations + # 'parsed_equations' is a data frame where each equation from 'df_eq_indicator' + # is parsed into its left-hand side ('total') and right-hand side terms ('rhs1', 'rhs2', etc.), + # with each rhs term placed in a separate column. parsed_equations <- df_eq_indicator %>% tidyr::separate(eq_indicator, into = c("total", "rhs"), sep = "=", extra = "merge") %>% dplyr::mutate(rhs = stringr::str_trim(rhs)) %>% @@ -56,7 +58,9 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::mutate(term_number = paste0("rhs", dplyr::row_number())) %>% tidyr::pivot_wider(names_from = term_number, values_from = rhs) %>% dplyr::ungroup() %>% - dplyr::select(eq_name,unit,total,everything()) + dplyr::select(eq_name, unit, total, everything()) + + browser() # change to long format in order to join with df_metadata_long equations_long <- parsed_equations %>% @@ -68,6 +72,10 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% filter(!is.na(var)) + # 'df_spannings' is a modified version of 'df_metadata_long' where: + # - 'spanning' is replaced by its uppercase hierarchical version if available, + # - 'indicator' is replaced by its uppercase hierarchical version + # (without the 'hrc_' prefix) if available. df_spannings <- df_metadata_long %>% mutate(spanning_old = spanning) %>% mutate(spanning = ifelse(is.na(hrc_spanning), @@ -76,38 +84,26 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ mutate(indicator = ifelse(is.na(hrc_indicator), indicator, toupper(sub("hrc_","",hrc_indicator)))) + + # 'df_variable_info' is a reference table linking original spanning names ('spanning_old') + # to their transformed counterparts ('spanning'), along with the corresponding table name. df_variable_info <- data.frame( var_start_name = df_spannings$spanning_old, var_end_name = df_spannings$spanning, table_name = df_spannings$table_name ) %>% unique() - df_spannings <- df_spannings %>% select(-spanning_old) - # df_indicators <- df_spannings %>% - # # delete all the non-word elements, specifically for the white spaces - # mutate(across(where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% - # left_join(equations_long, by = c("indicator" = "var")) %>% - # filter(!is.na(eq_name)) %>% - # dplyr::group_by(table_name) %>% - # summarise( - # field = last(field), - # hrc_field = last(hrc_field), - # spanning = paste0(toupper(last(eq_name)),"^h"), - # hrc_spanning = paste0("hrc_",last(eq_name)), - # indicator = last(unit), - # hrc_indicator = last(hrc_indicator) - # ) %>% - # bind_rows(df_spannings, .) %>% - # group_by(table_name) %>% - # mutate(indicator = last(indicator)) %>% - # ungroup() %>% - # arrange(table_name) + # Update 'df_spannings' by removing the temporary 'spanning_old' column. + df_spannings <- df_spannings %>% select(-spanning_old) df_spannings_eq <- df_spannings %>% # delete all the non-word elements, specifically for the white spaces mutate(across(where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% left_join(equations_long, by = c("indicator" = "var")) + # 'df_eq_initial_spannings' contains the initial spanning information + # for equations (rows where 'eq_name' is not missing), summarised by equation name. + # Each equation keeps the last relevant field values, with concatenated table names. df_eq_initial_spannings <- df_spannings_eq %>% filter(!is.na(eq_name)) %>% group_by(eq_name) %>% @@ -123,6 +119,9 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% select(-eq_name) + # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. + # Each equation name is transformed into its uppercase form with a "^h" suffix, + # and its hierarchical version prefixed with "hrc_". df_eq_indicator_spannings <- df_spannings_eq %>% filter(!is.na(eq_name)) %>% group_by(eq_name) %>% @@ -138,11 +137,15 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% select(-eq_name) + # 'df_indicators' combines both initial and indicator spanning information + # into a single harmonized dataset, keeping key structural columns + # and sorting rows by table name. df_indicators <- bind_rows(df_eq_initial_spannings,df_eq_indicator_spannings) %>% select(table_name,field,hrc_field,indicator,hrc_indicator,everything()) %>% arrange(table_name) - browser() + # 'df_no_eq_spannings' contains all spanning rows + # that are not associated with any equation (eq_name is missing). df_no_eq_spannings <- df_spannings_eq %>% filter(is.na(eq_name)) if(nrow(df_no_eq_spannings) > 0){ From a78fc447826517af93fc270459c7a8b9c4c6dffb Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Wed, 5 Nov 2025 14:33:14 +0100 Subject: [PATCH 10/17] =?UTF-8?q?feat:=20gestion=20des=20=C3=A9quations=20?= =?UTF-8?q?li=C3=A9s=20(A=20=3D=20B=20+=20C,=20B=20=3D=20D=20+=20E)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 53 ++++++++++++++++++++++++++++++++-------- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 2214d16..f1f4b84 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -60,8 +60,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::ungroup() %>% dplyr::select(eq_name, unit, total, everything()) - browser() - # change to long format in order to join with df_metadata_long equations_long <- parsed_equations %>% mutate(across(c(total, starts_with("rhs")), trimws)) %>% @@ -72,6 +70,35 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% filter(!is.na(var)) + # Identify chained equations (A = B + C, B = D + E → group both equations together) + + # Build dependency links between totals and rhs + links <- parsed_equations %>% + tidyr::pivot_longer( + cols = starts_with("rhs"), + names_to = "rhs_term", + values_to = "rhs" + ) %>% + dplyr::filter(!is.na(rhs)) %>% + dplyr::select(total, rhs) %>% + dplyr::mutate( + total = trimws(as.character(total)), + rhs = trimws(as.character(rhs)) + ) %>% + dplyr::distinct() + + + # Create an adjacency-like mapping + g <- igraph::graph_from_data_frame(links, directed = TRUE) + + # Find connected components (groups of linked equations) + comp <- igraph::components(g)$membership + comp_df <- data.frame(var = names(comp), group = comp, stringsAsFactors = FALSE) + + # Add this group info to equations_long + equations_long <- equations_long %>% + left_join(comp_df, by = c("var" = "var")) + # 'df_spannings' is a modified version of 'df_metadata_long' where: # - 'spanning' is replaced by its uppercase hierarchical version if available, # - 'indicator' is replaced by its uppercase hierarchical version @@ -106,7 +133,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # Each equation keeps the last relevant field values, with concatenated table names. df_eq_initial_spannings <- df_spannings_eq %>% filter(!is.na(eq_name)) %>% - group_by(eq_name) %>% + group_by(group) %>% summarise( table_name = paste(table_name, collapse = "."), field = last(field), @@ -116,26 +143,32 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ indicator = last(unit), hrc_indicator = last(hrc_indicator), .groups = "drop" - ) %>% - select(-eq_name) + ) # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. # Each equation name is transformed into its uppercase form with a "^h" suffix, # and its hierarchical version prefixed with "hrc_". df_eq_indicator_spannings <- df_spannings_eq %>% filter(!is.na(eq_name)) %>% - group_by(eq_name) %>% + group_by(group) %>% summarise( table_name = paste(table_name, collapse = "."), field = last(field), hrc_field = last(hrc_field), - spanning = paste0(toupper(last(eq_name)), "^h"), - hrc_spanning = paste0("hrc_", last(eq_name)), + spanning = if(length(unique(eq_name)) > 1) { + paste0(paste0(unique(toupper(eq_name)), collapse = "_"), "^h") + } else { + paste0(toupper(last(eq_name)), "^h") + }, + hrc_spanning = if(length(unique(eq_name)) > 1) { + paste0("hrc_", paste0(unique(toupper(eq_name)), collapse = "_")) + } else { + paste0("hrc_", toupper(last(eq_name))) + }, indicator = last(unit), hrc_indicator = last(hrc_indicator), .groups = "drop" - ) %>% - select(-eq_name) + ) # 'df_indicators' combines both initial and indicator spanning information # into a single harmonized dataset, keeping key structural columns From 4b1b41f68c2ab46b2272b8df1004129d9e56d898 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Wed, 5 Nov 2025 17:38:59 +0100 Subject: [PATCH 11/17] =?UTF-8?q?feat:=20gestion=20A=20=3D=20B=20+=20C,=20?= =?UTF-8?q?B=20=3D=20D=20+=20E,=20B=20=3D=20G=20+=20H.=20On=20a=20un=20tab?= =?UTF-8?q?leau=20A=20=3D=20B=20+=20C=20et=20B=20=3D=20D=20+=20E,=20un=20a?= =?UTF-8?q?utre=20B=20=3D=20G=20+=20H.=20Les=20deux=20sont=20dans=20le=20m?= =?UTF-8?q?=C3=AAme=20cluster=20puisqu'ils=20ont=20la=20m=C3=AAme=20unit?= =?UTF-8?q?=C3=A9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- DESCRIPTION | 2 +- R/identify_hrc_with_eq.R | 54 ++++++++++++++++++++++++++++++++-------- 2 files changed, 45 insertions(+), 11 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 62e61f0..a04aeab 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: rtauargus Type: Package Title: Using Tau-Argus from R Language: fr -Version: 1.3.1 +Version: 1.3.2 Depends: R (>= 3.5.0) Imports: purrr (>= 0.2), diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index f1f4b84..3df21c0 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -51,9 +51,10 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # with each rhs term placed in a separate column. parsed_equations <- df_eq_indicator %>% tidyr::separate(eq_indicator, into = c("total", "rhs"), sep = "=", extra = "merge") %>% - dplyr::mutate(rhs = stringr::str_trim(rhs)) %>% + dplyr::mutate(rhs = trimws(rhs)) %>% tidyr::separate_rows(rhs, sep = "\\+") %>% - dplyr::mutate(rhs = stringr::str_trim(rhs)) %>% + dplyr::mutate(rhs = trimws(rhs), + total = trimws(total)) %>% dplyr::group_by(dplyr::across(-rhs)) %>% dplyr::mutate(term_number = paste0("rhs", dplyr::row_number())) %>% tidyr::pivot_wider(names_from = term_number, values_from = rhs) %>% @@ -80,24 +81,57 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ values_to = "rhs" ) %>% dplyr::filter(!is.na(rhs)) %>% - dplyr::select(total, rhs) %>% dplyr::mutate( total = trimws(as.character(total)), rhs = trimws(as.character(rhs)) ) %>% dplyr::distinct() + # browser() + # Compter le nombre de fois qu'une variable apparaît à gauche + total_counts <- parsed_equations %>% + dplyr::count(total, name = "n_total") - # Create an adjacency-like mapping - g <- igraph::graph_from_data_frame(links, directed = TRUE) + # Identifier les "totals" ambigus (définis plusieurs fois) + ambiguous_totals <- total_counts %>% + dplyr::filter(n_total > 1) %>% + dplyr::pull(total) - # Find connected components (groups of linked equations) + # Ne garder que les liens dont le total n’est pas ambigu + links_filtered <- links %>% + dplyr::filter(!total %in% ambiguous_totals) + + # Créer un graphe uniquement avec les liens non ambigus + g <- igraph::graph_from_data_frame(links_filtered %>% select(total,rhs), directed = TRUE) + + # Trouver les composantes connexes (chaînes d’équations cohérentes) comp <- igraph::components(g)$membership comp_df <- data.frame(var = names(comp), group = comp, stringsAsFactors = FALSE) - # Add this group info to equations_long + # Affecter les groupes aux équations equations_long <- equations_long %>% - left_join(comp_df, by = c("var" = "var")) + dplyr::left_join(comp_df, by = c("var" = "var")) + + # browser() + # Pour les équations dont le total est ambigu, + # on leur donne un nouveau groupe unique PAR ÉQUATION + if (length(ambiguous_totals) > 0) { + max_group <- ifelse(length(comp_df$group) == 0, 0, max(comp_df$group, na.rm = TRUE)) + + # Extraire les équations dont le total est ambigu + ambiguous_eqs <- equations_long %>% + dplyr::filter(side == "total", var %in% ambiguous_totals) %>% + dplyr::distinct(eq_name, var) %>% + dplyr::mutate(group = seq(max_group, max_group + dplyr::n() - 1)) + + # Rejoindre ces nouveaux groupes à toutes les lignes de la même équation + equations_long <- equations_long %>% + dplyr::left_join(ambiguous_eqs %>% dplyr::select(eq_name, group), + by = "eq_name", + suffix = c("", "_ambig")) %>% + dplyr::mutate(group = dplyr::coalesce(group_ambig, group)) %>% + dplyr::select(-group_ambig) + } # 'df_spannings' is a modified version of 'df_metadata_long' where: # - 'spanning' is replaced by its uppercase hierarchical version if available, @@ -135,7 +169,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ filter(!is.na(eq_name)) %>% group_by(group) %>% summarise( - table_name = paste(table_name, collapse = "."), + table_name = paste(unique(table_name), collapse = "."), field = last(field), hrc_field = last(hrc_field), spanning = last(spanning), @@ -152,7 +186,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ filter(!is.na(eq_name)) %>% group_by(group) %>% summarise( - table_name = paste(table_name, collapse = "."), + table_name = paste(unique(table_name), collapse = "."), field = last(field), hrc_field = last(hrc_field), spanning = if(length(unique(eq_name)) > 1) { From efc6ad7158af258f12be289229b648db4c83b1cb Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Thu, 6 Nov 2025 11:07:40 +0100 Subject: [PATCH 12/17] =?UTF-8?q?fix:=20d=C3=A9cale=20le=20nom=20des=20gro?= =?UTF-8?q?upes=20de=201=20pour=20=C3=A9viter=20un=20regroupement=20faux?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 3df21c0..6e396ec 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -87,7 +87,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% dplyr::distinct() - # browser() # Compter le nombre de fois qu'une variable apparaît à gauche total_counts <- parsed_equations %>% dplyr::count(total, name = "n_total") @@ -112,7 +111,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ equations_long <- equations_long %>% dplyr::left_join(comp_df, by = c("var" = "var")) - # browser() # Pour les équations dont le total est ambigu, # on leur donne un nouveau groupe unique PAR ÉQUATION if (length(ambiguous_totals) > 0) { @@ -122,7 +120,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ambiguous_eqs <- equations_long %>% dplyr::filter(side == "total", var %in% ambiguous_totals) %>% dplyr::distinct(eq_name, var) %>% - dplyr::mutate(group = seq(max_group, max_group + dplyr::n() - 1)) + dplyr::mutate(group = seq(max_group + 1, max_group + dplyr::n())) # Rejoindre ces nouveaux groupes à toutes les lignes de la même équation equations_long <- equations_long %>% From 00cac81c5276b1a965ef07b3f263c97b7456ddd8 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Thu, 6 Nov 2025 12:24:53 +0100 Subject: [PATCH 13/17] =?UTF-8?q?feat:=20gestion=20des=20liens=20entre=20t?= =?UTF-8?q?otaux=20non=20ambigus,=20i.e.=20gestion=20hi=C3=A9rarhcies=20no?= =?UTF-8?q?n-embo=C3=AEt=C3=A9es=20sur=20indicateurs=20N.B.=20revenir=20su?= =?UTF-8?q?r=20la=20doc=20!!=20en=20fr=20actuellement?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 106 +++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 44 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 6e396ec..9eb6152 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -87,50 +87,68 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% dplyr::distinct() - # Compter le nombre de fois qu'une variable apparaît à gauche - total_counts <- parsed_equations %>% - dplyr::count(total, name = "n_total") - - # Identifier les "totals" ambigus (définis plusieurs fois) - ambiguous_totals <- total_counts %>% - dplyr::filter(n_total > 1) %>% - dplyr::pull(total) - - # Ne garder que les liens dont le total n’est pas ambigu - links_filtered <- links %>% - dplyr::filter(!total %in% ambiguous_totals) - - # Créer un graphe uniquement avec les liens non ambigus - g <- igraph::graph_from_data_frame(links_filtered %>% select(total,rhs), directed = TRUE) - - # Trouver les composantes connexes (chaînes d’équations cohérentes) - comp <- igraph::components(g)$membership - comp_df <- data.frame(var = names(comp), group = comp, stringsAsFactors = FALSE) - - # Affecter les groupes aux équations - equations_long <- equations_long %>% - dplyr::left_join(comp_df, by = c("var" = "var")) - - # Pour les équations dont le total est ambigu, - # on leur donne un nouveau groupe unique PAR ÉQUATION - if (length(ambiguous_totals) > 0) { - max_group <- ifelse(length(comp_df$group) == 0, 0, max(comp_df$group, na.rm = TRUE)) - - # Extraire les équations dont le total est ambigu - ambiguous_eqs <- equations_long %>% - dplyr::filter(side == "total", var %in% ambiguous_totals) %>% - dplyr::distinct(eq_name, var) %>% - dplyr::mutate(group = seq(max_group + 1, max_group + dplyr::n())) - - # Rejoindre ces nouveaux groupes à toutes les lignes de la même équation - equations_long <- equations_long %>% - dplyr::left_join(ambiguous_eqs %>% dplyr::select(eq_name, group), - by = "eq_name", - suffix = c("", "_ambig")) %>% - dplyr::mutate(group = dplyr::coalesce(group_ambig, group)) %>% - dplyr::select(-group_ambig) - } + # browser() + + # ---- 1) Identif. totaux ambigus ---- + total_counts <- parsed_equations %>% count(total, name = "n_total") + ambiguous_totals <- total_counts %>% filter(n_total > 1) %>% pull(total) + + # ---- 2) Construire un mapping total -> total_alt par eq_name ---- + # pour toutes les équations (ambigües ou non) on crée une ligne ; + # pour les non-ambigües total_alt == total + alt_map <- parsed_equations %>% + distinct(eq_name, total) %>% + group_by(total) %>% + arrange(eq_name) %>% # ordre stable + mutate(alt_idx = row_number(), + total_alt = case_when( + n() == 1 ~ total, + alt_idx == 1 ~ total, + TRUE ~ paste0(total, "_alt", alt_idx - 1) + ) + ) %>% + ungroup() %>% + select(eq_name, total, total_alt) + + # ---- 3) Appliquer le mapping aux liens ---- + # 'links' contient total, rhs, eq_name (si tu ne l'as pas, il faut le joindre) + # ici j'assume links a colonne eq_name ; sinon faire left_join(links, parsed_equations %>% select(eq_name, total, rhs)...) auparavant + links_full <- links %>% + # remplacer le total par sa version alt spécifique à l'eq + left_join(alt_map, by = c("eq_name", "total")) %>% + mutate(total = coalesce(total_alt, total)) %>% + select(-total_alt) %>% + # maintenant, remplacer rhs s'il existe comme "total" dans alt_map : + # on doit choisir la bonne total_alt pour le rhs selon l'équation où il joue le rôle de total. + # pour cela on joint alt_map en faisant rhs -> total, et en gardant l'alt correspondant à l'eq_name de la ligne SOURCE. + left_join(alt_map, by = c("eq_name", "rhs" = "total")) %>% + mutate(rhs = coalesce(total_alt, rhs)) %>% + select(total, rhs, eq_name) %>% + distinct() + + # ---- 4) Construire le graphe complet (avec toutes les copies) ---- + g_full <- graph_from_data_frame(links_full %>% select(total, rhs), directed = TRUE) + + # ---- 5) calculer les composantes sur g_full ---- + comp_full <- components(g_full)$membership + comp_df <- data.frame(var = names(comp_full), group = as.integer(comp_full), stringsAsFactors = FALSE) + + # ---- 6) Mettre à jour equations_long : + # associer le var alt (si present) et le groupe correspondant ---- + # Remarques : + # - equations_long contient les variables originales (var) et eq_name ; + # - on veut retrouver la version "var" ou "var_alt" utilisée dans g_full. + equations_long_full <- equations_long %>% + # joindre la correspondance eq_name + var(original total) -> total_alt (si existant) + left_join(alt_map, by = c("eq_name", "var" = "total")) %>% + mutate(var_mapped = coalesce(total_alt, var)) %>% + select(-total_alt) %>% + # joindre le groupe calculé sur le graphe complet + left_join(comp_df, by = c("var_mapped" = "var")) %>% + # si pour certains var_mapped il n'y a pas de group (isolés), on peut laisser NA ou donner un groupe unique + mutate(group = as.integer(group)) + browser() # 'df_spannings' is a modified version of 'df_metadata_long' where: # - 'spanning' is replaced by its uppercase hierarchical version if available, # - 'indicator' is replaced by its uppercase hierarchical version @@ -158,7 +176,7 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ df_spannings_eq <- df_spannings %>% # delete all the non-word elements, specifically for the white spaces mutate(across(where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% - left_join(equations_long, by = c("indicator" = "var")) + left_join(equations_long_full, by = c("indicator" = "var")) # 'df_eq_initial_spannings' contains the initial spanning information # for equations (rows where 'eq_name' is not missing), summarised by equation name. From 078467be6bdd9b236391287b8a9984c10292c6ac Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Thu, 6 Nov 2025 14:52:39 +0100 Subject: [PATCH 14/17] sans browser() --- R/identify_hrc_with_eq.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 9eb6152..2dee86d 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -87,8 +87,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% dplyr::distinct() - # browser() - # ---- 1) Identif. totaux ambigus ---- total_counts <- parsed_equations %>% count(total, name = "n_total") ambiguous_totals <- total_counts %>% filter(n_total > 1) %>% pull(total) @@ -148,7 +146,6 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # si pour certains var_mapped il n'y a pas de group (isolés), on peut laisser NA ou donner un groupe unique mutate(group = as.integer(group)) - browser() # 'df_spannings' is a modified version of 'df_metadata_long' where: # - 'spanning' is replaced by its uppercase hierarchical version if available, # - 'indicator' is replaced by its uppercase hierarchical version From 149f88b5489e9d296b78667a1d433533244949ad Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Mon, 19 Jan 2026 16:23:17 +0100 Subject: [PATCH 15/17] =?UTF-8?q?fix:=20plusieurs=20vairables=20de=20crois?= =?UTF-8?q?ements=20pour=20les=20tableaux=20dont=20l'indicateur=20fait=20p?= =?UTF-8?q?artie=20d'une=20=C3=A9quation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- R/identify_hrc_with_eq.R | 65 ++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index 2dee86d..d95428a 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -88,25 +88,25 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::distinct() # ---- 1) Identif. totaux ambigus ---- - total_counts <- parsed_equations %>% count(total, name = "n_total") - ambiguous_totals <- total_counts %>% filter(n_total > 1) %>% pull(total) + total_counts <- parsed_equations %>% dplyr::count(total, name = "n_total") + ambiguous_totals <- total_counts %>% dplyr::filter(n_total > 1) %>% pull(total) # ---- 2) Construire un mapping total -> total_alt par eq_name ---- # pour toutes les équations (ambigües ou non) on crée une ligne ; # pour les non-ambigües total_alt == total alt_map <- parsed_equations %>% - distinct(eq_name, total) %>% - group_by(total) %>% - arrange(eq_name) %>% # ordre stable - mutate(alt_idx = row_number(), - total_alt = case_when( - n() == 1 ~ total, + dplyr::distinct(eq_name, total) %>% + dplyr::group_by(total) %>% + dplyr::arrange(eq_name) %>% # ordre stable + dplyr::mutate(alt_idx = dplyr::row_number(), + total_alt = dplyr::case_when( + dplyr::n() == 1 ~ total, alt_idx == 1 ~ total, TRUE ~ paste0(total, "_alt", alt_idx - 1) ) ) %>% - ungroup() %>% - select(eq_name, total, total_alt) + dplyr::ungroup() %>% + dplyr::select(eq_name, total, total_alt) # ---- 3) Appliquer le mapping aux liens ---- # 'links' contient total, rhs, eq_name (si tu ne l'as pas, il faut le joindre) @@ -114,21 +114,21 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ links_full <- links %>% # remplacer le total par sa version alt spécifique à l'eq left_join(alt_map, by = c("eq_name", "total")) %>% - mutate(total = coalesce(total_alt, total)) %>% + mutate(total = dplyr::coalesce(total_alt, total)) %>% select(-total_alt) %>% # maintenant, remplacer rhs s'il existe comme "total" dans alt_map : # on doit choisir la bonne total_alt pour le rhs selon l'équation où il joue le rôle de total. # pour cela on joint alt_map en faisant rhs -> total, et en gardant l'alt correspondant à l'eq_name de la ligne SOURCE. left_join(alt_map, by = c("eq_name", "rhs" = "total")) %>% - mutate(rhs = coalesce(total_alt, rhs)) %>% + mutate(rhs = dplyr::coalesce(total_alt, rhs)) %>% select(total, rhs, eq_name) %>% - distinct() + dplyr::distinct() # ---- 4) Construire le graphe complet (avec toutes les copies) ---- g_full <- graph_from_data_frame(links_full %>% select(total, rhs), directed = TRUE) # ---- 5) calculer les composantes sur g_full ---- - comp_full <- components(g_full)$membership + comp_full <- igraph::components(g_full)$membership comp_df <- data.frame(var = names(comp_full), group = as.integer(comp_full), stringsAsFactors = FALSE) # ---- 6) Mettre à jour equations_long : @@ -139,13 +139,13 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ equations_long_full <- equations_long %>% # joindre la correspondance eq_name + var(original total) -> total_alt (si existant) left_join(alt_map, by = c("eq_name", "var" = "total")) %>% - mutate(var_mapped = coalesce(total_alt, var)) %>% + mutate(var_mapped = dplyr::coalesce(total_alt, var)) %>% select(-total_alt) %>% # joindre le groupe calculé sur le graphe complet left_join(comp_df, by = c("var_mapped" = "var")) %>% # si pour certains var_mapped il n'y a pas de group (isolés), on peut laisser NA ou donner un groupe unique mutate(group = as.integer(group)) - + # browser() # 'df_spannings' is a modified version of 'df_metadata_long' where: # - 'spanning' is replaced by its uppercase hierarchical version if available, # - 'indicator' is replaced by its uppercase hierarchical version @@ -178,19 +178,32 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # 'df_eq_initial_spannings' contains the initial spanning information # for equations (rows where 'eq_name' is not missing), summarised by equation name. # Each equation keeps the last relevant field values, with concatenated table names. + # df_eq_initial_spannings <- df_spannings_eq %>% + # filter(!is.na(eq_name)) %>% + # group_by(group) %>% + # summarise( + # table_name = paste(unique(table_name), collapse = "."), + # field = last(field), + # hrc_field = last(hrc_field), + # spanning = last(spanning), + # hrc_spanning = last(hrc_spanning), + # indicator = last(unit), + # hrc_indicator = last(hrc_indicator), + # .groups = "drop" + # ) # TODO changer ici pour le pbm d'un manque de var de crois !! + df_eq_initial_spannings <- df_spannings_eq %>% filter(!is.na(eq_name)) %>% group_by(group) %>% - summarise( - table_name = paste(unique(table_name), collapse = "."), - field = last(field), - hrc_field = last(hrc_field), - spanning = last(spanning), - hrc_spanning = last(hrc_spanning), - indicator = last(unit), - hrc_indicator = last(hrc_indicator), - .groups = "drop" - ) + dplyr::reframe( + table_name = paste(unique(table_name), collapse = "."), + field = last(field), + hrc_field = last(hrc_field), + spanning = spanning, + hrc_spanning = hrc_spanning, + indicator = last(unit), + hrc_indicator = last(hrc_indicator) + ) %>% unique() # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. # Each equation name is transformed into its uppercase form with a "^h" suffix, From e64873f47b1e36d037a33d34648fb22079107548 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Mon, 19 Jan 2026 16:45:45 +0100 Subject: [PATCH 16/17] =?UTF-8?q?test:=20=C3=A9quations=20entre=20indicate?= =?UTF-8?q?urs,=20verificaiton=20qu'on=20a=20bien=20toutes=20les=20variabl?= =?UTF-8?q?es=20de=20croisement?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/testthat/test_analyse_metadata.R | 47 ++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/testthat/test_analyse_metadata.R b/tests/testthat/test_analyse_metadata.R index 3da0d29..a7cad19 100644 --- a/tests/testthat/test_analyse_metadata.R +++ b/tests/testthat/test_analyse_metadata.R @@ -158,3 +158,50 @@ test_that("hierarchies on indicators", { expect_equal(analyse_metadata(meta),answer) } ) + +##################################################### INDICATOR EQUATIONS CHECKS +# all the spanning variables are taken into account when using equations on +# indicators ------------------------------------------------------------------- +answer <- data.frame( + cluster = c( + "france_entreprises_2023.hrc_lettuce", + "france_entreprises_2023.hrc_lettuce", + "france_entreprises_2023.to_pizza", + "france_entreprises_2023.to_pizza" + ), + table_name = c( + "T10.T12.T8", + "T11.T7.T9", + "T1.T2", + "T3.T4.T5.T6" + ), + field = rep("france_entreprises_2023", 4), + indicator = c("LETTUCE", "LETTUCE", "to_pizza", "to_pizza"), + spanning_1 = c("HRC_NAF", "HRC_NAF", "HRC_NUTS", "HRC_NAF"), + spanning_2 = c("cj", "size", "size", "HRC_NUTS"), + spanning_3 = c("HRC_LETTUCE^h", "HRC_LETTUCE^h", NA, NA), + hrc_spanning_1 = c("hrc_naf", "hrc_naf", "hrc_nuts", "hrc_naf"), + hrc_spanning_2 = c(NA, NA, NA, "hrc_nuts"), + hrc_spanning_3 = c("hrc_lettuce", "hrc_lettuce", NA, NA) +) + +test_that("indicators equation", { + df_eq_ex <- data.frame( + eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR"), + stringsAsFactors = FALSE + ) + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = metadata_pizza_lettuce,df_eq_indicator = df_eq_ex), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) + + From ee9d14879e9f43e304bbc840a32b8108cb326428 Mon Sep 17 00:00:00 2001 From: Baudry Clara Date: Tue, 20 Jan 2026 14:28:02 +0100 Subject: [PATCH 17/17] maj doc et modifs suite au check du package avant release (devtools::check(cran=FALSE)) --- DESCRIPTION | 2 +- NAMESPACE | 2 + R/analyse_metadata.R | 3 +- R/globals.R | 3 +- R/identify_hrc_with_eq.R | 79 ++++++----- man/analyse_metadata.Rd | 6 +- man/format_template.Rd | 2 +- man/identify_hrc_with_eq.Rd | 59 ++++++++ tests/testthat/test_analyse_metadata.R | 4 +- vignettes/auto_metadata.Rmd | 178 +++++++++++++++++++++---- vignettes/auto_metadata.Rmd.orig | 61 ++++++++- vignettes/auto_metadata_fr.Rmd | 159 +++++++++++++++++----- vignettes/auto_metadata_fr.Rmd.orig | 53 +++++++- 13 files changed, 497 insertions(+), 114 deletions(-) create mode 100644 man/identify_hrc_with_eq.Rd diff --git a/DESCRIPTION b/DESCRIPTION index a04aeab..323c53e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -67,7 +67,7 @@ Description: Protects tables by calling the Tau-Argus software from R. License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 VignetteBuilder: knitr URL: https://inseefrlab.github.io/rtauargus, https://github.com/inseefrlab/rtauargus, diff --git a/NAMESPACE b/NAMESPACE index a39aae2..5fdf8a9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(from_5_to_3) export(grp_tab_in_cluster) export(grp_tab_names) export(identify_hrc) +export(identify_hrc_with_eq) export(import) export(length_tabs) export(micro_arb) @@ -59,6 +60,7 @@ importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,ungroup) +importFrom(dplyr,where) importFrom(igraph,graph_from_data_frame) importFrom(igraph,which_mutual) importFrom(lifecycle,badge) diff --git a/R/analyse_metadata.R b/R/analyse_metadata.R index 137ac63..ec155a2 100644 --- a/R/analyse_metadata.R +++ b/R/analyse_metadata.R @@ -6,6 +6,7 @@ #' creating a structured output for further processing. #' #' @param df_metadata A dataframe containing metadata in wide format. +#' @param df_eq_indicator A datadframe containing the indicators equations if needed. #' @param verbose Logical. If `TRUE`, returns a detailed list of intermediate results #' from each processing step. If `FALSE`, returns only the cluster assignments. Defaults to `FALSE`. #' @@ -28,7 +29,7 @@ #' @details The function performs the following steps: #' \itemize{ #' \item Converts the metadata from wide format to long format using \code{wide_to_long}. -#' \item Identifies hierarchical relationships and renames variables with \code{identify_hrc}. +#' \item Identifies hierarchical relationships and renames variables with \code{identify_hrc} or \code{identify_hrc_with_eq}. #' \item Splits hierarchical relationships into clusters using \code{split_in_clusters}. #' \item Creates edges to describe the relationships via \code{create_edges}. #' \item Generates translation tables for regrouping with \code{grp_tab_names}. diff --git a/R/globals.R b/R/globals.R index 9f82e0b..24bc305 100644 --- a/R/globals.R +++ b/R/globals.R @@ -3,6 +3,7 @@ utils::globalVariables( c("table_name", "data","cluster","field","hrc_field","indicator","hrc_indicator", "n_unique","column","unique_modalities","from.eg","to.eg","from","to","mutual_full", "Group","table_eg","spanning","hrc_spanning","spanning_old","tab_inclus", - "starts_with","spanning_name","hrc_spanning_name", + "starts_with","spanning_name","hrc_spanning_name","eq_indicator","rhs","total","term_number", + "eq_name","unit","var","n_total","total_alt","group", ".") ) diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R index d95428a..ff22b0f 100644 --- a/R/identify_hrc_with_eq.R +++ b/R/identify_hrc_with_eq.R @@ -13,6 +13,11 @@ #' - `indicator`: name of the indicator of the table. #' - `hrc_indicator`: Specifies linked response variables. #' - `spanning_*`, `hrc_spanning_*`: Spanning variables and their hierarchies. +#' @param df_eq_indicator A dataframe containing the equations on indicators with +#' the following required columns : +#' - `eq_name`: Name of the equation. +#' - `eq_indicator`: The equation for example, A = B + C. +#' - `unit`: The unit of the indicators in the equation. #' #' @return `list_hrc_identified`, a list with two elements: #' - `df_indicators`: The updated data frame with renamed variables and grouped @@ -27,12 +32,18 @@ #' data(metadata_pizza_lettuce) #' #' metadata_pizza_lettuce_long <- wide_to_long(metadata_pizza_lettuce) +#' df_eq_ex <- data.frame( +#' eq_name = c("eq1"), +#' eq_indicator = c("ca_salades = ca_batavia + ca_mache"), +#' unit = c("EUR")) #' -#' list_hrc_identified <- identify_hrc(metadata_pizza_lettuce_long) +#' list_hrc_identified <- identify_hrc(metadata_pizza_lettuce_long, df_eq_ex) #' #' str(list_hrc_identified) #' } #' +#' @importFrom dplyr where +#' identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ # check that the input is in the right format: right column names check_column_names <- function(df) { @@ -87,13 +98,13 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ ) %>% dplyr::distinct() - # ---- 1) Identif. totaux ambigus ---- + # ---- 1) Identify ambiguous totals ---- total_counts <- parsed_equations %>% dplyr::count(total, name = "n_total") ambiguous_totals <- total_counts %>% dplyr::filter(n_total > 1) %>% pull(total) - # ---- 2) Construire un mapping total -> total_alt par eq_name ---- - # pour toutes les équations (ambigües ou non) on crée une ligne ; - # pour les non-ambigües total_alt == total + # ---- 2) Build a total -> total_alt mapping by eq_name ---- + # For all equations (ambiguous or not), create one row; + # for non-ambiguous totals, total_alt == total alt_map <- parsed_equations %>% dplyr::distinct(eq_name, total) %>% dplyr::group_by(total) %>% @@ -108,44 +119,47 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ dplyr::ungroup() %>% dplyr::select(eq_name, total, total_alt) - # ---- 3) Appliquer le mapping aux liens ---- - # 'links' contient total, rhs, eq_name (si tu ne l'as pas, il faut le joindre) - # ici j'assume links a colonne eq_name ; sinon faire left_join(links, parsed_equations %>% select(eq_name, total, rhs)...) auparavant + # ---- 3) Apply the mapping to the links ---- + # 'links' contains total, rhs, eq_name (if not, it must be joined beforehand) + # here we assume links has an eq_name column; otherwise do + # left_join(links, parsed_equations %>% select(eq_name, total, rhs), ...) first links_full <- links %>% - # remplacer le total par sa version alt spécifique à l'eq + # replace total with its equation-specific alternative left_join(alt_map, by = c("eq_name", "total")) %>% mutate(total = dplyr::coalesce(total_alt, total)) %>% select(-total_alt) %>% - # maintenant, remplacer rhs s'il existe comme "total" dans alt_map : - # on doit choisir la bonne total_alt pour le rhs selon l'équation où il joue le rôle de total. - # pour cela on joint alt_map en faisant rhs -> total, et en gardant l'alt correspondant à l'eq_name de la ligne SOURCE. + # now replace rhs if it exists as a "total" in alt_map: + # we must choose the correct total_alt for rhs according to the equation + # where it plays the role of a total. + # to do so, join alt_map by mapping rhs -> total, keeping the alt + # corresponding to the SOURCE row eq_name. left_join(alt_map, by = c("eq_name", "rhs" = "total")) %>% mutate(rhs = dplyr::coalesce(total_alt, rhs)) %>% select(total, rhs, eq_name) %>% dplyr::distinct() - # ---- 4) Construire le graphe complet (avec toutes les copies) ---- + # ---- 4) Build the full graph (including all copies) ---- g_full <- graph_from_data_frame(links_full %>% select(total, rhs), directed = TRUE) - # ---- 5) calculer les composantes sur g_full ---- + # ---- 5) Compute components on g_full ---- comp_full <- igraph::components(g_full)$membership comp_df <- data.frame(var = names(comp_full), group = as.integer(comp_full), stringsAsFactors = FALSE) - # ---- 6) Mettre à jour equations_long : - # associer le var alt (si present) et le groupe correspondant ---- - # Remarques : - # - equations_long contient les variables originales (var) et eq_name ; - # - on veut retrouver la version "var" ou "var_alt" utilisée dans g_full. + # ---- 6) Update equations_long: + # associate the alternative variable (if present) and the corresponding group ---- + # Notes: + # - equations_long contains the original variables (var) and eq_name; + # - we want to recover the "var" or "var_alt" version used in g_full. equations_long_full <- equations_long %>% - # joindre la correspondance eq_name + var(original total) -> total_alt (si existant) + # join the correspondence eq_name + var (original total) -> total_alt (if any) left_join(alt_map, by = c("eq_name", "var" = "total")) %>% mutate(var_mapped = dplyr::coalesce(total_alt, var)) %>% select(-total_alt) %>% - # joindre le groupe calculé sur le graphe complet + # join the group computed on the full graph left_join(comp_df, by = c("var_mapped" = "var")) %>% - # si pour certains var_mapped il n'y a pas de group (isolés), on peut laisser NA ou donner un groupe unique + # for var_mapped without a group (isolated), keep NA or assign a single group mutate(group = as.integer(group)) - # browser() + # 'df_spannings' is a modified version of 'df_metadata_long' where: # - 'spanning' is replaced by its uppercase hierarchical version if available, # - 'indicator' is replaced by its uppercase hierarchical version @@ -172,26 +186,9 @@ identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ df_spannings_eq <- df_spannings %>% # delete all the non-word elements, specifically for the white spaces - mutate(across(where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% + mutate(across(dplyr::where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% left_join(equations_long_full, by = c("indicator" = "var")) - # 'df_eq_initial_spannings' contains the initial spanning information - # for equations (rows where 'eq_name' is not missing), summarised by equation name. - # Each equation keeps the last relevant field values, with concatenated table names. - # df_eq_initial_spannings <- df_spannings_eq %>% - # filter(!is.na(eq_name)) %>% - # group_by(group) %>% - # summarise( - # table_name = paste(unique(table_name), collapse = "."), - # field = last(field), - # hrc_field = last(hrc_field), - # spanning = last(spanning), - # hrc_spanning = last(hrc_spanning), - # indicator = last(unit), - # hrc_indicator = last(hrc_indicator), - # .groups = "drop" - # ) # TODO changer ici pour le pbm d'un manque de var de crois !! - df_eq_initial_spannings <- df_spannings_eq %>% filter(!is.na(eq_name)) %>% group_by(group) %>% diff --git a/man/analyse_metadata.Rd b/man/analyse_metadata.Rd index 9179a2b..216e9ef 100644 --- a/man/analyse_metadata.Rd +++ b/man/analyse_metadata.Rd @@ -4,11 +4,13 @@ \alias{analyse_metadata} \title{Analyse Metadata of Tables Needing Secondary Tabular Data Protection} \usage{ -analyse_metadata(df_metadata, verbose = FALSE) +analyse_metadata(df_metadata, df_eq_indicator = NULL, verbose = FALSE) } \arguments{ \item{df_metadata}{A dataframe containing metadata in wide format.} +\item{df_eq_indicator}{A datadframe containing the indicators equations if needed.} + \item{verbose}{Logical. If \code{TRUE}, returns a detailed list of intermediate results from each processing step. If \code{FALSE}, returns only the cluster assignments. Defaults to \code{FALSE}.} } @@ -41,7 +43,7 @@ creating a structured output for further processing. The function performs the following steps: \itemize{ \item Converts the metadata from wide format to long format using \code{wide_to_long}. -\item Identifies hierarchical relationships and renames variables with \code{identify_hrc}. +\item Identifies hierarchical relationships and renames variables with \code{identify_hrc} or \code{identify_hrc_with_eq}. \item Splits hierarchical relationships into clusters using \code{split_in_clusters}. \item Creates edges to describe the relationships via \code{create_edges}. \item Generates translation tables for regrouping with \code{grp_tab_names}. diff --git a/man/format_template.Rd b/man/format_template.Rd index 16aa90e..89df820 100644 --- a/man/format_template.Rd +++ b/man/format_template.Rd @@ -13,7 +13,7 @@ format_template(data, indicator_column, spanning_var_tot, field_columns) \item{spanning_var_tot}{a named list of the spanning variables and their totals} -\item{field_columns}{vecotr of all the columns that are fields (ex: year of collect)} +\item{field_columns}{vector of all the columns that are fields (ex: year of collect)} } \value{ named list of a dataframe describing the tables (metadata) and a list of diff --git a/man/identify_hrc_with_eq.Rd b/man/identify_hrc_with_eq.Rd new file mode 100644 index 0000000..4aa0cbf --- /dev/null +++ b/man/identify_hrc_with_eq.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/identify_hrc_with_eq.R +\name{identify_hrc_with_eq} +\alias{identify_hrc_with_eq} +\title{Rename variables based on their hierarchies and their equations} +\usage{ +identify_hrc_with_eq(df_metadata_long, df_eq_indicator) +} +\arguments{ +\item{df_metadata_long}{A data frame in long format with the following +required columns: +\itemize{ +\item \code{table_name}: Identifies the table. +\item \code{field} : name of the field of the table. +\item \code{indicator}: name of the indicator of the table. +\item \code{hrc_indicator}: Specifies linked response variables. +\item \verb{spanning_*}, \verb{hrc_spanning_*}: Spanning variables and their hierarchies. +}} + +\item{df_eq_indicator}{A dataframe containing the equations on indicators with +the following required columns : +\itemize{ +\item \code{eq_name}: Name of the equation. +\item \code{eq_indicator}: The equation for example, A = B + C. +\item \code{unit}: The unit of the indicators in the equation. +}} +} +\value{ +\code{list_hrc_identified}, a list with two elements: +\itemize{ +\item \code{df_indicators}: The updated data frame with renamed variables and grouped +response variables. +\item \code{df_variable_info}: A data frame mapping original variable names +(\code{spanning_old}) to their renamed counterparts (\code{spanning}). +} +} +\description{ +This function renames variables in a long-format metadata data frame based on +their hierarchical groupings. Spanning variables are renamed using the name +of their hierarchy in uppercase, while response variables linked by an +equation (specified in the \code{hrc_indicator} column) are grouped together, and +a new grouping variable is created. +} +\examples{ +\dontrun{ +data(metadata_pizza_lettuce) + +metadata_pizza_lettuce_long <- wide_to_long(metadata_pizza_lettuce) + df_eq_ex <- data.frame( + eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) + +list_hrc_identified <- identify_hrc(metadata_pizza_lettuce_long, df_eq_ex) + +str(list_hrc_identified) +} + +} diff --git a/tests/testthat/test_analyse_metadata.R b/tests/testthat/test_analyse_metadata.R index a7cad19..a075c40 100644 --- a/tests/testthat/test_analyse_metadata.R +++ b/tests/testthat/test_analyse_metadata.R @@ -67,7 +67,7 @@ test_that("error message for wrong column name - df_eq_indicator", { test_that("warning when hrc_indicator is ignored because df_eq_indicator used", { expect_warning( analyse_metadata(df_meta_for_eq, df_eq_ex), - "The hrc_indicator column will be ignored" + "the hrc_indicator column will be ignored" ) }) @@ -203,5 +203,3 @@ test_that("indicators equation", { } ) - - diff --git a/vignettes/auto_metadata.Rmd b/vignettes/auto_metadata.Rmd index 58891e4..32ccc02 100644 --- a/vignettes/auto_metadata.Rmd +++ b/vignettes/auto_metadata.Rmd @@ -52,15 +52,30 @@ args(analyse_metadata) ``` ``` -## function (df_metadata, verbose = FALSE) +## function (df_metadata, df_eq_indicator = NULL, verbose = FALSE) ## NULL ``` The function arguments are as follows: - `df_metadata`: a dataframe structured as specified in section \ref{sec:crea_meta}. +- `df_eq_indicator` : un dataframe regroupant les équations entre les indicateurs. - `verbose`: a variable of type `logical`. If `TRUE` he function returns a list with the different steps of the analysis; if `FALSE`, it returns only the final dataframe with the cluster indicator. +The `df_eq_indicator` data frame consists of three columns that must follow the format below: + +``` r +data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + +``` +## eq_name eq_indicator unit +## 1 eq1 ca_salades = ca_batavia + ca_mache EUR +``` + + ## The Arguments of `format_template` \label{sec:args_template} @@ -74,12 +89,12 @@ args(format_template) ## NULL ``` -Les arguments de la fonction sont les suivants : +The function arguments are as follows: -- `data` : dataframe du template Eurostat contenant toutes les cellules publiées. -- `indicator_column` : nom de la colonne dans laquelle se trouvent les indicateurs. -- `spanning_var_tot` : liste nommée des variables de croisements et de leurs totaux. -- `field_columns` : vecteur de toutes les colonnes représentant des champs (ex : millésime). +- `data`: the Eurostat template data frame containing all published cells. +- `indicator_column`: the name of the column containing the indicators. +- `spanning_var_tot`: a named list of the crossing variables and their totals. +- `field_columns`: a vector of all columns representing fields (e.g., vintage/year). # Examples @@ -126,14 +141,16 @@ cluster_id_dataframe ``` ``` -## # A tibble: 4 x 10 -## # Groups: table_name [4] -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## -## 1 france_entrep~ T10.T12.T8 fran~ LETTUCE HRC_NAF cj HRC_LETTU~ hrc_naf hrc_lettuce -## 2 france_entrep~ T11.T7.T9 fran~ LETTUCE HRC_NAF size HRC_LETTU~ hrc_naf hrc_lettuce -## 3 france_entrep~ T1.T2 fran~ to_pizza HRC_NUTS size hrc_nuts -## 4 france_entrep~ T3.T4.T5.~ fran~ to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts ``` For the 12 tables to be published, it is sufficient to protect 4 tables. These tables are distributed across two different clusters. Therefore, `tab_multi_manager()` needs to be called twice. @@ -147,8 +164,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" -## [6] "grp_tab_in_clusters" "tab_to_treat" "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" +## [7] "tab_to_treat" "df_tab_to_treat" ``` One finds the dataframe with the cluster indicator `df_tab_to_treat`. The result is the same but in list format: each element of the list is an independent cluster `tab_to_treat`. Additionally, the 6 steps of the analysis are included. @@ -184,7 +201,14 @@ template_formatted <- format_template( LEGAL_FORM = "_T"), field_columns = c("TIME_PERIOD") ) +``` +``` +## treating the field 2021 +## treating the field 2022 +``` + +``` r template_formatted$metadata ``` @@ -230,29 +254,131 @@ Next, this dataframe is used as input for the analysis function. ``` r # Analyse complète, avec les étapes detailed_analysis <- analyse_metadata(metadata_template, verbose = TRUE) +``` +``` +## Error in check_column_names(df_metadata): Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. +``` + +``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_template, verbose = FALSE) +``` +``` +## Error in check_column_names(df_metadata): Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. +``` + +``` r # visualisation du résultat de l'analyse cluster_id_dataframe ``` ``` -## # A tibble: 6 x 8 -## # Groups: table_name [6] -## cluster table_name field indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 -## -## 1 2021.SAL_DTH table_2021_SAL_DTH_1 2021 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 -## 2 2021.SAL_DTH table_2021_SAL_DTH_2 2021 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 -## 3 2022.SAL table_2022_SAL_1 2022 SAL HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 -## 4 2022.SAL table_2022_SAL_2 2022 SAL HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 -## 5 2022.SAL_DTH table_2022_SAL_DTH_1 2022 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 -## 6 2022.SAL_DTH table_2022_SAL_DTH_2 2022 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts ``` Ultimately, there are 6 tables to process in 3 different clusters. In other words, `tab_multi_manager()` will need to be called three times. +## Equations between indicators + +In this example, we start from a metadata file describing 12 tables to be published on the turnover from pizza and salad sales. + + +``` r +str(metadata_pizza_lettuce) +``` + +``` +## 'data.frame': 12 obs. of 9 variables: +## $ table_name : chr "T1" "T2" "T3" "T4" ... +## $ field : chr "france_entreprises_2023" "france_entreprises_2023" "france_entreprises_2023" "france_entreprises_2023" ... +## $ hrc_field : logi NA NA NA NA NA NA ... +## $ indicator : chr "to_pizza" "to_pizza" "to_pizza" "to_pizza" ... +## $ hrc_indicator : chr NA NA NA NA ... +## $ spanning_1 : chr "nuts2" "nuts3" "a10" "a10" ... +## $ hrc_spanning_1: chr "hrc_nuts" "hrc_nuts" "hrc_naf" "hrc_naf" ... +## $ spanning_2 : chr "size" "size" "nuts2" "nuts3" ... +## $ hrc_spanning_2: chr NA NA "hrc_nuts" "hrc_nuts" ... +``` + +This time a data frame describing the relationships between indicators is added. + +``` r +liens_eq <- data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + + +Example code : + +``` r +library(rtauargus) + +data(metadata_pizza_lettuce) + +# Full analysis, with intermediate steps +detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, + df_eq_indicator = liens_eq, + verbose = TRUE) +``` + +``` +## Warning in analyse_metadata(metadata_pizza_lettuce, df_eq_indicator = liens_eq, : For the variables part of equations specified in df_eq_indicator, +## the hrc_indicator column will be ignored. +``` + +``` +## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0. +## i Please use `reframe()` instead. +## i When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust +## accordingly. +## i The deprecated feature was likely used in the rtauargus package. +## Please report the issue at . +## This warning is displayed once every 8 hours. +## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated. +``` + +``` r +# Simplified output, only the data frame with the cluster indicator +cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) +``` + +Here, a warning informs us that the `hrc_indicator` column will be ignored since the +relationships between indicators have been specified using the `df_eq_indicator` argument. +Therefore, links should not be defined in `hrc_indicator` when using the `df_eq_indicator` argument. + +The result is the same as the first example. + + +``` r +cluster_id_dataframe +``` + +``` +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts +``` + + # Going Further: Visualizing Inclusions The create_edges step in the metadata analysis identifies tables included within other tables. For example, XXXXX is included in XXXXX. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. diff --git a/vignettes/auto_metadata.Rmd.orig b/vignettes/auto_metadata.Rmd.orig index f9e614f..d7a9d3f 100644 --- a/vignettes/auto_metadata.Rmd.orig +++ b/vignettes/auto_metadata.Rmd.orig @@ -54,8 +54,17 @@ args(analyse_metadata) The function arguments are as follows: - `df_metadata`: a dataframe structured as specified in section \ref{sec:crea_meta}. +- `df_eq_indicator` : un dataframe regroupant les équations entre les indicateurs. - `verbose`: a variable of type `logical`. If `TRUE` he function returns a list with the different steps of the analysis; if `FALSE`, it returns only the final dataframe with the cluster indicator. +The `df_eq_indicator` data frame consists of three columns that must follow the format below: +```{r} +data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + + ## The Arguments of `format_template` \label{sec:args_template} @@ -63,12 +72,12 @@ The function arguments are as follows: args(format_template) ``` -Les arguments de la fonction sont les suivants : +The function arguments are as follows: -- `data` : dataframe du template Eurostat contenant toutes les cellules publiées. -- `indicator_column` : nom de la colonne dans laquelle se trouvent les indicateurs. -- `spanning_var_tot` : liste nommée des variables de croisements et de leurs totaux. -- `field_columns` : vecteur de toutes les colonnes représentant des champs (ex : millésime). +- `data`: the Eurostat template data frame containing all published cells. +- `indicator_column`: the name of the column containing the indicators. +- `spanning_var_tot`: a named list of the crossing variables and their totals. +- `field_columns`: a vector of all columns representing fields (e.g., vintage/year). # Examples @@ -162,6 +171,48 @@ cluster_id_dataframe Ultimately, there are 6 tables to process in 3 different clusters. In other words, `tab_multi_manager()` will need to be called three times. +## Equations between indicators + +In this example, we start from a metadata file describing 12 tables to be published on the turnover from pizza and salad sales. + +```{r} +str(metadata_pizza_lettuce) +``` + +This time a data frame describing the relationships between indicators is added. +```{r} +liens_eq <- data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + + +Example code : +```{r} +library(rtauargus) + +data(metadata_pizza_lettuce) + +# Full analysis, with intermediate steps +detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, + df_eq_indicator = liens_eq, + verbose = TRUE) + +# Simplified output, only the data frame with the cluster indicator +cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) +``` + +Here, a warning informs us that the `hrc_indicator` column will be ignored since the +relationships between indicators have been specified using the `df_eq_indicator` argument. +Therefore, links should not be defined in `hrc_indicator` when using the `df_eq_indicator` argument. + +The result is the same as the first example. + +```{r} +cluster_id_dataframe +``` + + # Going Further: Visualizing Inclusions The create_edges step in the metadata analysis identifies tables included within other tables. For example, XXXXX is included in XXXXX. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. diff --git a/vignettes/auto_metadata_fr.Rmd b/vignettes/auto_metadata_fr.Rmd index 5c70a69..d98f6dd 100644 --- a/vignettes/auto_metadata_fr.Rmd +++ b/vignettes/auto_metadata_fr.Rmd @@ -6,10 +6,10 @@ output: toc: true toc_depth: 3 vignette: > - %\VignetteIndexEntry{Analyse automatique des métadonnées} +%\VignetteIndexEntry{Analyse automatique des métadonnées} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} ---- + --- @@ -52,15 +52,29 @@ args(analyse_metadata) ``` ``` -## function (df_metadata, verbose = FALSE) +## function (df_metadata, df_eq_indicator = NULL, verbose = FALSE) ## NULL ``` Les arguments de la fonction sont les suivants : - `df_metadata` : un dataframe structuré comme précisé dans la section \ref{sec:crea_meta}. +- `df_eq_indicator` : un dataframe regroupant les équations entre les indicateurs. - `verbose` : une variable de type `logical`, si `TRUE` la fonction retourne une liste avec les différentes étapes de l'analyse, si `FALSE` elle retourne juste le dataframe final avec l'indicatrice de cluster. +Le dataframe `df_eq_indicator` est constitué de trois colonnes qui doivent respecter le format suivant : + +``` r +data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + +``` +## eq_name eq_indicator unit +## 1 eq1 ca_salades = ca_batavia + ca_mache EUR +``` + ## Les arguments de `format_template` \label{sec:args_template} @@ -126,16 +140,16 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts -## hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts ``` Pour les 12 tableaux à publier il suffit de protéger 4 tableaux. Ces tableaux sont repartis dans deux clusters différents. Il faudra donc faire appel deux fois à `tab_multi_manager()`. @@ -149,8 +163,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" "tab_to_treat" -## [8] "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" +## [7] "tab_to_treat" "df_tab_to_treat" ``` On retrouve le dataframe avec l'indicatrice du cluster `df_tab_to_treat`. On a le même résultat mais en format liste : chaque élément de la sous-liste `tab_to_treat` est un cluster indépendant. Les autres éléments de la liste sont les étapes de l'analyse. @@ -186,7 +200,14 @@ template_formatted <- format_template( LEGAL_FORM = "_T"), field_columns = c("TIME_PERIOD") ) +``` +``` +## treating the field 2021 +## treating the field 2022 +``` + +``` r template_formatted$metadata ``` @@ -253,20 +274,102 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts -## hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts ``` Finalement, il y a 6 tableaux à traiter dans 3 clusters différents. Autrement dit, il faudra faire trois fois appel à `tab_multi_manager()`. +## Equations entre indicateurs + +Dans cet exemple on part d'un fichier de métadonnées présentant 12 tableaux à publier sur les chiffres d'affaires de ventes de pizzas et de salades. + +``` r +str(metadata_pizza_lettuce) +``` + +``` +## 'data.frame': 12 obs. of 9 variables: +## $ table_name : chr "T1" "T2" "T3" "T4" ... +## $ field : chr "france_entreprises_2023" "france_entreprises_2023" "france_entreprises_2023" "france_entreprises_2023" ... +## $ hrc_field : logi NA NA NA NA NA NA ... +## $ indicator : chr "to_pizza" "to_pizza" "to_pizza" "to_pizza" ... +## $ hrc_indicator : chr NA NA NA NA ... +## $ spanning_1 : chr "nuts2" "nuts3" "a10" "a10" ... +## $ hrc_spanning_1: chr "hrc_nuts" "hrc_nuts" "hrc_naf" "hrc_naf" ... +## $ spanning_2 : chr "size" "size" "nuts2" "nuts3" ... +## $ hrc_spanning_2: chr NA NA "hrc_nuts" "hrc_nuts" ... +``` + +Auquel on ajoute un dataframe décrivant les liens entre les indicateurs. + +``` r +liens_eq <- data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + + +Exemple de code : + +``` r +library(rtauargus) + +data(metadata_pizza_lettuce) + +# Analyse complète, avec les étapes +detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, + df_eq_indicator = liens_eq, + verbose = TRUE) +``` + +``` +## Warning in analyse_metadata(metadata_pizza_lettuce, df_eq_indicator = liens_eq, : For the variables part of equations specified in df_eq_indicator, +## the hrc_indicator column will be ignored. +``` + +``` +## Error in components(g_full): impossible de trouver la fonction "components" +``` + +``` r +# Output simplifié, uniquement le dataframe avec l'indicatrice de cluster +cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) +``` + +Ici, un warning nous prévient que la colonne `hrc_indicator` sera ignorée puisque l'on a précisé +les liens entre indicateurs avec l'argument `df_eq_indicator`. Ainsi, il ne faut pas mettre +des liens dans `hrc_indicator` lorsque l'on utilise l'argument `df_eq_indicator`. + +On obtient le même dataframe nous présentant la façon de traiter les tableaux pour la pose du secret +que dans le premier exemple. + + +``` r +cluster_id_dataframe +``` + +``` +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts +``` + # Pour aller plus loin : visualiser les inclusions L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, XXXXX est inclus dans XXXXX. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. @@ -287,12 +390,6 @@ library(igraph) ## as_data_frame, groups, union ``` -``` -## L'objet suivant est masqué depuis 'package:testthat': -## -## compare -``` - ``` ## Les objets suivants sont masqués depuis 'package:stats': ## diff --git a/vignettes/auto_metadata_fr.Rmd.orig b/vignettes/auto_metadata_fr.Rmd.orig index eedd1b6..f0c8159 100644 --- a/vignettes/auto_metadata_fr.Rmd.orig +++ b/vignettes/auto_metadata_fr.Rmd.orig @@ -6,10 +6,10 @@ output: toc: true toc_depth: 3 vignette: > - %\VignetteIndexEntry{Analyse automatique des métadonnées} +%\VignetteIndexEntry{Analyse automatique des métadonnées} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} ---- + --- ```{r setup, include=FALSE} @@ -54,8 +54,16 @@ args(analyse_metadata) Les arguments de la fonction sont les suivants : - `df_metadata` : un dataframe structuré comme précisé dans la section \ref{sec:crea_meta}. +- `df_eq_indicator` : un dataframe regroupant les équations entre les indicateurs. - `verbose` : une variable de type `logical`, si `TRUE` la fonction retourne une liste avec les différentes étapes de l'analyse, si `FALSE` elle retourne juste le dataframe final avec l'indicatrice de cluster. +Le dataframe `df_eq_indicator` est constitué de trois colonnes qui doivent respecter le format suivant : +```{r} +data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + ## Les arguments de `format_template` \label{sec:args_template} @@ -162,6 +170,47 @@ cluster_id_dataframe Finalement, il y a 6 tableaux à traiter dans 3 clusters différents. Autrement dit, il faudra faire trois fois appel à `tab_multi_manager()`. +## Equations entre indicateurs + +Dans cet exemple on part d'un fichier de métadonnées présentant 12 tableaux à publier sur les chiffres d'affaires de ventes de pizzas et de salades. +```{r} +str(metadata_pizza_lettuce) +``` + +Auquel on ajoute un dataframe décrivant les liens entre les indicateurs. +```{r} +liens_eq <- data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + + +Exemple de code : +```{r} +library(rtauargus) + +data(metadata_pizza_lettuce) + +# Analyse complète, avec les étapes +detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, + df_eq_indicator = liens_eq, + verbose = TRUE) + +# Output simplifié, uniquement le dataframe avec l'indicatrice de cluster +cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) +``` + +Ici, un warning nous prévient que la colonne `hrc_indicator` sera ignorée puisque l'on a précisé +les liens entre indicateurs avec l'argument `df_eq_indicator`. Ainsi, il ne faut pas mettre +des liens dans `hrc_indicator` lorsque l'on utilise l'argument `df_eq_indicator`. + +On obtient le même dataframe nous présentant la façon de traiter les tableaux pour la pose du secret +que dans le premier exemple. + +```{r} +cluster_id_dataframe +``` + # Pour aller plus loin : visualiser les inclusions L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, XXXXX est inclus dans XXXXX. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse.