diff --git a/DESCRIPTION b/DESCRIPTION index 62e61f0..323c53e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: rtauargus Type: Package Title: Using Tau-Argus from R Language: fr -Version: 1.3.1 +Version: 1.3.2 Depends: R (>= 3.5.0) Imports: purrr (>= 0.2), @@ -67,7 +67,7 @@ Description: Protects tables by calling the Tau-Argus software from R. License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 VignetteBuilder: knitr URL: https://inseefrlab.github.io/rtauargus, https://github.com/inseefrlab/rtauargus, diff --git a/NAMESPACE b/NAMESPACE index a39aae2..5fdf8a9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ export(from_5_to_3) export(grp_tab_in_cluster) export(grp_tab_names) export(identify_hrc) +export(identify_hrc_with_eq) export(import) export(length_tabs) export(micro_arb) @@ -59,6 +60,7 @@ importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,ungroup) +importFrom(dplyr,where) importFrom(igraph,graph_from_data_frame) importFrom(igraph,which_mutual) importFrom(lifecycle,badge) diff --git a/R/analyse_metadata.R b/R/analyse_metadata.R index 46b52c1..ec155a2 100644 --- a/R/analyse_metadata.R +++ b/R/analyse_metadata.R @@ -6,6 +6,7 @@ #' creating a structured output for further processing. #' #' @param df_metadata A dataframe containing metadata in wide format. +#' @param df_eq_indicator A datadframe containing the indicators equations if needed. #' @param verbose Logical. If `TRUE`, returns a detailed list of intermediate results #' from each processing step. If `FALSE`, returns only the cluster assignments. Defaults to `FALSE`. #' @@ -28,7 +29,7 @@ #' @details The function performs the following steps: #' \itemize{ #' \item Converts the metadata from wide format to long format using \code{wide_to_long}. -#' \item Identifies hierarchical relationships and renames variables with \code{identify_hrc}. +#' \item Identifies hierarchical relationships and renames variables with \code{identify_hrc} or \code{identify_hrc_with_eq}. #' \item Splits hierarchical relationships into clusters using \code{split_in_clusters}. #' \item Creates edges to describe the relationships via \code{create_edges}. #' \item Generates translation tables for regrouping with \code{grp_tab_names}. @@ -53,7 +54,7 @@ #' `r lifecycle::badge("experimental")` #' #' @export -analyse_metadata <- function(df_metadata,verbose = FALSE){ +analyse_metadata <- function(df_metadata,df_eq_indicator = NULL,verbose = FALSE){ # check that the input is in the right format: right column names check_column_names <- function(df) { # Expected fixed column names @@ -102,7 +103,14 @@ analyse_metadata <- function(df_metadata,verbose = FALSE){ # start of the actual analysis df_metadata_long <- wide_to_long(df_metadata) - list_hrc_identified <- identify_hrc(df_metadata_long) + if(is.null(df_eq_indicator)){ + list_hrc_identified <- identify_hrc(df_metadata_long) + }else{ + warning("For the variables part of equations specified in df_eq_indicator, + the hrc_indicator column will be ignored.") + list_hrc_identified <- identify_hrc_with_eq(df_metadata_long,df_eq_indicator) + } + list_split <- split_in_clusters(list_hrc_identified) list_desc_links <- create_edges(list_split) list_translation_tables <- grp_tab_names(list_desc_links) diff --git a/R/create_edges.R b/R/create_edges.R index 1464e7e..acbbb26 100644 --- a/R/create_edges.R +++ b/R/create_edges.R @@ -57,26 +57,43 @@ create_edges <- function(list_split){ small_tibble$spanning})) }) list_desc_links <- nested_crois %>% map(function(big_tibble) { - # Condition for clusters that only have one table + # Only compare the spanning variables for clusters that have more than 1 table if(length(big_tibble$table_name) > 1) { spannings_nom_tab <- combn(big_tibble$table_name, 2, FUN = list) - tab_to_keep <- map(spannings_nom_tab, function(crois) { + # Extract the first table (Ta) corresponding to the first table of the + # pair of tables we are studying Ta <- big_tibble %>% filter(table_name == crois[[1]]) + # Get the 'spanning' column for Ta crois_Ta <- Ta$spanning[[1]] + + # Extract the first table (Tb) corresponding to the second table of the + # pair of tables we are studying Tb <- big_tibble %>% filter(table_name == crois[[2]]) + # Get the 'spanning' column for Tb crois_Tb <- Tb$spanning[[1]] + + # Create an empty data frame to store origin → destination relationships df_origin_dest <- data.frame(from = character(), to = character(), stringsAsFactors = FALSE) + + # Check if all elements of Ta are included in Tb if(all(crois_Ta %in% crois_Tb)) { + # Add a row indicating that Ta can go to Tb df_origin_dest <- rbind(df_origin_dest, - data.frame(from = crois[[1]], to = crois[[2]],stringsAsFactors = FALSE)) + data.frame(from = crois[[1]], to = crois[[2]], stringsAsFactors = FALSE)) } + + # Check if all elements of Tb are included in Ta if(all(crois_Tb %in% crois_Ta)) { + # Add a row indicating that Tb can go to Ta df_origin_dest <- rbind(df_origin_dest, data.frame(from = crois[[2]], to = crois[[1]], stringsAsFactors = FALSE)) } + + # Return the data frame containing possible links return(df_origin_dest) }) + # Filter non empty tables and combine them tab_to_keep_compact <- tab_to_keep %>% purrr::discard(is.null) %>% diff --git a/R/format_template.R b/R/format_template.R index 06f73f1..ba48d0c 100644 --- a/R/format_template.R +++ b/R/format_template.R @@ -78,15 +78,27 @@ filter_on_marginal_of_spanning_var <- function(data, criteria, subset_keys) { ) # Create filter expressions for all other keys with != other_keys <- setdiff(names(criteria), subset_keys) - filter_expr_not_in <- purrr::map2( - other_keys, - criteria[other_keys], - ~ rlang::expr(!!rlang::sym(.x) != !!.y) - ) + if(all(other_keys == "")){ + # treat the eventuality of all spanning variables being crossed on their + # non total values + not_totals_all_spannings <- purrr::map2( + subset_keys, + criteria[subset_keys], + ~ rlang::expr(!!rlang::sym(.x) != !!.y) + ) + return(data %>% filter(!!!not_totals_all_spannings)) + } else { + filter_expr_not_in <- purrr::map2( + other_keys, + criteria[other_keys], + ~ rlang::expr(!!rlang::sym(.x) != !!.y) + ) + } + # Combine the two sets of expressions combined_filter_expr <- c(filter_expr_in, filter_expr_not_in) # Apply the combined filter - data %>% filter(!!!combined_filter_expr) %>% select(-!!subset_keys) + return(data %>% filter(!!!combined_filter_expr) %>% select(-!!subset_keys)) } #' Determines the tables described in a template gathering all the published cells @@ -94,7 +106,7 @@ filter_on_marginal_of_spanning_var <- function(data, criteria, subset_keys) { #' @param data template gathering all the published cells #' @param indicator_column name of the column in which the indicators are #' @param spanning_var_tot a named list of the spanning variables and their totals -#' @param field_columns vecotr of all the columns that are fields (ex: year of collect) +#' @param field_columns vector of all the columns that are fields (ex: year of collect) #' #' @return named list of a dataframe describing the tables (metadata) and a list of #' the modalities of each hierarchical variable (modalities) diff --git a/R/globals.R b/R/globals.R index 9f82e0b..24bc305 100644 --- a/R/globals.R +++ b/R/globals.R @@ -3,6 +3,7 @@ utils::globalVariables( c("table_name", "data","cluster","field","hrc_field","indicator","hrc_indicator", "n_unique","column","unique_modalities","from.eg","to.eg","from","to","mutual_full", "Group","table_eg","spanning","hrc_spanning","spanning_old","tab_inclus", - "starts_with","spanning_name","hrc_spanning_name", + "starts_with","spanning_name","hrc_spanning_name","eq_indicator","rhs","total","term_number", + "eq_name","unit","var","n_total","total_alt","group", ".") ) diff --git a/R/grp_tab_in_cluster.R b/R/grp_tab_in_cluster.R index 16bd9e8..92cc552 100644 --- a/R/grp_tab_in_cluster.R +++ b/R/grp_tab_in_cluster.R @@ -53,7 +53,7 @@ grp_tab_in_cluster <- function(list_split, list_translation_tables) { # Nest each cluster of tables by `table_name` nested_crois <- list_split %>% - purrr::map(function(tab) { tab %>% dplyr::group_by(table_name) %>% tidyr::nest() }) + purrr::map(function(tab) {tab %>% dplyr::group_by(table_name) %>% tidyr::nest() }) # Process each cluster using inclusion relationships purrr::map2(list_translation_tables, nested_crois, function(tab_to_keep, big_tibble) { @@ -66,7 +66,7 @@ grp_tab_in_cluster <- function(list_split, list_translation_tables) { # Process the cluster big_tibble_eg <- big_tibble %>% mutate( - spanning = map(data, function(small_tibble) { small_tibble$spanning }) + spanning = map(data, function(small_tibble) {small_tibble$spanning}) ) %>% dplyr::left_join(tab_to_keep[[2]], by = c("table_name" = "Original")) %>% mutate(table_eg = ifelse(is.na(Group), table_name, Group)) %>% @@ -84,6 +84,13 @@ grp_tab_in_cluster <- function(list_split, list_translation_tables) { ) }) ) %>% dplyr::rename(table_name = table_eg) + # adding the tables that are not included in each other in the cluster + if(length(big_tibble$table_name) != length(tab_to_keep$passage_nom_tab$Original)){ + tables_no_inclusion <- big_tibble %>% + filter(table_name %in% setdiff(big_tibble$table_name,tab_to_keep$passage_nom_tab$Original)) %>% + mutate(spanning = map(data, function(small_tibble) {small_tibble$spanning})) + big_tibble_eg <- bind_rows(tables_no_inclusion,big_tibble_eg) + } return(big_tibble_eg) } else { diff --git a/R/identify_hrc.R b/R/identify_hrc.R index 2ecd0dd..5638ebc 100644 --- a/R/identify_hrc.R +++ b/R/identify_hrc.R @@ -48,7 +48,7 @@ identify_hrc <- function(df_metadata_long){ table_name = df_spannings$table_name ) %>% unique() df_spannings <- df_spannings %>% select(-spanning_old) - if(all(is.na(df_spannings$hrc_indicator))){ # condition pour les hiérarchies sur les indicateurs + if(all(is.na(df_spannings$hrc_indicator))){ # condition for hierarchies on indicators df_indicators <- df_spannings return(list(df_indicators,df_variable_info)) } else { diff --git a/R/identify_hrc_with_eq.R b/R/identify_hrc_with_eq.R new file mode 100644 index 0000000..ff22b0f --- /dev/null +++ b/R/identify_hrc_with_eq.R @@ -0,0 +1,267 @@ +#' Rename variables based on their hierarchies and their equations +#' +#' This function renames variables in a long-format metadata data frame based on +#' their hierarchical groupings. Spanning variables are renamed using the name +#' of their hierarchy in uppercase, while response variables linked by an +#' equation (specified in the `hrc_indicator` column) are grouped together, and +#' a new grouping variable is created. +#' +#' @param df_metadata_long A data frame in long format with the following +#' required columns: +#' - `table_name`: Identifies the table. +#' - `field` : name of the field of the table. +#' - `indicator`: name of the indicator of the table. +#' - `hrc_indicator`: Specifies linked response variables. +#' - `spanning_*`, `hrc_spanning_*`: Spanning variables and their hierarchies. +#' @param df_eq_indicator A dataframe containing the equations on indicators with +#' the following required columns : +#' - `eq_name`: Name of the equation. +#' - `eq_indicator`: The equation for example, A = B + C. +#' - `unit`: The unit of the indicators in the equation. +#' +#' @return `list_hrc_identified`, a list with two elements: +#' - `df_indicators`: The updated data frame with renamed variables and grouped +#' response variables. +#' - `df_variable_info`: A data frame mapping original variable names +#' (`spanning_old`) to their renamed counterparts (`spanning`). +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' data(metadata_pizza_lettuce) +#' +#' metadata_pizza_lettuce_long <- wide_to_long(metadata_pizza_lettuce) +#' df_eq_ex <- data.frame( +#' eq_name = c("eq1"), +#' eq_indicator = c("ca_salades = ca_batavia + ca_mache"), +#' unit = c("EUR")) +#' +#' list_hrc_identified <- identify_hrc(metadata_pizza_lettuce_long, df_eq_ex) +#' +#' str(list_hrc_identified) +#' } +#' +#' @importFrom dplyr where +#' +identify_hrc_with_eq <- function(df_metadata_long,df_eq_indicator){ + # check that the input is in the right format: right column names + check_column_names <- function(df) { + # Expected fixed column names + fixed_columns <- c("eq_name", "eq_indicator", "unit") + + # Check that the fixed columns exist + if (!all(fixed_columns %in% names(df))) { + stop("Error: The dataframe describing the equations between indicators is + missing one or more required columns: eq_name, eq_indicator, unit.") + }} + check_column_names(df_eq_indicator) + + # 'parsed_equations' is a data frame where each equation from 'df_eq_indicator' + # is parsed into its left-hand side ('total') and right-hand side terms ('rhs1', 'rhs2', etc.), + # with each rhs term placed in a separate column. + parsed_equations <- df_eq_indicator %>% + tidyr::separate(eq_indicator, into = c("total", "rhs"), sep = "=", extra = "merge") %>% + dplyr::mutate(rhs = trimws(rhs)) %>% + tidyr::separate_rows(rhs, sep = "\\+") %>% + dplyr::mutate(rhs = trimws(rhs), + total = trimws(total)) %>% + dplyr::group_by(dplyr::across(-rhs)) %>% + dplyr::mutate(term_number = paste0("rhs", dplyr::row_number())) %>% + tidyr::pivot_wider(names_from = term_number, values_from = rhs) %>% + dplyr::ungroup() %>% + dplyr::select(eq_name, unit, total, everything()) + + # change to long format in order to join with df_metadata_long + equations_long <- parsed_equations %>% + mutate(across(c(total, starts_with("rhs")), trimws)) %>% + tidyr::pivot_longer( + cols = c(total, starts_with("rhs")), + names_to = "side", # côté équation (total / rhs1 / rhs2...) + values_to = "var" + ) %>% + filter(!is.na(var)) + + # Identify chained equations (A = B + C, B = D + E → group both equations together) + + # Build dependency links between totals and rhs + links <- parsed_equations %>% + tidyr::pivot_longer( + cols = starts_with("rhs"), + names_to = "rhs_term", + values_to = "rhs" + ) %>% + dplyr::filter(!is.na(rhs)) %>% + dplyr::mutate( + total = trimws(as.character(total)), + rhs = trimws(as.character(rhs)) + ) %>% + dplyr::distinct() + + # ---- 1) Identify ambiguous totals ---- + total_counts <- parsed_equations %>% dplyr::count(total, name = "n_total") + ambiguous_totals <- total_counts %>% dplyr::filter(n_total > 1) %>% pull(total) + + # ---- 2) Build a total -> total_alt mapping by eq_name ---- + # For all equations (ambiguous or not), create one row; + # for non-ambiguous totals, total_alt == total + alt_map <- parsed_equations %>% + dplyr::distinct(eq_name, total) %>% + dplyr::group_by(total) %>% + dplyr::arrange(eq_name) %>% # ordre stable + dplyr::mutate(alt_idx = dplyr::row_number(), + total_alt = dplyr::case_when( + dplyr::n() == 1 ~ total, + alt_idx == 1 ~ total, + TRUE ~ paste0(total, "_alt", alt_idx - 1) + ) + ) %>% + dplyr::ungroup() %>% + dplyr::select(eq_name, total, total_alt) + + # ---- 3) Apply the mapping to the links ---- + # 'links' contains total, rhs, eq_name (if not, it must be joined beforehand) + # here we assume links has an eq_name column; otherwise do + # left_join(links, parsed_equations %>% select(eq_name, total, rhs), ...) first + links_full <- links %>% + # replace total with its equation-specific alternative + left_join(alt_map, by = c("eq_name", "total")) %>% + mutate(total = dplyr::coalesce(total_alt, total)) %>% + select(-total_alt) %>% + # now replace rhs if it exists as a "total" in alt_map: + # we must choose the correct total_alt for rhs according to the equation + # where it plays the role of a total. + # to do so, join alt_map by mapping rhs -> total, keeping the alt + # corresponding to the SOURCE row eq_name. + left_join(alt_map, by = c("eq_name", "rhs" = "total")) %>% + mutate(rhs = dplyr::coalesce(total_alt, rhs)) %>% + select(total, rhs, eq_name) %>% + dplyr::distinct() + + # ---- 4) Build the full graph (including all copies) ---- + g_full <- graph_from_data_frame(links_full %>% select(total, rhs), directed = TRUE) + + # ---- 5) Compute components on g_full ---- + comp_full <- igraph::components(g_full)$membership + comp_df <- data.frame(var = names(comp_full), group = as.integer(comp_full), stringsAsFactors = FALSE) + + # ---- 6) Update equations_long: + # associate the alternative variable (if present) and the corresponding group ---- + # Notes: + # - equations_long contains the original variables (var) and eq_name; + # - we want to recover the "var" or "var_alt" version used in g_full. + equations_long_full <- equations_long %>% + # join the correspondence eq_name + var (original total) -> total_alt (if any) + left_join(alt_map, by = c("eq_name", "var" = "total")) %>% + mutate(var_mapped = dplyr::coalesce(total_alt, var)) %>% + select(-total_alt) %>% + # join the group computed on the full graph + left_join(comp_df, by = c("var_mapped" = "var")) %>% + # for var_mapped without a group (isolated), keep NA or assign a single group + mutate(group = as.integer(group)) + + # 'df_spannings' is a modified version of 'df_metadata_long' where: + # - 'spanning' is replaced by its uppercase hierarchical version if available, + # - 'indicator' is replaced by its uppercase hierarchical version + # (without the 'hrc_' prefix) if available. + df_spannings <- df_metadata_long %>% + mutate(spanning_old = spanning) %>% + mutate(spanning = ifelse(is.na(hrc_spanning), + spanning, + toupper(hrc_spanning))) %>% + mutate(indicator = ifelse(is.na(hrc_indicator), + indicator, + toupper(sub("hrc_","",hrc_indicator)))) + + # 'df_variable_info' is a reference table linking original spanning names ('spanning_old') + # to their transformed counterparts ('spanning'), along with the corresponding table name. + df_variable_info <- data.frame( + var_start_name = df_spannings$spanning_old, + var_end_name = df_spannings$spanning, + table_name = df_spannings$table_name + ) %>% unique() + + # Update 'df_spannings' by removing the temporary 'spanning_old' column. + df_spannings <- df_spannings %>% select(-spanning_old) + + df_spannings_eq <- df_spannings %>% + # delete all the non-word elements, specifically for the white spaces + mutate(across(dplyr::where(is.character), ~ gsub("[^[:alnum:]_]", "", .))) %>% + left_join(equations_long_full, by = c("indicator" = "var")) + + df_eq_initial_spannings <- df_spannings_eq %>% + filter(!is.na(eq_name)) %>% + group_by(group) %>% + dplyr::reframe( + table_name = paste(unique(table_name), collapse = "."), + field = last(field), + hrc_field = last(hrc_field), + spanning = spanning, + hrc_spanning = hrc_spanning, + indicator = last(unit), + hrc_indicator = last(hrc_indicator) + ) %>% unique() + + # 'df_eq_indicator_spannings' defines the spanning information for equation indicators. + # Each equation name is transformed into its uppercase form with a "^h" suffix, + # and its hierarchical version prefixed with "hrc_". + df_eq_indicator_spannings <- df_spannings_eq %>% + filter(!is.na(eq_name)) %>% + group_by(group) %>% + summarise( + table_name = paste(unique(table_name), collapse = "."), + field = last(field), + hrc_field = last(hrc_field), + spanning = if(length(unique(eq_name)) > 1) { + paste0(paste0(unique(toupper(eq_name)), collapse = "_"), "^h") + } else { + paste0(toupper(last(eq_name)), "^h") + }, + hrc_spanning = if(length(unique(eq_name)) > 1) { + paste0("hrc_", paste0(unique(toupper(eq_name)), collapse = "_")) + } else { + paste0("hrc_", toupper(last(eq_name))) + }, + indicator = last(unit), + hrc_indicator = last(hrc_indicator), + .groups = "drop" + ) + + # 'df_indicators' combines both initial and indicator spanning information + # into a single harmonized dataset, keeping key structural columns + # and sorting rows by table name. + df_indicators <- bind_rows(df_eq_initial_spannings,df_eq_indicator_spannings) %>% + select(table_name,field,hrc_field,indicator,hrc_indicator,everything()) %>% + arrange(table_name) + + # 'df_no_eq_spannings' contains all spanning rows + # that are not associated with any equation (eq_name is missing). + df_no_eq_spannings <- df_spannings_eq %>% filter(is.na(eq_name)) + + if(nrow(df_no_eq_spannings) > 0){ + if(all(is.na(df_no_eq_spannings$hrc_indicator))){ + df_indicators <- bind_rows(df_indicators,df_no_eq_spannings) %>% arrange(table_name) + return(list(df_indicators,df_variable_info)) + } else { + df_no_eq_indicators <- df_no_eq_spannings %>% + filter(!is.na(hrc_indicator)) %>% + dplyr::group_by(table_name) %>% + summarise( + field = last(field), + hrc_field = last(hrc_field), + spanning = paste0(toupper(last(hrc_indicator)),"^h"), + hrc_spanning = last(hrc_indicator), + indicator = last(indicator), + hrc_indicator = last(hrc_indicator) + ) %>% + bind_rows(df_spannings, .) %>% + arrange(table_name) + df_indicators <- bind_rows(df_indicators,df_no_eq_indicators) %>% arrange(table_name) + list_hrc_identified = list(df_indicators,df_variable_info) + return(list_hrc_identified) + } + } else { + list_hrc_identified = list(df_indicators,df_variable_info) + return(list_hrc_identified) + } +} diff --git a/man/analyse_metadata.Rd b/man/analyse_metadata.Rd index 9179a2b..216e9ef 100644 --- a/man/analyse_metadata.Rd +++ b/man/analyse_metadata.Rd @@ -4,11 +4,13 @@ \alias{analyse_metadata} \title{Analyse Metadata of Tables Needing Secondary Tabular Data Protection} \usage{ -analyse_metadata(df_metadata, verbose = FALSE) +analyse_metadata(df_metadata, df_eq_indicator = NULL, verbose = FALSE) } \arguments{ \item{df_metadata}{A dataframe containing metadata in wide format.} +\item{df_eq_indicator}{A datadframe containing the indicators equations if needed.} + \item{verbose}{Logical. If \code{TRUE}, returns a detailed list of intermediate results from each processing step. If \code{FALSE}, returns only the cluster assignments. Defaults to \code{FALSE}.} } @@ -41,7 +43,7 @@ creating a structured output for further processing. The function performs the following steps: \itemize{ \item Converts the metadata from wide format to long format using \code{wide_to_long}. -\item Identifies hierarchical relationships and renames variables with \code{identify_hrc}. +\item Identifies hierarchical relationships and renames variables with \code{identify_hrc} or \code{identify_hrc_with_eq}. \item Splits hierarchical relationships into clusters using \code{split_in_clusters}. \item Creates edges to describe the relationships via \code{create_edges}. \item Generates translation tables for regrouping with \code{grp_tab_names}. diff --git a/man/format_template.Rd b/man/format_template.Rd index 16aa90e..89df820 100644 --- a/man/format_template.Rd +++ b/man/format_template.Rd @@ -13,7 +13,7 @@ format_template(data, indicator_column, spanning_var_tot, field_columns) \item{spanning_var_tot}{a named list of the spanning variables and their totals} -\item{field_columns}{vecotr of all the columns that are fields (ex: year of collect)} +\item{field_columns}{vector of all the columns that are fields (ex: year of collect)} } \value{ named list of a dataframe describing the tables (metadata) and a list of diff --git a/man/identify_hrc_with_eq.Rd b/man/identify_hrc_with_eq.Rd new file mode 100644 index 0000000..4aa0cbf --- /dev/null +++ b/man/identify_hrc_with_eq.Rd @@ -0,0 +1,59 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/identify_hrc_with_eq.R +\name{identify_hrc_with_eq} +\alias{identify_hrc_with_eq} +\title{Rename variables based on their hierarchies and their equations} +\usage{ +identify_hrc_with_eq(df_metadata_long, df_eq_indicator) +} +\arguments{ +\item{df_metadata_long}{A data frame in long format with the following +required columns: +\itemize{ +\item \code{table_name}: Identifies the table. +\item \code{field} : name of the field of the table. +\item \code{indicator}: name of the indicator of the table. +\item \code{hrc_indicator}: Specifies linked response variables. +\item \verb{spanning_*}, \verb{hrc_spanning_*}: Spanning variables and their hierarchies. +}} + +\item{df_eq_indicator}{A dataframe containing the equations on indicators with +the following required columns : +\itemize{ +\item \code{eq_name}: Name of the equation. +\item \code{eq_indicator}: The equation for example, A = B + C. +\item \code{unit}: The unit of the indicators in the equation. +}} +} +\value{ +\code{list_hrc_identified}, a list with two elements: +\itemize{ +\item \code{df_indicators}: The updated data frame with renamed variables and grouped +response variables. +\item \code{df_variable_info}: A data frame mapping original variable names +(\code{spanning_old}) to their renamed counterparts (\code{spanning}). +} +} +\description{ +This function renames variables in a long-format metadata data frame based on +their hierarchical groupings. Spanning variables are renamed using the name +of their hierarchy in uppercase, while response variables linked by an +equation (specified in the \code{hrc_indicator} column) are grouped together, and +a new grouping variable is created. +} +\examples{ +\dontrun{ +data(metadata_pizza_lettuce) + +metadata_pizza_lettuce_long <- wide_to_long(metadata_pizza_lettuce) + df_eq_ex <- data.frame( + eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) + +list_hrc_identified <- identify_hrc(metadata_pizza_lettuce_long, df_eq_ex) + +str(list_hrc_identified) +} + +} diff --git a/tests/testthat/test_analyse_metadata.R b/tests/testthat/test_analyse_metadata.R index c699ae2..a075c40 100644 --- a/tests/testthat/test_analyse_metadata.R +++ b/tests/testthat/test_analyse_metadata.R @@ -1,16 +1,31 @@ # test data -------------------------------------------------------------------- data(metadata_pizza_lettuce) - baseline_res <- analyse_metadata(metadata_pizza_lettuce) +df_eq_ex <- data.frame( + eq_name = c("eq1", "eq2", "eq3", "eq4"), + eq_indicator = c("A = B + C", "A = D + E", "D = F + G", "Z = X + Y + S"), + unit = c("EUR", "EUR", "EUR", "EUR"), + stringsAsFactors = FALSE +) +df_meta_for_eq <- data.frame( + table_name = c("Ta","Tb","Tc","Td","Te","Tf","Tg","Tz","Tx","Ty","Ts"), + field = NA, + hrc_field = NA, + indicator = c("A","B","C","D","E","F","G","Z","X","Y","S"), + hrc_indicator = NA, + spanning_1 = "age_class", + hrc_spanning_1 = NA +) + ################################################################### INPUT CHECKS # check that an error is returned if the names of the columns don't respect the # expected format -------------------------------------------------------------- test_that("error message for wrong column name - fixed columns",{ meta <- metadata_pizza_lettuce %>% rename(table = table_name) - expect_error(analyse_metadata(meta),"The dataframe is missing one or more required columns") + expect_error(analyse_metadata(meta),"one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator") }) test_that("error message for wrong column name - dynamic columns",{ @@ -28,7 +43,6 @@ test_that("unique table name", { expect_error(analyse_metadata(meta),"Duplicate values found in 'table_name'") }) - # check that each table is named (table_name column) --------------------------- test_that("error message when some tables are not named", { @@ -38,6 +52,25 @@ test_that("error message when some tables are not named", { } ) +# check column names of df_eq_indicator +test_that("error message for wrong column name - df_eq_indicator", { + df_eq_ex_modif <- df_eq_ex %>% rename(eq_indicators = eq_indicator) + + expect_error( + suppressWarnings(analyse_metadata(df_meta_for_eq, df_eq_ex_modif)), + "one or more required columns: eq_name, eq_indicator, unit" + ) +}) + +# check that the user will be warned the hrc_indicator column will be ignored if +# df_eq_indicator is used ----------------------------------------------------- +test_that("warning when hrc_indicator is ignored because df_eq_indicator used", { + expect_warning( + analyse_metadata(df_meta_for_eq, df_eq_ex), + "the hrc_indicator column will be ignored" + ) +}) + ##################################################################### HRC CHECKS # check that hierarchies on indicators are handled properly -------------------- answer <- data.frame( @@ -97,7 +130,7 @@ answer <- data.frame( test_that("one-way table inclusion", { meta <- metadata_pizza_lettuce %>% filter(table_name == "T1") %>% - mutate(hrc_spanning_1 = as.character(NA)) %>% + mutate(hrc_spanning_1 = as.character(NA)) %>% bind_rows(.,.) %>% mutate( table_name = c("T1","T1_bis"), @@ -125,3 +158,48 @@ test_that("hierarchies on indicators", { expect_equal(analyse_metadata(meta),answer) } ) + +##################################################### INDICATOR EQUATIONS CHECKS +# all the spanning variables are taken into account when using equations on +# indicators ------------------------------------------------------------------- +answer <- data.frame( + cluster = c( + "france_entreprises_2023.hrc_lettuce", + "france_entreprises_2023.hrc_lettuce", + "france_entreprises_2023.to_pizza", + "france_entreprises_2023.to_pizza" + ), + table_name = c( + "T10.T12.T8", + "T11.T7.T9", + "T1.T2", + "T3.T4.T5.T6" + ), + field = rep("france_entreprises_2023", 4), + indicator = c("LETTUCE", "LETTUCE", "to_pizza", "to_pizza"), + spanning_1 = c("HRC_NAF", "HRC_NAF", "HRC_NUTS", "HRC_NAF"), + spanning_2 = c("cj", "size", "size", "HRC_NUTS"), + spanning_3 = c("HRC_LETTUCE^h", "HRC_LETTUCE^h", NA, NA), + hrc_spanning_1 = c("hrc_naf", "hrc_naf", "hrc_nuts", "hrc_naf"), + hrc_spanning_2 = c(NA, NA, NA, "hrc_nuts"), + hrc_spanning_3 = c("hrc_lettuce", "hrc_lettuce", NA, NA) +) + +test_that("indicators equation", { + df_eq_ex <- data.frame( + eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR"), + stringsAsFactors = FALSE + ) + + expect_warning( + expect_equal( + analyse_metadata(df_metadata = metadata_pizza_lettuce,df_eq_indicator = df_eq_ex), + answer + ), + "hrc_indicator column will be ignored" + ) + +} +) diff --git a/tests/testthat/test_format_template.R b/tests/testthat/test_format_template.R index 8d0b792..86b40aa 100644 --- a/tests/testthat/test_format_template.R +++ b/tests/testthat/test_format_template.R @@ -47,3 +47,29 @@ test_that("only one spanning variable", { expect_equal(length(spanning_vars), 1) }) +# check that it takes into acocunt tables that cross all spanning variables +test_that("only one spanning variable", { + # add lines where all spanning variables are crossed + template <- enterprise_template %>% + # We're only doing it for the "B" and NUMBER_EMPL crossings + filter(ACTIVITY == "B", NUMBER_EMPL != "_T") %>% + select(-LEGAL_FORM) %>% + tidyr::crossing(LEGAL_FORM = c("LL", "PA", "SP")) %>% + bind_rows(enterprise_template) + + result <- format_template( + data = template, + indicator_column = "INDICATOR", + spanning_var_tot = list(ACTIVITY = "BTSXO_S94", NUMBER_EMPL = "_T", LEGAL_FORM = "_T"), + field_columns = c("TIME_PERIOD") + )$metadata + + # select all the columns starting with "spanning_" + spanning_vars <- grep("^spanning_", names(result), value = TRUE) + + # check there is three variables + expect_equal(length(spanning_vars), 3) +}) + + + diff --git a/vignettes/auto_metadata.Rmd b/vignettes/auto_metadata.Rmd index 58891e4..32ccc02 100644 --- a/vignettes/auto_metadata.Rmd +++ b/vignettes/auto_metadata.Rmd @@ -52,15 +52,30 @@ args(analyse_metadata) ``` ``` -## function (df_metadata, verbose = FALSE) +## function (df_metadata, df_eq_indicator = NULL, verbose = FALSE) ## NULL ``` The function arguments are as follows: - `df_metadata`: a dataframe structured as specified in section \ref{sec:crea_meta}. +- `df_eq_indicator` : un dataframe regroupant les équations entre les indicateurs. - `verbose`: a variable of type `logical`. If `TRUE` he function returns a list with the different steps of the analysis; if `FALSE`, it returns only the final dataframe with the cluster indicator. +The `df_eq_indicator` data frame consists of three columns that must follow the format below: + +``` r +data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + +``` +## eq_name eq_indicator unit +## 1 eq1 ca_salades = ca_batavia + ca_mache EUR +``` + + ## The Arguments of `format_template` \label{sec:args_template} @@ -74,12 +89,12 @@ args(format_template) ## NULL ``` -Les arguments de la fonction sont les suivants : +The function arguments are as follows: -- `data` : dataframe du template Eurostat contenant toutes les cellules publiées. -- `indicator_column` : nom de la colonne dans laquelle se trouvent les indicateurs. -- `spanning_var_tot` : liste nommée des variables de croisements et de leurs totaux. -- `field_columns` : vecteur de toutes les colonnes représentant des champs (ex : millésime). +- `data`: the Eurostat template data frame containing all published cells. +- `indicator_column`: the name of the column containing the indicators. +- `spanning_var_tot`: a named list of the crossing variables and their totals. +- `field_columns`: a vector of all columns representing fields (e.g., vintage/year). # Examples @@ -126,14 +141,16 @@ cluster_id_dataframe ``` ``` -## # A tibble: 4 x 10 -## # Groups: table_name [4] -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 hrc_spanning_3 -## -## 1 france_entrep~ T10.T12.T8 fran~ LETTUCE HRC_NAF cj HRC_LETTU~ hrc_naf hrc_lettuce -## 2 france_entrep~ T11.T7.T9 fran~ LETTUCE HRC_NAF size HRC_LETTU~ hrc_naf hrc_lettuce -## 3 france_entrep~ T1.T2 fran~ to_pizza HRC_NUTS size hrc_nuts -## 4 france_entrep~ T3.T4.T5.~ fran~ to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts ``` For the 12 tables to be published, it is sufficient to protect 4 tables. These tables are distributed across two different clusters. Therefore, `tab_multi_manager()` needs to be called twice. @@ -147,8 +164,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" -## [6] "grp_tab_in_clusters" "tab_to_treat" "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" +## [7] "tab_to_treat" "df_tab_to_treat" ``` One finds the dataframe with the cluster indicator `df_tab_to_treat`. The result is the same but in list format: each element of the list is an independent cluster `tab_to_treat`. Additionally, the 6 steps of the analysis are included. @@ -184,7 +201,14 @@ template_formatted <- format_template( LEGAL_FORM = "_T"), field_columns = c("TIME_PERIOD") ) +``` +``` +## treating the field 2021 +## treating the field 2022 +``` + +``` r template_formatted$metadata ``` @@ -230,29 +254,131 @@ Next, this dataframe is used as input for the analysis function. ``` r # Analyse complète, avec les étapes detailed_analysis <- analyse_metadata(metadata_template, verbose = TRUE) +``` +``` +## Error in check_column_names(df_metadata): Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. +``` + +``` r # Output simplifié, uniquement le dataframe avec l'indicatrice de cluster cluster_id_dataframe <- analyse_metadata(metadata_template, verbose = FALSE) +``` +``` +## Error in check_column_names(df_metadata): Error: The dataframe is missing one or more required columns: table_name, field, hrc_field, indicator, hrc_indicator. +``` + +``` r # visualisation du résultat de l'analyse cluster_id_dataframe ``` ``` -## # A tibble: 6 x 8 -## # Groups: table_name [6] -## cluster table_name field indicator spanning_1 spanning_2 hrc_spanning_1 hrc_spanning_2 -## -## 1 2021.SAL_DTH table_2021_SAL_DTH_1 2021 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 -## 2 2021.SAL_DTH table_2021_SAL_DTH_2 2021 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 -## 3 2022.SAL table_2022_SAL_1 2022 SAL HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 -## 4 2022.SAL table_2022_SAL_2 2022 SAL HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 -## 5 2022.SAL_DTH table_2022_SAL_DTH_1 2022 SAL_DTH HRC_ACTIVITY_131 HRC_LEGAL_FORM_3 hrc_activity_131 hrc_legal_form_3 -## 6 2022.SAL_DTH table_2022_SAL_DTH_2 2022 SAL_DTH HRC_ACTIVITY_131 HRC_NUMBER_EMPL_4 hrc_activity_131 hrc_number_empl_4 +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts ``` Ultimately, there are 6 tables to process in 3 different clusters. In other words, `tab_multi_manager()` will need to be called three times. +## Equations between indicators + +In this example, we start from a metadata file describing 12 tables to be published on the turnover from pizza and salad sales. + + +``` r +str(metadata_pizza_lettuce) +``` + +``` +## 'data.frame': 12 obs. of 9 variables: +## $ table_name : chr "T1" "T2" "T3" "T4" ... +## $ field : chr "france_entreprises_2023" "france_entreprises_2023" "france_entreprises_2023" "france_entreprises_2023" ... +## $ hrc_field : logi NA NA NA NA NA NA ... +## $ indicator : chr "to_pizza" "to_pizza" "to_pizza" "to_pizza" ... +## $ hrc_indicator : chr NA NA NA NA ... +## $ spanning_1 : chr "nuts2" "nuts3" "a10" "a10" ... +## $ hrc_spanning_1: chr "hrc_nuts" "hrc_nuts" "hrc_naf" "hrc_naf" ... +## $ spanning_2 : chr "size" "size" "nuts2" "nuts3" ... +## $ hrc_spanning_2: chr NA NA "hrc_nuts" "hrc_nuts" ... +``` + +This time a data frame describing the relationships between indicators is added. + +``` r +liens_eq <- data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + + +Example code : + +``` r +library(rtauargus) + +data(metadata_pizza_lettuce) + +# Full analysis, with intermediate steps +detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, + df_eq_indicator = liens_eq, + verbose = TRUE) +``` + +``` +## Warning in analyse_metadata(metadata_pizza_lettuce, df_eq_indicator = liens_eq, : For the variables part of equations specified in df_eq_indicator, +## the hrc_indicator column will be ignored. +``` + +``` +## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0. +## i Please use `reframe()` instead. +## i When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust +## accordingly. +## i The deprecated feature was likely used in the rtauargus package. +## Please report the issue at . +## This warning is displayed once every 8 hours. +## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated. +``` + +``` r +# Simplified output, only the data frame with the cluster indicator +cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) +``` + +Here, a warning informs us that the `hrc_indicator` column will be ignored since the +relationships between indicators have been specified using the `df_eq_indicator` argument. +Therefore, links should not be defined in `hrc_indicator` when using the `df_eq_indicator` argument. + +The result is the same as the first example. + + +``` r +cluster_id_dataframe +``` + +``` +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts +``` + + # Going Further: Visualizing Inclusions The create_edges step in the metadata analysis identifies tables included within other tables. For example, XXXXX is included in XXXXX. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. diff --git a/vignettes/auto_metadata.Rmd.orig b/vignettes/auto_metadata.Rmd.orig index f9e614f..d7a9d3f 100644 --- a/vignettes/auto_metadata.Rmd.orig +++ b/vignettes/auto_metadata.Rmd.orig @@ -54,8 +54,17 @@ args(analyse_metadata) The function arguments are as follows: - `df_metadata`: a dataframe structured as specified in section \ref{sec:crea_meta}. +- `df_eq_indicator` : un dataframe regroupant les équations entre les indicateurs. - `verbose`: a variable of type `logical`. If `TRUE` he function returns a list with the different steps of the analysis; if `FALSE`, it returns only the final dataframe with the cluster indicator. +The `df_eq_indicator` data frame consists of three columns that must follow the format below: +```{r} +data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + + ## The Arguments of `format_template` \label{sec:args_template} @@ -63,12 +72,12 @@ The function arguments are as follows: args(format_template) ``` -Les arguments de la fonction sont les suivants : +The function arguments are as follows: -- `data` : dataframe du template Eurostat contenant toutes les cellules publiées. -- `indicator_column` : nom de la colonne dans laquelle se trouvent les indicateurs. -- `spanning_var_tot` : liste nommée des variables de croisements et de leurs totaux. -- `field_columns` : vecteur de toutes les colonnes représentant des champs (ex : millésime). +- `data`: the Eurostat template data frame containing all published cells. +- `indicator_column`: the name of the column containing the indicators. +- `spanning_var_tot`: a named list of the crossing variables and their totals. +- `field_columns`: a vector of all columns representing fields (e.g., vintage/year). # Examples @@ -162,6 +171,48 @@ cluster_id_dataframe Ultimately, there are 6 tables to process in 3 different clusters. In other words, `tab_multi_manager()` will need to be called three times. +## Equations between indicators + +In this example, we start from a metadata file describing 12 tables to be published on the turnover from pizza and salad sales. + +```{r} +str(metadata_pizza_lettuce) +``` + +This time a data frame describing the relationships between indicators is added. +```{r} +liens_eq <- data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + + +Example code : +```{r} +library(rtauargus) + +data(metadata_pizza_lettuce) + +# Full analysis, with intermediate steps +detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, + df_eq_indicator = liens_eq, + verbose = TRUE) + +# Simplified output, only the data frame with the cluster indicator +cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) +``` + +Here, a warning informs us that the `hrc_indicator` column will be ignored since the +relationships between indicators have been specified using the `df_eq_indicator` argument. +Therefore, links should not be defined in `hrc_indicator` when using the `df_eq_indicator` argument. + +The result is the same as the first example. + +```{r} +cluster_id_dataframe +``` + + # Going Further: Visualizing Inclusions The create_edges step in the metadata analysis identifies tables included within other tables. For example, XXXXX is included in XXXXX. The following code allows visualizing these inclusions using graphs to better understand the analysis procedure. diff --git a/vignettes/auto_metadata_fr.Rmd b/vignettes/auto_metadata_fr.Rmd index 5c70a69..d98f6dd 100644 --- a/vignettes/auto_metadata_fr.Rmd +++ b/vignettes/auto_metadata_fr.Rmd @@ -6,10 +6,10 @@ output: toc: true toc_depth: 3 vignette: > - %\VignetteIndexEntry{Analyse automatique des métadonnées} +%\VignetteIndexEntry{Analyse automatique des métadonnées} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} ---- + --- @@ -52,15 +52,29 @@ args(analyse_metadata) ``` ``` -## function (df_metadata, verbose = FALSE) +## function (df_metadata, df_eq_indicator = NULL, verbose = FALSE) ## NULL ``` Les arguments de la fonction sont les suivants : - `df_metadata` : un dataframe structuré comme précisé dans la section \ref{sec:crea_meta}. +- `df_eq_indicator` : un dataframe regroupant les équations entre les indicateurs. - `verbose` : une variable de type `logical`, si `TRUE` la fonction retourne une liste avec les différentes étapes de l'analyse, si `FALSE` elle retourne juste le dataframe final avec l'indicatrice de cluster. +Le dataframe `df_eq_indicator` est constitué de trois colonnes qui doivent respecter le format suivant : + +``` r +data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + +``` +## eq_name eq_indicator unit +## 1 eq1 ca_salades = ca_batavia + ca_mache EUR +``` + ## Les arguments de `format_template` \label{sec:args_template} @@ -126,16 +140,16 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts -## hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts ``` Pour les 12 tableaux à publier il suffit de protéger 4 tableaux. Ces tableaux sont repartis dans deux clusters différents. Il faudra donc faire appel deux fois à `tab_multi_manager()`. @@ -149,8 +163,8 @@ names(detailed_analysis) ``` ``` -## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" "tab_to_treat" -## [8] "df_tab_to_treat" +## [1] "identify_hrc" "info_var" "split_in_clusters" "create_edges" "grp_tab_names" "grp_tab_in_clusters" +## [7] "tab_to_treat" "df_tab_to_treat" ``` On retrouve le dataframe avec l'indicatrice du cluster `df_tab_to_treat`. On a le même résultat mais en format liste : chaque élément de la sous-liste `tab_to_treat` est un cluster indépendant. Les autres éléments de la liste sont les étapes de l'analyse. @@ -186,7 +200,14 @@ template_formatted <- format_template( LEGAL_FORM = "_T"), field_columns = c("TIME_PERIOD") ) +``` +``` +## treating the field 2021 +## treating the field 2022 +``` + +``` r template_formatted$metadata ``` @@ -253,20 +274,102 @@ cluster_id_dataframe ``` ``` -## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 hrc_spanning_2 -## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf -## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf -## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts -## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf hrc_nuts -## hrc_spanning_3 -## 1 hrc_lettuce -## 2 hrc_lettuce -## 3 -## 4 +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts ``` Finalement, il y a 6 tableaux à traiter dans 3 clusters différents. Autrement dit, il faudra faire trois fois appel à `tab_multi_manager()`. +## Equations entre indicateurs + +Dans cet exemple on part d'un fichier de métadonnées présentant 12 tableaux à publier sur les chiffres d'affaires de ventes de pizzas et de salades. + +``` r +str(metadata_pizza_lettuce) +``` + +``` +## 'data.frame': 12 obs. of 9 variables: +## $ table_name : chr "T1" "T2" "T3" "T4" ... +## $ field : chr "france_entreprises_2023" "france_entreprises_2023" "france_entreprises_2023" "france_entreprises_2023" ... +## $ hrc_field : logi NA NA NA NA NA NA ... +## $ indicator : chr "to_pizza" "to_pizza" "to_pizza" "to_pizza" ... +## $ hrc_indicator : chr NA NA NA NA ... +## $ spanning_1 : chr "nuts2" "nuts3" "a10" "a10" ... +## $ hrc_spanning_1: chr "hrc_nuts" "hrc_nuts" "hrc_naf" "hrc_naf" ... +## $ spanning_2 : chr "size" "size" "nuts2" "nuts3" ... +## $ hrc_spanning_2: chr NA NA "hrc_nuts" "hrc_nuts" ... +``` + +Auquel on ajoute un dataframe décrivant les liens entre les indicateurs. + +``` r +liens_eq <- data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + + +Exemple de code : + +``` r +library(rtauargus) + +data(metadata_pizza_lettuce) + +# Analyse complète, avec les étapes +detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, + df_eq_indicator = liens_eq, + verbose = TRUE) +``` + +``` +## Warning in analyse_metadata(metadata_pizza_lettuce, df_eq_indicator = liens_eq, : For the variables part of equations specified in df_eq_indicator, +## the hrc_indicator column will be ignored. +``` + +``` +## Error in components(g_full): impossible de trouver la fonction "components" +``` + +``` r +# Output simplifié, uniquement le dataframe avec l'indicatrice de cluster +cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) +``` + +Ici, un warning nous prévient que la colonne `hrc_indicator` sera ignorée puisque l'on a précisé +les liens entre indicateurs avec l'argument `df_eq_indicator`. Ainsi, il ne faut pas mettre +des liens dans `hrc_indicator` lorsque l'on utilise l'argument `df_eq_indicator`. + +On obtient le même dataframe nous présentant la façon de traiter les tableaux pour la pose du secret +que dans le premier exemple. + + +``` r +cluster_id_dataframe +``` + +``` +## cluster table_name field indicator spanning_1 spanning_2 spanning_3 hrc_spanning_1 +## 1 france_entreprises_2023.hrc_lettuce T10.T12.T8 france_entreprises_2023 LETTUCE HRC_NAF cj HRC_LETTUCE^h hrc_naf +## 2 france_entreprises_2023.hrc_lettuce T11.T7.T9 france_entreprises_2023 LETTUCE HRC_NAF size HRC_LETTUCE^h hrc_naf +## 3 france_entreprises_2023.to_pizza T1.T2 france_entreprises_2023 to_pizza HRC_NUTS size hrc_nuts +## 4 france_entreprises_2023.to_pizza T3.T4.T5.T6 france_entreprises_2023 to_pizza HRC_NAF HRC_NUTS hrc_naf +## hrc_spanning_2 hrc_spanning_3 +## 1 hrc_lettuce +## 2 hrc_lettuce +## 3 +## 4 hrc_nuts +``` + # Pour aller plus loin : visualiser les inclusions L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, XXXXX est inclus dans XXXXX. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse. @@ -287,12 +390,6 @@ library(igraph) ## as_data_frame, groups, union ``` -``` -## L'objet suivant est masqué depuis 'package:testthat': -## -## compare -``` - ``` ## Les objets suivants sont masqués depuis 'package:stats': ## diff --git a/vignettes/auto_metadata_fr.Rmd.orig b/vignettes/auto_metadata_fr.Rmd.orig index eedd1b6..f0c8159 100644 --- a/vignettes/auto_metadata_fr.Rmd.orig +++ b/vignettes/auto_metadata_fr.Rmd.orig @@ -6,10 +6,10 @@ output: toc: true toc_depth: 3 vignette: > - %\VignetteIndexEntry{Analyse automatique des métadonnées} +%\VignetteIndexEntry{Analyse automatique des métadonnées} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} ---- + --- ```{r setup, include=FALSE} @@ -54,8 +54,16 @@ args(analyse_metadata) Les arguments de la fonction sont les suivants : - `df_metadata` : un dataframe structuré comme précisé dans la section \ref{sec:crea_meta}. +- `df_eq_indicator` : un dataframe regroupant les équations entre les indicateurs. - `verbose` : une variable de type `logical`, si `TRUE` la fonction retourne une liste avec les différentes étapes de l'analyse, si `FALSE` elle retourne juste le dataframe final avec l'indicatrice de cluster. +Le dataframe `df_eq_indicator` est constitué de trois colonnes qui doivent respecter le format suivant : +```{r} +data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + ## Les arguments de `format_template` \label{sec:args_template} @@ -162,6 +170,47 @@ cluster_id_dataframe Finalement, il y a 6 tableaux à traiter dans 3 clusters différents. Autrement dit, il faudra faire trois fois appel à `tab_multi_manager()`. +## Equations entre indicateurs + +Dans cet exemple on part d'un fichier de métadonnées présentant 12 tableaux à publier sur les chiffres d'affaires de ventes de pizzas et de salades. +```{r} +str(metadata_pizza_lettuce) +``` + +Auquel on ajoute un dataframe décrivant les liens entre les indicateurs. +```{r} +liens_eq <- data.frame(eq_name = c("eq1"), + eq_indicator = c("ca_salades = ca_batavia + ca_mache"), + unit = c("EUR")) +``` + + +Exemple de code : +```{r} +library(rtauargus) + +data(metadata_pizza_lettuce) + +# Analyse complète, avec les étapes +detailed_analysis <- analyse_metadata(metadata_pizza_lettuce, + df_eq_indicator = liens_eq, + verbose = TRUE) + +# Output simplifié, uniquement le dataframe avec l'indicatrice de cluster +cluster_id_dataframe <- analyse_metadata(metadata_pizza_lettuce, verbose = FALSE) +``` + +Ici, un warning nous prévient que la colonne `hrc_indicator` sera ignorée puisque l'on a précisé +les liens entre indicateurs avec l'argument `df_eq_indicator`. Ainsi, il ne faut pas mettre +des liens dans `hrc_indicator` lorsque l'on utilise l'argument `df_eq_indicator`. + +On obtient le même dataframe nous présentant la façon de traiter les tableaux pour la pose du secret +que dans le premier exemple. + +```{r} +cluster_id_dataframe +``` + # Pour aller plus loin : visualiser les inclusions L'étape `create_edges` de l'analyse des métadonnées identifie les tableaux inclus dans d'autres tableaux. Par exemple, XXXXX est inclus dans XXXXX. Le code suivant permet de visualiser ces inclusions à l'aide de graphes afin de mieux comprendre la procédure d'analyse.