Merge pull request #284 from tidymodels/sparse-tf

EmilHvitfeldt · web-flow · commit 3187b811c413 · 2025-03-07T09:33:02.000-08:00
add sparse arg to step_tf()
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 S3method(.recipes_estimate_sparsity,step_dummy_hash)
 S3method(.recipes_estimate_sparsity,step_texthash)
+S3method(.recipes_estimate_sparsity,step_tf)
 S3method(bake,step_clean_levels)
 S3method(bake,step_clean_names)
 S3method(bake,step_dummy_hash)
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,8 @@
 
 * `step_texthash()` and `step_dummy_hash()` gained `sparse` argument. When set to `"yes"`, `step_dummy()` will produce sparse vectors. (#282)
 
+* `step_tf()` gained a `sparse` argument. When set to `"yes"`, `step_tf()` will produce sparse vectors. (#284)
+
 # textrecipes 1.0.7
 
 ## Improvements
diff --git a/R/tf.R b/R/tf.R
@@ -20,6 +20,7 @@
 #'   be stored here once this preprocessing step has be trained by
 #'   [recipes::prep.recipe()].
 #' @template args-prefix
+#' @template args-sparse
 #' @template args-keep_original_cols
 #' @template args-skip
 #' @template args-id
@@ -69,6 +70,12 @@
 #' cat(result)
 #' ```
 #'
+#' @template sparse-creation
+#'
+#' @description
+#' `sparse = "yes"` doesn't take effect when
+#' `weight_scheme = "double normalization"` as it doesn't produce sparse data.
+#'
 #' @template case-weights-not-supported
 #'
 #' @seealso [step_tokenize()] to turn characters into [`tokens`][tokenlist()]
@@ -106,6 +113,7 @@ step_tf <-
     vocabulary = NULL,
     res = NULL,
     prefix = "tf",
+    sparse = "auto",
     keep_original_cols = FALSE,
     skip = FALSE,
     id = rand_id("tf")
@@ -122,6 +130,7 @@ step_tf <-
         weight = weight,
         vocabulary = vocabulary,
         prefix = prefix,
+        sparse = sparse,
         keep_original_cols = keep_original_cols,
         skip = skip,
         id = id
@@ -148,6 +157,7 @@ step_tf_new <-
     vocabulary,
     res,
     prefix,
+    sparse,
     keep_original_cols,
     skip,
     id
@@ -163,6 +173,7 @@ step_tf_new <-
       vocabulary = vocabulary,
       res = res,
       prefix = prefix,
+      sparse = sparse,
       keep_original_cols = keep_original_cols,
       skip = skip,
       id = id
@@ -197,6 +208,7 @@ prep.step_tf <- function(x, training, info = NULL, ...) {
     vocabulary = x$vocabulary,
     res = token_list,
     prefix = x$prefix,
+    sparse = x$sparse,
     keep_original_cols = get_keep_original_cols(x),
     skip = x$skip,
     id = x$id
@@ -219,11 +231,16 @@ bake.step_tf <- function(object, new_data, ...) {
       object$res[[col_name]],
       paste0(object$prefix, "_", col_name),
       object$weight_scheme,
-      object$weight
+      object$weight,
+      object$sparse
     )
 
     if (object$weight_scheme %in% c("binary", "raw count")) {
-      tf_text <- purrr::map_dfc(tf_text, as.integer)
+      if (sparse_is_yes(object$sparse)) {
+        tf_text <- purrr::map_dfc(tf_text, sparsevctrs::as_sparse_integer)
+      } else {
+        tf_text <- purrr::map_dfc(tf_text, as.integer)
+      }
     }
 
     tf_text <- recipes::check_name(tf_text, new_data, object, names(tf_text))
@@ -264,12 +281,21 @@ tidy.step_tf <- function(x, ...) {
   res
 }
 
-tf_function <- function(data, names, labels, weights, weight) {
-  counts <- as.matrix(tokenlist_to_dtm(data, names))
+tf_function <- function(data, names, labels, weights, weight, sparse) {
+  counts <- tokenlist_to_dtm(data, names)
+  # Densify unless sparse output is requested and the scheme supports it.
+  if (weights == "double normalization" || !sparse_is_yes(sparse)) {
+    counts <- as.matrix(counts)
+    out <- tf_weight(counts, weights, weight)
+    colnames(out) <- paste0(labels, "_", names)
+    out <- as_tibble(out)
+  } else {
+    counts <- sparsevctrs::coerce_to_sparse_tibble(counts)
+    out <- tf_weight_sparse(counts, weights)
+    colnames(out) <- paste0(labels, "_", names)
+  }
 
-  tf <- tf_weight(counts, weights, weight)
-  colnames(tf) <- paste0(labels, "_", names)
-  as_tibble(tf)
+  out
 }
 
 tf_weight <- function(x, scheme, weight) {
@@ -294,6 +320,49 @@ tf_weight <- function(x, scheme, weight) {
   }
 }
 
+tf_weight_sparse <- function(x, scheme) {
+  # Sparse counterpart of tf_weight(). `x` is a tibble of sparse count columns
+  # (see tf_function()); the result has one sparse column per input column.
+  # Every scheme handled here keeps zero counts at zero. Anything else is
+  # reported as an error rather than silently returning NULL.
+
+  if (scheme == "raw count") {
+    return(x)
+  }
+  if (scheme == "binary") {
+    res <- lapply(x, function(col) {
+      positions <- sparsevctrs::sparse_positions(col)
+      # Integer ones, so sparse_integer() receives integer values directly.
+      ones <- rep(1L, length(positions))
+      sparsevctrs::sparse_integer(ones, positions, length(col))
+    })
+    return(tibble::new_tibble(res))
+  }
+  if (scheme == "log normalization") {
+    res <- lapply(x, function(col) {
+      sparsevctrs::sparse_double(
+        log(1 + sparsevctrs::sparse_values(col)),
+        sparsevctrs::sparse_positions(col),
+        length(col)
+      )
+    })
+    return(tibble::new_tibble(res))
+  }
+  if (scheme == "term frequency") {
+    # Row-wise division is done on a sparse matrix, then converted back.
+    x <- sparsevctrs::coerce_to_sparse_matrix(x)
+    rowsums_x <- Matrix::rowSums(x)
+    res <- x / rowsums_x
+    # Rows summing to zero were divided by zero; reset them to zero.
+    if (any(rowsums_x == 0)) {
+      res[rowsums_x == 0, ] <- 0
+    }
+    return(sparsevctrs::coerce_to_sparse_tibble(res))
+  }
+
+  stop("Unsupported sparse weight scheme: ", scheme, call. = FALSE)
+}
+
 #' @rdname required_pkgs.step
 #' @export
 required_pkgs.step_tf <- function(x, ...) {
@@ -314,3 +383,21 @@ tunable.step_tf <- function(x, ...) {
     component_id = x$id
   )
 }
+
+#' @export
+.recipes_estimate_sparsity.step_tf <- function(x, data, ...) {
+  # Heuristic: the mean character count of the first (up to) 10 entries of
+  # each column is used as that column's estimated column count (`n_cols`).
+  get_levels <- function(col) {
+    # seq_len() selects nothing for a zero-length column; seq(1, 0) counts down.
+    n_chars <- nchar(col[seq_len(min(10, length(col)))])
+    # Clamp to >= 1 so empty or missing text cannot yield NaN or -Inf below.
+    max(1, floor(mean(n_chars, na.rm = TRUE)), na.rm = TRUE)
+  }
+
+  n_levels <- lapply(data, get_levels)
+
+  lapply(n_levels, function(n_lvl) {
+    c(n_cols = n_lvl, sparsity = 1 - 1 / n_lvl)
+  })
+}
diff --git a/R/tokenlist.R b/R/tokenlist.R
@@ -195,19 +195,19 @@ tokenlist_filter_function <- function(x, fn) {
 
   keeps <- lapply(tokens, fn)
 
-  out <- purrr::map2(tokens, keeps, ~.x[.y])
+  out <- purrr::map2(tokens, keeps, ~ .x[.y])
 
   lemma <- maybe_get_lemma(x)
   if (!is.null(lemma)) {
-    lemma <- purrr::map2(lemma, keeps, ~.x[.y])
+    lemma <- purrr::map2(lemma, keeps, ~ .x[.y])
     names(lemma) <- NULL
   } else {
     lemma <- NULL
   }
 
   pos <- maybe_get_pos(x)
   if (!is.null(pos)) {
-    pos <- purrr::map2(pos, keeps, ~.x[.y])
+    pos <- purrr::map2(pos, keeps, ~ .x[.y])
     names(pos) <- NULL
   } else {
     pos <- NULL
diff --git a/man/step_tf.Rd b/man/step_tf.Rd
diff --git a/tests/testthat/test-tf.R b/tests/testthat/test-tf.R
@@ -134,7 +134,7 @@ test_that("check_name() is used", {
 test_that("tunable", {
   rec <-
     recipe(~., data = mtcars) %>%
-      step_tf(all_predictors())
+    step_tf(all_predictors())
   rec_param <- tunable.step_tf(rec$steps[[1]])
   expect_equal(rec_param$name, c("weight_scheme", "weight"))
   expect_true(all(rec_param$source == "recipe"))
@@ -173,6 +173,101 @@ test_that("bad args", {
   )
 })
 
+test_that("sparse = 'yes' works", {
+  rec <- recipe(~., data = test_data)
+  # "raw count": equal values, sparse integer storage only when requested.
+  dense <- rec %>%
+    step_tokenize(text) %>%
+    step_tf(text, weight_scheme = "raw count", sparse = "no") %>%
+    prep() %>%
+    bake(NULL)
+  sparse <- rec %>%
+    step_tokenize(text) %>%
+    step_tf(text, weight_scheme = "raw count", sparse = "yes") %>%
+    prep() %>%
+    bake(NULL)
+
+  expect_identical(dense, sparse)
+
+  expect_false(any(vapply(dense, sparsevctrs::is_sparse_integer, logical(1))))
+  expect_true(all(vapply(sparse, sparsevctrs::is_sparse_integer, logical(1))))
+  # "binary": equal values, sparse integer storage only when requested.
+  dense <- rec %>%
+    step_tokenize(text) %>%
+    step_tf(text, weight_scheme = "binary", sparse = "no") %>%
+    prep() %>%
+    bake(NULL)
+  sparse <- rec %>%
+    step_tokenize(text) %>%
+    step_tf(text, weight_scheme = "binary", sparse = "yes") %>%
+    prep() %>%
+    bake(NULL)
+
+  expect_identical(dense, sparse)
+
+  expect_false(any(vapply(dense, sparsevctrs::is_sparse_integer, logical(1))))
+  expect_true(all(vapply(sparse, sparsevctrs::is_sparse_integer, logical(1))))
+  # "term frequency": equal values, sparse double storage only when requested.
+  dense <- rec %>%
+    step_tokenize(text) %>%
+    step_tf(text, weight_scheme = "term frequency", sparse = "no") %>%
+    prep() %>%
+    bake(NULL)
+  sparse <- rec %>%
+    step_tokenize(text) %>%
+    step_tf(text, weight_scheme = "term frequency", sparse = "yes") %>%
+    prep() %>%
+    bake(NULL)
+
+  expect_identical(dense, sparse)
+
+  expect_false(any(vapply(dense, sparsevctrs::is_sparse_double, logical(1))))
+  expect_true(all(vapply(sparse, sparsevctrs::is_sparse_double, logical(1))))
+  # "log normalization": equal values, sparse double storage only when requested.
+  dense <- rec %>%
+    step_tokenize(text) %>%
+    step_tf(text, weight_scheme = "log normalization", sparse = "no") %>%
+    prep() %>%
+    bake(NULL)
+  sparse <- rec %>%
+    step_tokenize(text) %>%
+    step_tf(text, weight_scheme = "log normalization", sparse = "yes") %>%
+    prep() %>%
+    bake(NULL)
+
+  expect_identical(dense, sparse)
+
+  expect_false(any(vapply(dense, sparsevctrs::is_sparse_double, logical(1))))
+  expect_true(all(vapply(sparse, sparsevctrs::is_sparse_double, logical(1))))
+})
+
+test_that("sparse argument is backwards compatible", {
+  rec <- recipe(~., data = test_data) %>%
+    step_tokenize(text) %>%
+    step_tf(text, sparse = "no") %>%
+    prep()
+  # Reference output, baked while step_tf() still carries `sparse`.
+  exp <- bake(rec, test_data)
+
+  # Simulate old recipe: step_tf() is step 2; step 1 is step_tokenize().
+  rec$steps[[2]]$sparse <- NULL
+
+  expect_identical(
+    bake(rec, test_data),
+    exp
+  )
+})
+
+test_that(".recipes_toggle_sparse_args works", {
+  rec <- recipe(~., data = test_data) %>%
+    step_tokenize(text) %>%
+    step_tf(text, sparse = "auto")
+  # Sparsity actually observed after baking; the estimate must not be lower.
+  exp <- rec %>% prep() %>% bake(NULL) %>% sparsevctrs::sparsity()
+
+  expect_true(.recipes_estimate_sparsity(rec) >= exp)
+})
+
 # Infrastructure ---------------------------------------------------------------
 
 test_that("bake method errors when needed non-standard role columns are missing", {