Skip to content

Commit 61f5bb2

Browse files
Merge pull request #285 from tidymodels/sparse-tfidf
add sparse arg to step_tfidf()
2 parents 3187b81 + f3fff7d commit 61f5bb2

File tree

5 files changed

+166
-6
lines changed

5 files changed

+166
-6
lines changed

NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
S3method(.recipes_estimate_sparsity,step_dummy_hash)
44
S3method(.recipes_estimate_sparsity,step_texthash)
55
S3method(.recipes_estimate_sparsity,step_tf)
6+
S3method(.recipes_estimate_sparsity,step_tfidf)
67
S3method(bake,step_clean_levels)
78
S3method(bake,step_clean_names)
89
S3method(bake,step_dummy_hash)

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
* `step_tf()` gained `sparse` argument. When set to `"yes"`, `step_tf()` will produce sparse vectors. (#284)
66

7+
* `step_tfidf()` gained `sparse` argument. When set to `"yes"`, `step_tfidf()` will produce sparse vectors. (#285)
8+
79
# textrecipes 1.0.7
810

911
## Improvements

R/tfidf.R

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#' @param sublinear_tf A logical, apply sublinear term-frequency scaling, i.e.,
2323
#' replace the term frequency with 1 + log(TF). Defaults to FALSE.
2424
#' @template args-prefix
25+
#' @template args-sparse
2526
#' @template args-keep_original_cols
2627
#' @template args-skip
2728
#' @template args-id
@@ -63,6 +64,8 @@
6364
#'
6465
#' @template details-prefix
6566
#'
67+
#' @template sparse-creation
68+
#'
6669
#' @template case-weights-not-supported
6770
#'
6871
#' @seealso [step_tokenize()] to turn characters into [`tokens`][tokenlist()]
@@ -101,6 +104,7 @@ step_tfidf <-
101104
norm = "l1",
102105
sublinear_tf = FALSE,
103106
prefix = "tfidf",
107+
sparse = "auto",
104108
keep_original_cols = FALSE,
105109
skip = FALSE,
106110
id = rand_id("tfidf")
@@ -118,6 +122,7 @@ step_tfidf <-
118122
sublinear_tf = sublinear_tf,
119123
columns = columns,
120124
prefix = prefix,
125+
sparse = sparse,
121126
keep_original_cols = keep_original_cols,
122127
skip = skip,
123128
id = id
@@ -137,6 +142,7 @@ step_tfidf_new <-
137142
norm,
138143
sublinear_tf,
139144
prefix,
145+
sparse,
140146
keep_original_cols,
141147
skip,
142148
id
@@ -153,6 +159,7 @@ step_tfidf_new <-
153159
norm = norm,
154160
sublinear_tf = sublinear_tf,
155161
prefix = prefix,
162+
sparse = sparse,
156163
keep_original_cols = keep_original_cols,
157164
skip = skip,
158165
id = id
@@ -191,6 +198,7 @@ prep.step_tfidf <- function(x, training, info = NULL, ...) {
191198
norm = x$norm,
192199
sublinear_tf = x$sublinear_tf,
193200
prefix = x$prefix,
201+
sparse = x$sparse,
194202
keep_original_cols = get_keep_original_cols(x),
195203
skip = x$skip,
196204
id = x$id
@@ -214,7 +222,8 @@ bake.step_tfidf <- function(object, new_data, ...) {
214222
paste0(object$prefix, "_", col_name),
215223
object$smooth_idf,
216224
object$norm,
217-
object$sublinear_tf
225+
object$sublinear_tf,
226+
object$sparse
218227
)
219228

220229
tfidf_text <- recipes::check_name(
@@ -255,7 +264,7 @@ tidy.step_tfidf <- function(x, ...) {
255264
res <- purrr::map2_dfr(
256265
x$columns,
257266
x$res,
258-
~tibble(
267+
~ tibble(
259268
terms = .x,
260269
token = names(.y),
261270
weight = unname(.y)
@@ -281,7 +290,8 @@ tfidf_function <- function(
281290
labels,
282291
smooth_idf,
283292
norm,
284-
sublinear_tf
293+
sublinear_tf,
294+
sparse
285295
) {
286296
# Backwards compatibility with 1592690d36581fc5f4952da3e9b02351b31f1a2e
287297
if (is.numeric(weights)) {
@@ -291,13 +301,20 @@ tfidf_function <- function(
291301
}
292302
counts <- tokenlist_to_dtm(data, dict)
293303

294-
tfidf <- dtm_to_tfidf(counts, weights, smooth_idf, norm, sublinear_tf)
304+
tfidf <- dtm_to_tfidf(counts, weights, smooth_idf, norm, sublinear_tf, sparse)
295305

296306
colnames(tfidf) <- paste0(labels, "_", dict)
297307
as_tibble(tfidf)
298308
}
299309

300-
dtm_to_tfidf <- function(dtm, idf_weights, smooth_idf, norm, sublinear_tf) {
310+
dtm_to_tfidf <- function(
311+
dtm,
312+
idf_weights,
313+
smooth_idf,
314+
norm,
315+
sublinear_tf,
316+
sparse
317+
) {
301318
dtm <- normalize(dtm, norm)
302319

303320
if (sublinear_tf) {
@@ -315,7 +332,14 @@ dtm_to_tfidf <- function(dtm, idf_weights, smooth_idf, norm, sublinear_tf) {
315332
} else {
316333
out <- dtm %*% Matrix::Diagonal(x = idf_weights)
317334
}
318-
as.matrix(out)
335+
336+
if (sparse_is_yes(sparse)) {
337+
colnames(out) <- seq_len(ncol(out))
338+
out <- sparsevctrs::coerce_to_sparse_tibble(out)
339+
} else {
340+
out <- as.matrix(out)
341+
}
342+
out
319343
}
320344

321345
normalize <- function(dtm, norm = c("l1", "l2", "none")) {
@@ -344,3 +368,22 @@ calc_idf <- function(dtm, smooth) {
344368
required_pkgs.step_tfidf <- function(x, ...) {
  # The step object and any extra arguments are ignored; the answer is a
  # fixed, length-one character vector of package names.
  "textrecipes"
}
371+
372+
373+
#' @export
.recipes_estimate_sparsity.step_tfidf <- function(x, data, ...) {
  # Cheap, data-driven guess at how many columns this step will create and how
  # sparse they will be. For every column of `data` the average character count
  # of (at most) the first ten elements is used as a stand-in for the number of
  # generated columns; the sparsity guess is then `1 - 1 / n_cols`.
  #
  # `x` (the step) is not inspected. Returns a list with one element per column
  # of `data`, each a named numeric vector `c(n_cols = , sparsity = )`.
  get_levels <- function(col) {
    # seq_len() rather than seq(1, n): for a zero-length column seq(1, 0) is
    # c(1, 0), which would silently index a non-existent first element.
    sampled <- col[seq_len(min(10, length(col)))]

    # Missing strings must not turn the whole estimate into NA.
    avg_chars <- mean(nchar(sampled), na.rm = TRUE)

    # Nothing usable to measure (empty or all-NA sample): fall back to a
    # single column, which yields a sparsity estimate of 0.
    if (!is.finite(avg_chars)) {
      return(1)
    }

    # Never report fewer than one column; a value of 0 would make
    # `1 - 1 / n_lvl` below evaluate to -Inf.
    max(1, floor(avg_chars))
  }

  n_levels <- lapply(data, get_levels)

  lapply(n_levels, function(n_lvl) {
    c(
      n_cols = n_lvl,
      sparsity = 1 - 1 / n_lvl
    )
  })
}

man/step_tfidf.Rd

Lines changed: 19 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-tfidf.R

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,101 @@ test_that("bad args", {
164164
)
165165
})
166166

167+
test_that("sparse = 'yes' works", {
  base_rec <- recipe(~., data = test_data)

  # Tokenize + tf-idf with the given extra step_tfidf() arguments, then
  # return the baked training data.
  bake_tfidf <- function(sparse, ...) {
    base_rec %>%
      step_tokenize(text) %>%
      step_tfidf(text, ..., sparse = sparse) %>%
      prep() %>%
      bake(NULL)
  }

  # Per-column flag: is the column a sparse double vector?
  sparse_cols <- function(df) {
    vapply(df, sparsevctrs::is_sparse_double, logical(1))
  }

  # The dense and sparse results must hold the same values, while only the
  # sparse result may contain sparse vectors.
  expect_dense_sparse_agree <- function(...) {
    dense <- bake_tfidf(sparse = "no", ...)
    sparse <- bake_tfidf(sparse = "yes", ...)

    expect_identical(dense, sparse)

    expect_false(any(sparse_cols(dense)))
    expect_true(all(sparse_cols(sparse)))
  }

  expect_dense_sparse_agree(norm = "l1")
  expect_dense_sparse_agree(norm = "l2")
  expect_dense_sparse_agree(norm = "none")
  expect_dense_sparse_agree(sublinear_tf = TRUE)
})
234+
235+
test_that("sparse argument is backwards compatible", {
  rec <- recipe(~., data = test_data) %>%
    step_tokenize(text) %>%
    step_tfidf(text, sparse = "no") %>%
    prep()

  exp <- bake(rec, test_data)

  # Simulate a recipe prepped before the `sparse` argument existed.
  # The tf-idf step is the SECOND step (the first one is step_tokenize());
  # stripping the field from step 1 would be a no-op and leave nothing tested.
  expect_s3_class(rec$steps[[2]], "step_tfidf")
  rec$steps[[2]]$sparse <- NULL

  expect_identical(
    bake(rec, test_data),
    exp
  )
})
251+
252+
test_that(".recipes_toggle_sparse_args works", {
  tfidf_rec <- recipe(~., data = test_data) %>%
    step_tokenize(text) %>%
    step_tfidf(text, sparse = "auto")

  # Sparsity actually observed after prepping and baking the recipe.
  baked <- bake(prep(tfidf_rec), NULL)
  observed <- sparsevctrs::sparsity(baked)

  # The up-front estimate must be at least as large as the observed sparsity.
  estimated <- .recipes_estimate_sparsity(tfidf_rec)

  expect_true(estimated >= observed)
})
261+
167262
# Infrastructure ---------------------------------------------------------------
168263

169264
test_that("bake method errors when needed non-standard role columns are missing", {

0 commit comments

Comments
 (0)