From 3686b4ac22dfcaec7a1ab8ed236d18b8f8ee4631 Mon Sep 17 00:00:00 2001 From: jennybc Date: Wed, 18 May 2016 00:43:57 -0700 Subject: [PATCH 01/32] extract worksheet reading code from functions --- .gitignore | 1 + DESCRIPTION | 8 +- R/read.R | 28 ++- man/xlsx_read_file.Rd | 22 ++ rexcel.Rproj | 1 + vignettes/read-a-sheet.R | 81 +++++++ vignettes/read-a-sheet.Rmd | 118 +++++++++++ vignettes/read-a-sheet.html | 328 +++++++++++++++++++++++++++++ vignettes/read-a-sheet.md | 407 ++++++++++++++++++++++++++++++++++++ 9 files changed, 984 insertions(+), 10 deletions(-) create mode 100644 man/xlsx_read_file.Rd create mode 100644 vignettes/read-a-sheet.R create mode 100644 vignettes/read-a-sheet.Rmd create mode 100644 vignettes/read-a-sheet.html create mode 100644 vignettes/read-a-sheet.md diff --git a/.gitignore b/.gitignore index 7e6f982..26179a6 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ example.xlsx tests/testthat/readxl tests/testthat/*.xlsx +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index 3833b2e..939e59d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,5 +15,9 @@ Imports: tibble, xml2 (>= 0.1.2.9000) Suggests: - testthat -RoxygenNote: 5.0.1 + testthat, + knitr, + rmarkdown, + rprojroot +RoxygenNote: 5.0.1.9000 +VignetteBuilder: knitr diff --git a/R/read.R b/R/read.R index 6f47e90..1432277 100644 --- a/R/read.R +++ b/R/read.R @@ -69,11 +69,11 @@ rexcel_read_workbook <- function(path, sheets=NULL, progress=TRUE) { } else { fmt <- xlsx_format_codes() } - num_fmt <- tibble::data_frame(num_fmt=fmt) - style <- linen::linen_style(lookup, font=style_xlsx$fonts, - fill=style_xlsx$fills, - border=style_xlsx$borders, - num_fmt=num_fmt) + num_fmt <- tibble::data_frame(num_fmt = fmt) + style <- linen::linen_style(lookup, font = style_xlsx$fonts, + fill = style_xlsx$fills, + border = style_xlsx$borders, + num_fmt = num_fmt) workbook <- linen::workbook(sheets, style, dat$defined_names) for (s in sheets) { @@ -132,13 +132,25 @@ xlsx_read_sheet <- function(path, 
sheet, workbook_dat) { xml } + +#' Read XML for a specific file +#' +#' Read in the XML for a specific file within the xlsx, e.g. the file +#' corresponding to a specific worksheet. +#' +#' @param path path to xlsx +#' @param file xml file corresponding to a specific worksheet +#' +#' @return an XML document +#' +#' @keywords internal xlsx_read_file <- function(path, file) { tmp <- tempfile() dir.create(tmp) ## Oh boy more terrible default behaviour. - filename <- tryCatch(utils::unzip(path, file, exdir=tmp), - warning=function(e) stop(e)) - on.exit(unlink(tmp, recursive=TRUE)) + filename <- tryCatch(utils::unzip(path, file, exdir = tmp), + warning = function(e) stop(e)) + on.exit(unlink(tmp, recursive = TRUE)) xml2::read_xml(filename) } diff --git a/man/xlsx_read_file.Rd b/man/xlsx_read_file.Rd new file mode 100644 index 0000000..c03e3ee --- /dev/null +++ b/man/xlsx_read_file.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/read.R +\name{xlsx_read_file} +\alias{xlsx_read_file} +\title{Read XML for a specific file} +\usage{ +xlsx_read_file(path, file) +} +\arguments{ +\item{path}{path to xlsx} + +\item{file}{xml file corresponding to a specific worksheet} +} +\value{ +an XML document +} +\description{ +Read in the XML for a specific file within the xlsx, e.g. the file +corresponding to a specific worksheet. 
+} +\keyword{internal} + diff --git a/rexcel.Rproj b/rexcel.Rproj index 497f8bf..f0d6187 100644 --- a/rexcel.Rproj +++ b/rexcel.Rproj @@ -18,3 +18,4 @@ StripTrailingWhitespace: Yes BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,collate,namespace,vignette diff --git a/vignettes/read-a-sheet.R b/vignettes/read-a-sheet.R new file mode 100644 index 0000000..07cdee2 --- /dev/null +++ b/vignettes/read-a-sheet.R @@ -0,0 +1,81 @@ +## ------------------------------------------------------------------------ +library(rprojroot) +devtools::load_all(find_package_root_file()) + +## ------------------------------------------------------------------------ +(ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx", + package = "rexcel")) + +## enter rexcel_read_workbook() +path <- ff_path +sheets <- 1L + +## this gets info about the files inside the zip archive +dat <- xlsx_read_workbook(path) +dat$rels ## ?files in the zip archive? +dat$sheets ## ?files corresponding to worksheets? 
+(sheets <- xlsx_sheet_names(dat)[sheets]) + +(strings <- xlsx_read_shared_strings(path)) +(date_offset <- xlsx_date_offset(path)) + +style_xlsx <- xlsx_read_style(path) +str(style_xlsx, max.level = 1) +(lookup <- tibble::data_frame( + font = style_xlsx$cell_xfs$font_id, + fill = style_xlsx$cell_xfs$fill_id, + border = style_xlsx$cell_xfs$border_id, + num_fmt = style_xlsx$cell_xfs$num_fmt_id)) + +## numeric formatting +n <- max(style_xlsx$num_fmts$num_format_id) +fmt <- rep(NA_character_, n) +fmt[seq_along(xlsx_format_codes())] <- xlsx_format_codes() +fmt[style_xlsx$num_fmts$num_format_id] <- style_xlsx$num_fmts$format_code +num_fmt <- tibble::data_frame(num_fmt = fmt) +style <- linen::linen_style(lookup, font = style_xlsx$fonts, + fill = style_xlsx$fills, + border = style_xlsx$borders, + num_fmt = num_fmt) + +(workbook <- linen::workbook(sheets, style, dat$defined_names)) + +## ------------------------------------------------------------------------ +## enter rexcel_read_worksheet() +## rexcel_read_worksheet(path, s, workbook, dat, strings, style, date_offset) +(sheet <- sheets[1]) +(sheet_idx <- match(sheet, workbook$names)) +(sheet_name <- sheet) + +(target <- xlsx_internal_sheet_name(sheet, dat)) +(rels <- xlsx_read_rels(path, target)) + +## ------------------------------------------------------------------------ +## enter xlsx_read_sheet() +(file <- xlsx_internal_sheet_name(sheet_idx, dat)) +xml <- xlsx_read_file(path, file) ## at last! the xml! w00t! +(ns <- xml2::xml_ns(xml)) ## much less w00t now :( + +(merged <- xlsx_read_merged(xml, ns)) +(view <- xlsx_ct_worksheet_views(xml, ns)) +(cols <- xlsx_ct_cols(xml, ns)) # NOTE: not used yet + +## this is where it's at! 
+(cell_dat <- xlsx_parse_cells(xml, ns, strings, style, date_offset)) + +## not even sure what this is +(rows <- cell_dat$rows) + +## this is where cells come from +(cells <- linen::cells(cell_dat$cells$ref, cell_dat$cells$style, + cell_dat$cells$type, cell_dat$cells$value, + cell_dat$cells$formula)) + +## in real life and in other sheets, it's possible comments will be populated +## but not in this sheet +comments <- NULL + +## ------------------------------------------------------------------------ +(ws <- linen::worksheet(sheet_name, cols, rows, cells, merged, view, comments, + workbook)) + diff --git a/vignettes/read-a-sheet.Rmd b/vignettes/read-a-sheet.Rmd new file mode 100644 index 0000000..c0631fe --- /dev/null +++ b/vignettes/read-a-sheet.Rmd @@ -0,0 +1,118 @@ +--- +title: "Read an Excel Sheet" +author: "Jennifer Bryan" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette: + toc: true + toc_depth: 4 + keep_md: true +vignette: > + %\VignetteIndexEntry{Vignette Title} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +*I'm exploring the existing sheet reading functionality, using the vignette format. This is not an actual vignette!* + +```{r} +library(rprojroot) +devtools::load_all(find_package_root_file()) +``` + +Peeling the many-layered onion that is `rexcel_read()` until I get at the XML for a worksheet. Wish me luck. + +We'll work with an example sheet created for `googlesheets` that has alot of formulas and formatting going. + +Objective 1: create a `linen::workbook` object. Dropping into code inside `rexcel_read_workbook()`. + +```{r} +(ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx", + package = "rexcel")) + +## enter rexcel_read_workbook() +path <- ff_path +sheets <- 1L + +## this gets info about the files inside the zip archive +dat <- xlsx_read_workbook(path) +dat$rels ## ?files in the zip archive? +dat$sheets ## ?files corresponding to worksheets? 
+(sheets <- xlsx_sheet_names(dat)[sheets]) + +(strings <- xlsx_read_shared_strings(path)) +(date_offset <- xlsx_date_offset(path)) + +style_xlsx <- xlsx_read_style(path) +str(style_xlsx, max.level = 1) +(lookup <- tibble::data_frame( + font = style_xlsx$cell_xfs$font_id, + fill = style_xlsx$cell_xfs$fill_id, + border = style_xlsx$cell_xfs$border_id, + num_fmt = style_xlsx$cell_xfs$num_fmt_id)) + +## numeric formatting +n <- max(style_xlsx$num_fmts$num_format_id) +fmt <- rep(NA_character_, n) +fmt[seq_along(xlsx_format_codes())] <- xlsx_format_codes() +fmt[style_xlsx$num_fmts$num_format_id] <- style_xlsx$num_fmts$format_code +num_fmt <- tibble::data_frame(num_fmt = fmt) +style <- linen::linen_style(lookup, font = style_xlsx$fonts, + fill = style_xlsx$fills, + border = style_xlsx$borders, + num_fmt = num_fmt) + +(workbook <- linen::workbook(sheets, style, dat$defined_names)) +``` + +Objective 2: Visit and extract information for all requested worksheets. + +In this case, I'm just reading the first and only sheet. This loop appears in `rexcel_read_workbook()` and calls `rexcel_read_worksheet()` for each requested worksheet. This is the loop and function we eventually exit from and this `workbook` object is what's returned. + +```{r} +## enter rexcel_read_worksheet() +## rexcel_read_worksheet(path, s, workbook, dat, strings, style, date_offset) +(sheet <- sheets[1]) +(sheet_idx <- match(sheet, workbook$names)) +(sheet_name <- sheet) + +(target <- xlsx_internal_sheet_name(sheet, dat)) +(rels <- xlsx_read_rels(path, target)) +``` + +Now we drop down into a lower-level non-exported function, `xlsx_read_sheet()`. + +```{r} +## enter xlsx_read_sheet() +(file <- xlsx_internal_sheet_name(sheet_idx, dat)) +xml <- xlsx_read_file(path, file) ## at last! the xml! w00t! 
+(ns <- xml2::xml_ns(xml)) ## much less w00t now :( + +(merged <- xlsx_read_merged(xml, ns)) +(view <- xlsx_ct_worksheet_views(xml, ns)) +(cols <- xlsx_ct_cols(xml, ns)) # NOTE: not used yet + +## this is where it's at! +(cell_dat <- xlsx_parse_cells(xml, ns, strings, style, date_offset)) + +## not even sure what this is +(rows <- cell_dat$rows) + +## this is where cells come from +(cells <- linen::cells(cell_dat$cells$ref, cell_dat$cells$style, + cell_dat$cells$type, cell_dat$cells$value, + cell_dat$cells$formula)) + +## in real life and in other sheets, it's possible comments will be populated +## but not in this sheet +comments <- NULL +``` + +Now we gather everything we've learned about this worksheet into a `linen::worksheet` object. + +```{r} +(ws <- linen::worksheet(sheet_name, cols, rows, cells, merged, view, comments, + workbook)) +``` + +If we had other sheets to read, that would be done now. Ultimately this `workbook` is returned. diff --git a/vignettes/read-a-sheet.html b/vignettes/read-a-sheet.html new file mode 100644 index 0000000..90301ef --- /dev/null +++ b/vignettes/read-a-sheet.html @@ -0,0 +1,328 @@ + + + + + + + + + + + + + + + + +Read an Excel Sheet + + + + + + + + + + + + + + + + + +

Read an Excel Sheet

+

Jennifer Bryan

+

2016-05-18

+ + + +

I’m exploring the existing sheet reading functionality, using the vignette format. This is not an actual vignette!

+
library(rprojroot)
+
## Warning: package 'rprojroot' was built under R version 3.2.4
+
devtools::load_all(find_package_root_file())
+
## Loading rexcel
+

Peeling the many-layered onion that is rexcel_read() until I get at the XML for a worksheet. Wish me luck.

+

We’ll work with an example sheet created for googlesheets that has a lot of formulas and formatting going on.

+

Objective 1: create a linen::workbook object. Dropping into code inside rexcel_read_workbook().

+
(ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx",
+                        package = "rexcel"))
+
## [1] "/Users/jenny/rrr/rexcel/inst/sheets/gs-test-formula-formatting.xlsx"
+
## enter rexcel_read_workbook()
+path <- ff_path
+sheets <- 1L
+
+## this gets info about the files inside the zip archive
+dat <- xlsx_read_workbook(path)
+dat$rels   ## ?files in the zip archive?
+
## Source: local data frame [3 x 4]
+## 
+##      id          type                target               target_abs
+##   <chr>         <chr>                 <chr>                    <chr>
+## 1  rId1        styles            styles.xml            xl/styles.xml
+## 2  rId2 sharedStrings     sharedStrings.xml     xl/sharedStrings.xml
+## 3  rId3     worksheet worksheets/sheet1.xml xl/worksheets/sheet1.xml
+
dat$sheets ## ?files corresponding to worksheets?
+
##     name sheet_id   state  ref      type                target
+## 1 Sheet1        1 visible rId3 worksheet worksheets/sheet1.xml
+##                 target_abs
+## 1 xl/worksheets/sheet1.xml
+
(sheets <- xlsx_sheet_names(dat)[sheets])
+
## [1] "Sheet1"
+
(strings <- xlsx_read_shared_strings(path))
+
##  [1] "integer"           "number_formatted"  "number_rounded"   
+##  [4] "character"         "formula"           "formula_formatted"
+##  [7] "one"               "three"             "four"             
+## [10] "five"
+
(date_offset <- xlsx_date_offset(path))
+
## [1] "1899-12-30"
+
style_xlsx <- xlsx_read_style(path)
+str(style_xlsx, max.level = 1)
+
## List of 7
+##  $ fonts         :Classes 'tbl_df', 'tbl' and 'data.frame':  4 obs. of  13 variables:
+##  $ fills         :Classes 'tbl_df', 'tbl' and 'data.frame':  2 obs. of  4 variables:
+##  $ borders       :Classes 'tbl_df', 'tbl' and 'data.frame':  1 obs. of  19 variables:
+##  $ cell_style_xfs:Classes 'tbl_df', 'tbl' and 'data.frame':  1 obs. of  16 variables:
+##  $ cell_xfs      :Classes 'tbl_df', 'tbl' and 'data.frame':  16 obs. of  16 variables:
+##  $ cell_styles   :Classes 'tbl_df', 'tbl' and 'data.frame':  1 obs. of  6 variables:
+##  $ num_fmts      :Classes 'tbl_df', 'tbl' and 'data.frame':  1 obs. of  2 variables:
+
(lookup <- tibble::data_frame(
+  font    = style_xlsx$cell_xfs$font_id,
+  fill    = style_xlsx$cell_xfs$fill_id,
+  border  = style_xlsx$cell_xfs$border_id,
+  num_fmt = style_xlsx$cell_xfs$num_fmt_id))
+
## Source: local data frame [16 x 4]
+## 
+##     font  fill border num_fmt
+##    <int> <int>  <int>   <int>
+## 1      1    NA     NA      NA
+## 2      2    NA     NA      NA
+## 3      2    NA     NA       4
+## 4      2    NA     NA       5
+## 5      3    NA     NA      NA
+## 6      2    NA     NA      12
+## 7      2    NA     NA      11
+## 8      2    NA     NA       5
+## 9      2    NA     NA      11
+## 10     2    NA     NA      12
+## 11     4    NA     NA      NA
+## 12     2    NA     NA       3
+## 13     2    NA     NA      13
+## 14     2    NA     NA     165
+## 15     2    NA     NA       4
+## 16     2    NA     NA       4
+
## numeric formatting
+n <- max(style_xlsx$num_fmts$num_format_id)
+fmt <- rep(NA_character_, n)
+fmt[seq_along(xlsx_format_codes())] <- xlsx_format_codes()
+fmt[style_xlsx$num_fmts$num_format_id] <- style_xlsx$num_fmts$format_code
+num_fmt <- tibble::data_frame(num_fmt = fmt)
+style <- linen::linen_style(lookup, font = style_xlsx$fonts,
+                            fill = style_xlsx$fills,
+                            border = style_xlsx$borders,
+                            num_fmt = num_fmt)
+
+(workbook <- linen::workbook(sheets, style, dat$defined_names))
+
## <workbook>
+##   Public:
+##     add_sheet: function (sheet) 
+##     clone: function (deep = FALSE) 
+##     defined_names: tbl_df, tbl, data.frame
+##     initialize: function (names, style, defined_names) 
+##     names: Sheet1
+##     sheets: list
+##     style: linen_style
+

Objective 2: Visit and extract information for all requested worksheets.

+

In this case, I’m just reading the first and only sheet. This loop appears in rexcel_read_workbook() and calls rexcel_read_worksheet() for each requested worksheet. This is the loop and function we eventually exit from and this workbook object is what’s returned.

+
## enter rexcel_read_worksheet()
+## rexcel_read_worksheet(path, s, workbook, dat, strings, style, date_offset)
+(sheet <- sheets[1])
+
## [1] "Sheet1"
+
(sheet_idx <- match(sheet, workbook$names))
+
## [1] 1
+
(sheet_name <- sheet)
+
## [1] "Sheet1"
+
(target <- xlsx_internal_sheet_name(sheet, dat))
+
## [1] "xl/worksheets/sheet1.xml"
+
(rels <- xlsx_read_rels(path, target))
+
## Source: local data frame [2 x 4]
+## 
+##      id      type                            target
+##   <chr>     <chr>                             <chr>
+## 1  rId1 hyperlink            http://www.google.com/
+## 2  rId2   drawing ../drawings/worksheetdrawing1.xml
+## Variables not shown: target_abs <chr>.
+

Now we drop down into a lower-level non-exported function, xlsx_read_sheet().

+
## enter xlsx_read_sheet()
+(file <- xlsx_internal_sheet_name(sheet_idx, dat))
+
## [1] "xl/worksheets/sheet1.xml"
+
xml <- xlsx_read_file(path, file) ## at last! the xml! w00t!
+(ns <- xml2::xml_ns(xml)) ## much less w00t now :(
+
## d1    <-> http://schemas.openxmlformats.org/spreadsheetml/2006/main
+## r     <-> http://schemas.openxmlformats.org/officeDocument/2006/relationships
+## mx    <-> http://schemas.microsoft.com/office/mac/excel/2008/main
+## mc    <-> http://schemas.openxmlformats.org/markup-compatibility/2006
+## mv    <-> urn:schemas-microsoft-com:mac:vml
+## x14   <-> http://schemas.microsoft.com/office/spreadsheetml/2009/9/main
+## x14ac <-> http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac
+## xm    <-> http://schemas.microsoft.com/office/excel/2006/main
+
(merged <- xlsx_read_merged(xml, ns))
+
## list()
+
(view <- xlsx_ct_worksheet_views(xml, ns))
+
## NULL
+
(cols <- xlsx_ct_cols(xml, ns)) # NOTE: not used yet
+
## Source: local data frame [6 x 9]
+## 
+##   best_fit collapsed custom_width hidden   min   max outline_level style
+##      <lgl>     <lgl>        <lgl>  <lgl> <int> <int>         <int> <int>
+## 1    FALSE     FALSE         TRUE  FALSE     1     1            NA    NA
+## 2    FALSE     FALSE         TRUE  FALSE     2     2            NA    NA
+## 3    FALSE     FALSE         TRUE  FALSE     3     3            NA    NA
+## 4    FALSE     FALSE         TRUE  FALSE     4     4            NA    NA
+## 5    FALSE     FALSE         TRUE  FALSE     5     5            NA    NA
+## 6    FALSE     FALSE         TRUE  FALSE     6     6            NA    NA
+## Variables not shown: width <dbl>.
+
## this is where it's at!
+(cell_dat <- xlsx_parse_cells(xml, ns, strings, style, date_offset))
+
## $cells
+## Source: local data frame [2,022 x 5]
+## 
+##      ref style   type formula     value
+##    <chr> <int>  <chr>   <chr>    <list>
+## 1     A1     2   text    <NA> <chr [1]>
+## 2     B1     3   text    <NA> <chr [1]>
+## 3     C1     4   text    <NA> <chr [1]>
+## 4     D1     2   text    <NA> <chr [1]>
+## 5     E1     2   text    <NA> <chr [1]>
+## 6     F1     2   text    <NA> <chr [1]>
+## 7     A2     2 number    <NA> <dbl [1]>
+## 8     B2     3 number    <NA> <dbl [1]>
+## 9     C2     4 number    <NA> <dbl [1]>
+## 10    D2     2   text    <NA> <chr [1]>
+## ..   ...   ...    ...     ...       ...
+## 
+## $rows
+## Source: local data frame [1,000 x 11]
+## 
+##        r spans     s custom_format    ht hidden custom_height
+##    <int> <chr> <int>         <lgl> <dbl>  <lgl>         <lgl>
+## 1      1  <NA>    NA         FALSE    NA  FALSE            NA
+## 2      2  <NA>    NA         FALSE    NA  FALSE            NA
+## 3      3  <NA>    NA         FALSE    NA  FALSE            NA
+## 4      4  <NA>    NA         FALSE    NA  FALSE            NA
+## 5      5  <NA>    NA         FALSE    NA  FALSE            NA
+## 6      6  <NA>    NA         FALSE    NA  FALSE            NA
+## 7      7  <NA>    NA         FALSE    NA  FALSE            NA
+## 8      8  <NA>    NA         FALSE    NA  FALSE            NA
+## 9      9  <NA>    NA         FALSE    NA  FALSE            NA
+## 10    10  <NA>    NA         FALSE    NA  FALSE            NA
+## ..   ...   ...   ...           ...   ...    ...           ...
+## Variables not shown: outline_level <int>, collapsed <lgl>, thick_top
+##   <lgl>, thick_bot <lgl>.
+
## not even sure what this is
+(rows <- cell_dat$rows)
+
## Source: local data frame [1,000 x 11]
+## 
+##        r spans     s custom_format    ht hidden custom_height
+##    <int> <chr> <int>         <lgl> <dbl>  <lgl>         <lgl>
+## 1      1  <NA>    NA         FALSE    NA  FALSE            NA
+## 2      2  <NA>    NA         FALSE    NA  FALSE            NA
+## 3      3  <NA>    NA         FALSE    NA  FALSE            NA
+## 4      4  <NA>    NA         FALSE    NA  FALSE            NA
+## 5      5  <NA>    NA         FALSE    NA  FALSE            NA
+## 6      6  <NA>    NA         FALSE    NA  FALSE            NA
+## 7      7  <NA>    NA         FALSE    NA  FALSE            NA
+## 8      8  <NA>    NA         FALSE    NA  FALSE            NA
+## 9      9  <NA>    NA         FALSE    NA  FALSE            NA
+## 10    10  <NA>    NA         FALSE    NA  FALSE            NA
+## ..   ...   ...   ...           ...   ...    ...           ...
+## Variables not shown: outline_level <int>, collapsed <lgl>, thick_top
+##   <lgl>, thick_bot <lgl>.
+
## this is where cells come from  
+(cells <- linen::cells(cell_dat$cells$ref, cell_dat$cells$style,
+                       cell_dat$cells$type, cell_dat$cells$value,
+                       cell_dat$cells$formula))
+
## Source: local data frame [2,022 x 12]
+## 
+##      ref style     value formula   type is_formula is_value is_blank
+##    <chr> <int>    <list>   <chr>  <chr>      <lgl>    <lgl>    <lgl>
+## 1     A1     2 <chr [1]>    <NA>   text      FALSE     TRUE    FALSE
+## 2     B1     3 <chr [1]>    <NA>   text      FALSE     TRUE    FALSE
+## 3     C1     4 <chr [1]>    <NA>   text      FALSE     TRUE    FALSE
+## 4     D1     2 <chr [1]>    <NA>   text      FALSE     TRUE    FALSE
+## 5     E1     2 <chr [1]>    <NA>   text      FALSE     TRUE    FALSE
+## 6     F1     2 <chr [1]>    <NA>   text      FALSE     TRUE    FALSE
+## 7     A2     2 <dbl [1]>    <NA> number      FALSE     TRUE    FALSE
+## 8     B2     3 <dbl [1]>    <NA> number      FALSE     TRUE    FALSE
+## 9     C2     4 <dbl [1]>    <NA> number      FALSE     TRUE    FALSE
+## 10    D2     2 <chr [1]>    <NA>   text      FALSE     TRUE    FALSE
+## ..   ...   ...       ...     ...    ...        ...      ...      ...
+## Variables not shown: is_bool <lgl>, is_number <lgl>, is_text <lgl>,
+##   is_date <lgl>.
+
## in real life and in other sheets, it's possible comments will be populated
+## but not in this sheet
+comments <- NULL
+

Now we gather everything we’ve learned about this worksheet into a linen::worksheet object.

+
(ws <- linen::worksheet(sheet_name, cols, rows, cells, merged, view, comments,
+                        workbook))
+
## <worksheet: 1000 x 6>
+##  : ABCDEF
+## 1: aaaaaa
+## 2: 000a$$
+## 3: 000 $$
+## 4: 000a$$
+## 5:  00a$$
+## 6: 000a $
+

If we had other sheets to read, that would be done now. Ultimately this workbook is returned.

+ + + + + + + + diff --git a/vignettes/read-a-sheet.md b/vignettes/read-a-sheet.md new file mode 100644 index 0000000..dc9e316 --- /dev/null +++ b/vignettes/read-a-sheet.md @@ -0,0 +1,407 @@ +# Read an Excel Sheet +Jennifer Bryan +`r Sys.Date()` + +*I'm exploring the existing sheet reading functionality, using the vignette format. This is not an actual vignette!* + + +```r +library(rprojroot) +``` + +``` +## Warning: package 'rprojroot' was built under R version 3.2.4 +``` + +```r +devtools::load_all(find_package_root_file()) +``` + +``` +## Loading rexcel +``` + +Peeling the many-layered onion that is `rexcel_read()` until I get at the XML for a worksheet. Wish me luck. + +We'll work with an example sheet created for `googlesheets` that has alot of formulas and formatting going. + +Objective 1: create a `linen::workbook` object. Dropping into code inside `rexcel_read_workbook()`. + + +```r +(ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx", + package = "rexcel")) +``` + +``` +## [1] "/Users/jenny/rrr/rexcel/inst/sheets/gs-test-formula-formatting.xlsx" +``` + +```r +## enter rexcel_read_workbook() +path <- ff_path +sheets <- 1L + +## this gets info about the files inside the zip archive +dat <- xlsx_read_workbook(path) +dat$rels ## ?files in the zip archive? +``` + +``` +## Source: local data frame [3 x 4] +## +## id type target target_abs +## +## 1 rId1 styles styles.xml xl/styles.xml +## 2 rId2 sharedStrings sharedStrings.xml xl/sharedStrings.xml +## 3 rId3 worksheet worksheets/sheet1.xml xl/worksheets/sheet1.xml +``` + +```r +dat$sheets ## ?files corresponding to worksheets? 
+``` + +``` +## name sheet_id state ref type target +## 1 Sheet1 1 visible rId3 worksheet worksheets/sheet1.xml +## target_abs +## 1 xl/worksheets/sheet1.xml +``` + +```r +(sheets <- xlsx_sheet_names(dat)[sheets]) +``` + +``` +## [1] "Sheet1" +``` + +```r +(strings <- xlsx_read_shared_strings(path)) +``` + +``` +## [1] "integer" "number_formatted" "number_rounded" +## [4] "character" "formula" "formula_formatted" +## [7] "one" "three" "four" +## [10] "five" +``` + +```r +(date_offset <- xlsx_date_offset(path)) +``` + +``` +## [1] "1899-12-30" +``` + +```r +style_xlsx <- xlsx_read_style(path) +str(style_xlsx, max.level = 1) +``` + +``` +## List of 7 +## $ fonts :Classes 'tbl_df', 'tbl' and 'data.frame': 4 obs. of 13 variables: +## $ fills :Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 4 variables: +## $ borders :Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 19 variables: +## $ cell_style_xfs:Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 16 variables: +## $ cell_xfs :Classes 'tbl_df', 'tbl' and 'data.frame': 16 obs. of 16 variables: +## $ cell_styles :Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 6 variables: +## $ num_fmts :Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. 
of 2 variables: +``` + +```r +(lookup <- tibble::data_frame( + font = style_xlsx$cell_xfs$font_id, + fill = style_xlsx$cell_xfs$fill_id, + border = style_xlsx$cell_xfs$border_id, + num_fmt = style_xlsx$cell_xfs$num_fmt_id)) +``` + +``` +## Source: local data frame [16 x 4] +## +## font fill border num_fmt +## +## 1 1 NA NA NA +## 2 2 NA NA NA +## 3 2 NA NA 4 +## 4 2 NA NA 5 +## 5 3 NA NA NA +## 6 2 NA NA 12 +## 7 2 NA NA 11 +## 8 2 NA NA 5 +## 9 2 NA NA 11 +## 10 2 NA NA 12 +## 11 4 NA NA NA +## 12 2 NA NA 3 +## 13 2 NA NA 13 +## 14 2 NA NA 165 +## 15 2 NA NA 4 +## 16 2 NA NA 4 +``` + +```r +## numeric formatting +n <- max(style_xlsx$num_fmts$num_format_id) +fmt <- rep(NA_character_, n) +fmt[seq_along(xlsx_format_codes())] <- xlsx_format_codes() +fmt[style_xlsx$num_fmts$num_format_id] <- style_xlsx$num_fmts$format_code +num_fmt <- tibble::data_frame(num_fmt = fmt) +style <- linen::linen_style(lookup, font = style_xlsx$fonts, + fill = style_xlsx$fills, + border = style_xlsx$borders, + num_fmt = num_fmt) + +(workbook <- linen::workbook(sheets, style, dat$defined_names)) +``` + +``` +## +## Public: +## add_sheet: function (sheet) +## clone: function (deep = FALSE) +## defined_names: tbl_df, tbl, data.frame +## initialize: function (names, style, defined_names) +## names: Sheet1 +## sheets: list +## style: linen_style +``` + +Objective 2: Visit and extract information for all requested worksheets. + +In this case, I'm just reading the first and only sheet. This loop appears in `rexcel_read_workbook()` and calls `rexcel_read_worksheet()` for each requested worksheet. This is the loop and function we eventually exit from and this `workbook` object is what's returned. 
+ + +```r +## enter rexcel_read_worksheet() +## rexcel_read_worksheet(path, s, workbook, dat, strings, style, date_offset) +(sheet <- sheets[1]) +``` + +``` +## [1] "Sheet1" +``` + +```r +(sheet_idx <- match(sheet, workbook$names)) +``` + +``` +## [1] 1 +``` + +```r +(sheet_name <- sheet) +``` + +``` +## [1] "Sheet1" +``` + +```r +(target <- xlsx_internal_sheet_name(sheet, dat)) +``` + +``` +## [1] "xl/worksheets/sheet1.xml" +``` + +```r +(rels <- xlsx_read_rels(path, target)) +``` + +``` +## Source: local data frame [2 x 4] +## +## id type target +## +## 1 rId1 hyperlink http://www.google.com/ +## 2 rId2 drawing ../drawings/worksheetdrawing1.xml +## Variables not shown: target_abs . +``` + +Now we drop down into a lower-level non-exported function, `xlsx_read_sheet()`. + + +```r +## enter xlsx_read_sheet() +(file <- xlsx_internal_sheet_name(sheet_idx, dat)) +``` + +``` +## [1] "xl/worksheets/sheet1.xml" +``` + +```r +xml <- xlsx_read_file(path, file) ## at last! the xml! w00t! +(ns <- xml2::xml_ns(xml)) ## much less w00t now :( +``` + +``` +## d1 <-> http://schemas.openxmlformats.org/spreadsheetml/2006/main +## r <-> http://schemas.openxmlformats.org/officeDocument/2006/relationships +## mx <-> http://schemas.microsoft.com/office/mac/excel/2008/main +## mc <-> http://schemas.openxmlformats.org/markup-compatibility/2006 +## mv <-> urn:schemas-microsoft-com:mac:vml +## x14 <-> http://schemas.microsoft.com/office/spreadsheetml/2009/9/main +## x14ac <-> http://schemas.microsoft.com/office/spreadsheetml/2009/9/ac +## xm <-> http://schemas.microsoft.com/office/excel/2006/main +``` + +```r +(merged <- xlsx_read_merged(xml, ns)) +``` + +``` +## list() +``` + +```r +(view <- xlsx_ct_worksheet_views(xml, ns)) +``` + +``` +## NULL +``` + +```r +(cols <- xlsx_ct_cols(xml, ns)) # NOTE: not used yet +``` + +``` +## Source: local data frame [6 x 9] +## +## best_fit collapsed custom_width hidden min max outline_level style +## +## 1 FALSE FALSE TRUE FALSE 1 1 NA NA +## 2 FALSE 
FALSE TRUE FALSE 2 2 NA NA +## 3 FALSE FALSE TRUE FALSE 3 3 NA NA +## 4 FALSE FALSE TRUE FALSE 4 4 NA NA +## 5 FALSE FALSE TRUE FALSE 5 5 NA NA +## 6 FALSE FALSE TRUE FALSE 6 6 NA NA +## Variables not shown: width . +``` + +```r +## this is where it's at! +(cell_dat <- xlsx_parse_cells(xml, ns, strings, style, date_offset)) +``` + +``` +## $cells +## Source: local data frame [2,022 x 5] +## +## ref style type formula value +## +## 1 A1 2 text +## 2 B1 3 text +## 3 C1 4 text +## 4 D1 2 text +## 5 E1 2 text +## 6 F1 2 text +## 7 A2 2 number +## 8 B2 3 number +## 9 C2 4 number +## 10 D2 2 text +## .. ... ... ... ... ... +## +## $rows +## Source: local data frame [1,000 x 11] +## +## r spans s custom_format ht hidden custom_height +## +## 1 1 NA FALSE NA FALSE NA +## 2 2 NA FALSE NA FALSE NA +## 3 3 NA FALSE NA FALSE NA +## 4 4 NA FALSE NA FALSE NA +## 5 5 NA FALSE NA FALSE NA +## 6 6 NA FALSE NA FALSE NA +## 7 7 NA FALSE NA FALSE NA +## 8 8 NA FALSE NA FALSE NA +## 9 9 NA FALSE NA FALSE NA +## 10 10 NA FALSE NA FALSE NA +## .. ... ... ... ... ... ... ... +## Variables not shown: outline_level , collapsed , thick_top +## , thick_bot . +``` + +```r +## not even sure what this is +(rows <- cell_dat$rows) +``` + +``` +## Source: local data frame [1,000 x 11] +## +## r spans s custom_format ht hidden custom_height +## +## 1 1 NA FALSE NA FALSE NA +## 2 2 NA FALSE NA FALSE NA +## 3 3 NA FALSE NA FALSE NA +## 4 4 NA FALSE NA FALSE NA +## 5 5 NA FALSE NA FALSE NA +## 6 6 NA FALSE NA FALSE NA +## 7 7 NA FALSE NA FALSE NA +## 8 8 NA FALSE NA FALSE NA +## 9 9 NA FALSE NA FALSE NA +## 10 10 NA FALSE NA FALSE NA +## .. ... ... ... ... ... ... ... +## Variables not shown: outline_level , collapsed , thick_top +## , thick_bot . 
+``` + +```r +## this is where cells come from +(cells <- linen::cells(cell_dat$cells$ref, cell_dat$cells$style, + cell_dat$cells$type, cell_dat$cells$value, + cell_dat$cells$formula)) +``` + +``` +## Source: local data frame [2,022 x 12] +## +## ref style value formula type is_formula is_value is_blank +## +## 1 A1 2 text FALSE TRUE FALSE +## 2 B1 3 text FALSE TRUE FALSE +## 3 C1 4 text FALSE TRUE FALSE +## 4 D1 2 text FALSE TRUE FALSE +## 5 E1 2 text FALSE TRUE FALSE +## 6 F1 2 text FALSE TRUE FALSE +## 7 A2 2 number FALSE TRUE FALSE +## 8 B2 3 number FALSE TRUE FALSE +## 9 C2 4 number FALSE TRUE FALSE +## 10 D2 2 text FALSE TRUE FALSE +## .. ... ... ... ... ... ... ... ... +## Variables not shown: is_bool , is_number , is_text , +## is_date . +``` + +```r +## in real life and in other sheets, it's possible comments will be populated +## but not in this sheet +comments <- NULL +``` + +Now we gather everything we've learned about this worksheet into a `linen::worksheet` object. + + +```r +(ws <- linen::worksheet(sheet_name, cols, rows, cells, merged, view, comments, + workbook)) +``` + +``` +## +## : ABCDEF +## 1: aaaaaa +## 2: 000a$$ +## 3: 000 $$ +## 4: 000a$$ +## 5: 00a$$ +## 6: 000a $ +``` + +If we had other sheets to read, that would be done now. Ultimately this `workbook` is returned. 
From 3be66721fd7b33b16a1716932fe9b408043fbd10 Mon Sep 17 00:00:00 2001 From: jennybc Date: Wed, 18 May 2016 12:08:16 -0700 Subject: [PATCH 02/32] add function to list files in the xlsx --- R/read.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/read.R b/R/read.R index 1432277..dcc7945 100644 --- a/R/read.R +++ b/R/read.R @@ -154,6 +154,10 @@ xlsx_read_file <- function(path, file) { xml2::read_xml(filename) } +xlsx_list_files <- function(path) { + tibble::as_data_frame(utils::unzip(path, list = TRUE)) +} + xlsx_read_file_if_exists <- function(path, file, missing=NULL) { ## TODO: Appropriate error handling here is difficult; we should ## check that `path` exists, but by the time that this is called we From 989433e86b9626581bb34c24a11f4ee69fac34b4 Mon Sep 17 00:00:00 2001 From: jennybc Date: Wed, 18 May 2016 12:20:43 -0700 Subject: [PATCH 03/32] stub for is_xlsx() --- R/utils.R | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/R/utils.R b/R/utils.R index 2fede2d..3f02ae5 100644 --- a/R/utils.R +++ b/R/utils.R @@ -133,3 +133,12 @@ as_na <- function(x) { storage.mode(ret) <- storage.mode(x) ret } + +is_xlsx <- function(path) { + if (!file.exists(path)) { + stop("\n", path, "\ndoes not exist") + } + ## TO DO: what else could we put here to increase confidence that this truly + ## is xlsx? + invisible(path) +} From 794a5d3cad54e6015d493f7f98e76df0c44e6af9 Mon Sep 17 00:00:00 2001 From: jennybc Date: Wed, 18 May 2016 23:52:31 -0700 Subject: [PATCH 04/32] start of experiment with registration fxn This is also me just exploring all the files in the unzipped xlsx. And then exposing anything that looks valuable. I'm sure this has lots of overlap with existing linen::workbook. 
--- DESCRIPTION | 8 +++++--- R/read.R | 4 +--- R/register.R | 34 ++++++++++++++++++++++++++++++++++ R/rexcel-package.r | 6 ++++++ 4 files changed, 46 insertions(+), 6 deletions(-) create mode 100644 R/register.R create mode 100644 R/rexcel-package.r diff --git a/DESCRIPTION b/DESCRIPTION index 939e59d..69e6d87 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,12 +12,14 @@ Imports: cellranger, linen (>= 0.0.3), progress, + purrr, tibble, - xml2 (>= 0.1.2.9000) + xml2 (>= 0.1.2.9000), + dplyr Suggests: - testthat, knitr, rmarkdown, - rprojroot + rprojroot, + testthat, RoxygenNote: 5.0.1.9000 VignetteBuilder: knitr diff --git a/R/read.R b/R/read.R index dcc7945..c90ed4f 100644 --- a/R/read.R +++ b/R/read.R @@ -32,9 +32,7 @@ rexcel_read <- function(path, sheet=1L) { ##' @param progress Display a progress bar? ##' @export rexcel_read_workbook <- function(path, sheets=NULL, progress=TRUE) { - if (!file.exists(path)) { - stop(sprintf("%s does not exist", path)) - } + is_xlsx(path) dat <- xlsx_read_workbook(path) diff --git a/R/register.R b/R/register.R new file mode 100644 index 0000000..1ca4673 --- /dev/null +++ b/R/register.R @@ -0,0 +1,34 @@ +rexcel_workbook <- function(path) { + ## TO DO: + ## if path is actually a workbook + ## Recall(path$path) + ## i.e. refresh registration of the workbook + ## to be used when you are concerned the xlsx has changed + is_xlsx(path) + manifest <- xlsx_list_files(path) + + ct <- xlsx_read_file(path, "[Content_Types].xml") %>% + xml2::xml_contents() %>% + xml2::xml_attrs() %>% + purrr::map(as.list) %>% + dplyr::bind_rows() + + ## this appears to be always boring? 
omit it + #rels <- xlsx_read_file(path, "_rels/.rels") + + xl_workbook <- xlsx_read_file(path, "xl/workbook.xml") + sheets <- xl_workbook %>% + xml2::xml_find_one("//d1:sheets", xml2::xml_ns(.)) %>% + xml2::xml_contents() %>% + xml2::xml_attrs() %>% + purrr::map(as.list) %>% + dplyr::bind_rows() + + lst(xlsx_path = path, + reg_time = Sys.time(), + manifest, + content_types = ct, + sheets + ) + ## TODO: obviously this should return a workbook object! +} diff --git a/R/rexcel-package.r b/R/rexcel-package.r new file mode 100644 index 0000000..d2c2b64 --- /dev/null +++ b/R/rexcel-package.r @@ -0,0 +1,6 @@ +#' rexcel. +#' +#' @name rexcel +#' @docType package +#' @importFrom purrr `%>%` +NULL From 53ea8b3ef906ab6f83325ea9155401f98a273fad Mon Sep 17 00:00:00 2001 From: jennybc Date: Fri, 20 May 2016 00:30:45 -0700 Subject: [PATCH 05/32] I've poked around all files in xlsx now --- NAMESPACE | 1 + R/register.R | 173 ++++++++++++++++++++++++++++++++++++++++- R/utils.R | 6 ++ man/rexcel.Rd | 11 +++ man/rexcel_workbook.Rd | 31 ++++++++ 5 files changed, 218 insertions(+), 4 deletions(-) create mode 100644 man/rexcel.Rd create mode 100644 man/rexcel_workbook.Rd diff --git a/NAMESPACE b/NAMESPACE index 80fb485..cf36ca9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,3 +3,4 @@ export(rexcel_read) export(rexcel_read_workbook) export(rexcel_readxl) +importFrom(purrr,"`%>%`") diff --git a/R/register.R b/R/register.R index 1ca4673..b5d49f4 100644 --- a/R/register.R +++ b/R/register.R @@ -1,3 +1,23 @@ +#' Low-level function to expose contents of xlsx +#' +#' This is just Jenny getting to know xlsx! Function returns alot of the same +#' information as rexcel_read_workbook() but with several notable exceptions. +#' Returns a list, not a proper linen::workbook. Much less processing is done -- +#' basically only whats needed to some reasonable R object, usually a data +#' frame. 
+#' +#' @param path +#' +#' @return a list +#' @keywords internaln +#' +#' @examples +#' mini_gap_path <- system.file("sheets", "mini-gap.xlsx", package = "rexcel") +#' rexcel_workbook(mini_gap_path) +#' +#' ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx", +#' package = "rexcel") +#' rexcel_workbook(ff_path) rexcel_workbook <- function(path) { ## TO DO: ## if path is actually a workbook @@ -7,28 +27,173 @@ rexcel_workbook <- function(path) { is_xlsx(path) manifest <- xlsx_list_files(path) + ## overview of typical manifest + + ## [Content_Types].xml + ## _rels/.rels + ## xl/workbook.xml + ## xl/sharedStrings.xml + ## xl/styles.xml + ## xl/_rels/workbook.xml.rels + + ## xl/worksheets/sheet1.xml + ## xl/worksheets/sheet2.xml + ## ... and so on + ## xl/worksheets/_rels/sheet1.xml.rels + ## xl/worksheets/_rels/sheet2.xml.rels + ## ... and so on + ## xl/drawings/worksheetdrawing1.xml + ## xl/drawings/worksheetdrawing2.xml + ## ... and so on + + ## [Content_Types].xml ct <- xlsx_read_file(path, "[Content_Types].xml") %>% xml2::xml_contents() %>% xml2::xml_attrs() %>% purrr::map(as.list) %>% - dplyr::bind_rows() + dplyr::bind_rows() %>% + dplyr::select(PartName, Extension, ContentType) + #setdiff(manifest$Name, gsub("^\\/", "", ct$PartName)) + #intersect(gsub("^\\/", "", ct$PartName), manifest$Name) + ## ct is a tbl associating content types with extensions or files + ## for the most part, each row addresses a specific file in manifest + ## except all "rels" files are represented by a single row + ## and there's a row that says xml files are "application/xml" - ## this appears to be always boring? omit it + ## _rels/.rels #rels <- xlsx_read_file(path, "_rels/.rels") + ## this appears to be always boring? 
omit it + ## xl/workbook.xml xl_workbook <- xlsx_read_file(path, "xl/workbook.xml") sheets <- xl_workbook %>% xml2::xml_find_one("//d1:sheets", xml2::xml_ns(.)) %>% xml2::xml_contents() %>% xml2::xml_attrs() %>% purrr::map(as.list) %>% + dplyr::bind_rows() %>% + dplyr::mutate(sheetId = as.integer(sheetId)) + ## sheets is a tbl with one row per worksheet and these variables: + ## state: "visible" (or what else ... "invisible"?) + ## name: e.g. "Africa" (assume this is name of the tab) + ## sheetID: integer (assume this is order perceived by user) + ## id: character, e.g. "rId5" (a key that comes up in other tables) + + ## xl/sharedStrings.xml + shared_strings <- xlsx_read_file(path, "xl/sharedStrings.xml") + shared_strings_att <- xml2::xml_attrs(shared_strings) %>% + as.list() %>% + purrr::map(as.integer) + shared_strings <- shared_strings %>% + xml2::xml_find_all("//d1:t", xml2::xml_ns(.)) %>% + purrr::map_chr(xml2::xml_text) + attributes(shared_strings) <- shared_strings_att + ## sh_strings is a character of shared strings + ## with attributes count (total # of strings?), uniqueCount (its own length?) + + ## xl/styles.xml + styles <- xlsx_read_file(path, "xl/styles.xml") + ns <- xml2::xml_ns(styles) + + font_nodes <- styles %>% + xml2::xml_find_all("//d1:fonts/d1:font", ns) %>% + purrr::map(xml2::xml_children) + f <- function(font_node, ns) { + nms <- xml2::xml_name(font_node, ns) %>% rm_xml_ns() + vals <- xml2::xml_attrs(font_node, ns) %>% purrr::map(unname) + setNames(vals, nms) %>% + purrr::keep(~length(.x) > 0) + } + fonts <- font_nodes %>% + purrr::map(f, ns = ns) %>% dplyr::bind_rows() + ## fonts is a tbl with one row per font and variables such as + ## sz, color, name + + ## I'm don't feel like parsing the remaining elements of styles for now + ## no temptation to duplicate rich's efforts there ... 
yikes + fills <- NULL + borders <- NULL + cell_style_xfs <- NULL + cell_xfs <- NULL + cell_styles <- NULL + num_fmts <- NULL + dxfs <- NULL + + ## xl/_rels/workbook.xml.rels + workbook_rels <- xlsx_read_file(path, "xl/_rels/workbook.xml.rels") %>% + xml2::xml_contents() %>% + xml2::xml_attrs() %>% + purrr::map(as.list) %>% + dplyr::bind_rows() %>% + dplyr::select(Id, Target, Type) + ## workbook_rels is a tibble, each row a file, with variables + ## Id: character, e.g. "rId5" (a key that came up already in sheets above) + ## Target: a file path relative to xl/ + ## Type: an long namespace-y string, the last bit of which tells you + ## if the associated file is sharedStrings, styles, or a worksheet, e.g., + ## http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet + + ## xl/worksheets/_rels/sheet1.xml.rels and friends + fnames <- manifest %>% + dplyr::filter(grepl("rels", Name), grepl("sheet", Name)) %>% + .$Name + nms <- gsub("xl/worksheets/_rels/(sheet[0-9]+).xml.rels", "\\1", fnames) + wr <- fnames %>% + purrr::map(xlsx_read_file, path = path) + ns <- xml2::xml_ns(wr[[1]]) + worksheet_rels <- + wr %>% + purrr::map(xml2::xml_find_one, xpath = "//d1:Relationship", ns = ns) %>% + purrr::map(xml2::xml_attrs, ns = ns) %>% + purrr::map(as.list) %>% + dplyr::bind_rows() %>% + dplyr::mutate(sheet = nms) %>% + dplyr::select(sheet, Target, dplyr::everything()) + ## worksheet_rels is a tibble, each row a file ... so far one row per + ## worksheet, though that might not hold in general, with variables + ## sheet: character, e.g. "sheet1" (I added this!) + ## Target: hmmm .... seems to vary + ## in one example: path to the corresponding worksheetdrawingX.xml file + ## ^^ maybe that's the default? when there's nothing else? 
+ ## in another: "http://www.google.com/", which appears in the sheet + ## Id: character, so far uniformly "rId1" + ## Type: uniformly a long name-spacey string ending in "drawing" or "hyperlink" + ## TargetMode: (seen in one example) "External" for the hyperlink + + ## xl/worksheets/sheet1.xml etc. + #one_sheet <- xlsx_read_file(path, "xl/worksheets/sheet1.xml") + ## come back here and parse enough to learn worksheet extent + ## otherwise, I don't see anything here that belongs in top-level workbook + ## creation + + ## xl/drawings/worksheetdrawing1.xml etc. + #one_drawing <- xlsx_read_file(path, "xl/drawings/worksheetdrawing1.xml") + ## I don't see anything here that belongs in top-level workbook creation + ## also, in my toy examples with no charts, this consists only of namespaces + + ## come back here and make a new object + ## one row per sheet + ## everything from sheets tbl already formed + ## workbook_rels prepend xl/ to Target + ## join to sheets on Id + sheets_df <- workbook_rels %>% + dplyr::mutate(Target = file.path("xl", Target)) %>% + dplyr::right_join(sheets, by = c("Id" = "id")) %>% + dplyr::select(sheetId, name, Id, Target, Type) - lst(xlsx_path = path, + dplyr::lst(xlsx_path = path, reg_time = Sys.time(), manifest, content_types = ct, - sheets + sheets, + sheets_df, + shared_strings, + styles = + dplyr::lst(fonts, fills, borders, cell_style_xfs, cell_xfs, + cell_styles, num_fmts, dxfs), + workbook_rels, + worksheet_rels ) ## TODO: obviously this should return a workbook object! } diff --git a/R/utils.R b/R/utils.R index 3f02ae5..359e292 100644 --- a/R/utils.R +++ b/R/utils.R @@ -140,5 +140,11 @@ is_xlsx <- function(path) { } ## TO DO: what else could we put here to increase confidence that this truly ## is xlsx? + ## look at extension? + ## verify it's a zip archive? only way I know is unix `file` command + ## maybe peek at file listing and verify presence of ... what? + ## [Content_Types].xml ? xl/workbook.xml ? 
invisible(path) } + +rm_xml_ns <- function(x) gsub(".*:(.*)", "\\1", x) diff --git a/man/rexcel.Rd b/man/rexcel.Rd new file mode 100644 index 0000000..2e57f3d --- /dev/null +++ b/man/rexcel.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/rexcel-package.r +\docType{package} +\name{rexcel} +\alias{rexcel} +\alias{rexcel-package} +\title{rexcel.} +\description{ +rexcel. +} + diff --git a/man/rexcel_workbook.Rd b/man/rexcel_workbook.Rd new file mode 100644 index 0000000..3cf9e2e --- /dev/null +++ b/man/rexcel_workbook.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/register.R +\name{rexcel_workbook} +\alias{rexcel_workbook} +\title{Low-level function to expose contents of xlsx} +\usage{ +rexcel_workbook(path) +} +\arguments{ +\item{path}{} +} +\value{ +a list +} +\description{ +This is just Jenny getting to know xlsx! Function returns alot of the same +information as rexcel_read_workbook() but with several notable exceptions. +Returns a list, not a proper linen::workbook. Much less processing is done -- +basically only whats needed to some reasonable R object, usually a data +frame. 
+} +\examples{ +mini_gap_path <- system.file("sheets", "mini-gap.xlsx", package = "rexcel") +rexcel_workbook(mini_gap_path) + +ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx", + package = "rexcel") +rexcel_workbook(ff_path) +} +\keyword{internaln} + From 1f60cb3f2c31eacadd0941564b4345e4959bcb1f Mon Sep 17 00:00:00 2001 From: jennybc Date: Fri, 20 May 2016 14:22:14 -0700 Subject: [PATCH 06/32] oops, correct import of %>% --- NAMESPACE | 2 +- R/rexcel-package.r | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index cf36ca9..5bd68fa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,4 +3,4 @@ export(rexcel_read) export(rexcel_read_workbook) export(rexcel_readxl) -importFrom(purrr,"`%>%`") +importFrom(purrr,"%>%") diff --git a/R/rexcel-package.r b/R/rexcel-package.r index d2c2b64..1c3c5b9 100644 --- a/R/rexcel-package.r +++ b/R/rexcel-package.r @@ -2,5 +2,5 @@ #' #' @name rexcel #' @docType package -#' @importFrom purrr `%>%` +#' @importFrom purrr %>% NULL From 36a7f21d95ca1519ca6ff690583c01435cb3ab8c Mon Sep 17 00:00:00 2001 From: jennybc Date: Sun, 22 May 2016 00:50:06 -0700 Subject: [PATCH 07/32] toy registration fxn for jenny to get oriented --- R/register.R | 45 +++--- vignettes/read-a-sheet.R | 9 ++ vignettes/read-a-sheet.Rmd | 44 +++++- vignettes/read-a-sheet.html | 276 +++++++++++++++++++++++++++++++++- vignettes/read-a-sheet.md | 291 +++++++++++++++++++++++++++++++++++- 5 files changed, 638 insertions(+), 27 deletions(-) diff --git a/R/register.R b/R/register.R index b5d49f4..b29ca93 100644 --- a/R/register.R +++ b/R/register.R @@ -29,13 +29,15 @@ rexcel_workbook <- function(path) { ## overview of typical manifest - ## [Content_Types].xml + ## *** workbook infrastructure *** + ## [Content_Types].xml ## _rels/.rels ## xl/workbook.xml ## xl/sharedStrings.xml ## xl/styles.xml ## xl/_rels/workbook.xml.rels + ## ** worksheets-related *** ## xl/worksheets/sheet1.xml ## xl/worksheets/sheet2.xml ## ... 
and so on @@ -55,10 +57,9 @@ rexcel_workbook <- function(path) { dplyr::select(PartName, Extension, ContentType) #setdiff(manifest$Name, gsub("^\\/", "", ct$PartName)) #intersect(gsub("^\\/", "", ct$PartName), manifest$Name) - ## ct is a tbl associating content types with extensions or files - ## for the most part, each row addresses a specific file in manifest - ## except all "rels" files are represented by a single row - ## and there's a row that says xml files are "application/xml" + ## ct is a tbl associating content types with extensions or specific files + ## two "general" rows for the xml and rels extensions, otherwise ... + ## each row seems to address a specific file from the manifest (but not all) ## _rels/.rels #rels <- xlsx_read_file(path, "_rels/.rels") @@ -110,7 +111,7 @@ rexcel_workbook <- function(path) { ## fonts is a tbl with one row per font and variables such as ## sz, color, name - ## I'm don't feel like parsing the remaining elements of styles for now + ## I don't feel like parsing the remaining elements of styles for now ## no temptation to duplicate rich's efforts there ... yikes fills <- NULL borders <- NULL @@ -130,7 +131,7 @@ rexcel_workbook <- function(path) { ## workbook_rels is a tibble, each row a file, with variables ## Id: character, e.g. 
"rId5" (a key that came up already in sheets above) ## Target: a file path relative to xl/ - ## Type: an long namespace-y string, the last bit of which tells you + ## Type: a long namespace-y string, the last bit of which tells you ## if the associated file is sharedStrings, styles, or a worksheet, e.g., ## http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet @@ -142,23 +143,27 @@ rexcel_workbook <- function(path) { wr <- fnames %>% purrr::map(xlsx_read_file, path = path) ns <- xml2::xml_ns(wr[[1]]) - worksheet_rels <- - wr %>% - purrr::map(xml2::xml_find_one, xpath = "//d1:Relationship", ns = ns) %>% - purrr::map(xml2::xml_attrs, ns = ns) %>% - purrr::map(as.list) %>% - dplyr::bind_rows() %>% - dplyr::mutate(sheet = nms) %>% - dplyr::select(sheet, Target, dplyr::everything()) + worksheet_rels <- wr %>% + purrr::map(xml2::xml_find_all, xpath = "//d1:Relationship", ns = ns) %>% + setNames(nms) + f <- function(x, ns) { + x %>% + purrr::map(xml2::xml_attrs, ns = ns) %>% + purrr::map(as.list) %>% + dplyr::bind_rows() + } + worksheet_rels <- worksheet_rels %>% + purrr::map(f, ns = ns) %>% + dplyr::bind_rows(.id = "worksheet") ## worksheet_rels is a tibble, each row a file ... so far one row per ## worksheet, though that might not hold in general, with variables - ## sheet: character, e.g. "sheet1" (I added this!) + ## worksheet: character, e.g. "sheet1" (I added this!) + ## Id: character, so far uniformly "rId1" + ## Type: uniformly a long name-spacey string ending in "drawing" or "hyperlink" ## Target: hmmm .... seems to vary ## in one example: path to the corresponding worksheetdrawingX.xml file ## ^^ maybe that's the default? when there's nothing else? ## in another: "http://www.google.com/", which appears in the sheet - ## Id: character, so far uniformly "rId1" - ## Type: uniformly a long name-spacey string ending in "drawing" or "hyperlink" ## TargetMode: (seen in one example) "External" for the hyperlink ## xl/worksheets/sheet1.xml etc. 
@@ -172,7 +177,7 @@ rexcel_workbook <- function(path) { ## I don't see anything here that belongs in top-level workbook creation ## also, in my toy examples with no charts, this consists only of namespaces - ## come back here and make a new object + ## use synthesis of the above: ## one row per sheet ## everything from sheets tbl already formed ## workbook_rels prepend xl/ to Target @@ -180,7 +185,7 @@ rexcel_workbook <- function(path) { sheets_df <- workbook_rels %>% dplyr::mutate(Target = file.path("xl", Target)) %>% dplyr::right_join(sheets, by = c("Id" = "id")) %>% - dplyr::select(sheetId, name, Id, Target, Type) + dplyr::select(sheetId, name, Id, Target, state, Type) dplyr::lst(xlsx_path = path, reg_time = Sys.time(), diff --git a/vignettes/read-a-sheet.R b/vignettes/read-a-sheet.R index 07cdee2..feae282 100644 --- a/vignettes/read-a-sheet.R +++ b/vignettes/read-a-sheet.R @@ -2,6 +2,15 @@ library(rprojroot) devtools::load_all(find_package_root_file()) +## ------------------------------------------------------------------------ +mini_gap_path <- system.file("sheets", "mini-gap.xlsx", package = "rexcel") +rexcel_workbook(mini_gap_path) + +## ------------------------------------------------------------------------ +ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx", + package = "rexcel") +rexcel_workbook(ff_path) + ## ------------------------------------------------------------------------ (ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx", package = "rexcel")) diff --git a/vignettes/read-a-sheet.Rmd b/vignettes/read-a-sheet.Rmd index c0631fe..8798b62 100644 --- a/vignettes/read-a-sheet.Rmd +++ b/vignettes/read-a-sheet.Rmd @@ -13,13 +13,53 @@ vignette: > %\VignetteEncoding{UTF-8} --- -*I'm exploring the existing sheet reading functionality, using the vignette format. 
This is not an actual vignette!* - ```{r} library(rprojroot) devtools::load_all(find_package_root_file()) ``` +*Using a function I wrote while exploring all the files that make up an xlsx.* + +Apply it to mini gapminder. + +```{r} +mini_gap_path <- system.file("sheets", "mini-gap.xlsx", package = "rexcel") +mini_gap_workbook <- rexcel_workbook(mini_gap_path) +str(mini_gap_workbook, max.level = 1) +``` + +What's here? + + * `xlsx_path`: path to the xlsx + * `reg_time`: time xlsx was processed + * `manifest`: file list for the xlsx zip archive + * `content_types`: tbl representing `[Content_Types].xml` + * `sheets`: tbl representing `xl/workbook.xml` + * `sheets_df`: + - This is really the only thing I created. + - A tbl with one row per worksheet, from joining `sheets` and `workbook_rels` + * `shared_strings`: character vector representing `xl/sharedStrings.xml` + * `styles`: list of tbls from `xl/styles.xml` (ok, I admit, I stopped after parsing fonts) + * `workbook_rels`: + - tbl that links target files to `Id`s, also gives file type + - example: tells you that `Id = rId4` refers to `Target` file `xl/worksheets/sheetX.xml` + - comes from `xl/_rels/workbook.xml.rels` + * `worksheet_rels`: + - *I'm still figuring this one out but it's about files or external resources (potentially) referred to from worksheets* + - comes from files like` xl/worksheets/_rels/(sheet[0-9]+).xml.rels` + + + +Apply it to formula and formatting sheet. + +```{r} +ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx", + package = "rexcel") +rexcel_workbook(ff_path) +``` + +*Here I'm exploring the existing sheet reading functionality, using the vignette format. This is not an actual vignette!* + Peeling the many-layered onion that is `rexcel_read()` until I get at the XML for a worksheet. Wish me luck. We'll work with an example sheet created for `googlesheets` that has alot of formulas and formatting going. 
diff --git a/vignettes/read-a-sheet.html b/vignettes/read-a-sheet.html index 90301ef..18ae3a3 100644 --- a/vignettes/read-a-sheet.html +++ b/vignettes/read-a-sheet.html @@ -12,7 +12,7 @@ - + Read an Excel Sheet @@ -70,15 +70,285 @@

Read an Excel Sheet

Jennifer Bryan

-

2016-05-18

+

2016-05-20

-

I’m exploring the existing sheet reading functionality, using the vignette format. This is not an actual vignette!

library(rprojroot)
## Warning: package 'rprojroot' was built under R version 3.2.4
devtools::load_all(find_package_root_file())
## Loading rexcel
+

Using a function I wrote while exploring all the files that make up an xlsx.

+

Apply it to mini gapminder.

+
mini_gap_path <- system.file("sheets", "mini-gap.xlsx", package = "rexcel")
+rexcel_workbook(mini_gap_path)
+
## $xlsx_path
+## [1] "/Users/jenny/rrr/rexcel/inst/sheets/mini-gap.xlsx"
+## 
+## $reg_time
+## [1] "2016-05-20 15:18:34 PDT"
+## 
+## $manifest
+## Source: local data frame [21 x 3]
+## 
+##                                   Name Length                Date
+##                                  <chr>  <dbl>              <time>
+## 1             xl/worksheets/sheet1.xml   2136 2015-04-25 12:00:00
+## 2  xl/worksheets/_rels/sheet1.xml.rels    307 2015-04-25 12:00:00
+## 3             xl/worksheets/sheet2.xml   2136 2015-04-25 12:00:00
+## 4  xl/worksheets/_rels/sheet2.xml.rels    307 2015-04-25 12:00:00
+## 5             xl/worksheets/sheet3.xml   2146 2015-04-25 12:00:00
+## 6  xl/worksheets/_rels/sheet3.xml.rels    307 2015-04-25 12:00:00
+## 7             xl/worksheets/sheet4.xml   2136 2015-04-25 12:00:00
+## 8  xl/worksheets/_rels/sheet4.xml.rels    307 2015-04-25 12:00:00
+## 9             xl/worksheets/sheet5.xml   2144 2015-04-25 12:00:00
+## 10 xl/worksheets/_rels/sheet5.xml.rels    307 2015-04-25 12:00:00
+## ..                                 ...    ...                 ...
+## 
+## $content_types
+## Source: local data frame [15 x 3]
+## 
+##                              PartName Extension
+##                                 <chr>     <chr>
+## 1                                <NA>      rels
+## 2                                <NA>       xml
+## 3  /xl/drawings/worksheetdrawing4.xml      <NA>
+## 4  /xl/drawings/worksheetdrawing2.xml      <NA>
+## 5  /xl/drawings/worksheetdrawing1.xml      <NA>
+## 6  /xl/drawings/worksheetdrawing3.xml      <NA>
+## 7  /xl/drawings/worksheetdrawing5.xml      <NA>
+## 8                      /xl/styles.xml      <NA>
+## 9               /xl/sharedStrings.xml      <NA>
+## 10                   /xl/workbook.xml      <NA>
+## 11          /xl/worksheets/sheet5.xml      <NA>
+## 12          /xl/worksheets/sheet3.xml      <NA>
+## 13          /xl/worksheets/sheet1.xml      <NA>
+## 14          /xl/worksheets/sheet4.xml      <NA>
+## 15          /xl/worksheets/sheet2.xml      <NA>
+## Variables not shown: ContentType <chr>.
+## 
+## $sheets
+## Source: local data frame [5 x 4]
+## 
+##     state     name sheetId    id
+##     <chr>    <chr>   <int> <chr>
+## 1 visible   Africa       1  rId3
+## 2 visible Americas       2  rId4
+## 3 visible     Asia       3  rId5
+## 4 visible   Europe       4  rId6
+## 5 visible  Oceania       5  rId7
+## 
+## $sheets_df
+## Source: local data frame [5 x 5]
+## 
+##   sheetId     name    Id                   Target
+##     <int>    <chr> <chr>                    <chr>
+## 1       1   Africa  rId3 xl/worksheets/sheet4.xml
+## 2       2 Americas  rId4 xl/worksheets/sheet3.xml
+## 3       3     Asia  rId5 xl/worksheets/sheet5.xml
+## 4       4   Europe  rId6 xl/worksheets/sheet1.xml
+## 5       5  Oceania  rId7 xl/worksheets/sheet2.xml
+## Variables not shown: Type <chr>.
+## 
+## $shared_strings
+##  [1] "country"                "continent"             
+##  [3] "year"                   "lifeExp"               
+##  [5] "pop"                    "gdpPercap"             
+##  [7] "Algeria"                "Africa"                
+##  [9] "Angola"                 "Albania"               
+## [11] "Europe"                 "Benin"                 
+## [13] "Austria"                "Argentina"             
+## [15] "Americas"               "Belgium"               
+## [17] "Australia"              "Oceania"               
+## [19] "Bolivia"                "Bosnia and Herzegovina"
+## [21] "New Zealand"            "Bulgaria"              
+## [23] "Brazil"                 "Canada"                
+## [25] "Afghanistan"            "Asia"                  
+## [27] "Bahrain"                "Chile"                 
+## [29] "Bangladesh"             "Botswana"              
+## [31] "Cambodia"               "China"                 
+## [33] "Burkina Faso"          
+## attr(,"count")
+## [1] 80
+## attr(,"uniqueCount")
+## [1] 33
+## 
+## $styles
+## $styles$fonts
+## Source: local data frame [1 x 3]
+## 
+##      sz    color  name
+##   <chr>    <chr> <chr>
+## 1  10.0 FF000000 Arial
+## 
+## $styles$fills
+## NULL
+## 
+## $styles$borders
+## NULL
+## 
+## $styles$cell_style_xfs
+## NULL
+## 
+## $styles$cell_xfs
+## NULL
+## 
+## $styles$cell_styles
+## NULL
+## 
+## $styles$num_fmts
+## NULL
+## 
+## $styles$dxfs
+## NULL
+## 
+## 
+## $workbook_rels
+## Source: local data frame [7 x 3]
+## 
+##      Id                Target
+##   <chr>                 <chr>
+## 1  rId2     sharedStrings.xml
+## 2  rId1            styles.xml
+## 3  rId4 worksheets/sheet3.xml
+## 4  rId3 worksheets/sheet4.xml
+## 5  rId6 worksheets/sheet1.xml
+## 6  rId5 worksheets/sheet5.xml
+## 7  rId7 worksheets/sheet2.xml
+## Variables not shown: Type <chr>.
+## 
+## $worksheet_rels
+## $worksheet_rels[[1]]
+## {xml_nodeset (1)}
+## [1] <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/offi ...
+## 
+## $worksheet_rels[[2]]
+## {xml_nodeset (1)}
+## [1] <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/offi ...
+## 
+## $worksheet_rels[[3]]
+## {xml_nodeset (1)}
+## [1] <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/offi ...
+## 
+## $worksheet_rels[[4]]
+## {xml_nodeset (1)}
+## [1] <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/offi ...
+## 
+## $worksheet_rels[[5]]
+## {xml_nodeset (1)}
+## [1] <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/offi ...
+

Apply it to formula and formatting sheet.

+
ff_path <- system.file("sheets", "gs-test-formula-formatting.xlsx",
+                       package = "rexcel")
+rexcel_workbook(ff_path)
+
## $xlsx_path
+## [1] "/Users/jenny/rrr/rexcel/inst/sheets/gs-test-formula-formatting.xlsx"
+## 
+## $reg_time
+## [1] "2016-05-20 15:18:34 PDT"
+## 
+## $manifest
+## Source: local data frame [9 x 3]
+## 
+##                                  Name Length                Date
+##                                 <chr>  <dbl>              <time>
+## 1            xl/worksheets/sheet1.xml  60580 2016-05-03 02:56:00
+## 2 xl/worksheets/_rels/sheet1.xml.rels    471 2016-05-03 02:56:00
+## 3   xl/drawings/worksheetdrawing1.xml    494 2016-05-03 02:56:00
+## 4                xl/sharedStrings.xml    407 2016-05-03 02:56:00
+## 5                       xl/styles.xml   3014 2016-05-03 02:56:00
+## 6                     xl/workbook.xml    731 2016-05-03 02:56:00
+## 7          xl/_rels/workbook.xml.rels    565 2016-05-03 02:56:00
+## 8                         _rels/.rels    296 2016-05-03 02:56:00
+## 9                 [Content_Types].xml    945 2016-05-03 02:56:00
+## 
+## $content_types
+## Source: local data frame [7 x 3]
+## 
+##                             PartName Extension
+##                                <chr>     <chr>
+## 1                               <NA>       xml
+## 2                               <NA>      rels
+## 3          /xl/worksheets/sheet1.xml      <NA>
+## 4              /xl/sharedStrings.xml      <NA>
+## 5 /xl/drawings/worksheetdrawing1.xml      <NA>
+## 6                     /xl/styles.xml      <NA>
+## 7                   /xl/workbook.xml      <NA>
+## Variables not shown: ContentType <chr>.
+## 
+## $sheets
+## Source: local data frame [1 x 4]
+## 
+##     state   name sheetId    id
+##     <chr>  <chr>   <int> <chr>
+## 1 visible Sheet1       1  rId3
+## 
+## $sheets_df
+## Source: local data frame [1 x 5]
+## 
+##   sheetId   name    Id                   Target
+##     <int>  <chr> <chr>                    <chr>
+## 1       1 Sheet1  rId3 xl/worksheets/sheet1.xml
+## Variables not shown: Type <chr>.
+## 
+## $shared_strings
+##  [1] "integer"           "number_formatted"  "number_rounded"   
+##  [4] "character"         "formula"           "formula_formatted"
+##  [7] "one"               "three"             "four"             
+## [10] "five"             
+## attr(,"count")
+## [1] 10
+## attr(,"uniqueCount")
+## [1] 10
+## 
+## $styles
+## $styles$fonts
+## Source: local data frame [3 x 3]
+## 
+##      sz    color        name
+##   <chr>    <chr>       <chr>
+## 1  10.0 FF000000       Arial
+## 2  <NA> FF0000FF        <NA>
+## 3  <NA>     <NA> Courier New
+## 
+## $styles$fills
+## NULL
+## 
+## $styles$borders
+## NULL
+## 
+## $styles$cell_style_xfs
+## NULL
+## 
+## $styles$cell_xfs
+## NULL
+## 
+## $styles$cell_styles
+## NULL
+## 
+## $styles$num_fmts
+## NULL
+## 
+## $styles$dxfs
+## NULL
+## 
+## 
+## $workbook_rels
+## Source: local data frame [3 x 3]
+## 
+##      Id                Target
+##   <chr>                 <chr>
+## 1  rId1            styles.xml
+## 2  rId2     sharedStrings.xml
+## 3  rId3 worksheets/sheet1.xml
+## Variables not shown: Type <chr>.
+## 
+## $worksheet_rels
+## $worksheet_rels[[1]]
+## {xml_nodeset (2)}
+## [1] <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/offi ...
+## [2] <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/offi ...
+

Here I’m exploring the existing sheet reading functionality, using the vignette format. This is not an actual vignette!

Peeling the many-layered onion that is rexcel_read() until I get at the XML for a worksheet. Wish me luck.

We’ll work with an example sheet created for googlesheets that has alot of formulas and formatting going.

Objective 1: create a linen::workbook object. Dropping into code inside rexcel_read_workbook().

diff --git a/vignettes/read-a-sheet.md b/vignettes/read-a-sheet.md index dc9e316..b2533f9 100644 --- a/vignettes/read-a-sheet.md +++ b/vignettes/read-a-sheet.md @@ -2,8 +2,6 @@ Jennifer Bryan `r Sys.Date()` -*I'm exploring the existing sheet reading functionality, using the vignette format. This is not an actual vignette!* - ```r library(rprojroot) @@ -21,6 +19,295 @@ devtools::load_all(find_package_root_file()) ## Loading rexcel ``` +*Using a function I wrote while exploring all the files that make up an xlsx.* + +Apply it to mini gapminder. + + +```r +mini_gap_path <- system.file("sheets", "mini-gap.xlsx", package = "rexcel") +rexcel_workbook(mini_gap_path) +``` + +``` +## $xlsx_path +## [1] "/Users/jenny/rrr/rexcel/inst/sheets/mini-gap.xlsx" +## +## $reg_time +## [1] "2016-05-20 15:18:34 PDT" +## +## $manifest +## Source: local data frame [21 x 3] +## +## Name Length Date +##