Skip to content

Cannot read nullable date/datetime columns returned as Arrow object #869

@cgiachalis

Description

@cgiachalis

Continuing from #866: when date/datetime data is stored with nullable = TRUE and then read back as an Arrow object, every value in those columns comes back as null, including the non-missing ones. See below.

Reprex

Create schema

library(tiledb)

uri <- tempfile()

domain <- tiledb_domain(tiledb_dim("row", c(0L, 100L), 100L, "INT32"))

attrib <- c(tiledb_attr("date",   type = "DATETIME_DAY", nullable = TRUE),
            tiledb_attr("datetime",   type = "DATETIME_MS", nullable = TRUE),
            tiledb_attr("nanosecs",   type = "DATETIME_NS", nullable = TRUE),
            tiledb_attr("float64",  type = "FLOAT64", nullable = TRUE))

schema <- tiledb_array_schema(domain, attrib, sparse=TRUE)
res <- tiledb_array_create(uri, schema)

Store data and read back as data.table

df <- data.frame(row     =  1:2,
                 date    =  c(as.Date("1990-01-01"), as.Date(NA)),
                 datetime   =  c(as.POSIXct("1990-01-01"), as.POSIXct(NA)),
                 nanosecs   =  nanotime::as.nanotime(c(100, NA)),
                 float64 =  c(1, NA))

# Save data and read back as data.table
arr <- tiledb_array(uri, return_as="data.table")
arr[] <- df
arr[]
#>      row       date                  datetime     nanosecs                           float64
#>    <int>     <Date>                    <POSc>     <nanotime>                          <num>
#> 1:     1 1990-01-01       1990-01-01 00:00:00    1970-01-01T00:00:00.000000100+00:00     1
#> 2:     2 1970-01-01 -292275055-05-16 18:21:56    <NA>                                   NA

Read back as arrow

# Now read back as arrow table
arr <- tiledb_array(uri, return_as = "arrow")

# print
arr[]
#> Table
#> 2 rows x 5 columns
#> $row <int32 not null>
#> $date <date32[day]>
#> $datetime <timestamp[ms]>
#> $nanosecs <timestamp[ns]>
#> $float64 <double>

# Convert to data.table
data.table::as.data.table(arr[])
#>      row   date datetime nanosecs float64
#>    <int> <Date>   <POSc>   <POSc>   <num>
#> 1:     1   <NA>     <NA>     <NA>       1
#> 2:     2   <NA>     <NA>     <NA>      NA

# or equivalent conversion
arr[]$to_data_frame()
#>   row date datetime nanosecs float64
#> 1   1 <NA>     <NA>     <NA>       1
#> 2   2 <NA>     <NA>     <NA>      NA

# Inspect a single column: every value is null, including the non-missing first row
arr[][["date"]]
#> ChunkedArray
#> <date32[day]>
#> [
#>   [
#>     null,
#>     null
#>   ]
#> ]

Relevant issues: #847, #866

With nullable = FALSE, it works as expected

library(tiledb)

uri <- tempfile()


domain <- tiledb_domain(tiledb_dim("row", c(0L, 100L), 100L, "INT32"))

attrib <- c(tiledb_attr("date",   type = "DATETIME_DAY", nullable = FALSE),
            tiledb_attr("datetime",   type = "DATETIME_MS", nullable = FALSE),
            tiledb_attr("nanosecs",   type = "DATETIME_NS", nullable = FALSE),
            tiledb_attr("float64",  type = "FLOAT64", nullable = FALSE))

schema <- tiledb_array_schema(domain, attrib, sparse=TRUE)
res <- tiledb_array_create(uri, schema)


df <- data.frame(row     =  1:2,
                 date    =  c(as.Date("1990-01-01"), as.Date(NA)),
                 datetime   =  c(as.POSIXct("1990-01-01"), as.POSIXct(NA)),
                 nanosecs   =  nanotime::as.nanotime(c(100, NA)),
                 float64 =  c(1, NA))

# Save data and read back as data.table
arr <- tiledb_array(uri, return_as="data.table")
arr[] <- df
arr[]
#>      row       date                  datetime
#>    <int>     <Date>                    <POSc>
#> 1:     1 1990-01-01       1990-01-01 00:00:00
#> 2:     2 1970-01-01 -292275055-05-16 18:21:56
#>                               nanosecs float64
#>                             <nanotime>   <num>
#> 1: 1970-01-01T00:00:00.000000100+00:00       1
#> 2:                                <NA>      NA

# Now read back as arrow table
arr <- tiledb_array(uri, return_as = "arrow")

# print
arr[]
#> Table
#> 2 rows x 5 columns
#> $row <int32 not null>
#> $date <date32[day] not null>
#> $datetime <timestamp[ms] not null>
#> $nanosecs <timestamp[ns] not null>
#> $float64 <double not null>

# Convert to data.table
data.table::as.data.table(arr[])
#>      row       date                  datetime            nanosecs float64
#>    <int>     <Date>                    <POSc>              <POSc>   <num>
#> 1:     1 1990-01-01       1990-01-01 00:00:00 1970-01-01 02:00:00       1
#> 2:     2 1970-01-01 -292275055-05-16 18:21:56 1677-09-21 01:47:35      NA

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions