diff --git a/.Rbuildignore b/.Rbuildignore index 5e611f9..c2e6a6c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,7 +1,9 @@ -\.gitignore -\.project -\.settings -\.Rbuildignore -.*\.tar\.gz -\.Rbuildignore.Rcheck -RSocrata.Rcheck +\.gitignore +\.project +\.settings +\.Rbuildignore +.*\.tar\.gz +\.Rbuildignore.Rcheck +RSocrata.Rcheck +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore index 01b7d11..adc165d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,6 @@ *.pdf /DESCRIPTION.Rcheck /.Rbuildignore.Rcheck +.Rproj.user +*.Rproj +.Rhistory diff --git a/DESCRIPTION b/DESCRIPTION index 454ba33..2b4ff7a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,18 +1,18 @@ -Package: RSocrata -Title: Download Socrata datasets as R data frames -Description: Provided with a Socrata dataset resource URL, - or a Socrata SoDA web API query, - or a Socrata "human-friendly" URL, - returns an R data frame. - Converts dates to POSIX format. - Manages throttling by Socrata. -Version: 1.4 -Date: 2014-03-18 -Depends: - httr (>= 0.3), - RJSONIO -Suggests: - RUnit -Author: Hugh J. Devlin, Ph. D. and Tom Schenk, Jr. -Maintainer: Hugh J. Devlin -License: MIT + file LICENSE +Package: RSocrata +Title: Download Socrata datasets as R data frames +Description: Provided with a Socrata dataset resource URL, + or a Socrata SoDA web API query, + or a Socrata "human-friendly" URL, + returns an R data frame. + Converts dates to POSIX format. + Manages throttling by Socrata. +Version: 1.5 +Date: 2014-03-18 +Depends: + httr (>= 0.3), + jsonlite +Suggests: + RUnit +Author: Hugh J. Devlin, Ph. D. and Tom Schenk, Jr. +Maintainer: Hugh J. 
Devlin +License: MIT + file LICENSE diff --git a/NAMESPACE b/NAMESPACE index 8c6a91c..2bd0ab2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,5 @@ -export(fieldName) -export(posixify) -export(read.socrata) -import(RJSONIO) -import(httr) +# Generated by roxygen2 (4.0.1): do not edit by hand + +export(fieldName) +export(posixify) +export(read.socrata) diff --git a/R/RSocrata.R b/R/RSocrata.R index 9789bb6..146063e 100644 --- a/R/RSocrata.R +++ b/R/RSocrata.R @@ -1,187 +1,219 @@ -# An interface to data hosted online in Socrata data repositories -# -# Author: Hugh J. Devlin, Ph. D. 2013-08-28 -############################################################################### - -library('httr') # for access to the HTTP header -library('RJSONIO') # for parsing data types from Socrata - -#' Time-stamped message -#' -#' Issue a time-stamped, origin-stamped log message. -#' @param s a string -#' @return None (invisible NULL) as per cat -#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org} -logMsg <- function(s) { - cat(format(Sys.time(), "%Y-%m-%d %H:%M:%OS3 "), as.character(sys.call(-1))[1], ": ", s, '\n', sep='') -} - -#' Checks the validity of the syntax for a potential Socrata dataset Unique Identifier, also known as a 4x4. -#' -#' Will check the validity of a potential dataset unique identifier -#' supported by Socrata. It will provide an exception if the syntax -#' does not align to Socrata unique identifiers. It only checks for -#' the validity of the syntax, but does not check if it actually exists. 
-#' @param fourByFour a string; character vector of length one -#' @return TRUE if is valid Socrata unique identifier, FALSE otherwise -#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} -isFourByFour <- function(fourByFour) { - fourByFour <- as.character(fourByFour) - if(nchar(fourByFour) != 9) - return(FALSE) - if(regexpr("[[:alnum:]]{4}-[[:alnum:]]{4}", fourByFour) == -1) - return(FALSE) - TRUE -} - -#' Convert, if necessary, URL to valid REST API URL supported by Socrata. -#' -#' Will convert a human-readable URL to a valid REST API call -#' supported by Socrata. It will accept a valid API URL if provided -#' by users and will also convert a human-readable URL to a valid API -#' URL. -#' @param url a string; character vector of length one -#' @return a valid Url -#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} -validateUrl <- function(url) { - url <- as.character(url) - parsedUrl <- parse_url(url) - if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname) | is.null(parsedUrl$path)) - stop(url, " does not appear to be a valid URL.") - if(substr(parsedUrl$path, 1, 9) == 'resource/') { - return(build_url(parsedUrl)) # resource url already - } - fourByFour <- basename(parsedUrl$path) - if(!isFourByFour(fourByFour)) - stop(fourByFour, " is not a valid Socrata dataset unique identifier.") - parsedUrl$path <- paste("resource/", fourByFour, ".csv", sep="") - build_url(parsedUrl) -} - -#' Convert Socrata human-readable column name to field name -#' -#' Convert Socrata human-readable column name, -#' as it might appear in the first row of data, -#' to field name as it might appear in the HTTP header; -#' that is, lower case, periods replaced with underscores#' -#' @param humanName a Socrata human-readable column name -#' @return Socrata field name -#' @export -#' @author Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@@cityofchicago.org} -#' @examples -#' fieldName("Number.of.Stations") # number_of_stations -fieldName <- function(humanName) { - tolower(gsub('\\.', '_', as.character(humanName))) -} - -#' Convert Socrata calendar_date string to POSIX -#' -#' @param x char in Socrata calendar_date format -#' @return a POSIX date -#' @export -#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} -posixify <- function(x) { - x <- as.character(x) - # Two calendar date formats supplied by Socrata - if(regexpr("^[[:digit:]]{1,2}/[[:digit:]]{1,2}/[[:digit:]]{4}$", x[1])[1] == 1) - strptime(x, format="%m/%d/%Y") - else - strptime(x, format="%m/%d/%Y %I:%M:%S %p") -} - -# Wrap httr GET in some diagnostics -# -# In case of failure, report error details from Socrata -# -# @param url Socrata Open Data Application Program Interface (SODA) query -# @return httr response object -# @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} -getResponse <- function(url) { - response <- GET(url) - status <- http_status(response) - if(response$status_code != 200) { - msg <- paste("Error in httr GET:", response$status_code, response$headers$statusmessage, url) - if(response$headers$`content-length` > 0) { - details <- content(response) - msg <- paste(msg, details$code[1], details$message[1]) - } - logMsg(msg) - } - stop_for_status(response) - response -} - -# Content parsers -# -# Return a data frame for csv -# -# @author Hugh J. 
Devlin \email{Hugh.Devlin@@cityofchicago.org} -# @param an httr response object -# @return data frame, possibly empty -getContentAsDataFrame <- function(response) { UseMethod('response') } -getContentAsDataFrame <- function(response) { - mimeType <- response$header$'content-type' - # skip optional parameters - sep <- regexpr(';', mimeType)[1] - if(sep != -1) mimeType <- substr(mimeType, 0, sep[1] - 1) - switch(mimeType, - 'text/csv' = - content(response), # automatic parsing - 'application/json' = - if(content(response, as='text') == "[ ]") # empty json? - data.frame() # empty data frame - else - data.frame(t(sapply(content(response), unlist)), stringsAsFactors=FALSE) - ) # end switch -} - -# Get the SoDA 2 data types -# -# Get the Socrata Open Data Application Program Interface data types from the http response header -# @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} -# @param responseHeaders headers attribute from an httr response object -# @return a named vector mapping field names to data types -getSodaTypes <- function(response) { UseMethod('response') } -getSodaTypes <- function(response) { - result <- fromJSON(response$headers[['x-soda2-types']]) - names(result) <- fromJSON(response$headers[['x-soda2-fields']]) - result -} - -#' Get a full Socrata data set as an R data frame -#' -#' Manages throttling and POSIX date-time conversions -#' -#' @param url A Socrata resource URL, -#' or a Socrata "human-friendly" URL, -#' or Socrata Open Data Application Program Interface (SODA) query -#' requesting a comma-separated download format (.csv suffix), -#' May include SoQL parameters, -#' but is assumed to not include a SODA offset parameter -#' @return an R data frame with POSIX dates -#' @export -#' @author Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@@cityofchicago.org} -#' @examples -#' df <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv") -read.socrata <- function(url) { - validUrl <- validateUrl(url) # check url syntax, allow human-readable Socrata url - parsedUrl <- parse_url(validUrl) - mimeType <- guess_media(parsedUrl$path) - if(!(mimeType %in% c('text/csv','application/json'))) - stop("Error in read.socrata: ", mimeType, " not a supported data format.") - response <- getResponse(validUrl) - page <- getContentAsDataFrame(response) - result <- page - dataTypes <- getSodaTypes(response) - while (nrow(page) > 0) { # more to come maybe? - query <- paste(validUrl, if(is.null(parsedUrl$query)) {'?'} else {"&"}, '$offset=', nrow(result), sep='') - response <- getResponse(query) - page <- getContentAsDataFrame(response) - result <- rbind(result, page) # accumulate - } - # convert Socrata calendar dates to posix format - for(columnName in colnames(page)[!is.na(dataTypes[fieldName(colnames(page))]) & dataTypes[fieldName(colnames(page))] == 'calendar_date']) { - result[[columnName]] <- posixify(result[[columnName]]) - } - result -} +############################################################################### +# An interface to data hosted online in Socrata data repositories +# +# Author: Hugh J. Devlin, Ph. D. 2013-08-28 +############################################################################### + +library('httr') # for access to the HTTP header +library('jsonlite') # for parsing data types from Socrata + +#' Time-stamped message +#' +#' Issue a time-stamped, origin-stamped log message. +#' @param s a string +#' @return None (invisible NULL) as per cat +#' @author Hugh J. Devlin \email{Hugh.Devlin@@cityofchicago.org} +logMsg <- function(s) { + cat(format(Sys.time(), "%Y-%m-%d %H:%M:%OS3 "), as.character(sys.call(-1))[1], ": ", s, '\n', sep='') +} + + + +#' Checks the validity of the syntax for a potential Socrata dataset Unique Identifier, also known as a 4x4. 
+#' +#' Will check the validity of a potential dataset unique identifier +#' supported by Socrata. It will provide an exception if the syntax +#' does not align to Socrata unique identifiers. It only checks for +#' the validity of the syntax, but does not check if it actually exists. +#' @param fourByFour a string; character vector of length one +#' @return TRUE if is valid Socrata unique identifier, FALSE otherwise +#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} +isFourByFour <- function(fourByFour) { + fourByFour <- as.character(fourByFour) + if(nchar(fourByFour) != 9) + return(FALSE) + if(regexpr("[[:alnum:]]{4}-[[:alnum:]]{4}", fourByFour) == -1) + return(FALSE) + TRUE +} + +#' Convert, if necessary, URL to valid REST API URL supported by Socrata. +#' +#' Will convert a human-readable URL to a valid REST API call +#' supported by Socrata. It will accept a valid API URL if provided +#' by users and will also convert a human-readable URL to a valid API +#' URL. +#' @param url a string; character vector of length one +#' @return a valid Url +#' @author Tom Schenk Jr \email{tom.schenk@@cityofchicago.org} +validateUrl <- function(url) { + url <- as.character(url) + parsedUrl <- parse_url(url) + if(is.null(parsedUrl$scheme) | is.null(parsedUrl$hostname) | is.null(parsedUrl$path)) + stop(url, " does not appear to be a valid URL.") + if(substr(parsedUrl$path, 1, 9) == 'resource/') { + return(build_url(parsedUrl)) # resource url already + } + fourByFour <- basename(parsedUrl$path) + if(!isFourByFour(fourByFour)) + stop(fourByFour, " is not a valid Socrata dataset unique identifier.") + parsedUrl$path <- paste("resource/", fourByFour, ".csv", sep="") + build_url(parsedUrl) +} + +#' Convert Socrata human-readable column name to field name +#' +#' Convert Socrata human-readable column name, +#' as it might appear in the first row of data, +#' to field name as it might appear in the HTTP header; +#' that is, lower case, periods replaced with underscores#' 
+#' @param humanName a Socrata human-readable column name +#' @return Socrata field name +#' @export +#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} +#' @examples +#' fieldName("Number.of.Stations") # number_of_stations +fieldName <- function(humanName) { + tolower(gsub('\\.', '_', as.character(humanName))) +} + +#' Convert Socrata calendar_date string to POSIX +#' +#' @param x char in Socrata calendar_date format +#' @return a POSIX date +#' @export +#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} +posixify <- function(x) { + x <- as.character(x) + # Two calendar date formats supplied by Socrata + if(regexpr("^[[:digit:]]{1,2}/[[:digit:]]{1,2}/[[:digit:]]{4}$", x[1])[1] == 1) + strptime(x, format="%m/%d/%Y") + else + strptime(x, format="%m/%d/%Y %I:%M:%S %p") +} + +#' Wrap httr GET in some diagnostics +#' +#' In case of failure, report error details from Socrata +#' Optionally do not throw an error but just produce the error log message +#' +#' @param url Socrata Open Data Application Program Interface (SODA) query +#' @param throw_error logical, should an error be thrown? +#' @return httr response object +#' @author Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@@cityofchicago.org} +getResponse <- function(url, throw_error = TRUE) { + response <- GET(url) + status <- http_status(response) + if(response$status_code != 200) { + msg <- paste("Error in httr GET:", response$status_code, response$headers$statusmessage, url) + if(length(response$content)) { + details <- content(response) + msg <- paste(msg, details$code[1], details$message[1]) + } + logMsg(msg) + } + if(throw_error || response$status_code == 200){ + stop_for_status(response) + response + } +} + +#' Converts to data frame even with missing columns +#' +#' If all items are of the same length, just goes ahead and converts to df +#' If the items are of different lengths, assume the longest has all the columns, +#' fill in the gaps with NA in the other columns and return in the original column order +#' +#' @param con a list as output by content(response) +#' @return dataframe +#' @author David A Springate \email{daspringate@@gmail.com} +content_to_df <- function(con){ + lengths <- sapply(con, length) + if(all(lengths == length(con[[1]]))){ + data.frame(t(sapply(con, unlist)), stringsAsFactors = FALSE) + } else { + all_cols <- names(con[[which(sapply(con, length) == max(sapply(con, length)))[1]]]) + con <- lapply(con, function(x){ + r <- c(x, sapply(all_cols[!all_cols %in% names(x)], function(xx) NA, simplify = FALSE)) + r[all_cols] + }) + data.frame(t(sapply(con, unlist)), stringsAsFactors = FALSE) + } +} + + +#' Content parsers +#' +#' Return a data frame for csv +#' +#' @author Hugh J. 
Devlin \email{Hugh.Devlin@@cityofchicago.org} +#' @param response an httr response object +#' @return data frame, possibly empty +getContentAsDataFrame <- function(response) { UseMethod('response') } +getContentAsDataFrame <- function(response) { + mimeType <- response$header$'content-type' + # skip optional parameters + sep <- regexpr(';', mimeType)[1] + if(sep != -1) mimeType <- substr(mimeType, 0, sep[1] - 1) + switch(mimeType, + 'text/csv' = + content(response), # automatic parsing + 'application/json' = + if(content(response, as='text') == "[ ]") # empty json? + data.frame() # empty data frame + else + content_to_df(content(response)) + ) # end switch +} + + +#' Get the SoDA 2 data types +#' +#' Get the Socrata Open Data Application Program Interface data types from the http response header +#' @author Hugh J. Devlin, Ph. D. \email{Hugh.Devlin@@cityofchicago.org} +#' @param response an httr response object +#' @return a named vector mapping field names to data types +getSodaTypes <- function(response) { UseMethod('response') } +getSodaTypes <- function(response) { + result <- fromJSON(response$headers[['x-soda2-types']]) + names(result) <- fromJSON(response$headers[['x-soda2-fields']]) + result +} + +#' Get a full Socrata data set as an R data frame +#' +#' Manages throttling and POSIX date-time conversions +#' +#' @param url A Socrata resource URL, +#' or a Socrata "human-friendly" URL, +#' or Socrata Open Data Application Program Interface (SODA) query +#' requesting a comma-separated download format (.csv suffix), +#' May include SoQL parameters, +#' but is assumed to not include a SODA offset parameter +#' @return an R data frame with POSIX dates +#' @export +#' @author Hugh J. Devlin, Ph. D. 
\email{Hugh.Devlin@@cityofchicago.org} +#' @examples +#' df <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv") +read.socrata <- function(url) { + validUrl <- validateUrl(url) # check url syntax, allow human-readable Socrata url + parsedUrl <- parse_url(validUrl) + mimeType <- guess_media(parsedUrl$path) + if(!(mimeType %in% c('text/csv','application/json'))) + stop("Error in read.socrata: ", mimeType, " not a supported data format.") + response <- getResponse(validUrl) + page <- getContentAsDataFrame(response) + result <- page + dataTypes <- getSodaTypes(response) + while (nrow(page) > 0) { # more to come maybe? + query <- paste(validUrl, if(is.null(parsedUrl$query)) {'?'} else {"&"}, '$offset=', nrow(result), sep='') + response <- getResponse(query) + page <- getContentAsDataFrame(response) + result <- rbind(result, page) # accumulate + } + # convert Socrata calendar dates to posix format + for(columnName in colnames(page)[!is.na(dataTypes[fieldName(colnames(page))]) & dataTypes[fieldName(colnames(page))] == 'calendar_date']) { + result[[columnName]] <- posixify(result[[columnName]]) + } + result +} \ No newline at end of file diff --git a/R/tests/testRSocrata.R b/R/tests/testRSocrata.R index ef67842..f6c21bb 100644 --- a/R/tests/testRSocrata.R +++ b/R/tests/testRSocrata.R @@ -1,122 +1,128 @@ -# RUnit tests -# -# resource 4334-bgaj on the Socrata demo site is USGS Earthquakes for 2012-11-01 API School Demo -# -# Author: Hugh 2013-07-15 -############################################################################### - -library('RUnit') - -source("R/RSocrata.R") - -test.posixifyLong <- function() { - dt <- posixify("09/14/2012 10:38:01 PM") - checkEquals("POSIXlt", class(dt)[1], "first data type of a date") - checkEquals(2012, dt$year + 1900, "year") - checkEquals(9, dt$mon + 1, "month") - checkEquals(14, dt$mday, "day") - checkEquals(22, dt$hour, "hours") - checkEquals(38, dt$min, "minutes") - checkEquals(1, dt$sec, "seconds") -} - 
-test.posixifyShort <- function() { - dt <- posixify("09/14/2012") - checkEquals("POSIXlt", class(dt)[1], "first data type of a date") - checkEquals(2012, dt$year + 1900, "year") - checkEquals(9, dt$mon + 1, "month") - checkEquals(14, dt$mday, "day") - checkEquals(0, dt$hour, "hours") - checkEquals(0, dt$min, "minutes") - checkEquals(0, dt$sec, "seconds") -} - -test.readSocrataCsv <- function() { - df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.csv') - checkEquals(1007, nrow(df), "rows") - checkEquals(9, ncol(df), "columns") -} - -test.readSocrataJson <- function() { - df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.json') - checkEquals(1007, nrow(df), "rows") - checkEquals(11, ncol(df), "columns") -} - -test.readSocrataNoScheme <- function() { - checkException(read.socrata('soda.demo.socrata.com/resource/4334-bgaj.csv')) -} - -test.readSoQL <- function() { - df <- read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv?$select=region') - checkEquals(1007, nrow(df), "rows") - checkEquals(1, ncol(df), "columns") -} - -test.readSoQLColumnNotFound <- function() { - # SoQL API uses field names, not human names - checkException(read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv?$select=Region')) -} - -test.readSocrataHumanReadable <- function() { - df <- read.socrata('https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj') - checkEquals(1007, nrow(df), "rows") - checkEquals(9, ncol(df), "columns") -} - -test.readSocrataFormatNotSupported <- function() { - # Unsupported data formats - checkException(read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.xml')) -} - -test.readSocrataCalendarDateLong <- function() { - df <- read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv') - dt <- df$Datetime[1] # "2012-09-14 22:38:01" - checkEquals("POSIXlt", class(dt)[1], "data type of a date") - checkEquals(2012, dt$year + 1900, "year") - checkEquals(9, dt$mon + 1, "month") - 
checkEquals(14, dt$mday, "day") - checkEquals(22, dt$hour, "hours") - checkEquals(38, dt$min, "minutes") - checkEquals(1, dt$sec, "seconds") -} - -test.readSocrataCalendarDateShort <- function() { - df <- read.socrata('http://data.cityofchicago.org/resource/y93d-d9e3.csv?$order=debarment_date') - dt <- df$DEBARMENT.DATE[1] # "05/21/1981" - checkEquals("POSIXlt", class(dt)[1], "data type of a date") - checkEquals(81, dt$year, "year") - checkEquals(5, dt$mon + 1, "month") - checkEquals(21, dt$mday, "day") - checkEquals(0, dt$hour, "hours") - checkEquals(0, dt$min, "minutes") - checkEquals(0, dt$sec, "seconds") -} - -test.isFourByFour <- function() { - checkTrue(isFourByFour("4334-bgaj"), "ok") - checkTrue(!isFourByFour("4334c-bgajc"), "11 characters instead of 9") - checkTrue(!isFourByFour("433-bga"), "7 characters instead of 9") - checkTrue(!isFourByFour("433-bgaj"), "3 characters before dash instead of 4") - checkTrue(!isFourByFour("4334-!gaj"), "non-alphanumeric character") -} - -test.isFourByFourUrl <- function() { - checkException(read.socrata("https://soda.demo.socrata.com/api/views/4334c-bgajc"), "11 characters instead of 9") - checkException(read.socrata("https://soda.demo.socrata.com/api/views/433-bga"), "7 characters instead of 9") - checkException(read.socrata("https://soda.demo.socrata.com/api/views/433-bgaj"), "3 characters before dash instead of 4") - checkException(read.socrata("https://soda.demo.socrata.com/api/views/4334-!gaj"), "non-alphanumeric character") -} - -test.readSocrataInvalidUrl <- function() { - checkException(read.socrata("a.fake.url.being.tested"), "invalid url") -} - -test.suite <- defineTestSuite("test Socrata SODA interface", - dirs = file.path("R/tests"), - testFileRegexp = '^test.*\\.R') - -runAllTests <- function() { - test.result <- runTestSuite(test.suite) - printTextProtocol(test.result) -} +# RUnit tests +# +# resource 4334-bgaj on the Socrata demo site is USGS Earthquakes for 2012-11-01 API School Demo +# +# Author: Hugh 
2013-07-15 +############################################################################### + +library('RUnit') + +source("R/RSocrata.R") + +test.posixifyLong <- function() { + dt <- posixify("09/14/2012 10:38:01 PM") + checkEquals("POSIXlt", class(dt)[1], "first data type of a date") + checkEquals(2012, dt$year + 1900, "year") + checkEquals(9, dt$mon + 1, "month") + checkEquals(14, dt$mday, "day") + checkEquals(22, dt$hour, "hours") + checkEquals(38, dt$min, "minutes") + checkEquals(1, dt$sec, "seconds") +} + +test.posixifyShort <- function() { + dt <- posixify("09/14/2012") + checkEquals("POSIXlt", class(dt)[1], "first data type of a date") + checkEquals(2012, dt$year + 1900, "year") + checkEquals(9, dt$mon + 1, "month") + checkEquals(14, dt$mday, "day") + checkEquals(0, dt$hour, "hours") + checkEquals(0, dt$min, "minutes") + checkEquals(0, dt$sec, "seconds") +} + +test.readSocrataCsv <- function() { + df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.csv') + checkEquals(1007, nrow(df), "rows") + checkEquals(9, ncol(df), "columns") +} + +test.readSocrataJson <- function() { + df <- read.socrata('https://soda.demo.socrata.com/resource/4334-bgaj.json') + checkEquals(1007, nrow(df), "rows") + checkEquals(11, ncol(df), "columns") +} + +test.readSocrataNoScheme <- function() { + checkException(read.socrata('soda.demo.socrata.com/resource/4334-bgaj.csv')) +} + +test.readSoQL <- function() { + df <- read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv?$select=region') + checkEquals(1007, nrow(df), "rows") + checkEquals(1, ncol(df), "columns") +} + +test.readSoQLColumnNotFound <- function() { + # SoQL API uses field names, not human names + checkException(read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv?$select=Region')) +} + +test.readSocrataHumanReadable <- function() { + df <- read.socrata('https://soda.demo.socrata.com/dataset/USGS-Earthquake-Reports/4334-bgaj') + checkEquals(1007, nrow(df), "rows") + checkEquals(9, 
ncol(df), "columns") +} + +test.readSocrataFormatNotSupported <- function() { + # Unsupported data formats + checkException(read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.xml')) +} + +test.readSocrataCalendarDateLong <- function() { + df <- read.socrata('http://soda.demo.socrata.com/resource/4334-bgaj.csv') + dt <- df$Datetime[1] # "2012-09-14 22:38:01" + checkEquals("POSIXlt", class(dt)[1], "data type of a date") + checkEquals(2012, dt$year + 1900, "year") + checkEquals(9, dt$mon + 1, "month") + checkEquals(14, dt$mday, "day") + checkEquals(22, dt$hour, "hours") + checkEquals(38, dt$min, "minutes") + checkEquals(1, dt$sec, "seconds") +} + +test.readSocrataCalendarDateShort <- function() { + df <- read.socrata('http://data.cityofchicago.org/resource/y93d-d9e3.csv?$order=debarment_date') + dt <- df$DEBARMENT.DATE[1] # "05/21/1981" + checkEquals("POSIXlt", class(dt)[1], "data type of a date") + checkEquals(81, dt$year, "year") + checkEquals(5, dt$mon + 1, "month") + checkEquals(21, dt$mday, "day") + checkEquals(0, dt$hour, "hours") + checkEquals(0, dt$min, "minutes") + checkEquals(0, dt$sec, "seconds") +} + +test.isFourByFour <- function() { + checkTrue(isFourByFour("4334-bgaj"), "ok") + checkTrue(!isFourByFour("4334c-bgajc"), "11 characters instead of 9") + checkTrue(!isFourByFour("433-bga"), "7 characters instead of 9") + checkTrue(!isFourByFour("433-bgaj"), "3 characters before dash instead of 4") + checkTrue(!isFourByFour("4334-!gaj"), "non-alphanumeric character") +} + +test.isFourByFourUrl <- function() { + checkException(read.socrata("https://soda.demo.socrata.com/api/views/4334c-bgajc"), "11 characters instead of 9") + checkException(read.socrata("https://soda.demo.socrata.com/api/views/433-bga"), "7 characters instead of 9") + checkException(read.socrata("https://soda.demo.socrata.com/api/views/433-bgaj"), "3 characters before dash instead of 4") + checkException(read.socrata("https://soda.demo.socrata.com/api/views/4334-!gaj"), 
"non-alphanumeric character") +} + +test.readSocrataInvalidUrl <- function() { + checkException(read.socrata("a.fake.url.being.tested"), "invalid url") +} + +test.copeWithMissingColumns <- function(){ + url <- "http://data.undp.org/resource/wxub-qc5k.json" + df <- read.socrata(url) + checkTrue(class(df) == "data.frame", "builds dataframe with missing columns") +} + +test.suite <- defineTestSuite("test Socrata SODA interface", + dirs = file.path("R/tests"), + testFileRegexp = '^test.*\\.R') + +runAllTests <- function() { + test.result <- runTestSuite(test.suite) + printTextProtocol(test.result) +} diff --git a/README.md b/README.md index 54811b9..492f693 100644 --- a/README.md +++ b/README.md @@ -1,44 +1,46 @@ -RSocrata -======== - -A tool for downloading Socrata datasets as R data frames --------------------------------------------------------- - -Provided with a URL to a dataset resource published on a [Socrata](http://www.socrata.com) webserver, -or a Socrata [SoDA (Socrata Open Data Application Program Interface) web API](http://dev.socrata.com) query, -or a Socrata "human-friendly" URL, -returns an [R data frame](http://stat.ethz.ch/R-manual/R-devel/library/base/html/data.frame.html). -Converts dates to [POSIX](http://stat.ethz.ch/R-manual/R-devel/library/base/html/DateTimeClasses.html) format. -Supports CSV download file formats from Socrata. -Manages the throttling of data returned from Socrata. -[RUnit](http://cran.r-project.org/web/packages/RUnit/index.html) test coverage. - -### Usage example 1 - -

-earthquakesDataFrame <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv")
-nrow(earthquakesDataFrame) # 1007 (two "pages")
-class(earthquakesDataFrame$Datetime[1]) # POSIXlt -
- -### Usage example 2 - -

-earthquakesDataFrame <- read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquakes-for-2012-11-01-API-School-Demo/4334-bgaj")
-nrow(earthquakesDataFrame) # 1007 (two "pages")
-class(earthquakesDataFrame$Datetime[1]) # POSIXlt -
- -### Issues - -Please report issues, request enhancements or fork us at the [City of Chicago github](https://github.com/Chicago/RSocrata/issues). - -### Change log - -1.1 Add check for valid Socrata resource URL. Add check for supported download file format. Add support for Socrata short dates. - -1.2 Use comma-separated file format for Socrata downloads. - -1.3 Added support for human-readable URL. - -1.4 Add json file format for Socrata downloads. Switch to RJSONIO rom rjson. +RSocrata +======== + +A tool for downloading Socrata datasets as R data frames +-------------------------------------------------------- + +Provided with a URL to a dataset resource published on a [Socrata](http://www.socrata.com) webserver, +or a Socrata [SoDA (Socrata Open Data Application Program Interface) web API](http://dev.socrata.com) query, +or a Socrata "human-friendly" URL, +returns an [R data frame](http://stat.ethz.ch/R-manual/R-devel/library/base/html/data.frame.html). +Converts dates to [POSIX](http://stat.ethz.ch/R-manual/R-devel/library/base/html/DateTimeClasses.html) format. +Supports CSV and JSON download file formats from Socrata. +Manages the throttling of data returned from Socrata. +[RUnit](http://cran.r-project.org/web/packages/RUnit/index.html) test coverage. + +### Usage example 1 + +

+earthquakesDataFrame <- read.socrata("http://soda.demo.socrata.com/resource/4334-bgaj.csv")
+nrow(earthquakesDataFrame) # 1007 (two "pages")
+class(earthquakesDataFrame$Datetime[1]) # POSIXlt +
+ +### Usage example 2 + +

+earthquakesDataFrame <- read.socrata("https://soda.demo.socrata.com/dataset/USGS-Earthquakes-for-2012-11-01-API-School-Demo/4334-bgaj")
+nrow(earthquakesDataFrame) # 1007 (two "pages")
+class(earthquakesDataFrame$Datetime[1]) # POSIXlt +
+ +### Issues + +Please report issues, request enhancements or fork us at the [City of Chicago github](https://github.com/Chicago/RSocrata/issues). + +### Change log + +1.1 Add check for valid Socrata resource URL. Add check for supported download file format. Add support for Socrata short dates. + +1.2 Use comma-separated file format for Socrata downloads. + +1.3 Added support for human-readable URL. + +1.4 Add json file format for Socrata downloads. Switch to RJSONIO from rjson. + +1.5 Add function to allow conversion to df of files with some missing columns. Switch to jsonlite from RJSONIO.