diff --git a/src/bin/extract_metadata.c b/src/bin/extract_metadata.c index 95c17140..68bd4b60 100644 --- a/src/bin/extract_metadata.c +++ b/src/bin/extract_metadata.c @@ -155,15 +155,15 @@ static int handle_variable_sav(int index, readstat_variable_t *variable, const c } else if (hasPrefix(vformat, "YMDHMS16") == 0) { // e.g. 2013-01-31 1:02 format = EXTRACT_METADATA_FORMAT_DATE_TIME; - pattern = "yyyy-MM-dd h:mm"; + pattern = "yyyy-MM-dd hh:mm"; } else if (hasPrefix(vformat, "YMDHMS19") == 0) { // e.g. 2013-01-31 1:02:33 format = EXTRACT_METADATA_FORMAT_DATE_TIME; - pattern = "yyyy-MM-dd h:mm:ss"; + pattern = "yyyy-MM-dd hh:mm:ss"; } else if (hasPrefix(vformat, "YMDHMS19.2") == 0) { // e.g. 2013-01-31 1:02:33.72 format = EXTRACT_METADATA_FORMAT_DATE_TIME; - pattern = "yyyy-MM-dd h:mm:ss.SS+"; + pattern = "yyyy-MM-dd hh:mm:ss.SS+"; } else if (hasPrefix(vformat, "MTIME5") == 0) { // e.g. 1754:36 format = EXTRACT_METADATA_FORMAT_TIME; diff --git a/src/bin/read_csv/csv_metadata.h b/src/bin/read_csv/csv_metadata.h index 7c950850..8867461a 100644 --- a/src/bin/read_csv/csv_metadata.h +++ b/src/bin/read_csv/csv_metadata.h @@ -12,6 +12,7 @@ typedef struct csv_metadata { void *user_ctx; readstat_variable_t *variables; int* is_date; + int* is_date_time; struct json_metadata *json_md; rs_read_module_t *output_module; } csv_metadata; diff --git a/src/bin/read_csv/mod_dta.c b/src/bin/read_csv/mod_dta.c index 8ed7736a..94a099d6 100644 --- a/src/bin/read_csv/mod_dta.c +++ b/src/bin/read_csv/mod_dta.c @@ -206,7 +206,7 @@ void produce_column_header_dta(void *csv_metadata, const char *column, readstat_ break; case EXTRACT_METADATA_FORMAT_TIME: case EXTRACT_METADATA_FORMAT_DATE_TIME: - var->type = READSTAT_TYPE_INT32; + var->type = READSTAT_TYPE_DOUBLE; snprintf(var->format, sizeof(var->format), "%s", "%tC"); // %tC => is equivalent to coordinated universal time (UTC) break; @@ -385,10 +385,106 @@ static readstat_value_t value_double_dta(const char *s, size_t len, struct csv_m return value; } +static readstat_value_t value_double_date_time_dta(const char *s, size_t len, struct csv_metadata *c) { + // Handle empty or NULL strings as missing values + if (s == NULL || len == 0 || *s == '\0') { + readstat_value_t value = { + .type = READSTAT_TYPE_DOUBLE, + .is_system_missing = 1, + .v = { .double_value = NAN } + }; + return value; + } + + // Truncate the date string to 23 characters to remove the timezone offset and + // microseconds, if present. STATA does not support timezones or microseconds. + char date_time[24]; + snprintf(date_time, sizeof(date_time), "%s", s); + + // Parse date-time components + int year, month, day, hour, minute, second, msecs = 0; + int matched = sscanf( + date_time, + "%d-%d-%d %d:%d:%d.%d", + &year, &month, &day, &hour, &minute, &second, &msecs + ); + if (matched < 6 || matched > 8) { + fprintf(stderr, "%s:%d not a valid date-time: %s (expected format: yyyy-mm-dd hh:MM:SS with optional milliseconds. Datetime string is truncated at 23 characters to ignore microseconds and timezone information.)\n", __FILE__, __LINE__, date_time); + exit(EXIT_FAILURE); + } + + // Get days since the epoch for the date + char days_since_epoch_string[11]; + snprintf(days_since_epoch_string, sizeof(days_since_epoch_string), "%04d-%02d-%02d", year, month, day); + char* dest; + int days_since_epoch = readstat_dta_num_days(days_since_epoch_string, &dest); + + // Add the hours, minutes, and seconds to the days + double msecs_since_epoch = 86400000.0 * days_since_epoch + hour * 3600000.0 + minute * 60000.0 + second * 1000.0 + msecs * 1.0; + + // Adjust for leap seconds; 27 have occurred as of writing this code + // https://en.m.wikipedia.org/wiki/Leap_second + typedef struct { + int year; + int month; + int day; + } leap_second_date; + + leap_second_date leap_seconds[] = { + {1972, 6, 30}, {1972, 12, 31}, // +2 seconds in 1972 + {1973, 12, 31}, // +1 second in 1973 + {1974, 12, 31}, // +1 second in 1974 + {1975, 12, 31}, // +1 second in 1975 + {1976, 12, 31}, // +1 second in 1976 + {1977, 12, 31}, // +1 second in 1977 + {1978, 12, 31}, // +1 second in 1978 + {1979, 12, 31}, // +1 second in 1979 + {1981, 6, 30}, // +1 second in 1981 + {1982, 6, 30}, // +1 second in 1982 + {1983, 6, 30}, // +1 second in 1983 + {1985, 6, 30}, // +1 second in 1985 + {1987, 12, 31}, // +1 second in 1987 + {1989, 12, 31}, // +1 second in 1989 + {1990, 12, 31}, // +1 second in 1990 + {1992, 6, 30}, // +1 second in 1992 + {1993, 6, 30}, // +1 second in 1993 + {1994, 6, 30}, // +1 second in 1994 + {1995, 12, 31}, // +1 second in 1995 + {1997, 6, 30}, // +1 second in 1997 + {1998, 12, 31}, // +1 second in 1998 + {2005, 12, 31}, // +1 second in 2005 + {2008, 12, 31}, // +1 second in 2008 + {2012, 6, 30}, // +1 second in 2012 + {2015, 6, 30}, // +1 second in 2015 + {2016, 12, 31} // +1 second in 2016 + }; + + int leap_second_count = sizeof(leap_seconds) / sizeof(leap_seconds[0]); + int leap_seconds_to_add = 0; + + for (int i = 0; i < leap_second_count; i++) { + // If the date is after this leap second, add one second + if ( + (year > leap_seconds[i].year) || + (year == leap_seconds[i].year && month > leap_seconds[i].month) || + (year == leap_seconds[i].year && month == leap_seconds[i].month && day > leap_seconds[i].day) + ) { leap_seconds_to_add++; } + } + msecs_since_epoch += leap_seconds_to_add * 1000.0; + + readstat_value_t value = { + .type = READSTAT_TYPE_DOUBLE, + .v = { .double_value = msecs_since_epoch } + }; + + return value; +} + void produce_csv_value_dta(void *csv_metadata, const char *s, size_t len) { struct csv_metadata *c = (struct csv_metadata *)csv_metadata; readstat_variable_t *var = &c->variables[c->columns]; int is_date = c->is_date[c->columns]; + int is_date_time = c->is_date_time[c->columns]; int obs_index = c->rows - 1; // TODO: ??? readstat_value_t value; @@ -396,6 +492,8 @@ void produce_csv_value_dta(void *csv_metadata, const char *s, size_t len) { value = value_sysmiss(s, len, c); } else if (is_date) { value = value_int32_date_dta(s, len, c); + } else if (is_date_time) { + value = value_double_date_time_dta(s, len, c); } else if (var->type == READSTAT_TYPE_DOUBLE) { value = value_double_dta(s, len, c); } else if (var->type == READSTAT_TYPE_STRING) { diff --git a/src/bin/read_csv/read_csv.c b/src/bin/read_csv/read_csv.c index bdc0d548..65fb343a 100644 --- a/src/bin/read_csv/read_csv.c +++ b/src/bin/read_csv/read_csv.c @@ -46,6 +46,7 @@ static void produce_column_header(struct csv_metadata *c, void *s, size_t len) { extract_metadata_format_t colformat = column_format(c->json_md, column); c->is_date[c->columns] = colformat == EXTRACT_METADATA_FORMAT_DATE; + c->is_date_time[c->columns] = colformat == EXTRACT_METADATA_FORMAT_DATE_TIME; if (c->output_module->header) { c->output_module->header(c, column, var); } @@ -75,6 +76,7 @@ static void csv_metadata_cell(void *s, size_t len, void *data) if (c->rows == 0) { c->variables = realloc(c->variables, (c->columns+1) * sizeof(readstat_variable_t)); c->is_date = realloc(c->is_date, (c->columns+1) * sizeof(int)); + c->is_date_time = realloc(c->is_date_time, (c->columns+1) * sizeof(int)); produce_column_header(c, s, len); } else if (c->rows >= 1 && c->handle.value && c->output_module->csv_value) { c->output_module->csv_value(c, s, len); @@ -184,6 +186,10 @@ readstat_error_t readstat_parse_csv(readstat_parser_t *parser, free(md->is_date); md->is_date = NULL; } + if (md->is_date_time) { + free(md->is_date_time); + md->is_date_time = NULL; + } csv_free(p); io->close(io->io_ctx); return retval;