From a7009681ae825afa1191b2da85085c5a3de3ac74 Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Mon, 22 Jun 2026 21:12:06 +0200 Subject: [PATCH 1/4] Handle NaT (INT64_MIN) for numpy time types --- src_cpp/numpy/numpy_scan.cpp | 16 ++++- test/test_scan_pandas.py | 131 +++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 2 deletions(-) diff --git a/src_cpp/numpy/numpy_scan.cpp b/src_cpp/numpy/numpy_scan.cpp index d11d670..0dc885c 100644 --- a/src_cpp/numpy/numpy_scan.cpp +++ b/src_cpp/numpy/numpy_scan.cpp @@ -1,5 +1,7 @@ #include "numpy/numpy_scan.h" +#include <limits> + #include "common/exception/runtime.h" #include "common/type_utils.h" #include "common/types/timestamp_t.h" @@ -121,8 +123,13 @@ void NumpyScan::scan(PandasColumnBindData* bindData, uint64_t count, uint64_t of auto dstData = reinterpret_cast(outputVector->getData()); for (auto i = 0u; i < count; i++) { auto pos = offset + i; - dstData[i].value = sourceData[pos]; - outputVector->setNull(i, false /* isNull */); + // Pandas represents NaT as the sentinel value INT64_MIN + if (sourceData[pos] == std::numeric_limits<int64_t>::min()) { + outputVector->setNull(i, true); + } else { + dstData[i].value = sourceData[pos]; + outputVector->setNull(i, false /* isNull */); + } } break; } @@ -131,6 +138,11 @@ void NumpyScan::scan(PandasColumnBindData* bindData, uint64_t count, uint64_t of auto dstData = reinterpret_cast(outputVector->getData()); for (auto i = 0u; i < count; i++) { auto pos = offset + i; + // Pandas represents NaT as the sentinel value INT64_MIN + if (sourceData[pos] == std::numeric_limits<int64_t>::min()) { + outputVector->setNull(i, true); + continue; + } auto micro = sourceData[pos] / 1000; auto days = micro / Interval::MICROS_PER_DAY; micro = micro % Interval::MICROS_PER_DAY; diff --git a/test/test_scan_pandas.py b/test/test_scan_pandas.py index 06e687f..f969e1a 100644 --- a/test/test_scan_pandas.py +++ b/test/test_scan_pandas.py @@ -766,3 +766,134 @@ def 
test_df_with_struct_cast(conn_db_readonly: ConnDB) -> None: assert tup[0] == "{'a': 1}" tup = res.get_next() assert tup[0] == "{'a': '2'}" + + +def test_scan_pandas_datetime_nat(conn_db_empty: ConnDB) -> None: + """Test that NaT/None in datetime64 columns are scanned as NULL.""" + conn, _ = conn_db_empty + valid_ts = np.datetime64("2024-01-15T10:30:00") + valid_ts2 = np.datetime64("2025-06-01T00:00:00") + df = pd.DataFrame( + { + "dt_s": np.array( + [valid_ts, None, pd.NaT, valid_ts2], dtype="datetime64[s]" + ), + "dt_ms": np.array( + [valid_ts, None, pd.NaT, valid_ts2], dtype="datetime64[ms]" + ), + "dt_us": np.array( + [valid_ts, None, pd.NaT, valid_ts2], dtype="datetime64[us]" + ), + "dt_ns": np.array( + [valid_ts, None, pd.NaT, valid_ts2], dtype="datetime64[ns]" + ), + } + ) + res = conn.execute("LOAD FROM df RETURN *") + # Row 0: all valid timestamps + row0 = res.get_next() + assert row0[0] == datetime.datetime(2024, 1, 15, 10, 30) + assert row0[1] == datetime.datetime(2024, 1, 15, 10, 30) + assert row0[2] == datetime.datetime(2024, 1, 15, 10, 30) + assert row0[3] == datetime.datetime(2024, 1, 15, 10, 30) + # Row 1: None -> NULL + row1 = res.get_next() + assert row1[0] is None + assert row1[1] is None + assert row1[2] is None + assert row1[3] is None + # Row 2: pd.NaT -> NULL + row2 = res.get_next() + assert row2[0] is None + assert row2[1] is None + assert row2[2] is None + assert row2[3] is None + # Row 3: valid timestamp + row3 = res.get_next() + assert row3[0] == datetime.datetime(2025, 6, 1, 0, 0) + assert row3[1] == datetime.datetime(2025, 6, 1, 0, 0) + assert row3[2] == datetime.datetime(2025, 6, 1, 0, 0) + assert row3[3] == datetime.datetime(2025, 6, 1, 0, 0) + assert not res.has_next() + + +def test_scan_pandas_timedelta_nat(conn_db_empty: ConnDB) -> None: + """Test that NaT/None in timedelta64 columns are scanned as NULL.""" + conn, _ = conn_db_empty + valid_td = np.timedelta64(1000000, "ns") # 1 millisecond + valid_td2 = np.timedelta64(5000000000, 
"ns") # 5 seconds + df = pd.DataFrame( + { + "td": np.array( + [valid_td, None, pd.NaT, valid_td2], dtype="timedelta64[ns]" + ), + } + ) + res = conn.execute("LOAD FROM df RETURN *") + row0 = res.get_next() + assert row0[0] == datetime.timedelta(microseconds=1000) + row1 = res.get_next() + assert row1[0] is None + row2 = res.get_next() + assert row2[0] is None + row3 = res.get_next() + assert row3[0] == datetime.timedelta(seconds=5) + assert not res.has_next() + + +def test_copy_from_datetime_nat(conn_db_empty: ConnDB) -> None: + """Test that COPY FROM with NaT datetime stores NULL in the table.""" + conn, _ = conn_db_empty + conn.execute( + "CREATE NODE TABLE Test (id INT64, ts TIMESTAMP, PRIMARY KEY (id))" + ) + valid_ts = np.datetime64("2024-01-15T10:30:00") + df = pd.DataFrame( + { + "id": np.array([1, 2], dtype=np.int64), + "ts": np.array([valid_ts, pd.NaT], dtype="datetime64[ns]"), + } + ) + conn.execute( + "COPY Test FROM (LOAD FROM $df RETURN " + "CAST(`id` AS INT64) AS `id`, " + "CAST(`ts` AS TIMESTAMP) AS `ts`)", + {"df": df}, + ) + result = conn.execute("MATCH (t:Test) RETURN t.id, t.ts ORDER BY t.id") + row1 = result.get_next() + assert row1[0] == 1 + assert row1[1] == datetime.datetime(2024, 1, 15, 10, 30) + row2 = result.get_next() + assert row2[0] == 2 + assert row2[1] is None + assert not result.has_next() + + +def test_copy_from_timedelta_nat(conn_db_empty: ConnDB) -> None: + """Test that COPY FROM with NaT timedelta stores NULL in the table.""" + conn, _ = conn_db_empty + conn.execute( + "CREATE NODE TABLE Test (id INT64, dur INTERVAL, PRIMARY KEY (id))" + ) + valid_td = np.timedelta64(3600000000000, "ns") # 1 hour + df = pd.DataFrame( + { + "id": np.array([1, 2], dtype=np.int64), + "td": np.array([valid_td, pd.NaT], dtype="timedelta64[ns]"), + } + ) + conn.execute( + "COPY Test FROM (LOAD FROM $df RETURN " + "CAST(`id` AS INT64) AS `id`, " + "CAST(`td` AS INTERVAL) AS `dur`)", + {"df": df}, + ) + result = conn.execute("MATCH (t:Test) RETURN t.id, 
t.dur ORDER BY t.id") + row1 = result.get_next() + assert row1[0] == 1 + assert row1[1] == datetime.timedelta(hours=1) + row2 = result.get_next() + assert row2[0] == 2 + assert row2[1] is None + assert not result.has_next() From bec2a6646e3ae73d073cf966eacceb8dfb05e89c Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Tue, 23 Jun 2026 10:12:08 +0200 Subject: [PATCH 2/4] Test fix: Only NaT (not None) is allowed in time cols. --- test/test_scan_pandas.py | 50 ++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/test/test_scan_pandas.py b/test/test_scan_pandas.py index f969e1a..03c1e53 100644 --- a/test/test_scan_pandas.py +++ b/test/test_scan_pandas.py @@ -769,63 +769,57 @@ def test_df_with_struct_cast(conn_db_readonly: ConnDB) -> None: def test_scan_pandas_datetime_nat(conn_db_empty: ConnDB) -> None: - """Test that NaT/None in datetime64 columns are scanned as NULL.""" + """Test that NaT in datetime64 columns are scanned as NULL.""" conn, _ = conn_db_empty valid_ts = np.datetime64("2024-01-15T10:30:00") valid_ts2 = np.datetime64("2025-06-01T00:00:00") df = pd.DataFrame( { "dt_s": np.array( - [valid_ts, None, pd.NaT, valid_ts2], dtype="datetime64[s]" + [valid_ts, np.datetime64("NaT", "s"), np.datetime64("NaT", "s"), + valid_ts2], dtype="datetime64[s]" ), "dt_ms": np.array( - [valid_ts, None, pd.NaT, valid_ts2], dtype="datetime64[ms]" + [valid_ts, np.datetime64("NaT", "ms"), np.datetime64("NaT", "ms"), + valid_ts2], dtype="datetime64[ms]" ), "dt_us": np.array( - [valid_ts, None, pd.NaT, valid_ts2], dtype="datetime64[us]" + [valid_ts, np.datetime64("NaT", "us"), np.datetime64("NaT", "us"), + valid_ts2], dtype="datetime64[us]" ), "dt_ns": np.array( - [valid_ts, None, pd.NaT, valid_ts2], dtype="datetime64[ns]" + [valid_ts, np.datetime64("NaT", "ns"), np.datetime64("NaT", "ns"), + valid_ts2], dtype="datetime64[ns]" ), } ) res = conn.execute("LOAD FROM df RETURN *") # Row 0: all valid timestamps row0 = 
res.get_next() - assert row0[0] == datetime.datetime(2024, 1, 15, 10, 30) - assert row0[1] == datetime.datetime(2024, 1, 15, 10, 30) - assert row0[2] == datetime.datetime(2024, 1, 15, 10, 30) - assert row0[3] == datetime.datetime(2024, 1, 15, 10, 30) - # Row 1: None -> NULL + assert all(r == datetime.datetime(2024, 1, 15, 10, 30) for r in row0) + # Row 1: NaT -> NULL row1 = res.get_next() - assert row1[0] is None - assert row1[1] is None - assert row1[2] is None - assert row1[3] is None - # Row 2: pd.NaT -> NULL + assert all(r is None for r in row1) + # Row 2: NaT -> NULL row2 = res.get_next() - assert row2[0] is None - assert row2[1] is None - assert row2[2] is None - assert row2[3] is None + assert all(r is None for r in row2) # Row 3: valid timestamp row3 = res.get_next() - assert row3[0] == datetime.datetime(2025, 6, 1, 0, 0) - assert row3[1] == datetime.datetime(2025, 6, 1, 0, 0) - assert row3[2] == datetime.datetime(2025, 6, 1, 0, 0) - assert row3[3] == datetime.datetime(2025, 6, 1, 0, 0) + assert all(r == datetime.datetime(2025, 6, 1, 0, 0) for r in row3) + assert not res.has_next() def test_scan_pandas_timedelta_nat(conn_db_empty: ConnDB) -> None: - """Test that NaT/None in timedelta64 columns are scanned as NULL.""" + """Test that NaT in timedelta64 columns are scanned as NULL.""" conn, _ = conn_db_empty + nat = np.timedelta64("NaT", "ns") valid_td = np.timedelta64(1000000, "ns") # 1 millisecond valid_td2 = np.timedelta64(5000000000, "ns") # 5 seconds df = pd.DataFrame( { "td": np.array( - [valid_td, None, pd.NaT, valid_td2], dtype="timedelta64[ns]" + [valid_td, nat, nat, valid_td2], dtype="timedelta64[ns]" ), } ) @@ -848,10 +842,11 @@ def test_copy_from_datetime_nat(conn_db_empty: ConnDB) -> None: "CREATE NODE TABLE Test (id INT64, ts TIMESTAMP, PRIMARY KEY (id))" ) valid_ts = np.datetime64("2024-01-15T10:30:00") + nat = np.datetime64("NaT", "ns") df = pd.DataFrame( { "id": np.array([1, 2], dtype=np.int64), - "ts": 
np.array([valid_ts, nat], dtype="datetime64[ns]"), } ) conn.execute( @@ -876,11 +871,12 @@ def test_copy_from_timedelta_nat(conn_db_empty: ConnDB) -> None: conn.execute( "CREATE NODE TABLE Test (id INT64, dur INTERVAL, PRIMARY KEY (id))" ) + nat = np.timedelta64("NaT", "ns") valid_td = np.timedelta64(3600000000000, "ns") # 1 hour df = pd.DataFrame( { "id": np.array([1, 2], dtype=np.int64), - "td": np.array([valid_td, pd.NaT], dtype="timedelta64[ns]"), + "td": np.array([valid_td, nat], dtype="timedelta64[ns]"), } ) conn.execute( From 9e168b5c231ed91aea05ceb6f6b224a05a895c5a Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Tue, 23 Jun 2026 13:16:22 +0200 Subject: [PATCH 3/4] Add test for None in time column --- test/test_scan_pandas.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/test/test_scan_pandas.py b/test/test_scan_pandas.py index 03c1e53..1cdc0b1 100644 --- a/test/test_scan_pandas.py +++ b/test/test_scan_pandas.py @@ -893,3 +893,36 @@ def test_copy_from_timedelta_nat(conn_db_empty: ConnDB) -> None: assert row2[0] == 2 assert row2[1] is None assert not result.has_next() + + +def test_copy_from_datetime_none(conn_db_empty: ConnDB) -> None: + """Test that COPY FROM with None in a datetime column stores NULL. 
+ Pandas auto-infers the column as datetime64[ns] and converts None to NaT + """ + conn, _ = conn_db_empty + conn.execute( + "CREATE NODE TABLE Test (id INT64, ts TIMESTAMP, PRIMARY KEY (id))" + ) + df = pd.DataFrame( + { + "id": [1, 2], + "ts": [datetime.datetime(2024, 1, 15, 10, 30), None], + } + ) + # Sanity check: pandas infers a datetime64 column (unit varies by pandas version) + assert pd.api.types.is_datetime64_dtype(df["ts"].dtype) + + conn.execute( + "COPY Test FROM (LOAD FROM $df RETURN " + "CAST(`id` AS INT64) AS `id`, " + "CAST(`ts` AS TIMESTAMP) AS `ts`)", + {"df": df}, + ) + result = conn.execute("MATCH (t:Test) RETURN t.id, t.ts ORDER BY t.id") + row1 = result.get_next() + assert row1[0] == 1 + assert row1[1] == datetime.datetime(2024, 1, 15, 10, 30) + row2 = result.get_next() + assert row2[0] == 2 + assert row2[1] is None + assert not result.has_next() From c33905a439bd00dc8b138851c03415792c0c2b01 Mon Sep 17 00:00:00 2001 From: Miguel de Benito Delgado Date: Tue, 23 Jun 2026 13:27:51 +0200 Subject: [PATCH 4/4] Fix formatting --- test/test_scan_pandas.py | 56 +++++++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/test/test_scan_pandas.py b/test/test_scan_pandas.py index 1cdc0b1..d797e2a 100644 --- a/test/test_scan_pandas.py +++ b/test/test_scan_pandas.py @@ -776,20 +776,40 @@ def test_scan_pandas_datetime_nat(conn_db_empty: ConnDB) -> None: df = pd.DataFrame( { "dt_s": np.array( - [valid_ts, np.datetime64("NaT", "s"), np.datetime64("NaT", "s"), - 
valid_ts2], dtype="datetime64[us]" + [ + valid_ts, + np.datetime64("NaT", "us"), + np.datetime64("NaT", "us"), + valid_ts2, + ], + dtype="datetime64[us]", ), "dt_ns": np.array( - [valid_ts, np.datetime64("NaT", "ns"), np.datetime64("NaT", "ns"), - valid_ts2], dtype="datetime64[ns]" + [ + valid_ts, + np.datetime64("NaT", "ns"), + np.datetime64("NaT", "ns"), + valid_ts2, + ], + dtype="datetime64[ns]", ), } ) @@ -818,9 +838,7 @@ def test_scan_pandas_timedelta_nat(conn_db_empty: ConnDB) -> None: valid_td2 = np.timedelta64(5000000000, "ns") # 5 seconds df = pd.DataFrame( { - "td": np.array( - [valid_td, nat, nat, valid_td2], dtype="timedelta64[ns]" - ), + "td": np.array([valid_td, nat, nat, valid_td2], dtype="timedelta64[ns]"), } ) res = conn.execute("LOAD FROM df RETURN *") @@ -838,9 +856,7 @@ def test_scan_pandas_timedelta_nat(conn_db_empty: ConnDB) -> None: def test_copy_from_datetime_nat(conn_db_empty: ConnDB) -> None: """Test that COPY FROM with NaT datetime stores NULL in the table.""" conn, _ = conn_db_empty - conn.execute( - "CREATE NODE TABLE Test (id INT64, ts TIMESTAMP, PRIMARY KEY (id))" - ) + conn.execute("CREATE NODE TABLE Test (id INT64, ts TIMESTAMP, PRIMARY KEY (id))") valid_ts = np.datetime64("2024-01-15T10:30:00") nat = np.datetime64("NaT", "ns") df = pd.DataFrame( @@ -868,9 +884,7 @@ def test_copy_from_datetime_nat(conn_db_empty: ConnDB) -> None: def test_copy_from_timedelta_nat(conn_db_empty: ConnDB) -> None: """Test that COPY FROM with NaT timedelta stores NULL in the table.""" conn, _ = conn_db_empty - conn.execute( - "CREATE NODE TABLE Test (id INT64, dur INTERVAL, PRIMARY KEY (id))" - ) + conn.execute("CREATE NODE TABLE Test (id INT64, dur INTERVAL, PRIMARY KEY (id))") nat = np.timedelta64("NaT", "ns") valid_td = np.timedelta64(3600000000000, "ns") # 1 hour df = pd.DataFrame( @@ -896,13 +910,13 @@ def test_copy_from_timedelta_nat(conn_db_empty: ConnDB) -> None: def test_copy_from_datetime_none(conn_db_empty: ConnDB) -> None: - """Test that COPY 
FROM with None in a datetime column stores NULL. + """ + Test that COPY FROM with None in a datetime column stores NULL. + Pandas auto-infers the column as datetime64[ns] and converts None to NaT """ conn, _ = conn_db_empty - conn.execute( - "CREATE NODE TABLE Test (id INT64, ts TIMESTAMP, PRIMARY KEY (id))" - ) + conn.execute("CREATE NODE TABLE Test (id INT64, ts TIMESTAMP, PRIMARY KEY (id))") df = pd.DataFrame( { "id": [1, 2],