diff --git a/.github/workflows/SQLiteTests.yml b/.github/workflows/SQLiteTests.yml index c2f404c..4525b41 100644 --- a/.github/workflows/SQLiteTests.yml +++ b/.github/workflows/SQLiteTests.yml @@ -28,9 +28,19 @@ jobs: sudo apt-get update -y -q -o=Dpkg::Use-Pty=0 sudo apt-get install -y -q -o=Dpkg::Use-Pty=0 \ build-essential \ + libcurl4-openssl-dev \ + libssl-dev \ ccache \ cmake \ - ninja-build + ninja-build \ + python3 + + - name: Install rclone + run: | + # apt's rclone predates `serve s3`; install a pinned recent build for the rclone-backed tests. + curl -fsSL https://downloads.rclone.org/v1.74.2/rclone-v1.74.2-linux-amd64.zip -o /tmp/rclone.zip + sudo unzip -oj /tmp/rclone.zip '*/rclone' -d /usr/local/bin + rclone version - name: Cache Key id: cache_key @@ -57,12 +67,27 @@ jobs: key: ${{ steps.cache_key.outputs.value }} - name: Test extension - env: + env: LOCAL_EXTENSION_REPO: ${{ github.workspace }}/build/release/repository/ SQLITE_TPCH_GENERATED: 1 run: | make data/db/tpch.db - ./build/release/test/unittest 'test/*' + # Wrap unittest with the local HTTP server so the remote tests (require-env SQLITE_HTTP_TEST_URL) + # run instead of skipping. The concurrency + s3 tests gate on other env vars and run in the steps below. + python3 scripts/sqlite_http_test_server.py ./build/release/test/unittest 'test/*' + + - name: Test concurrency (rclone serve http) + run: | + # The concurrent-scan and concurrent-lifecycle tests need a robust server; rclone serve http + # handles them (--server rclone-http sets SQLITE_HTTP_ROBUST so they run). + python3 scripts/sqlite_http_test_server.py --server rclone-http \ + ./build/release/test/unittest 'test/sql/scanner/http_sqlite_*concurrent*' + + - name: Test s3:// path (rclone serve s3) + run: | + # Exercise the s3:// transport + the credentialed -wal/-journal sidecar probe via rclone serve s3. + python3 scripts/sqlite_http_test_server.py --server rclone-s3 \ + ./build/release/test/unittest 'test/sql/scanner/http_sqlite_16_s3.test' linux-sanitized: name: Linux ${{ matrix.sanitizer.name }} (amd64) @@ -105,10 +130,20 @@ jobs: sudo apt-get update -y -q -o=Dpkg::Use-Pty=0 sudo apt-get install -y -q -o=Dpkg::Use-Pty=0 \ build-essential \ + libcurl4-openssl-dev \ + libssl-dev \ ccache \ cmake \ mold \ - ninja-build + ninja-build \ + python3 + + - name: Install rclone + run: | + # apt's rclone predates `serve s3`; install a pinned recent build for the rclone-backed tests. + curl -fsSL https://downloads.rclone.org/v1.74.2/rclone-v1.74.2-linux-amd64.zip -o /tmp/rclone.zip + sudo unzip -oj /tmp/rclone.zip '*/rclone' -d /usr/local/bin + rclone version - name: Cache Key id: cache_key @@ -138,7 +173,16 @@ jobs: key: ${{ steps.cache_key.outputs.value }} - name: Test extension - env: + env: LOCAL_EXTENSION_REPO: ${{ github.workspace }}/build/relassert/repository/ run: | - ./build/relassert/test/unittest + # Run under the local HTTP server so the remote tests run (not skip) under the sanitizers -- + # this exercises the VFS lifetime/refcount paths under ASan/TSan. + python3 scripts/sqlite_http_test_server.py ./build/relassert/test/unittest + + - name: Test concurrency under sanitizer (rclone serve http) + run: | + # The concurrency tests under the sanitizers: many contexts register/open/scan/unregister at once, + # contending the VFS registry to surface races / use-after-free in the retire-then-reap path. + python3 scripts/sqlite_http_test_server.py --server rclone-http \ + ./build/relassert/test/unittest 'test/sql/scanner/http_sqlite_*concurrent*' diff --git a/.gitignore b/.gitignore index 25dd41a..41978da 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ .DS_Store .idea +.cache build cmake-build-debug sqlite/build diff --git a/data/db/clean_journal.db b/data/db/clean_journal.db new file mode 100644 index 0000000..805fbd9 Binary files /dev/null and b/data/db/clean_journal.db differ diff --git a/data/db/clean_journal.db-journal b/data/db/clean_journal.db-journal new file mode 100644 index 0000000..a64a5a9 Binary files /dev/null and b/data/db/clean_journal.db-journal differ diff --git a/data/db/hot_journal.db b/data/db/hot_journal.db new file mode 100644 index 0000000..805fbd9 Binary files /dev/null and b/data/db/hot_journal.db differ diff --git a/data/db/hot_journal.db-journal b/data/db/hot_journal.db-journal new file mode 100644 index 0000000..5d7a9a7 Binary files /dev/null and b/data/db/hot_journal.db-journal differ diff --git a/data/db/not_a_database.txt b/data/db/not_a_database.txt new file mode 100644 index 0000000..779b578 --- /dev/null +++ b/data/db/not_a_database.txt @@ -0,0 +1 @@ +This is not a SQLite database. Used by http_sqlite error-handling tests. diff --git a/data/db/partial_journal.db b/data/db/partial_journal.db new file mode 100644 index 0000000..805fbd9 Binary files /dev/null and b/data/db/partial_journal.db differ diff --git a/data/db/partial_journal.db-journal b/data/db/partial_journal.db-journal new file mode 100644 index 0000000..80a61fd Binary files /dev/null and b/data/db/partial_journal.db-journal differ diff --git a/data/db/wal_checkpointed.db b/data/db/wal_checkpointed.db new file mode 100644 index 0000000..b7a93a1 Binary files /dev/null and b/data/db/wal_checkpointed.db differ diff --git a/data/db/wal_mode_snapshot.db b/data/db/wal_mode_snapshot.db new file mode 100644 index 0000000..159e90a Binary files /dev/null and b/data/db/wal_mode_snapshot.db differ diff --git a/data/db/wal_mode_snapshot.db-wal b/data/db/wal_mode_snapshot.db-wal new file mode 100644 index 0000000..76ca81e Binary files /dev/null and b/data/db/wal_mode_snapshot.db-wal differ diff --git a/duckdb b/duckdb index a966898..5013ea6 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit a966898d86b58ce31dc4955897f8d3f99db1bd83 +Subproject commit 5013ea6077186dfbfa162a0af62e9d9501475f31 diff --git a/extension_config.cmake b/extension_config.cmake index f18df4c..60c27a6 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -3,8 +3,21 @@ # Extension from this repo duckdb_extension_load(sqlite_scanner SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} + LOAD_TESTS ) # Any extra extensions that should be built # e.g.: duckdb_extension_load(json) -duckdb_extension_load(tpch) \ No newline at end of file +duckdb_extension_load(tpch) + +# The remote-SQLite tests need httpfs, pinned to the commit the duckdb engine coordinates with +# (APPLY_PATCHES applies the engine's bundled httpfs patches for the dev-engine API; engine and httpfs +# move independently and skew between releases). Skipped on WASM: the remote tests do not run there, +# and httpfs's OpenSSL dependency does not build for emscripten. +if(NOT EMSCRIPTEN) + duckdb_extension_load(httpfs + GIT_URL https://github.com/duckdb/duckdb-httpfs + GIT_TAG 53c5b032f6c368cfcc1a1ac3819118e86d3286a6 + APPLY_PATCHES + ) +endif() diff --git a/scripts/sqlite_http_test_server.py b/scripts/sqlite_http_test_server.py new file mode 100644 index 0000000..808f2b8 --- /dev/null +++ b/scripts/sqlite_http_test_server.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +"""Range-capable HTTP server for the remote-SQLite tests, plus a test command runner. + +DuckDB's httpfs reads remote files with HTTP range requests, so the stock +``python -m http.server`` (which ignores ``Range:``) is not enough. This server: + + * serves files from ``data/db`` with HEAD + ranged GET (206 / Content-Range), and + * exposes ``/status/`` endpoints that return an arbitrary HTTP status, + +so the whole remote suite runs hermetically with no third-party URLs. + +It starts a server, exports the relevant env vars into the child's environment, runs the child, tears +the server down, and exits with the child's return code. + +Usage: + sqlite_http_test_server.py [--server MODE] [args...] + +MODE (default ``stdlib``): + * ``stdlib`` -- the built-in threaded server above (HTTP + /status + /walerr error injection). + Reliable for the serial + error-injection tests. + * ``rclone-http`` -- ``rclone serve http`` (a robust Go server) for the concurrency stress test, which + overwhelms the stdlib server. Sets SQLITE_HTTP_TEST_URL + SQLITE_HTTP_ROBUST. + * ``rclone-s3`` -- ``rclone serve s3`` so the suite can exercise the ``s3://`` path (incl. the + credentialed ``-wal``/``-journal`` sidecar probe). Sets SQLITE_S3_TEST_*. +The rclone modes require the ``rclone`` binary on PATH; tests gate on the env vars above so they skip +where it is not wired. +""" + +import contextlib +import os +import re +import socket +import subprocess +import sys +import tempfile +import threading +import time +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path + +SERVE_DIR = (Path(__file__).resolve().parent.parent / "data" / "db").resolve() +STATUS_RE = re.compile(r"^/status/(\d{3})$") +# /walerr//: serve normally, but return for its "-wal"/"-journal" sidecar. +# Lets the WAL fail-closed guard be tested when the sidecar's state is unverifiable (non-404 error). +WALERR_RE = re.compile(r"^/walerr/(\d{3})/(.+)$") + + +class Handler(BaseHTTPRequestHandler): + protocol_version = "HTTP/1.1" + + def log_message(self, *args): # silence per-request logging + pass + + def _resolve(self, rel): + # Map a relative path to a file under SERVE_DIR, refusing path traversal. Returns None if it + # escapes SERVE_DIR or is not a file. + target = (SERVE_DIR / rel).resolve() + if SERVE_DIR not in target.parents and target != SERVE_DIR: + return None + return target if target.is_file() else None + + def _status_endpoint(self): + m = STATUS_RE.match(self.path.split("?", 1)[0]) + return int(m.group(1)) if m else None + + def _send_simple(self, code, body=b""): + self.send_response(code) + self.send_header("Content-Length", str(len(body))) + self.send_header("Content-Type", "text/plain") + self.end_headers() + if body and self.command != "HEAD": + self.wfile.write(body) + + def do_HEAD(self): + self._serve(head_only=True) + + def do_GET(self): + self._serve(head_only=False) + + def _serve(self, head_only): + code = self._status_endpoint() + if code is not None: + # Emulate httpbin-style status endpoints for error-mapping tests. + self._send_simple(code, f"status {code}\n".encode()) + return + + walerr = WALERR_RE.match(self.path.split("?", 1)[0]) + if walerr: + forced, rel = int(walerr.group(1)), walerr.group(2) + if rel.endswith("-wal") or rel.endswith("-journal"): + # The WAL guard's sidecar probe lands here: return the forced (non-404) error so the + # probe cannot confirm the sidecar absent, and the open must fail closed. + self._send_simple(forced, f"status {forced}\n".encode()) + return + target = self._resolve(rel) + if target is None: + self._send_simple(404, b"not found\n") + return + self._serve_file(target, head_only) + return + + target = self._resolve(self.path.lstrip("/").split("?", 1)[0]) + if target is None: + self._send_simple(404, b"not found\n") + return + + self._serve_file(target, head_only) + + def _serve_file(self, target, head_only): + # Seek+read only the requested span rather than reading the whole file per request: DuckDB's + # CachingFileSystem issues many ranged GETs (more so under parallel scans), and the fixtures + # can be several MB. Only the single `bytes=start-end` (or open-ended `bytes=start-`) form + # httpfs issues is handled; suffix (`bytes=-N`) and multi-range requests are not. + size = target.stat().st_size + rng = self.headers.get("Range") + if rng: + m = re.match(r"bytes=(\d*)-(\d*)", rng.strip()) + if m: + start = int(m.group(1)) if m.group(1) else 0 + end = int(m.group(2)) if m.group(2) else size - 1 + end = min(end, size - 1) + if start > end or start >= size: + self.send_response(416) + self.send_header("Content-Range", f"bytes */{size}") + self.send_header("Content-Length", "0") + self.end_headers() + return + length = end - start + 1 + self.send_response(206) + self.send_header("Content-Type", "application/octet-stream") + self.send_header("Accept-Ranges", "bytes") + self.send_header("Content-Range", f"bytes {start}-{end}/{size}") + self.send_header("Content-Length", str(length)) + self.end_headers() + if not head_only: + with open(target, "rb") as f: + f.seek(start) + self.wfile.write(f.read(length)) + return + + self.send_response(200) + self.send_header("Content-Type", "application/octet-stream") + self.send_header("Accept-Ranges", "bytes") + self.send_header("Content-Length", str(size)) + self.end_headers() + if not head_only: + with open(target, "rb") as f: + self.wfile.write(f.read()) + + +class Server(ThreadingHTTPServer): + # Many DuckDB connections (each parallel-scanning) can hit this server at once. socketserver's + # default listen backlog is 5, so excess simultaneous connections would be reset; raise it, reap + # threads as daemons, and allow address reuse. + request_queue_size = 128 + daemon_threads = True + allow_reuse_address = True + + +# rclone-s3 credentials: the s3 test's CREATE SECRET must use these exact values (see http_sqlite_16). +S3_KEY = "testkey" +S3_SECRET = "testsecret" + + +def _free_port(): + with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +def _wait_for_port(port, timeout=30.0): + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: + s.settimeout(0.5) + if s.connect_ex(("127.0.0.1", port)) == 0: + return True + time.sleep(0.1) + return False + + +def run_stdlib(cmd): + server = Server(("127.0.0.1", 0), Handler) + port = server.server_address[1] + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + env = dict(os.environ) + env["SQLITE_HTTP_TEST_URL"] = f"http://127.0.0.1:{port}" + try: + return subprocess.run(cmd, env=env).returncode + finally: + server.shutdown() + server.server_close() + + +def run_rclone(cmd, scheme): + # scheme "http": rclone serve http (robust file server for the concurrency stress test). + # scheme "s3": rclone serve s3 over data/'s parent so data/db is the "db" bucket (s3://db/). + port = _free_port() + env = dict(os.environ) + if scheme == "http": + args = ["rclone", "serve", "http", "--addr", f"127.0.0.1:{port}", "--read-only", str(SERVE_DIR)] + env["SQLITE_HTTP_TEST_URL"] = f"http://127.0.0.1:{port}" + env["SQLITE_HTTP_ROBUST"] = "1" # marker so the concurrency test runs only on the robust server + else: # s3 + args = [ + "rclone", + "serve", + "s3", + "--addr", + f"127.0.0.1:{port}", + "--auth-key", + f"{S3_KEY},{S3_SECRET}", + str(SERVE_DIR.parent), + ] + env["SQLITE_S3_TEST_ENDPOINT"] = f"127.0.0.1:{port}" + env["SQLITE_S3_TEST_URL"] = f"s3://{SERVE_DIR.name}" # s3://db + proc_err = tempfile.TemporaryFile() + proc = subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=proc_err) + try: + if not _wait_for_port(port): + proc_err.seek(0) + detail = proc_err.read().decode(errors="replace").strip() + print(f"rclone serve {scheme} did not become ready on port {port}: {detail}", file=sys.stderr) + return 3 + return subprocess.run(cmd, env=env).returncode + finally: + proc.terminate() + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + proc.kill() + + +def main(): + args = sys.argv[1:] + mode = "stdlib" + if args and args[0] == "--server": + if len(args) < 3: + print("usage: --server [args...]", file=sys.stderr) + return 2 + mode = args[1] + args = args[2:] + if not args: + print("usage: sqlite_http_test_server.py [--server MODE] [args...]", file=sys.stderr) + return 2 + if mode == "stdlib": + return run_stdlib(args) + if mode == "rclone-http": + return run_rclone(args, "http") + if mode == "rclone-s3": + return run_rclone(args, "s3") + print(f"unknown server mode: {mode}", file=sys.stderr) + return 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f514eca..3ce9f4f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -5,7 +5,7 @@ add_subdirectory(storage) add_library( sqlite_ext_library OBJECT - sqlite_db.cpp sqlite_extension.cpp sqlite_scanner.cpp sqlite_stmt.cpp + sqlite_db.cpp sqlite_duckdb_vfs_cache.cpp sqlite_extension.cpp sqlite_scanner.cpp sqlite_stmt.cpp sqlite_storage.cpp sqlite_utils.cpp) set(ALL_OBJECT_FILES ${ALL_OBJECT_FILES} $ diff --git a/src/include/sqlite_db.hpp b/src/include/sqlite_db.hpp index 8c156b2..7e15f0d 100644 --- a/src/include/sqlite_db.hpp +++ b/src/include/sqlite_db.hpp @@ -14,6 +14,7 @@ namespace duckdb { class SQLiteStatement; struct IndexInfo; +class ClientContext; class SQLiteDB { public: @@ -30,7 +31,13 @@ class SQLiteDB { sqlite3 *db; public: + //! Open a SQLite database (local files only) static SQLiteDB Open(const string &path, const SQLiteOpenOptions &options, bool is_shared = false); + //! Open a SQLite database, routing paths DuckDB's FileSystem owns (remote, WASM) through the VFS + //! and plain local files through native SQLite + //! @param context Required for opening through DuckDB's FileSystem VFS + static SQLiteDB Open(const string &path, const SQLiteOpenOptions &options, ClientContext &context, + bool is_shared = false); bool TryPrepare(const string &query, SQLiteStatement &result); SQLiteStatement Prepare(const string &query); void Execute(const string &query); @@ -53,6 +60,16 @@ class SQLiteDB { bool IsOpen(); void Close(); + +private: + static int GetOpenFlags(const SQLiteOpenOptions &options, bool is_shared, bool force_read_only = false); + static void ApplyBusyTimeout(sqlite3 *db, const SQLiteOpenOptions &options); + static void HandleOpenError(const string &path, int rc, ClientContext *context = nullptr); + //! Open read-only through DuckDB's FileSystem (the caching VFS) + static SQLiteDB OpenWithVFS(const string &path, const SQLiteOpenOptions &options, ClientContext &context, + bool is_shared); + //! Open a plain local SQLite database file through native SQLite + static SQLiteDB OpenLocal(const string &path, const SQLiteOpenOptions &options, bool is_shared = false); }; } // namespace duckdb diff --git a/src/include/sqlite_duckdb_vfs_cache.hpp b/src/include/sqlite_duckdb_vfs_cache.hpp new file mode 100644 index 0000000..3630db1 --- /dev/null +++ b/src/include/sqlite_duckdb_vfs_cache.hpp @@ -0,0 +1,112 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// sqlite_duckdb_vfs_cache.hpp +// +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "duckdb.hpp" +#include "duckdb/common/file_system.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/storage/buffer/buffer_handle.hpp" +#include "duckdb/storage/buffer_manager.hpp" +#include "duckdb/storage/external_file_cache/caching_file_system.hpp" + +#include "sqlite3.h" + +namespace duckdb { + +class ClientContext; +struct DuckDBVFSWrapper; + +class DuckDBCachedFile { +public: + DuckDBCachedFile(ClientContext &context, const string &path); + ~DuckDBCachedFile() = default; + + int Read(void *buffer, int amount, sqlite3_int64 offset); + sqlite3_int64 GetFileSize(); + +private: + void EnsureInitialized(); + // Open the remote file, fetch its size, and run the WAL-mode guard. Throws on any failure; + // EnsureInitialized() wraps it to memoize that failure. + void InitializeFromRemote(); + + ClientContext &context; + const string path; + // CachingFileHandle stores a *reference* to the CachingFileSystem that created it + // (and dereferences it on every Read to reach the DatabaseInstance / TaskScheduler), + // so the CachingFileSystem must outlive the handle. CachingFileSystem::Get() returns + // a value, so we own it here. Declared before caching_handle so it is destroyed after it. + unique_ptr caching_fs; + unique_ptr caching_handle; + bool initialized = false; + // EnsureInitialized() opens the remote file (an HTTP round trip or two). If that fails it is + // remembered here so later Read()/GetFileSize() calls fail immediately with the same message + // rather than re-issuing the network I/O on every SQLite page request. + bool init_failed = false; + string init_error_message; + // File size, fetched once at initialization. The remote file is read-only and immutable, + // so the size never changes; -1 means "not yet initialized / size lookup failed". + sqlite3_int64 cached_file_size = -1; +}; + +class SQLiteDuckDBCacheVFS { +public: + static void Register(ClientContext &context); + static void Unregister(ClientContext &context); + // Returns true when the path should be opened through this VFS (DuckDB's FileSystem) rather than + // native SQLite: any path DuckDB treats as remote, plus all paths on WASM (no native file I/O). + static bool CanHandlePath(const string &path); + // Returned by value so the name stays valid after the registry lock is released + // (the underlying buffer can be freed by Unregister). + static string GetVFSNameForContext(ClientContext &context); + // Returns the rich httpfs error (e.g. "HTTP Error: ... (HTTP NNN)") recorded by this context's VFS. + // Empty if none. Copied under the registry lock. Enriches the user-facing open error that SQLite + // otherwise collapses to a terse per-code string. + static string GetLastErrorForContext(ClientContext &context); + // Reset the recorded error before an open attempt so a later GetLastErrorForContext reflects only + // that attempt (the message persists across opens on the same context otherwise). + static void ClearLastErrorForContext(ClientContext &context); + + // Must be public for C callback registration. + static int Open(sqlite3_vfs *vfs, const char *filename, sqlite3_file *file, int flags, int *out_flags); + static int Delete(sqlite3_vfs *vfs, const char *filename, int sync_dir); + static int Access(sqlite3_vfs *vfs, const char *filename, int flags, int *result); + static int FullPathname(sqlite3_vfs *vfs, const char *filename, int out_size, char *out_buf); + static void *DlOpen(sqlite3_vfs *vfs, const char *filename); + static void DlError(sqlite3_vfs *vfs, int bytes, char *err_msg); + static void (*DlSym(sqlite3_vfs *vfs, void *handle, const char *symbol))(void); + static void DlClose(sqlite3_vfs *vfs, void *handle); + static int Randomness(sqlite3_vfs *vfs, int bytes, char *out); + static int Sleep(sqlite3_vfs *vfs, int microseconds); + static int CurrentTime(sqlite3_vfs *vfs, double *time); + static int GetLastError(sqlite3_vfs *vfs, int bytes, char *err_msg); + + static int Close(sqlite3_file *file); + static int Read(sqlite3_file *file, void *buffer, int amount, sqlite3_int64 offset); + static int Write(sqlite3_file *file, const void *buffer, int amount, sqlite3_int64 offset); + static int Truncate(sqlite3_file *file, sqlite3_int64 size); + static int Sync(sqlite3_file *file, int flags); + static int FileSize(sqlite3_file *file, sqlite3_int64 *size); + static int Lock(sqlite3_file *file, int level); + static int Unlock(sqlite3_file *file, int level); + static int CheckReservedLock(sqlite3_file *file, int *result); + static int FileControl(sqlite3_file *file, int op, void *arg); + static int SectorSize(sqlite3_file *file); + static int DeviceCharacteristics(sqlite3_file *file); +}; + +// Allocated by SQLite; may cross module boundaries -- raw pointers with explicit ownership. +struct SQLiteDuckDBCachedFile { + sqlite3_file base; // Must be first member for C compatibility + DuckDBCachedFile *duckdb_file; // deleted in Close() + DuckDBVFSWrapper *wrapper; // VFS instance owning the io_methods this file points into; + // refcounted so the wrapper outlives its open files +}; + +} // namespace duckdb diff --git a/src/include/storage/sqlite_transaction.hpp b/src/include/storage/sqlite_transaction.hpp index 0a71689..ecdc8aa 100644 --- a/src/include/storage/sqlite_transaction.hpp +++ b/src/include/storage/sqlite_transaction.hpp @@ -10,6 +10,8 @@ #include "duckdb/transaction/transaction.hpp" #include "duckdb/common/case_insensitive_map.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/common/atomic.hpp" #include "sqlite_db.hpp" namespace duckdb { @@ -38,6 +40,16 @@ class SQLiteTransaction : public Transaction { SQLiteDB *db; SQLiteDB owned_db; unique_ptr catalog_map; + + // Remote (HTTP/S3) databases defer their connection open + BEGIN to first use (GetDB()) so no + // network I/O runs while the MetaTransaction lock is held in StartTransaction(). Local and + // in-memory databases open eagerly in the constructor. `initialized` is atomic for the + // lock-free fast path in GetDB(); `started` is written in Start() (before any scan thread) or under + // init_lock in GetDB(), and read in Start(), Commit, and Rollback (the latter two after the scans + // join), so it needs no atomic. + mutex init_lock; + atomic initialized {false}; + bool started = false; }; } // namespace duckdb diff --git a/src/include/storage/sqlite_transaction_manager.hpp b/src/include/storage/sqlite_transaction_manager.hpp index 8f9cafc..b6a736d 100644 --- a/src/include/storage/sqlite_transaction_manager.hpp +++ b/src/include/storage/sqlite_transaction_manager.hpp @@ -26,6 +26,8 @@ class SQLiteTransactionManager : public TransactionManager { void Checkpoint(ClientContext &context, bool force = false) override; private: + void ExtractAndCloseAfterUnlock(Transaction &transaction); + SQLiteCatalog &sqlite_catalog; mutex transaction_lock; reference_map_t> transactions; diff --git a/src/sqlite_db.cpp b/src/sqlite_db.cpp index 27eaf27..e02cdb0 100644 --- a/src/sqlite_db.cpp +++ b/src/sqlite_db.cpp @@ -4,8 +4,12 @@ #include "duckdb/storage/table_storage_info.hpp" #include "duckdb/parser/column_list.hpp" #include "duckdb/parser/parser.hpp" +#include "duckdb/common/exception.hpp" +#include "duckdb/common/file_system.hpp" +#include "duckdb/main/client_context.hpp" #include "sqlite_db.hpp" #include "sqlite_stmt.hpp" +#include "sqlite_duckdb_vfs_cache.hpp" namespace duckdb { @@ -21,7 +25,9 @@ SQLiteDB::~SQLiteDB() { Close(); } -SQLiteDB::SQLiteDB(SQLiteDB &&other) noexcept { +SQLiteDB::SQLiteDB(SQLiteDB &&other) noexcept : db(nullptr) { + // db must be initialized before the swap: as a constructor, the member is otherwise indeterminate, + // and swapping garbage into `other` would make other's destructor sqlite3_close() a wild pointer. std::swap(db, other.db); } @@ -30,41 +36,154 @@ SQLiteDB &SQLiteDB::operator=(SQLiteDB &&other) noexcept { return *this; } -SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, bool is_shared) { - SQLiteDB result; +int SQLiteDB::GetOpenFlags(const SQLiteOpenOptions &options, bool is_shared, bool force_read_only) { int flags = SQLITE_OPEN_PRIVATECACHE; - if (options.access_mode == AccessMode::READ_ONLY) { + + if (force_read_only || options.access_mode == AccessMode::READ_ONLY) { + // VFS-backed opens (remote/WASM) are always read-only flags |= SQLITE_OPEN_READONLY; } else { flags |= SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE; } + if (!is_shared) { // FIXME: we should just make sure we are not re-using the same `sqlite3` // object across threads flags |= SQLITE_OPEN_NOMUTEX; } + flags |= SQLITE_OPEN_EXRESCODE; - auto rc = sqlite3_open_v2(path.c_str(), &result.db, flags, nullptr); - if (rc != SQLITE_OK) { - throw std::runtime_error("Unable to open database \"" + path + "\": " + string(sqlite3_errstr(rc))); - } - // default busy time-out of 5 seconds + return flags; +} + +void SQLiteDB::ApplyBusyTimeout(sqlite3 *db, const SQLiteOpenOptions &options) { if (options.busy_timeout > 0) { if (options.busy_timeout > NumericLimits::Maximum()) { throw std::runtime_error("busy_timeout out of range - must be within " "valid range for type int"); } - rc = sqlite3_busy_timeout(result.db, int(options.busy_timeout)); + auto rc = sqlite3_busy_timeout(db, int(options.busy_timeout)); if (rc != SQLITE_OK) { throw std::runtime_error("Failed to set busy timeout"); } } +} + +void SQLiteDB::HandleOpenError(const string &path, int rc, ClientContext *context) { + string error_msg; + int primary_error = rc & 0xFF; + + // True when the open went through our VFS: OpenWithVFS passes a context (OpenLocal does not), and + // it is only reached for paths the VFS handles. + const bool used_vfs = context && SQLiteDuckDBCacheVFS::CanHandlePath(path); + + switch (primary_error) { + case SQLITE_CANTOPEN: + error_msg = "unable to open database file"; + break; + case SQLITE_PERM: + error_msg = "access permission denied"; + break; + case SQLITE_IOERR: + // Through the VFS, I/O errors are filesystem/network issues, not a local disk fault + if (used_vfs) { + error_msg = "unable to open database file"; + } else { + error_msg = "disk I/O error"; + } + break; + case SQLITE_BUSY: + error_msg = "database is locked"; + break; + case SQLITE_NOMEM: + error_msg = "out of memory"; + break; + case SQLITE_READONLY: + error_msg = "attempt to write a readonly database"; + break; + case SQLITE_CORRUPT: + error_msg = "file is not a database"; + break; + default: + error_msg = sqlite3_errstr(rc); + break; + } + // For VFS-backed opens, enrich the terse SQLite-code message with the full filesystem error this + // attempt recorded (e.g. HTTP status/URL), which SQLite otherwise discards. Cleared at open start, + // so a non-empty value belongs to this attempt; empty (e.g. the bytes arrived but were not a + // database) leaves the terse message untouched. + if (used_vfs) { + const string detail = SQLiteDuckDBCacheVFS::GetLastErrorForContext(*context); + if (!detail.empty()) { + throw ConnectionException("Unable to open database \"%s\": %s (%s)", path, error_msg, detail); + } + } + throw ConnectionException("Unable to open database \"%s\": %s", path, error_msg); +} + +SQLiteDB SQLiteDB::OpenLocal(const string &path, const SQLiteOpenOptions &options, bool is_shared) { + SQLiteDB result; + int flags = GetOpenFlags(options, is_shared, false); + + auto rc = sqlite3_open_v2(path.c_str(), &result.db, flags, nullptr); + if (rc != SQLITE_OK) { + throw std::runtime_error("Unable to open database \"" + path + "\": " + string(sqlite3_errstr(rc))); + } + + ApplyBusyTimeout(result.db, options); + if (!options.journal_mode.empty()) { result.Execute("PRAGMA journal_mode=" + KeywordHelper::EscapeQuotes(options.journal_mode, '\'')); } return result; } +// Opens a read-only SQLite database through DuckDB's FileSystem (the caching VFS), for any path +// DuckDB owns rather than a native local file. +SQLiteDB SQLiteDB::OpenWithVFS(const string &path, const SQLiteOpenOptions &options, ClientContext &context, + bool is_shared) { + SQLiteDuckDBCacheVFS::Register(context); + // Reset any error recorded by a previous open on this context, so a failure below surfaces + // only the rich httpfs error produced by THIS attempt (see HandleOpenError). + SQLiteDuckDBCacheVFS::ClearLastErrorForContext(context); + + SQLiteDB result; + int flags = GetOpenFlags(options, is_shared, true); + + const auto vfs_name = SQLiteDuckDBCacheVFS::GetVFSNameForContext(context); + auto rc = sqlite3_open_v2(path.c_str(), &result.db, flags, vfs_name.c_str()); + if (rc != SQLITE_OK) { + HandleOpenError(path, rc, &context); + } + + ApplyBusyTimeout(result.db, options); + + // Keep SQLite-side temp B-trees (sorter/materialization spills) in memory: this read-only VFS + // rejects the nameless temp-file open its xOpen would receive, and a remote read-only database has + // no local scratch space to spill to. + result.Execute("PRAGMA temp_store=MEMORY"); + + return result; +} + +// Overload for callers without a ClientContext (in-memory / local only). A remote path must go through +// the caching VFS, so assert it is not one here. (":memory:" is not a remote file on any platform, so +// it stays valid, including on WASM.) +SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, bool is_shared) { + D_ASSERT(!FileSystem::IsRemoteFile(path)); + return OpenLocal(path, options, is_shared); +} + +// Main entry point for opening SQLite databases. +// Paths DuckDB's FileSystem owns (any remote filesystem, and all paths on WASM) are opened read-only +// through the caching VFS; plain local files use native SQLite (read-write, locking, WAL). +SQLiteDB SQLiteDB::Open(const string &path, const SQLiteOpenOptions &options, ClientContext &context, bool is_shared) { + if (SQLiteDuckDBCacheVFS::CanHandlePath(path)) { + return OpenWithVFS(path, options, context, is_shared); + } + return OpenLocal(path, options, is_shared); +} + bool SQLiteDB::TryPrepare(const string &query, SQLiteStatement &stmt) { stmt.db = db; if (debug_sqlite_print_queries) { diff --git a/src/sqlite_duckdb_vfs_cache.cpp b/src/sqlite_duckdb_vfs_cache.cpp new file mode 100644 index 0000000..cdb2c4e --- /dev/null +++ b/src/sqlite_duckdb_vfs_cache.cpp @@ -0,0 +1,963 @@ +//===----------------------------------------------------------------------===// +// DuckDB +// +// sqlite_duckdb_vfs_cache.cpp +// +// +//===----------------------------------------------------------------------===// + +#include "sqlite_duckdb_vfs_cache.hpp" + +#include "duckdb/common/exception.hpp" +#include "duckdb/common/exception/http_exception.hpp" +#include "duckdb/common/error_data.hpp" +#include "duckdb/common/file_system.hpp" +#include "duckdb/common/mutex.hpp" +#include "duckdb/common/string_util.hpp" +#include "duckdb/common/unordered_map.hpp" +#include "duckdb/common/operator/cast_operators.hpp" +#include "duckdb/common/re2_regex.hpp" +#include "duckdb/main/client_context.hpp" +#include "duckdb/main/database.hpp" +#include "duckdb/storage/buffer_manager.hpp" +#include "duckdb/common/atomic.hpp" + +#include + +namespace duckdb { + +//===--------------------------------------------------------------------===// +// Concurrency Design +//===--------------------------------------------------------------------===// +// One ClientContext drives PARALLEL scans (SqliteMaxThreads may be >1), so several SQLite +// connections - each its own sqlite3_file - share one per-context VFS wrapper at once, and Open(), +// Close() race Unregister() at connection teardown. A per-wrapper open-handle refcount, mutated only +// under registry_mutex, governs wrapper lifetime. Unregister() retires a wrapper rather than freeing +// it: sqlite3_open_v2 resolves the sqlite3_vfs pointer before calling xOpen, so a synchronous free can +// race an in-flight xOpen. A retired wrapper with live handles is reaped by its last Close(); a +// zero-handle one by the next teardown's ReapIdleRetiredWrappers(). A closing handle therefore never +// dereferences freed io_methods. +// SQLite never issues concurrent calls on one sqlite3_file, so per-handle ops (Read/FileSize/...) +// need no locking; the wrapper-shared last_error is mutex-guarded but best-effort (a later failure +// can overwrite an earlier one). Lock order: registry_mutex is the outer lock - the GetLastError / +// ClearLastErrorForContext path takes it and then calls into the wrapper's error_mutex, so never +// acquire registry_mutex while holding error_mutex. ClientContext must outlive all its SQLite +// connections; the shared ExternalFileCache lives at the DatabaseInstance level and is synchronized +// by DuckDB. +//===--------------------------------------------------------------------===// + +// Per-context VFS instances. vfs_name uses sqlite3_malloc for cross-DLL safety. +struct DuckDBVFSWrapper { + sqlite3_vfs base; // Must be first - SQLite VFS structure + ClientContext *context; + char *vfs_name; + sqlite3_io_methods io_methods; + + mutable mutex error_mutex; + string last_error_message; + + // Open-handle refcount + retire flag; read and written only under registry_mutex. + idx_t open_file_count = 0; + bool pending_unregister = false; + + ~DuckDBVFSWrapper() noexcept { + if (vfs_name) { + sqlite3_free(vfs_name); + vfs_name = nullptr; + } + } + + void SetLastError(const string &error) noexcept { + // Best-effort: recording a diagnostic must never throw out of a VFS callback into SQLite's C frame. + try { + lock_guard lock(error_mutex); + last_error_message = error; + } catch (...) { + } + } + + string GetLastError() const { + lock_guard lock(error_mutex); + return last_error_message; + } +}; + +//===--------------------------------------------------------------------===// +// HTTP Error Mapping +//===--------------------------------------------------------------------===// + +// HTTP/network-specific markers only. A bare "exception_type":"IO" is excluded: DuckDB also raises +// IOException for local faults (e.g. a disk-full write to the on-disk block cache), which are not HTTP +// errors. Network failures are identified by the HTTP ExceptionType, the parsed status code, or the +// connection-specific strings below. +static constexpr const char *HTTP_ERROR_PATTERNS[] = {"Unable to connect to URL", "Could not establish connection", + "HTTP HEAD to", "HTTP GET to"}; + +static int ExtractHttpStatus(const string &error_msg) { + // Group 1: "status_code":"XXX" (JSON) + // Group 2: XXX (Description) (httpfs format) + // Group 3: (HTTP XXX) or HTTP code XXX or HTTP XXX + static duckdb_re2::Regex status_regex( + "\"status_code\":\"(\\d{3})\"|" // JSON format + "(\\d{3})\\s*\\([^)]+\\)|" // "404 (Not Found)" + "\\(?HTTP\\s+(?:code\\s+)?(\\d{3})\\)?" // "(HTTP 404)", "HTTP code 403", "HTTP 500" + ); + + duckdb_re2::Match match; + if (duckdb_re2::RegexSearch(error_msg, match, status_regex)) { + for (idx_t i = 1; i < match.groups.size(); i++) { + if (!match.groups[i].text.empty()) { + int32_t result; + if (TryCast::Operation(string_t(match.groups[i].text), result)) { + return result; + } + } + } + } + + return 0; +} + +static bool IsHttpError(const string &error_msg) { + return std::any_of(std::begin(HTTP_ERROR_PATTERNS), std::end(HTTP_ERROR_PATTERNS), + [&error_msg](const char *pattern) { return StringUtil::Contains(error_msg, pattern); }); +} + +// Map an HTTP status code to a SQLite error code. Only 404 and 429 get a specific code; every other +// status returns 0 here, so the caller's default applies: SQLITE_CANTOPEN on open, SQLITE_IOERR_READ +// on read. +static int HttpStatusToSqliteError(int http_status) { + switch (http_status) { + case 404: + return SQLITE_CANTOPEN; // SQLite interprets this as "unable to open database file" + case 429: + // Too Many Requests. SQLITE_BUSY is the closest "transient, retry" code; SQLite's busy handler + // does not auto-retry an xRead I/O error, so a mid-scan 429 surfaces to the caller. + return SQLITE_BUSY; + default: + return 0; // Unmapped: the caller's default error code applies. + } +} + +// Extract the HTTP status code from a caught exception. Prefer the structured status_code that +// DuckDB's HTTPException records in its extra-info map; fall back to parsing the message text for +// error sources that do not populate it. +static int GetHttpStatus(const ErrorData &error_data, const string &error_msg) { + auto &info = error_data.ExtraInfo(); + auto it = info.find("status_code"); + if (it != info.end()) { + int32_t code; + if (TryCast::Operation(string_t(it->second), code)) { + return code; + } + } + return ExtractHttpStatus(error_msg); +} + +// Build a sidecar URL ("-wal" / "-journal") by inserting the suffix before any "?query", so the probe +// targets the right object rather than appending past the query string. +static string BuildSidecarUrl(const string &path, const char *suffix) { + const auto query_pos = path.find('?'); + return (query_pos == string::npos) ? path + suffix + : path.substr(0, query_pos) + suffix + path.substr(query_pos); +} + +// Classify an exception from a sidecar probe. Returns true only when the sidecar is CONFIRMED absent +// (HTTP 404, or a non-HTTP "not found"): the database has no live sidecar and the main file is safe to +// read. Any other error (403 on a bare presigned URL, a transient 429/503, a timeout) leaves the +// sidecar's state UNKNOWN, so the caller must fail closed rather than assume absent. +static bool SidecarConfirmedAbsent(const std::exception &e) { + const ErrorData error_data(e); + const string error_msg = e.what(); + const int http_status = GetHttpStatus(error_data, error_msg); + return http_status == 404 || + (http_status == 0 && error_data.Type() != ExceptionType::HTTP && !IsHttpError(error_msg) && + (StringUtil::Contains(error_msg, "No files found") || + StringUtil::Contains(error_msg, "does not exist") || StringUtil::Contains(error_msg, "No such file"))); +} + +template +static T SafeVFSCall(T error_value, Func &&func, DuckDBVFSWrapper *wrapper = nullptr, const char *path = nullptr) { + // The error-mapping below compares error_value against SQLITE_OK and returns SQLite error codes, + // so it is only meaningful for int returns. (xGetLastError, whose return is an OS errno rather + // than a SQLite code, does not route through here.) + static_assert(std::is_same::value, "SafeVFSCall maps to SQLite error codes; T must be int"); + try { + return func(); + } catch (const std::exception &e) { + // SafeVFSCall exists to keep C++ exceptions out of SQLite's C call frame, but the diagnostic + // and mapping work below itself allocates (string copies, ErrorData JSON parse, regex, + // SetLastError) and can throw (e.g. std::bad_alloc, under the same memory pressure that + // produced the original failure). An inner guard degrades a failed diagnosis to the safe + // default error code rather than letting it escape into C (which would be UB). + try { + const string error_msg = e.what(); + const ErrorData error_data(e); + const int http_status = GetHttpStatus(error_data, error_msg); + // Clean, human-readable message for the stored error (DuckDB serializes HTTPException as + // JSON in what(); RawMessage() is just the "exception_message", e.g. the "HTTP GET error + // on '...' (HTTP 404 Not Found)" text). error_msg keeps the raw form for status parsing. + const string &clean_msg = error_data.RawMessage(); + + if (http_status != 0 || error_data.Type() == ExceptionType::HTTP || IsHttpError(error_msg)) { + if (wrapper) { + string full_error = "HTTP Error: "; + full_error += clean_msg; + if (path) { + full_error += " (URL: "; + full_error += path; + full_error += ")"; + } + wrapper->SetLastError(full_error); + } + + int sqlite_error = HttpStatusToSqliteError(http_status); + if (sqlite_error != 0) { + return sqlite_error; + } + + if (error_msg.find("Unable to connect to URL") != string::npos || + error_msg.find("Could not establish connection") != string::npos) { + return error_value == SQLITE_OK ? SQLITE_CANTOPEN : error_value; + } + + return error_value == SQLITE_OK ? SQLITE_IOERR : error_value; + } + + if (error_data.Type() == ExceptionType::PERMISSION || + error_msg.find("Permission") != string::npos) { + if (wrapper) { + string full_error = "Permission denied: "; + full_error += clean_msg; + if (path) { + full_error += " (Path: "; + full_error += path; + full_error += ")"; + } + wrapper->SetLastError(full_error); + } + return error_value == SQLITE_OK ? SQLITE_PERM : error_value; + } + + if (wrapper) { + string full_error = "Error: "; + full_error += clean_msg; + if (path) { + full_error += " (Path: "; + full_error += path; + full_error += ")"; + } + wrapper->SetLastError(full_error); + } + + return error_value; + } catch (...) { + // The diagnosis/mapping itself threw (e.g. allocation failure) - return the safe default. + return error_value; + } + } catch (...) { + // Unknown (non-std) exception. SetLastError allocates, so guard it too. + try { + if (wrapper) { + wrapper->SetLastError("Unknown error occurred"); + } + } catch (...) { + } + return error_value; + } +} + +struct VFSRegistryData { + mutex registry_mutex; + unordered_map> registry; + // Wrappers Unregister() retired; reaped by the last Close() (handles still open) or by + // ReapIdleRetiredWrappers() at the next teardown (retired with zero handles). + vector> retired; +}; + +static VFSRegistryData &GetVFSRegistryData() { + static VFSRegistryData data; + return data; +} + +// Free zero-handle retired wrappers. Runs under registry_mutex, called only from Register/Unregister. +// A wrapper is retired only by Unregister(context), which DuckDB invokes from OnConnectionClosed once +// that context is quiescent: opens on a context's per-context VFS are issued only by that context, so +// once it is closing none is in flight and none can start. A retired wrapper at open_file_count == 0 +// therefore has no live or pending sqlite3_open_v2 resolving its sqlite3_vfs pointer, and freeing it +// here is safe. Wrappers retired with live handles are instead reaped by their last Close(). +static void ReapIdleRetiredWrappers(VFSRegistryData ®istry_data) { + auto &retired = registry_data.retired; + for (auto it = retired.begin(); it != retired.end();) { + if ((*it)->open_file_count == 0) { + it = retired.erase(it); + } else { + ++it; + } + } +} + +static constexpr int SQLITE_SECTOR_SIZE = 4096; + +static void InitializeIOMethods(sqlite3_io_methods &io_methods) { + memset(&io_methods, 0, sizeof(io_methods)); + + io_methods.iVersion = 1; + io_methods.xClose = SQLiteDuckDBCacheVFS::Close; + io_methods.xRead = SQLiteDuckDBCacheVFS::Read; + io_methods.xWrite = SQLiteDuckDBCacheVFS::Write; + io_methods.xTruncate = SQLiteDuckDBCacheVFS::Truncate; + io_methods.xSync = SQLiteDuckDBCacheVFS::Sync; + io_methods.xFileSize = SQLiteDuckDBCacheVFS::FileSize; + io_methods.xLock = SQLiteDuckDBCacheVFS::Lock; + io_methods.xUnlock = SQLiteDuckDBCacheVFS::Unlock; + io_methods.xCheckReservedLock = SQLiteDuckDBCacheVFS::CheckReservedLock; + io_methods.xFileControl = SQLiteDuckDBCacheVFS::FileControl; + io_methods.xSectorSize = SQLiteDuckDBCacheVFS::SectorSize; + io_methods.xDeviceCharacteristics = SQLiteDuckDBCacheVFS::DeviceCharacteristics; + io_methods.xShmMap = nullptr; + io_methods.xShmLock = nullptr; + io_methods.xShmBarrier = nullptr; + io_methods.xShmUnmap = nullptr; + io_methods.xFetch = nullptr; + io_methods.xUnfetch = nullptr; +} + +static string GetUniqueVFSName() { + static atomic vfs_counter {0}; + return "duckdb_cache_vfs_" + to_string(vfs_counter.fetch_add(1)); +} + +//===--------------------------------------------------------------------===// +// DuckDBCachedFile Implementation +//===--------------------------------------------------------------------===// + +DuckDBCachedFile::DuckDBCachedFile(ClientContext &context, const string &path) : context(context), path(path) { + // Defer open: DuckDB operations must not run inside the VFS callback frame. +} + +void DuckDBCachedFile::EnsureInitialized() { + if (initialized) { + return; + } + // A prior attempt failed (e.g. WAL rejection, or a network error fetching the header). Reads on a + // remote file go through SQLite's pager, which retries xRead many times per query; without this + // short-circuit each retry would re-issue the open's HTTP round-trips and the rich failure reason + // (e.g. the WAL message) would be lost behind a generic per-read I/O error. + if (init_failed) { + throw IOException(init_error_message); + } + try { + InitializeFromRemote(); + } catch (const std::exception &e) { + init_failed = true; + init_error_message = ErrorData(e).RawMessage(); + throw; + } +} + +void DuckDBCachedFile::InitializeFromRemote() { + // DIRECT_IO: bypass OS page cache; DuckDB's CachingFileSystem provides its own block cache. + auto flags = FileFlags::FILE_FLAGS_READ; + if (FileSystem::IsRemoteFile(path)) { + flags |= FileFlags::FILE_FLAGS_DIRECT_IO; + } + + // Build into locals and commit to members only once every step succeeds, so a throw on a retry + // leaves the prior members intact. (caching_fs must outlive caching_handle; see the member decl.) + auto new_fs = make_uniq(CachingFileSystem::Get(context)); + OpenFileInfo file_info(path); + // enable_external_access (and the allow-list) is enforced here: CachingFileSystem wraps the + // context's OpenerFileSystem, whose OpenFile throws PermissionException before any network I/O + // when the path is not permitted. The VFS adds no bypass; it must open via the context filesystem + // (no explicit opener) so that gate always applies. + auto new_handle = new_fs->OpenFile(file_info, flags); + // Fetch the size once: the remote file is read-only and immutable, so this avoids a per-read size + // lookup that could otherwise cost an HTTP round trip. + const auto size = static_cast(new_handle->GetFileSize()); + + // This VFS serves only the main database file and advertises SQLITE_IOCAP_IMMUTABLE, so SQLite skips + // hot-journal/WAL recovery. A database with a live sidecar - a populated "-wal" (WAL mode) or a hot + // "-journal" (an interrupted rollback-mode write) - would then be read as if clean, silently serving + // a stale or inconsistent snapshot. Verify no live sidecar before serving: header byte 19 (the + // read-format version) picks which sidecar to check - 2 = WAL, 1 = rollback journal, the same byte + // SQLite's own pager keys WAL-mode access off. Each probe fails CLOSED unless the sidecar is + // confirmed absent (see SidecarConfirmedAbsent). + if (size >= 20) { + data_t header[20]; + new_handle->Read(20, 0).CopyTo(data_ptr_cast(header), 20); + const bool is_sqlite = memcmp(header, "SQLite format 3", 16) == 0; + const bool wal_mode = header[19] >= 2; + if (is_sqlite && wal_mode) { + // WAL mode: reject a "-wal" that holds more than its 32-byte header (i.e. has frames). + const string wal_path = BuildSidecarUrl(path, "-wal"); + idx_t wal_size = 0; + try { + wal_size = new_fs->OpenFile(OpenFileInfo(wal_path), flags)->GetFileSize(); + } catch (const std::exception &e) { + if (!SidecarConfirmedAbsent(e)) { + throw IOException( + "Cannot verify whether remote SQLite database \"%s\" is in WAL mode: probing its " + "\"-wal\" sidecar failed (%s). Refusing to read rather than risk serving a stale " + "snapshot. Checkpoint the database into a single file (PRAGMA " + "wal_checkpoint(TRUNCATE), or VACUUM INTO), or make the \"-wal\" URL reachable.", + path, ErrorData(e).RawMessage()); + } + wal_size = 0; // confirmed absent: the main file is checkpointed-complete + } + if (wal_size > 32) { + throw IOException("Cannot read remote SQLite database \"%s\" in WAL mode: its \"-wal\" sidecar " + "holds changes this read-only reader cannot apply. Checkpoint it into a single " + "file (PRAGMA wal_checkpoint(TRUNCATE), or VACUUM INTO) before serving it.", + path); + } + } else if (is_sqlite) { + // Rollback-journal mode: a hot "-journal" holds uncommitted changes a normal opener would roll + // back, which this read-only reader cannot. Mirror SQLite's hasHotJournal(): a journal with a + // nonzero first byte is hot. A clean commit removes it (DELETE), empties it (TRUNCATE), or + // zeroes its header (PERSIST), so a zeroed header passes. Keying on the first byte is + // conservative: a corrupt-header leftover is refused, but a dirty main file is never served. + const string journal_path = BuildSidecarUrl(path, "-journal"); + bool hot_journal = false; + try { + auto journal_handle = new_fs->OpenFile(OpenFileInfo(journal_path), flags); + if (journal_handle->GetFileSize() > 0) { + data_t first = 0; + journal_handle->Read(1, 0).CopyTo(data_ptr_cast(&first), 1); + hot_journal = first != 0; + } + } catch (const std::exception &e) { + if (!SidecarConfirmedAbsent(e)) { + throw IOException( + "Cannot verify whether remote SQLite database \"%s\" has a hot rollback journal: " + "probing its \"-journal\" sidecar failed (%s). Refusing to read rather than risk " + "serving an inconsistent snapshot. Serve a cleanly-closed database, or make the " + "\"-journal\" URL reachable.", + path, ErrorData(e).RawMessage()); + } + // confirmed absent: no journal, the main file is clean. + } + if (hot_journal) { + throw IOException("Cannot read remote SQLite database \"%s\": its \"-journal\" sidecar holds a " + "hot rollback journal from an interrupted write, which this read-only reader " + "cannot replay. Serve a cleanly-closed database.", + path); + } + } + } + + caching_fs = std::move(new_fs); + caching_handle = std::move(new_handle); + cached_file_size = size; + initialized = true; +} + +int DuckDBCachedFile::Read(void *buffer, int amount, sqlite3_int64 offset) { + if (offset < 0 || amount < 0) { + return SQLITE_IOERR_READ; + } + + if (!buffer || amount == 0) { + return SQLITE_OK; + } + + EnsureInitialized(); + + if (!caching_handle) { + return SQLITE_IOERR_READ; + } + + const sqlite3_int64 file_size = cached_file_size; + if (file_size < 0) { + return SQLITE_IOERR_READ; + } + + if (offset >= file_size) { + memset(buffer, 0, amount); + return SQLITE_IOERR_SHORT_READ; + } + + const sqlite3_int64 available_bytes = file_size - offset; + const int bytes_to_read = (available_bytes < amount) ? static_cast(available_bytes) : amount; + + auto buffer_group = caching_handle->Read(static_cast(bytes_to_read), offset); + buffer_group.CopyTo(data_ptr_cast(buffer), static_cast(bytes_to_read)); + + if (bytes_to_read < amount) { + memset(static_cast(buffer) + bytes_to_read, 0, amount - bytes_to_read); + } + + return (bytes_to_read < amount) ? SQLITE_IOERR_SHORT_READ : SQLITE_OK; +} + +sqlite3_int64 DuckDBCachedFile::GetFileSize() { + EnsureInitialized(); + return cached_file_size; +} + +//===--------------------------------------------------------------------===// +// SQLiteDuckDBCacheVFS Implementation +//===--------------------------------------------------------------------===// + +bool SQLiteDuckDBCacheVFS::CanHandlePath(const string &path) { + // FileSystem::IsRemoteFile matches every remote filesystem DuckDB or its extensions register + // (http/https/s3/gcs/azure/hf/...). Plain local files on a native build stay on native SQLite for + // the read-write, locking, and WAL that a read-only VFS cannot provide. + if (FileSystem::IsRemoteFile(path)) { + return true; + } +#if defined(__EMSCRIPTEN__) + // On WASM there is no native file access, so route local paths through DuckDB's FileSystem too. + return true; +#else + return false; +#endif +} + +void SQLiteDuckDBCacheVFS::Register(ClientContext &context) { + auto ®istry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + // Reclaim zero-handle wrappers retired by an earlier teardown (bounds the retired list under churn). + ReapIdleRetiredWrappers(registry_data); + + auto it = registry_data.registry.find(&context); + if (it != registry_data.registry.end()) { + return; + } + + sqlite3_vfs *default_vfs = sqlite3_vfs_find(nullptr); + if (!default_vfs) { + throw InternalException("Failed to find default SQLite VFS - SQLite may not be properly initialized"); + } + + auto wrapper = make_uniq(); + wrapper->context = &context; + + // sqlite3_malloc: ownership must match SQLite's allocator when the name crosses DLL boundaries. + const string temp_name = GetUniqueVFSName(); + wrapper->vfs_name = static_cast(sqlite3_malloc64(temp_name.length() + 1)); + if (!wrapper->vfs_name) { + throw InternalException("Failed to allocate memory for VFS name"); + } + memcpy(wrapper->vfs_name, temp_name.c_str(), temp_name.length() + 1); + + InitializeIOMethods(wrapper->io_methods); + + memset(&wrapper->base, 0, sizeof(wrapper->base)); + wrapper->base.iVersion = 1; + wrapper->base.szOsFile = sizeof(SQLiteDuckDBCachedFile); + // Our pathnames are arbitrary-length URLs. The default OS VFS's mxPathname (512 on unix) is too + // small for presigned S3/GCS/Azure URLs, which SQLite rejects (its journal-path guard checks + // nPathname+8 > mxPathname) before xOpen runs. + wrapper->base.mxPathname = 8192; + wrapper->base.zName = wrapper->vfs_name; + wrapper->base.pAppData = wrapper.get(); + + wrapper->base.xOpen = Open; + wrapper->base.xDelete = Delete; + wrapper->base.xAccess = Access; + wrapper->base.xFullPathname = FullPathname; + wrapper->base.xDlOpen = DlOpen; + wrapper->base.xDlError = DlError; + wrapper->base.xDlSym = DlSym; + wrapper->base.xDlClose = DlClose; + wrapper->base.xRandomness = Randomness; + wrapper->base.xSleep = Sleep; + wrapper->base.xCurrentTime = CurrentTime; + wrapper->base.xGetLastError = GetLastError; + + // Take ownership in the registry before handing the VFS to SQLite, so a throwing map insert leaves + // nothing registered and SQLite's global VFS list never points at freed wrapper memory. + auto *wrapper_ptr = wrapper.get(); + registry_data.registry[&context] = std::move(wrapper); + + int rc = sqlite3_vfs_register(&wrapper_ptr->base, 0); + if (rc != SQLITE_OK) { + registry_data.registry.erase(&context); // destroys the wrapper; wrapper_ptr is dead past here + wrapper_ptr = nullptr; // guard against any future use after the erase + throw InternalException("Failed to register DuckDB Cache VFS: %s", sqlite3_errstr(rc)); + } +} + +void SQLiteDuckDBCacheVFS::Unregister(ClientContext &context) { + auto ®istry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + // Reclaim wrappers retired (with zero handles) by an earlier teardown before retiring this one. + ReapIdleRetiredWrappers(registry_data); + + auto it = registry_data.registry.find(&context); + if (it == registry_data.registry.end()) { + return; + } + // Remove the VFS from SQLite's name table so no new connection can open against it. This does + // not affect connections already open against it - they keep their sqlite3_vfs pointer. + sqlite3_vfs_unregister(&it->second->base); + // Retire, never free synchronously: an in-flight xOpen may still hold this wrapper (see the + // Concurrency Design note). The last Close() or the next ReapIdleRetiredWrappers() reaps it. + it->second->pending_unregister = true; + registry_data.retired.push_back(std::move(it->second)); + registry_data.registry.erase(it); +} + +string SQLiteDuckDBCacheVFS::GetVFSNameForContext(ClientContext &context) { + auto ®istry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + auto it = registry_data.registry.find(&context); + if (it != registry_data.registry.end() && it->second->vfs_name) { + // Copy into a std::string while holding the lock, so the returned name stays valid even if + // another thread unregisters this context's VFS (which frees vfs_name) afterwards. + return string(it->second->vfs_name); + } + + // Register() is always called before this for a remote open; reaching here means a logic error + // (e.g. a caller skipped Register). Fail loudly rather than returning the static default name, + // which could match a stale VFS from a previously-closed context or no VFS at all. + throw InternalException("DuckDB cache VFS not registered for this context"); +} + +string SQLiteDuckDBCacheVFS::GetLastErrorForContext(ClientContext &context) { + auto ®istry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + auto it = registry_data.registry.find(&context); + if (it != registry_data.registry.end()) { + // Copy the message out while holding the lock (the wrapper, and its mutex-protected + // string, can be freed by Unregister once the lock is released). + return it->second->GetLastError(); + } + return string(); +} + +void SQLiteDuckDBCacheVFS::ClearLastErrorForContext(ClientContext &context) { + auto ®istry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + + auto it = registry_data.registry.find(&context); + if (it != registry_data.registry.end()) { + // Reset before an open attempt so any error read afterwards belongs to THIS attempt + // (the recorded message persists across opens on the same context otherwise). + it->second->SetLastError(""); + } +} + +//===--------------------------------------------------------------------===// +// VFS Methods +//===--------------------------------------------------------------------===// + +// System-level ops (randomness, sleep, time) have no remote file I/O; delegate to the default VFS. +#define DELEGATE_TO_DEFAULT_VFS(method_name, ...) \ + sqlite3_vfs *default_vfs = sqlite3_vfs_find(nullptr); \ + if (default_vfs && default_vfs->method_name) { \ + return default_vfs->method_name(default_vfs, __VA_ARGS__); \ + } \ + return SQLITE_OK; + +// Drop one open-handle refcount; if it was the last handle on a retired wrapper, reap it. Acquires +// registry_mutex internally (so the decrement and the reap decision cannot interleave with +// Unregister()'s count check); callers must not already hold it. +static void ReleaseWrapperHandle(DuckDBVFSWrapper *wrapper) { + if (!wrapper) { + return; + } + auto ®istry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + // Each live handle holds one count (reserved in Open() before io_methods is published). A decrement + // at zero is a double-release; catch it before the unsigned wrap turns into an un-reapable leak. + D_ASSERT(wrapper->open_file_count > 0); + if (--wrapper->open_file_count == 0 && + wrapper->pending_unregister) { + for (auto it = registry_data.retired.begin(); it != registry_data.retired.end(); ++it) { + if (it->get() == wrapper) { + registry_data.retired.erase(it); + break; + } + } + } +} + +int SQLiteDuckDBCacheVFS::Open(sqlite3_vfs *vfs, const char *filename, sqlite3_file *file, int flags, int *out_flags) { + auto *wrapper = vfs && vfs->pAppData ? static_cast(vfs->pAppData) : nullptr; + + return SafeVFSCall( + SQLITE_CANTOPEN, + [&]() { + if (!vfs || !filename || !file || (flags & SQLITE_OPEN_READONLY) == 0) { + return SQLITE_CANTOPEN; + } + + // SQLite's VFS contract requires xOpen to leave file->pMethods == nullptr on every failure + // path, so xClose is never invoked on a half-open handle. Set it explicitly rather than + // relying on SQLite to pre-zero the struct; the success path overwrites it below. + file->pMethods = nullptr; + + if (vfs->szOsFile < static_cast(sizeof(SQLiteDuckDBCachedFile))) { + return SQLITE_CANTOPEN; + } + + auto *duckdb_file = reinterpret_cast(file); + + if (!vfs->pAppData) { + return SQLITE_CANTOPEN; + } + + ClientContext *context = wrapper->context; + + if (!context || !context->db) { + return SQLITE_CANTOPEN; + } + + // Reserve the refcount BEFORE publishing io_methods into the sqlite3_file, so a racing + // Unregister() observes the handle and retires the wrapper instead of freeing it under us. + // Released by Close() (or on the construction-failure path below). + { + auto ®istry_data = GetVFSRegistryData(); + lock_guard lock(registry_data.registry_mutex); + wrapper->open_file_count++; + } + + duckdb_file->base.pMethods = &wrapper->io_methods; + duckdb_file->duckdb_file = nullptr; + duckdb_file->wrapper = wrapper; + + try { + duckdb_file->duckdb_file = new DuckDBCachedFile(*context, filename); + } catch (...) { + duckdb_file->base.pMethods = nullptr; + duckdb_file->duckdb_file = nullptr; + duckdb_file->wrapper = nullptr; + ReleaseWrapperHandle(wrapper); + return SQLITE_CANTOPEN; + } + + // Defer SQLite-header validation to first read: no DuckDB I/O inside the open callback. + + if (out_flags) { + *out_flags = flags; + } + + return SQLITE_OK; + }, + wrapper, filename); +} + +int SQLiteDuckDBCacheVFS::Delete(sqlite3_vfs *vfs, const char *filename, int sync_dir) { + return SQLITE_IOERR_DELETE; +} + +int SQLiteDuckDBCacheVFS::Access(sqlite3_vfs *vfs, const char *filename, int flags, int *result) { + auto *wrapper = vfs && vfs->pAppData ? static_cast(vfs->pAppData) : nullptr; + + return SafeVFSCall( + SQLITE_IOERR, + [&]() { + if (!filename || !result) { + return SQLITE_IOERR; + } + + // This VFS serves read-only, immutable databases: DeviceCharacteristics() returns + // SQLITE_IOCAP_IMMUTABLE, so SQLite treats the main file as fresh and skips hot-journal/WAL + // recovery. We already verified at open time that no live -wal/-journal sidecar is present. Report "does not + // exist" for every probe instead of issuing a network round-trip to check. + *result = 0; + + return SQLITE_OK; + }, + wrapper, filename); +} + +int SQLiteDuckDBCacheVFS::FullPathname(sqlite3_vfs *vfs, const char *filename, int out_size, char *out_buf) { + return SafeVFSCall(SQLITE_IOERR, [&]() { + if (!filename || !out_buf || out_size <= 0) { + return SQLITE_IOERR; + } + + // Remote paths are already absolute URLs - return as-is. If the URL does not fit SQLite's + // buffer (sized from mxPathname), fail clearly: silently truncating would issue the HTTP + // request against a corrupted URL. + if (static_cast(strlen(filename)) >= out_size) { + return SQLITE_CANTOPEN; + } + strncpy(out_buf, filename, out_size - 1); + out_buf[out_size - 1] = '\0'; + return SQLITE_OK; + }); +} + +int SQLiteDuckDBCacheVFS::Randomness(sqlite3_vfs *vfs, int bytes, char *out) { + DELEGATE_TO_DEFAULT_VFS(xRandomness, bytes, out); +} + +int SQLiteDuckDBCacheVFS::Sleep(sqlite3_vfs *vfs, int microseconds) { + DELEGATE_TO_DEFAULT_VFS(xSleep, microseconds); +} + +int SQLiteDuckDBCacheVFS::CurrentTime(sqlite3_vfs *vfs, double *time) { + DELEGATE_TO_DEFAULT_VFS(xCurrentTime, time); +} + +void *SQLiteDuckDBCacheVFS::DlOpen(sqlite3_vfs *vfs, const char *filename) { + return nullptr; +} + +void SQLiteDuckDBCacheVFS::DlError(sqlite3_vfs *vfs, int bytes, char *err_msg) { + try { + if (err_msg && bytes > 0) { + strncpy(err_msg, "Dynamic loading not supported through the DuckDB filesystem VFS", bytes - 1); + err_msg[bytes - 1] = '\0'; + } + } catch (...) { + // Best effort - if we can't even set the error message, just return + if (err_msg && bytes > 0) { + err_msg[0] = '\0'; + } + } +} + +void (*SQLiteDuckDBCacheVFS::DlSym(sqlite3_vfs *vfs, void *handle, const char *symbol))(void) { + return nullptr; +} + +void SQLiteDuckDBCacheVFS::DlClose(sqlite3_vfs *vfs, void *handle) { +} + +int SQLiteDuckDBCacheVFS::GetLastError(sqlite3_vfs *vfs, int bytes, char *err_msg) { + return SafeVFSCall(0, [&]() { + if (!vfs || !vfs->pAppData || !err_msg || bytes <= 0) { + return 0; + } + + auto *wrapper = static_cast(vfs->pAppData); + const string error = wrapper->GetLastError(); + + if (error.empty()) { + err_msg[0] = '\0'; + return 0; + } + + strncpy(err_msg, error.c_str(), bytes - 1); + err_msg[bytes - 1] = '\0'; + + // SQLite uses xGetLastError's return value as an OS error code; we have none, so return 0. + // The rich error reaches callers through GetLastErrorForContext, not this hook. + return 0; + }); +} + +//===--------------------------------------------------------------------===// +// File Methods +//===--------------------------------------------------------------------===// + +int SQLiteDuckDBCacheVFS::Close(sqlite3_file *file) { + return SafeVFSCall(SQLITE_OK, [&]() { + if (file) { + auto *duckdb_file = reinterpret_cast(file); + delete duckdb_file->duckdb_file; + duckdb_file->duckdb_file = nullptr; + + // Release our handle; if this was the last one on a retired wrapper, ReleaseWrapperHandle + // reaps it here - the wrapper was kept alive precisely so this Close() stays valid. + auto *wrapper = duckdb_file->wrapper; + duckdb_file->wrapper = nullptr; + ReleaseWrapperHandle(wrapper); + } + return SQLITE_OK; + }); +} + +int SQLiteDuckDBCacheVFS::Read(sqlite3_file *file, void *buffer, int amount, sqlite3_int64 offset) { + // Record the rich error (httpfs status/URL) on the wrapper so HandleOpenError can surface it; + // the per-file wrapper is set in Open(). + auto *read_file = file ? reinterpret_cast(file) : nullptr; + auto *wrapper = read_file ? read_file->wrapper : nullptr; + return SafeVFSCall( + SQLITE_IOERR_READ, + [&]() { + if (!file || !buffer) { + return SQLITE_IOERR_READ; + } + + auto *duckdb_file = reinterpret_cast(file); + if (!duckdb_file->duckdb_file) { + return SQLITE_IOERR_READ; + } + + return duckdb_file->duckdb_file->Read(buffer, amount, offset); + }, + wrapper); +} + +int SQLiteDuckDBCacheVFS::FileSize(sqlite3_file *file, sqlite3_int64 *size) { + auto *size_file = file ? reinterpret_cast(file) : nullptr; + auto *wrapper = size_file ? size_file->wrapper : nullptr; + return SafeVFSCall( + SQLITE_IOERR, + [&]() { + if (!file || !size) { + return SQLITE_IOERR; + } + + auto *duckdb_file = reinterpret_cast(file); + if (!duckdb_file->duckdb_file) { + return SQLITE_IOERR; + } + + const sqlite3_int64 file_size = duckdb_file->duckdb_file->GetFileSize(); + if (file_size < 0) { + // -1 means the underlying open/size lookup failed (e.g. a network error). Surface an + // I/O error rather than reporting a bogus size and letting SQLite read a phantom file. + return SQLITE_IOERR; + } + *size = file_size; + return SQLITE_OK; + }, + wrapper); +} + +int SQLiteDuckDBCacheVFS::Write(sqlite3_file *file, const void *buffer, int amount, sqlite3_int64 offset) { + return SQLITE_READONLY; +} + +int SQLiteDuckDBCacheVFS::Truncate(sqlite3_file *file, sqlite3_int64 size) { + return SQLITE_READONLY; +} + +int SQLiteDuckDBCacheVFS::Sync(sqlite3_file *file, int flags) { + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::Lock(sqlite3_file *file, int level) { + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::Unlock(sqlite3_file *file, int level) { + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::CheckReservedLock(sqlite3_file *file, int *result) { + if (result) { + *result = 0; + } + return SQLITE_OK; +} + +int SQLiteDuckDBCacheVFS::FileControl(sqlite3_file *file, int op, void *arg) { + return SQLITE_NOTFOUND; +} + +int SQLiteDuckDBCacheVFS::SectorSize(sqlite3_file *file) { + return SQLITE_SECTOR_SIZE; +} + +int SQLiteDuckDBCacheVFS::DeviceCharacteristics(sqlite3_file *file) { + // IMMUTABLE tells SQLite the file never changes for the life of the connection, so it skips + // locking and hot-journal/WAL recovery. This is an axiom the caller must honor, not a property + // this VFS verifies: if the remote object is mutated mid-query (or a CDN / eventually-consistent + // store serves a stale block), SQLite will not detect it and may return wrong results. Remote + // SQLite databases are expected to be served as static, checkpointed snapshots. + return SQLITE_IOCAP_IMMUTABLE; +} + +} // namespace duckdb diff --git a/src/sqlite_extension.cpp b/src/sqlite_extension.cpp index a534259..bf60f06 100644 --- a/src/sqlite_extension.cpp +++ b/src/sqlite_extension.cpp @@ -4,6 +4,7 @@ #include "duckdb.hpp" #include "sqlite_db.hpp" +#include "sqlite_duckdb_vfs_cache.hpp" #include "sqlite_scanner.hpp" #include "sqlite_storage.hpp" #include "sqlite_scanner_extension.hpp" @@ -11,6 +12,7 @@ #include "duckdb/catalog/catalog.hpp" #include "duckdb/main/extension/extension_loader.hpp" #include "duckdb/parser/parsed_data/create_table_function_info.hpp" +#include "duckdb/planner/extension_callback.hpp" using namespace duckdb; @@ -20,6 +22,15 @@ static void SetSqliteDebugQueryPrint(ClientContext &context, SetScope scope, Val SQLiteDB::DebugSetPrintQueries(BooleanValue::Get(parameter)); } +// Unregisters this connection's remote VFS (registered lazily on the first remote open) when +// the connection closes, so the per-context wrapper does not outlive its ClientContext. +class SQLiteVFSCleanupCallback : public ExtensionCallback { +public: + void OnConnectionClosed(ClientContext &context) override { + SQLiteDuckDBCacheVFS::Unregister(context); + } +}; + static void LoadInternal(ExtensionLoader &loader) { SqliteScanFunction sqlite_fun; loader.RegisterFunction(sqlite_fun); @@ -41,6 +52,7 @@ static void LoadInternal(ExtensionLoader &loader) { LogicalType::BOOLEAN, Value::BOOLEAN(false)); StorageExtension::Register(config, "sqlite_scanner", make_shared_ptr()); + ExtensionCallback::Register(config, make_shared_ptr()); } void SqliteScannerExtension::Load(ExtensionLoader &loader) { diff --git a/src/sqlite_scanner.cpp b/src/sqlite_scanner.cpp index 4268928..f4852be 100644 --- a/src/sqlite_scanner.cpp +++ b/src/sqlite_scanner.cpp @@ -56,7 +56,7 @@ static unique_ptr SqliteBind(ClientContext &context, TableFunction SQLiteStatement stmt; SQLiteOpenOptions options; options.access_mode = AccessMode::READ_ONLY; - db = SQLiteDB::Open(result->file_name, options); + db = SQLiteDB::Open(result->file_name, options, context); ColumnList columns; vector> constraints; @@ -121,7 +121,7 @@ static void SqliteInitInternal(ClientContext &context, const SqliteBindData &bin if (!local_state.db) { SQLiteOpenOptions options; options.access_mode = AccessMode::READ_ONLY; - local_state.owned_db = SQLiteDB::Open(bind_data.file_name.c_str(), options); + local_state.owned_db = SQLiteDB::Open(bind_data.file_name.c_str(), options, context); local_state.db = &local_state.owned_db; } string sql = SqliteGetScanSQL(bind_data, local_state.column_ids); @@ -421,7 +421,7 @@ static void AttachFunction(ClientContext &context, TableFunctionInput &data_p, D SQLiteOpenOptions options; options.access_mode = AccessMode::READ_ONLY; - SQLiteDB db = SQLiteDB::Open(data.file_name, options); + SQLiteDB db = SQLiteDB::Open(data.file_name, options, context); auto dconn = Connection(context.db->GetDatabase(context)); { auto tables = db.GetTables(); diff --git a/src/storage/sqlite_transaction.cpp b/src/storage/sqlite_transaction.cpp index 08ea220..e103279 100644 --- a/src/storage/sqlite_transaction.cpp +++ b/src/storage/sqlite_transaction.cpp @@ -12,6 +12,7 @@ #include "duckdb/parser/parser.hpp" #include "duckdb/parser/parsed_expression_iterator.hpp" #include "duckdb/parser/expression/columnref_expression.hpp" +#include "duckdb/common/file_system.hpp" namespace duckdb { @@ -52,15 +53,20 @@ void SQLiteCatalogMap::EraseEntry(const string &entry_name) { } SQLiteTransaction::SQLiteTransaction(SQLiteCatalog &sqlite_catalog, TransactionManager &manager, ClientContext &context) - : Transaction(manager, context), sqlite_catalog(sqlite_catalog) { + : Transaction(manager, context), sqlite_catalog(sqlite_catalog), db(nullptr) { if (sqlite_catalog.InMemory()) { - // in-memory database - get a reference to the in-memory connection db = sqlite_catalog.GetInMemoryDatabase(); - } else { - // on-disk database - open a new database connection - owned_db = SQLiteDB::Open(sqlite_catalog.path, sqlite_catalog.options, true); + initialized.store(true, std::memory_order_release); + } else if (!FileSystem::IsRemoteFile(sqlite_catalog.path)) { + // local on-disk database - open eagerly (cheap; surfaces open errors at attach time) + owned_db = SQLiteDB::Open(sqlite_catalog.path, sqlite_catalog.options, context, true); db = &owned_db; + initialized.store(true, std::memory_order_release); } + // remote database - defer the connection open and BEGIN to the first GetDB(). A remote open + // issues uncached network probes (the -wal/-journal sidecar HEADs); deferring runs that I/O on + // first use rather than eagerly in StartTransaction, and skips it for a database that is attached + // but never accessed. catalog_map = make_uniq(); } @@ -69,16 +75,47 @@ SQLiteTransaction::~SQLiteTransaction() { } void SQLiteTransaction::Start() { - db->Execute("BEGIN TRANSACTION"); + // Local/in-memory transactions begin here; remote transactions are deferred (db is not yet + // open) so this is a no-op for them and BEGIN runs in GetDB() on first use. + if (initialized.load(std::memory_order_acquire) && !started) { + db->Execute("BEGIN TRANSACTION"); + started = true; + } } void SQLiteTransaction::Commit() { - db->Execute("COMMIT"); + // Skip when the transaction never began (e.g. a remote DB attached but never queried). + if (started) { + db->Execute("COMMIT"); + } } void SQLiteTransaction::Rollback() { - db->Execute("ROLLBACK"); + if (started) { + db->Execute("ROLLBACK"); + } } SQLiteDB &SQLiteTransaction::GetDB() { + // Fast path: local and in-memory databases are initialized in the constructor. + if (initialized.load(std::memory_order_acquire)) { + return *db; + } + // Slow path (remote only): open the connection and begin the transaction on first use, outside + // the MetaTransaction lock. A per-transaction lock guards the init (parallel scans may race it). + lock_guard guard(init_lock); + if (!initialized.load(std::memory_order_relaxed)) { + auto client_context = context.lock(); + if (!client_context) { + throw TransactionException("ClientContext expired before the remote SQLite connection could be opened"); + } + auto new_db = SQLiteDB::Open(sqlite_catalog.path, sqlite_catalog.options, *client_context, true); + new_db.Execute("BEGIN TRANSACTION"); + owned_db = std::move(new_db); + db = &owned_db; + // Set started before publishing initialized: the release-store on initialized also makes + // this plain write visible to any thread that later observes initialized via acquire. + started = true; + initialized.store(true, std::memory_order_release); + } return *db; } @@ -148,7 +185,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr return entry; } // catalog entry not found - look up table in main SQLite database - auto type = db->GetEntryType(entry_name); + auto type = GetDB().GetEntryType(entry_name); if (type == CatalogType::INVALID) { // no table or view found return nullptr; @@ -162,7 +199,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr if (context.lock()->TryGetCurrentSetting("sqlite_all_varchar", sqlite_all_varchar)) { all_varchar = BooleanValue::Get(sqlite_all_varchar); } - db->GetTableInfo(entry_name, info.columns, info.constraints, all_varchar); + GetDB().GetTableInfo(entry_name, info.columns, info.constraints, all_varchar); D_ASSERT(!info.columns.empty()); result = make_uniq(sqlite_catalog, sqlite_catalog.GetMainSchema(), info, all_varchar); @@ -170,7 +207,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr } case CatalogType::VIEW_ENTRY: { string sql; - db->GetViewInfo(entry_name, sql); + GetDB().GetViewInfo(entry_name, sql); unique_ptr view_info; try { @@ -190,7 +227,7 @@ optional_ptr SQLiteTransaction::GetCatalogEntry(const string &entr case CatalogType::INDEX_ENTRY: { string table_name; string sql; - db->GetIndexInfo(entry_name, sql, table_name); + GetDB().GetIndexInfo(entry_name, sql, table_name); if (sql.empty()) { throw InternalException("SQL is empty"); } @@ -235,7 +272,7 @@ string GetDropSQL(CatalogType type, const string &table_name, bool cascade) { void SQLiteTransaction::DropEntry(CatalogType type, const string &table_name, bool cascade) { catalog_map->EraseEntry(table_name); - db->Execute(GetDropSQL(type, table_name, cascade)); + GetDB().Execute(GetDropSQL(type, table_name, cascade)); } } // namespace duckdb diff --git a/src/storage/sqlite_transaction_manager.cpp b/src/storage/sqlite_transaction_manager.cpp index d800794..2d8d44f 100644 --- a/src/storage/sqlite_transaction_manager.cpp +++ b/src/storage/sqlite_transaction_manager.cpp @@ -19,16 +19,31 @@ Transaction &SQLiteTransactionManager::StartTransaction(ClientContext &context) ErrorData SQLiteTransactionManager::CommitTransaction(ClientContext &context, Transaction &transaction) { auto &sqlite_transaction = transaction.Cast(); sqlite_transaction.Commit(); - lock_guard l(transaction_lock); - transactions.erase(transaction); + ExtractAndCloseAfterUnlock(transaction); return ErrorData(); } void SQLiteTransactionManager::RollbackTransaction(Transaction &transaction) { auto &sqlite_transaction = transaction.Cast(); sqlite_transaction.Rollback(); - lock_guard l(transaction_lock); - transactions.erase(transaction); + ExtractAndCloseAfterUnlock(transaction); +} + +void SQLiteTransactionManager::ExtractAndCloseAfterUnlock(Transaction &transaction) { + // Destroy the SQLite connection with transaction_lock released: ~SQLiteTransaction -> sqlite3_close_v2 + // runs connection teardown that can re-enter paths taking transaction_lock, so holding it across the + // close risks a lock-order inversion. Extract the transaction under the lock; let the moved-out + // unique_ptr destruct after the lock is dropped. + unique_ptr to_close; + { + lock_guard l(transaction_lock); + auto entry = transactions.find(transaction); + if (entry == transactions.end()) { + return; + } + to_close = std::move(entry->second); + transactions.erase(entry); + } } void SQLiteTransactionManager::Checkpoint(ClientContext &context, bool force) { diff --git a/test/sql/scanner/http_sqlite_00_vfs_registration.test b/test/sql/scanner/http_sqlite_00_vfs_registration.test new file mode 100644 index 0000000..efcebf1 --- /dev/null +++ b/test/sql/scanner/http_sqlite_00_vfs_registration.test @@ -0,0 +1,48 @@ +# name: test/sql/scanner/http_sqlite_00_vfs_registration.test +# description: HTTP VFS registration and URL-scheme handling for the SQLite scanner +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# The custom VFS is registered and a valid remote SQLite database opens through it. +query I +SELECT COUNT(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'category'); +---- +16 + +# A reachable URL whose body is not a SQLite database errors cleanly (no crash). +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/not_a_database.txt', 'test'); +---- +file is not a database + +# A 404 on a remote path surfaces as a clean open failure. +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/status/404', 'test'); +---- +unable to open database file + +# S3 URLs are recognized and routed through httpfs; without credentials/region this surfaces +# as a permission error rather than a generic open failure (confirms the scheme is handled). +statement error +SELECT * FROM sqlite_scan('s3://bucket/test.db', 'test'); +---- +access permission denied + +# A bare nonexistent local path resolves through native local file handling, not the VFS. +statement error +SELECT * FROM sqlite_scan('nonexistent.db', 'test'); +---- +unable to open database file + +# A long URL (>512 chars) must still open. SQLite's default VFS caps pathnames at 512; presigned +# S3/GCS/Azure URLs routinely exceed that, so the VFS sets mxPathname for URLs. The padded query +# string pushes this URL past 512 while still resolving to the sakila fixture. +query I +SELECT COUNT(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db?pad=xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx', 'category'); +---- +16 diff --git a/test/sql/scanner/http_sqlite_01_basic_scan.test b/test/sql/scanner/http_sqlite_01_basic_scan.test new file mode 100644 index 0000000..a07c628 --- /dev/null +++ b/test/sql/scanner/http_sqlite_01_basic_scan.test @@ -0,0 +1,26 @@ +# name: test/sql/scanner/http_sqlite_01_basic_scan.test +# description: Basic HTTP SQLite scan functionality +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# Basic remote SQLite query +query I +SELECT COUNT(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'category'); +---- +16 + +# Query with WHERE clause and ORDER BY +query IT +SELECT category_id, name FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'category') +WHERE name LIKE 'C%' +ORDER BY name +LIMIT 3; +---- +3 Children +4 Classics +5 Comedy diff --git a/test/sql/scanner/http_sqlite_02_concurrent_scans.test b/test/sql/scanner/http_sqlite_02_concurrent_scans.test new file mode 100644 index 0000000..6f33a87 --- /dev/null +++ b/test/sql/scanner/http_sqlite_02_concurrent_scans.test @@ -0,0 +1,39 @@ +# name: test/sql/scanner/http_sqlite_02_concurrent_scans.test +# description: Multiple concurrent scans of remote SQLite database +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# These concurrent scans need a robust HTTP server: they open several remote connections at once, which +# the stdlib test server does not serve reliably. Runs under the rclone-backed server (which sets this). +require-env SQLITE_HTTP_ROBUST + +statement ok +SET threads=4; + +# A parallel sqlite_scan opens a separate remote connection per row-group slice, so this exercises +# multi-threaded scanning through the per-context VFS and its shared external file cache (after the +# first block fetch, reads are served from cache). The result must stay correct under parallelism. +query I +SELECT COUNT(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'film'); +---- +1000 + +# Aggregation over a parallel scan +query I +SELECT SUM(length) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'film'); +---- +115272 + +# Multiple concurrent scans of different tables in a single query +query III +SELECT + (SELECT COUNT(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'category')) as categories, + (SELECT COUNT(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'language')) as languages, + (SELECT COUNT(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'film')) as films; +---- +16 6 1000 diff --git a/test/sql/scanner/http_sqlite_03_joins.test b/test/sql/scanner/http_sqlite_03_joins.test new file mode 100644 index 0000000..3b3672d --- /dev/null +++ b/test/sql/scanner/http_sqlite_03_joins.test @@ -0,0 +1,23 @@ +# name: test/sql/scanner/http_sqlite_03_joins.test +# description: Joins between remote SQLite tables +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# Join between remote tables +query TI +SELECT c.name as Category, COUNT(*) as FilmCount +FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'category') c +JOIN sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'film_category') fc +ON c.category_id = fc.category_id +GROUP BY c.name +ORDER BY FilmCount DESC, c.name +LIMIT 3; +---- +Sports 74 +Foreign 73 +Family 69 diff --git a/test/sql/scanner/http_sqlite_04_attach.test b/test/sql/scanner/http_sqlite_04_attach.test new file mode 100644 index 0000000..7e75e57 --- /dev/null +++ b/test/sql/scanner/http_sqlite_04_attach.test @@ -0,0 +1,48 @@ +# name: test/sql/scanner/http_sqlite_04_attach.test +# description: ATTACH remote SQLite database +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# Attach remote database +statement ok +ATTACH '{SQLITE_HTTP_TEST_URL}/sakila.db' AS http_db (TYPE sqlite); + +# Query attached database +query I +SELECT COUNT(*) FROM http_db.film; +---- +1000 + +# Verify actual column values round-trip correctly, not just row counts +query ITI +SELECT film_id, title, length FROM http_db.film WHERE film_id = 1; +---- +1 ACADEMY DINOSAUR 86 + +# Remote databases are read-only - writes must be rejected, not silently dropped +statement error +CREATE TABLE http_db.t(i INTEGER); +---- +read-only mode + +statement error +INSERT INTO http_db.category VALUES (9999, 'x', NULL); +---- +read-only mode + +statement ok +DETACH http_db; + +# ATTACH then DETACH with no query in between. The remote connection and BEGIN are deferred +# to the first GetDB(), so this exercises transaction teardown when the transaction never started +# (Commit/Rollback must not dereference a connection that was never opened). +statement ok +ATTACH '{SQLITE_HTTP_TEST_URL}/sakila.db' AS http_db_unused (TYPE sqlite); + +statement ok +DETACH http_db_unused; diff --git a/test/sql/scanner/http_sqlite_05_complex_queries.test b/test/sql/scanner/http_sqlite_05_complex_queries.test new file mode 100644 index 0000000..04c6aad --- /dev/null +++ b/test/sql/scanner/http_sqlite_05_complex_queries.test @@ -0,0 +1,47 @@ +# name: test/sql/scanner/http_sqlite_05_complex_queries.test +# description: Complex queries on attached remote SQLite database +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# First attach the database +statement ok +ATTACH '{SQLITE_HTTP_TEST_URL}/sakila.db' AS http_db (TYPE sqlite); + +# Complex aggregation on attached database +query TIR +SELECT c.name as Category, COUNT(*) as FilmCount, ROUND(AVG(f.length), 2) as AvgLength +FROM http_db.film f +JOIN http_db.film_category fc ON f.film_id = fc.film_id +JOIN http_db.category c ON fc.category_id = c.category_id +GROUP BY c.name +ORDER BY FilmCount DESC, c.name +LIMIT 3; +---- +Sports 74 128.2 +Foreign 73 121.7 +Family 69 114.78 + +# Window functions on attached database +query TTII +SELECT + a.first_name as FirstName, + a.last_name as LastName, + COUNT(fa.film_id) as FilmCount, + ROW_NUMBER() OVER (ORDER BY COUNT(fa.film_id) DESC, a.last_name, a.first_name) as ActorRank +FROM http_db.actor a +JOIN http_db.film_actor fa ON a.actor_id = fa.actor_id +GROUP BY a.first_name, a.last_name +ORDER BY FilmCount DESC, a.last_name, a.first_name +LIMIT 3; +---- +SUSAN DAVIS 54 1 +GINA DEGENERES 42 2 +WALTER TORN 41 3 + +statement ok +DETACH http_db; diff --git a/test/sql/scanner/http_sqlite_06_cte.test b/test/sql/scanner/http_sqlite_06_cte.test new file mode 100644 index 0000000..3335581 --- /dev/null +++ b/test/sql/scanner/http_sqlite_06_cte.test @@ -0,0 +1,25 @@ +# name: test/sql/scanner/http_sqlite_06_cte.test +# description: CTE with remote SQLite tables +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# CTE with remote tables +query TI +WITH top_categories AS ( + SELECT c.category_id, c.name, COUNT(*) as film_count + FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'category') c + JOIN sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'film_category') fc ON c.category_id = fc.category_id + GROUP BY c.category_id, c.name + ORDER BY film_count DESC + LIMIT 3 +) +SELECT name, film_count FROM top_categories ORDER BY name; +---- +Family 69 +Foreign 73 +Sports 74 diff --git a/test/sql/scanner/http_sqlite_07_consistency.test b/test/sql/scanner/http_sqlite_07_consistency.test new file mode 100644 index 0000000..5cd00b2 --- /dev/null +++ b/test/sql/scanner/http_sqlite_07_consistency.test @@ -0,0 +1,27 @@ +# name: test/sql/scanner/http_sqlite_07_consistency.test +# description: Consistency check for multiple scans +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# Consistency check - multiple scans return same results +query IIIT +WITH counts AS ( + SELECT 'scan1' as scan_id, COUNT(*) as count FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'film') + UNION ALL + SELECT 'scan2', COUNT(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'film') + UNION ALL + SELECT 'scan3', COUNT(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'film') +) +SELECT + COUNT(*) as scan_count, + MIN(count) as min_count, + MAX(count) as max_count, + CASE WHEN MIN(count) = MAX(count) THEN 'CONSISTENT' ELSE 'INCONSISTENT' END as consistency +FROM counts; +---- +3 1000 1000 CONSISTENT diff --git a/test/sql/scanner/http_sqlite_08_error_handling.test b/test/sql/scanner/http_sqlite_08_error_handling.test new file mode 100644 index 0000000..059c820 --- /dev/null +++ b/test/sql/scanner/http_sqlite_08_error_handling.test @@ -0,0 +1,37 @@ +# name: test/sql/scanner/http_sqlite_08_error_handling.test +# description: Error handling for remote SQLite databases (hermetic, via local test server) +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# A reachable URL whose content is not a SQLite database +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/not_a_database.txt', 'test'); +---- +file is not a database + +# A non-existent file (server responds 404) maps to "unable to open database file" +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/definitely_nonexistent.sqlite', 'test'); +---- +unable to open database file + +# A refused connection (reserved port 1) surfaces as a clean open failure, not a crash +statement error +SELECT * FROM sqlite_scan('http://127.0.0.1:1/db.sqlite', 'test'); +---- +unable to open database file + +# The VFS routes through DuckDB's FileSystem and honors its external-access control: with external +# access disabled, the open is blocked by the filesystem layer before any network request. +statement ok +SET enable_external_access=false; + +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'category'); +---- +file system operations are disabled by configuration diff --git a/test/sql/scanner/http_sqlite_09_http_errors.test b/test/sql/scanner/http_sqlite_09_http_errors.test new file mode 100644 index 0000000..564b344 --- /dev/null +++ b/test/sql/scanner/http_sqlite_09_http_errors.test @@ -0,0 +1,49 @@ +# name: test/sql/scanner/http_sqlite_09_http_errors.test +# description: HTTP status code mapping to SQLite errors (hermetic, via local test server) +# group: [sqlite_scanner] + +# The local test server (scripts/sqlite_http_test_server.py) exposes /status/ endpoints that +# return the given HTTP status, so the status -> SQLite error mapping is asserted deterministically +# without a live third-party service. httpfs surfaces only some statuses to the scanner: 404 and 429 +# arrive with their code (mapped to SQLITE_CANTOPEN / SQLITE_BUSY), while 403/401/5xx are collapsed to +# "HTTP 0" inside httpfs and fall through to the caller's default error. The cases below assert that. + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# 429 Too Many Requests is distinctly mapped to SQLITE_BUSY -> "database is locked". +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/status/429', 'test'); +---- +database is locked + +# 404 / 403 / 401 / 500 all fail to open; httpfs does not distinguish them at the scanner +# boundary, so they collapse to the generic SQLITE_CANTOPEN -> "unable to open database file". +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/status/404', 'test'); +---- +unable to open database file + +# The terse SQLite code is enriched with the underlying httpfs error (status + URL), not discarded. +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/status/404', 'test'); +---- +HTTP Error + +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/status/403', 'test'); +---- +unable to open database file + +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/status/401', 'test'); +---- +unable to open database file + +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/status/500', 'test'); +---- +unable to open database file diff --git a/test/sql/scanner/http_sqlite_10_attach_active_txn.test b/test/sql/scanner/http_sqlite_10_attach_active_txn.test new file mode 100644 index 0000000..a6596cd --- /dev/null +++ b/test/sql/scanner/http_sqlite_10_attach_active_txn.test @@ -0,0 +1,28 @@ +# name: test/sql/scanner/http_sqlite_10_attach_active_txn.test +# description: Connection torn down with an open transaction on a remote ATTACH must not use-after-free the VFS +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +statement ok +ATTACH '{SQLITE_HTTP_TEST_URL}/sakila.db' AS http_db (TYPE sqlite); + +# Open an explicit (non-autocommit) transaction so the remote SQLite connection stays open past +# the end of the test. At teardown DuckDB fires OnConnectionClosed (which unregisters the VFS) +# BEFORE rolling back the transaction (which closes the remote sqlite3 handle). Without refcounted +# VFS retirement the subsequent sqlite3_close runs xClose through freed io_methods -> use-after-free. +statement ok +BEGIN TRANSACTION; + +# Force the remote connection + handle to actually open inside the transaction. +query I +SELECT COUNT(*) FROM http_db.film; +---- +1000 + +# No COMMIT/ROLLBACK/DETACH: leave the transaction active so the connection is destroyed with the +# remote handle still open, exercising the refcounted VFS retirement on the teardown path. diff --git a/test/sql/scanner/http_sqlite_11_cross_database_join.test b/test/sql/scanner/http_sqlite_11_cross_database_join.test new file mode 100644 index 0000000..96d62fd --- /dev/null +++ b/test/sql/scanner/http_sqlite_11_cross_database_join.test @@ -0,0 +1,49 @@ +# name: test/sql/scanner/http_sqlite_11_cross_database_join.test +# description: Join across two attached remote SQLite databases; multi-handle VFS teardown with an open transaction +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# Attach the remote under two distinct aliases. The query string makes them distinct paths to DuckDB +# and the external file cache while resolving to the same file on the server, so both attached +# catalogs share the one per-context VFS wrapper, each with its own deferred-init transaction. +statement ok +ATTACH '{SQLITE_HTTP_TEST_URL}/sakila.db' AS db1 (TYPE sqlite); + +statement ok +ATTACH '{SQLITE_HTTP_TEST_URL}/sakila.db?alias=2' AS db2 (TYPE sqlite); + +# Join across the two attached remote databases. +query TI +SELECT c.name, COUNT(*) AS films +FROM db1.category c +JOIN db2.film_category fc ON c.category_id = fc.category_id +GROUP BY c.name +ORDER BY films DESC, c.name +LIMIT 3; +---- +Sports 74 +Foreign 73 +Family 69 + +# Open an explicit transaction touching both remote catalogs, then leave it active at teardown. The +# shared VFS wrapper has two open handles (open_file_count == 2); OnConnectionClosed unregisters it +# before the handles close, so the retire/reap path must keep it alive until the last close. +statement ok +BEGIN TRANSACTION; + +query I +SELECT COUNT(*) FROM db1.film; +---- +1000 + +query I +SELECT COUNT(*) FROM db2.film; +---- +1000 + +# No COMMIT/ROLLBACK/DETACH: the connection is destroyed with both remote handles open. diff --git a/test/sql/scanner/http_sqlite_12_wal.test b/test/sql/scanner/http_sqlite_12_wal.test new file mode 100644 index 0000000..6c84fae --- /dev/null +++ b/test/sql/scanner/http_sqlite_12_wal.test @@ -0,0 +1,32 @@ +# name: test/sql/scanner/http_sqlite_12_wal.test +# description: WAL-mode remote databases - reject a populated -wal sidecar, fail closed on an unverifiable one, allow a checkpointed file +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# A WAL-mode database whose -wal sidecar still holds frames: this VFS serves only the main file, so the +# open is rejected loudly instead of returning the stale last-checkpointed snapshot. +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/wal_mode_snapshot.db', 't'); +---- +WAL mode + +# A WAL-mode database that has been checkpointed has no populated -wal, so the main file is complete and +# reads correctly - the guard must not reject it. (Its -wal probe 404s: the one confirmed-absent case.) +query I +SELECT count(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/wal_checkpointed.db', 't'); +---- +200 + +# Fail closed when the sidecar's state is UNVERIFIABLE rather than confirmed-absent. wal_checkpointed +# would PASS with a 404 -wal, so /walerr/503/ (forcing 503 on the -wal probe; mimics a 403 on a bare +# presigned URL or a transient 5xx) is what uniquely drives the rejection - isolating the fail-closed +# path. The guard must not assume "checkpointed-complete" and serve a stale snapshot. Only 404 passes. +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/walerr/503/wal_checkpointed.db', 't'); +---- +Cannot verify diff --git a/test/sql/scanner/http_sqlite_13_journal.test b/test/sql/scanner/http_sqlite_13_journal.test new file mode 100644 index 0000000..d284140 --- /dev/null +++ b/test/sql/scanner/http_sqlite_13_journal.test @@ -0,0 +1,40 @@ +# name: test/sql/scanner/http_sqlite_13_journal.test +# description: Rollback-journal remote databases - reject any non-zeroed -journal header, allow a clean/zeroed one +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# A rollback-mode database whose "-journal" sidecar is hot (intact magic, first byte 0xd9) from an +# interrupted write. This VFS serves only the main file under SQLITE_IOCAP_IMMUTABLE, so SQLite would +# skip hot-journal recovery and read an inconsistent snapshot - reject loudly instead. +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/hot_journal.db', 't'); +---- +hot rollback journal + +# A "-journal" whose header's first byte is nonzero but is NOT the SQLite magic (a corrupt/partial +# leftover). The guard keys off the first byte (matching SQLite's hasHotJournal), not the exact magic, +# so this is treated as potentially-hot and rejected rather than served as clean. +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/partial_journal.db', 't'); +---- +hot rollback journal + +# A leftover "-journal" with a ZEROED header (a clean commit, or PERSIST journal_mode) is not hot: its +# first byte is 0. The guard reads the first header byte, so this cleanly-closed database is served. +query I +SELECT count(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/clean_journal.db', 't'); +---- +3 + +# Fail closed when the "-journal" state is UNVERIFIABLE (non-404), same as the WAL guard. clean_journal +# would PASS with a 404 sidecar, so /walerr/503/ (forcing 503 on the -journal probe) is what uniquely +# drives the rejection here - isolating the fail-closed path from content-based rejection. +statement error +SELECT * FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/walerr/503/clean_journal.db', 't'); +---- +Cannot verify diff --git a/test/sql/scanner/http_sqlite_14_temp_store.test b/test/sql/scanner/http_sqlite_14_temp_store.test new file mode 100644 index 0000000..9479174 --- /dev/null +++ b/test/sql/scanner/http_sqlite_14_temp_store.test @@ -0,0 +1,27 @@ +# name: test/sql/scanner/http_sqlite_14_temp_store.test +# description: VFS-backed connections keep SQLite temp storage in memory (no temp open through the read-only VFS) +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +statement ok +ATTACH '{SQLITE_HTTP_TEST_URL}/sakila.db' AS ck (TYPE sqlite, READ_ONLY); + +# temp_store == 2 (MEMORY): SQLite-side sorter/materialization spills stay in memory rather than being +# opened as a temp file through this read-only VFS (whose nameless temp xOpen fails "unable to open +# database file"). +query I +SELECT * FROM sqlite_query('ck', 'PRAGMA temp_store'); +---- +2 + +# A SQLite-side ORDER BY in sqlite_query (not DuckDB) over the remote database completes. (temp_store +# is asserted above; that is what keeps a larger sort's spill off the read-only VFS, not this query.) +query I +SELECT count(*) FROM sqlite_query('ck', 'SELECT payment_id FROM payment ORDER BY amount, payment_date, payment_id'); +---- +16049 diff --git a/test/sql/scanner/http_sqlite_15_concurrent_lifecycle.test b/test/sql/scanner/http_sqlite_15_concurrent_lifecycle.test new file mode 100644 index 0000000..8123d93 --- /dev/null +++ b/test/sql/scanner/http_sqlite_15_concurrent_lifecycle.test @@ -0,0 +1,49 @@ +# name: test/sql/scanner/http_sqlite_15_concurrent_lifecycle.test +# description: Concurrent register/open/scan/teardown across many contexts - stresses the VFS registry under load +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +require-env SQLITE_HTTP_TEST_URL + +# Robust-server marker: set only by the wrapper's --server rclone-http mode. The stdlib test server +# drops connections under this concurrency, so this test runs only against rclone serve http; without +# the marker it skips. +require-env SQLITE_HTTP_ROBUST + +# Bound per-scan parallelism so the load is concurrent-contexts-dominated (what stresses the registry) +# rather than a thundering herd of range requests; 16 contexts x a few threads is plenty. +statement ok +SET threads TO 4; + +# Each concurrentloop iteration runs on its own Connection (its own ClientContext), so this drives many +# contexts concurrently through the full VFS lifecycle: lazy Register() of the per-context wrapper, +# parallel-scan Open()s, Read()s, and connection teardown -> OnConnectionClosed -> Unregister() with the +# shared retired-vector reap. It contends the global VFSRegistryData (registry_mutex, the retired +# vector, ReapIdleRetiredWrappers, the open-handle refcount) from many threads at once, where a data +# race or use-after-free in the retire-then-reap machinery would surface. + +concurrentloop i 0 16 + +# sqlite_scan path: registers the VFS on this context, opens + (parallel-)scans, unregisters at teardown. +query I +SELECT count(*) FROM sqlite_scan('{SQLITE_HTTP_TEST_URL}/sakila.db', 'category'); +---- +16 + +# ATTACH path: exercises the deferred remote-open (connection + BEGIN on first GetDB()) concurrently, +# with a per-iteration name so the threads do not collide. +statement ok +ATTACH '{SQLITE_HTTP_TEST_URL}/sakila.db' AS ck_{i} (TYPE sqlite, READ_ONLY); + +query I +SELECT count(*) FROM sqlite_query('ck_{i}', 'SELECT category_id FROM category'); +---- +16 + +statement ok +DETACH ck_{i}; + +endloop diff --git a/test/sql/scanner/http_sqlite_16_s3.test b/test/sql/scanner/http_sqlite_16_s3.test new file mode 100644 index 0000000..132e35d --- /dev/null +++ b/test/sql/scanner/http_sqlite_16_s3.test @@ -0,0 +1,38 @@ +# name: test/sql/scanner/http_sqlite_16_s3.test +# description: Remote SQLite over s3:// (rclone serve s3) - scan + credentialed -wal sidecar guard +# group: [sqlite_scanner] + +require sqlite_scanner + +require httpfs + +# Set only by the wrapper's --server rclone-s3 mode (an rclone serve s3 endpoint + the "db" bucket). +require-env SQLITE_S3_TEST_ENDPOINT + +require-env SQLITE_S3_TEST_URL + +# Credentials must match the wrapper's rclone --auth-key (S3_KEY/S3_SECRET in sqlite_http_test_server.py). +statement ok +CREATE SECRET (TYPE s3, KEY_ID 'testkey', SECRET 'testsecret', ENDPOINT '{SQLITE_S3_TEST_ENDPOINT}', URL_STYLE 'path', USE_SSL false, REGION 'us-east-1'); + +# A remote scan over s3:// routes through the VFS and the S3FileSystem transport, distinct from the +# http:// path. +query I +SELECT count(*) FROM sqlite_scan('{SQLITE_S3_TEST_URL}/sakila.db', 'category'); +---- +16 + +# The credentialed sidecar path: a checkpointed WAL-mode DB's "-wal" probe is re-signed with the same +# S3 credentials as the main object (BuildSidecarUrl + httpfs signing). Its "-wal" is absent (404), so +# the guard passes and the main file reads - exercising sidecar authentication over S3. +query I +SELECT count(*) FROM sqlite_scan('{SQLITE_S3_TEST_URL}/wal_checkpointed.db', 't'); +---- +200 + +# And a populated "-wal" over s3:// is still rejected: the credentialed sidecar probe reaches a sidecar +# that holds frames, so the WAL guard fails closed. +statement error +SELECT * FROM sqlite_scan('{SQLITE_S3_TEST_URL}/wal_mode_snapshot.db', 't'); +---- +WAL mode