Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 50 additions & 6 deletions .github/workflows/SQLiteTests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,19 @@ jobs:
sudo apt-get update -y -q -o=Dpkg::Use-Pty=0
sudo apt-get install -y -q -o=Dpkg::Use-Pty=0 \
build-essential \
libcurl4-openssl-dev \
libssl-dev \
ccache \
cmake \
ninja-build
ninja-build \
python3

- name: Install rclone
run: |
# apt's rclone predates `serve s3`; install a pinned recent build for the rclone-backed tests.
curl -fsSL https://downloads.rclone.org/v1.74.2/rclone-v1.74.2-linux-amd64.zip -o /tmp/rclone.zip
sudo unzip -oj /tmp/rclone.zip '*/rclone' -d /usr/local/bin
rclone version

- name: Cache Key
id: cache_key
Expand All @@ -57,12 +67,27 @@ jobs:
key: ${{ steps.cache_key.outputs.value }}

- name: Test extension
env:
env:
LOCAL_EXTENSION_REPO: ${{ github.workspace }}/build/release/repository/
SQLITE_TPCH_GENERATED: 1
run: |
make data/db/tpch.db
./build/release/test/unittest 'test/*'
# Wrap unittest with the local HTTP server so the remote tests (require-env SQLITE_HTTP_TEST_URL)
# run instead of skipping. The concurrency + s3 tests gate on other env vars and run in the steps below.
python3 scripts/sqlite_http_test_server.py ./build/release/test/unittest 'test/*'

- name: Test concurrency (rclone serve http)
run: |
# The concurrent-scan and concurrent-lifecycle tests need a robust server; rclone serve http
# handles them (--server rclone-http sets SQLITE_HTTP_ROBUST so they run).
python3 scripts/sqlite_http_test_server.py --server rclone-http \
./build/release/test/unittest 'test/sql/scanner/http_sqlite_*concurrent*'

- name: Test s3:// path (rclone serve s3)
run: |
# Exercise the s3:// transport + the credentialed -wal/-journal sidecar probe via rclone serve s3.
python3 scripts/sqlite_http_test_server.py --server rclone-s3 \
./build/release/test/unittest 'test/sql/scanner/http_sqlite_16_s3.test'

linux-sanitized:
name: Linux ${{ matrix.sanitizer.name }} (amd64)
Expand Down Expand Up @@ -105,10 +130,20 @@ jobs:
sudo apt-get update -y -q -o=Dpkg::Use-Pty=0
sudo apt-get install -y -q -o=Dpkg::Use-Pty=0 \
build-essential \
libcurl4-openssl-dev \
libssl-dev \
ccache \
cmake \
mold \
ninja-build
ninja-build \
python3

- name: Install rclone
run: |
# apt's rclone predates `serve s3`; install a pinned recent build for the rclone-backed tests.
curl -fsSL https://downloads.rclone.org/v1.74.2/rclone-v1.74.2-linux-amd64.zip -o /tmp/rclone.zip
sudo unzip -oj /tmp/rclone.zip '*/rclone' -d /usr/local/bin
rclone version

- name: Cache Key
id: cache_key
Expand Down Expand Up @@ -138,7 +173,16 @@ jobs:
key: ${{ steps.cache_key.outputs.value }}

- name: Test extension
env:
env:
LOCAL_EXTENSION_REPO: ${{ github.workspace }}/build/relassert/repository/
run: |
./build/relassert/test/unittest
# Run under the local HTTP server so the remote tests run (not skip) under the sanitizers --
# this exercises the VFS lifetime/refcount paths under ASan/TSan.
python3 scripts/sqlite_http_test_server.py ./build/relassert/test/unittest

- name: Test concurrency under sanitizer (rclone serve http)
run: |
# The concurrency tests under the sanitizers: many contexts register/open/scan/unregister at once,
# contending the VFS registry to surface races / use-after-free in the retire-then-reap path.
python3 scripts/sqlite_http_test_server.py --server rclone-http \
./build/relassert/test/unittest 'test/sql/scanner/http_sqlite_*concurrent*'
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
.DS_Store
.idea
.cache
build
cmake-build-debug
sqlite/build
Expand Down
Binary file added data/db/clean_journal.db
Binary file not shown.
Binary file added data/db/clean_journal.db-journal
Binary file not shown.
Binary file added data/db/hot_journal.db
Binary file not shown.
Binary file added data/db/hot_journal.db-journal
Binary file not shown.
1 change: 1 addition & 0 deletions data/db/not_a_database.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is not a SQLite database. Used by http_sqlite error-handling tests.
Binary file added data/db/partial_journal.db
Binary file not shown.
Binary file added data/db/partial_journal.db-journal
Binary file not shown.
Binary file added data/db/wal_checkpointed.db
Binary file not shown.
Binary file added data/db/wal_mode_snapshot.db
Binary file not shown.
Binary file added data/db/wal_mode_snapshot.db-wal
Binary file not shown.
2 changes: 1 addition & 1 deletion duckdb
Submodule duckdb updated 232 files
15 changes: 14 additions & 1 deletion extension_config.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,21 @@
# Extension from this repo
duckdb_extension_load(sqlite_scanner
SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}
LOAD_TESTS
)

# Any extra extensions that should be built
# e.g.: duckdb_extension_load(json)
duckdb_extension_load(tpch)
duckdb_extension_load(tpch)

# The remote-SQLite tests need httpfs, pinned to the commit the duckdb engine coordinates with
# (APPLY_PATCHES applies the engine's bundled httpfs patches for the dev-engine API; engine and httpfs
# move independently and skew between releases). Skipped on WASM: the remote tests do not run there,
# and httpfs's OpenSSL dependency does not build for emscripten.
# Load httpfs for every target except Emscripten (WASM) builds.
if(NOT EMSCRIPTEN)
duckdb_extension_load(httpfs
GIT_URL https://github.com/duckdb/duckdb-httpfs
# Pinned httpfs commit; bump it together with the duckdb submodule so the two stay compatible.
GIT_TAG 53c5b032f6c368cfcc1a1ac3819118e86d3286a6
# Apply the engine's bundled httpfs patches on top of the pinned commit.
APPLY_PATCHES
)
endif()
258 changes: 258 additions & 0 deletions scripts/sqlite_http_test_server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
#!/usr/bin/env python3
"""Range-capable HTTP server for the remote-SQLite tests, plus a test command runner.

DuckDB's httpfs reads remote files with HTTP range requests, so the stock
``python -m http.server`` (which ignores ``Range:``) is not enough. This server:

* serves files from ``data/db`` with HEAD + ranged GET (206 / Content-Range), and
* exposes ``/status/<code>`` endpoints that return an arbitrary HTTP status,

so the whole remote suite runs hermetically with no third-party URLs.

It starts a server, exports the relevant env vars into the child's environment, runs the child, tears
the server down, and exits with the child's return code.

Usage:
sqlite_http_test_server.py [--server MODE] <test-binary> [args...]

MODE (default ``stdlib``):
* ``stdlib`` -- the built-in threaded server above (HTTP + /status + /walerr error injection).
Reliable for the serial + error-injection tests.
* ``rclone-http`` -- ``rclone serve http`` (a robust Go server) for the concurrency stress test, which
overwhelms the stdlib server. Sets SQLITE_HTTP_TEST_URL + SQLITE_HTTP_ROBUST.
* ``rclone-s3`` -- ``rclone serve s3`` so the suite can exercise the ``s3://`` path (incl. the
credentialed ``-wal``/``-journal`` sidecar probe). Sets SQLITE_S3_TEST_*.
The rclone modes require the ``rclone`` binary on PATH; tests gate on the env vars above so they skip
where it is not wired.
"""

import contextlib
import os
import re
import socket
import subprocess
import sys
import tempfile
import threading
import time
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path

SERVE_DIR = (Path(__file__).resolve().parent.parent / "data" / "db").resolve()
STATUS_RE = re.compile(r"^/status/(\d{3})$")
# /walerr/<code>/<file>: serve <file> normally, but return <code> for its "-wal"/"-journal" sidecar.
# Lets the WAL fail-closed guard be tested when the sidecar's state is unverifiable (non-404 error).
WALERR_RE = re.compile(r"^/walerr/(\d{3})/(.+)$")


class Handler(BaseHTTPRequestHandler):
    """HTTP/1.1 request handler backing the stdlib test server.

    Routes, checked in this order for both HEAD and GET:

    * ``/status/<code>`` -- reply with that status code and a short text body;
    * ``/walerr/<code>/<file>`` -- serve ``<file>`` from ``SERVE_DIR``, but answer any path ending
      in ``-wal`` or ``-journal`` with ``<code>`` instead;
    * anything else -- a file under ``SERVE_DIR``, honouring a ``Range: bytes=...`` header.
    """

    protocol_version = "HTTP/1.1"

    def log_message(self, *args):  # silence per-request logging
        pass

    def _resolve(self, rel):
        """Map a relative path to a regular file under SERVE_DIR.

        Returns the resolved ``Path``, or ``None`` if the path escapes SERVE_DIR (path traversal)
        or does not name an existing file.
        """
        target = (SERVE_DIR / rel).resolve()
        if SERVE_DIR not in target.parents and target != SERVE_DIR:
            return None
        return target if target.is_file() else None

    def _status_endpoint(self):
        """Return the status code of a ``/status/<code>`` request, or ``None`` for other paths."""
        m = STATUS_RE.match(self.path.split("?", 1)[0])
        return int(m.group(1)) if m else None

    def _send_simple(self, code, body=b""):
        """Send a plain-text response; the body is omitted (but still counted) for HEAD."""
        self.send_response(code)
        self.send_header("Content-Length", str(len(body)))
        self.send_header("Content-Type", "text/plain")
        self.end_headers()
        if body and self.command != "HEAD":
            self.wfile.write(body)

    def do_HEAD(self):
        self._serve(head_only=True)

    def do_GET(self):
        self._serve(head_only=False)

    def _serve(self, head_only):
        """Dispatch one request to the status, walerr or plain-file route."""
        code = self._status_endpoint()
        if code is not None:
            # Emulate httpbin-style status endpoints for error-mapping tests.
            self._send_simple(code, f"status {code}\n".encode())
            return

        walerr = WALERR_RE.match(self.path.split("?", 1)[0])
        if walerr:
            forced, rel = int(walerr.group(1)), walerr.group(2)
            if rel.endswith("-wal") or rel.endswith("-journal"):
                # The WAL guard's sidecar probe lands here: return the forced (non-404) error so the
                # probe cannot confirm the sidecar absent, and the open must fail closed.
                self._send_simple(forced, f"status {forced}\n".encode())
                return
            target = self._resolve(rel)
            if target is None:
                self._send_simple(404, b"not found\n")
                return
            self._serve_file(target, head_only)
            return

        target = self._resolve(self.path.lstrip("/").split("?", 1)[0])
        if target is None:
            self._send_simple(404, b"not found\n")
            return

        self._serve_file(target, head_only)

    @staticmethod
    def _byte_range(header, size):
        """Translate a ``Range`` header value into an inclusive ``(start, end)`` byte span.

        Handles ``bytes=start-end``, the open-ended ``bytes=start-`` and the suffix form
        ``bytes=-N`` (the final N bytes). For a multi-range header only the first range is used.

        Returns ``None`` when the header is not a ``bytes=`` range or carries no bounds at all, in
        which case the caller serves the whole file. The returned span is clamped to the file end
        but not otherwise validated; the caller answers 416 when it is empty or starts past EOF.
        """
        m = re.match(r"bytes=(\d*)-(\d*)", header.strip())
        if m is None:
            return None
        first, last = m.group(1), m.group(2)
        if first:
            end = int(last) if last else size - 1
            return int(first), min(end, size - 1)
        if last:
            # Suffix range: the *last* N bytes, not bytes 0..N. N == 0 (or an empty file) yields
            # an empty span, which the caller reports as 416.
            return max(size - int(last), 0), size - 1
        return None

    def _serve_file(self, target, head_only):
        """Send ``target`` as 200 (whole file), 206 (ranged) or 416 (unsatisfiable range).

        Seeks and reads only the requested span rather than the whole file per request: DuckDB's
        CachingFileSystem issues many ranged GETs (more so under parallel scans), and the fixtures
        can be several MB. For HEAD the headers are sent without a body.
        """
        size = target.stat().st_size
        rng = self.headers.get("Range")
        span = self._byte_range(rng, size) if rng else None
        if span is not None:
            start, end = span
            if start > end or start >= size:
                self.send_response(416)
                self.send_header("Content-Range", f"bytes */{size}")
                self.send_header("Content-Length", "0")
                self.end_headers()
                return
            length = end - start + 1
            self.send_response(206)
            self.send_header("Content-Type", "application/octet-stream")
            self.send_header("Accept-Ranges", "bytes")
            self.send_header("Content-Range", f"bytes {start}-{end}/{size}")
            self.send_header("Content-Length", str(length))
            self.end_headers()
            if not head_only:
                with open(target, "rb") as f:
                    f.seek(start)
                    self.wfile.write(f.read(length))
            return

        self.send_response(200)
        self.send_header("Content-Type", "application/octet-stream")
        self.send_header("Accept-Ranges", "bytes")
        self.send_header("Content-Length", str(size))
        self.end_headers()
        if not head_only:
            with open(target, "rb") as f:
                self.wfile.write(f.read())


class Server(ThreadingHTTPServer):
    """Threaded HTTP server tuned for bursts of simultaneous DuckDB connections."""

    # Rebind the listening address without waiting for a previous socket to expire.
    allow_reuse_address = True
    # Handler threads are daemons, so they never keep the process alive at exit.
    daemon_threads = True
    # Many connections (each parallel-scanning) can arrive at once; socketserver's default listen
    # backlog of 5 would get the excess reset, so use a much deeper queue.
    request_queue_size = 128


# rclone-s3 credentials: the s3 test's CREATE SECRET must use these exact values (see http_sqlite_16).
S3_KEY = "testkey"
S3_SECRET = "testsecret"


def _free_port():
with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(("127.0.0.1", 0))
return s.getsockname()[1]


def _wait_for_port(port, timeout=30.0):
deadline = time.monotonic() + timeout
while time.monotonic() < deadline:
with contextlib.closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.settimeout(0.5)
if s.connect_ex(("127.0.0.1", port)) == 0:
return True
time.sleep(0.1)
return False


def run_stdlib(cmd):
    """Run ``cmd`` against the built-in threaded server and return its exit code.

    The server listens on an OS-chosen loopback port for the lifetime of the child; its base URL
    is passed to the child as SQLITE_HTTP_TEST_URL. The server is shut down even if launching
    the child raises.
    """
    httpd = Server(("127.0.0.1", 0), Handler)
    bound_port = httpd.server_address[1]
    worker = threading.Thread(target=httpd.serve_forever, daemon=True)
    worker.start()
    child_env = {**os.environ, "SQLITE_HTTP_TEST_URL": f"http://127.0.0.1:{bound_port}"}
    try:
        completed = subprocess.run(cmd, env=child_env)
        return completed.returncode
    finally:
        httpd.shutdown()
        httpd.server_close()


def run_rclone(cmd, scheme):
    """Run ``cmd`` against an ``rclone serve`` instance and return its exit code.

    ``scheme`` selects the server:

    * ``"http"`` -- ``rclone serve http`` over SERVE_DIR (read-only); exports SQLITE_HTTP_TEST_URL
      and SQLITE_HTTP_ROBUST to the child.
    * anything else (``"s3"``) -- ``rclone serve s3`` over SERVE_DIR's parent (``data/``), so
      ``data/db`` appears as the ``db`` bucket (``s3://db/<file>``); exports
      SQLITE_S3_TEST_ENDPOINT and SQLITE_S3_TEST_URL.

    Returns 3 without running ``cmd`` if rclone cannot be started or does not start listening
    within 30 seconds. rclone is always terminated (and reaped) before returning.
    """
    port = _free_port()
    env = dict(os.environ)
    if scheme == "http":
        args = ["rclone", "serve", "http", "--addr", f"127.0.0.1:{port}", "--read-only", str(SERVE_DIR)]
        env["SQLITE_HTTP_TEST_URL"] = f"http://127.0.0.1:{port}"
        env["SQLITE_HTTP_ROBUST"] = "1"  # marker so the concurrency test runs only on the robust server
    else:  # s3
        args = [
            "rclone",
            "serve",
            "s3",
            "--addr",
            f"127.0.0.1:{port}",
            "--auth-key",
            f"{S3_KEY},{S3_SECRET}",
            str(SERVE_DIR.parent),
        ]
        env["SQLITE_S3_TEST_ENDPOINT"] = f"127.0.0.1:{port}"
        env["SQLITE_S3_TEST_URL"] = f"s3://{SERVE_DIR.name}"  # s3://db

    # rclone's stderr is captured to a temp file (closed on exit from this block) so it can be
    # shown if startup fails, without a pipe that could fill up while the tests run.
    with tempfile.TemporaryFile() as proc_err:
        try:
            proc = subprocess.Popen(args, stdout=subprocess.DEVNULL, stderr=proc_err)
        except FileNotFoundError:
            print(f"rclone serve {scheme} could not start: 'rclone' not found on PATH", file=sys.stderr)
            return 3
        try:
            # Poll in short slices so an rclone that died at startup is reported immediately
            # instead of after the full 30 s budget.
            ready = False
            deadline = time.monotonic() + 30.0
            while time.monotonic() < deadline:
                if _wait_for_port(port, timeout=0.5):
                    ready = True
                    break
                if proc.poll() is not None:
                    break
            # A reachable port with a dead rclone means something else is listening there.
            if not ready or proc.poll() is not None:
                proc_err.seek(0)
                detail = proc_err.read().decode(errors="replace").strip()
                print(f"rclone serve {scheme} did not become ready on port {port}: {detail}", file=sys.stderr)
                return 3
            return subprocess.run(cmd, env=env).returncode
        finally:
            proc.terminate()
            try:
                proc.wait(timeout=10)
            except subprocess.TimeoutExpired:
                proc.kill()
                proc.wait()  # reap the killed process so it does not linger as a zombie


def main():
    """Parse ``[--server MODE] <command> [args...]`` from sys.argv and run the command.

    Returns the command's exit code, or 2 for a usage error or an unknown MODE.
    """
    argv = sys.argv[1:]
    mode = "stdlib"
    if argv[:1] == ["--server"]:
        # "--server" needs both a MODE and at least one word of command after it.
        if len(argv) < 3:
            print("usage: --server <stdlib|rclone-http|rclone-s3> <command> [args...]", file=sys.stderr)
            return 2
        mode, argv = argv[1], argv[2:]
    if not argv:
        print("usage: sqlite_http_test_server.py [--server MODE] <command> [args...]", file=sys.stderr)
        return 2

    launchers = {
        "stdlib": lambda: run_stdlib(argv),
        "rclone-http": lambda: run_rclone(argv, "http"),
        "rclone-s3": lambda: run_rclone(argv, "s3"),
    }
    launch = launchers.get(mode)
    if launch is None:
        print(f"unknown server mode: {mode}", file=sys.stderr)
        return 2
    return launch()


if __name__ == "__main__":
sys.exit(main())
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ add_subdirectory(storage)

add_library(
sqlite_ext_library OBJECT
sqlite_db.cpp sqlite_extension.cpp sqlite_scanner.cpp sqlite_stmt.cpp
sqlite_db.cpp sqlite_duckdb_vfs_cache.cpp sqlite_extension.cpp sqlite_scanner.cpp sqlite_stmt.cpp
sqlite_storage.cpp sqlite_utils.cpp)
set(ALL_OBJECT_FILES
${ALL_OBJECT_FILES} $<TARGET_OBJECTS:sqlite_ext_library>
Expand Down
Loading
Loading