diff --git a/umbra-s3/.gitignore b/umbra-s3/.gitignore new file mode 100644 index 0000000000..4af23e93d0 --- /dev/null +++ b/umbra-s3/.gitignore @@ -0,0 +1,3 @@ +data/ +db/ +.s3-env diff --git a/umbra-s3/README.md b/umbra-s3/README.md new file mode 100644 index 0000000000..f85cc1e325 --- /dev/null +++ b/umbra-s3/README.md @@ -0,0 +1,81 @@ +# Umbra (S3) + +ClickBench for [Umbra](https://umbra-db.com/) with the `hits` table stored on +**Amazon S3** (`backend=cloud`) instead of local disk. It is the same Umbra +benchmark as [`../umbra`](../umbra), with two differences: + +- `create.sql` registers an S3 bucket as Umbra remote storage and creates the + table with `backend=cloud`, so table data lives in the bucket. +- You must provision that bucket first with [`./create-bucket`](#1-create-the-s3-bucket). + +The dataset (`hits.parquet`) is still ingested from a local copy via +`umbra.parquetview`; only the resulting table is stored in S3. + +## Prerequisites + +- A fresh Ubuntu 24.04+ VM (the scripts `sudo apt-get install` Docker, the + Postgres client, and the AWS CLI as needed). +- Docker access (the default flow runs `umbradb/umbra` in a container). +- **AWS credentials that can create and write an S3 bucket.** `create-bucket` + picks them up, in order, from: + 1. `$AWS_ACCESS_KEY_ID` / `$AWS_SECRET_ACCESS_KEY` in the environment, + 2. whatever `aws configure` has stored, + 3. an interactive prompt (only if neither of the above can reach S3). + + The *same static keys* are handed to Umbra's `create remote storage` + statement, so they must allow normal S3 data access (not just bucket + creation). No IAM user/role is created. + +## 1. 
Create the S3 bucket + +```bash +cd umbra-s3 +./create-bucket +``` + +This: + +- ensures the AWS CLI is installed, +- resolves working AWS credentials (see above), +- generates a globally-unique bucket name `clickbench-umbra-s3-YYYYMMDD-UUID12` + (date plus the first 12 hex digits of a UUID) and creates it in your region, +- writes everything Umbra needs to **`.s3-env`** (bucket, region, key id, key). + +`.s3-env` is gitignored and created with mode `600`. **`./load` sources it automatically**, +so once `create-bucket` has run you do not need to export anything by hand. +Re-running `create-bucket` reuses the bucket/credentials already in `.s3-env`. + +### Region and path + +- Region: `$UMBRA_S3_REGION`, else `$AWS_DEFAULT_REGION`, else `us-east-1`. +- Path prefix inside the bucket: `$UMBRA_S3_PATH` (default `umbra`). + +Umbra addresses the bucket as `s3://BUCKET:REGION/PATH` — the region is +part of the URI, not a separate option. + +## 2. Run the benchmark + +Run the standard ClickBench driver directly from this directory: + +```bash +cd umbra-s3 +./benchmark.sh +``` + +The driver (`../lib/benchmark-common.sh`, via `benchmark.sh`) runs the +primitives in order: `install` → `start` → `load` → the 43 queries +(cold + 2 warm each) → `stop`. The driver downloads `hits.parquet` before `load` +(out of the measured load time); `load` moves it into `data/`, registers the S3 +remote storage, creates the `backend=cloud` table, ingests it, and deletes the local copy. + +> Run `./create-bucket` **before** the benchmark. `load` fails fast with a +> clear message if `UMBRA_S3_*` are unset (i.e. no `.s3-env`). + +## 3. Tear down + +```bash +./delete-bucket +``` + +Empties and deletes the bucket recorded in `.s3-env`, then removes `.s3-env`. +A bucket that is already gone is a no-op; no IAM resources are touched. diff --git a/umbra-s3/benchmark.sh b/umbra-s3/benchmark.sh new file mode 100755 index 0000000000..b278e40528 --- /dev/null +++ b/umbra-s3/benchmark.sh @@ -0,0 +1,5 @@ +#!/bin/bash +# Thin shim — actual flow is in lib/benchmark-common.sh. 
+export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" +export BENCH_DURABLE=yes +exec ../lib/benchmark-common.sh diff --git a/umbra-s3/check b/umbra-s3/check new file mode 100755 index 0000000000..5336d8ba03 --- /dev/null +++ b/umbra-s3/check @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null diff --git a/umbra-s3/create-bucket b/umbra-s3/create-bucket new file mode 100755 index 0000000000..26b8e54030 --- /dev/null +++ b/umbra-s3/create-bucket @@ -0,0 +1,105 @@ +#!/bin/bash +set -eu + +# Create the S3 bucket that backs Umbra's (backend=cloud) hits table and record +# the credentials Umbra needs in .s3-env. No IAM user is created — this uses the +# access key the caller already has (env AWS_ACCESS_KEY_ID/AWS_SECRET_ACCESS_KEY, +# else `aws configure`). Those same static keys are what Umbra's +# create remote storage s3 using 'URI' with secret 'KEY_ID' 'KEY' +# needs, so we verify they can do S3 and, only if they can't, prompt for keys. +# +# The bucket name is always generated as clickbench-umbra-s3-YYYYMMDD-UUID12; +# re-running reuses the bucket/credentials recorded in .s3-env. + +here="$(cd "$(dirname "$0")" && pwd)" +envfile="$here/.s3-env" + +# Reuse settings from a previous run if present. Sourced before the region is +# resolved so a re-run picks up UMBRA_S3_REGION if .s3-env recorded one. +# shellcheck disable=SC1091 +[ -f "$envfile" ] && . "$envfile" + +region="${UMBRA_S3_REGION:-${AWS_DEFAULT_REGION:-us-east-1}}" + +# Ensure the AWS CLI is available (the base image / install step doesn't ship +# it). Install it on first use. +if ! command -v aws >/dev/null 2>&1; then + sudo apt-get update -y + sudo apt-get install -y awscli +fi + +# True if the given key id/secret can talk to S3 (lists buckets). +creds_work() { + AWS_ACCESS_KEY_ID="$1" AWS_SECRET_ACCESS_KEY="$2" \ + AWS_DEFAULT_REGION="$region" aws s3 ls >/dev/null 2>&1 +} + +# --- credentials ----------------------------------------------------------- +# Keys recorded by a previous run (.s3-env, sourced above) take precedence. 
+key_id="${UMBRA_S3_ACCESS_KEY_ID:-${AWS_ACCESS_KEY_ID:-}}" +key_secret="${UMBRA_S3_ACCESS_KEY:-${AWS_SECRET_ACCESS_KEY:-}}" + +# Fall back to whatever `aws configure` has stored. +if [ -z "$key_id" ] || [ -z "$key_secret" ]; then + key_id="$(aws configure get aws_access_key_id 2>/dev/null || true)" + key_secret="$(aws configure get aws_secret_access_key 2>/dev/null || true)" +fi + +# If the current credentials don't exist or can't reach S3, ask for keys. +if [ -z "$key_id" ] || [ -z "$key_secret" ] || ! creds_work "$key_id" "$key_secret"; then + echo "create-bucket: current AWS credentials can't access S3." >&2 + read -r -p "AWS Access Key ID: " key_id + read -r -s -p "AWS Secret Access Key: " key_secret; echo + if ! creds_work "$key_id" "$key_secret"; then + echo "create-bucket: those keys can't access S3 either — aborting." >&2 + exit 1 + fi +fi + +export AWS_ACCESS_KEY_ID="$key_id" +export AWS_SECRET_ACCESS_KEY="$key_secret" +export AWS_DEFAULT_REGION="$region" + +# --- bucket name ----------------------------------------------------------- +# Reuse a previous run's bucket if .s3-env carried one over; else generate +# clickbench-umbra-s3-YYYYMMDD-UUID12 (UUID12 = first 12 hex digits of a UUID) +# (global, <=63 chars, lowercase alphanumerics + hyphens). +bucket="${UMBRA_S3_BUCKET:-}" +if [ -z "$bucket" ]; then + uuid="$( (uuidgen 2>/dev/null || cat /proc/sys/kernel/random/uuid) \ + | tr 'A-Z' 'a-z' | tr -cd 'a-f0-9' | cut -c1-12)" + bucket="clickbench-umbra-s3-$(date +%Y%m%d)-$uuid" + echo "create-bucket: generated bucket name $bucket" +fi + +# --- create ---------------------------------------------------------------- +if aws s3api head-bucket --bucket "$bucket" >/dev/null 2>&1; then + echo "create-bucket: s3://$bucket already exists" +else + # us-east-1 must NOT be passed as a LocationConstraint (the API rejects + # it); every other region requires it. 
+ if [ "$region" = "us-east-1" ]; then + aws s3api create-bucket --bucket "$bucket" >/dev/null + else + aws s3api create-bucket --bucket "$bucket" \ + --create-bucket-configuration "LocationConstraint=$region" >/dev/null + fi + aws s3api wait bucket-exists --bucket "$bucket" + echo "create-bucket: created s3://$bucket in $region" +fi + +# --- persist --------------------------------------------------------------- +# UMBRA_S3_URI is the full storage URI load passes through verbatim — the AWS +# s3://BUCKET:REGION/PATH form (region is part of the URI, path follows). +path="${UMBRA_S3_PATH:-umbra}" +umask 077 +cat > "$envfile" < $envfile (load sources this for you)." diff --git a/umbra-s3/create.sql b/umbra-s3/create.sql new file mode 100644 index 0000000000..342b9c2eb4 --- /dev/null +++ b/umbra-s3/create.sql @@ -0,0 +1,236 @@ +-- Register the S3 bucket as remote storage so the table below can live in +-- the cloud (backend=cloud). Bucket URI + credentials are passed in as psql +-- variables by the load script (from UMBRA_S3_* env vars) to keep the +-- access key out of the checked-in SQL. :'name' renders each as a quoted +-- string literal, matching Umbra's expected +-- create remote storage s3 using 's3://BUCKET:REGION/PATH' +-- with secret 'KEY_ID' 'KEY'; +-- (the region is part of the s3:// URI, and a path prefix follows it). 
+create remote storage s3 using :'s3_bucket' + with secret :'s3_access_key_id' :'s3_access_key'; + +create table hits ( + watchid bigint not null, + javaenable smallint not null, + title text not null, + goodevent smallint not null, + eventtime timestamp not null, + eventdate date not null, + counterid integer not null, + clientip integer not null, + regionid integer not null, + userid bigint not null, + counterclass smallint not null, + os smallint not null, + useragent smallint not null, + url text not null, + referer text not null, + isrefresh smallint not null, + referercategoryid smallint not null, + refererregionid integer not null, + urlcategoryid smallint not null, + urlregionid integer not null, + resolutionwidth smallint not null, + resolutionheight smallint not null, + resolutiondepth smallint not null, + flashmajor smallint not null, + flashminor smallint not null, + flashminor2 text not null, + netmajor smallint not null, + netminor smallint not null, + useragentmajor smallint not null, + useragentminor varchar(255) not null, + cookieenable smallint not null, + javascriptenable smallint not null, + ismobile smallint not null, + mobilephone smallint not null, + mobilephonemodel text not null, + params text not null, + ipnetworkid integer not null, + traficsourceid smallint not null, + searchengineid smallint not null, + searchphrase text not null, + advengineid smallint not null, + isartifical smallint not null, + windowclientwidth smallint not null, + windowclientheight smallint not null, + clienttimezone smallint not null, + clienteventtime timestamp not null, + silverlightversion1 smallint not null, + silverlightversion2 smallint not null, + silverlightversion3 integer not null, + silverlightversion4 smallint not null, + pagecharset text not null, + codeversion integer not null, + islink smallint not null, + isdownload smallint not null, + isnotbounce smallint not null, + funiqid bigint not null, + originalurl text not null, + hid integer not null, + 
isoldcounter smallint not null, + isevent smallint not null, + isparameter smallint not null, + dontcounthits smallint not null, + withhash smallint not null, + hitcolor char not null, + localeventtime timestamp not null, + age smallint not null, + sex smallint not null, + income smallint not null, + interests smallint not null, + robotness smallint not null, + remoteip integer not null, + windowname integer not null, + openername integer not null, + historylength smallint not null, + browserlanguage text not null, + browsercountry text not null, + socialnetwork text not null, + socialaction text not null, + httperror smallint not null, + sendtiming integer not null, + dnstiming integer not null, + connecttiming integer not null, + responsestarttiming integer not null, + responseendtiming integer not null, + fetchtiming integer not null, + socialsourcenetworkid smallint not null, + socialsourcepage text not null, + paramprice bigint not null, + paramorderid text not null, + paramcurrency text not null, + paramcurrencyid smallint not null, + openstatservicename text not null, + openstatcampaignid text not null, + openstatadid text not null, + openstatsourceid text not null, + utmsource text not null, + utmmedium text not null, + utmcampaign text not null, + utmcontent text not null, + utmterm text not null, + fromtag text not null, + hasgclid smallint not null, + refererhash bigint not null, + urlhash bigint not null, + clid integer not null +) with (backend=cloud, compression=zstd); + +-- Ingest from the Athena parquet rather than the TSV. Umbra's COPY-from- +-- parquet path deadlocks on the second bulk op and the row-group reader had +-- crashes (both worked around / fixed), so we load via the umbra.parquetview +-- table function in a single INSERT — one bulk op, inline type conversion, no +-- staging. The function lives only in the Umbra-mode function table (hence the +-- umbra. qualifier) and surfaces columns CamelCased, so each must be double- +-- quoted. 
EventTime/ClientEventTime/LocalEventTime are unix-second int64s; +-- EventDate is a uint16 day count from the epoch. Path is the container's +-- '/data' bind mount. +insert into hits +select + "WatchID", + "JavaEnable", + "Title", + "GoodEvent", + to_timestamp("EventTime")::timestamp, + (DATE '1970-01-01' + "EventDate"::int), + "CounterID", + "ClientIP", + "RegionID", + "UserID", + "CounterClass", + "OS", + "UserAgent", + "URL", + "Referer", + "IsRefresh", + "RefererCategoryID", + "RefererRegionID", + "URLCategoryID", + "URLRegionID", + "ResolutionWidth", + "ResolutionHeight", + "ResolutionDepth", + "FlashMajor", + "FlashMinor", + "FlashMinor2", + "NetMajor", + "NetMinor", + "UserAgentMajor", + "UserAgentMinor", + "CookieEnable", + "JavascriptEnable", + "IsMobile", + "MobilePhone", + "MobilePhoneModel", + "Params", + "IPNetworkID", + "TraficSourceID", + "SearchEngineID", + "SearchPhrase", + "AdvEngineID", + "IsArtifical", + "WindowClientWidth", + "WindowClientHeight", + "ClientTimeZone", + to_timestamp("ClientEventTime")::timestamp, + "SilverlightVersion1", + "SilverlightVersion2", + "SilverlightVersion3", + "SilverlightVersion4", + "PageCharset", + "CodeVersion", + "IsLink", + "IsDownload", + "IsNotBounce", + "FUniqID", + "OriginalURL", + "HID", + "IsOldCounter", + "IsEvent", + "IsParameter", + "DontCountHits", + "WithHash", + "HitColor", + to_timestamp("LocalEventTime")::timestamp, + "Age", + "Sex", + "Income", + "Interests", + "Robotness", + "RemoteIP", + "WindowName", + "OpenerName", + "HistoryLength", + "BrowserLanguage", + "BrowserCountry", + "SocialNetwork", + "SocialAction", + "HTTPError", + "SendTiming", + "DNSTiming", + "ConnectTiming", + "ResponseStartTiming", + "ResponseEndTiming", + "FetchTiming", + "SocialSourceNetworkID", + "SocialSourcePage", + "ParamPrice", + "ParamOrderID", + "ParamCurrency", + "ParamCurrencyID", + "OpenstatServiceName", + "OpenstatCampaignID", + "OpenstatAdID", + "OpenstatSourceID", + "UTMSource", + "UTMMedium", + "UTMCampaign", 
+ "UTMContent", + "UTMTerm", + "FromTag", + "HasGCLID", + "RefererHash", + "URLHash", + "CLID" +from umbra.parquetview('/data/hits.parquet'); diff --git a/umbra-s3/data-size b/umbra-s3/data-size new file mode 100755 index 0000000000..296ad622c8 --- /dev/null +++ b/umbra-s3/data-size @@ -0,0 +1,28 @@ +#!/bin/bash +set -eu + +# Total on-disk footprint of the loaded database. With backend=cloud the table +# data lives in S3, so report the local db dir (catalog, WAL, anything not +# offloaded) plus the bytes stored under the bucket's path prefix. +here="$(cd "$(dirname "$0")" && pwd)" +envfile="$here/.s3-env" + +sudo chmod -R 777 db 2>/dev/null || true +db_bytes=$(du -bcs db | grep total | awk '{print $1}') + +s3_bytes=0 +# shellcheck disable=SC1090 +if [ -f "$envfile" ] && command -v aws >/dev/null 2>&1; then + . "$envfile" + [ -n "${UMBRA_S3_ACCESS_KEY_ID:-}" ] && export AWS_ACCESS_KEY_ID="$UMBRA_S3_ACCESS_KEY_ID" + [ -n "${UMBRA_S3_ACCESS_KEY:-}" ] && export AWS_SECRET_ACCESS_KEY="$UMBRA_S3_ACCESS_KEY" + export AWS_DEFAULT_REGION="${UMBRA_S3_REGION:-${AWS_DEFAULT_REGION:-us-east-1}}" + path="${UMBRA_S3_PATH:-umbra}" + # --recursive --summarize paginates over all objects and prints a + # "Total Size: " line; empty/missing prefix yields no such line. + s3_bytes=$(aws s3 ls "s3://$UMBRA_S3_BUCKET/$path" --recursive --summarize \ + 2>/dev/null | awk '/Total Size:/ {print $3}') + [ -z "$s3_bytes" ] && s3_bytes=0 +fi + +echo $((db_bytes + s3_bytes)) diff --git a/umbra-s3/delete-bucket b/umbra-s3/delete-bucket new file mode 100755 index 0000000000..cd0cf66684 --- /dev/null +++ b/umbra-s3/delete-bucket @@ -0,0 +1,42 @@ +#!/bin/bash +set -eu + +# Tear down the S3 bucket create-bucket provisioned: empty it, delete it, and +# remove the local .s3-env. Uses the credentials recorded in .s3-env. No IAM +# resources are touched (create-bucket doesn't create any). Idempotent: a +# missing bucket is a no-op. 
+here="$(cd "$(dirname "$0")" && pwd)" +envfile="$here/.s3-env" + +# Pull bucket/region/credentials from .s3-env if present. +# shellcheck disable=SC1090 +[ -f "$envfile" ] && . "$envfile" + +: "${UMBRA_S3_BUCKET:?no .s3-env found — nothing to delete}" +region="${UMBRA_S3_REGION:-${AWS_DEFAULT_REGION:-us-east-1}}" + +if ! command -v aws >/dev/null 2>&1; then + echo "delete-bucket: aws CLI not installed — nothing to do" >&2 + exit 0 +fi + +[ -n "${UMBRA_S3_ACCESS_KEY_ID:-}" ] && export AWS_ACCESS_KEY_ID="$UMBRA_S3_ACCESS_KEY_ID" +[ -n "${UMBRA_S3_ACCESS_KEY:-}" ] && export AWS_SECRET_ACCESS_KEY="$UMBRA_S3_ACCESS_KEY" +export AWS_DEFAULT_REGION="$region" + +# head-bucket also fails on bad credentials, a wrong region or no network, so +# only a 404 counts as "already gone". Any other failure aborts before .s3-env +# (the only local record of the bucket name) is removed. +if head_err="$(aws s3api head-bucket --bucket "$UMBRA_S3_BUCKET" 2>&1 >/dev/null)"; then + aws s3 rb "s3://$UMBRA_S3_BUCKET" --force + echo "delete-bucket: deleted s3://$UMBRA_S3_BUCKET" +elif printf '%s\n' "$head_err" | grep -qE '\(404\)|Not Found|NoSuchBucket'; then + echo "delete-bucket: s3://$UMBRA_S3_BUCKET does not exist" +else + echo "delete-bucket: cannot access s3://$UMBRA_S3_BUCKET — keeping $envfile" >&2 + printf '%s\n' "$head_err" >&2 + exit 1 +fi + +rm -f "$envfile" +echo "delete-bucket: done" diff --git a/umbra-s3/install b/umbra-s3/install new file mode 100755 index 0000000000..d472dbbf8c --- /dev/null +++ b/umbra-s3/install @@ -0,0 +1,10 @@ +#!/bin/bash +set -eu + +sudo apt-get update -y +sudo apt-get install -y docker.io postgresql-client gzip + +sudo docker pull umbradb/umbra:latest + +mkdir -p data db +chmod -R 777 data db diff --git a/umbra-s3/load b/umbra-s3/load new file mode 100755 index 0000000000..7cad8a711c --- /dev/null +++ b/umbra-s3/load @@ -0,0 +1,52 @@ +#!/bin/bash +set -eu + +mkdir -p data +mv hits.parquet data/ +chmod -R 777 data + +# S3 remote storage credentials for the create remote storage statement in +# create.sql. create-bucket writes these to .s3-env; source it if present and +# the caller hasn't already exported them. Kept out of the SQL so the secret +# access key isn't checked in; fail early with a clear message if any missing. +here="$(cd "$(dirname "$0")" && pwd)" +# shellcheck disable=SC1091 +[ -z "${UMBRA_S3_ACCESS_KEY_ID:-}" ] && [ -f "$here/.s3-env" ] && . 
"$here/.s3-env" + +# The create scripts set the full storage URI in .s3-env as UMBRA_S3_URI: +# create-bucket the AWS s3://:/ form, create-minio the +# local minio:///: form (s3:// always resolves to real +# AWS, so MinIO needs the minio:// scheme). +: "${UMBRA_S3_URI:?set UMBRA_S3_URI (run create-bucket or create-minio first)}" +: "${UMBRA_S3_ACCESS_KEY_ID:?set UMBRA_S3_ACCESS_KEY_ID}" +: "${UMBRA_S3_ACCESS_KEY:?set UMBRA_S3_ACCESS_KEY}" + +s3_uri="$UMBRA_S3_URI" + +# create.sql for umbra registers the S3 remote storage, creates the +# (backend=cloud) table, and ingests via the parquetview INSERT. Use +# ON_ERROR_STOP=1 so a mid-load failure (e.g. the box runs out of memory +# and Umbra survives but the transaction errored) bubbles up instead of +# leaving a half-loaded table. +PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres \ + -v ON_ERROR_STOP=1 \ + -v s3_bucket="$s3_uri" \ + -v s3_access_key_id="$UMBRA_S3_ACCESS_KEY_ID" \ + -v s3_access_key="$UMBRA_S3_ACCESS_KEY" \ + -f create.sql + +# Belt-and-braces row-count check. Umbra has been observed to leave a +# partial table on memory-constrained hosts (16 GB c6a.4xlarge can't +# hold the full mmap working set), letting the benchmark proceed and +# producing implausibly fast warm timings on the surviving subset. +# ClickBench's hits dataset is exactly 99,997,497 rows. 
+expected=99997497 +got=$(PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -tAq \ + -c 'SELECT count(*) FROM hits') +if [ "$got" -ne "$expected" ]; then + echo "umbra/load: hits has $got rows, expected $expected — partial load" >&2 + exit 1 +fi + +rm -f data/hits.parquet +sync diff --git a/umbra-s3/queries.sql b/umbra-s3/queries.sql new file mode 100644 index 0000000000..ecfb6b77d0 --- /dev/null +++ b/umbra-s3/queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, 
SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth::bigint), SUM(ResolutionWidth::bigint + 1), SUM(ResolutionWidth::bigint + 2), SUM(ResolutionWidth::bigint + 3), SUM(ResolutionWidth::bigint + 4), SUM(ResolutionWidth::bigint + 5), SUM(ResolutionWidth::bigint + 6), SUM(ResolutionWidth::bigint + 7), SUM(ResolutionWidth::bigint + 8), SUM(ResolutionWidth::bigint + 9), SUM(ResolutionWidth::bigint + 10), SUM(ResolutionWidth::bigint + 11), SUM(ResolutionWidth::bigint + 12), SUM(ResolutionWidth::bigint + 13), 
SUM(ResolutionWidth::bigint + 14), SUM(ResolutionWidth::bigint + 15), SUM(ResolutionWidth::bigint + 16), SUM(ResolutionWidth::bigint + 17), SUM(ResolutionWidth::bigint + 18), SUM(ResolutionWidth::bigint + 19), SUM(ResolutionWidth::bigint + 20), SUM(ResolutionWidth::bigint + 21), SUM(ResolutionWidth::bigint + 22), SUM(ResolutionWidth::bigint + 23), SUM(ResolutionWidth::bigint + 24), SUM(ResolutionWidth::bigint + 25), SUM(ResolutionWidth::bigint + 26), SUM(ResolutionWidth::bigint + 27), SUM(ResolutionWidth::bigint + 28), SUM(ResolutionWidth::bigint + 29), SUM(ResolutionWidth::bigint + 30), SUM(ResolutionWidth::bigint + 31), SUM(ResolutionWidth::bigint + 32), SUM(ResolutionWidth::bigint + 33), SUM(ResolutionWidth::bigint + 34), SUM(ResolutionWidth::bigint + 35), SUM(ResolutionWidth::bigint + 36), SUM(ResolutionWidth::bigint + 37), SUM(ResolutionWidth::bigint + 38), SUM(ResolutionWidth::bigint + 39), SUM(ResolutionWidth::bigint + 40), SUM(ResolutionWidth::bigint + 41), SUM(ResolutionWidth::bigint + 42), SUM(ResolutionWidth::bigint + 43), SUM(ResolutionWidth::bigint + 44), SUM(ResolutionWidth::bigint + 45), SUM(ResolutionWidth::bigint + 46), SUM(ResolutionWidth::bigint + 47), SUM(ResolutionWidth::bigint + 48), SUM(ResolutionWidth::bigint + 49), SUM(ResolutionWidth::bigint + 50), SUM(ResolutionWidth::bigint + 51), SUM(ResolutionWidth::bigint + 52), SUM(ResolutionWidth::bigint + 53), SUM(ResolutionWidth::bigint + 54), SUM(ResolutionWidth::bigint + 55), SUM(ResolutionWidth::bigint + 56), SUM(ResolutionWidth::bigint + 57), SUM(ResolutionWidth::bigint + 58), SUM(ResolutionWidth::bigint + 59), SUM(ResolutionWidth::bigint + 60), SUM(ResolutionWidth::bigint + 61), SUM(ResolutionWidth::bigint + 62), SUM(ResolutionWidth::bigint + 63), SUM(ResolutionWidth::bigint + 64), SUM(ResolutionWidth::bigint + 65), SUM(ResolutionWidth::bigint + 66), SUM(ResolutionWidth::bigint + 67), SUM(ResolutionWidth::bigint + 68), SUM(ResolutionWidth::bigint + 69), SUM(ResolutionWidth::bigint + 70), 
SUM(ResolutionWidth::bigint + 71), SUM(ResolutionWidth::bigint + 72), SUM(ResolutionWidth::bigint + 73), SUM(ResolutionWidth::bigint + 74), SUM(ResolutionWidth::bigint + 75), SUM(ResolutionWidth::bigint + 76), SUM(ResolutionWidth::bigint + 77), SUM(ResolutionWidth::bigint + 78), SUM(ResolutionWidth::bigint + 79), SUM(ResolutionWidth::bigint + 80), SUM(ResolutionWidth::bigint + 81), SUM(ResolutionWidth::bigint + 82), SUM(ResolutionWidth::bigint + 83), SUM(ResolutionWidth::bigint + 84), SUM(ResolutionWidth::bigint + 85), SUM(ResolutionWidth::bigint + 86), SUM(ResolutionWidth::bigint + 87), SUM(ResolutionWidth::bigint + 88), SUM(ResolutionWidth::bigint + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits 
WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/umbra-s3/query b/umbra-s3/query new file mode 100755 index 0000000000..d8cc72b4c0 --- /dev/null +++ b/umbra-s3/query @@ -0,0 +1,36 @@ +#!/bin/bash +# Reads a SQL query from stdin, runs it via psql against Umbra. +# Stdout: query result. +# Stderr: query runtime in fractional seconds on the last line (parsed from +# psql's `\timing` "Time: ms" output). +# Exit non-zero on error. 
+set -e + +query=$(cat) + +raw=$(PGPASSWORD=postgres timeout 300s psql -p 5432 -h 127.0.0.1 -U postgres -v ON_ERROR_STOP=1 -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$? + +# Match more failure shapes than just `^ERROR`: PostgreSQL/Umbra also emit +# FATAL: (server-side fatal), PANIC: (Umbra's panic prefix observed in +# `unable to allocate buffer pool`-style failures), and `psql: error` +# (client-side, e.g. connection lost). +# +# Caveat: Umbra silently returns a NULL row for unimplemented functions +# (e.g. regexp_substr) without emitting any error or warning. None of the +# 43 ClickBench queries hit that path, but if a future query does, the +# caller will see a microsecond timing for a "successful" query that +# didn't actually compute anything. +if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^(ERROR|FATAL|PANIC):|psql: error'; then + printf '%s\n' "$raw" >&2 + exit 1 +fi + +printf '%s\n' "$raw" | grep -v '^Time:' + +ms=$(printf '%s\n' "$raw" | grep -oP 'Time:\s*\K[0-9.]+' | tail -n1) +if [ -z "$ms" ]; then + echo "no Time: in psql output" >&2 + exit 1 +fi + +awk -v m="$ms" 'BEGIN { printf "%.3f\n", m / 1000 }' >&2 diff --git a/umbra-s3/results/20260622/c6a.4xlarge.json b/umbra-s3/results/20260622/c6a.4xlarge.json new file mode 100644 index 0000000000..0c759b6f6d --- /dev/null +++ b/umbra-s3/results/20260622/c6a.4xlarge.json @@ -0,0 +1,235 @@ +{ + "system": "Umbra (S3)", + "date": "2026-06-22", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "PostgreSQL compatible" + ], + "load_time": 122.993, + "data_size": 9650271980, + "concurrent_qps": 2.565, + "concurrent_error_ratio": 0.189, + "result": [ + [ + 0.173, + 0.011, + 0.011 + ], + [ + 1.389, + 0.004, + 0.004 + ], + [ + 2.111, + 0.026, + 0.026 + ], + [ + 1.124, + 0.026, + 0.026 + ], + [ + 1.026, + 0.132, + 0.134 + ], + [ + 1.235, + 0.172, + 0.172 + ], + [ + 0.891, + 0.023, 
+ 0.023 + ], + [ + 1.079, + 0.005, + 0.005 + ], + [ + 1.825, + 0.162, + 0.16 + ], + [ + 3.394, + 0.225, + 0.228 + ], + [ + 1.321, + 0.026, + 0.026 + ], + [ + 1.712, + 0.028, + 0.028 + ], + [ + 0.854, + 0.188, + 0.161 + ], + [ + 1.432, + 0.307, + 0.305 + ], + [ + 1.249, + 0.207, + 0.18 + ], + [ + 0.709, + 0.17, + 0.172 + ], + [ + 1.349, + 0.381, + 0.364 + ], + [ + 1.136, + 0.222, + 0.222 + ], + [ + 2.582, + 0.838, + 0.834 + ], + [ + 0.508, + 0.002, + 0.002 + ], + [ + 2.005, + 0.276, + 0.275 + ], + [ + 1.869, + 0.082, + 0.082 + ], + [ + 3.739, + 0.145, + 0.146 + ], + [ + 7.253, + 0.257, + 0.417 + ], + [ + 0.283, + 0.244, + 0.098 + ], + [ + 0.705, + 0.01, + 0.01 + ], + [ + 0.284, + 0.07, + 0.057 + ], + [ + 1.797, + 0.292, + 0.293 + ], + [ + 2.608, + 1.483, + 1.492 + ], + [ + 0.688, + 0.029, + 0.029 + ], + [ + 2.434, + 0.084, + 0.083 + ], + [ + 2.772, + 0.131, + 0.131 + ], + [ + 2.702, + 1.319, + 1.321 + ], + [ + 2.815, + 0.785, + 0.871 + ], + [ + 2.932, + 0.87, + 0.791 + ], + [ + 0.597, + 0.127, + 0.126 + ], + [ + 0.37, + 0.011, + 0.011 + ], + [ + 0.305, + 0.006, + 0.006 + ], + [ + 0.279, + 0.003, + 0.003 + ], + [ + 0.61, + 0.023, + 0.022 + ], + [ + 0.415, + 0.004, + 0.003 + ], + [ + 0.31, + 0.003, + 0.003 + ], + [ + 0.22, + 0.004, + 0.005 + ] + ] +} diff --git a/umbra-s3/results/20260622/c6a.metal.json b/umbra-s3/results/20260622/c6a.metal.json new file mode 100644 index 0000000000..829668b818 --- /dev/null +++ b/umbra-s3/results/20260622/c6a.metal.json @@ -0,0 +1,235 @@ +{ + "system": "Umbra (S3)", + "date": "2026-06-22", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "PostgreSQL compatible" + ], + "load_time": 29.383, + "data_size": 9951793544, + "concurrent_qps": 16.363, + "concurrent_error_ratio": 0.0, + "result": [ + [ + 0.046, + 0.004, + 0.004 + ], + [ + 0.642, + 0.006, + 0.004 + ], + [ + 0.746, + 0.018, + 0.01 + ], + [ + 0.518, + 0.009, + 0.01 + ], + [ + 
0.461, + 0.068, + 0.068 + ], + [ + 0.632, + 0.099, + 0.077 + ], + [ + 0.648, + 0.01, + 0.009 + ], + [ + 0.446, + 0.004, + 0.004 + ], + [ + 0.686, + 0.087, + 0.087 + ], + [ + 0.952, + 0.129, + 0.128 + ], + [ + 0.343, + 0.019, + 0.02 + ], + [ + 0.4, + 0.02, + 0.019 + ], + [ + 0.44, + 0.094, + 0.067 + ], + [ + 0.602, + 0.125, + 0.125 + ], + [ + 0.461, + 0.071, + 0.07 + ], + [ + 0.545, + 0.1, + 0.098 + ], + [ + 0.553, + 0.138, + 0.139 + ], + [ + 0.54, + 0.054, + 0.053 + ], + [ + 0.947, + 0.373, + 0.344 + ], + [ + 0.226, + 0.002, + 0.002 + ], + [ + 0.888, + 0.064, + 0.062 + ], + [ + 0.731, + 0.026, + 0.027 + ], + [ + 1.03, + 0.044, + 0.043 + ], + [ + 24.142, + 0.398, + 0.496 + ], + [ + 0.304, + 0.278, + 0.192 + ], + [ + 0.482, + 0.006, + 0.006 + ], + [ + 0.274, + 0.087, + 0.115 + ], + [ + 0.848, + 0.071, + 0.072 + ], + [ + 5.445, + 0.318, + 0.319 + ], + [ + 0.294, + 0.019, + 0.014 + ], + [ + 0.701, + 0.042, + 0.04 + ], + [ + 0.739, + 0.057, + 0.055 + ], + [ + 1.233, + 0.623, + 0.625 + ], + [ + 1.192, + 0.335, + 0.328 + ], + [ + 1.167, + 0.35, + 0.337 + ], + [ + 0.371, + 0.067, + 0.067 + ], + [ + 0.297, + 0.012, + 0.012 + ], + [ + 0.517, + 0.01, + 0.009 + ], + [ + 0.271, + 0.008, + 0.008 + ], + [ + 0.742, + 0.017, + 0.017 + ], + [ + 0.344, + 0.01, + 0.01 + ], + [ + 0.374, + 0.008, + 0.005 + ], + [ + 0.344, + 0.009, + 0.01 + ] + ] +} diff --git a/umbra-s3/results/20260622/c7a.metal-48xl.json b/umbra-s3/results/20260622/c7a.metal-48xl.json new file mode 100644 index 0000000000..cfa73deefd --- /dev/null +++ b/umbra-s3/results/20260622/c7a.metal-48xl.json @@ -0,0 +1,235 @@ +{ + "system": "Umbra (S3)", + "date": "2026-06-22", + "machine": "c7a.metal-48xl", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "PostgreSQL compatible" + ], + "load_time": 24.542, + "data_size": 10005256580, + "concurrent_qps": 31.583, + "concurrent_error_ratio": 0.0, + "result": [ + [ + 0.055, + 0.003, + 0.003 + ], + [ + 
0.551, + 0.003, + 0.002 + ], + [ + 0.715, + 0.009, + 0.009 + ], + [ + 0.461, + 0.01, + 0.009 + ], + [ + 0.607, + 0.028, + 0.027 + ], + [ + 0.643, + 0.042, + 0.04 + ], + [ + 0.538, + 0.008, + 0.005 + ], + [ + 0.428, + 0.003, + 0.003 + ], + [ + 0.695, + 0.034, + 0.04 + ], + [ + 0.95, + 0.047, + 0.044 + ], + [ + 0.517, + 0.014, + 0.014 + ], + [ + 0.568, + 0.014, + 0.014 + ], + [ + 0.424, + 0.047, + 0.049 + ], + [ + 0.767, + 0.066, + 0.07 + ], + [ + 0.619, + 0.045, + 0.05 + ], + [ + 0.449, + 0.037, + 0.035 + ], + [ + 0.792, + 0.063, + 0.064 + ], + [ + 0.563, + 0.041, + 0.038 + ], + [ + 0.734, + 0.124, + 0.124 + ], + [ + 0.227, + 0.002, + 0.002 + ], + [ + 5.488, + 0.041, + 0.042 + ], + [ + 0.747, + 0.015, + 0.015 + ], + [ + 1.028, + 0.022, + 0.023 + ], + [ + 12.699, + 5.507, + 0.531 + ], + [ + 0.356, + 0.232, + 0.328 + ], + [ + 0.437, + 0.004, + 0.006 + ], + [ + 5.186, + 0.196, + 0.346 + ], + [ + 0.631, + 0.045, + 0.044 + ], + [ + 0.902, + 0.222, + 0.211 + ], + [ + 0.287, + 0.01, + 0.008 + ], + [ + 0.831, + 0.022, + 0.027 + ], + [ + 0.742, + 0.031, + 0.029 + ], + [ + 0.759, + 0.181, + 0.187 + ], + [ + 5.464, + 0.183, + 0.171 + ], + [ + 0.662, + 0.182, + 0.172 + ], + [ + 0.351, + 0.031, + 0.028 + ], + [ + 0.313, + 0.011, + 0.011 + ], + [ + 0.407, + 0.007, + 0.007 + ], + [ + 0.326, + 0.005, + 0.005 + ], + [ + 0.364, + 0.015, + 0.019 + ], + [ + 0.364, + 0.008, + 0.008 + ], + [ + 0.315, + 0.005, + 0.004 + ], + [ + 0.298, + 0.008, + 0.006 + ] + ] +} diff --git a/umbra-s3/results/20260622/c8g.4xlarge.json b/umbra-s3/results/20260622/c8g.4xlarge.json new file mode 100644 index 0000000000..dac3f8309c --- /dev/null +++ b/umbra-s3/results/20260622/c8g.4xlarge.json @@ -0,0 +1,235 @@ +{ + "system": "Umbra (S3)", + "date": "2026-06-22", + "machine": "c8g.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "PostgreSQL compatible" + ], + "load_time": 106.093, + "data_size": 9626344496, + 
"concurrent_qps": 3.433, + "concurrent_error_ratio": 0.214, + "result": [ + [ + 0.102, + 0.011, + 0.012 + ], + [ + 1.096, + 0.003, + 0.003 + ], + [ + 1.658, + 0.022, + 0.022 + ], + [ + 0.657, + 0.023, + 0.023 + ], + [ + 0.571, + 0.061, + 0.061 + ], + [ + 0.759, + 0.09, + 0.091 + ], + [ + 0.554, + 0.016, + 0.016 + ], + [ + 0.612, + 0.005, + 0.005 + ], + [ + 0.903, + 0.075, + 0.074 + ], + [ + 1.559, + 0.105, + 0.106 + ], + [ + 0.635, + 0.012, + 0.012 + ], + [ + 0.848, + 0.013, + 0.013 + ], + [ + 0.705, + 0.076, + 0.073 + ], + [ + 0.766, + 0.137, + 0.141 + ], + [ + 0.712, + 0.083, + 0.082 + ], + [ + 0.354, + 0.065, + 0.066 + ], + [ + 1.005, + 0.154, + 0.149 + ], + [ + 0.67, + 0.11, + 0.11 + ], + [ + 1.274, + 0.283, + 0.283 + ], + [ + 0.378, + 0.001, + 0.001 + ], + [ + 1.324, + 0.417, + 0.416 + ], + [ + 1.352, + 0.064, + 0.064 + ], + [ + 2.716, + 0.094, + 0.094 + ], + [ + 4.944, + 0.566, + 0.543 + ], + [ + 0.22, + 0.076, + 0.078 + ], + [ + 0.356, + 0.006, + 0.006 + ], + [ + 0.216, + 0.072, + 0.065 + ], + [ + 1.216, + 0.214, + 0.216 + ], + [ + 1.674, + 0.814, + 0.809 + ], + [ + 0.348, + 0.026, + 0.026 + ], + [ + 1.363, + 0.034, + 0.034 + ], + [ + 1.8, + 0.043, + 0.042 + ], + [ + 1.618, + 0.426, + 0.426 + ], + [ + 1.406, + 0.326, + 0.303 + ], + [ + 1.251, + 0.317, + 0.311 + ], + [ + 0.335, + 0.063, + 0.063 + ], + [ + 0.344, + 0.007, + 0.007 + ], + [ + 0.294, + 0.004, + 0.004 + ], + [ + 0.398, + 0.002, + 0.002 + ], + [ + 0.366, + 0.011, + 0.011 + ], + [ + 0.408, + 0.003, + 0.003 + ], + [ + 0.228, + 0.002, + 0.003 + ], + [ + 0.15, + 0.005, + 0.005 + ] + ] +} diff --git a/umbra-s3/results/20260622/c8g.metal-48xl.json b/umbra-s3/results/20260622/c8g.metal-48xl.json new file mode 100644 index 0000000000..1b3d23120d --- /dev/null +++ b/umbra-s3/results/20260622/c8g.metal-48xl.json @@ -0,0 +1,235 @@ +{ + "system": "Umbra (S3)", + "date": "2026-06-22", + "machine": "c8g.metal-48xl", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + 
"C++", + "column-oriented", + "PostgreSQL compatible" + ], + "load_time": 17.345, + "data_size": 10000818973, + "concurrent_qps": 36.748, + "concurrent_error_ratio": 0.0, + "result": [ + [ + 0.05, + 0.004, + 0.004 + ], + [ + 0.805, + 0.002, + 0.001 + ], + [ + 0.641, + 0.005, + 0.005 + ], + [ + 0.382, + 0.006, + 0.006 + ], + [ + 0.353, + 0.024, + 0.024 + ], + [ + 0.522, + 0.042, + 0.039 + ], + [ + 0.379, + 0.004, + 0.004 + ], + [ + 0.463, + 0.002, + 0.002 + ], + [ + 1.154, + 0.033, + 0.033 + ], + [ + 0.821, + 0.054, + 0.055 + ], + [ + 0.448, + 0.017, + 0.016 + ], + [ + 0.525, + 0.015, + 0.015 + ], + [ + 0.404, + 0.037, + 0.036 + ], + [ + 0.552, + 0.063, + 0.061 + ], + [ + 0.424, + 0.038, + 0.037 + ], + [ + 0.396, + 0.029, + 0.029 + ], + [ + 0.449, + 0.06, + 0.058 + ], + [ + 1.097, + 0.03, + 0.03 + ], + [ + 0.657, + 0.121, + 0.115 + ], + [ + 0.34, + 0.002, + 0.001 + ], + [ + 5.252, + 0.079, + 0.079 + ], + [ + 0.573, + 0.017, + 0.017 + ], + [ + 0.903, + 0.029, + 0.03 + ], + [ + 17.331, + 0.506, + 0.593 + ], + [ + 0.378, + 0.071, + 0.115 + ], + [ + 0.72, + 0.003, + 0.003 + ], + [ + 0.223, + 0.126, + 0.145 + ], + [ + 0.959, + 0.049, + 0.048 + ], + [ + 5.323, + 0.16, + 0.162 + ], + [ + 0.337, + 0.009, + 0.008 + ], + [ + 0.755, + 0.015, + 0.015 + ], + [ + 0.813, + 0.02, + 0.021 + ], + [ + 0.654, + 0.173, + 0.173 + ], + [ + 5.458, + 0.182, + 0.177 + ], + [ + 0.72, + 0.195, + 0.178 + ], + [ + 0.447, + 0.023, + 0.024 + ], + [ + 0.24, + 0.009, + 0.009 + ], + [ + 0.425, + 0.008, + 0.007 + ], + [ + 0.256, + 0.006, + 0.005 + ], + [ + 0.338, + 0.013, + 0.013 + ], + [ + 0.614, + 0.007, + 0.006 + ], + [ + 0.212, + 0.006, + 0.005 + ], + [ + 0.207, + 0.013, + 0.012 + ] + ] +} diff --git a/umbra-s3/start b/umbra-s3/start new file mode 100755 index 0000000000..9c990297eb --- /dev/null +++ b/umbra-s3/start @@ -0,0 +1,18 @@ +#!/bin/bash +set -eu + +if PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -c 'SELECT 1' >/dev/null 2>&1; then + exit 0 +fi + +sudo docker stop umbradb 
>/dev/null 2>&1 || true +sudo docker rm umbradb >/dev/null 2>&1 || true + +sudo docker run -d --name umbradb \ + -v "$(pwd)/db:/var/db" \ + -v "$(pwd)/data:/data" \ + -p 5432:5432 \ + --privileged \ + --ulimit nofile=1048576:1048576 \ + --ulimit memlock=-1:-1 \ + umbradb/umbra:latest >/dev/null diff --git a/umbra-s3/stop b/umbra-s3/stop new file mode 100755 index 0000000000..890229a5b6 --- /dev/null +++ b/umbra-s3/stop @@ -0,0 +1,3 @@ +#!/bin/bash + +sudo docker stop umbradb >/dev/null 2>&1 || true diff --git a/umbra-s3/template.json b/umbra-s3/template.json new file mode 100644 index 0000000000..93d0f1eba9 --- /dev/null +++ b/umbra-s3/template.json @@ -0,0 +1,11 @@ +{ + "system": "Umbra (S3)", + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "PostgreSQL compatible" + ] +} diff --git a/umbra/benchmark.sh b/umbra/benchmark.sh index 432bd3ae17..b278e40528 100755 --- a/umbra/benchmark.sh +++ b/umbra/benchmark.sh @@ -1,5 +1,5 @@ #!/bin/bash # Thin shim — actual flow is in lib/benchmark-common.sh. -export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv" +export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single" export BENCH_DURABLE=yes exec ../lib/benchmark-common.sh diff --git a/umbra/create.sql b/umbra/create.sql index 2d2cbe4afb..26bde2ea4f 100644 --- a/umbra/create.sql +++ b/umbra/create.sql @@ -103,7 +103,123 @@ create table hits ( hasgclid smallint not null, refererhash bigint not null, urlhash bigint not null, - clid integer not null, - primary key (counterid, eventdate, userid, eventtime, watchid) -); -copy hits from '/data/hits.tsv' with (format text); + clid integer not null +) with (compression=zstd); + +-- Ingest from the Athena parquet rather than the TSV. 
Umbra's COPY-from- +-- parquet path deadlocks on the second bulk op and the row-group reader had +-- crashes (both worked around / fixed), so we load via the umbra.parquetview +-- table function in a single INSERT — one bulk op, inline type conversion, no +-- staging. The function lives only in the Umbra-mode function table (hence the +-- umbra. qualifier) and surfaces columns CamelCased, so each must be double- +-- quoted. EventTime/ClientEventTime/LocalEventTime are unix-second int64s; +-- EventDate is a uint16 day count from the epoch. Path is the container's +-- '/data' bind mount. +insert into hits +select + "WatchID", + "JavaEnable", + "Title", + "GoodEvent", + to_timestamp("EventTime")::timestamp, + (DATE '1970-01-01' + "EventDate"::int), + "CounterID", + "ClientIP", + "RegionID", + "UserID", + "CounterClass", + "OS", + "UserAgent", + "URL", + "Referer", + "IsRefresh", + "RefererCategoryID", + "RefererRegionID", + "URLCategoryID", + "URLRegionID", + "ResolutionWidth", + "ResolutionHeight", + "ResolutionDepth", + "FlashMajor", + "FlashMinor", + "FlashMinor2", + "NetMajor", + "NetMinor", + "UserAgentMajor", + "UserAgentMinor", + "CookieEnable", + "JavascriptEnable", + "IsMobile", + "MobilePhone", + "MobilePhoneModel", + "Params", + "IPNetworkID", + "TraficSourceID", + "SearchEngineID", + "SearchPhrase", + "AdvEngineID", + "IsArtifical", + "WindowClientWidth", + "WindowClientHeight", + "ClientTimeZone", + to_timestamp("ClientEventTime")::timestamp, + "SilverlightVersion1", + "SilverlightVersion2", + "SilverlightVersion3", + "SilverlightVersion4", + "PageCharset", + "CodeVersion", + "IsLink", + "IsDownload", + "IsNotBounce", + "FUniqID", + "OriginalURL", + "HID", + "IsOldCounter", + "IsEvent", + "IsParameter", + "DontCountHits", + "WithHash", + "HitColor", + to_timestamp("LocalEventTime")::timestamp, + "Age", + "Sex", + "Income", + "Interests", + "Robotness", + "RemoteIP", + "WindowName", + "OpenerName", + "HistoryLength", + "BrowserLanguage", + 
"BrowserCountry", + "SocialNetwork", + "SocialAction", + "HTTPError", + "SendTiming", + "DNSTiming", + "ConnectTiming", + "ResponseStartTiming", + "ResponseEndTiming", + "FetchTiming", + "SocialSourceNetworkID", + "SocialSourcePage", + "ParamPrice", + "ParamOrderID", + "ParamCurrency", + "ParamCurrencyID", + "OpenstatServiceName", + "OpenstatCampaignID", + "OpenstatAdID", + "OpenstatSourceID", + "UTMSource", + "UTMMedium", + "UTMCampaign", + "UTMContent", + "UTMTerm", + "FromTag", + "HasGCLID", + "RefererHash", + "URLHash", + "CLID" +from umbra.parquetview('/data/hits.parquet'); diff --git a/umbra/load b/umbra/load index f0c1addae2..81152783ed 100755 --- a/umbra/load +++ b/umbra/load @@ -2,7 +2,7 @@ set -eu mkdir -p data -mv hits.tsv data/ +mv hits.parquet data/ chmod -R 777 data # create.sql for umbra both creates the table and ingests via COPY. Use @@ -16,14 +16,14 @@ PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres \ # partial table on memory-constrained hosts (16 GB c6a.4xlarge can't # hold the full mmap working set), letting the benchmark proceed and # producing implausibly fast warm timings on the surviving subset. -# ClickBench's hits dataset is 99,997,497 rows; allow a small margin. +# ClickBench's hits dataset is exactly 99,997,497 rows. 
expected=99997497 got=$(PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres -tAq \ -c 'SELECT count(*) FROM hits') -if [ "$got" -lt $((expected - 100)) ]; then - echo "umbra/load: hits has $got rows, expected ~$expected — partial load" >&2 +if [ "$got" -ne "$expected" ]; then + echo "umbra/load: hits has $got rows, expected $expected — partial load" >&2 exit 1 fi -rm -f data/hits.tsv +rm -f data/hits.parquet sync diff --git a/umbra/results/20260619/c6a.4xlarge.json b/umbra/results/20260619/c6a.4xlarge.json new file mode 100644 index 0000000000..7b32eb328a --- /dev/null +++ b/umbra/results/20260619/c6a.4xlarge.json @@ -0,0 +1,235 @@ +{ + "system": "Umbra", + "date": "2026-06-19", + "machine": "c6a.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "PostgreSQL compatible" + ], + "load_time": 118.635, + "data_size": 9642782263, + "concurrent_qps": 7.443, + "concurrent_error_ratio": 0.325, + "result": [ + [ + 0.149, + 0.008, + 0.008 + ], + [ + 0.22, + 0.004, + 0.004 + ], + [ + 0.221, + 0.027, + 0.026 + ], + [ + 0.296, + 0.028, + 0.027 + ], + [ + 0.348, + 0.134, + 0.134 + ], + [ + 0.498, + 0.171, + 0.171 + ], + [ + 0.194, + 0.024, + 0.025 + ], + [ + 0.244, + 0.005, + 0.005 + ], + [ + 0.46, + 0.161, + 0.16 + ], + [ + 0.603, + 0.231, + 0.227 + ], + [ + 0.392, + 0.026, + 0.026 + ], + [ + 0.271, + 0.028, + 0.028 + ], + [ + 0.523, + 0.161, + 0.16 + ], + [ + 1.144, + 0.306, + 0.307 + ], + [ + 0.547, + 0.179, + 0.178 + ], + [ + 0.393, + 0.169, + 0.171 + ], + [ + 1.055, + 0.361, + 0.362 + ], + [ + 0.973, + 0.222, + 0.222 + ], + [ + 3.378, + 0.866, + 0.829 + ], + [ + 0.348, + 0.002, + 0.002 + ], + [ + 4.185, + 0.273, + 0.273 + ], + [ + 5.07, + 0.082, + 0.081 + ], + [ + 9.086, + 0.147, + 0.147 + ], + [ + 1.833, + 0.011, + 0.011 + ], + [ + 0.236, + 0.018, + 0.006 + ], + [ + 0.256, + 0.01, + 0.01 + ], + [ + 0.248, + 0.006, + 0.006 + ], + [ + 4.202, + 0.287, + 0.289 + ], + [ + 4.2, + 1.51, + 
1.503 + ], + [ + 0.167, + 0.028, + 0.03 + ], + [ + 0.785, + 0.083, + 0.083 + ], + [ + 3.813, + 0.131, + 0.129 + ], + [ + 3.974, + 1.315, + 1.314 + ], + [ + 4.468, + 0.926, + 0.779 + ], + [ + 4.462, + 0.863, + 0.779 + ], + [ + 0.299, + 0.127, + 0.124 + ], + [ + 0.252, + 0.01, + 0.011 + ], + [ + 0.233, + 0.006, + 0.006 + ], + [ + 0.27, + 0.003, + 0.003 + ], + [ + 0.306, + 0.026, + 0.022 + ], + [ + 0.224, + 0.004, + 0.003 + ], + [ + 0.212, + 0.003, + 0.003 + ], + [ + 0.217, + 0.004, + 0.004 + ] + ] +} diff --git a/umbra/results/20260619/c6a.metal.json b/umbra/results/20260619/c6a.metal.json new file mode 100644 index 0000000000..66a9f0d381 --- /dev/null +++ b/umbra/results/20260619/c6a.metal.json @@ -0,0 +1,235 @@ +{ + "system": "Umbra", + "date": "2026-06-19", + "machine": "c6a.metal", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "PostgreSQL compatible" + ], + "load_time": 42.112, + "data_size": 9929244880, + "concurrent_qps": 15.512, + "concurrent_error_ratio": 0.098, + "result": [ + [ + 0.052, + 0.003, + 0.003 + ], + [ + 0.183, + 0.003, + 0.003 + ], + [ + 0.195, + 0.009, + 0.01 + ], + [ + 0.294, + 0.011, + 0.009 + ], + [ + 0.388, + 0.069, + 0.067 + ], + [ + 0.413, + 0.068, + 0.069 + ], + [ + 0.167, + 0.011, + 0.009 + ], + [ + 0.233, + 0.004, + 0.004 + ], + [ + 0.547, + 0.08, + 0.08 + ], + [ + 0.542, + 0.118, + 0.119 + ], + [ + 0.307, + 0.019, + 0.02 + ], + [ + 0.318, + 0.017, + 0.017 + ], + [ + 0.324, + 0.064, + 0.063 + ], + [ + 1.146, + 0.113, + 0.113 + ], + [ + 0.402, + 0.081, + 0.067 + ], + [ + 0.477, + 0.098, + 0.097 + ], + [ + 1.141, + 0.139, + 0.139 + ], + [ + 1.123, + 0.051, + 0.05 + ], + [ + 3.251, + 0.379, + 0.35 + ], + [ + 0.306, + 0.003, + 0.002 + ], + [ + 4.435, + 0.061, + 0.059 + ], + [ + 5.323, + 0.026, + 0.027 + ], + [ + 9.721, + 0.041, + 0.044 + ], + [ + 2.492, + 0.042, + 0.082 + ], + [ + 0.205, + 0.005, + 0.005 + ], + [ + 0.371, + null, + null + ], + [ + 0.233, + 
0.006, + 0.005 + ], + [ + 4.365, + 0.068, + 0.07 + ], + [ + 4.292, + 0.334, + 0.342 + ], + [ + 0.068, + 0.014, + 0.013 + ], + [ + 0.76, + 0.037, + 0.037 + ], + [ + 3.792, + 0.053, + 0.053 + ], + [ + 3.344, + 0.538, + 0.535 + ], + [ + 4.522, + 0.372, + 0.317 + ], + [ + 4.513, + 0.369, + 0.309 + ], + [ + 0.331, + 0.068, + 0.067 + ], + [ + 0.151, + 0.011, + 0.012 + ], + [ + 0.137, + 0.006, + 0.006 + ], + [ + 0.174, + 0.009, + 0.005 + ], + [ + 0.338, + 0.016, + 0.015 + ], + [ + 0.114, + 0.01, + 0.009 + ], + [ + 0.108, + 0.006, + 0.005 + ], + [ + 0.13, + 0.01, + 0.009 + ] + ] +} diff --git a/umbra/results/20260619/c7a.metal-48xl.json b/umbra/results/20260619/c7a.metal-48xl.json new file mode 100644 index 0000000000..892e77b195 --- /dev/null +++ b/umbra/results/20260619/c7a.metal-48xl.json @@ -0,0 +1,235 @@ +{ + "system": "Umbra", + "date": "2026-06-19", + "machine": "c7a.metal-48xl", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "PostgreSQL compatible" + ], + "load_time": 40.795, + "data_size": 9903048592, + "concurrent_qps": 28.213, + "concurrent_error_ratio": 0.132, + "result": [ + [ + 0.066, + 0.003, + 0.003 + ], + [ + 0.275, + 0.002, + 0.002 + ], + [ + 0.295, + 0.01, + 0.009 + ], + [ + 0.383, + 0.01, + 0.008 + ], + [ + 0.488, + 0.025, + 0.025 + ], + [ + 0.49, + 0.039, + 0.039 + ], + [ + 0.264, + 0.009, + 0.009 + ], + [ + 0.285, + 0.003, + 0.003 + ], + [ + 0.582, + 0.033, + 0.033 + ], + [ + 0.612, + 0.044, + 0.042 + ], + [ + 0.475, + 0.014, + 0.014 + ], + [ + 0.449, + 0.013, + 0.013 + ], + [ + 0.432, + 0.045, + 0.044 + ], + [ + 1.186, + 0.067, + 0.064 + ], + [ + 0.454, + 0.049, + 0.047 + ], + [ + 0.477, + 0.039, + 0.036 + ], + [ + 1.2, + 0.064, + 0.062 + ], + [ + 1.207, + 0.04, + 0.04 + ], + [ + 3.198, + 0.124, + 0.121 + ], + [ + 0.394, + 0.002, + 0.002 + ], + [ + 4.541, + 0.04, + 0.04 + ], + [ + 5.432, + 0.014, + 0.015 + ], + [ + 9.762, + 0.023, + 0.021 + ], + [ + 3.27, + 0.011, + 0.011 
+ ], + [ + 0.439, + 0.003, + 0.003 + ], + [ + 0.432, + 0.004, + 0.004 + ], + [ + 0.47, + 0.003, + 0.003 + ], + [ + 4.588, + 0.045, + 0.046 + ], + [ + 4.486, + 0.216, + 0.21 + ], + [ + 0.337, + 0.011, + 0.011 + ], + [ + 0.977, + 0.032, + 0.029 + ], + [ + 3.99, + 0.03, + 0.03 + ], + [ + 3.287, + 0.182, + 0.18 + ], + [ + 4.647, + 0.173, + 0.165 + ], + [ + 4.638, + 0.176, + 0.167 + ], + [ + 0.423, + 0.029, + 0.03 + ], + [ + 0.392, + 0.011, + 0.011 + ], + [ + 0.178, + 0.009, + 0.009 + ], + [ + 0.196, + 0.004, + 0.007 + ], + [ + 0.537, + 0.014, + 0.015 + ], + [ + 0.181, + 0.009, + 0.009 + ], + [ + 0.151, + 0.004, + 0.004 + ], + [ + 0.143, + 0.01, + 0.01 + ] + ] +} diff --git a/umbra/results/20260619/c8g.4xlarge.json b/umbra/results/20260619/c8g.4xlarge.json new file mode 100644 index 0000000000..e24d75d579 --- /dev/null +++ b/umbra/results/20260619/c8g.4xlarge.json @@ -0,0 +1,235 @@ +{ + "system": "Umbra", + "date": "2026-06-19", + "machine": "c8g.4xlarge", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "PostgreSQL compatible" + ], + "load_time": 106.836, + "data_size": 9644778608, + "concurrent_qps": 3.655, + "concurrent_error_ratio": 0.283, + "result": [ + [ + 0.125, + 0.012, + 0.012 + ], + [ + 0.137, + 0.003, + 0.003 + ], + [ + 0.151, + 0.022, + 0.022 + ], + [ + 0.28, + 0.024, + 0.024 + ], + [ + 0.272, + 0.064, + 0.064 + ], + [ + 0.332, + 0.097, + 0.097 + ], + [ + 0.136, + 0.016, + 0.016 + ], + [ + 0.149, + 0.004, + 0.004 + ], + [ + 0.365, + 0.074, + 0.077 + ], + [ + 0.501, + 0.109, + 0.109 + ], + [ + 0.29, + 0.013, + 0.013 + ], + [ + 0.321, + 0.014, + 0.014 + ], + [ + 0.317, + 0.083, + 0.084 + ], + [ + 0.955, + 0.149, + 0.151 + ], + [ + 0.383, + 0.087, + 0.087 + ], + [ + 0.307, + 0.075, + 0.074 + ], + [ + 0.89, + 0.161, + 0.16 + ], + [ + 0.869, + 0.113, + 0.113 + ], + [ + 2.935, + 0.315, + 0.316 + ], + [ + 0.239, + 0.001, + 0.001 + ], + [ + 4.14, + 0.427, + 0.425 + ], + [ + 4.984, + 
0.068, + 0.071 + ], + [ + 9.048, + 0.108, + 0.1 + ], + [ + 1.851, + 0.019, + 0.019 + ], + [ + 0.228, + 0.003, + 0.003 + ], + [ + 0.257, + 0.006, + 0.006 + ], + [ + 0.255, + 0.003, + 0.003 + ], + [ + 4.159, + 0.223, + 0.223 + ], + [ + 4.109, + 0.859, + 0.847 + ], + [ + 0.141, + 0.026, + 0.026 + ], + [ + 0.699, + 0.035, + 0.035 + ], + [ + 3.704, + 0.046, + 0.046 + ], + [ + 3.248, + 0.475, + 0.48 + ], + [ + 4.256, + 0.377, + 0.348 + ], + [ + 4.256, + 0.371, + 0.383 + ], + [ + 0.274, + 0.064, + 0.064 + ], + [ + 0.193, + 0.007, + 0.007 + ], + [ + 0.179, + 0.004, + 0.004 + ], + [ + 0.203, + 0.003, + 0.003 + ], + [ + 0.239, + 0.012, + 0.012 + ], + [ + 0.185, + 0.003, + 0.003 + ], + [ + 0.165, + 0.003, + 0.003 + ], + [ + 0.164, + 0.005, + 0.005 + ] + ] +} diff --git a/umbra/results/20260619/c8g.metal-48xl.json b/umbra/results/20260619/c8g.metal-48xl.json new file mode 100644 index 0000000000..abcc4856ee --- /dev/null +++ b/umbra/results/20260619/c8g.metal-48xl.json @@ -0,0 +1,235 @@ +{ + "system": "Umbra", + "date": "2026-06-19", + "machine": "c8g.metal-48xl", + "cluster_size": 1, + "proprietary": "yes", + "hardware": "cpu", + "tuned": "no", + "tags": [ + "C++", + "column-oriented", + "PostgreSQL compatible" + ], + "load_time": 39.797, + "data_size": 9874885928, + "concurrent_qps": 30.252, + "concurrent_error_ratio": 0.223, + "result": [ + [ + 0.044, + 0.004, + 0.003 + ], + [ + 0.132, + 0.001, + 0.001 + ], + [ + 0.119, + 0.005, + 0.005 + ], + [ + 0.242, + 0.006, + 0.006 + ], + [ + 0.255, + 0.021, + 0.024 + ], + [ + 0.273, + 0.039, + 0.038 + ], + [ + 0.101, + 0.004, + 0.004 + ], + [ + 0.114, + 0.003, + 0.003 + ], + [ + 0.834, + 0.035, + 0.033 + ], + [ + 0.474, + 0.056, + 0.056 + ], + [ + 0.27, + 0.017, + 0.016 + ], + [ + 0.251, + 0.018, + 0.018 + ], + [ + 0.236, + 0.035, + 0.037 + ], + [ + 1.073, + 0.076, + 0.061 + ], + [ + 0.266, + 0.038, + 0.038 + ], + [ + 0.294, + 0.029, + 0.029 + ], + [ + 1.027, + 0.059, + 0.059 + ], + [ + 1.462, + 0.05, + 0.03 + ], + [ + 3.043, + 
0.123, + 0.115 + ], + [ + 0.205, + 0.001, + 0.001 + ], + [ + 4.399, + 0.083, + 0.084 + ], + [ + 5.269, + 0.018, + 0.018 + ], + [ + 9.576, + 0.032, + 0.029 + ], + [ + 2.706, + 0.006, + 0.006 + ], + [ + 0.254, + 0.002, + 0.002 + ], + [ + 0.297, + 0.003, + 0.002 + ], + [ + 0.277, + 0.002, + 0.002 + ], + [ + 4.417, + 0.054, + 0.055 + ], + [ + 4.29, + 0.165, + 0.164 + ], + [ + 0.114, + 0.008, + 0.008 + ], + [ + 0.868, + 0.016, + 0.016 + ], + [ + 3.844, + 0.02, + 0.02 + ], + [ + 3.131, + 0.172, + 0.175 + ], + [ + 4.462, + 0.177, + 0.169 + ], + [ + 4.448, + 0.169, + 0.16 + ], + [ + 0.217, + 0.024, + 0.024 + ], + [ + 0.179, + 0.01, + 0.009 + ], + [ + 0.174, + 0.007, + 0.006 + ], + [ + 0.175, + 0.006, + 0.005 + ], + [ + 0.433, + 0.014, + 0.014 + ], + [ + 0.156, + 0.007, + 0.007 + ], + [ + 0.146, + 0.005, + 0.005 + ], + [ + 0.148, + 0.012, + 0.009 + ] + ] +} diff --git a/umbra/start b/umbra/start index 097802346f..de497b7778 100755 --- a/umbra/start +++ b/umbra/start @@ -8,68 +8,12 @@ fi sudo docker stop umbradb >/dev/null 2>&1 || true sudo docker rm umbradb >/dev/null 2>&1 || true -# Umbra's working set during the ClickBench COPY blows well past the -# guest VM's 16 GiB RAM. The agent has already mkswap'd + swapon'd a -# 256 GiB swap.raw block device, so what we need is: -# - vm.overcommit_memory=1 so the kernel doesn't refuse a single -# huge mmap (default heuristic mode rejects allocations that -# would exceed physical RAM + swap by a wide margin). -# - vm.swappiness=100 to bias the kernel toward paging anonymous -# memory out as soon as we exceed physical RAM (default 60 is -# too conservative — Umbra ENOMEMs before the kernel reclaims -# enough). -# - vm.max_map_count raised. Umbra issues a large number of small -# mmaps; the 65530 default is easy to hit on a 100 M-row COPY. -# - NO docker memory cgroup. 
cgroup v2 silently discards -# --memory-swappiness, and any --memory cap creates a hard -# ceiling that the kernel will OOM on regardless of how much -# swap is available. Let the host kernel manage memory. -sudo sysctl -wq vm.overcommit_memory=1 vm.swappiness=100 \ - vm.max_map_count=1048576 || true - sudo docker run -d --name umbradb \ -v "$(pwd)/db:/var/db" \ -v "$(pwd)/data:/data" \ -p 5432:5432 \ + --privileged \ --ulimit nofile=1048576:1048576 \ --ulimit memlock=-1:-1 \ + -e ASYNCIO=0 \ umbradb/umbra:latest >/dev/null - -# Container needs a moment before psql can connect. -for _ in $(seq 1 60); do - if PGPASSWORD=postgres psql -p 5432 -h 127.0.0.1 -U postgres \ - -c 'SELECT 1' >/dev/null 2>&1; then - # Diagnostic dump so a future OOM during load lands with the - # memory/swap state of the VM in the provision log. Previously - # silent — every "unable to allocate memory" failure looked - # the same and we couldn't tell whether the agent's mkswap+ - # swapon ran, whether the container saw the swap, or whether - # the sysctl tweaks above stuck. 
- echo "=== umbra: VM memory state ===" - free -h || true - echo "=== umbra: swap state ===" - swapon --show=NAME,SIZE,USED,PRIO --bytes || true - echo "=== umbra: sysctl ===" - for k in vm.overcommit_memory vm.swappiness vm.max_map_count \ - vm.overcommit_ratio; do - echo " $k = $(sysctl -n $k 2>/dev/null)" - done - echo "=== umbra: container memory cgroup ===" - sudo docker inspect umbradb --format \ - 'memory={{.HostConfig.Memory}} memory-swap={{.HostConfig.MemorySwap}}' || true - echo "=== umbra: container memlock ulimit ===" - sudo docker exec umbradb sh -c 'ulimit -l' 2>&1 || true - cgpath=$(sudo docker inspect umbradb --format '{{.State.Pid}}' 2>/dev/null | \ - xargs -I{} cat /proc/{}/cgroup 2>/dev/null | awk -F: '{print $NF}') - if [ -n "$cgpath" ]; then - for f in memory.max memory.swap.max memory.swap.current; do - p="/sys/fs/cgroup${cgpath}/$f" - [ -r "$p" ] && echo " $f = $(cat "$p")" - done - fi - echo "=== umbra: container procs ===" - sudo docker top umbradb -eo pid,vsz,rss,comm 2>&1 | head -10 - exit 0 - fi - sleep 1 -done