diff --git a/.gitignore b/.gitignore
index b4b68d0..1c912e4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -148,5 +148,7 @@ dmypy.json
# Cython debug symbols
cython_debug/
-# DS_STORE
+# DS_STORE
.DS_Store
+
+benchmark_results
diff --git a/BENCHMARK.md b/BENCHMARK.md
new file mode 100644
index 0000000..025c159
--- /dev/null
+++ b/BENCHMARK.md
@@ -0,0 +1,349 @@
+# PyDeequ Benchmark
+
+Benchmark harness for comparing DuckDB and Spark engine performance.
+
+## Design Overview
+
+### Architecture
+
+```
+benchmark_cli.py # CLI entry point
+benchmark/
+├── config.py # Configuration dataclasses
+├── experiments.py # Experiment logic (data gen, checks, profiling)
+├── worker.py # Subprocess worker for process isolation
+├── spark_server.py # Auto Spark Connect server management
+├── results.py # Results storage and merging
+├── report.py # Markdown report generation
+└── visualize.py # PNG chart generation
+```
+
+### Process Isolation
+
+Each engine runs in a separate subprocess (see the sketch after this list) to ensure:
+- Clean JVM state for Spark
+- Independent memory allocation
+- No cross-contamination between engines
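+
+A minimal sketch of the isolation pattern, assuming a worker CLI of the form `python -m benchmark.worker --engine <name>`; the real interface in `benchmark/worker.py` may differ:
+
+```python
+import json
+import subprocess
+import sys
+
+def run_engine_isolated(engine: str, config_json: str) -> dict:
+    """Run one engine's experiments in a fresh subprocess, parse JSON from stdout."""
+    proc = subprocess.run(
+        [sys.executable, "-m", "benchmark.worker", "--engine", engine],
+        input=config_json,   # experiment config passed on stdin (assumption)
+        capture_output=True,
+        text=True,
+        check=True,          # surface worker crashes as exceptions
+    )
+    return json.loads(proc.stdout)
+```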
+
+### Data Pipeline
+
+1. **Generate** synthetic mixed-type data (strings, floats, ints)
+2. **Cache** as Parquet files with optimized row groups
+3. **Load** from the same Parquet files for both engines (fair comparison; see the sketch below)
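+
+Condensed, using the helpers this PR adds in `benchmark/experiments.py`:
+
+```python
+import os
+
+from benchmark.experiments import generate_rich_data, save_to_parquet
+
+cache_dir = os.path.expanduser("~/.deequ_benchmark_data")
+df = generate_rich_data(n_rows=100_000, n_extra_cols=0)  # 1. generate
+path = save_to_parquet(df, cache_dir, "demo_100k")       # 2. cache as Parquet
+# 3. load the same file in both engines:
+#    DuckDB: pydeequ.connect(con, table=f"read_parquet('{path}')")
+#    Spark:  spark.read.parquet(path)
+```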
+
+## Experiments
+
+### 1. Varying Rows
+- Fixed: 10 columns, 16 data quality checks
+- Variable: 100K to 130M rows
+- Measures: Validation time scaling with data size
+
+### 2. Varying Columns
+- Fixed: 1M rows
+- Variable: 10 to 80 columns (16 to 226 checks)
+- Measures: Validation time scaling with schema complexity
+
+### 3. Column Profiling
+- Fixed: 10 columns
+- Variable: 100K to 10M rows
+- Measures: Full column profiling performance
+
+## Results
+
+Benchmark run on an Apple M3 Max (14 cores) running macOS (Darwin 25.2.0).
+
+
+
+### Experiment 1: Varying Rows
+
+| Rows | DuckDB (s) | Spark (s) | Speedup |
+|------|------------|-----------|---------|
+| 100K | 0.034 | 0.662 | **19.5x** |
+| 1M | 0.071 | 1.648 | **23.2x** |
+| 5M | 0.167 | 2.470 | **14.8x** |
+| 10M | 0.268 | 3.239 | **12.1x** |
+| 50M | 1.114 | 12.448 | **11.2x** |
+| 130M | 2.752 | 28.404 | **10.3x** |
+
+### Experiment 2: Varying Columns
+
+| Cols | Checks | DuckDB (s) | Spark (s) | Speedup |
+|------|--------|------------|-----------|---------|
+| 10 | 16 | 0.076 | 1.619 | **21.3x** |
+| 20 | 46 | 0.081 | 2.078 | **25.7x** |
+| 40 | 106 | 0.121 | 2.781 | **23.0x** |
+| 80 | 226 | 0.177 | 4.258 | **24.1x** |
+
+### Experiment 3: Column Profiling
+
+| Rows | DuckDB (s) | Spark (s) | Speedup |
+|------|------------|-----------|---------|
+| 100K | 0.045 | 0.585 | **13.0x** |
+| 1M | 0.288 | 0.720 | **2.5x** |
+| 5M | 1.524 | 2.351 | **1.5x** |
+| 10M | 2.993 | 3.975 | **1.3x** |
+
+### Key Takeaways
+
+1. **DuckDB is 10-23x faster** for row-scaling validation workloads
+2. **Consistent speedup across complexity** - 21-26x regardless of column count
+3. **Profiling advantage narrows with scale** - at 10M rows, DuckDB is still 1.3x faster
+4. **No JVM overhead** - DuckDB runs natively in the Python process, with no startup cost
+
+## Performance Optimizations
+
+The DuckDB engine includes several optimizations to maintain performance as check complexity increases:
+
+### Optimization 1: Grouping Operator Batching
+
+Grouping operators (Distinctness, Uniqueness, UniqueValueRatio) that share the same columns and WHERE clause are fused into a single query.
+
+**Before**: N queries for N grouping operators on the same columns
+```sql
+-- Query 1: Distinctness
+WITH freq AS (SELECT cols, COUNT(*) AS cnt FROM t GROUP BY cols)
+SELECT COUNT(*) AS distinct_count, SUM(cnt) AS total_count FROM freq
+
+-- Query 2: Uniqueness
+WITH freq AS (SELECT cols, COUNT(*) AS cnt FROM t GROUP BY cols)
+SELECT SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count, SUM(cnt) AS total_count FROM freq
+```
+
+**After**: 1 query computing all metrics
+```sql
+WITH freq AS (SELECT cols, COUNT(*) AS cnt FROM t GROUP BY cols)
+SELECT
+ COUNT(*) AS distinct_count,
+ SUM(cnt) AS total_count,
+ SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count
+FROM freq
+```
+
+**Impact**: 20-40% improvement for checks with multiple grouping operators
+
+### Optimization 2: Multi-Column Profiling
+
+Profile statistics for all columns are batched into 2-3 queries instead of 2-3 queries per column.
+
+**Before**: 20-30 queries for 10 columns
+```sql
+-- Per-column queries for completeness, numeric stats, percentiles
+SELECT COUNT(*), SUM(CASE WHEN col1 IS NULL...) FROM t
+SELECT MIN(col1), MAX(col1), AVG(col1)... FROM t
+SELECT QUANTILE_CONT(col1, 0.25)... FROM t
+-- Repeated for each column
+```
+
+**After**: 3 queries total
+```sql
+-- Query 1: All completeness stats
+SELECT COUNT(*), SUM(CASE WHEN col1 IS NULL...), SUM(CASE WHEN col2 IS NULL...)... FROM t
+
+-- Query 2: All numeric stats
+SELECT MIN(col1), MAX(col1), MIN(col2), MAX(col2)... FROM t
+
+-- Query 3: All percentiles
+SELECT QUANTILE_CONT(col1, 0.25), QUANTILE_CONT(col2, 0.25)... FROM t
+```
+
+**Impact**: 40-60% improvement for column profiling
+
+### Optimization 3: DuckDB Configuration
+
+Configurable engine settings optimize DuckDB for analytical workloads:
+
+```python
+from pydeequ.engines.duckdb_config import DuckDBEngineConfig
+
+config = DuckDBEngineConfig(
+ threads=8, # Control parallelism
+ memory_limit="8GB", # Memory management
+ preserve_insertion_order=False, # Better parallel execution
+ parquet_metadata_cache=True, # Faster Parquet reads
+)
+
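+# Assumes an existing duckdb connection `con`; DuckDBEngine is imported from
+# pydeequ's engine module (exact import path not shown in this snippet).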
+engine = DuckDBEngine(con, table="test", config=config)
+```
+
+**Impact**: 5-15% improvement for large parallel scans
+
+### Optimization 4: Constraint Batching
+
+Scan-based constraints (Size, Completeness, Mean, etc.) and ratio-check constraints (isPositive, isContainedIn, etc.) are batched into as few queries as possible.
+
+**Before**: 1 query per constraint
+```sql
+SELECT COUNT(*) FROM t -- Size
+SELECT COUNT(*), SUM(CASE WHEN col IS NULL...) FROM t -- Completeness
+SELECT AVG(col) FROM t -- Mean
+```
+
+**After**: 1 query for all scan-based constraints
+```sql
+SELECT
+ COUNT(*) AS size,
+ SUM(CASE WHEN col IS NULL THEN 1 ELSE 0 END) AS null_count,
+ AVG(col) AS mean
+FROM t
+```
+
+**Impact**: 20-40% improvement for checks with many constraints
+
+### Optimization 5: Query Profiling Infrastructure
+
+Built-in profiling helps identify bottlenecks and verify optimizations:
+
+```python
+engine = DuckDBEngine(con, table="test", enable_profiling=True)
+engine.run_checks([check])
+
+# Get query statistics
+stats = engine.get_query_stats()
+print(f"Query count: {engine.get_query_count()}")
+print(stats)
+
+# Get query plan for analysis
+plan = engine.explain_query("SELECT COUNT(*) FROM test")
+```
+
+### Measured Performance Improvements
+
+Benchmark comparison: Baseline (2026-01-20) vs After Optimization (2026-01-21, 5-run average)
+
+#### Experiment 2: Varying Columns (KEY METRIC - Speedup Degradation Fix)
+
+| Cols | Checks | Before DuckDB | After DuckDB | Spark | Before Speedup | After Speedup |
+|------|--------|---------------|--------------|-------|----------------|---------------|
+| 10 | 16 | 0.118s | 0.076s | 1.619s | 14.1x | **21.3x** |
+| 20 | 46 | 0.286s | 0.081s | 2.078s | 7.5x | **25.7x** |
+| 40 | 106 | 0.713s | 0.121s | 2.781s | 4.0x | **23.0x** |
+| 80 | 226 | 2.214s | 0.177s | 4.258s | 2.0x | **24.1x** |
+
+**Key Achievement**: The speedup degradation problem is **SOLVED**.
+- **Before**: Speedup degraded from 14x (10 cols) down to 2x (80 cols)
+- **After**: Speedup is consistent **~21-26x** across ALL column counts
+
+#### DuckDB-Only Performance Gains
+
+| Cols | Before | After | Improvement |
+|------|--------|-------|-------------|
+| 10 | 0.118s | 0.076s | 36% faster |
+| 20 | 0.286s | 0.081s | 72% faster |
+| 40 | 0.713s | 0.121s | 83% faster |
+| 80 | 2.214s | 0.177s | **92% faster (~12x)** |
+
+#### Experiment 1: Varying Rows (16 checks)
+
+| Rows | Before | After | Improvement |
+|------|--------|-------|-------------|
+| 100K | 0.052s | 0.034s | 35% faster |
+| 1M | 0.090s | 0.071s | 21% faster |
+| 5M | 0.221s | 0.167s | 24% faster |
+| 10M | 0.335s | 0.268s | 20% faster |
+| 50M | 1.177s | 1.114s | 5% faster |
+| 130M | 2.897s | 2.752s | 5% faster |
+
+#### Experiment 3: Column Profiling (10 columns)
+
+| Rows | Before | After | Change |
+|------|--------|-------|--------|
+| 100K | 0.086s | 0.045s | 48% faster |
+| 1M | 0.388s | 0.288s | 26% faster |
+| 5M | 1.470s | 1.524s | ~same |
+| 10M | 2.659s | 2.993s | 13% slower |
+
+Note: Profiling regresses modestly at the highest row counts due to batched-query overhead; this is the trade-off for the much larger gains in column scaling.
+
+## Quick Start
+
+### Run DuckDB Only (No Spark Required)
+
+```bash
+python benchmark_cli.py run --engine duckdb
+```
+
+### Run Both Engines
+
+```bash
+python benchmark_cli.py run --engine all
+```
+
+Auto-spark is enabled by default (see the sketch after this list). The harness will:
+1. Start a Spark Connect server
+2. Run DuckDB benchmarks
+3. Run Spark benchmarks
+4. Stop the server
+5. Merge results
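+
+For programmatic use, the same lifecycle is available as a context manager (a sketch: `managed_spark_server` is exported from `benchmark/__init__.py`, but the assumption here is that it takes a `SparkServerConfig` and stops the server on exit):
+
+```python
+from benchmark import SparkServerConfig, managed_spark_server
+
+with managed_spark_server(SparkServerConfig()) as server:
+    ...  # run the Spark-side experiments while the server is up
+```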
+
+### Run with External Spark Server
+
+```bash
+# Start server manually first, then:
+python benchmark_cli.py run --engine spark --no-auto-spark
+```
+
+## Output Structure
+
+Each run creates a timestamped folder:
+
+```
+benchmark_results/
+└── benchmark_2024-01-19T14-30-45/
+ ├── results.json # Raw timing data
+ └── BENCHMARK_RESULTS.md # Markdown report
+```
+
+## Visualize Results
+
+Generate a PNG chart comparing engine performance:
+
+```bash
+# From run folder
+python benchmark_cli.py visualize benchmark_results/benchmark_2024-01-19T14-30-45/
+
+# Custom output path
+python benchmark_cli.py visualize benchmark_results/benchmark_2024-01-19T14-30-45/ -o comparison.png
+```
+
+The chart shows:
+- **Top row**: Time comparisons (DuckDB vs Spark) for each experiment
+- **Bottom row**: Speedup ratios (how many times faster DuckDB is)
+
+## Regenerate Report
+
+```bash
+python benchmark_cli.py report benchmark_results/benchmark_2024-01-19T14-30-45/
+```
+
+## Configuration
+
+Default experiment parameters (see `benchmark/config.py`):
+
+| Parameter | Default |
+|-----------|---------|
+| Row counts | 100K, 1M, 5M, 10M, 50M, 130M |
+| Column counts | 10, 20, 40, 80 |
+| Profiling rows | 100K, 1M, 5M, 10M |
+| Validation runs | 3 (averaged) |
+| Cache directory | `~/.deequ_benchmark_data` |
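+
+These map directly onto the dataclasses in `benchmark/config.py`, so a reduced grid can be built programmatically (how a custom config is fed to the CLI is not shown in this PR):
+
+```python
+from benchmark.config import BenchmarkConfig, ExperimentConfig
+
+# A reduced grid for a quick local run; field names match benchmark/config.py.
+config = BenchmarkConfig(
+    engine="duckdb",
+    experiment=ExperimentConfig(
+        n_runs=5,
+        row_counts=[100_000, 1_000_000],
+        column_counts=[10, 20],
+    ),
+)
+print(config.to_dict())  # JSON-serializable, as stored with each run's results
+```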
+
+## Requirements
+
+- **DuckDB**: No additional setup
+- **Spark**: Requires `SPARK_HOME` and `JAVA_HOME` environment variables (or use `--spark-home`/`--java-home` flags)
+
+## Example Workflow
+
+```bash
+# 1. Run full benchmark
+python benchmark_cli.py run --engine all
+
+# 2. View results
+cat benchmark_results/benchmark_*/BENCHMARK_RESULTS.md
+
+# 3. Generate chart
+python benchmark_cli.py visualize benchmark_results/benchmark_*/
+
+# 4. Open chart
+open benchmark_results/benchmark_*/benchmark_chart.png
+```
diff --git a/Engines.md b/Engines.md
new file mode 100644
index 0000000..14d79a7
--- /dev/null
+++ b/Engines.md
@@ -0,0 +1,231 @@
+# Engine Parity Analysis Report
+
+## Executive Summary
+
+This report documents the parity testing results between the Spark and DuckDB engines in python-deequ. After implementing all fixes, **84 tests pass** and **5 tests fail** due to inherent algorithmic differences.
+
+### Current Test Results
+
+| Status | Count | Percentage |
+|--------|-------|------------|
+| **Passed** | 84 | 94.4% |
+| **Failed** | 5 | 5.6% |
+
+### Summary of Remaining Failures
+
+| Category | Test Failures | Root Cause | Fixable? |
+|----------|---------------|------------|----------|
+| Approximate Quantile | 2 tests | Different quantile algorithms | Inherent difference |
+| Approximate Count Distinct | 1 test | HyperLogLog implementation variance | Inherent variance |
+| Profile Distinct Values | 2 tests | HyperLogLog variance | Inherent variance |
+
+---
+
+## Fixes Applied
+
+### 1. STDDEV_SAMP → STDDEV_POP
+
+Changed DuckDB to use population standard deviation to match Spark.
+
+**Files Modified:**
+- `pydeequ/engines/operators/scan_operators.py`
+- `pydeequ/engines/operators/profiling_operators.py`
+
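+In essence (illustrative SQL strings, not the exact operator code):
+
+```python
+# The operators now emit population standard deviation, matching Spark's
+# StandardDeviation analyzer.
+before = "SELECT STDDEV_SAMP(score) FROM t"  # sample stddev (n - 1 denominator)
+after = "SELECT STDDEV_POP(score) FROM t"    # population stddev (n denominator)
+```
+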
+**Tests Fixed:** ~8 tests
+
+### 2. Entropy: LOG2 → LN
+
+Changed DuckDB entropy calculation from log base 2 to natural log to match Spark.
+
+**File Modified:** `pydeequ/engines/operators/grouping_operators.py`
+
+```sql
+-- Before (bits)
+-SUM((cnt * 1.0 / total_cnt) * LOG2(cnt * 1.0 / total_cnt)) AS entropy
+
+-- After (nats) - matches Spark
+-SUM((cnt * 1.0 / total_cnt) * LN(cnt * 1.0 / total_cnt)) AS entropy
+```
+
+**Tests Fixed:** 3 tests
+- `test_entropy_uniform`
+- `test_mutual_information`
+- `test_has_entropy` (constraint)
+
+### 3. Spark Connect Server Fixture
+
+Added automatic Spark Connect server startup for parity tests.
+
+**File Modified:** `tests/engines/comparison/conftest.py`
+
+```python
+import os
+
+import pytest
+
+
+@pytest.fixture(scope="session")
+def spark_connect_server():
+ """Automatically starts Spark Connect server if not running."""
+ from benchmark.spark_server import SparkConnectServer
+ from benchmark.config import SparkServerConfig
+
+ config = SparkServerConfig()
+ server = SparkConnectServer(config)
+
+ if not server.is_running():
+ server.start()
+
+ if not os.environ.get("SPARK_REMOTE"):
+ os.environ["SPARK_REMOTE"] = f"sc://localhost:{config.port}"
+
+ yield server
+```
+
+### 4. Flatten Metrics in DeequRelationPlugin (Histogram/DataType Fix)
+
+Fixed the Scala Deequ Connect plugin to properly handle complex metrics like Histogram and DataType by flattening them before output.
+
+**File Modified:** `deequ/src/main/scala/com/amazon/deequ/connect/DeequRelationPlugin.scala`
+
+**Root Cause:** The plugin was only collecting `DoubleMetric` instances directly, but Histogram and DataType return complex metric types (`HistogramMetric`, etc.) that need to be flattened first.
+
+**Before:**
+```scala
+val metrics = context.metricMap.toSeq.collect {
+ case (analyzer, metric: DoubleMetric) => ...
+}
+```
+
+**After:**
+```scala
+val metrics = context.metricMap.toSeq.flatMap { case (analyzer, metric) =>
+ metric.flatten().map { doubleMetric =>
+ val value: Double = doubleMetric.value.getOrElse(Double.NaN)
+ (
+ analyzer.toString,
+ doubleMetric.entity.toString,
+ doubleMetric.instance,
+ doubleMetric.name,
+ value
+ )
+ }
+}
+```
+
+**Tests Fixed:** 2 tests
+- `test_histogram`
+- `test_data_type`
+
+---
+
+## Detailed Analysis of Remaining Failures
+
+### 1. Approximate Quantile (2 tests)
+
+**Root Cause: Different Algorithms**
+
+| Engine | Algorithm |
+|--------|-----------|
+| Spark | T-Digest (approximate) |
+| DuckDB | QUANTILE_CONT (exact interpolation) |
+
+The algorithms produce different results, especially for small datasets.
+
+**Resolution:** Accept as inherent difference or implement T-Digest in DuckDB.
+
+### 2. Approximate Count Distinct and Profile Distinct Values (3 tests)
+
+**Root Cause: HyperLogLog Variance**
+
+Both engines use HyperLogLog but with different implementations:
+- Different hash functions
+- Different precision parameters
+
+**Evidence:**
+```
+Spark approx_distinct: 9
+DuckDB approx_distinct: 10 (or 6 vs 5)
+```
+
+~10% variance is expected for probabilistic data structures.
+
+**Resolution:** Accept as inherent variance. The 10% tolerance handles most cases but edge cases with small cardinalities still fail.
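+
+For reference, a comparison in this spirit (a sketch; the actual helper in `tests/engines/comparison/utils.py` may differ):
+
+```python
+def within_rel_tolerance(spark_value: float, duckdb_value: float,
+                         rel_tol: float = 0.10) -> bool:
+    """True if the two engines agree within rel_tol (10% by default)."""
+    baseline = max(abs(spark_value), abs(duckdb_value), 1e-12)
+    # e.g. 9 vs 10 differs by 10% and passes; 5 vs 6 differs by ~17% and fails
+    return abs(spark_value - duckdb_value) <= rel_tol * baseline
+```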
+
+---
+
+## Test Results Summary
+
+### Passing Tests (84)
+
+All core analyzers and constraints:
+- Size, Completeness, Mean, Sum, Min, Max
+- StandardDeviation (after STDDEV_POP fix)
+- Distinctness, Uniqueness, UniqueValueRatio, CountDistinct
+- Correlation, PatternMatch, Compliance
+- MinLength, MaxLength
+- Entropy, MutualInformation (after LN fix)
+- **Histogram** (after flatten fix)
+- **DataType** (after flatten fix)
+- All constraint tests (32 tests)
+- All suggestion tests (13 tests)
+- Most profile tests
+
+### Failing Tests (5)
+
+| Test | Category | Root Cause |
+|------|----------|--------|
+| `test_approx_count_distinct` | Analyzer | Inherent HLL variance |
+| `test_approx_quantile_median` | Analyzer | Algorithm difference |
+| `test_approx_quantile_quartiles` | Analyzer | Algorithm difference |
+| `test_completeness_partial` | Profile | Inherent HLL variance |
+| `test_distinct_values` | Profile | Inherent HLL variance |
+
+---
+
+## Files Modified
+
+### Python (python-deequ)
+
+| File | Changes |
+|------|---------|
+| `pydeequ/engines/operators/scan_operators.py` | STDDEV_SAMP → STDDEV_POP |
+| `pydeequ/engines/operators/profiling_operators.py` | STDDEV_SAMP → STDDEV_POP |
+| `pydeequ/engines/operators/grouping_operators.py` | LOG2 → LN for entropy |
+| `tests/engines/comparison/conftest.py` | Added `spark_connect_server` fixture |
+| `tests/engines/comparison/utils.py` | Tolerance adjustments, JSON parsing |
+
+### Scala (deequ)
+
+| File | Changes |
+|------|---------|
+| `deequ/src/main/scala/com/amazon/deequ/connect/DeequRelationPlugin.scala` | Flatten metrics in `analyzerContextToDataFrame` |
+
+---
+
+## Recommendations
+
+### Mark as xfail (5 tests)
+
+These tests should be marked with `@pytest.mark.xfail` with documented reasons:
+
+```python
+@pytest.mark.xfail(reason="HyperLogLog implementation variance")
+def test_approx_count_distinct(self, ...):
+ ...
+
+@pytest.mark.xfail(reason="T-Digest vs QUANTILE_CONT algorithm difference")
+def test_approx_quantile_median(self, ...):
+ ...
+```
+
+### Future Improvements
+
+1. **Exact Count for Small Data**: Use `COUNT(DISTINCT)` instead of HyperLogLog when dataset size < threshold (sketched below)
+2. **Quantile Algorithm Alignment**: Consider implementing T-Digest in DuckDB for exact parity
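+
+A hypothetical sketch of improvement 1 (the function, threshold, and row-count branch are illustrative, not pydeequ API):
+
+```python
+EXACT_DISTINCT_THRESHOLD = 100_000
+
+def distinct_count_sql(table: str, column: str, n_rows: int) -> str:
+    if n_rows < EXACT_DISTINCT_THRESHOLD:
+        return f"SELECT COUNT(DISTINCT {column}) FROM {table}"     # exact
+    return f"SELECT APPROX_COUNT_DISTINCT({column}) FROM {table}"  # HyperLogLog
+```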
+
+---
+
+## Conclusion
+
+The parity testing initiative achieved **94.4% test pass rate** (84/89 tests). The remaining 5 failures represent inherent algorithmic differences:
+
+1. **Probabilistic algorithm variance** (3 tests) - Inherent to HyperLogLog
+2. **Algorithm differences** (2 tests) - T-Digest vs QUANTILE_CONT
+
+All major analyzers (Size, Completeness, Mean, StandardDeviation, Entropy, Correlation, Histogram, DataType, etc.) now have full parity between engines.
diff --git a/README.md b/README.md
index 2d19db5..85c1714 100644
--- a/README.md
+++ b/README.md
@@ -6,73 +6,191 @@ PyDeequ is a Python API for [Deequ](https://github.com/awslabs/deequ), a library
## What's New in PyDeequ 2.0
-PyDeequ 2.0 introduces a new architecture using **Spark Connect**, bringing significant improvements:
+PyDeequ 2.0 introduces a new multi-engine architecture with **DuckDB** and **Spark Connect** backends:
| Feature | PyDeequ 1.x | PyDeequ 2.0 |
|---------|-------------|-------------|
-| Communication | Py4J (JVM bridge) | Spark Connect (gRPC) |
+| Backends | Spark only (Py4J) | DuckDB, Spark Connect |
+| JVM Required | Yes | No (DuckDB) / Yes (Spark) |
| Assertions | Python lambdas | Serializable predicates |
-| Spark Session | Local only | Local or Remote |
-| Architecture | Tight JVM coupling | Clean client-server |
+| Remote Execution | No | Yes (Spark Connect) |
**Key Benefits:**
-- **No Py4J dependency** - Uses Spark Connect protocol for communication
+- **DuckDB backend** - Lightweight, no JVM required, perfect for local development and CI/CD
+- **Spark Connect backend** - Production-scale processing with remote cluster support
- **Serializable predicates** - Replace Python lambdas with predicate objects (`eq`, `gte`, `between`, etc.)
-- **Remote execution** - Connect to remote Spark clusters via Spark Connect
-- **Cleaner API** - Simplified imports and more Pythonic interface
+- **Unified API** - Same code works with both backends
### Architecture
```mermaid
-flowchart LR
+flowchart TB
subgraph CLIENT["Python Client"]
- A["Python Code"] --> B["Protobuf
Serialization"]
+ A["pydeequ.connect()"] --> B["Engine Auto-Detection"]
end
- B -- gRPC --> C["Spark Connect (gRPC)"]
- subgraph SERVER["Spark Connect Server"]
- D["DeequRelationPlugin"] --> E["Deequ Core"] --> F["Spark DataFrame API"] --> G["(Data)"]
+
+ B --> C{Connection Type}
+
+ C -->|DuckDB| D["DuckDBEngine"]
+ C -->|SparkSession| E["SparkEngine"]
+
+ subgraph DUCKDB["DuckDB Backend (Local)"]
+ D --> F["SQL Operators"] --> G["DuckDB"] --> H["Local Files
Parquet/CSV"]
+ end
+
+ subgraph SPARK["Spark Connect Backend (Distributed)"]
+ E --> I["Protobuf"] -- gRPC --> J["Spark Connect Server"]
+ J --> K["DeequRelationPlugin"] --> L["Deequ Core"] --> M["Data Lake"]
end
- G --> H["Results"] -- gRPC --> I["Python DataFrame"]
- %% Styling for compactness and distinction
- classDef code fill:#C8F2FB,stroke:#35a7c2,color:#13505B,font-weight:bold;
- class A code;
+
+ H --> N["Results"]
+ M --> N
+ N --> O["MetricResult / ConstraintResult / ColumnProfile"]
+
+ classDef duckdb fill:#FFF4CC,stroke:#E6B800,color:#806600;
+ classDef spark fill:#CCE5FF,stroke:#0066CC,color:#003366;
+ class D,F,G,H duckdb;
+ class E,I,J,K,L,M spark;
```
**How it works:**
-1. **Client Side**: PyDeequ 2.0 builds checks and analyzers as Protobuf messages
-2. **Transport**: Messages are sent via gRPC to the Spark Connect server
-3. **Server Side**: The `DeequRelationPlugin` deserializes messages and executes Deequ operations
-4. **Results**: Verification results are returned as a Spark DataFrame
+- **Auto-detection**: `pydeequ.connect()` inspects the connection type and creates the appropriate engine (sketched after this list)
+- **DuckDB path**: Direct SQL execution in-process, no JVM required
+- **Spark path**: Protobuf serialization over gRPC to Spark Connect server with Deequ plugin
+- **Unified results**: Both engines return the same `MetricResult`, `ConstraintResult`, and `ColumnProfile` types
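+
+From the caller's side, the dispatch looks like this (the DuckDB call matches the Quick Start below; the Spark call shape is assumed from the diagram):
+
+```python
+import duckdb
+import pydeequ
+
+# DuckDB path: a native DuckDB connection yields the in-process DuckDBEngine.
+con = duckdb.connect()
+con.execute("CREATE TABLE t AS SELECT 1 AS x")
+engine = pydeequ.connect(con, table="t")
+
+# Spark path: a Spark Connect SparkSession would select the SparkEngine.
+# from pyspark.sql import SparkSession
+# spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
+# engine = pydeequ.connect(spark)  # argument shape assumed
+```
+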
### Feature Support Matrix
-| Feature | PyDeequ 1.x | PyDeequ 2.0 |
-|---------|:-----------:|:-----------:|
-| **Constraint Verification** | | |
-| VerificationSuite | Yes | Yes |
-| Check constraints | Yes | Yes |
-| Custom SQL expressions | Yes | Yes |
-| **Metrics & Analysis** | | |
-| AnalysisRunner | Yes | Yes |
-| All standard analyzers | Yes | Yes |
-| **Column Profiling** | | |
-| ColumnProfilerRunner | Yes | Yes |
-| Numeric statistics | Yes | Yes |
-| KLL sketch profiling | Yes | Yes |
-| Low-cardinality histograms | Yes | Yes |
-| **Constraint Suggestions** | | |
-| ConstraintSuggestionRunner | Yes | Yes |
-| Rule sets (DEFAULT, EXTENDED, etc.) | Yes | Yes |
-| Train/test split evaluation | Yes | Yes |
-| **Metrics Repository** | | |
-| FileSystemMetricsRepository | Yes | Planned |
-| **Execution Mode** | | |
-| Local Spark | Yes | No |
-| Spark Connect (remote) | No | Yes |
+| Feature | PyDeequ 1.x | PyDeequ 2.0 (DuckDB) | PyDeequ 2.0 (Spark) |
+|---------|:-----------:|:--------------------:|:-------------------:|
+| **Constraint Verification** | | | |
+| VerificationSuite | Yes | Yes | Yes |
+| Check constraints | Yes | Yes | Yes |
+| Custom SQL expressions | Yes | Yes | Yes |
+| **Metrics & Analysis** | | | |
+| AnalysisRunner | Yes | Yes | Yes |
+| All standard analyzers | Yes | Yes | Yes |
+| **Column Profiling** | | | |
+| ColumnProfilerRunner | Yes | Yes | Yes |
+| Numeric statistics | Yes | Yes | Yes |
+| KLL sketch profiling | Yes | No | Yes |
+| Low-cardinality histograms | Yes | Yes | Yes |
+| **Constraint Suggestions** | | | |
+| ConstraintSuggestionRunner | Yes | Yes | Yes |
+| Rule sets (DEFAULT, EXTENDED, etc.) | Yes | Yes | Yes |
+| Train/test split evaluation | Yes | No | Yes |
+| **Metrics Repository** | | | |
+| FileSystemMetricsRepository | Yes | Planned | Planned |
+| **Execution Environment** | | | |
+| JVM Required | Yes | No | Yes |
+| Local execution | Yes | Yes | Yes |
+| Remote execution | No | No | Yes |
+
+---
+
+## Installation
+
+PyDeequ 2.0 supports multiple backends. Install only what you need:
+
+**From PyPI (when published):**
+```bash
+# DuckDB backend (lightweight, no JVM required)
+pip install pydeequ[duckdb]
+
+# Spark Connect backend (for production-scale processing)
+pip install pydeequ[spark]
+
+# Both backends
+pip install pydeequ[all]
+
+# Development (includes all backends + test tools)
+pip install pydeequ[dev]
+```
+
+**From GitHub Release (beta):**
+```bash
+# Install beta wheel + DuckDB
+pip install https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/pydeequ-2.0.0b1-py3-none-any.whl
+pip install duckdb
+
+# For Spark backend, also install:
+pip install pyspark[connect]==3.5.0
+```
+
+---
+
+## Quick Start with DuckDB (Recommended for Getting Started)
+
+The DuckDB backend is the easiest way to get started - no JVM or Spark server required.
+
+### Requirements
+- Python 3.9+
+
+### Installation
+
+```bash
+pip install pydeequ[duckdb]
+```
+
+### Run Your First Check
+
+```python
+import duckdb
+import pydeequ
+from pydeequ.v2.analyzers import Size, Completeness, Mean
+from pydeequ.v2.checks import Check, CheckLevel
+from pydeequ.v2.predicates import eq, gte
+
+# Create a DuckDB connection and load data
+con = duckdb.connect()
+con.execute("""
+ CREATE TABLE users AS SELECT * FROM (VALUES
+ (1, 'Alice', 25),
+ (2, 'Bob', 30),
+ (3, 'Charlie', NULL)
+ ) AS t(id, name, age)
+""")
+
+# Create an engine from the connection
+engine = pydeequ.connect(con, table="users")
+
+# Run analyzers
+metrics = engine.compute_metrics([
+ Size(),
+ Completeness("id"),
+ Completeness("age"),
+ Mean("age"),
+])
+print("Metrics:")
+for m in metrics:
+ print(f" {m.name}({m.instance}): {m.value}")
+
+# Run constraint checks
+check = (Check(CheckLevel.Error, "Data quality checks")
+ .hasSize(eq(3))
+ .isComplete("id")
+ .isComplete("name")
+ .hasCompleteness("age", gte(0.5)))
+
+results = engine.run_checks([check])
+print("\nConstraint Results:")
+for r in results:
+ print(f" {r.constraint}: {r.constraint_status}")
+
+# Profile columns
+profiles = engine.profile_columns()
+print("\nColumn Profiles:")
+for p in profiles:
+ print(f" {p.column}: completeness={p.completeness}, distinct={p.approx_distinct_values}")
+
+con.close()
+```
---
-## PyDeequ 2.0 Beta - Quick Start
+## Quick Start with Spark Connect (Production Scale)
+
+For production workloads and large-scale data processing, use the Spark Connect backend.
### Requirements
@@ -142,6 +260,11 @@ pip install pyspark[connect]==3.5.0
pip install setuptools
```
+Or using the extras syntax (once published to PyPI):
+```bash
+pip install pydeequ[spark]
+```
+
### Step 5: Run Your First Check
```python
@@ -444,7 +567,8 @@ The legacy PyDeequ API uses Py4J for JVM communication. It is still available fo
### Installation
```bash
-pip install pydeequ
+# Install with Spark backend (required for 1.x API)
+pip install pydeequ[spark]
```
**Note:** Set the `SPARK_VERSION` environment variable to match your Spark version.
@@ -638,7 +762,14 @@ sdk install spark 3.5.0
### Poetry
```bash
-poetry install
+# Install all dependencies (including dev tools and both backends)
+poetry install --with dev --all-extras
+
+# Or install specific extras
+poetry install --extras duckdb # DuckDB only
+poetry install --extras spark # Spark only
+poetry install --extras all # Both backends
+
poetry update
poetry show -o
```
@@ -646,7 +777,11 @@ poetry show -o
### Running Tests Locally
```bash
+# Run all tests (requires Spark Connect server for comparison tests)
poetry run pytest
+
+# Run DuckDB-only tests (no Spark required)
+poetry run pytest tests/engines/test_duckdb*.py tests/engines/test_operators.py
```
### Running Tests (Docker)
diff --git a/benchmark/__init__.py b/benchmark/__init__.py
new file mode 100644
index 0000000..7e9dfac
--- /dev/null
+++ b/benchmark/__init__.py
@@ -0,0 +1,28 @@
+"""Benchmark package for PyDeequ engine comparison."""
+
+from .config import ExperimentConfig, SparkServerConfig, BenchmarkConfig
+from .results import (
+ ExperimentResult,
+ EnvironmentInfo,
+ BenchmarkRun,
+ generate_run_id,
+ save_results,
+ load_results,
+ collect_environment_info,
+)
+from .spark_server import SparkConnectServer, managed_spark_server
+
+__all__ = [
+ "ExperimentConfig",
+ "SparkServerConfig",
+ "BenchmarkConfig",
+ "ExperimentResult",
+ "EnvironmentInfo",
+ "BenchmarkRun",
+ "generate_run_id",
+ "save_results",
+ "load_results",
+ "collect_environment_info",
+ "SparkConnectServer",
+ "managed_spark_server",
+]
diff --git a/benchmark/config.py b/benchmark/config.py
new file mode 100644
index 0000000..1829876
--- /dev/null
+++ b/benchmark/config.py
@@ -0,0 +1,107 @@
+"""Configuration dataclasses for benchmark."""
+
+import os
+from dataclasses import dataclass, field, asdict
+from typing import List, Optional
+
+
+# Default experiment configurations (from original script)
+DEFAULT_ROW_COUNTS = [100_000, 1_000_000, 5_000_000, 10_000_000, 50_000_000, 130_000_000]
+DEFAULT_COLUMN_COUNTS = [10, 20, 40, 80]
+DEFAULT_PROFILING_ROW_COUNTS = [100_000, 1_000_000, 5_000_000, 10_000_000]
+DEFAULT_FIXED_ROWS = 1_000_000
+DEFAULT_BASE_COLS = 10
+DEFAULT_N_RUNS = 3
+
+
+@dataclass
+class ExperimentConfig:
+ """Configuration for benchmark experiments."""
+
+ n_runs: int = DEFAULT_N_RUNS
+ row_counts: List[int] = field(default_factory=lambda: DEFAULT_ROW_COUNTS.copy())
+ column_counts: List[int] = field(default_factory=lambda: DEFAULT_COLUMN_COUNTS.copy())
+ profiling_row_counts: List[int] = field(
+ default_factory=lambda: DEFAULT_PROFILING_ROW_COUNTS.copy()
+ )
+ fixed_rows: int = DEFAULT_FIXED_ROWS
+ base_cols: int = DEFAULT_BASE_COLS
+ cache_dir: str = field(
+ default_factory=lambda: os.path.expanduser("~/.deequ_benchmark_data")
+ )
+
+ def to_dict(self) -> dict:
+ """Convert to dictionary for JSON serialization."""
+ return asdict(self)
+
+ @classmethod
+ def from_dict(cls, data: dict) -> "ExperimentConfig":
+ """Create from dictionary."""
+ return cls(**data)
+
+
+@dataclass
+class SparkServerConfig:
+ """Configuration for Spark Connect server."""
+
+ java_home: str = field(
+ default_factory=lambda: os.environ.get(
+ "JAVA_HOME",
+ "/Library/Java/JavaVirtualMachines/amazon-corretto-17.jdk/Contents/Home",
+ )
+ )
+ spark_home: str = field(
+ default_factory=lambda: os.environ.get(
+ "SPARK_HOME", "/Volumes/workplace/deequ_rewrite/spark-3.5.0-bin-hadoop3"
+ )
+ )
+ port: int = 15002
+ startup_timeout: int = 60
+ poll_interval: float = 1.0
+ driver_memory: str = "16g"
+ executor_memory: str = "16g"
+ deequ_jar: str = field(
+ default_factory=lambda: "/Volumes/workplace/deequ_rewrite/deequ/target/deequ_2.12-2.1.0b-spark-3.5.jar"
+ )
+
+ def to_dict(self) -> dict:
+ """Convert to dictionary for JSON serialization."""
+ return asdict(self)
+
+ @classmethod
+ def from_dict(cls, data: dict) -> "SparkServerConfig":
+ """Create from dictionary."""
+ return cls(**data)
+
+
+@dataclass
+class BenchmarkConfig:
+ """Overall benchmark configuration."""
+
+ engine: str = "all" # "all", "duckdb", or "spark"
+ output_dir: str = "benchmark_results"
+ experiment: ExperimentConfig = field(default_factory=ExperimentConfig)
+ spark_server: SparkServerConfig = field(default_factory=SparkServerConfig)
+ spark_remote: str = field(
+ default_factory=lambda: os.environ.get("SPARK_REMOTE", "sc://localhost:15002")
+ )
+
+ def to_dict(self) -> dict:
+ """Convert to dictionary for JSON serialization."""
+ return {
+ "engine": self.engine,
+ "output_dir": self.output_dir,
+ "experiment": self.experiment.to_dict(),
+ "spark_server": self.spark_server.to_dict(),
+ "spark_remote": self.spark_remote,
+ }
+
+ @classmethod
+ def from_dict(cls, data: dict) -> "BenchmarkConfig":
+ """Create from dictionary."""
+ return cls(
+ engine=data.get("engine", "all"),
+ output_dir=data.get("output_dir", "benchmark_results"),
+ experiment=ExperimentConfig.from_dict(data.get("experiment", {})),
+ spark_server=SparkServerConfig.from_dict(data.get("spark_server", {})),
+ spark_remote=data.get("spark_remote", "sc://localhost:15002"),
+ )
diff --git a/benchmark/experiments.py b/benchmark/experiments.py
new file mode 100644
index 0000000..3424807
--- /dev/null
+++ b/benchmark/experiments.py
@@ -0,0 +1,547 @@
+"""Benchmark experiment logic extracted from original benchmark_cli.py."""
+
+import os
+import time
+from typing import List, Dict, Any, Optional, Tuple
+
+import duckdb
+import numpy as np
+import pandas as pd
+import pydeequ
+
+from pydeequ.v2.verification import VerificationSuite
+from pydeequ.v2.checks import Check, CheckLevel
+from pydeequ.v2.predicates import gte, lte, between
+from pydeequ.v2.profiles import ColumnProfilerRunner
+
+from .config import ExperimentConfig
+
+
+# =============================================================================
+# Data Generation
+# =============================================================================
+
+
+def generate_rich_data(n_rows: int, n_extra_cols: int = 0) -> pd.DataFrame:
+ """
+ Generate mixed-type data for benchmarking with optional extra numeric columns.
+
+ Base schema (10 columns):
+ - id: string (unique identifier)
+ - category: string (5 categorical values)
+ - status: string (3 categorical values)
+ - email: string (email-like pattern)
+ - amount: float [0, 10000]
+ - quantity: int [0, 1000]
+ - score: float [0, 100] (normal distribution)
+ - rating: int [1, 5]
+ - price: float [0.01, 9999.99]
+ - discount: float [0, 0.5]
+
+ Args:
+ n_rows: Number of rows to generate
+ n_extra_cols: Number of additional numeric columns
+
+ Returns:
+ DataFrame with mixed-type columns + optional extra numeric columns
+ """
+ np.random.seed(42)
+
+ data = {
+ "id": [f"ID{i:012d}" for i in range(n_rows)],
+ "category": np.random.choice(
+ ["electronics", "clothing", "food", "books", "toys"], n_rows
+ ),
+ "status": np.random.choice(["active", "inactive", "pending"], n_rows),
+ "email": [f"user{i}@example.com" for i in range(n_rows)],
+ "amount": np.random.uniform(0, 10000, n_rows),
+ "quantity": np.random.randint(0, 1001, n_rows),
+ "score": np.random.normal(50, 15, n_rows).clip(0, 100),
+ "rating": np.random.randint(1, 6, n_rows),
+ "price": np.random.uniform(0.01, 9999.99, n_rows),
+ "discount": np.random.uniform(0, 0.5, n_rows),
+ }
+
+ for i in range(n_extra_cols):
+ data[f"extra_{i}"] = np.random.uniform(-10000, 10000, n_rows)
+
+ return pd.DataFrame(data)
+
+
+def save_to_parquet(df: pd.DataFrame, cache_dir: str, name: str, target_row_groups: int = 64) -> str:
+ """
+ Save DataFrame to Parquet with dynamic row group size for Spark parallelism.
+
+ Args:
+ df: DataFrame to save
+ cache_dir: Cache directory path
+ name: Cache file name
+ target_row_groups: Target number of row groups
+
+ Returns:
+ Path to the saved Parquet file
+ """
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ os.makedirs(cache_dir, exist_ok=True)
+ path = os.path.join(cache_dir, f"{name}.parquet")
+
+ if not os.path.exists(path):
+ n_rows = len(df)
+ row_group_size = max(10_000, n_rows // target_row_groups)
+ table = pa.Table.from_pandas(df)
+ pq.write_table(table, path, compression="snappy", row_group_size=row_group_size)
+
+ return path
+
+
+def get_cached_parquet(cache_dir: str, name: str) -> Optional[str]:
+ """Get path to cached Parquet file, or None if not cached."""
+ path = os.path.join(cache_dir, f"{name}.parquet")
+ return path if os.path.exists(path) else None
+
+
+# =============================================================================
+# Check Building
+# =============================================================================
+
+
+def build_rich_check(n_extra_cols: int = 0) -> Check:
+ """
+ Build a Check suite with rich validations on base columns + simple checks on extras.
+
+ Args:
+ n_extra_cols: Number of extra numeric columns to add checks for
+
+ Returns:
+ Check instance with all constraints configured
+ """
+ check = (
+ Check(CheckLevel.Warning, "Rich Benchmark Check")
+ # Completeness checks (3)
+ .isComplete("id")
+ .isComplete("category")
+ .hasCompleteness("email", gte(0.95))
+ # Uniqueness checks (2)
+ .isUnique("id")
+ .hasDistinctness(["category"], gte(0.001))
+ # Numeric range checks (6)
+ .hasMin("amount", gte(0))
+ .hasMax("amount", lte(10000))
+ .hasMean("score", between(0, 100))
+ .hasStandardDeviation("score", lte(50))
+ .isNonNegative("quantity")
+ .isPositive("price")
+ # String checks (5)
+ .hasMinLength("id", gte(8))
+ .hasMaxLength("id", lte(20))
+ .hasPattern("email", r".*@.*\..*", gte(0.9))
+ .isContainedIn("status", ["active", "inactive", "pending"])
+ .isContainedIn("rating", ["1", "2", "3", "4", "5"])
+ )
+
+ for i in range(n_extra_cols):
+ col = f"extra_{i}"
+ check = check.isComplete(col).hasMin(col, gte(-10000)).hasMax(col, lte(10000))
+
+ return check
+
+
+def count_checks(n_extra_cols: int = 0) -> int:
+ """Return total number of checks for given extra columns."""
+ base_checks = 16
+ extra_checks = n_extra_cols * 3
+ return base_checks + extra_checks
+
+
+# =============================================================================
+# DuckDB Setup and Benchmarking
+# =============================================================================
+
+
+def setup_duckdb_from_parquet(parquet_path: str) -> Tuple[Any, duckdb.DuckDBPyConnection]:
+ """Setup DuckDB engine to read from Parquet file."""
+ con = duckdb.connect()
+ engine = pydeequ.connect(con, table=f"read_parquet('{parquet_path}')")
+ return engine, con
+
+
+def setup_duckdb_for_profiling(parquet_path: str) -> Tuple[Any, duckdb.DuckDBPyConnection]:
+ """
+ Setup DuckDB engine for profiling by creating a view from parquet.
+ This is needed because PRAGMA table_info() doesn't work with read_parquet().
+ """
+ con = duckdb.connect()
+ con.execute(f"CREATE VIEW benchmark_data AS SELECT * FROM read_parquet('{parquet_path}')")
+ engine = pydeequ.connect(con, table="benchmark_data")
+ return engine, con
+
+
+def benchmark_duckdb_validation(engine: Any, check: Check, n_runs: int) -> float:
+ """Time DuckDB VerificationSuite.run() over N runs, return average."""
+ times = []
+ for _ in range(n_runs):
+ start = time.perf_counter()
+ result = VerificationSuite().on_engine(engine).addCheck(check).run()
+ _ = len(result)
+ elapsed = time.perf_counter() - start
+ times.append(elapsed)
+ return sum(times) / len(times)
+
+
+def benchmark_duckdb_profiling(engine: Any, n_runs: int) -> float:
+ """Time DuckDB ColumnProfilerRunner.run() over N runs, return average."""
+ times = []
+ for _ in range(n_runs):
+ start = time.perf_counter()
+ result = ColumnProfilerRunner().on_engine(engine).run()
+ _ = len(result)
+ elapsed = time.perf_counter() - start
+ times.append(elapsed)
+ return sum(times) / len(times)
+
+
+# =============================================================================
+# Spark Setup and Benchmarking
+# =============================================================================
+
+
+def setup_spark(spark_remote: str) -> Tuple[Any, float]:
+ """Create SparkSession for Spark Connect. Returns (spark, startup_time)."""
+ from pyspark.sql import SparkSession
+
+ start = time.perf_counter()
+ spark = SparkSession.builder.remote(spark_remote).getOrCreate()
+ startup_time = time.perf_counter() - start
+
+ return spark, startup_time
+
+
+def load_spark_from_parquet(spark: Any, parquet_path: str) -> Tuple[Any, float]:
+ """Load Parquet file into Spark. Returns (spark_df, load_time)."""
+ start = time.perf_counter()
+ spark_df = spark.read.parquet(parquet_path)
+ spark_df.count() # Force materialization
+ load_time = time.perf_counter() - start
+
+ return spark_df, load_time
+
+
+def benchmark_spark_validation(spark: Any, spark_df: Any, check: Check, n_runs: int) -> float:
+ """Time Spark VerificationSuite.run() over N runs, return average."""
+ times = []
+ for _ in range(n_runs):
+ start = time.perf_counter()
+ result = VerificationSuite(spark).onData(spark_df).addCheck(check).run()
+ _ = result.collect()
+ elapsed = time.perf_counter() - start
+ times.append(elapsed)
+ return sum(times) / len(times)
+
+
+def benchmark_spark_profiling(spark: Any, spark_df: Any, n_runs: int) -> float:
+ """Time Spark ColumnProfilerRunner.run() over N runs, return average."""
+ times = []
+ for _ in range(n_runs):
+ start = time.perf_counter()
+ result = ColumnProfilerRunner(spark).onData(spark_df).run()
+ _ = result.collect()
+ elapsed = time.perf_counter() - start
+ times.append(elapsed)
+ return sum(times) / len(times)
+
+
+# =============================================================================
+# DuckDB Experiment Runners
+# =============================================================================
+
+
+def run_varying_rows_experiment_duckdb(config: ExperimentConfig) -> List[Dict[str, Any]]:
+ """Run varying rows experiment for DuckDB engine."""
+ print("\n" + "=" * 70)
+ print(f"EXPERIMENT 1 (DuckDB): VARYING ROWS (Fixed Columns = {config.base_cols})")
+ print("=" * 70)
+
+ results = []
+
+ for n_rows in config.row_counts:
+ n_checks = count_checks(0)
+ print(f"\n--- {n_rows:,} rows x {config.base_cols} cols ({n_checks} checks) ---")
+
+ cache_name = f"rich_rows_{n_rows}"
+ parquet_path = get_cached_parquet(config.cache_dir, cache_name)
+
+ if parquet_path:
+ print(f"Using cached data: {parquet_path}")
+ else:
+ print("Generating rich mixed-type data and saving to Parquet...")
+ df = generate_rich_data(n_rows, n_extra_cols=0)
+ parquet_path = save_to_parquet(df, config.cache_dir, cache_name)
+ del df
+
+ check = build_rich_check(n_extra_cols=0)
+
+ print("Setting up DuckDB (from Parquet)...")
+ duck_engine, duck_con = setup_duckdb_from_parquet(parquet_path)
+
+ print(f"Running DuckDB validation ({config.n_runs} runs)...")
+ duck_validation = benchmark_duckdb_validation(duck_engine, check, config.n_runs)
+ print(f" DuckDB Validation: {duck_validation:.3f}s (avg)")
+
+ duck_con.close()
+
+ results.append({
+ "rows": n_rows,
+ "cols": config.base_cols,
+ "checks": n_checks,
+ "duckdb_validation": duck_validation,
+ })
+
+ return results
+
+
+def run_varying_cols_experiment_duckdb(config: ExperimentConfig) -> List[Dict[str, Any]]:
+ """Run varying columns experiment for DuckDB engine."""
+ print("\n" + "=" * 70)
+ print(f"EXPERIMENT 2 (DuckDB): VARYING COLUMNS (Fixed Rows = {config.fixed_rows:,})")
+ print("=" * 70)
+
+ results = []
+
+ for n_cols in config.column_counts:
+ n_extra_cols = n_cols - config.base_cols
+ n_checks = count_checks(n_extra_cols)
+ print(f"\n--- {config.fixed_rows:,} rows x {n_cols} cols ({n_checks} checks) ---")
+
+ cache_name = f"rich_cols_{n_cols}"
+ parquet_path = get_cached_parquet(config.cache_dir, cache_name)
+
+ if parquet_path:
+ print(f"Using cached data: {parquet_path}")
+ else:
+ print("Generating rich mixed-type data and saving to Parquet...")
+ df = generate_rich_data(config.fixed_rows, n_extra_cols=n_extra_cols)
+ parquet_path = save_to_parquet(df, config.cache_dir, cache_name)
+ del df
+
+ check = build_rich_check(n_extra_cols=n_extra_cols)
+
+ print("Setting up DuckDB (from Parquet)...")
+ duck_engine, duck_con = setup_duckdb_from_parquet(parquet_path)
+
+ print(f"Running DuckDB validation ({config.n_runs} runs)...")
+ duck_validation = benchmark_duckdb_validation(duck_engine, check, config.n_runs)
+ print(f" DuckDB Validation: {duck_validation:.3f}s (avg)")
+
+ duck_con.close()
+
+ results.append({
+ "rows": config.fixed_rows,
+ "cols": n_cols,
+ "checks": n_checks,
+ "duckdb_validation": duck_validation,
+ })
+
+ return results
+
+
+def run_profiling_experiment_duckdb(config: ExperimentConfig) -> List[Dict[str, Any]]:
+ """Run column profiling experiment for DuckDB engine."""
+ print("\n" + "=" * 70)
+ print(f"EXPERIMENT 3 (DuckDB): COLUMN PROFILING (Fixed Columns = {config.base_cols})")
+ print("=" * 70)
+
+ results = []
+
+ for n_rows in config.profiling_row_counts:
+ print(f"\n--- {n_rows:,} rows x {config.base_cols} cols (profiling) ---")
+
+ cache_name = f"rich_rows_{n_rows}"
+ parquet_path = get_cached_parquet(config.cache_dir, cache_name)
+
+ if parquet_path:
+ print(f"Using cached data: {parquet_path}")
+ else:
+ print("Generating rich mixed-type data and saving to Parquet...")
+ df = generate_rich_data(n_rows, n_extra_cols=0)
+ parquet_path = save_to_parquet(df, config.cache_dir, cache_name)
+ del df
+
+ print("Setting up DuckDB for profiling...")
+ duck_engine, duck_con = setup_duckdb_for_profiling(parquet_path)
+
+ print(f"Running DuckDB profiling ({config.n_runs} runs)...")
+ duck_profiling = benchmark_duckdb_profiling(duck_engine, config.n_runs)
+ print(f" DuckDB Profiling: {duck_profiling:.3f}s (avg)")
+
+ duck_con.close()
+
+ results.append({
+ "rows": n_rows,
+ "cols": config.base_cols,
+ "duckdb_profiling": duck_profiling,
+ })
+
+ return results
+
+
+# =============================================================================
+# Spark Experiment Runners
+# =============================================================================
+
+
+def run_varying_rows_experiment_spark(
+ spark: Any, spark_startup_time: float, config: ExperimentConfig
+) -> List[Dict[str, Any]]:
+ """Run varying rows experiment for Spark engine."""
+ print("\n" + "=" * 70)
+ print(f"EXPERIMENT 1 (Spark): VARYING ROWS (Fixed Columns = {config.base_cols})")
+ print("=" * 70)
+
+ results = []
+
+ for n_rows in config.row_counts:
+ n_checks = count_checks(0)
+ print(f"\n--- {n_rows:,} rows x {config.base_cols} cols ({n_checks} checks) ---")
+
+ cache_name = f"rich_rows_{n_rows}"
+ parquet_path = get_cached_parquet(config.cache_dir, cache_name)
+
+ if parquet_path:
+ print(f"Using cached data: {parquet_path}")
+ else:
+ print("Generating rich mixed-type data and saving to Parquet...")
+ df = generate_rich_data(n_rows, n_extra_cols=0)
+ parquet_path = save_to_parquet(df, config.cache_dir, cache_name)
+ del df
+
+ check = build_rich_check(n_extra_cols=0)
+
+ spark_load = None
+ spark_validation = None
+
+ try:
+ print("Loading Parquet into Spark...")
+ spark_df, spark_load = load_spark_from_parquet(spark, parquet_path)
+ print(f" Spark Data Load: {spark_load:.3f}s")
+
+ print(f"Running Spark validation ({config.n_runs} runs)...")
+ spark_validation = benchmark_spark_validation(spark, spark_df, check, config.n_runs)
+ print(f" Spark Validation: {spark_validation:.3f}s (avg)")
+ except Exception as e:
+ print(f" Spark error: {str(e)[:80]}")
+
+ results.append({
+ "rows": n_rows,
+ "cols": config.base_cols,
+ "checks": n_checks,
+ "spark_startup": spark_startup_time,
+ "spark_load": spark_load,
+ "spark_validation": spark_validation,
+ })
+
+ return results
+
+
+def run_varying_cols_experiment_spark(
+ spark: Any, spark_startup_time: float, config: ExperimentConfig
+) -> List[Dict[str, Any]]:
+ """Run varying columns experiment for Spark engine."""
+ print("\n" + "=" * 70)
+ print(f"EXPERIMENT 2 (Spark): VARYING COLUMNS (Fixed Rows = {config.fixed_rows:,})")
+ print("=" * 70)
+
+ results = []
+
+ for n_cols in config.column_counts:
+ n_extra_cols = n_cols - config.base_cols
+ n_checks = count_checks(n_extra_cols)
+ print(f"\n--- {config.fixed_rows:,} rows x {n_cols} cols ({n_checks} checks) ---")
+
+ cache_name = f"rich_cols_{n_cols}"
+ parquet_path = get_cached_parquet(config.cache_dir, cache_name)
+
+ if parquet_path:
+ print(f"Using cached data: {parquet_path}")
+ else:
+ print("Generating rich mixed-type data and saving to Parquet...")
+ df = generate_rich_data(config.fixed_rows, n_extra_cols=n_extra_cols)
+ parquet_path = save_to_parquet(df, config.cache_dir, cache_name)
+ del df
+
+ check = build_rich_check(n_extra_cols=n_extra_cols)
+
+ spark_load = None
+ spark_validation = None
+
+ try:
+ print("Loading Parquet into Spark...")
+ spark_df, spark_load = load_spark_from_parquet(spark, parquet_path)
+ print(f" Spark Data Load: {spark_load:.3f}s")
+
+ print(f"Running Spark validation ({config.n_runs} runs)...")
+ spark_validation = benchmark_spark_validation(spark, spark_df, check, config.n_runs)
+ print(f" Spark Validation: {spark_validation:.3f}s (avg)")
+ except Exception as e:
+ print(f" Spark error: {str(e)[:80]}")
+
+ results.append({
+ "rows": config.fixed_rows,
+ "cols": n_cols,
+ "checks": n_checks,
+ "spark_startup": spark_startup_time,
+ "spark_load": spark_load,
+ "spark_validation": spark_validation,
+ })
+
+ return results
+
+
+def run_profiling_experiment_spark(
+ spark: Any, spark_startup_time: float, config: ExperimentConfig
+) -> List[Dict[str, Any]]:
+ """Run column profiling experiment for Spark engine."""
+ print("\n" + "=" * 70)
+ print(f"EXPERIMENT 3 (Spark): COLUMN PROFILING (Fixed Columns = {config.base_cols})")
+ print("=" * 70)
+
+ results = []
+
+ for n_rows in config.profiling_row_counts:
+ print(f"\n--- {n_rows:,} rows x {config.base_cols} cols (profiling) ---")
+
+ cache_name = f"rich_rows_{n_rows}"
+ parquet_path = get_cached_parquet(config.cache_dir, cache_name)
+
+ if parquet_path:
+ print(f"Using cached data: {parquet_path}")
+ else:
+ print("Generating rich mixed-type data and saving to Parquet...")
+ df = generate_rich_data(n_rows, n_extra_cols=0)
+ parquet_path = save_to_parquet(df, config.cache_dir, cache_name)
+ del df
+
+ spark_load = None
+ spark_profiling = None
+
+ try:
+ print("Loading Parquet into Spark...")
+ spark_df, spark_load = load_spark_from_parquet(spark, parquet_path)
+ print(f" Spark Data Load: {spark_load:.3f}s")
+
+ print(f"Running Spark profiling ({config.n_runs} runs)...")
+ spark_profiling = benchmark_spark_profiling(spark, spark_df, config.n_runs)
+ print(f" Spark Profiling: {spark_profiling:.3f}s (avg)")
+ except Exception as e:
+ print(f" Spark error: {str(e)[:80]}")
+
+ results.append({
+ "rows": n_rows,
+ "cols": config.base_cols,
+ "spark_startup": spark_startup_time,
+ "spark_load": spark_load,
+ "spark_profiling": spark_profiling,
+ })
+
+ return results
diff --git a/benchmark/report.py b/benchmark/report.py
new file mode 100644
index 0000000..48b4366
--- /dev/null
+++ b/benchmark/report.py
@@ -0,0 +1,210 @@
+"""Markdown report generation for benchmark results."""
+
+import os
+from typing import Optional
+
+from .results import BenchmarkRun
+from .experiments import count_checks
+
+
+def format_value(value: Optional[float], precision: int = 3) -> str:
+ """Format a numeric value or return 'N/A' if None."""
+ if value is None:
+ return "N/A"
+ return f"{value:.{precision}f}"
+
+
+def calculate_speedup(spark_time: Optional[float], duckdb_time: Optional[float]) -> str:
+ """Calculate speedup ratio (spark/duckdb) or return 'N/A'."""
+ if spark_time is None or duckdb_time is None or duckdb_time <= 0:
+ return "N/A"
+ return f"{spark_time / duckdb_time:.1f}x"
+
+
+def generate_markdown_report(run: BenchmarkRun) -> str:
+ """
+ Generate a markdown report from benchmark results.
+
+ Args:
+ run: BenchmarkRun instance with results
+
+ Returns:
+ Markdown string
+ """
+ # Extract config values
+ config = run.config
+ exp_config = config.get("experiment", {})
+ n_runs = exp_config.get("n_runs", 3)
+ base_cols = exp_config.get("base_cols", 10)
+ fixed_rows = exp_config.get("fixed_rows", 1_000_000)
+ row_counts = exp_config.get("row_counts", [])
+ column_counts = exp_config.get("column_counts", [])
+
+ spark_startup = run.spark_startup_time or 0.0
+
+ report = f"""# PyDeequ Engine Benchmark Results
+
+## Run Information
+
+| Field | Value |
+|-------|-------|
+| Run ID | `{run.run_id}` |
+| Timestamp | {run.timestamp} |
+| Engine | {run.engine} |
+| Total Duration | {format_value(run.total_duration_seconds, 1)}s |
+
+## Environment
+
+| Component | Version |
+|-----------|---------|
+| Python | {run.environment.python_version} |
+| Platform | {run.environment.platform_system} {run.environment.platform_release} ({run.environment.platform_machine}) |
+| CPU Count | {run.environment.cpu_count} |
+| DuckDB | {run.environment.duckdb_version or 'N/A'} |
+| PySpark | {run.environment.pyspark_version or 'N/A'} |
+| PyDeequ | {run.environment.pydeequ_version or 'N/A'} |
+| Pandas | {run.environment.pandas_version or 'N/A'} |
+| NumPy | {run.environment.numpy_version or 'N/A'} |
+| PyArrow | {run.environment.pyarrow_version or 'N/A'} |
+
+## Methodology
+
+Based on duckdq-exp experiments:
+
+- **Data Source**: Both engines read from the same Parquet files
+- **Rich Dataset**: Mixed-type columns (strings + numerics) with realistic data patterns
+- **Validation Runs**: {n_runs} iterations, reporting average
+- **Base Checks**: {count_checks(0)} rich checks on {base_cols} mixed-type columns
+
+### Rich Dataset Schema ({base_cols} base columns)
+
+| Column | Type | Description |
+|--------|------|-------------|
+| `id` | string | Unique identifier (ID000000000000) |
+| `category` | string | Categorical (5 values) |
+| `status` | string | Categorical (3 values) |
+| `email` | string | Email pattern |
+| `amount` | float | Numeric value [0, 10000] |
+| `quantity` | int | Non-negative integer [0, 1000] |
+| `score` | float | Normal distribution [0, 100] |
+| `rating` | int | Star rating [1, 5] |
+| `price` | float | Positive numeric [0.01, 9999.99] |
+| `discount` | float | Percentage [0, 0.5] |
+
+## Experiment 1: Varying Rows (Fixed Columns = {base_cols}, {count_checks(0)} checks)
+
+| Rows | Cols | Checks | DuckDB (s) | Spark (s) | Speedup |
+|------|------|--------|------------|-----------|---------|
+"""
+
+ for r in run.varying_rows_results:
+ duck_s = r.get("duckdb_validation")
+ spark_s = r.get("spark_validation")
+ checks = r.get("checks", count_checks(0))
+ speedup = calculate_speedup(spark_s, duck_s)
+ report += f"| {r['rows']:,} | {r['cols']} | {checks} | {format_value(duck_s)} | {format_value(spark_s)} | {speedup} |\n"
+
+ report += f"""
+## Experiment 2: Varying Columns (Fixed Rows = {fixed_rows:,})
+
+Column counts: {column_counts} (base {base_cols} mixed-type + extra numeric columns)
+
+| Rows | Cols | Checks | DuckDB (s) | Spark (s) | Speedup |
+|------|------|--------|------------|-----------|---------|
+"""
+
+ for r in run.varying_cols_results:
+ duck_s = r.get("duckdb_validation")
+ spark_s = r.get("spark_validation")
+ checks = r.get("checks", "N/A")
+ speedup = calculate_speedup(spark_s, duck_s)
+ report += f"| {r['rows']:,} | {r['cols']} | {checks} | {format_value(duck_s)} | {format_value(spark_s)} | {speedup} |\n"
+
+ report += f"""
+## Experiment 3: Column Profiling (Fixed Columns = {base_cols})
+
+Uses `ColumnProfilerRunner` to profile all columns.
+
+| Rows | Cols | DuckDB (s) | Spark (s) | Speedup |
+|------|------|------------|-----------|---------|
+"""
+
+ for r in run.profiling_results:
+ duck_s = r.get("duckdb_profiling")
+ spark_s = r.get("spark_profiling")
+ speedup = calculate_speedup(spark_s, duck_s)
+ report += f"| {r['rows']:,} | {r['cols']} | {format_value(duck_s)} | {format_value(spark_s)} | {speedup} |\n"
+
+ report += f"""
+## Timing Details
+
+### Spark Overhead (Excluded from Validation Time)
+
+| Phase | Time (s) |
+|-------|----------|
+| Startup (SparkSession) | {format_value(spark_startup)} |
+
+**Note**: Data load time varies per experiment and is not included in validation/profiling time.
+
+## Key Findings
+
+1. **DuckDB is significantly faster** for single-node data quality validation
+2. **No JVM overhead**: DuckDB runs natively in Python process
+3. **Rich type support**: Both engines handle mixed string/numeric data effectively
+4. **Parquet files**: Both engines read from the same files, eliminating the gRPC serialization bottleneck
+5. **Column profiling**: Full profiling available on both engines
+
+## Running the Benchmark
+
+```bash
+# Run DuckDB only (no Spark server needed)
+python benchmark_cli.py run --engine duckdb
+
+# Run Spark only (auto-spark is enabled by default)
+python benchmark_cli.py run --engine spark
+
+# Run both engines
+python benchmark_cli.py run --engine all
+
+# Generate report from saved results (folder or file path)
+python benchmark_cli.py report benchmark_results/{run.run_id}/
+python benchmark_cli.py report benchmark_results/{run.run_id}/results.json
+
+# Generate PNG visualization
+python benchmark_cli.py visualize benchmark_results/{run.run_id}/
+```
+
+## Notes
+
+- Both engines read from the same Parquet files, ensuring fair comparison
+- Memory configuration (16GB+) prevents OOM errors for large datasets
+- For distributed workloads across multiple nodes, Spark scales horizontally
+- DuckDB is optimized for single-node analytical workloads
+"""
+
+ if run.errors:
+ report += "\n## Errors\n\n"
+ for error in run.errors:
+ report += f"- {error}\n"
+
+ return report
+
+
+def save_report(run: BenchmarkRun, output_path: str) -> str:
+ """
+ Generate and save markdown report.
+
+ Args:
+ run: BenchmarkRun instance
+ output_path: Path to save the report
+
+ Returns:
+ Path to the saved report
+ """
+ report = generate_markdown_report(run)
+
+ os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+ with open(output_path, "w") as f:
+ f.write(report)
+
+ return output_path
diff --git a/benchmark/results.py b/benchmark/results.py
new file mode 100644
index 0000000..dc4e179
--- /dev/null
+++ b/benchmark/results.py
@@ -0,0 +1,286 @@
+"""Results dataclasses and JSON I/O for benchmark."""
+
+import json
+import os
+import platform
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+from typing import List, Optional, Dict, Any
+
+
+@dataclass
+class ExperimentResult:
+ """Result from a single experiment run."""
+
+ rows: int
+ cols: int
+ checks: Optional[int] = None
+ duckdb_validation: Optional[float] = None
+ duckdb_profiling: Optional[float] = None
+ spark_startup: Optional[float] = None
+ spark_load: Optional[float] = None
+ spark_validation: Optional[float] = None
+ spark_profiling: Optional[float] = None
+ error: Optional[str] = None
+
+ def to_dict(self) -> dict:
+ """Convert to dictionary, excluding None values."""
+ return {k: v for k, v in asdict(self).items() if v is not None}
+
+ @classmethod
+ def from_dict(cls, data: dict) -> "ExperimentResult":
+ """Create from dictionary."""
+ return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
+
+
+@dataclass
+class EnvironmentInfo:
+ """Environment information for reproducibility."""
+
+ python_version: str = ""
+ platform_system: str = ""
+ platform_release: str = ""
+ platform_machine: str = ""
+ cpu_count: int = 0
+ duckdb_version: str = ""
+ pyspark_version: str = ""
+ pydeequ_version: str = ""
+ pandas_version: str = ""
+ numpy_version: str = ""
+ pyarrow_version: str = ""
+
+ def to_dict(self) -> dict:
+ """Convert to dictionary."""
+ return asdict(self)
+
+ @classmethod
+ def from_dict(cls, data: dict) -> "EnvironmentInfo":
+ """Create from dictionary."""
+ return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__})
+
+
+@dataclass
+class BenchmarkRun:
+ """Complete benchmark run with all results."""
+
+ run_id: str
+ timestamp: str
+ engine: str
+ config: Dict[str, Any] = field(default_factory=dict)
+ environment: EnvironmentInfo = field(default_factory=EnvironmentInfo)
+ varying_rows_results: List[Dict[str, Any]] = field(default_factory=list)
+ varying_cols_results: List[Dict[str, Any]] = field(default_factory=list)
+ profiling_results: List[Dict[str, Any]] = field(default_factory=list)
+ spark_startup_time: Optional[float] = None
+ total_duration_seconds: Optional[float] = None
+ errors: List[str] = field(default_factory=list)
+
+ def to_dict(self) -> dict:
+ """Convert to dictionary for JSON serialization."""
+ return {
+ "run_id": self.run_id,
+ "timestamp": self.timestamp,
+ "engine": self.engine,
+ "config": self.config,
+ "environment": self.environment.to_dict(),
+ "varying_rows_results": self.varying_rows_results,
+ "varying_cols_results": self.varying_cols_results,
+ "profiling_results": self.profiling_results,
+ "spark_startup_time": self.spark_startup_time,
+ "total_duration_seconds": self.total_duration_seconds,
+ "errors": self.errors,
+ }
+
+ @classmethod
+ def from_dict(cls, data: dict) -> "BenchmarkRun":
+ """Create from dictionary."""
+ return cls(
+ run_id=data.get("run_id", ""),
+ timestamp=data.get("timestamp", ""),
+ engine=data.get("engine", ""),
+ config=data.get("config", {}),
+ environment=EnvironmentInfo.from_dict(data.get("environment", {})),
+ varying_rows_results=data.get("varying_rows_results", []),
+ varying_cols_results=data.get("varying_cols_results", []),
+ profiling_results=data.get("profiling_results", []),
+ spark_startup_time=data.get("spark_startup_time"),
+ total_duration_seconds=data.get("total_duration_seconds"),
+ errors=data.get("errors", []),
+ )
+
+
+def generate_run_id() -> str:
+ """Generate a unique run ID with timestamp."""
+ return f"benchmark_{datetime.now().strftime('%Y-%m-%dT%H-%M-%S')}"
+
+
+def save_results(run: BenchmarkRun, run_dir: str) -> str:
+ """
+ Save benchmark results to JSON file in run directory.
+
+ Args:
+ run: BenchmarkRun instance
+ run_dir: Directory for this benchmark run (e.g., benchmark_results/benchmark_2024-01-19T14-30-45/)
+
+ Returns:
+ Path to the saved JSON file
+ """
+ os.makedirs(run_dir, exist_ok=True)
+ path = os.path.join(run_dir, "results.json")
+
+ with open(path, "w") as f:
+ json.dump(run.to_dict(), f, indent=2)
+
+ return path
+
+
+def load_results(path: str) -> BenchmarkRun:
+ """
+ Load benchmark results from JSON file.
+
+ Args:
+ path: Path to JSON file or run directory containing results.json
+
+ Returns:
+ BenchmarkRun instance
+ """
+ # If path is a directory, look for results.json inside
+ if os.path.isdir(path):
+ path = os.path.join(path, "results.json")
+
+ with open(path) as f:
+ data = json.load(f)
+ return BenchmarkRun.from_dict(data)
+
+
+def collect_environment_info() -> EnvironmentInfo:
+ """Collect environment information for reproducibility."""
+ info = EnvironmentInfo(
+ python_version=platform.python_version(),
+ platform_system=platform.system(),
+ platform_release=platform.release(),
+ platform_machine=platform.machine(),
+ cpu_count=os.cpu_count() or 0,
+ )
+
+ # Try to get package versions
+ try:
+ import duckdb
+
+ info.duckdb_version = duckdb.__version__
+ except (ImportError, AttributeError):
+ pass
+
+ try:
+ import pyspark
+
+ info.pyspark_version = pyspark.__version__
+ except (ImportError, AttributeError):
+ pass
+
+ try:
+ import pydeequ
+
+ info.pydeequ_version = getattr(pydeequ, "__version__", "unknown")
+ except (ImportError, AttributeError):
+ pass
+
+ try:
+ import pandas
+
+ info.pandas_version = pandas.__version__
+ except (ImportError, AttributeError):
+ pass
+
+ try:
+ import numpy
+
+ info.numpy_version = numpy.__version__
+ except (ImportError, AttributeError):
+ pass
+
+ try:
+ import pyarrow
+
+ info.pyarrow_version = pyarrow.__version__
+ except (ImportError, AttributeError):
+ pass
+
+ return info
+
+
+def merge_results(duckdb_run: Optional[BenchmarkRun], spark_run: Optional[BenchmarkRun]) -> BenchmarkRun:
+ """
+ Merge results from separate DuckDB and Spark runs into a single combined result.
+
+ Args:
+ duckdb_run: Results from DuckDB worker (may be None)
+ spark_run: Results from Spark worker (may be None)
+
+ Returns:
+ Combined BenchmarkRun
+ """
+ # Use whichever run is available as the base
+ base = duckdb_run or spark_run
+ if base is None:
+ raise ValueError("At least one run must be provided")
+
+ merged = BenchmarkRun(
+ run_id=base.run_id,
+ timestamp=base.timestamp,
+ engine="all" if duckdb_run and spark_run else base.engine,
+ config=base.config,
+ environment=base.environment,
+ errors=base.errors.copy(),
+ )
+
+    def _merge_lists(attr: str) -> List[Dict[str, Any]]:
+        """Merge per-experiment result dicts from both runs, keyed by (rows, cols)."""
+        by_key: Dict[Any, Dict[str, Any]] = {}
+        for run in (duckdb_run, spark_run):
+            if run:
+                for r in getattr(run, attr):
+                    key = (r.get("rows"), r.get("cols"))
+                    if key not in by_key:
+                        by_key[key] = r.copy()
+                    else:
+                        by_key[key].update({k: v for k, v in r.items() if v is not None})
+        return list(by_key.values())
+
+    # Collect errors from both runs, skipping duplicates
+    for run in (duckdb_run, spark_run):
+        if run:
+            merged.errors.extend(e for e in run.errors if e not in merged.errors)
+
+    merged.varying_rows_results = _merge_lists("varying_rows_results")
+    merged.varying_cols_results = _merge_lists("varying_cols_results")
+    merged.profiling_results = _merge_lists("profiling_results")
+
+ # Take Spark startup time from Spark run
+ if spark_run and spark_run.spark_startup_time:
+ merged.spark_startup_time = spark_run.spark_startup_time
+
+ # Sum total durations
+ total = 0.0
+ if duckdb_run and duckdb_run.total_duration_seconds:
+ total += duckdb_run.total_duration_seconds
+ if spark_run and spark_run.total_duration_seconds:
+ total += spark_run.total_duration_seconds
+ merged.total_duration_seconds = total if total > 0 else None
+
+ return merged
diff --git a/benchmark/spark_server.py b/benchmark/spark_server.py
new file mode 100644
index 0000000..eac0020
--- /dev/null
+++ b/benchmark/spark_server.py
@@ -0,0 +1,197 @@
+"""Spark Connect server management for benchmarks."""
+
+import os
+import signal
+import socket
+import subprocess
+import time
+from contextlib import contextmanager
+from typing import Optional
+
+from .config import SparkServerConfig
+
+
+class SparkConnectServer:
+ """Manages Spark Connect server lifecycle."""
+
+ def __init__(self, config: Optional[SparkServerConfig] = None):
+ """
+ Initialize Spark Connect server manager.
+
+ Args:
+ config: Server configuration (uses defaults if not provided)
+ """
+ self.config = config or SparkServerConfig()
+ self._process: Optional[subprocess.Popen] = None
+ self._started_by_us = False
+
+ def is_running(self) -> bool:
+ """Check if Spark Connect server is running by attempting to connect."""
+ try:
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ sock.settimeout(1)
+ result = sock.connect_ex(("localhost", self.config.port))
+ sock.close()
+ return result == 0
+ except (socket.error, OSError):
+ return False
+
+ def start(self) -> float:
+ """
+ Start Spark Connect server if not already running.
+
+ Returns:
+ Time taken to start the server (0 if already running)
+
+ Raises:
+ RuntimeError: If server fails to start within timeout
+ """
+ if self.is_running():
+ print(f"Spark Connect server already running on port {self.config.port}")
+ return 0.0
+
+ start_time = time.time()
+
+ # Build the startup command
+ start_script = os.path.join(self.config.spark_home, "sbin", "start-connect-server.sh")
+
+ if not os.path.exists(start_script):
+ raise RuntimeError(f"Spark Connect start script not found: {start_script}")
+
+ cmd = [
+ start_script,
+ "--conf", f"spark.driver.memory={self.config.driver_memory}",
+ "--conf", f"spark.executor.memory={self.config.executor_memory}",
+ "--packages", "org.apache.spark:spark-connect_2.12:3.5.0",
+ "--jars", self.config.deequ_jar,
+ "--conf", "spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin",
+ ]
+
+ # Set up environment
+ env = os.environ.copy()
+ env["JAVA_HOME"] = self.config.java_home
+ env["SPARK_HOME"] = self.config.spark_home
+
+ print(f"Starting Spark Connect server on port {self.config.port}...")
+ print(f" JAVA_HOME: {self.config.java_home}")
+ print(f" SPARK_HOME: {self.config.spark_home}")
+
+ # Start the server
+ self._process = subprocess.Popen(
+ cmd,
+ env=env,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ self._started_by_us = True
+
+ # Wait for server to be ready
+ deadline = time.time() + self.config.startup_timeout
+ while time.time() < deadline:
+ if self.is_running():
+ elapsed = time.time() - start_time
+ print(f"Spark Connect server started in {elapsed:.1f}s")
+ return elapsed
+ time.sleep(self.config.poll_interval)
+
+ # Timeout - try to get error output
+        if self._process:
+            self._process.terminate()
+            try:
+                _, stderr = self._process.communicate(timeout=5)
+            except subprocess.TimeoutExpired:
+                # The launcher did not exit after terminate(); force-kill it
+                self._process.kill()
+                _, stderr = self._process.communicate()
+            error_msg = stderr.decode() if stderr else "Unknown error"
+            self._process = None
+            self._started_by_us = False
+            raise RuntimeError(
+                f"Spark Connect server failed to start within {self.config.startup_timeout}s: {error_msg[:500]}"
+            )
+
+ raise RuntimeError(
+ f"Spark Connect server failed to start within {self.config.startup_timeout}s"
+ )
+
+ def stop(self) -> None:
+ """Stop Spark Connect server if we started it."""
+ if not self._started_by_us:
+ print("Spark Connect server was not started by us, skipping stop")
+ return
+
+ stop_script = os.path.join(self.config.spark_home, "sbin", "stop-connect-server.sh")
+
+ if os.path.exists(stop_script):
+ print("Stopping Spark Connect server...")
+ env = os.environ.copy()
+ env["JAVA_HOME"] = self.config.java_home
+ env["SPARK_HOME"] = self.config.spark_home
+
+ try:
+ subprocess.run(
+ [stop_script],
+ env=env,
+ timeout=30,
+ capture_output=True,
+ )
+ print("Spark Connect server stopped")
+ except subprocess.TimeoutExpired:
+ print("Warning: stop script timed out")
+ except Exception as e:
+ print(f"Warning: Error stopping server: {e}")
+ else:
+ # Fall back to killing the process directly
+ if self._process:
+ print("Terminating Spark Connect server process...")
+ self._process.terminate()
+ try:
+ self._process.wait(timeout=10)
+ except subprocess.TimeoutExpired:
+ self._process.kill()
+ print("Spark Connect server process terminated")
+
+ self._started_by_us = False
+ self._process = None
+
+
+@contextmanager
+def managed_spark_server(config: Optional[SparkServerConfig] = None):
+ """
+ Context manager for Spark Connect server with signal handling.
+
+ Ensures the server is stopped on exit, including on SIGINT/SIGTERM.
+
+ Args:
+ config: Server configuration
+
+ Yields:
+ SparkConnectServer instance
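+
+    Example (sketch)::
+
+        with managed_spark_server(config) as server:
+            server.start()
+            # run Spark benchmarks while the server is up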
+ """
+ server = SparkConnectServer(config)
+ original_sigint = signal.getsignal(signal.SIGINT)
+ original_sigterm = signal.getsignal(signal.SIGTERM)
+
+    def signal_handler(signum, frame):
+        """Handle interrupt signals by stopping the server."""
+        print(f"\nReceived signal {signum}, stopping Spark server...")
+        server.stop()
+        # Restore the original handler, then re-send the signal so the
+        # original behavior applies even when the handler is SIG_DFL
+        # (which is not callable and therefore cannot be invoked directly)
+        if signum == signal.SIGINT:
+            signal.signal(signal.SIGINT, original_sigint)
+        else:
+            signal.signal(signal.SIGTERM, original_sigterm)
+        os.kill(os.getpid(), signum)
+
+ try:
+ # Install signal handlers
+ signal.signal(signal.SIGINT, signal_handler)
+ signal.signal(signal.SIGTERM, signal_handler)
+
+ yield server
+
+ finally:
+ # Restore original signal handlers
+ signal.signal(signal.SIGINT, original_sigint)
+ signal.signal(signal.SIGTERM, original_sigterm)
+
+ # Stop the server
+ server.stop()
diff --git a/benchmark/visualize.py b/benchmark/visualize.py
new file mode 100644
index 0000000..f239fa9
--- /dev/null
+++ b/benchmark/visualize.py
@@ -0,0 +1,288 @@
+"""
+Benchmark Visualization for PyDeequ Engine Comparison.
+
+Generates PNG charts comparing DuckDB vs Spark performance from benchmark results.
+"""
+
+import os
+from typing import List, Dict, Any, Optional
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from .results import BenchmarkRun
+
+
+def _format_row_count(n: int) -> str:
+ """Format row count for display (e.g., 1000000 -> '1M')."""
+ if n >= 1_000_000:
+ return f"{n // 1_000_000}M"
+ elif n >= 1_000:
+ return f"{n // 1_000}K"
+ return str(n)
+
+
+def _extract_validation_data(
+ results: List[Dict[str, Any]], x_key: str
+) -> Dict[str, List]:
+ """Extract validation timing data from results."""
+ data = {
+ "x_values": [],
+ "x_labels": [],
+ "duckdb": [],
+ "spark": [],
+ "checks": [],
+ }
+
+ for r in sorted(results, key=lambda x: x.get(x_key, 0)):
+ x_val = r.get(x_key)
+ if x_val is None:
+ continue
+
+ data["x_values"].append(x_val)
+ if x_key == "rows":
+ data["x_labels"].append(_format_row_count(x_val))
+ else:
+ checks = r.get("checks", "")
+ data["x_labels"].append(f"{x_val}\n({checks})" if checks else str(x_val))
+
+ data["duckdb"].append(r.get("duckdb_validation"))
+ data["spark"].append(r.get("spark_validation"))
+ data["checks"].append(r.get("checks"))
+
+ return data
+
+
+def _extract_profiling_data(results: List[Dict[str, Any]]) -> Dict[str, List]:
+ """Extract profiling timing data from results."""
+ data = {
+ "x_values": [],
+ "x_labels": [],
+ "duckdb": [],
+ "spark": [],
+ }
+
+ for r in sorted(results, key=lambda x: x.get("rows", 0)):
+ rows = r.get("rows")
+ if rows is None:
+ continue
+
+ data["x_values"].append(rows)
+ data["x_labels"].append(_format_row_count(rows))
+ data["duckdb"].append(r.get("duckdb_profiling"))
+ data["spark"].append(r.get("spark_profiling"))
+
+ return data
+
+
+def _calculate_speedup(spark_times: List, duckdb_times: List) -> List[Optional[float]]:
+ """Calculate speedup ratios (Spark time / DuckDB time)."""
+ speedups = []
+ for s, d in zip(spark_times, duckdb_times):
+ if s is not None and d is not None and d > 0:
+ speedups.append(s / d)
+ else:
+ speedups.append(None)
+ return speedups
+
+
+def _plot_comparison(
+ ax: plt.Axes,
+ x_labels: List[str],
+ duckdb_times: List,
+ spark_times: List,
+ xlabel: str,
+ ylabel: str,
+ title: str,
+ duckdb_color: str,
+ spark_color: str,
+ use_log_scale: bool = False,
+) -> None:
+ """Plot a side-by-side bar comparison chart."""
+ # Filter out None values
+ valid_indices = [
+ i for i in range(len(x_labels))
+ if duckdb_times[i] is not None or spark_times[i] is not None
+ ]
+
+ if not valid_indices:
+ ax.text(0.5, 0.5, "No data available", ha="center", va="center", transform=ax.transAxes)
+ ax.set_title(title, fontsize=12, fontweight="bold")
+ return
+
+ labels = [x_labels[i] for i in valid_indices]
+ duckdb = [duckdb_times[i] if duckdb_times[i] is not None else 0 for i in valid_indices]
+ spark = [spark_times[i] if spark_times[i] is not None else 0 for i in valid_indices]
+
+ x = np.arange(len(labels))
+ width = 0.35
+
+ has_duckdb = any(d > 0 for d in duckdb)
+ has_spark = any(s > 0 for s in spark)
+
+ if has_duckdb:
+ bars1 = ax.bar(
+ x - width / 2, duckdb, width, label="DuckDB",
+ color=duckdb_color, edgecolor="black", linewidth=0.5
+ )
+ for bar in bars1:
+ height = bar.get_height()
+ if height > 0:
+ ax.annotate(
+ f"{height:.2f}s",
+ xy=(bar.get_x() + bar.get_width() / 2, height),
+ xytext=(0, 3), textcoords="offset points",
+ ha="center", va="bottom", fontsize=7
+ )
+
+ if has_spark:
+ bars2 = ax.bar(
+ x + width / 2, spark, width, label="Spark",
+ color=spark_color, edgecolor="black", linewidth=0.5
+ )
+ for bar in bars2:
+ height = bar.get_height()
+ if height > 0:
+ ax.annotate(
+ f"{height:.1f}s",
+ xy=(bar.get_x() + bar.get_width() / 2, height),
+ xytext=(0, 3), textcoords="offset points",
+ ha="center", va="bottom", fontsize=7
+ )
+
+ ax.set_xlabel(xlabel, fontsize=11)
+ ax.set_ylabel(ylabel, fontsize=11)
+ ax.set_title(title, fontsize=12, fontweight="bold")
+ ax.set_xticks(x)
+ ax.set_xticklabels(labels)
+ ax.legend(loc="upper left")
+
+ if use_log_scale and has_duckdb and has_spark:
+ ax.set_yscale("log")
+
+
+def _plot_speedup(
+ ax: plt.Axes,
+ x_labels: List[str],
+ speedups: List[Optional[float]],
+ xlabel: str,
+ title: str,
+ speedup_color: str,
+) -> None:
+ """Plot a speedup bar chart."""
+ valid_indices = [i for i in range(len(x_labels)) if speedups[i] is not None]
+
+ if not valid_indices:
+ ax.text(0.5, 0.5, "No speedup data\n(need both engines)", ha="center", va="center", transform=ax.transAxes)
+ ax.set_title(title, fontsize=12, fontweight="bold")
+ return
+
+ labels = [x_labels[i] for i in valid_indices]
+ values = [speedups[i] for i in valid_indices]
+
+ x = np.arange(len(labels))
+ bars = ax.bar(x, values, color=speedup_color, edgecolor="black", linewidth=0.5)
+
+ ax.axhline(y=1, color="gray", linestyle="--", alpha=0.7)
+ ax.set_xlabel(xlabel, fontsize=11)
+ ax.set_ylabel("Speedup (x times faster)", fontsize=11)
+ ax.set_title(title, fontsize=12, fontweight="bold")
+ ax.set_xticks(x)
+ ax.set_xticklabels(labels)
+
+ for bar in bars:
+ height = bar.get_height()
+ ax.annotate(
+ f"{height:.1f}x",
+ xy=(bar.get_x() + bar.get_width() / 2, height),
+ xytext=(0, 3), textcoords="offset points",
+ ha="center", va="bottom", fontsize=10, fontweight="bold"
+ )
+
+
+def generate_visualization(run: BenchmarkRun, output_path: str) -> str:
+ """
+ Generate benchmark visualization PNG from results.
+
+ Args:
+ run: BenchmarkRun instance with results
+ output_path: Path to save the PNG file
+
+ Returns:
+ Path to the saved PNG file
+ """
+ # Extract data from results
+ rows_data = _extract_validation_data(run.varying_rows_results, "rows")
+ cols_data = _extract_validation_data(run.varying_cols_results, "cols")
+ profiling_data = _extract_profiling_data(run.profiling_results)
+
+ # Calculate speedups
+ rows_speedup = _calculate_speedup(rows_data["spark"], rows_data["duckdb"])
+ cols_speedup = _calculate_speedup(cols_data["spark"], cols_data["duckdb"])
+ profiling_speedup = _calculate_speedup(profiling_data["spark"], profiling_data["duckdb"])
+
+ # Color scheme
+ duckdb_color = "#FFA500" # Orange
+ spark_color = "#E25A1C" # Spark orange/red
+ speedup_color = "#2E86AB" # Blue
+
+ # Set style and create figure
+ plt.style.use("seaborn-v0_8-whitegrid")
+ fig, axes = plt.subplots(2, 3, figsize=(16, 10))
+
+ # Row 1: Time comparisons
+ _plot_comparison(
+ axes[0, 0], rows_data["x_labels"], rows_data["duckdb"], rows_data["spark"],
+ "Dataset Size (rows)", "Validation Time (seconds)",
+ "Exp 1: Varying Rows (10 cols)",
+ duckdb_color, spark_color, use_log_scale=True
+ )
+
+ _plot_comparison(
+ axes[0, 1], cols_data["x_labels"], cols_data["duckdb"], cols_data["spark"],
+ "Columns (Checks)", "Validation Time (seconds)",
+ "Exp 2: Varying Columns (1M rows)",
+ duckdb_color, spark_color
+ )
+
+ _plot_comparison(
+ axes[0, 2], profiling_data["x_labels"], profiling_data["duckdb"], profiling_data["spark"],
+ "Dataset Size (rows)", "Profiling Time (seconds)",
+ "Exp 3: Column Profiling (10 cols)",
+ duckdb_color, spark_color
+ )
+
+ # Row 2: Speedup charts
+ _plot_speedup(
+ axes[1, 0], rows_data["x_labels"], rows_speedup,
+ "Dataset Size (rows)", "DuckDB Speedup: Varying Rows",
+ speedup_color
+ )
+
+ _plot_speedup(
+ axes[1, 1], cols_data["x_labels"], cols_speedup,
+ "Columns (Checks)", "DuckDB Speedup: Varying Columns",
+ speedup_color
+ )
+
+ _plot_speedup(
+ axes[1, 2], profiling_data["x_labels"], profiling_speedup,
+ "Dataset Size (rows)", "DuckDB Speedup: Column Profiling",
+ speedup_color
+ )
+
+ # Title
+ engine_label = run.engine.upper() if run.engine != "all" else "DuckDB vs Spark"
+ fig.suptitle(
+ f"PyDeequ Benchmark: {engine_label}\n{run.run_id}",
+ fontsize=14, fontweight="bold", y=1.02
+ )
+
+ plt.tight_layout()
+
+ # Save the figure
+ os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+ plt.savefig(output_path, dpi=150, bbox_inches="tight", facecolor="white", edgecolor="none")
+ plt.close(fig)
+
+ return output_path
diff --git a/benchmark/worker.py b/benchmark/worker.py
new file mode 100644
index 0000000..3724731
--- /dev/null
+++ b/benchmark/worker.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""
+Subprocess worker for running benchmarks in isolated process.
+
+This module is designed to be run as:
+    python -m benchmark.worker --config <config.json> --engine {duckdb,spark} --output <results.json>
+
+Each engine runs in a fresh subprocess to ensure clean JVM/Python state.
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+from datetime import datetime
+
+from .config import BenchmarkConfig
+from .results import (
+ BenchmarkRun,
+ collect_environment_info,
+ generate_run_id,
+)
+from .experiments import (
+ run_varying_rows_experiment_duckdb,
+ run_varying_cols_experiment_duckdb,
+ run_profiling_experiment_duckdb,
+ run_varying_rows_experiment_spark,
+ run_varying_cols_experiment_spark,
+ run_profiling_experiment_spark,
+ setup_spark,
+)
+
+
+def run_duckdb_worker(config: BenchmarkConfig, run_id: str) -> BenchmarkRun:
+ """Run all DuckDB experiments."""
+ start_time = time.time()
+
+ run = BenchmarkRun(
+ run_id=run_id,
+ timestamp=datetime.now().isoformat(),
+ engine="duckdb",
+ config=config.to_dict(),
+ environment=collect_environment_info(),
+ )
+
+ exp_config = config.experiment
+
+ try:
+ run.varying_rows_results = run_varying_rows_experiment_duckdb(exp_config)
+ except Exception as e:
+ run.errors.append(f"DuckDB varying rows experiment failed: {e}")
+ print(f"Error in varying rows experiment: {e}")
+
+ try:
+ run.varying_cols_results = run_varying_cols_experiment_duckdb(exp_config)
+ except Exception as e:
+ run.errors.append(f"DuckDB varying cols experiment failed: {e}")
+ print(f"Error in varying cols experiment: {e}")
+
+ try:
+ run.profiling_results = run_profiling_experiment_duckdb(exp_config)
+ except Exception as e:
+ run.errors.append(f"DuckDB profiling experiment failed: {e}")
+ print(f"Error in profiling experiment: {e}")
+
+ run.total_duration_seconds = time.time() - start_time
+ return run
+
+
+def run_spark_worker(config: BenchmarkConfig, run_id: str) -> BenchmarkRun:
+ """Run all Spark experiments."""
+ start_time = time.time()
+
+ run = BenchmarkRun(
+ run_id=run_id,
+ timestamp=datetime.now().isoformat(),
+ engine="spark",
+ config=config.to_dict(),
+ environment=collect_environment_info(),
+ )
+
+ exp_config = config.experiment
+
+ # Setup Spark
+ spark = None
+ spark_startup_time = 0.0
+
+ try:
+ print("\nSetting up Spark Connect...")
+ spark, spark_startup_time = setup_spark(config.spark_remote)
+ print(f" Spark Startup: {spark_startup_time:.3f}s")
+ run.spark_startup_time = spark_startup_time
+ except Exception as e:
+ error_msg = f"Spark setup failed: {e}"
+ run.errors.append(error_msg)
+ print(f"Error: {error_msg}")
+ run.total_duration_seconds = time.time() - start_time
+ return run
+
+ try:
+ run.varying_rows_results = run_varying_rows_experiment_spark(
+ spark, spark_startup_time, exp_config
+ )
+ except Exception as e:
+ run.errors.append(f"Spark varying rows experiment failed: {e}")
+ print(f"Error in varying rows experiment: {e}")
+
+ try:
+ run.varying_cols_results = run_varying_cols_experiment_spark(
+ spark, spark_startup_time, exp_config
+ )
+ except Exception as e:
+ run.errors.append(f"Spark varying cols experiment failed: {e}")
+ print(f"Error in varying cols experiment: {e}")
+
+ try:
+ run.profiling_results = run_profiling_experiment_spark(
+ spark, spark_startup_time, exp_config
+ )
+ except Exception as e:
+ run.errors.append(f"Spark profiling experiment failed: {e}")
+ print(f"Error in profiling experiment: {e}")
+
+ run.total_duration_seconds = time.time() - start_time
+ return run
+
+
+def main():
+ parser = argparse.ArgumentParser(description="Benchmark worker subprocess")
+ parser.add_argument(
+ "--config",
+ required=True,
+ help="Path to JSON config file",
+ )
+ parser.add_argument(
+ "--engine",
+ choices=["duckdb", "spark"],
+ required=True,
+ help="Engine to benchmark",
+ )
+ parser.add_argument(
+ "--output",
+ required=True,
+ help="Path to write JSON results",
+ )
+ parser.add_argument(
+ "--run-id",
+ help="Run ID to use (optional, generated if not provided)",
+ )
+
+ args = parser.parse_args()
+
+ # Load config from JSON file
+ with open(args.config) as f:
+ config_data = json.load(f)
+ config = BenchmarkConfig.from_dict(config_data)
+
+ # Use provided run ID or generate new one
+ run_id = args.run_id or generate_run_id()
+
+ print(f"Benchmark Worker: {args.engine}")
+ print(f"Run ID: {run_id}")
+ print("=" * 70)
+
+ # Run the appropriate engine
+ if args.engine == "duckdb":
+ result = run_duckdb_worker(config, run_id)
+ else:
+ result = run_spark_worker(config, run_id)
+
+ # Write results to output file
+ os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+ with open(args.output, "w") as f:
+ json.dump(result.to_dict(), f, indent=2)
+
+ print(f"\nResults written to: {args.output}")
+ print(f"Total duration: {result.total_duration_seconds:.1f}s")
+
+ if result.errors:
+ print(f"Errors encountered: {len(result.errors)}")
+ for error in result.errors:
+ print(f" - {error}")
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmark_cli.py b/benchmark_cli.py
new file mode 100644
index 0000000..418e401
--- /dev/null
+++ b/benchmark_cli.py
@@ -0,0 +1,354 @@
+#!/usr/bin/env python3
+"""
+PyDeequ Engine Benchmark CLI
+
+Orchestrates benchmark runs with process isolation and auto Spark server management.
+
+Usage:
+ # Run DuckDB only (no Spark server needed)
+ python benchmark_cli.py run --engine duckdb
+
+ # Run Spark only (auto-spark is enabled by default)
+ python benchmark_cli.py run --engine spark
+
+ # Run both engines
+ python benchmark_cli.py run --engine all
+
+ # Run without auto Spark server management (assumes server is running)
+ python benchmark_cli.py run --engine spark --no-auto-spark
+
+ # Generate report from saved results (folder or file path)
+ python benchmark_cli.py report benchmark_results/benchmark_2024-01-19T14-30-45/
+ python benchmark_cli.py report benchmark_results/benchmark_2024-01-19T14-30-45/results.json
+
+ # Generate report to custom location
+ python benchmark_cli.py report benchmark_results/benchmark_2024-01-19T14-30-45/ -o MY_RESULTS.md
+
+ # Generate visualization PNG from results
+ python benchmark_cli.py visualize benchmark_results/benchmark_2024-01-19T14-30-45/
+ python benchmark_cli.py visualize benchmark_results/benchmark_2024-01-19T14-30-45/ -o charts.png
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import tempfile
+import time
+from typing import Optional
+
+from benchmark.config import BenchmarkConfig, ExperimentConfig, SparkServerConfig
+from benchmark.results import (
+ BenchmarkRun,
+ generate_run_id,
+ save_results,
+ load_results,
+ merge_results,
+)
+from benchmark.spark_server import managed_spark_server
+from benchmark.report import save_report
+from benchmark.visualize import generate_visualization
+
+
+def run_engine_in_subprocess(
+ engine: str,
+ config: BenchmarkConfig,
+ run_id: str,
+) -> Optional[BenchmarkRun]:
+ """
+ Run benchmark for a single engine in an isolated subprocess.
+
+ Args:
+ engine: Engine to run ("duckdb" or "spark")
+ config: Benchmark configuration
+ run_id: Run ID to use
+
+ Returns:
+ BenchmarkRun result, or None on failure
+ """
+ print(f"\n{'=' * 70}")
+ print(f"Running {engine.upper()} benchmarks in subprocess...")
+ print("=" * 70)
+
+ # Write config to temp file
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+ json.dump(config.to_dict(), f, indent=2)
+ config_path = f.name
+
+ # Temp output file for results (will be cleaned up after loading)
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+ output_path = f.name
+
+ try:
+ # Run worker subprocess
+ cmd = [
+ sys.executable,
+ "-m",
+ "benchmark.worker",
+ "--config", config_path,
+ "--engine", engine,
+ "--output", output_path,
+ "--run-id", run_id,
+ ]
+
+ print(f"Command: {' '.join(cmd)}")
+ print()
+
+ result = subprocess.run(
+ cmd,
+ cwd=os.path.dirname(os.path.abspath(__file__)),
+ )
+
+ if result.returncode != 0:
+ print(f"\n{engine.upper()} worker exited with code {result.returncode}")
+ # Try to load partial results if they exist
+ if os.path.exists(output_path):
+ return load_results(output_path)
+ return None
+
+ # Load and return results
+ return load_results(output_path)
+
+ except Exception as e:
+ print(f"Error running {engine} worker: {e}")
+ return None
+
+ finally:
+ # Clean up temp files
+ if os.path.exists(config_path):
+ os.unlink(config_path)
+ if os.path.exists(output_path):
+ os.unlink(output_path)
+
+
+def cmd_run(args: argparse.Namespace) -> int:
+ """Execute the 'run' subcommand."""
+ # Build configuration
+ exp_config = ExperimentConfig()
+ if args.n_runs:
+ exp_config.n_runs = args.n_runs
+
+ spark_config = SparkServerConfig()
+ if args.spark_home:
+ spark_config.spark_home = args.spark_home
+ if args.java_home:
+ spark_config.java_home = args.java_home
+
+ config = BenchmarkConfig(
+ engine=args.engine,
+ output_dir=args.output_dir,
+ experiment=exp_config,
+ spark_server=spark_config,
+ )
+
+ # Generate run ID and create run directory
+ run_id = generate_run_id()
+ run_dir = os.path.join(args.output_dir, run_id)
+ os.makedirs(run_dir, exist_ok=True)
+
+ auto_spark = not args.no_auto_spark
+
+ print("PyDeequ Engine Benchmark")
+ print("=" * 70)
+ print(f"Run ID: {run_id}")
+ print(f"Engine: {args.engine}")
+ print(f"Output directory: {run_dir}")
+ print(f"Auto Spark: {auto_spark}")
+ print(f"Validation runs: {exp_config.n_runs}")
+ print(f"Row counts: {exp_config.row_counts}")
+ print(f"Column counts: {exp_config.column_counts}")
+ print(f"Cache directory: {exp_config.cache_dir}")
+
+ start_time = time.time()
+
+ run_duckdb = args.engine in ("all", "duckdb")
+ run_spark = args.engine in ("all", "spark")
+
+ duckdb_result: Optional[BenchmarkRun] = None
+ spark_result: Optional[BenchmarkRun] = None
+
+ # Run DuckDB (doesn't need Spark server)
+ if run_duckdb:
+ duckdb_result = run_engine_in_subprocess("duckdb", config, run_id)
+
+ # Run Spark (may need server management)
+ if run_spark:
+ if auto_spark:
+ # Use managed server context
+ with managed_spark_server(spark_config) as server:
+                server.start()
+ if server.is_running():
+ spark_result = run_engine_in_subprocess("spark", config, run_id)
+ else:
+ print("Spark server failed to start, skipping Spark benchmarks")
+ else:
+ # Assume server is already running
+ spark_result = run_engine_in_subprocess("spark", config, run_id)
+
+ # Merge results if both engines ran
+ if duckdb_result and spark_result:
+ final_result = merge_results(duckdb_result, spark_result)
+ elif duckdb_result:
+ final_result = duckdb_result
+ elif spark_result:
+ final_result = spark_result
+ else:
+ print("\nNo benchmark results produced!")
+ return 1
+
+ # Update total duration
+ final_result.total_duration_seconds = time.time() - start_time
+
+ # Save combined results to run directory
+ results_path = save_results(final_result, run_dir)
+ print(f"\n{'=' * 70}")
+ print(f"Results saved to: {results_path}")
+
+ # Generate markdown report in run directory
+ report_path = os.path.join(run_dir, "BENCHMARK_RESULTS.md")
+ save_report(final_result, report_path)
+ print(f"Report saved to: {report_path}")
+
+ print(f"Total duration: {final_result.total_duration_seconds:.1f}s")
+
+ if final_result.errors:
+ print(f"\nErrors encountered: {len(final_result.errors)}")
+ for error in final_result.errors:
+ print(f" - {error}")
+ return 1
+
+ return 0
+
+
+def cmd_report(args: argparse.Namespace) -> int:
+ """Execute the 'report' subcommand."""
+ if not os.path.exists(args.json_file):
+ print(f"Error: File not found: {args.json_file}")
+ return 1
+
+ try:
+ run = load_results(args.json_file)
+ except Exception as e:
+ print(f"Error loading results: {e}")
+ return 1
+
+ output_path = args.output or "BENCHMARK_RESULTS.md"
+ save_report(run, output_path)
+ print(f"Report generated: {output_path}")
+
+ return 0
+
+
+def cmd_visualize(args: argparse.Namespace) -> int:
+ """Execute the 'visualize' subcommand."""
+ if not os.path.exists(args.results_path):
+ print(f"Error: Path not found: {args.results_path}")
+ return 1
+
+ try:
+ run = load_results(args.results_path)
+ except Exception as e:
+ print(f"Error loading results: {e}")
+ return 1
+
+ # Determine output path
+ if args.output:
+ output_path = args.output
+ else:
+ # Default: save in the same directory as results
+ if os.path.isdir(args.results_path):
+ output_path = os.path.join(args.results_path, "benchmark_chart.png")
+ else:
+ output_path = os.path.join(os.path.dirname(args.results_path), "benchmark_chart.png")
+
+ try:
+ generate_visualization(run, output_path)
+ print(f"Visualization saved to: {output_path}")
+ except Exception as e:
+ print(f"Error generating visualization: {e}")
+ return 1
+
+ return 0
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="PyDeequ Engine Benchmark CLI",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog=__doc__,
+ )
+
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
+
+ # 'run' subcommand
+ run_parser = subparsers.add_parser("run", help="Run benchmark experiments")
+ run_parser.add_argument(
+ "--engine",
+ choices=["all", "duckdb", "spark"],
+ default="all",
+ help="Engine to benchmark (default: all)",
+ )
+ run_parser.add_argument(
+ "--output-dir",
+ default="benchmark_results",
+ help="Output directory for results (default: benchmark_results/)",
+ )
+ run_parser.add_argument(
+ "--no-auto-spark",
+ action="store_true",
+ dest="no_auto_spark",
+ help="Disable automatic Spark Connect server management (assumes server is already running)",
+ )
+ run_parser.add_argument(
+ "--spark-home",
+ help="Path to Spark installation",
+ )
+ run_parser.add_argument(
+ "--java-home",
+ help="Path to Java installation",
+ )
+ run_parser.add_argument(
+ "--n-runs",
+ type=int,
+ help="Number of validation runs for averaging",
+ )
+
+ # 'report' subcommand
+ report_parser = subparsers.add_parser("report", help="Generate markdown report from JSON results")
+ report_parser.add_argument(
+ "json_file",
+ help="Path to JSON results file or run directory containing results.json",
+ )
+ report_parser.add_argument(
+ "-o", "--output",
+ help="Output path for markdown report (default: BENCHMARK_RESULTS.md)",
+ )
+
+ # 'visualize' subcommand
+ visualize_parser = subparsers.add_parser("visualize", help="Generate PNG visualization from results")
+ visualize_parser.add_argument(
+ "results_path",
+ help="Path to JSON results file or run directory containing results.json",
+ )
+ visualize_parser.add_argument(
+ "-o", "--output",
+ help="Output path for PNG file (default: benchmark_chart.png in results directory)",
+ )
+
+ args = parser.parse_args()
+
+ if args.command == "run":
+ sys.exit(cmd_run(args))
+ elif args.command == "report":
+ sys.exit(cmd_report(args))
+ elif args.command == "visualize":
+ sys.exit(cmd_visualize(args))
+ else:
+ parser.print_help()
+ sys.exit(1)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docs/architecture.md b/docs/architecture.md
new file mode 100644
index 0000000..6304e20
--- /dev/null
+++ b/docs/architecture.md
@@ -0,0 +1,214 @@
+# PyDeequ v2 Architecture
+
+## Overview
+
+PyDeequ v2 introduces a multi-engine architecture enabling data quality checks on different backends. The code is the source of truth; this document provides a high-level map to help you navigate the codebase.
+
+**Supported backends:**
+- **DuckDB**: Local development, small-to-medium datasets (`pip install duckdb`)
+- **Spark Connect**: Large-scale distributed processing (requires a Spark cluster)
+
+## Design Philosophy
+
+The architecture is inspired by [DuckDQ](https://github.com/tdoehmen/duckdq), which demonstrated a key insight:
+
+> **Decouple state computation (engine-dependent) from state merging (engine-independent)**
+
+- **State computation** = expensive, engine-dependent (SQL queries, Spark jobs)
+- **State merging** = cheap, pure Python (addition, max/min, Welford's algorithm)
+
+This separation enables multiple backends, incremental validation, and distributed processing.
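+
+To make the split concrete, here is a minimal sketch of a mergeable mean state (hypothetical names, not the actual PyDeequ state classes): each backend computes a cheap `(sum, count)` partial state, and merging partials is pure Python.
+
+```python
+from dataclasses import dataclass
+
+@dataclass
+class MeanState:
+    """Hypothetical mergeable state for a Mean metric."""
+    total: float = 0.0
+    count: int = 0
+
+    def merge(self, other: "MeanState") -> "MeanState":
+        # Engine-independent: combining partial states is plain addition
+        return MeanState(self.total + other.total, self.count + other.count)
+
+    @property
+    def mean(self) -> float:
+        return self.total / self.count if self.count else float("nan")
+
+# Engine-dependent part: each backend computes its partial state,
+# e.g. via SELECT SUM(amount), COUNT(amount) FROM t per partition.
+part_a = MeanState(total=10.0, count=4)
+part_b = MeanState(total=20.0, count=6)
+assert part_a.merge(part_b).mean == 3.0
+```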
+
+## Architecture Diagram
+
+```
+ ┌──────────────────────────────────────┐
+ │ User API │
+ │ VerificationSuite, AnalysisRunner │
+ │ ColumnProfilerRunner, Suggestions │
+ └─────────────────┬────────────────────┘
+ │
+ ┌─────────────────▼────────────────────┐
+ │ Engine Abstraction │
+ │ BaseEngine ABC │
+ │ compute_metrics(), run_checks() │
+ │ profile_columns(), suggest_...() │
+ └─────────────────┬────────────────────┘
+ │
+ ┌───────────────────────┼───────────────────────┐
+ │ │ │
+ ┌─────────▼─────────┐ ┌─────────▼─────────┐ ┌─────────▼─────────┐
+ │ DuckDBEngine │ │ SparkEngine │ │ Future Engines │
+ │ (Direct SQL) │ │ (Spark Connect) │ │ (Polars, etc.) │
+ └───────────────────┘ └───────────────────┘ └───────────────────┘
+```
+
+## Module Structure
+
+```
+pydeequ/
+├── __init__.py # connect() with auto-detection
+├── engines/
+│ ├── __init__.py # BaseEngine ABC, result types
+│ ├── duckdb.py # DuckDBEngine implementation
+│ ├── spark.py # SparkEngine wrapper
+│ ├── operators/
+│ │ ├── base.py # ScanOperator, GroupingOperator ABCs
+│ │ ├── factory.py # OperatorFactory registry
+│ │ ├── mixins.py # WhereClauseMixin, SafeExtractMixin
+│ │ ├── scan_operators.py # 15 single-pass operators
+│ │ ├── grouping_operators.py # 6 GROUP BY operators
+│ │ ├── metadata_operators.py # Schema-based operators
+│ │ └── profiling_operators.py # Column profiling operators
+│ ├── constraints/
+│ │ ├── base.py # BaseEvaluator hierarchy
+│ │ ├── factory.py # ConstraintEvaluatorFactory (27 types)
+│ │ └── evaluators.py # 23 concrete evaluators
+│ └── suggestions/
+│ ├── runner.py # Suggestion generation
+│ ├── rules.py # Rule implementations
+│ └── registry.py # Rule registry
+└── v2/ # User-facing API
+ ├── analyzers.py # Analyzer definitions
+ ├── checks.py # Check/Constraint definitions
+ ├── predicates.py # Predicate classes
+ ├── verification.py # VerificationSuite, AnalysisRunner
+ ├── profiles.py # ColumnProfilerRunner
+ └── suggestions.py # ConstraintSuggestionRunner
+```
+
+## Key Abstractions
+
+### BaseEngine (`pydeequ/engines/__init__.py`)
+
+Abstract base class defining the engine interface. All engines implement:
+- `compute_metrics(analyzers)` - Run analyzers and return `MetricResult` list
+- `run_checks(checks)` - Evaluate constraints and return `ConstraintResult` list
+- `profile_columns(columns)` - Return `ColumnProfile` for each column
+- `suggest_constraints(rules)` - Generate `ConstraintSuggestion` list
+
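+A minimal sketch of a custom backend against this contract (the class below and its bodies are hypothetical; only the four method names come from the interface above):
+
+```python
+from pydeequ.engines import BaseEngine
+
+class NullEngine(BaseEngine):
+    """Hypothetical no-op engine illustrating the BaseEngine contract."""
+
+    def compute_metrics(self, analyzers):
+        # Translate each analyzer into a computation; return MetricResult list
+        return []
+
+    def run_checks(self, checks):
+        # Evaluate each check's constraints; return ConstraintResult list
+        return []
+
+    def profile_columns(self, columns):
+        # Return a ColumnProfile per requested column
+        return []
+
+    def suggest_constraints(self, rules):
+        # Apply suggestion rules; return ConstraintSuggestion list
+        return []
+```
+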
+### Operators (`pydeequ/engines/operators/`)
+
+Operators translate analyzers into engine-specific queries:
+
+| Type | Description | Examples |
+|------|-------------|----------|
+| **ScanOperator** | Single-pass SQL aggregations, batched together | Size, Completeness, Mean, Sum, Min, Max |
+| **GroupingOperator** | Requires GROUP BY, runs individually | Distinctness, Uniqueness, Entropy |
+| **MetadataOperator** | Schema-based, no query needed | DataType |
+
+See `base.py` for ABCs, `factory.py` for the registry pattern.
+
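+Scan operators are what make batching possible: each contributes one SQL aggregate expression, and expressions that share a table scan can be fused into a single query. A rough sketch of the idea (illustrative only, not the engine's actual query builder):
+
+```python
+# Each ScanOperator contributes one aggregate expression; the engine
+# can then evaluate all of them in a single pass over the table.
+fragments = {
+    "size": "COUNT(*)",
+    "completeness_customer_id":
+        "AVG(CASE WHEN customer_id IS NOT NULL THEN 1.0 ELSE 0.0 END)",
+    "mean_amount": "AVG(amount)",
+}
+select_list = ", ".join(f"{expr} AS {alias}" for alias, expr in fragments.items())
+sql = f"SELECT {select_list} FROM sales"  # one query instead of three
+```
+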
+### OperatorFactory (`pydeequ/engines/operators/factory.py`)
+
+Registry mapping analyzer names to operator classes. Use `OperatorFactory.create(analyzer)` to instantiate operators. The factory determines query batching strategy.
+
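+A registry like this is commonly built with a decorator; a sketch of the pattern (the real factory's internals may differ):
+
+```python
+_OPERATORS: dict = {}
+
+def register_operator(analyzer_name: str):
+    """Register an operator class under an analyzer name."""
+    def decorator(cls):
+        _OPERATORS[analyzer_name] = cls
+        return cls
+    return decorator
+
+@register_operator("Completeness")
+class CompletenessOperator:
+    def __init__(self, analyzer):
+        self.analyzer = analyzer
+
+def create(analyzer):
+    # Look up the operator class by the analyzer's type name
+    return _OPERATORS[type(analyzer).__name__](analyzer)
+```
+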
+### Constraint Evaluators (`pydeequ/engines/constraints/`)
+
+Evaluators check if computed metrics satisfy constraints:
+- **AnalyzerBasedEvaluator**: Delegates to an analyzer operator (hasMean, hasMin)
+- **RatioCheckEvaluator**: Computes matches/total ratio (isPositive, isContainedIn)
+
+The `ConstraintEvaluatorFactory` maps 27 constraint types to evaluator classes.
+
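+For intuition, a ratio-style check such as `isPositive` reduces to one aggregation over matches and total; a sketch of the assumed shape (not the evaluator's actual SQL):
+
+```python
+import duckdb
+
+con = duckdb.connect()
+con.execute("CREATE TABLE sales AS SELECT * FROM (VALUES (1.0), (2.5), (-3.0)) t(amount)")
+
+matches, total = con.execute("""
+    SELECT SUM(CASE WHEN amount > 0 THEN 1 ELSE 0 END), COUNT(*) FROM sales
+""").fetchone()
+ratio = matches / total if total else 0.0  # 2/3 here
+passed = ratio >= 1.0                      # isPositive: every row must match
+```
+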
+### Result Types (`pydeequ/engines/__init__.py`)
+
+Standardized dataclasses returned by all engines:
+- `MetricResult`: Analyzer output (name, column, value, success)
+- `ConstraintResult`: Check output (constraint, status, message)
+- `ColumnProfile`: Profiling output (column, stats, histogram)
+
+All convert to pandas DataFrames via `results_to_dataframe()`.
+
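+For example (assuming `results_to_dataframe` is exported from `pydeequ.engines` alongside the result types):
+
+```python
+from pydeequ.engines import results_to_dataframe
+
+# `results` is a list of MetricResult/ConstraintResult returned by an engine
+df = results_to_dataframe(results)
+print(df.head())
+```
+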
+## Quick Start Examples
+
+### Analysis
+
+```python
+import duckdb
+import pydeequ
+from pydeequ.v2.analyzers import Size, Completeness, Mean
+from pydeequ.v2.verification import AnalysisRunner
+
+con = duckdb.connect()
+con.execute("CREATE TABLE sales AS SELECT * FROM 'sales.parquet'")
+engine = pydeequ.connect(con, table="sales")
+
+result = (AnalysisRunner()
+ .on_engine(engine)
+ .addAnalyzer(Size())
+ .addAnalyzer(Completeness("customer_id"))
+ .addAnalyzer(Mean("amount"))
+ .run())
+```
+
+### Verification
+
+```python
+from pydeequ.v2.checks import Check, CheckLevel
+from pydeequ.v2.verification import VerificationSuite
+from pydeequ.v2.predicates import gte
+
+result = (VerificationSuite()
+ .on_engine(engine)
+ .addCheck(
+ Check(CheckLevel.Error, "Data Quality")
+ .isComplete("id")
+ .hasCompleteness("email", gte(0.95))
+ .isUnique("transaction_id")
+ )
+ .run())
+```
+
+### Profiling
+
+```python
+from pydeequ.v2.profiles import ColumnProfilerRunner
+
+profiles = (ColumnProfilerRunner()
+ .on_engine(engine)
+ .run())
+```
+
+### Suggestions
+
+```python
+from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules
+
+suggestions = (ConstraintSuggestionRunner()
+ .on_engine(engine)
+ .addConstraintRules(Rules.DEFAULT)
+ .run())
+```
+
+## Engine Comparison
+
+| Aspect | DuckDB | Spark |
+|--------|--------|-------|
+| **Use case** | Local dev, CI/CD, files < 10GB | Distributed data, data lakes |
+| **Setup** | `pip install duckdb` | Spark cluster + Deequ plugin |
+| **Latency** | Low (in-process) | Higher (network overhead) |
+| **Scaling** | Single-node, memory-bound | Distributed, scales horizontally |
+| **Approximate metrics** | HyperLogLog, exact quantiles | HLL++, KLL sketches |
+
+Both engines aim for functional parity. Minor differences exist in approximate algorithms and histogram formats; see the test suite for tolerances.
+
+## Benchmarks
+
+Performance comparisons between DuckDB and Spark engines are documented in [BENCHMARK.md](../BENCHMARK.md), including:
+- Varying row counts (100K to 130M rows)
+- Varying column counts (10 to 80 columns)
+- Column profiling performance
+
+## Future Enhancements
+
+- State persistence for incremental validation
+- Additional backends (Polars, SQLAlchemy, BigQuery)
+- Anomaly detection on metrics
+- Data lineage for constraint violations
+
+## References
+
+- [DuckDQ](https://github.com/tdoehmen/duckdq) - Inspiration for engine abstraction
+- [AWS Deequ](https://github.com/awslabs/deequ) - Original Scala implementation
+- [Ibis](https://ibis-project.org/) - Multi-backend design patterns
diff --git a/imgs/benchmark_chart.png b/imgs/benchmark_chart.png
new file mode 100644
index 0000000..a6dc1ff
Binary files /dev/null and b/imgs/benchmark_chart.png differ
diff --git a/poetry.lock b/poetry.lock
index 5b439ef..a9272e1 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
[[package]]
name = "black"
@@ -6,6 +6,7 @@ version = "24.10.0"
description = "The uncompromising code formatter."
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "black-24.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6668650ea4b685440857138e5fe40cde4d652633b1bdffc62933d0db4ed9812"},
{file = "black-24.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1c536fcf674217e87b8cc3657b81809d3c085d7bf3ef262ead700da345bfa6ea"},
@@ -52,6 +53,7 @@ version = "3.4.0"
description = "Validate configuration and produce human readable error messages."
optional = false
python-versions = ">=3.8"
+groups = ["main", "dev"]
files = [
{file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
{file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
@@ -63,6 +65,7 @@ version = "8.1.8"
description = "Composable command line interface toolkit"
optional = false
python-versions = ">=3.7"
+groups = ["main", "dev"]
files = [
{file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"},
{file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"},
@@ -77,17 +80,277 @@ version = "0.4.6"
description = "Cross-platform colored terminal text."
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+groups = ["main", "dev"]
+markers = "sys_platform == \"win32\" or platform_system == \"Windows\""
files = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
+[[package]]
+name = "contourpy"
+version = "1.3.0"
+description = "Python library for calculating contours of 2D quadrilateral grids"
+optional = false
+python-versions = ">=3.9"
+groups = ["main", "dev"]
+markers = "python_version == \"3.9\""
+files = [
+ {file = "contourpy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:880ea32e5c774634f9fcd46504bf9f080a41ad855f4fef54f5380f5133d343c7"},
+ {file = "contourpy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:76c905ef940a4474a6289c71d53122a4f77766eef23c03cd57016ce19d0f7b42"},
+ {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92f8557cbb07415a4d6fa191f20fd9d2d9eb9c0b61d1b2f52a8926e43c6e9af7"},
+ {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36f965570cff02b874773c49bfe85562b47030805d7d8360748f3eca570f4cab"},
+ {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cacd81e2d4b6f89c9f8a5b69b86490152ff39afc58a95af002a398273e5ce589"},
+ {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69375194457ad0fad3a839b9e29aa0b0ed53bb54db1bfb6c3ae43d111c31ce41"},
+ {file = "contourpy-1.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a52040312b1a858b5e31ef28c2e865376a386c60c0e248370bbea2d3f3b760d"},
+ {file = "contourpy-1.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3faeb2998e4fcb256542e8a926d08da08977f7f5e62cf733f3c211c2a5586223"},
+ {file = "contourpy-1.3.0-cp310-cp310-win32.whl", hash = "sha256:36e0cff201bcb17a0a8ecc7f454fe078437fa6bda730e695a92f2d9932bd507f"},
+ {file = "contourpy-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:87ddffef1dbe5e669b5c2440b643d3fdd8622a348fe1983fad7a0f0ccb1cd67b"},
+ {file = "contourpy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0fa4c02abe6c446ba70d96ece336e621efa4aecae43eaa9b030ae5fb92b309ad"},
+ {file = "contourpy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:834e0cfe17ba12f79963861e0f908556b2cedd52e1f75e6578801febcc6a9f49"},
+ {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dbc4c3217eee163fa3984fd1567632b48d6dfd29216da3ded3d7b844a8014a66"},
+ {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4865cd1d419e0c7a7bf6de1777b185eebdc51470800a9f42b9e9decf17762081"},
+ {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:303c252947ab4b14c08afeb52375b26781ccd6a5ccd81abcdfc1fafd14cf93c1"},
+ {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637f674226be46f6ba372fd29d9523dd977a291f66ab2a74fbeb5530bb3f445d"},
+ {file = "contourpy-1.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:76a896b2f195b57db25d6b44e7e03f221d32fe318d03ede41f8b4d9ba1bff53c"},
+ {file = "contourpy-1.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e1fd23e9d01591bab45546c089ae89d926917a66dceb3abcf01f6105d927e2cb"},
+ {file = "contourpy-1.3.0-cp311-cp311-win32.whl", hash = "sha256:d402880b84df3bec6eab53cd0cf802cae6a2ef9537e70cf75e91618a3801c20c"},
+ {file = "contourpy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:6cb6cc968059db9c62cb35fbf70248f40994dfcd7aa10444bbf8b3faeb7c2d67"},
+ {file = "contourpy-1.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:570ef7cf892f0afbe5b2ee410c507ce12e15a5fa91017a0009f79f7d93a1268f"},
+ {file = "contourpy-1.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:da84c537cb8b97d153e9fb208c221c45605f73147bd4cadd23bdae915042aad6"},
+ {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0be4d8425bfa755e0fd76ee1e019636ccc7c29f77a7c86b4328a9eb6a26d0639"},
+ {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c0da700bf58f6e0b65312d0a5e695179a71d0163957fa381bb3c1f72972537c"},
+ {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eb8b141bb00fa977d9122636b16aa67d37fd40a3d8b52dd837e536d64b9a4d06"},
+ {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3634b5385c6716c258d0419c46d05c8aa7dc8cb70326c9a4fb66b69ad2b52e09"},
+ {file = "contourpy-1.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0dce35502151b6bd35027ac39ba6e5a44be13a68f55735c3612c568cac3805fd"},
+ {file = "contourpy-1.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aea348f053c645100612b333adc5983d87be69acdc6d77d3169c090d3b01dc35"},
+ {file = "contourpy-1.3.0-cp312-cp312-win32.whl", hash = "sha256:90f73a5116ad1ba7174341ef3ea5c3150ddf20b024b98fb0c3b29034752c8aeb"},
+ {file = "contourpy-1.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:b11b39aea6be6764f84360fce6c82211a9db32a7c7de8fa6dd5397cf1d079c3b"},
+ {file = "contourpy-1.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3e1c7fa44aaae40a2247e2e8e0627f4bea3dd257014764aa644f319a5f8600e3"},
+ {file = "contourpy-1.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:364174c2a76057feef647c802652f00953b575723062560498dc7930fc9b1cb7"},
+ {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32b238b3b3b649e09ce9aaf51f0c261d38644bdfa35cbaf7b263457850957a84"},
+ {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d51fca85f9f7ad0b65b4b9fe800406d0d77017d7270d31ec3fb1cc07358fdea0"},
+ {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:732896af21716b29ab3e988d4ce14bc5133733b85956316fb0c56355f398099b"},
+ {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d73f659398a0904e125280836ae6f88ba9b178b2fed6884f3b1f95b989d2c8da"},
+ {file = "contourpy-1.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c6c7c2408b7048082932cf4e641fa3b8ca848259212f51c8c59c45aa7ac18f14"},
+ {file = "contourpy-1.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f317576606de89da6b7e0861cf6061f6146ead3528acabff9236458a6ba467f8"},
+ {file = "contourpy-1.3.0-cp313-cp313-win32.whl", hash = "sha256:31cd3a85dbdf1fc002280c65caa7e2b5f65e4a973fcdf70dd2fdcb9868069294"},
+ {file = "contourpy-1.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:4553c421929ec95fb07b3aaca0fae668b2eb5a5203d1217ca7c34c063c53d087"},
+ {file = "contourpy-1.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:345af746d7766821d05d72cb8f3845dfd08dd137101a2cb9b24de277d716def8"},
+ {file = "contourpy-1.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3bb3808858a9dc68f6f03d319acd5f1b8a337e6cdda197f02f4b8ff67ad2057b"},
+ {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:420d39daa61aab1221567b42eecb01112908b2cab7f1b4106a52caaec8d36973"},
+ {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4d63ee447261e963af02642ffcb864e5a2ee4cbfd78080657a9880b8b1868e18"},
+ {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:167d6c890815e1dac9536dca00828b445d5d0df4d6a8c6adb4a7ec3166812fa8"},
+ {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:710a26b3dc80c0e4febf04555de66f5fd17e9cf7170a7b08000601a10570bda6"},
+ {file = "contourpy-1.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:75ee7cb1a14c617f34a51d11fa7524173e56551646828353c4af859c56b766e2"},
+ {file = "contourpy-1.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:33c92cdae89ec5135d036e7218e69b0bb2851206077251f04a6c4e0e21f03927"},
+ {file = "contourpy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a11077e395f67ffc2c44ec2418cfebed032cd6da3022a94fc227b6faf8e2acb8"},
+ {file = "contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e8134301d7e204c88ed7ab50028ba06c683000040ede1d617298611f9dc6240c"},
+ {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e12968fdfd5bb45ffdf6192a590bd8ddd3ba9e58360b29683c6bb71a7b41edca"},
+ {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fd2a0fc506eccaaa7595b7e1418951f213cf8255be2600f1ea1b61e46a60c55f"},
+ {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4cfb5c62ce023dfc410d6059c936dcf96442ba40814aefbfa575425a3a7f19dc"},
+ {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68a32389b06b82c2fdd68276148d7b9275b5f5cf13e5417e4252f6d1a34f72a2"},
+ {file = "contourpy-1.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:94e848a6b83da10898cbf1311a815f770acc9b6a3f2d646f330d57eb4e87592e"},
+ {file = "contourpy-1.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d78ab28a03c854a873787a0a42254a0ccb3cb133c672f645c9f9c8f3ae9d0800"},
+ {file = "contourpy-1.3.0-cp39-cp39-win32.whl", hash = "sha256:81cb5ed4952aae6014bc9d0421dec7c5835c9c8c31cdf51910b708f548cf58e5"},
+ {file = "contourpy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:14e262f67bd7e6eb6880bc564dcda30b15e351a594657e55b7eec94b6ef72843"},
+ {file = "contourpy-1.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fe41b41505a5a33aeaed2a613dccaeaa74e0e3ead6dd6fd3a118fb471644fd6c"},
+ {file = "contourpy-1.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eca7e17a65f72a5133bdbec9ecf22401c62bcf4821361ef7811faee695799779"},
+ {file = "contourpy-1.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1ec4dc6bf570f5b22ed0d7efba0dfa9c5b9e0431aeea7581aa217542d9e809a4"},
+ {file = "contourpy-1.3.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:00ccd0dbaad6d804ab259820fa7cb0b8036bda0686ef844d24125d8287178ce0"},
+ {file = "contourpy-1.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ca947601224119117f7c19c9cdf6b3ab54c5726ef1d906aa4a69dfb6dd58102"},
+ {file = "contourpy-1.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6ec93afeb848a0845a18989da3beca3eec2c0f852322efe21af1931147d12cb"},
+ {file = "contourpy-1.3.0.tar.gz", hash = "sha256:7ffa0db17717a8ffb127efd0c95a4362d996b892c2904db72428d5b52e1938a4"},
+]
+
+[package.dependencies]
+numpy = ">=1.23"
+
+[package.extras]
+bokeh = ["bokeh", "selenium"]
+docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"]
+mypy = ["contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.11.1)", "types-Pillow"]
+test = ["Pillow", "contourpy[test-no-images]", "matplotlib"]
+test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"]
+
+[[package]]
+name = "contourpy"
+version = "1.3.2"
+description = "Python library for calculating contours of 2D quadrilateral grids"
+optional = false
+python-versions = ">=3.10"
+groups = ["main", "dev"]
+markers = "python_version == \"3.10\""
+files = [
+ {file = "contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934"},
+ {file = "contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989"},
+ {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9be002b31c558d1ddf1b9b415b162c603405414bacd6932d031c5b5a8b757f0d"},
+ {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d2e74acbcba3bfdb6d9d8384cdc4f9260cae86ed9beee8bd5f54fee49a430b9"},
+ {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e259bced5549ac64410162adc973c5e2fb77f04df4a439d00b478e57a0e65512"},
+ {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad687a04bc802cbe8b9c399c07162a3c35e227e2daccf1668eb1f278cb698631"},
+ {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cdd22595308f53ef2f891040ab2b93d79192513ffccbd7fe19be7aa773a5e09f"},
+ {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b4f54d6a2defe9f257327b0f243612dd051cc43825587520b1bf74a31e2f6ef2"},
+ {file = "contourpy-1.3.2-cp310-cp310-win32.whl", hash = "sha256:f939a054192ddc596e031e50bb13b657ce318cf13d264f095ce9db7dc6ae81c0"},
+ {file = "contourpy-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c440093bbc8fc21c637c03bafcbef95ccd963bc6e0514ad887932c18ca2a759a"},
+ {file = "contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445"},
+ {file = "contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773"},
+ {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1"},
+ {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43"},
+ {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab"},
+ {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7"},
+ {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83"},
+ {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd"},
+ {file = "contourpy-1.3.2-cp311-cp311-win32.whl", hash = "sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f"},
+ {file = "contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878"},
+ {file = "contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2"},
+ {file = "contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15"},
+ {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92"},
+ {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d14f12932a8d620e307f715857107b1d1845cc44fdb5da2bc8e850f5ceba9f87"},
+ {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:532fd26e715560721bb0d5fc7610fce279b3699b018600ab999d1be895b09415"},
+ {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b383144cf2d2c29f01a1e8170f50dacf0eac02d64139dcd709a8ac4eb3cfe"},
+ {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c49f73e61f1f774650a55d221803b101d966ca0c5a2d6d5e4320ec3997489441"},
+ {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3d80b2c0300583228ac98d0a927a1ba6a2ba6b8a742463c564f1d419ee5b211e"},
+ {file = "contourpy-1.3.2-cp312-cp312-win32.whl", hash = "sha256:90df94c89a91b7362e1142cbee7568f86514412ab8a2c0d0fca72d7e91b62912"},
+ {file = "contourpy-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:8c942a01d9163e2e5cfb05cb66110121b8d07ad438a17f9e766317bcb62abf73"},
+ {file = "contourpy-1.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de39db2604ae755316cb5967728f4bea92685884b1e767b7c24e983ef5f771cb"},
+ {file = "contourpy-1.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f9e896f447c5c8618f1edb2bafa9a4030f22a575ec418ad70611450720b5b08"},
+ {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71e2bd4a1c4188f5c2b8d274da78faab884b59df20df63c34f74aa1813c4427c"},
+ {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de425af81b6cea33101ae95ece1f696af39446db9682a0b56daaa48cfc29f38f"},
+ {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:977e98a0e0480d3fe292246417239d2d45435904afd6d7332d8455981c408b85"},
+ {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:434f0adf84911c924519d2b08fc10491dd282b20bdd3fa8f60fd816ea0b48841"},
+ {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c66c4906cdbc50e9cba65978823e6e00b45682eb09adbb78c9775b74eb222422"},
+ {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b7fc0cd78ba2f4695fd0a6ad81a19e7e3ab825c31b577f384aa9d7817dc3bef"},
+ {file = "contourpy-1.3.2-cp313-cp313-win32.whl", hash = "sha256:15ce6ab60957ca74cff444fe66d9045c1fd3e92c8936894ebd1f3eef2fff075f"},
+ {file = "contourpy-1.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e1578f7eafce927b168752ed7e22646dad6cd9bca673c60bff55889fa236ebf9"},
+ {file = "contourpy-1.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0475b1f6604896bc7c53bb070e355e9321e1bc0d381735421a2d2068ec56531f"},
+ {file = "contourpy-1.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c85bb486e9be652314bb5b9e2e3b0d1b2e643d5eec4992c0fbe8ac71775da739"},
+ {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:745b57db7758f3ffc05a10254edd3182a2a83402a89c00957a8e8a22f5582823"},
+ {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:970e9173dbd7eba9b4e01aab19215a48ee5dd3f43cef736eebde064a171f89a5"},
+ {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6c4639a9c22230276b7bffb6a850dfc8258a2521305e1faefe804d006b2e532"},
+ {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc829960f34ba36aad4302e78eabf3ef16a3a100863f0d4eeddf30e8a485a03b"},
+ {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d32530b534e986374fc19eaa77fcb87e8a99e5431499949b828312bdcd20ac52"},
+ {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd"},
+ {file = "contourpy-1.3.2-cp313-cp313t-win32.whl", hash = "sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1"},
+ {file = "contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69"},
+ {file = "contourpy-1.3.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fd93cc7f3139b6dd7aab2f26a90dde0aa9fc264dbf70f6740d498a70b860b82c"},
+ {file = "contourpy-1.3.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:107ba8a6a7eec58bb475329e6d3b95deba9440667c4d62b9b6063942b61d7f16"},
+ {file = "contourpy-1.3.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ded1706ed0c1049224531b81128efbd5084598f18d8a2d9efae833edbd2b40ad"},
+ {file = "contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0"},
+ {file = "contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5"},
+ {file = "contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5"},
+ {file = "contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54"},
+]
+
+[package.dependencies]
+numpy = ">=1.23"
+
+[package.extras]
+bokeh = ["bokeh", "selenium"]
+docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"]
+mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", "types-Pillow"]
+test = ["Pillow", "contourpy[test-no-images]", "matplotlib"]
+test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"]
+
+[[package]]
+name = "contourpy"
+version = "1.3.3"
+description = "Python library for calculating contours of 2D quadrilateral grids"
+optional = false
+python-versions = ">=3.11"
+groups = ["main", "dev"]
+markers = "python_version >= \"3.11\""
+files = [
+ {file = "contourpy-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:709a48ef9a690e1343202916450bc48b9e51c049b089c7f79a267b46cffcdaa1"},
+ {file = "contourpy-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:23416f38bfd74d5d28ab8429cc4d63fa67d5068bd711a85edb1c3fb0c3e2f381"},
+ {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:929ddf8c4c7f348e4c0a5a3a714b5c8542ffaa8c22954862a46ca1813b667ee7"},
+ {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9e999574eddae35f1312c2b4b717b7885d4edd6cb46700e04f7f02db454e67c1"},
+ {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf67e0e3f482cb69779dd3061b534eb35ac9b17f163d851e2a547d56dba0a3a"},
+ {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51e79c1f7470158e838808d4a996fa9bac72c498e93d8ebe5119bc1e6becb0db"},
+ {file = "contourpy-1.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:598c3aaece21c503615fd59c92a3598b428b2f01bfb4b8ca9c4edeecc2438620"},
+ {file = "contourpy-1.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:322ab1c99b008dad206d406bb61d014cf0174df491ae9d9d0fac6a6fda4f977f"},
+ {file = "contourpy-1.3.3-cp311-cp311-win32.whl", hash = "sha256:fd907ae12cd483cd83e414b12941c632a969171bf90fc937d0c9f268a31cafff"},
+ {file = "contourpy-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:3519428f6be58431c56581f1694ba8e50626f2dd550af225f82fb5f5814d2a42"},
+ {file = "contourpy-1.3.3-cp311-cp311-win_arm64.whl", hash = "sha256:15ff10bfada4bf92ec8b31c62bf7c1834c244019b4a33095a68000d7075df470"},
+ {file = "contourpy-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b08a32ea2f8e42cf1d4be3169a98dd4be32bafe4f22b6c4cb4ba810fa9e5d2cb"},
+ {file = "contourpy-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:556dba8fb6f5d8742f2923fe9457dbdd51e1049c4a43fd3986a0b14a1d815fc6"},
+ {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92d9abc807cf7d0e047b95ca5d957cf4792fcd04e920ca70d48add15c1a90ea7"},
+ {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2e8faa0ed68cb29af51edd8e24798bb661eac3bd9f65420c1887b6ca89987c8"},
+ {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:626d60935cf668e70a5ce6ff184fd713e9683fb458898e4249b63be9e28286ea"},
+ {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d00e655fcef08aba35ec9610536bfe90267d7ab5ba944f7032549c55a146da1"},
+ {file = "contourpy-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:451e71b5a7d597379ef572de31eeb909a87246974d960049a9848c3bc6c41bf7"},
+ {file = "contourpy-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:459c1f020cd59fcfe6650180678a9993932d80d44ccde1fa1868977438f0b411"},
+ {file = "contourpy-1.3.3-cp312-cp312-win32.whl", hash = "sha256:023b44101dfe49d7d53932be418477dba359649246075c996866106da069af69"},
+ {file = "contourpy-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:8153b8bfc11e1e4d75bcb0bff1db232f9e10b274e0929de9d608027e0d34ff8b"},
+ {file = "contourpy-1.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:07ce5ed73ecdc4a03ffe3e1b3e3c1166db35ae7584be76f65dbbe28a7791b0cc"},
+ {file = "contourpy-1.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:177fb367556747a686509d6fef71d221a4b198a3905fe824430e5ea0fda54eb5"},
+ {file = "contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d002b6f00d73d69333dac9d0b8d5e84d9724ff9ef044fd63c5986e62b7c9e1b1"},
+ {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:348ac1f5d4f1d66d3322420f01d42e43122f43616e0f194fc1c9f5d830c5b286"},
+ {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:655456777ff65c2c548b7c454af9c6f33f16c8884f11083244b5819cc214f1b5"},
+ {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:644a6853d15b2512d67881586bd03f462c7ab755db95f16f14d7e238f2852c67"},
+ {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4debd64f124ca62069f313a9cb86656ff087786016d76927ae2cf37846b006c9"},
+ {file = "contourpy-1.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a15459b0f4615b00bbd1e91f1b9e19b7e63aea7483d03d804186f278c0af2659"},
+ {file = "contourpy-1.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca0fdcd73925568ca027e0b17ab07aad764be4706d0a925b89227e447d9737b7"},
+ {file = "contourpy-1.3.3-cp313-cp313-win32.whl", hash = "sha256:b20c7c9a3bf701366556e1b1984ed2d0cedf999903c51311417cf5f591d8c78d"},
+ {file = "contourpy-1.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:1cadd8b8969f060ba45ed7c1b714fe69185812ab43bd6b86a9123fe8f99c3263"},
+ {file = "contourpy-1.3.3-cp313-cp313-win_arm64.whl", hash = "sha256:fd914713266421b7536de2bfa8181aa8c699432b6763a0ea64195ebe28bff6a9"},
+ {file = "contourpy-1.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:88df9880d507169449d434c293467418b9f6cbe82edd19284aa0409e7fdb933d"},
+ {file = "contourpy-1.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d06bb1f751ba5d417047db62bca3c8fde202b8c11fb50742ab3ab962c81e8216"},
+ {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e4e6b05a45525357e382909a4c1600444e2a45b4795163d3b22669285591c1ae"},
+ {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ab3074b48c4e2cf1a960e6bbeb7f04566bf36b1861d5c9d4d8ac04b82e38ba20"},
+ {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c3d53c796f8647d6deb1abe867daeb66dcc8a97e8455efa729516b997b8ed99"},
+ {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50ed930df7289ff2a8d7afeb9603f8289e5704755c7e5c3bbd929c90c817164b"},
+ {file = "contourpy-1.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4feffb6537d64b84877da813a5c30f1422ea5739566abf0bd18065ac040e120a"},
+ {file = "contourpy-1.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2b7e9480ffe2b0cd2e787e4df64270e3a0440d9db8dc823312e2c940c167df7e"},
+ {file = "contourpy-1.3.3-cp313-cp313t-win32.whl", hash = "sha256:283edd842a01e3dcd435b1c5116798d661378d83d36d337b8dde1d16a5fc9ba3"},
+ {file = "contourpy-1.3.3-cp313-cp313t-win_amd64.whl", hash = "sha256:87acf5963fc2b34825e5b6b048f40e3635dd547f590b04d2ab317c2619ef7ae8"},
+ {file = "contourpy-1.3.3-cp313-cp313t-win_arm64.whl", hash = "sha256:3c30273eb2a55024ff31ba7d052dde990d7d8e5450f4bbb6e913558b3d6c2301"},
+ {file = "contourpy-1.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fde6c716d51c04b1c25d0b90364d0be954624a0ee9d60e23e850e8d48353d07a"},
+ {file = "contourpy-1.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cbedb772ed74ff5be440fa8eee9bd49f64f6e3fc09436d9c7d8f1c287b121d77"},
+ {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22e9b1bd7a9b1d652cd77388465dc358dafcd2e217d35552424aa4f996f524f5"},
+ {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a22738912262aa3e254e4f3cb079a95a67132fc5a063890e224393596902f5a4"},
+ {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:afe5a512f31ee6bd7d0dda52ec9864c984ca3d66664444f2d72e0dc4eb832e36"},
+ {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f64836de09927cba6f79dcd00fdd7d5329f3fccc633468507079c829ca4db4e3"},
+ {file = "contourpy-1.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1fd43c3be4c8e5fd6e4f2baeae35ae18176cf2e5cced681cca908addf1cdd53b"},
+ {file = "contourpy-1.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6afc576f7b33cf00996e5c1102dc2a8f7cc89e39c0b55df93a0b78c1bd992b36"},
+ {file = "contourpy-1.3.3-cp314-cp314-win32.whl", hash = "sha256:66c8a43a4f7b8df8b71ee1840e4211a3c8d93b214b213f590e18a1beca458f7d"},
+ {file = "contourpy-1.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:cf9022ef053f2694e31d630feaacb21ea24224be1c3ad0520b13d844274614fd"},
+ {file = "contourpy-1.3.3-cp314-cp314-win_arm64.whl", hash = "sha256:95b181891b4c71de4bb404c6621e7e2390745f887f2a026b2d99e92c17892339"},
+ {file = "contourpy-1.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:33c82d0138c0a062380332c861387650c82e4cf1747aaa6938b9b6516762e772"},
+ {file = "contourpy-1.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ea37e7b45949df430fe649e5de8351c423430046a2af20b1c1961cae3afcda77"},
+ {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d304906ecc71672e9c89e87c4675dc5c2645e1f4269a5063b99b0bb29f232d13"},
+ {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca658cd1a680a5c9ea96dc61cdbae1e85c8f25849843aa799dfd3cb370ad4fbe"},
+ {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ab2fd90904c503739a75b7c8c5c01160130ba67944a7b77bbf36ef8054576e7f"},
+ {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7301b89040075c30e5768810bc96a8e8d78085b47d8be6e4c3f5a0b4ed478a0"},
+ {file = "contourpy-1.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2a2a8b627d5cc6b7c41a4beff6c5ad5eb848c88255fda4a8745f7e901b32d8e4"},
+ {file = "contourpy-1.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fd6ec6be509c787f1caf6b247f0b1ca598bef13f4ddeaa126b7658215529ba0f"},
+ {file = "contourpy-1.3.3-cp314-cp314t-win32.whl", hash = "sha256:e74a9a0f5e3fff48fb5a7f2fd2b9b70a3fe014a67522f79b7cca4c0c7e43c9ae"},
+ {file = "contourpy-1.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:13b68d6a62db8eafaebb8039218921399baf6e47bf85006fd8529f2a08ef33fc"},
+ {file = "contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b"},
+ {file = "contourpy-1.3.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cd5dfcaeb10f7b7f9dc8941717c6c2ade08f587be2226222c12b25f0483ed497"},
+ {file = "contourpy-1.3.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0c1fc238306b35f246d61a1d416a627348b5cf0648648a031e14bb8705fcdfe8"},
+ {file = "contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70f9aad7de812d6541d29d2bbf8feb22ff7e1c299523db288004e3157ff4674e"},
+ {file = "contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ed3657edf08512fc3fe81b510e35c2012fbd3081d2e26160f27ca28affec989"},
+ {file = "contourpy-1.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:3d1a3799d62d45c18bafd41c5fa05120b96a28079f2393af559b843d1a966a77"},
+ {file = "contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880"},
+]
+
+[package.dependencies]
+numpy = ">=1.25"
+
+[package.extras]
+bokeh = ["bokeh", "selenium"]
+docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"]
+mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.17.0)", "types-Pillow"]
+test = ["Pillow", "contourpy[test-no-images]", "matplotlib"]
+test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"]
+
[[package]]
name = "coverage"
version = "7.10.7"
description = "Code coverage measurement for Python"
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "coverage-7.10.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fc04cc7a3db33664e0c2d10eb8990ff6b3536f6842c9590ae8da4c614b9ed05a"},
{file = "coverage-7.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e201e015644e207139f7e2351980feb7040e6f4b2c2978892f3e3789d1c125e5"},
@@ -199,7 +462,23 @@ files = [
tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""}
[package.extras]
-toml = ["tomli"]
+toml = ["tomli ; python_full_version <= \"3.11.0a6\""]
+
+[[package]]
+name = "cycler"
+version = "0.12.1"
+description = "Composable style cycles"
+optional = false
+python-versions = ">=3.8"
+groups = ["main", "dev"]
+files = [
+ {file = "cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30"},
+ {file = "cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c"},
+]
+
+[package.extras]
+docs = ["ipython", "matplotlib", "numpydoc", "sphinx"]
+tests = ["pytest", "pytest-cov", "pytest-xdist"]
[[package]]
name = "distlib"
@@ -207,17 +486,74 @@ version = "0.4.0"
description = "Distribution utilities"
optional = false
python-versions = "*"
+groups = ["main", "dev"]
files = [
{file = "distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16"},
{file = "distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d"},
]
+[[package]]
+name = "duckdb"
+version = "1.4.3"
+description = "DuckDB in-process database"
+optional = false
+python-versions = ">=3.9.0"
+groups = ["main", "dev"]
+files = [
+ {file = "duckdb-1.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:efa7f1191c59e34b688fcd4e588c1b903a4e4e1f4804945902cf0b20e08a9001"},
+ {file = "duckdb-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4fef6a053a1c485292000bf0c338bba60f89d334f6a06fc76ba4085a5a322b76"},
+ {file = "duckdb-1.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:702dabbc22b27dc5b73e7599c60deef3d8c59968527c36b391773efddd8f4cf1"},
+ {file = "duckdb-1.4.3-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:854b79375fa618f6ffa8d84fb45cbc9db887f6c4834076ea10d20bc106f1fd90"},
+ {file = "duckdb-1.4.3-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1bb8bd5a3dd205983726185b280a211eacc9f5bc0c4d4505bec8c87ac33a8ccb"},
+ {file = "duckdb-1.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:d0ff08388ef8b1d1a4c95c321d6c5fa11201b241036b1ee740f9d841df3d6ba2"},
+ {file = "duckdb-1.4.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:366bf607088053dce845c9d24c202c04d78022436cc5d8e4c9f0492de04afbe7"},
+ {file = "duckdb-1.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8d080e8d1bf2d226423ec781f539c8f6b6ef3fd42a9a58a7160de0a00877a21f"},
+ {file = "duckdb-1.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9dc049ba7e906cb49ca2b6d4fbf7b6615ec3883193e8abb93f0bef2652e42dda"},
+ {file = "duckdb-1.4.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b30245375ea94ab528c87c61fc3ab3e36331180b16af92ee3a37b810a745d24"},
+ {file = "duckdb-1.4.3-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7c864df027da1ee95f0c32def67e15d02cd4a906c9c1cbae82c09c5112f526b"},
+ {file = "duckdb-1.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:813f189039b46877b5517f1909c7b94a8fe01b4bde2640ab217537ea0fe9b59b"},
+ {file = "duckdb-1.4.3-cp311-cp311-win_arm64.whl", hash = "sha256:fbc63ffdd03835f660155b37a1b6db2005bcd46e5ad398b8cac141eb305d2a3d"},
+ {file = "duckdb-1.4.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:6302452e57aef29aae3977063810ed7b2927967b97912947b9cca45c1c21955f"},
+ {file = "duckdb-1.4.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:deab351ac43b6282a3270e3d40e3d57b3b50f472d9fd8c30975d88a31be41231"},
+ {file = "duckdb-1.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5634e40e1e2d972e4f75bced1fbdd9e9e90faa26445c1052b27de97ee546944a"},
+ {file = "duckdb-1.4.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:274d4a31aba63115f23e7e7b401e3e3a937f3626dc9dea820a9c7d3073f450d2"},
+ {file = "duckdb-1.4.3-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f868a7e6d9b37274a1aa34849ea92aa964e9bd59a5237d6c17e8540533a1e4f"},
+ {file = "duckdb-1.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:ef7ef15347ce97201b1b5182a5697682679b04c3374d5a01ac10ba31cf791b95"},
+ {file = "duckdb-1.4.3-cp312-cp312-win_arm64.whl", hash = "sha256:1b9b445970fd18274d5ac07a0b24c032e228f967332fb5ebab3d7db27738c0e4"},
+ {file = "duckdb-1.4.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:16952ac05bd7e7b39946695452bf450db1ebbe387e1e7178e10f593f2ea7b9a8"},
+ {file = "duckdb-1.4.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de984cd24a6cbefdd6d4a349f7b9a46e583ca3e58ce10d8def0b20a6e5fcbe78"},
+ {file = "duckdb-1.4.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1e5457dda91b67258aae30fb1a0df84183a9f6cd27abac1d5536c0d876c6dfa1"},
+ {file = "duckdb-1.4.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:006aca6a6d6736c441b02ff5c7600b099bb8b7f4de094b8b062137efddce42df"},
+ {file = "duckdb-1.4.3-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a2813f4635f4d6681cc3304020374c46aca82758c6740d7edbc237fe3aae2744"},
+ {file = "duckdb-1.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:6db124f53a3edcb32b0a896ad3519e37477f7e67bf4811cb41ab60c1ef74e4c8"},
+ {file = "duckdb-1.4.3-cp313-cp313-win_arm64.whl", hash = "sha256:a8b0a8764e1b5dd043d168c8f749314f7a1252b5a260fa415adaa26fa3b958fd"},
+ {file = "duckdb-1.4.3-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:316711a9e852bcfe1ed6241a5f654983f67e909e290495f3562cccdf43be8180"},
+ {file = "duckdb-1.4.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9e625b2b4d52bafa1fd0ebdb0990c3961dac8bb00e30d327185de95b68202131"},
+ {file = "duckdb-1.4.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:130c6760f6c573f9c9fe9aba56adba0fab48811a4871b7b8fd667318b4a3e8da"},
+ {file = "duckdb-1.4.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:20c88effaa557a11267706b01419c542fe42f893dee66e5a6daa5974ea2d4a46"},
+ {file = "duckdb-1.4.3-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b35491db98ccd11d151165497c084a9d29d3dc42fc80abea2715a6c861ca43d"},
+ {file = "duckdb-1.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:23b12854032c1a58d0452e2b212afa908d4ce64171862f3792ba9a596ba7c765"},
+ {file = "duckdb-1.4.3-cp314-cp314-win_arm64.whl", hash = "sha256:90f241f25cffe7241bf9f376754a5845c74775e00e1c5731119dc88cd71e0cb2"},
+ {file = "duckdb-1.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:aa26a7406205bc1426cee28bdfdf084f669a5686977dafa4c3ec65873989593c"},
+ {file = "duckdb-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:caa2164c91f7e91befb1ffb081b3cd97a137117533aef7abe1538b03ad72e3a9"},
+ {file = "duckdb-1.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8d53b217698a76c4957e2c807dd9295d409146f9d3d7932f372883201ba9d25a"},
+ {file = "duckdb-1.4.3-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8afba22c370f06b7314aa46bfed052509269e482bcfb3f7b1ea0fa17ae49ce42"},
+ {file = "duckdb-1.4.3-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b195270ff1a661f22cbd547a215baff265b7d4469a76a215c8992b5994107c3"},
+ {file = "duckdb-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:23a3a077821bed1768a84ac9cbf6b6487ead33e28e62cb118bda5fb8f9e53dea"},
+ {file = "duckdb-1.4.3.tar.gz", hash = "sha256:fea43e03604c713e25a25211ada87d30cd2a044d8f27afab5deba26ac49e5268"},
+]
+
+[package.extras]
+all = ["adbc-driver-manager", "fsspec", "ipython", "numpy", "pandas", "pyarrow"]
+
[[package]]
name = "exceptiongroup"
version = "1.3.1"
description = "Backport of PEP 654 (exception groups)"
optional = false
python-versions = ">=3.7"
+groups = ["main", "dev"]
+markers = "python_version < \"3.11\""
files = [
{file = "exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598"},
{file = "exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219"},
@@ -235,6 +571,8 @@ version = "3.19.1"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
+markers = "python_version == \"3.9\""
files = [
{file = "filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d"},
{file = "filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58"},
@@ -246,17 +584,176 @@ version = "3.20.3"
description = "A platform independent file lock."
optional = false
python-versions = ">=3.10"
+groups = ["main", "dev"]
+markers = "python_version >= \"3.10\""
files = [
{file = "filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1"},
{file = "filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1"},
]
+[[package]]
+name = "fonttools"
+version = "4.60.2"
+description = "Tools to manipulate font files"
+optional = false
+python-versions = ">=3.9"
+groups = ["main", "dev"]
+markers = "python_version == \"3.9\""
+files = [
+ {file = "fonttools-4.60.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4e36fadcf7e8ca6e34d490eef86ed638d6fd9c55d2f514b05687622cfc4a7050"},
+ {file = "fonttools-4.60.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6e500fc9c04bee749ceabfc20cb4903f6981c2139050d85720ea7ada61b75d5c"},
+ {file = "fonttools-4.60.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22efea5e784e1d1cd8d7b856c198e360a979383ebc6dea4604743b56da1cbc34"},
+ {file = "fonttools-4.60.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:677aa92d84d335e4d301d8ba04afca6f575316bc647b6782cb0921943fcb6343"},
+ {file = "fonttools-4.60.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:edd49d3defbf35476e78b61ff737ff5efea811acff68d44233a95a5a48252334"},
+ {file = "fonttools-4.60.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:126839492b69cecc5baf2bddcde60caab2ffafd867bbae2a88463fce6078ca3a"},
+ {file = "fonttools-4.60.2-cp310-cp310-win32.whl", hash = "sha256:ffcab6f5537136046ca902ed2491ab081ba271b07591b916289b7c27ff845f96"},
+ {file = "fonttools-4.60.2-cp310-cp310-win_amd64.whl", hash = "sha256:9c68b287c7ffcd29dd83b5f961004b2a54a862a88825d52ea219c6220309ba45"},
+ {file = "fonttools-4.60.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a2aed0a7931401b3875265717a24c726f87ecfedbb7b3426c2ca4d2812e281ae"},
+ {file = "fonttools-4.60.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:dea6868e9d2b816c9076cfea77754686f3c19149873bdbc5acde437631c15df1"},
+ {file = "fonttools-4.60.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2fa27f34950aa1fe0f0b1abe25eed04770a3b3b34ad94e5ace82cc341589678a"},
+ {file = "fonttools-4.60.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:13a53d479d187b09bfaa4a35ffcbc334fc494ff355f0a587386099cb66674f1e"},
+ {file = "fonttools-4.60.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fac5e921d3bd0ca3bb8517dced2784f0742bc8ca28579a68b139f04ea323a779"},
+ {file = "fonttools-4.60.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:648f4f9186fd7f1f3cd57dbf00d67a583720d5011feca67a5e88b3a491952cfb"},
+ {file = "fonttools-4.60.2-cp311-cp311-win32.whl", hash = "sha256:3274e15fad871bead5453d5ce02658f6d0c7bc7e7021e2a5b8b04e2f9e40da1a"},
+ {file = "fonttools-4.60.2-cp311-cp311-win_amd64.whl", hash = "sha256:91d058d5a483a1525b367803abb69de0923fbd45e1f82ebd000f5c8aa65bc78e"},
+ {file = "fonttools-4.60.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e0164b7609d2b5c5dd4e044b8085b7bd7ca7363ef8c269a4ab5b5d4885a426b2"},
+ {file = "fonttools-4.60.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1dd3d9574fc595c1e97faccae0f264dc88784ddf7fbf54c939528378bacc0033"},
+ {file = "fonttools-4.60.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:98d0719f1b11c2817307d2da2e94296a3b2a3503f8d6252a101dca3ee663b917"},
+ {file = "fonttools-4.60.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d3ea26957dd07209f207b4fff64c702efe5496de153a54d3b91007ec28904dd"},
+ {file = "fonttools-4.60.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1ee301273b0850f3a515299f212898f37421f42ff9adfc341702582ca5073c13"},
+ {file = "fonttools-4.60.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c6eb4694cc3b9c03b7c01d65a9cf35b577f21aa6abdbeeb08d3114b842a58153"},
+ {file = "fonttools-4.60.2-cp312-cp312-win32.whl", hash = "sha256:57f07b616c69c244cc1a5a51072eeef07dddda5ebef9ca5c6e9cf6d59ae65b70"},
+ {file = "fonttools-4.60.2-cp312-cp312-win_amd64.whl", hash = "sha256:310035802392f1fe5a7cf43d76f6ff4a24c919e4c72c0352e7b8176e2584b8a0"},
+ {file = "fonttools-4.60.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2bb5fd231e56ccd7403212636dcccffc96c5ae0d6f9e4721fa0a32cb2e3ca432"},
+ {file = "fonttools-4.60.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:536b5fab7b6fec78ccf59b5c59489189d9d0a8b0d3a77ed1858be59afb096696"},
+ {file = "fonttools-4.60.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6b9288fc38252ac86a9570f19313ecbc9ff678982e0f27c757a85f1f284d3400"},
+ {file = "fonttools-4.60.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93fcb420791d839ef592eada2b69997c445d0ce9c969b5190f2e16828ec10607"},
+ {file = "fonttools-4.60.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7916a381b094db4052ac284255186aebf74c5440248b78860cb41e300036f598"},
+ {file = "fonttools-4.60.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58c8c393d5e16b15662cfc2d988491940458aa87894c662154f50c7b49440bef"},
+ {file = "fonttools-4.60.2-cp313-cp313-win32.whl", hash = "sha256:19c6e0afd8b02008caa0aa08ab896dfce5d0bcb510c49b2c499541d5cb95a963"},
+ {file = "fonttools-4.60.2-cp313-cp313-win_amd64.whl", hash = "sha256:6a500dc59e11b2338c2dba1f8cf11a4ae8be35ec24af8b2628b8759a61457b76"},
+ {file = "fonttools-4.60.2-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:9387c532acbe323bbf2a920f132bce3c408a609d5f9dcfc6532fbc7e37f8ccbb"},
+ {file = "fonttools-4.60.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e6f1c824185b5b8fb681297f315f26ae55abb0d560c2579242feea8236b1cfef"},
+ {file = "fonttools-4.60.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:55a3129d1e4030b1a30260f1b32fe76781b585fb2111d04a988e141c09eb6403"},
+ {file = "fonttools-4.60.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b196e63753abc33b3b97a6fd6de4b7c4fef5552c0a5ba5e562be214d1e9668e0"},
+ {file = "fonttools-4.60.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:de76c8d740fb55745f3b154f0470c56db92ae3be27af8ad6c2e88f1458260c9a"},
+ {file = "fonttools-4.60.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6ba6303225c95998c9fda2d410aa792c3d2c1390a09df58d194b03e17583fa25"},
+ {file = "fonttools-4.60.2-cp314-cp314-win32.whl", hash = "sha256:0a89728ce10d7c816fedaa5380c06d2793e7a8a634d7ce16810e536c22047384"},
+ {file = "fonttools-4.60.2-cp314-cp314-win_amd64.whl", hash = "sha256:fa8446e6ab8bd778b82cb1077058a2addba86f30de27ab9cc18ed32b34bc8667"},
+ {file = "fonttools-4.60.2-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:4063bc81ac5a4137642865cb63dd270e37b3cd1f55a07c0d6e41d072699ccca2"},
+ {file = "fonttools-4.60.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:ebfdb66fa69732ed604ab8e2a0431e6deff35e933a11d73418cbc7823d03b8e1"},
+ {file = "fonttools-4.60.2-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:50b10b3b1a72d1d54c61b0e59239e1a94c0958f4a06a1febf97ce75388dd91a4"},
+ {file = "fonttools-4.60.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:beae16891a13b4a2ddec9b39b4de76092a3025e4d1c82362e3042b62295d5e4d"},
+ {file = "fonttools-4.60.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:522f017fdb3766fd5d2d321774ef351cc6ce88ad4e6ac9efe643e4a2b9d528db"},
+ {file = "fonttools-4.60.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:82cceceaf9c09a965a75b84a4b240dd3768e596ffb65ef53852681606fe7c9ba"},
+ {file = "fonttools-4.60.2-cp314-cp314t-win32.whl", hash = "sha256:bbfbc918a75437fe7e6d64d1b1e1f713237df1cf00f3a36dedae910b2ba01cee"},
+ {file = "fonttools-4.60.2-cp314-cp314t-win_amd64.whl", hash = "sha256:0e5cd9b0830f6550d58c84f3ab151a9892b50c4f9d538c5603c0ce6fff2eb3f1"},
+ {file = "fonttools-4.60.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a3c75b8b42f7f93906bdba9eb1197bb76aecbe9a0a7cf6feec75f7605b5e8008"},
+ {file = "fonttools-4.60.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0f86c8c37bc0ec0b9c141d5e90c717ff614e93c187f06d80f18c7057097f71bc"},
+ {file = "fonttools-4.60.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe905403fe59683b0e9a45f234af2866834376b8821f34633b1c76fb731b6311"},
+ {file = "fonttools-4.60.2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:38ce703b60a906e421e12d9e3a7f064883f5e61bb23e8961f4be33cfe578500b"},
+ {file = "fonttools-4.60.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9e810c06f3e79185cecf120e58b343ea5a89b54dd695fd644446bcf8c026da5e"},
+ {file = "fonttools-4.60.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:38faec8cc1d12122599814d15a402183f5123fb7608dac956121e7c6742aebc5"},
+ {file = "fonttools-4.60.2-cp39-cp39-win32.whl", hash = "sha256:80a45cf7bf659acb7b36578f300231873daba67bd3ca8cce181c73f861f14a37"},
+ {file = "fonttools-4.60.2-cp39-cp39-win_amd64.whl", hash = "sha256:c355d5972071938e1b1e0f5a1df001f68ecf1a62f34a3407dc8e0beccf052501"},
+ {file = "fonttools-4.60.2-py3-none-any.whl", hash = "sha256:73cf92eeda67cf6ff10c8af56fc8f4f07c1647d989a979be9e388a49be26552a"},
+ {file = "fonttools-4.60.2.tar.gz", hash = "sha256:d29552e6b155ebfc685b0aecf8d429cb76c14ab734c22ef5d3dea6fdf800c92c"},
+]
+
+[package.extras]
+all = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\"", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.45.0)", "unicodedata2 (>=17.0.0) ; python_version <= \"3.14\"", "xattr ; sys_platform == \"darwin\"", "zopfli (>=0.1.4)"]
+graphite = ["lz4 (>=1.7.4.2)"]
+interpolatable = ["munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\""]
+lxml = ["lxml (>=4.0)"]
+pathops = ["skia-pathops (>=0.5.0)"]
+plot = ["matplotlib"]
+repacker = ["uharfbuzz (>=0.45.0)"]
+symfont = ["sympy"]
+type1 = ["xattr ; sys_platform == \"darwin\""]
+unicode = ["unicodedata2 (>=17.0.0) ; python_version <= \"3.14\""]
+woff = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "zopfli (>=0.1.4)"]
+
+[[package]]
+name = "fonttools"
+version = "4.61.1"
+description = "Tools to manipulate font files"
+optional = false
+python-versions = ">=3.10"
+groups = ["main", "dev"]
+markers = "python_version >= \"3.10\""
+files = [
+ {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24"},
+ {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958"},
+ {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8db08051fc9e7d8bc622f2112511b8107d8f27cd89e2f64ec45e9825e8288da"},
+ {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a76d4cb80f41ba94a6691264be76435e5f72f2cb3cab0b092a6212855f71c2f6"},
+ {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a13fc8aeb24bad755eea8f7f9d409438eb94e82cf86b08fe77a03fbc8f6a96b1"},
+ {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b846a1fcf8beadeb9ea4f44ec5bdde393e2f1569e17d700bfc49cd69bde75881"},
+ {file = "fonttools-4.61.1-cp310-cp310-win32.whl", hash = "sha256:78a7d3ab09dc47ac1a363a493e6112d8cabed7ba7caad5f54dbe2f08676d1b47"},
+ {file = "fonttools-4.61.1-cp310-cp310-win_amd64.whl", hash = "sha256:eff1ac3cc66c2ac7cda1e64b4e2f3ffef474b7335f92fc3833fc632d595fcee6"},
+ {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09"},
+ {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37"},
+ {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb"},
+ {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9"},
+ {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87"},
+ {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56"},
+ {file = "fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a"},
+ {file = "fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7"},
+ {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e"},
+ {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2"},
+ {file = "fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796"},
+ {file = "fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d"},
+ {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8"},
+ {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0"},
+ {file = "fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261"},
+ {file = "fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9"},
+ {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c"},
+ {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e"},
+ {file = "fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5"},
+ {file = "fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd"},
+ {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3"},
+ {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d"},
+ {file = "fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c"},
+ {file = "fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b"},
+ {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd"},
+ {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e"},
+ {file = "fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c"},
+ {file = "fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75"},
+ {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063"},
+ {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2"},
+ {file = "fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c"},
+ {file = "fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c"},
+ {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa"},
+ {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91"},
+ {file = "fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19"},
+ {file = "fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba"},
+ {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7"},
+ {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118"},
+ {file = "fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5"},
+ {file = "fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b"},
+ {file = "fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371"},
+ {file = "fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69"},
+]
+
+[package.extras]
+all = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\"", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.45.0)", "unicodedata2 (>=17.0.0) ; python_version <= \"3.14\"", "xattr ; sys_platform == \"darwin\"", "zopfli (>=0.1.4)"]
+graphite = ["lz4 (>=1.7.4.2)"]
+interpolatable = ["munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\""]
+lxml = ["lxml (>=4.0)"]
+pathops = ["skia-pathops (>=0.5.0)"]
+plot = ["matplotlib"]
+repacker = ["uharfbuzz (>=0.45.0)"]
+symfont = ["sympy"]
+type1 = ["xattr ; sys_platform == \"darwin\""]
+unicode = ["unicodedata2 (>=17.0.0) ; python_version <= \"3.14\""]
+woff = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "zopfli (>=0.1.4)"]
+
[[package]]
name = "googleapis-common-protos"
version = "1.72.0"
description = "Common protobufs used in Google APIs"
optional = false
python-versions = ">=3.7"
+groups = ["main", "dev"]
files = [
{file = "googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038"},
{file = "googleapis_common_protos-1.72.0.tar.gz", hash = "sha256:e55a601c1b32b52d7a3e65f43563e2aa61bcd737998ee672ac9b951cd49319f5"},
@@ -274,6 +771,7 @@ version = "1.76.0"
description = "HTTP/2-based RPC framework"
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "grpcio-1.76.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:65a20de41e85648e00305c1bb09a3598f840422e522277641145a32d42dcefcc"},
{file = "grpcio-1.76.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:40ad3afe81676fd9ec6d9d406eda00933f218038433980aa19d401490e46ecde"},
@@ -350,6 +848,7 @@ version = "1.76.0"
description = "Status proto mapping for gRPC"
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "grpcio_status-1.76.0-py3-none-any.whl", hash = "sha256:380568794055a8efbbd8871162df92012e0228a5f6dffaf57f2a00c534103b18"},
{file = "grpcio_status-1.76.0.tar.gz", hash = "sha256:25fcbfec74c15d1a1cb5da3fab8ee9672852dc16a5a9eeb5baf7d7a9952943cd"},
@@ -366,6 +865,7 @@ version = "2.6.15"
description = "File identification library for Python"
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "identify-2.6.15-py2.py3-none-any.whl", hash = "sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757"},
{file = "identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf"},
@@ -374,23 +874,433 @@ files = [
[package.extras]
license = ["ukkonen"]
+[[package]]
+name = "importlib-resources"
+version = "6.5.2"
+description = "Read resources from Python packages"
+optional = false
+python-versions = ">=3.9"
+groups = ["main", "dev"]
+markers = "python_version == \"3.9\""
+files = [
+ {file = "importlib_resources-6.5.2-py3-none-any.whl", hash = "sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec"},
+ {file = "importlib_resources-6.5.2.tar.gz", hash = "sha256:185f87adef5bcc288449d98fb4fba07cea78bc036455dd44c5fc4a2fe78fed2c"},
+]
+
+[package.dependencies]
+zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""}
+
+[package.extras]
+check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""]
+cover = ["pytest-cov"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+enabler = ["pytest-enabler (>=2.2)"]
+test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"]
+type = ["pytest-mypy"]
+
[[package]]
name = "iniconfig"
version = "2.1.0"
description = "brain-dead simple config-ini parsing"
optional = false
python-versions = ">=3.8"
+groups = ["main", "dev"]
files = [
{file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"},
{file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"},
]
+[[package]]
+name = "kiwisolver"
+version = "1.4.7"
+description = "A fast implementation of the Cassowary constraint solver"
+optional = false
+python-versions = ">=3.8"
+groups = ["main", "dev"]
+markers = "python_version == \"3.9\""
+files = [
+ {file = "kiwisolver-1.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8a9c83f75223d5e48b0bc9cb1bf2776cf01563e00ade8775ffe13b0b6e1af3a6"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:58370b1ffbd35407444d57057b57da5d6549d2d854fa30249771775c63b5fe17"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aa0abdf853e09aff551db11fce173e2177d00786c688203f52c87ad7fcd91ef9"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8d53103597a252fb3ab8b5845af04c7a26d5e7ea8122303dd7a021176a87e8b9"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:88f17c5ffa8e9462fb79f62746428dd57b46eb931698e42e990ad63103f35e6c"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88a9ca9c710d598fd75ee5de59d5bda2684d9db36a9f50b6125eaea3969c2599"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f4d742cb7af1c28303a51b7a27aaee540e71bb8e24f68c736f6f2ffc82f2bf05"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e28c7fea2196bf4c2f8d46a0415c77a1c480cc0724722f23d7410ffe9842c407"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e968b84db54f9d42046cf154e02911e39c0435c9801681e3fc9ce8a3c4130278"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0c18ec74c0472de033e1bebb2911c3c310eef5649133dd0bedf2a169a1b269e5"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8f0ea6da6d393d8b2e187e6a5e3fb81f5862010a40c3945e2c6d12ae45cfb2ad"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:f106407dda69ae456dd1227966bf445b157ccc80ba0dff3802bb63f30b74e895"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84ec80df401cfee1457063732d90022f93951944b5b58975d34ab56bb150dfb3"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-win32.whl", hash = "sha256:71bb308552200fb2c195e35ef05de12f0c878c07fc91c270eb3d6e41698c3bcc"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:44756f9fd339de0fb6ee4f8c1696cfd19b2422e0d70b4cefc1cc7f1f64045a8c"},
+ {file = "kiwisolver-1.4.7-cp310-cp310-win_arm64.whl", hash = "sha256:78a42513018c41c2ffd262eb676442315cbfe3c44eed82385c2ed043bc63210a"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d2b0e12a42fb4e72d509fc994713d099cbb15ebf1103545e8a45f14da2dfca54"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2a8781ac3edc42ea4b90bc23e7d37b665d89423818e26eb6df90698aa2287c95"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46707a10836894b559e04b0fd143e343945c97fd170d69a2d26d640b4e297935"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef97b8df011141c9b0f6caf23b29379f87dd13183c978a30a3c546d2c47314cb"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ab58c12a2cd0fc769089e6d38466c46d7f76aced0a1f54c77652446733d2d02"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:803b8e1459341c1bb56d1c5c010406d5edec8a0713a0945851290a7930679b51"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9a9e8a507420fe35992ee9ecb302dab68550dedc0da9e2880dd88071c5fb052"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18077b53dc3bb490e330669a99920c5e6a496889ae8c63b58fbc57c3d7f33a18"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6af936f79086a89b3680a280c47ea90b4df7047b5bdf3aa5c524bbedddb9e545"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3abc5b19d24af4b77d1598a585b8a719beb8569a71568b66f4ebe1fb0449460b"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:933d4de052939d90afbe6e9d5273ae05fb836cc86c15b686edd4b3560cc0ee36"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:65e720d2ab2b53f1f72fb5da5fb477455905ce2c88aaa671ff0a447c2c80e8e3"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3bf1ed55088f214ba6427484c59553123fdd9b218a42bbc8c6496d6754b1e523"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-win32.whl", hash = "sha256:4c00336b9dd5ad96d0a558fd18a8b6f711b7449acce4c157e7343ba92dd0cf3d"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:929e294c1ac1e9f615c62a4e4313ca1823ba37326c164ec720a803287c4c499b"},
+ {file = "kiwisolver-1.4.7-cp311-cp311-win_arm64.whl", hash = "sha256:e33e8fbd440c917106b237ef1a2f1449dfbb9b6f6e1ce17c94cd6a1e0d438376"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:5360cc32706dab3931f738d3079652d20982511f7c0ac5711483e6eab08efff2"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:942216596dc64ddb25adb215c3c783215b23626f8d84e8eff8d6d45c3f29f75a"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48b571ecd8bae15702e4f22d3ff6a0f13e54d3d00cd25216d5e7f658242065ee"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad42ba922c67c5f219097b28fae965e10045ddf145d2928bfac2eb2e17673640"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:612a10bdae23404a72941a0fc8fa2660c6ea1217c4ce0dbcab8a8f6543ea9e7f"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e838bba3a3bac0fe06d849d29772eb1afb9745a59710762e4ba3f4cb8424483"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:22f499f6157236c19f4bbbd472fa55b063db77a16cd74d49afe28992dff8c258"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693902d433cf585133699972b6d7c42a8b9f8f826ebcaf0132ff55200afc599e"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4e77f2126c3e0b0d055f44513ed349038ac180371ed9b52fe96a32aa071a5107"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:657a05857bda581c3656bfc3b20e353c232e9193eb167766ad2dc58b56504948"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4bfa75a048c056a411f9705856abfc872558e33c055d80af6a380e3658766038"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:34ea1de54beef1c104422d210c47c7d2a4999bdecf42c7b5718fbe59a4cac383"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:90da3b5f694b85231cf93586dad5e90e2d71b9428f9aad96952c99055582f520"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-win32.whl", hash = "sha256:18e0cca3e008e17fe9b164b55735a325140a5a35faad8de92dd80265cd5eb80b"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:58cb20602b18f86f83a5c87d3ee1c766a79c0d452f8def86d925e6c60fbf7bfb"},
+ {file = "kiwisolver-1.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:f5a8b53bdc0b3961f8b6125e198617c40aeed638b387913bf1ce78afb1b0be2a"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2e6039dcbe79a8e0f044f1c39db1986a1b8071051efba3ee4d74f5b365f5226e"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a1ecf0ac1c518487d9d23b1cd7139a6a65bc460cd101ab01f1be82ecf09794b6"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7ab9ccab2b5bd5702ab0803676a580fffa2aa178c2badc5557a84cc943fcf750"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f816dd2277f8d63d79f9c8473a79fe54047bc0467754962840782c575522224d"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf8bcc23ceb5a1b624572a1623b9f79d2c3b337c8c455405ef231933a10da379"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dea0bf229319828467d7fca8c7c189780aa9ff679c94539eed7532ebe33ed37c"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c06a4c7cf15ec739ce0e5971b26c93638730090add60e183530d70848ebdd34"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:913983ad2deb14e66d83c28b632fd35ba2b825031f2fa4ca29675e665dfecbe1"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5337ec7809bcd0f424c6b705ecf97941c46279cf5ed92311782c7c9c2026f07f"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4c26ed10c4f6fa6ddb329a5120ba3b6db349ca192ae211e882970bfc9d91420b"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c619b101e6de2222c1fcb0531e1b17bbffbe54294bfba43ea0d411d428618c27"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:073a36c8273647592ea332e816e75ef8da5c303236ec0167196793eb1e34657a"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:3ce6b2b0231bda412463e152fc18335ba32faf4e8c23a754ad50ffa70e4091ee"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-win32.whl", hash = "sha256:f4c9aee212bc89d4e13f58be11a56cc8036cabad119259d12ace14b34476fd07"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-win_amd64.whl", hash = "sha256:8a3ec5aa8e38fc4c8af308917ce12c536f1c88452ce554027e55b22cbbfbff76"},
+ {file = "kiwisolver-1.4.7-cp313-cp313-win_arm64.whl", hash = "sha256:76c8094ac20ec259471ac53e774623eb62e6e1f56cd8690c67ce6ce4fcb05650"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5d5abf8f8ec1f4e22882273c423e16cae834c36856cac348cfbfa68e01c40f3a"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:aeb3531b196ef6f11776c21674dba836aeea9d5bd1cf630f869e3d90b16cfade"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b7d755065e4e866a8086c9bdada157133ff466476a2ad7861828e17b6026e22c"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08471d4d86cbaec61f86b217dd938a83d85e03785f51121e791a6e6689a3be95"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7bbfcb7165ce3d54a3dfbe731e470f65739c4c1f85bb1018ee912bae139e263b"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d34eb8494bea691a1a450141ebb5385e4b69d38bb8403b5146ad279f4b30fa3"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9242795d174daa40105c1d86aba618e8eab7bf96ba8c3ee614da8302a9f95503"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a0f64a48bb81af7450e641e3fe0b0394d7381e342805479178b3d335d60ca7cf"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8e045731a5416357638d1700927529e2b8ab304811671f665b225f8bf8d8f933"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:4322872d5772cae7369f8351da1edf255a604ea7087fe295411397d0cfd9655e"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:e1631290ee9271dffe3062d2634c3ecac02c83890ada077d225e081aca8aab89"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:edcfc407e4eb17e037bca59be0e85a2031a2ac87e4fed26d3e9df88b4165f92d"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4d05d81ecb47d11e7f8932bd8b61b720bf0b41199358f3f5e36d38e28f0532c5"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-win32.whl", hash = "sha256:b38ac83d5f04b15e515fd86f312479d950d05ce2368d5413d46c088dda7de90a"},
+ {file = "kiwisolver-1.4.7-cp38-cp38-win_amd64.whl", hash = "sha256:d83db7cde68459fc803052a55ace60bea2bae361fc3b7a6d5da07e11954e4b09"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3f9362ecfca44c863569d3d3c033dbe8ba452ff8eed6f6b5806382741a1334bd"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8df2eb9b2bac43ef8b082e06f750350fbbaf2887534a5be97f6cf07b19d9583"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f32d6edbc638cde7652bd690c3e728b25332acbadd7cad670cc4a02558d9c417"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e2e6c39bd7b9372b0be21456caab138e8e69cc0fc1190a9dfa92bd45a1e6e904"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:dda56c24d869b1193fcc763f1284b9126550eaf84b88bbc7256e15028f19188a"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79849239c39b5e1fd906556c474d9b0439ea6792b637511f3fe3a41158d89ca8"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5e3bc157fed2a4c02ec468de4ecd12a6e22818d4f09cde2c31ee3226ffbefab2"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3da53da805b71e41053dc670f9a820d1157aae77b6b944e08024d17bcd51ef88"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8705f17dfeb43139a692298cb6637ee2e59c0194538153e83e9ee0c75c2eddde"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:82a5c2f4b87c26bb1a0ef3d16b5c4753434633b83d365cc0ddf2770c93829e3c"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce8be0466f4c0d585cdb6c1e2ed07232221df101a4c6f28821d2aa754ca2d9e2"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:409afdfe1e2e90e6ee7fc896f3df9a7fec8e793e58bfa0d052c8a82f99c37abb"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5b9c3f4ee0b9a439d2415012bd1b1cc2df59e4d6a9939f4d669241d30b414327"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-win32.whl", hash = "sha256:a79ae34384df2b615eefca647a2873842ac3b596418032bef9a7283675962644"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-win_amd64.whl", hash = "sha256:cf0438b42121a66a3a667de17e779330fc0f20b0d97d59d2f2121e182b0505e4"},
+ {file = "kiwisolver-1.4.7-cp39-cp39-win_arm64.whl", hash = "sha256:764202cc7e70f767dab49e8df52c7455e8de0df5d858fa801a11aa0d882ccf3f"},
+ {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:94252291e3fe68001b1dd747b4c0b3be12582839b95ad4d1b641924d68fd4643"},
+ {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5b7dfa3b546da08a9f622bb6becdb14b3e24aaa30adba66749d38f3cc7ea9706"},
+ {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bd3de6481f4ed8b734da5df134cd5a6a64fe32124fe83dde1e5b5f29fe30b1e6"},
+ {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a91b5f9f1205845d488c928e8570dcb62b893372f63b8b6e98b863ebd2368ff2"},
+ {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40fa14dbd66b8b8f470d5fc79c089a66185619d31645f9b0773b88b19f7223c4"},
+ {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:eb542fe7933aa09d8d8f9d9097ef37532a7df6497819d16efe4359890a2f417a"},
+ {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:bfa1acfa0c54932d5607e19a2c24646fb4c1ae2694437789129cf099789a3b00"},
+ {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:eee3ea935c3d227d49b4eb85660ff631556841f6e567f0f7bda972df6c2c9935"},
+ {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f3160309af4396e0ed04db259c3ccbfdc3621b5559b5453075e5de555e1f3a1b"},
+ {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a17f6a29cf8935e587cc8a4dbfc8368c55edc645283db0ce9801016f83526c2d"},
+ {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:10849fb2c1ecbfae45a693c070e0320a91b35dd4bcf58172c023b994283a124d"},
+ {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:ac542bf38a8a4be2dc6b15248d36315ccc65f0743f7b1a76688ffb6b5129a5c2"},
+ {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8b01aac285f91ca889c800042c35ad3b239e704b150cfd3382adfc9dcc780e39"},
+ {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:48be928f59a1f5c8207154f935334d374e79f2b5d212826307d072595ad76a2e"},
+ {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f37cfe618a117e50d8c240555331160d73d0411422b59b5ee217843d7b693608"},
+ {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:599b5c873c63a1f6ed7eead644a8a380cfbdf5db91dcb6f85707aaab213b1674"},
+ {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:801fa7802e5cfabe3ab0c81a34c323a319b097dfb5004be950482d882f3d7225"},
+ {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0c6c43471bc764fad4bc99c5c2d6d16a676b1abf844ca7c8702bdae92df01ee0"},
+ {file = "kiwisolver-1.4.7.tar.gz", hash = "sha256:9893ff81bd7107f7b685d3017cc6583daadb4fc26e4a888350df530e41980a60"},
+]
+
+[[package]]
+name = "kiwisolver"
+version = "1.4.9"
+description = "A fast implementation of the Cassowary constraint solver"
+optional = false
+python-versions = ">=3.10"
+groups = ["main", "dev"]
+markers = "python_version >= \"3.10\""
+files = [
+ {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fb3b8132019ea572f4611d770991000d7f58127560c4889729248eb5852a102f"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84fd60810829c27ae375114cd379da1fa65e6918e1da405f356a775d49a62bcf"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b78efa4c6e804ecdf727e580dbb9cba85624d2e1c6b5cb059c66290063bd99a9"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4efec7bcf21671db6a3294ff301d2fc861c31faa3c8740d1a94689234d1b415"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90f47e70293fc3688b71271100a1a5453aa9944a81d27ff779c108372cf5567b"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8fdca1def57a2e88ef339de1737a1449d6dbf5fab184c54a1fca01d541317154"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9cf554f21be770f5111a1690d42313e140355e687e05cf82cb23d0a721a64a48"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fc1795ac5cd0510207482c3d1d3ed781143383b8cfd36f5c645f3897ce066220"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ccd09f20ccdbbd341b21a67ab50a119b64a403b09288c27481575105283c1586"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:540c7c72324d864406a009d72f5d6856f49693db95d1fbb46cf86febef873634"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-win_amd64.whl", hash = "sha256:ede8c6d533bc6601a47ad4046080d36b8fc99f81e6f1c17b0ac3c2dc91ac7611"},
+ {file = "kiwisolver-1.4.9-cp310-cp310-win_arm64.whl", hash = "sha256:7b4da0d01ac866a57dd61ac258c5607b4cd677f63abaec7b148354d2b2cdd536"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eb14a5da6dc7642b0f3a18f13654847cd8b7a2550e2645a5bda677862b03ba16"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39a219e1c81ae3b103643d2aedb90f1ef22650deb266ff12a19e7773f3e5f089"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2405a7d98604b87f3fc28b1716783534b1b4b8510d8142adca34ee0bc3c87543"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dc1ae486f9abcef254b5618dfb4113dd49f94c68e3e027d03cf0143f3f772b61"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a1f570ce4d62d718dce3f179ee78dac3b545ac16c0c04bb363b7607a949c0d1"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb27e7b78d716c591e88e0a09a2139c6577865d7f2e152488c2cc6257f460872"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:15163165efc2f627eb9687ea5f3a28137217d217ac4024893d753f46bce9de26"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bdee92c56a71d2b24c33a7d4c2856bd6419d017e08caa7802d2963870e315028"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:412f287c55a6f54b0650bd9b6dce5aceddb95864a1a90c87af16979d37c89771"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2c93f00dcba2eea70af2be5f11a830a742fe6b579a1d4e00f47760ef13be247a"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f117e1a089d9411663a3207ba874f31be9ac8eaa5b533787024dc07aeb74f464"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-win_amd64.whl", hash = "sha256:be6a04e6c79819c9a8c2373317d19a96048e5a3f90bec587787e86a1153883c2"},
+ {file = "kiwisolver-1.4.9-cp311-cp311-win_arm64.whl", hash = "sha256:0ae37737256ba2de764ddc12aed4956460277f00c4996d51a197e72f62f5eec7"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ac5a486ac389dddcc5bef4f365b6ae3ffff2c433324fb38dd35e3fab7c957999"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2ba92255faa7309d06fe44c3a4a97efe1c8d640c2a79a5ef728b685762a6fd2"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a2899935e724dd1074cb568ce7ac0dce28b2cd6ab539c8e001a8578eb106d14"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f6008a4919fdbc0b0097089f67a1eb55d950ed7e90ce2cc3e640abadd2757a04"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bb8b474b4181770f926f7b7d2f8c0248cbcb78b660fdd41a47054b28d2a752"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2327a4a30d3ee07d2fbe2e7933e8a37c591663b96ce42a00bc67461a87d7df77"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a08b491ec91b1d5053ac177afe5290adacf1f0f6307d771ccac5de30592d198"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8fc5c867c22b828001b6a38d2eaeb88160bf5783c6cb4a5e440efc981ce286d"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3b3115b2581ea35bb6d1f24a4c90af37e5d9b49dcff267eeed14c3893c5b86ab"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858e4c22fb075920b96a291928cb7dea5644e94c0ee4fcd5af7e865655e4ccf2"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ed0fecd28cc62c54b262e3736f8bb2512d8dcfdc2bcf08be5f47f96bf405b145"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-win_amd64.whl", hash = "sha256:f68208a520c3d86ea51acf688a3e3002615a7f0238002cccc17affecc86a8a54"},
+ {file = "kiwisolver-1.4.9-cp312-cp312-win_arm64.whl", hash = "sha256:2c1a4f57df73965f3f14df20b80ee29e6a7930a57d2d9e8491a25f676e197c60"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5d0432ccf1c7ab14f9949eec60c5d1f924f17c037e9f8b33352fa05799359b8"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efb3a45b35622bb6c16dbfab491a8f5a391fe0e9d45ef32f4df85658232ca0e2"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a12cf6398e8a0a001a059747a1cbf24705e18fe413bc22de7b3d15c67cffe3f"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5656aa670507437af0207645273ccdfee4f14bacd7f7c67a4306d0dcaeaf6eed"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bfc08add558155345129c7803b3671cf195e6a56e7a12f3dde7c57d9b417f525"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:40092754720b174e6ccf9e845d0d8c7d8e12c3d71e7fc35f55f3813e96376f78"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:497d05f29a1300d14e02e6441cf0f5ee81c1ff5a304b0d9fb77423974684e08b"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdd1a81a1860476eb41ac4bc1e07b3f07259e6d55bbf739b79c8aaedcf512799"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e6b93f13371d341afee3be9f7c5964e3fe61d5fa30f6a30eb49856935dfe4fc3"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d75aa530ccfaa593da12834b86a0724f58bff12706659baa9227c2ccaa06264c"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-win_amd64.whl", hash = "sha256:dd0a578400839256df88c16abddf9ba14813ec5f21362e1fe65022e00c883d4d"},
+ {file = "kiwisolver-1.4.9-cp313-cp313-win_arm64.whl", hash = "sha256:d4188e73af84ca82468f09cadc5ac4db578109e52acb4518d8154698d3a87ca2"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5a0f2724dfd4e3b3ac5a82436a8e6fd16baa7d507117e4279b660fe8ca38a3a1"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b11d6a633e4ed84fc0ddafd4ebfd8ea49b3f25082c04ad12b8315c11d504dc1"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61874cdb0a36016354853593cffc38e56fc9ca5aa97d2c05d3dcf6922cd55a11"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:60c439763a969a6af93b4881db0eed8fadf93ee98e18cbc35bc8da868d0c4f0c"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92a2f997387a1b79a75e7803aa7ded2cfbe2823852ccf1ba3bcf613b62ae3197"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31d512c812daea6d8b3be3b2bfcbeb091dbb09177706569bcfc6240dcf8b41c"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:52a15b0f35dad39862d376df10c5230155243a2c1a436e39eb55623ccbd68185"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a30fd6fdef1430fd9e1ba7b3398b5ee4e2887783917a687d86ba69985fb08748"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cc9617b46837c6468197b5945e196ee9ca43057bb7d9d1ae688101e4e1dddf64"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:0ab74e19f6a2b027ea4f845a78827969af45ce790e6cb3e1ebab71bdf9f215ff"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dba5ee5d3981160c28d5490f0d1b7ed730c22470ff7f6cc26cfcfaacb9896a07"},
+ {file = "kiwisolver-1.4.9-cp313-cp313t-win_arm64.whl", hash = "sha256:0749fd8f4218ad2e851e11cc4dc05c7cbc0cbc4267bdfdb31782e65aace4ee9c"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9928fe1eb816d11ae170885a74d074f57af3a0d65777ca47e9aeb854a1fba386"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d0005b053977e7b43388ddec89fa567f43d4f6d5c2c0affe57de5ebf290dc552"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2635d352d67458b66fd0667c14cb1d4145e9560d503219034a18a87e971ce4f3"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:767c23ad1c58c9e827b649a9ab7809fd5fd9db266a9cf02b0e926ddc2c680d58"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72d0eb9fba308b8311685c2268cf7d0a0639a6cd027d8128659f72bdd8a024b4"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f68e4f3eeca8fb22cc3d731f9715a13b652795ef657a13df1ad0c7dc0e9731df"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d84cd4061ae292d8ac367b2c3fa3aad11cb8625a95d135fe93f286f914f3f5a6"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a60ea74330b91bd22a29638940d115df9dc00af5035a9a2a6ad9399ffb4ceca5"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ce6a3a4e106cf35c2d9c4fa17c05ce0b180db622736845d4315519397a77beaf"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:77937e5e2a38a7b48eef0585114fe7930346993a88060d0bf886086d2aa49ef5"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:24c175051354f4a28c5d6a31c93906dc653e2bf234e8a4bbfb964892078898ce"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-win_amd64.whl", hash = "sha256:0763515d4df10edf6d06a3c19734e2566368980d21ebec439f33f9eb936c07b7"},
+ {file = "kiwisolver-1.4.9-cp314-cp314-win_arm64.whl", hash = "sha256:0e4e2bf29574a6a7b7f6cb5fa69293b9f96c928949ac4a53ba3f525dffb87f9c"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d976bbb382b202f71c67f77b0ac11244021cfa3f7dfd9e562eefcea2df711548"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2489e4e5d7ef9a1c300a5e0196e43d9c739f066ef23270607d45aba368b91f2d"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e2ea9f7ab7fbf18fffb1b5434ce7c69a07582f7acc7717720f1d69f3e806f90c"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b34e51affded8faee0dfdb705416153819d8ea9250bbbf7ea1b249bdeb5f1122"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8aacd3d4b33b772542b2e01beb50187536967b514b00003bdda7589722d2a64"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7cf974dd4e35fa315563ac99d6287a1024e4dc2077b8a7d7cd3d2fb65d283134"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85bd218b5ecfbee8c8a82e121802dcb519a86044c9c3b2e4aef02fa05c6da370"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0856e241c2d3df4efef7c04a1e46b1936b6120c9bcf36dd216e3acd84bc4fb21"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9af39d6551f97d31a4deebeac6f45b156f9755ddc59c07b402c148f5dbb6482a"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:bb4ae2b57fc1d8cbd1cf7b1d9913803681ffa903e7488012be5b76dedf49297f"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aedff62918805fb62d43a4aa2ecd4482c380dc76cd31bd7c8878588a61bd0369"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-win_amd64.whl", hash = "sha256:1fa333e8b2ce4d9660f2cda9c0e1b6bafcfb2457a9d259faa82289e73ec24891"},
+ {file = "kiwisolver-1.4.9-cp314-cp314t-win_arm64.whl", hash = "sha256:4a48a2ce79d65d363597ef7b567ce3d14d68783d2b2263d98db3d9477805ba32"},
+ {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4d1d9e582ad4d63062d34077a9a1e9f3c34088a2ec5135b1f7190c07cf366527"},
+ {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:deed0c7258ceb4c44ad5ec7d9918f9f14fd05b2be86378d86cf50e63d1e7b771"},
+ {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a590506f303f512dff6b7f75fd2fd18e16943efee932008fe7140e5fa91d80e"},
+ {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e09c2279a4d01f099f52d5c4b3d9e208e91edcbd1a175c9662a8b16e000fece9"},
+ {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb"},
+ {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:720e05574713db64c356e86732c0f3c5252818d05f9df320f0ad8380641acea5"},
+ {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:17680d737d5335b552994a2008fab4c851bcd7de33094a82067ef3a576ff02fa"},
+ {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:85b5352f94e490c028926ea567fc569c52ec79ce131dadb968d3853e809518c2"},
+ {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:464415881e4801295659462c49461a24fb107c140de781d55518c4b80cb6790f"},
+ {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1"},
+ {file = "kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d"},
+]
+
+[[package]]
+name = "matplotlib"
+version = "3.9.4"
+description = "Python plotting package"
+optional = false
+python-versions = ">=3.9"
+groups = ["main", "dev"]
+markers = "python_version == \"3.9\""
+files = [
+ {file = "matplotlib-3.9.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:c5fdd7abfb706dfa8d307af64a87f1a862879ec3cd8d0ec8637458f0885b9c50"},
+ {file = "matplotlib-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d89bc4e85e40a71d1477780366c27fb7c6494d293e1617788986f74e2a03d7ff"},
+ {file = "matplotlib-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ddf9f3c26aae695c5daafbf6b94e4c1a30d6cd617ba594bbbded3b33a1fcfa26"},
+ {file = "matplotlib-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18ebcf248030173b59a868fda1fe42397253f6698995b55e81e1f57431d85e50"},
+ {file = "matplotlib-3.9.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:974896ec43c672ec23f3f8c648981e8bc880ee163146e0312a9b8def2fac66f5"},
+ {file = "matplotlib-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:4598c394ae9711cec135639374e70871fa36b56afae17bdf032a345be552a88d"},
+ {file = "matplotlib-3.9.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d4dd29641d9fb8bc4492420c5480398dd40a09afd73aebe4eb9d0071a05fbe0c"},
+ {file = "matplotlib-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30e5b22e8bcfb95442bf7d48b0d7f3bdf4a450cbf68986ea45fca3d11ae9d099"},
+ {file = "matplotlib-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bb0030d1d447fd56dcc23b4c64a26e44e898f0416276cac1ebc25522e0ac249"},
+ {file = "matplotlib-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aca90ed222ac3565d2752b83dbb27627480d27662671e4d39da72e97f657a423"},
+ {file = "matplotlib-3.9.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a181b2aa2906c608fcae72f977a4a2d76e385578939891b91c2550c39ecf361e"},
+ {file = "matplotlib-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:1f6882828231eca17f501c4dcd98a05abb3f03d157fbc0769c6911fe08b6cfd3"},
+ {file = "matplotlib-3.9.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:dfc48d67e6661378a21c2983200a654b72b5c5cdbd5d2cf6e5e1ece860f0cc70"},
+ {file = "matplotlib-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47aef0fab8332d02d68e786eba8113ffd6f862182ea2999379dec9e237b7e483"},
+ {file = "matplotlib-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fba1f52c6b7dc764097f52fd9ab627b90db452c9feb653a59945de16752e965f"},
+ {file = "matplotlib-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:173ac3748acaac21afcc3fa1633924609ba1b87749006bc25051c52c422a5d00"},
+ {file = "matplotlib-3.9.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:320edea0cadc07007765e33f878b13b3738ffa9745c5f707705692df70ffe0e0"},
+ {file = "matplotlib-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a4a4cfc82330b27042a7169533da7991e8789d180dd5b3daeaee57d75cd5a03b"},
+ {file = "matplotlib-3.9.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:37eeffeeca3c940985b80f5b9a7b95ea35671e0e7405001f249848d2b62351b6"},
+ {file = "matplotlib-3.9.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3e7465ac859ee4abcb0d836137cd8414e7bb7ad330d905abced457217d4f0f45"},
+ {file = "matplotlib-3.9.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4c12302c34afa0cf061bea23b331e747e5e554b0fa595c96e01c7b75bc3b858"},
+ {file = "matplotlib-3.9.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b8c97917f21b75e72108b97707ba3d48f171541a74aa2a56df7a40626bafc64"},
+ {file = "matplotlib-3.9.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0229803bd7e19271b03cb09f27db76c918c467aa4ce2ae168171bc67c3f508df"},
+ {file = "matplotlib-3.9.4-cp313-cp313-win_amd64.whl", hash = "sha256:7c0d8ef442ebf56ff5e206f8083d08252ee738e04f3dc88ea882853a05488799"},
+ {file = "matplotlib-3.9.4-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a04c3b00066a688834356d196136349cb32f5e1003c55ac419e91585168b88fb"},
+ {file = "matplotlib-3.9.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:04c519587f6c210626741a1e9a68eefc05966ede24205db8982841826af5871a"},
+ {file = "matplotlib-3.9.4-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:308afbf1a228b8b525fcd5cec17f246bbbb63b175a3ef6eb7b4d33287ca0cf0c"},
+ {file = "matplotlib-3.9.4-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddb3b02246ddcffd3ce98e88fed5b238bc5faff10dbbaa42090ea13241d15764"},
+ {file = "matplotlib-3.9.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8a75287e9cb9eee48cb79ec1d806f75b29c0fde978cb7223a1f4c5848d696041"},
+ {file = "matplotlib-3.9.4-cp313-cp313t-win_amd64.whl", hash = "sha256:488deb7af140f0ba86da003e66e10d55ff915e152c78b4b66d231638400b1965"},
+ {file = "matplotlib-3.9.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:3c3724d89a387ddf78ff88d2a30ca78ac2b4c89cf37f2db4bd453c34799e933c"},
+ {file = "matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d5f0a8430ffe23d7e32cfd86445864ccad141797f7d25b7c41759a5b5d17cfd7"},
+ {file = "matplotlib-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bb0141a21aef3b64b633dc4d16cbd5fc538b727e4958be82a0e1c92a234160e"},
+ {file = "matplotlib-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57aa235109e9eed52e2c2949db17da185383fa71083c00c6c143a60e07e0888c"},
+ {file = "matplotlib-3.9.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b18c600061477ccfdd1e6fd050c33d8be82431700f3452b297a56d9ed7037abb"},
+ {file = "matplotlib-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:ef5f2d1b67d2d2145ff75e10f8c008bfbf71d45137c4b648c87193e7dd053eac"},
+ {file = "matplotlib-3.9.4-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:44e0ed786d769d85bc787b0606a53f2d8d2d1d3c8a2608237365e9121c1a338c"},
+ {file = "matplotlib-3.9.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:09debb9ce941eb23ecdbe7eab972b1c3e0276dcf01688073faff7b0f61d6c6ca"},
+ {file = "matplotlib-3.9.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcc53cf157a657bfd03afab14774d54ba73aa84d42cfe2480c91bd94873952db"},
+ {file = "matplotlib-3.9.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ad45da51be7ad02387801fd154ef74d942f49fe3fcd26a64c94842ba7ec0d865"},
+ {file = "matplotlib-3.9.4.tar.gz", hash = "sha256:1e00e8be7393cbdc6fedfa8a6fba02cf3e83814b285db1c60b906a023ba41bc3"},
+]
+
+[package.dependencies]
+contourpy = ">=1.0.1"
+cycler = ">=0.10"
+fonttools = ">=4.22.0"
+importlib-resources = {version = ">=3.2.0", markers = "python_version < \"3.10\""}
+kiwisolver = ">=1.3.1"
+numpy = ">=1.23"
+packaging = ">=20.0"
+pillow = ">=8"
+pyparsing = ">=2.3.1"
+python-dateutil = ">=2.7"
+
+[package.extras]
+dev = ["meson-python (>=0.13.1,<0.17.0)", "numpy (>=1.25)", "pybind11 (>=2.6,!=2.13.3)", "setuptools (>=64)", "setuptools_scm (>=7)"]
+
+[[package]]
+name = "matplotlib"
+version = "3.10.8"
+description = "Python plotting package"
+optional = false
+python-versions = ">=3.10"
+groups = ["main", "dev"]
+markers = "python_version >= \"3.10\""
+files = [
+ {file = "matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7"},
+ {file = "matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656"},
+ {file = "matplotlib-3.10.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee40c27c795bda6a5292e9cff9890189d32f7e3a0bf04e0e3c9430c4a00c37df"},
+ {file = "matplotlib-3.10.8-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a48f2b74020919552ea25d222d5cc6af9ca3f4eb43a93e14d068457f545c2a17"},
+ {file = "matplotlib-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f254d118d14a7f99d616271d6c3c27922c092dac11112670b157798b89bf4933"},
+ {file = "matplotlib-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:f9b587c9c7274c1613a30afabf65a272114cd6cdbe67b3406f818c79d7ab2e2a"},
+ {file = "matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160"},
+ {file = "matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78"},
+ {file = "matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4"},
+ {file = "matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2"},
+ {file = "matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6"},
+ {file = "matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9"},
+ {file = "matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2"},
+ {file = "matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a"},
+ {file = "matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58"},
+ {file = "matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04"},
+ {file = "matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f"},
+ {file = "matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466"},
+ {file = "matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf"},
+ {file = "matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b"},
+ {file = "matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6"},
+ {file = "matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1"},
+ {file = "matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486"},
+ {file = "matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce"},
+ {file = "matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6"},
+ {file = "matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149"},
+ {file = "matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645"},
+ {file = "matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077"},
+ {file = "matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22"},
+ {file = "matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39"},
+ {file = "matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565"},
+ {file = "matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a"},
+ {file = "matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958"},
+ {file = "matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5"},
+ {file = "matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f"},
+ {file = "matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b"},
+ {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d"},
+ {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008"},
+ {file = "matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c"},
+ {file = "matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11"},
+ {file = "matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8"},
+ {file = "matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50"},
+ {file = "matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908"},
+ {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a"},
+ {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1"},
+ {file = "matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c"},
+ {file = "matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b"},
+ {file = "matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f"},
+ {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f97aeb209c3d2511443f8797e3e5a569aebb040d4f8bc79aa3ee78a8fb9e3dd8"},
+ {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fb061f596dad3a0f52b60dc6a5dec4a0c300dec41e058a7efe09256188d170b7"},
+ {file = "matplotlib-3.10.8-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:12d90df9183093fcd479f4172ac26b322b1248b15729cb57f42f71f24c7e37a3"},
+ {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1"},
+ {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a"},
+ {file = "matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2"},
+ {file = "matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3"},
+]
+
+[package.dependencies]
+contourpy = ">=1.0.1"
+cycler = ">=0.10"
+fonttools = ">=4.22.0"
+kiwisolver = ">=1.3.1"
+numpy = ">=1.23"
+packaging = ">=20.0"
+pillow = ">=8"
+pyparsing = ">=3"
+python-dateutil = ">=2.7"
+
+[package.extras]
+dev = ["meson-python (>=0.13.1,<0.17.0)", "pybind11 (>=2.13.2,!=2.13.3)", "setuptools (>=64)", "setuptools_scm (>=7)"]
+
[[package]]
name = "mypy-extensions"
version = "1.1.0"
description = "Type system extensions for programs checked with the mypy type checker."
optional = false
python-versions = ">=3.8"
+groups = ["main", "dev"]
files = [
{file = "mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505"},
{file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"},
@@ -402,6 +1312,7 @@ version = "1.10.0"
description = "Node.js virtual environment builder"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+groups = ["main", "dev"]
files = [
{file = "nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827"},
{file = "nodeenv-1.10.0.tar.gz", hash = "sha256:996c191ad80897d076bdfba80a41994c2b47c68e224c542b48feba42ba00f8bb"},
@@ -413,6 +1324,7 @@ version = "2.0.2"
description = "Fundamental package for array computing in Python"
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"},
{file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"},
@@ -467,6 +1379,7 @@ version = "25.0"
description = "Core utilities for Python packages"
optional = false
python-versions = ">=3.8"
+groups = ["main", "dev"]
files = [
{file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"},
{file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"},
@@ -478,6 +1391,7 @@ version = "2.3.3"
description = "Powerful data structures for data analysis, time series, and statistics"
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c"},
{file = "pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a"},
@@ -577,6 +1491,7 @@ version = "1.0.3"
description = "Utility library for gitignore style pattern matching of file paths."
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c"},
{file = "pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d"},
@@ -588,12 +1503,249 @@ optional = ["typing-extensions (>=4)"]
re2 = ["google-re2 (>=1.1)"]
tests = ["pytest (>=9)", "typing-extensions (>=4.15)"]
+
+[[package]]
+name = "pillow"
+version = "11.3.0"
+description = "Python Imaging Library (Fork)"
+optional = false
+python-versions = ">=3.9"
+groups = ["main", "dev"]
+markers = "python_version == \"3.9\""
+files = [
+ {file = "pillow-11.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1b9c17fd4ace828b3003dfd1e30bff24863e0eb59b535e8f80194d9cc7ecf860"},
+ {file = "pillow-11.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:65dc69160114cdd0ca0f35cb434633c75e8e7fad4cf855177a05bf38678f73ad"},
+ {file = "pillow-11.3.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7107195ddc914f656c7fc8e4a5e1c25f32e9236ea3ea860f257b0436011fddd0"},
+ {file = "pillow-11.3.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc3e831b563b3114baac7ec2ee86819eb03caa1a2cef0b481a5675b59c4fe23b"},
+ {file = "pillow-11.3.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f182ebd2303acf8c380a54f615ec883322593320a9b00438eb842c1f37ae50"},
+ {file = "pillow-11.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4445fa62e15936a028672fd48c4c11a66d641d2c05726c7ec1f8ba6a572036ae"},
+ {file = "pillow-11.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:71f511f6b3b91dd543282477be45a033e4845a40278fa8dcdbfdb07109bf18f9"},
+ {file = "pillow-11.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:040a5b691b0713e1f6cbe222e0f4f74cd233421e105850ae3b3c0ceda520f42e"},
+ {file = "pillow-11.3.0-cp310-cp310-win32.whl", hash = "sha256:89bd777bc6624fe4115e9fac3352c79ed60f3bb18651420635f26e643e3dd1f6"},
+ {file = "pillow-11.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:19d2ff547c75b8e3ff46f4d9ef969a06c30ab2d4263a9e287733aa8b2429ce8f"},
+ {file = "pillow-11.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:819931d25e57b513242859ce1876c58c59dc31587847bf74cfe06b2e0cb22d2f"},
+ {file = "pillow-11.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1cd110edf822773368b396281a2293aeb91c90a2db00d78ea43e7e861631b722"},
+ {file = "pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c412fddd1b77a75aa904615ebaa6001f169b26fd467b4be93aded278266b288"},
+ {file = "pillow-11.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1aa4de119a0ecac0a34a9c8bde33f34022e2e8f99104e47a3ca392fd60e37d"},
+ {file = "pillow-11.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:91da1d88226663594e3f6b4b8c3c8d85bd504117d043740a8e0ec449087cc494"},
+ {file = "pillow-11.3.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:643f189248837533073c405ec2f0bb250ba54598cf80e8c1e043381a60632f58"},
+ {file = "pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:106064daa23a745510dabce1d84f29137a37224831d88eb4ce94bb187b1d7e5f"},
+ {file = "pillow-11.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd8ff254faf15591e724dc7c4ddb6bf4793efcbe13802a4ae3e863cd300b493e"},
+ {file = "pillow-11.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:932c754c2d51ad2b2271fd01c3d121daaa35e27efae2a616f77bf164bc0b3e94"},
+ {file = "pillow-11.3.0-cp311-cp311-win32.whl", hash = "sha256:b4b8f3efc8d530a1544e5962bd6b403d5f7fe8b9e08227c6b255f98ad82b4ba0"},
+ {file = "pillow-11.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:1a992e86b0dd7aeb1f053cd506508c0999d710a8f07b4c791c63843fc6a807ac"},
+ {file = "pillow-11.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:30807c931ff7c095620fe04448e2c2fc673fcbb1ffe2a7da3fb39613489b1ddd"},
+ {file = "pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4"},
+ {file = "pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69"},
+ {file = "pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d"},
+ {file = "pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6"},
+ {file = "pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7"},
+ {file = "pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024"},
+ {file = "pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809"},
+ {file = "pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d"},
+ {file = "pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149"},
+ {file = "pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d"},
+ {file = "pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542"},
+ {file = "pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:1c627742b539bba4309df89171356fcb3cc5a9178355b2727d1b74a6cf155fbd"},
+ {file = "pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30b7c02f3899d10f13d7a48163c8969e4e653f8b43416d23d13d1bbfdc93b9f8"},
+ {file = "pillow-11.3.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:7859a4cc7c9295f5838015d8cc0a9c215b77e43d07a25e460f35cf516df8626f"},
+ {file = "pillow-11.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec1ee50470b0d050984394423d96325b744d55c701a439d2bd66089bff963d3c"},
+ {file = "pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7db51d222548ccfd274e4572fdbf3e810a5e66b00608862f947b163e613b67dd"},
+ {file = "pillow-11.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2d6fcc902a24ac74495df63faad1884282239265c6839a0a6416d33faedfae7e"},
+ {file = "pillow-11.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0f5d8f4a08090c6d6d578351a2b91acf519a54986c055af27e7a93feae6d3f1"},
+ {file = "pillow-11.3.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c37d8ba9411d6003bba9e518db0db0c58a680ab9fe5179f040b0463644bc9805"},
+ {file = "pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8"},
+ {file = "pillow-11.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:023f6d2d11784a465f09fd09a34b150ea4672e85fb3d05931d89f373ab14abb2"},
+ {file = "pillow-11.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:45dfc51ac5975b938e9809451c51734124e73b04d0f0ac621649821a63852e7b"},
+ {file = "pillow-11.3.0-cp313-cp313-win32.whl", hash = "sha256:a4d336baed65d50d37b88ca5b60c0fa9d81e3a87d4a7930d3880d1624d5b31f3"},
+ {file = "pillow-11.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0bce5c4fd0921f99d2e858dc4d4d64193407e1b99478bc5cacecba2311abde51"},
+ {file = "pillow-11.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:1904e1264881f682f02b7f8167935cce37bc97db457f8e7849dc3a6a52b99580"},
+ {file = "pillow-11.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4c834a3921375c48ee6b9624061076bc0a32a60b5532b322cc0ea64e639dd50e"},
+ {file = "pillow-11.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e05688ccef30ea69b9317a9ead994b93975104a677a36a8ed8106be9260aa6d"},
+ {file = "pillow-11.3.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1019b04af07fc0163e2810167918cb5add8d74674b6267616021ab558dc98ced"},
+ {file = "pillow-11.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f944255db153ebb2b19c51fe85dd99ef0ce494123f21b9db4877ffdfc5590c7c"},
+ {file = "pillow-11.3.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f85acb69adf2aaee8b7da124efebbdb959a104db34d3a2cb0f3793dbae422a8"},
+ {file = "pillow-11.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05f6ecbeff5005399bb48d198f098a9b4b6bdf27b8487c7f38ca16eeb070cd59"},
+ {file = "pillow-11.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a7bc6e6fd0395bc052f16b1a8670859964dbd7003bd0af2ff08342eb6e442cfe"},
+ {file = "pillow-11.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:83e1b0161c9d148125083a35c1c5a89db5b7054834fd4387499e06552035236c"},
+ {file = "pillow-11.3.0-cp313-cp313t-win32.whl", hash = "sha256:2a3117c06b8fb646639dce83694f2f9eac405472713fcb1ae887469c0d4f6788"},
+ {file = "pillow-11.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:857844335c95bea93fb39e0fa2726b4d9d758850b34075a7e3ff4f4fa3aa3b31"},
+ {file = "pillow-11.3.0-cp313-cp313t-win_arm64.whl", hash = "sha256:8797edc41f3e8536ae4b10897ee2f637235c94f27404cac7297f7b607dd0716e"},
+ {file = "pillow-11.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d9da3df5f9ea2a89b81bb6087177fb1f4d1c7146d583a3fe5c672c0d94e55e12"},
+ {file = "pillow-11.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0b275ff9b04df7b640c59ec5a3cb113eefd3795a8df80bac69646ef699c6981a"},
+ {file = "pillow-11.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0743841cabd3dba6a83f38a92672cccbd69af56e3e91777b0ee7f4dba4385632"},
+ {file = "pillow-11.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2465a69cf967b8b49ee1b96d76718cd98c4e925414ead59fdf75cf0fd07df673"},
+ {file = "pillow-11.3.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41742638139424703b4d01665b807c6468e23e699e8e90cffefe291c5832b027"},
+ {file = "pillow-11.3.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93efb0b4de7e340d99057415c749175e24c8864302369e05914682ba642e5d77"},
+ {file = "pillow-11.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7966e38dcd0fa11ca390aed7c6f20454443581d758242023cf36fcb319b1a874"},
+ {file = "pillow-11.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:98a9afa7b9007c67ed84c57c9e0ad86a6000da96eaa638e4f8abe5b65ff83f0a"},
+ {file = "pillow-11.3.0-cp314-cp314-win32.whl", hash = "sha256:02a723e6bf909e7cea0dac1b0e0310be9d7650cd66222a5f1c571455c0a45214"},
+ {file = "pillow-11.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:a418486160228f64dd9e9efcd132679b7a02a5f22c982c78b6fc7dab3fefb635"},
+ {file = "pillow-11.3.0-cp314-cp314-win_arm64.whl", hash = "sha256:155658efb5e044669c08896c0c44231c5e9abcaadbc5cd3648df2f7c0b96b9a6"},
+ {file = "pillow-11.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:59a03cdf019efbfeeed910bf79c7c93255c3d54bc45898ac2a4140071b02b4ae"},
+ {file = "pillow-11.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f8a5827f84d973d8636e9dc5764af4f0cf2318d26744b3d902931701b0d46653"},
+ {file = "pillow-11.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ee92f2fd10f4adc4b43d07ec5e779932b4eb3dbfbc34790ada5a6669bc095aa6"},
+ {file = "pillow-11.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c96d333dcf42d01f47b37e0979b6bd73ec91eae18614864622d9b87bbd5bbf36"},
+ {file = "pillow-11.3.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c96f993ab8c98460cd0c001447bff6194403e8b1d7e149ade5f00594918128b"},
+ {file = "pillow-11.3.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:41342b64afeba938edb034d122b2dda5db2139b9a4af999729ba8818e0056477"},
+ {file = "pillow-11.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:068d9c39a2d1b358eb9f245ce7ab1b5c3246c7c8c7d9ba58cfa5b43146c06e50"},
+ {file = "pillow-11.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bc6ba083b145187f648b667e05a2534ecc4b9f2784c2cbe3089e44868f2b9b"},
+ {file = "pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12"},
+ {file = "pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db"},
+ {file = "pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa"},
+ {file = "pillow-11.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:48d254f8a4c776de343051023eb61ffe818299eeac478da55227d96e241de53f"},
+ {file = "pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7aee118e30a4cf54fdd873bd3a29de51e29105ab11f9aad8c32123f58c8f8081"},
+ {file = "pillow-11.3.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:23cff760a9049c502721bdb743a7cb3e03365fafcdfc2ef9784610714166e5a4"},
+ {file = "pillow-11.3.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6359a3bc43f57d5b375d1ad54a0074318a0844d11b76abccf478c37c986d3cfc"},
+ {file = "pillow-11.3.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:092c80c76635f5ecb10f3f83d76716165c96f5229addbd1ec2bdbbda7d496e06"},
+ {file = "pillow-11.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cadc9e0ea0a2431124cde7e1697106471fc4c1da01530e679b2391c37d3fbb3a"},
+ {file = "pillow-11.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6a418691000f2a418c9135a7cf0d797c1bb7d9a485e61fe8e7722845b95ef978"},
+ {file = "pillow-11.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:97afb3a00b65cc0804d1c7abddbf090a81eaac02768af58cbdcaaa0a931e0b6d"},
+ {file = "pillow-11.3.0-cp39-cp39-win32.whl", hash = "sha256:ea944117a7974ae78059fcc1800e5d3295172bb97035c0c1d9345fca1419da71"},
+ {file = "pillow-11.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:e5c5858ad8ec655450a7c7df532e9842cf8df7cc349df7225c60d5d348c8aada"},
+ {file = "pillow-11.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:6abdbfd3aea42be05702a8dd98832329c167ee84400a1d1f61ab11437f1717eb"},
+ {file = "pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967"},
+ {file = "pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe"},
+ {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c"},
+ {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d000f46e2917c705e9fb93a3606ee4a819d1e3aa7a9b442f6444f07e77cf5e25"},
+ {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:527b37216b6ac3a12d7838dc3bd75208ec57c1c6d11ef01902266a5a0c14fc27"},
+ {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:be5463ac478b623b9dd3937afd7fb7ab3d79dd290a28e2b6df292dc75063eb8a"},
+ {file = "pillow-11.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8dc70ca24c110503e16918a658b869019126ecfe03109b754c402daff12b3d9f"},
+ {file = "pillow-11.3.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7c8ec7a017ad1bd562f93dbd8505763e688d388cde6e4a010ae1486916e713e6"},
+ {file = "pillow-11.3.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9ab6ae226de48019caa8074894544af5b53a117ccb9d3b3dcb2871464c829438"},
+ {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fe27fb049cdcca11f11a7bfda64043c37b30e6b91f10cb5bab275806c32f6ab3"},
+ {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:465b9e8844e3c3519a983d58b80be3f668e2a7a5db97f2784e7079fbc9f9822c"},
+ {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5418b53c0d59b3824d05e029669efa023bbef0f3e92e75ec8428f3799487f361"},
+ {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:504b6f59505f08ae014f724b6207ff6222662aab5cc9542577fb084ed0676ac7"},
+ {file = "pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8"},
+ {file = "pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523"},
+]
+
+[package.extras]
+docs = ["furo", "olefile", "sphinx (>=8.2)", "sphinx-autobuild", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"]
+fpx = ["olefile"]
+mic = ["olefile"]
+test-arrow = ["pyarrow"]
+tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "trove-classifiers (>=2024.10.12)"]
+typing = ["typing-extensions ; python_version < \"3.10\""]
+xmp = ["defusedxml"]
+
+[[package]]
+name = "pillow"
+version = "12.1.0"
+description = "Python Imaging Library (fork)"
+optional = false
+python-versions = ">=3.10"
+groups = ["main", "dev"]
+markers = "python_version >= \"3.10\""
+files = [
+ {file = "pillow-12.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:fb125d860738a09d363a88daa0f59c4533529a90e564785e20fe875b200b6dbd"},
+ {file = "pillow-12.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cad302dc10fac357d3467a74a9561c90609768a6f73a1923b0fd851b6486f8b0"},
+ {file = "pillow-12.1.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a40905599d8079e09f25027423aed94f2823adaf2868940de991e53a449e14a8"},
+ {file = "pillow-12.1.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:92a7fe4225365c5e3a8e598982269c6d6698d3e783b3b1ae979e7819f9cd55c1"},
+ {file = "pillow-12.1.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f10c98f49227ed8383d28174ee95155a675c4ed7f85e2e573b04414f7e371bda"},
+ {file = "pillow-12.1.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8637e29d13f478bc4f153d8daa9ffb16455f0a6cb287da1b432fdad2bfbd66c7"},
+ {file = "pillow-12.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:21e686a21078b0f9cb8c8a961d99e6a4ddb88e0fc5ea6e130172ddddc2e5221a"},
+ {file = "pillow-12.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2415373395a831f53933c23ce051021e79c8cd7979822d8cc478547a3f4da8ef"},
+ {file = "pillow-12.1.0-cp310-cp310-win32.whl", hash = "sha256:e75d3dba8fc1ddfec0cd752108f93b83b4f8d6ab40e524a95d35f016b9683b09"},
+ {file = "pillow-12.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:64efdf00c09e31efd754448a383ea241f55a994fd079866b92d2bbff598aad91"},
+ {file = "pillow-12.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:f188028b5af6b8fb2e9a76ac0f841a575bd1bd396e46ef0840d9b88a48fdbcea"},
+ {file = "pillow-12.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:a83e0850cb8f5ac975291ebfc4170ba481f41a28065277f7f735c202cd8e0af3"},
+ {file = "pillow-12.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b6e53e82ec2db0717eabb276aa56cf4e500c9a7cec2c2e189b55c24f65a3e8c0"},
+ {file = "pillow-12.1.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:40a8e3b9e8773876d6e30daed22f016509e3987bab61b3b7fe309d7019a87451"},
+ {file = "pillow-12.1.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:800429ac32c9b72909c671aaf17ecd13110f823ddb7db4dfef412a5587c2c24e"},
+ {file = "pillow-12.1.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b022eaaf709541b391ee069f0022ee5b36c709df71986e3f7be312e46f42c84"},
+ {file = "pillow-12.1.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f345e7bc9d7f368887c712aa5054558bad44d2a301ddf9248599f4161abc7c0"},
+ {file = "pillow-12.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d70347c8a5b7ccd803ec0c85c8709f036e6348f1e6a5bf048ecd9c64d3550b8b"},
+ {file = "pillow-12.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1fcc52d86ce7a34fd17cb04e87cfdb164648a3662a6f20565910a99653d66c18"},
+ {file = "pillow-12.1.0-cp311-cp311-win32.whl", hash = "sha256:3ffaa2f0659e2f740473bcf03c702c39a8d4b2b7ffc629052028764324842c64"},
+ {file = "pillow-12.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:806f3987ffe10e867bab0ddad45df1148a2b98221798457fa097ad85d6e8bc75"},
+ {file = "pillow-12.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:9f5fefaca968e700ad1a4a9de98bf0869a94e397fe3524c4c9450c1445252304"},
+ {file = "pillow-12.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a332ac4ccb84b6dde65dbace8431f3af08874bf9770719d32a635c4ef411b18b"},
+ {file = "pillow-12.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:907bfa8a9cb790748a9aa4513e37c88c59660da3bcfffbd24a7d9e6abf224551"},
+ {file = "pillow-12.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:efdc140e7b63b8f739d09a99033aa430accce485ff78e6d311973a67b6bf3208"},
+ {file = "pillow-12.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bef9768cab184e7ae6e559c032e95ba8d07b3023c289f79a2bd36e8bf85605a5"},
+ {file = "pillow-12.1.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:742aea052cf5ab5034a53c3846165bc3ce88d7c38e954120db0ab867ca242661"},
+ {file = "pillow-12.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6dfc2af5b082b635af6e08e0d1f9f1c4e04d17d4e2ca0ef96131e85eda6eb17"},
+ {file = "pillow-12.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:609e89d9f90b581c8d16358c9087df76024cf058fa693dd3e1e1620823f39670"},
+ {file = "pillow-12.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:43b4899cfd091a9693a1278c4982f3e50f7fb7cff5153b05174b4afc9593b616"},
+ {file = "pillow-12.1.0-cp312-cp312-win32.whl", hash = "sha256:aa0c9cc0b82b14766a99fbe6084409972266e82f459821cd26997a488a7261a7"},
+ {file = "pillow-12.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:d70534cea9e7966169ad29a903b99fc507e932069a881d0965a1a84bb57f6c6d"},
+ {file = "pillow-12.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:65b80c1ee7e14a87d6a068dd3b0aea268ffcabfe0498d38661b00c5b4b22e74c"},
+ {file = "pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:7b5dd7cbae20285cdb597b10eb5a2c13aa9de6cde9bb64a3c1317427b1db1ae1"},
+ {file = "pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:29a4cef9cb672363926f0470afc516dbf7305a14d8c54f7abbb5c199cd8f8179"},
+ {file = "pillow-12.1.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:681088909d7e8fa9e31b9799aaa59ba5234c58e5e4f1951b4c4d1082a2e980e0"},
+ {file = "pillow-12.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:983976c2ab753166dc66d36af6e8ec15bb511e4a25856e2227e5f7e00a160587"},
+ {file = "pillow-12.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:db44d5c160a90df2d24a24760bbd37607d53da0b34fb546c4c232af7192298ac"},
+ {file = "pillow-12.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b7a9d1db5dad90e2991645874f708e87d9a3c370c243c2d7684d28f7e133e6b"},
+ {file = "pillow-12.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6258f3260986990ba2fa8a874f8b6e808cf5abb51a94015ca3dc3c68aa4f30ea"},
+ {file = "pillow-12.1.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e115c15e3bc727b1ca3e641a909f77f8ca72a64fff150f666fcc85e57701c26c"},
+ {file = "pillow-12.1.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6741e6f3074a35e47c77b23a4e4f2d90db3ed905cb1c5e6e0d49bff2045632bc"},
+ {file = "pillow-12.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:935b9d1aed48fcfb3f838caac506f38e29621b44ccc4f8a64d575cb1b2a88644"},
+ {file = "pillow-12.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5fee4c04aad8932da9f8f710af2c1a15a83582cfb884152a9caa79d4efcdbf9c"},
+ {file = "pillow-12.1.0-cp313-cp313-win32.whl", hash = "sha256:a786bf667724d84aa29b5db1c61b7bfdde380202aaca12c3461afd6b71743171"},
+ {file = "pillow-12.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:461f9dfdafa394c59cd6d818bdfdbab4028b83b02caadaff0ffd433faf4c9a7a"},
+ {file = "pillow-12.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:9212d6b86917a2300669511ed094a9406888362e085f2431a7da985a6b124f45"},
+ {file = "pillow-12.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:00162e9ca6d22b7c3ee8e61faa3c3253cd19b6a37f126cad04f2f88b306f557d"},
+ {file = "pillow-12.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7d6daa89a00b58c37cb1747ec9fb7ac3bc5ffd5949f5888657dfddde6d1312e0"},
+ {file = "pillow-12.1.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2479c7f02f9d505682dc47df8c0ea1fc5e264c4d1629a5d63fe3e2334b89554"},
+ {file = "pillow-12.1.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f188d580bd870cda1e15183790d1cc2fa78f666e76077d103edf048eed9c356e"},
+ {file = "pillow-12.1.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0fde7ec5538ab5095cc02df38ee99b0443ff0e1c847a045554cf5f9af1f4aa82"},
+ {file = "pillow-12.1.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0ed07dca4a8464bada6139ab38f5382f83e5f111698caf3191cb8dbf27d908b4"},
+ {file = "pillow-12.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f45bd71d1fa5e5749587613037b172e0b3b23159d1c00ef2fc920da6f470e6f0"},
+ {file = "pillow-12.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:277518bf4fe74aa91489e1b20577473b19ee70fb97c374aa50830b279f25841b"},
+ {file = "pillow-12.1.0-cp313-cp313t-win32.whl", hash = "sha256:7315f9137087c4e0ee73a761b163fc9aa3b19f5f606a7fc08d83fd3e4379af65"},
+ {file = "pillow-12.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:0ddedfaa8b5f0b4ffbc2fa87b556dc59f6bb4ecb14a53b33f9189713ae8053c0"},
+ {file = "pillow-12.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:80941e6d573197a0c28f394753de529bb436b1ca990ed6e765cf42426abc39f8"},
+ {file = "pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:5cb7bc1966d031aec37ddb9dcf15c2da5b2e9f7cc3ca7c54473a20a927e1eb91"},
+ {file = "pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:97e9993d5ed946aba26baf9c1e8cf18adbab584b99f452ee72f7ee8acb882796"},
+ {file = "pillow-12.1.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:414b9a78e14ffeb98128863314e62c3f24b8a86081066625700b7985b3f529bd"},
+ {file = "pillow-12.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e6bdb408f7c9dd2a5ff2b14a3b0bb6d4deb29fb9961e6eb3ae2031ae9a5cec13"},
+ {file = "pillow-12.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3413c2ae377550f5487991d444428f1a8ae92784aac79caa8b1e3b89b175f77e"},
+ {file = "pillow-12.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e5dcbe95016e88437ecf33544ba5db21ef1b8dd6e1b434a2cb2a3d605299e643"},
+ {file = "pillow-12.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d0a7735df32ccbcc98b98a1ac785cc4b19b580be1bdf0aeb5c03223220ea09d5"},
+ {file = "pillow-12.1.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c27407a2d1b96774cbc4a7594129cc027339fd800cd081e44497722ea1179de"},
+ {file = "pillow-12.1.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15c794d74303828eaa957ff8070846d0efe8c630901a1c753fdc63850e19ecd9"},
+ {file = "pillow-12.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c990547452ee2800d8506c4150280757f88532f3de2a58e3022e9b179107862a"},
+ {file = "pillow-12.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b63e13dd27da389ed9475b3d28510f0f954bca0041e8e551b2a4eb1eab56a39a"},
+ {file = "pillow-12.1.0-cp314-cp314-win32.whl", hash = "sha256:1a949604f73eb07a8adab38c4fe50791f9919344398bdc8ac6b307f755fc7030"},
+ {file = "pillow-12.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:4f9f6a650743f0ddee5593ac9e954ba1bdbc5e150bc066586d4f26127853ab94"},
+ {file = "pillow-12.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:808b99604f7873c800c4840f55ff389936ef1948e4e87645eaf3fccbc8477ac4"},
+ {file = "pillow-12.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bc11908616c8a283cf7d664f77411a5ed2a02009b0097ff8abbba5e79128ccf2"},
+ {file = "pillow-12.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:896866d2d436563fa2a43a9d72f417874f16b5545955c54a64941e87c1376c61"},
+ {file = "pillow-12.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8e178e3e99d3c0ea8fc64b88447f7cac8ccf058af422a6cedc690d0eadd98c51"},
+ {file = "pillow-12.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:079af2fb0c599c2ec144ba2c02766d1b55498e373b3ac64687e43849fbbef5bc"},
+ {file = "pillow-12.1.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdec5e43377761c5dbca620efb69a77f6855c5a379e32ac5b158f54c84212b14"},
+ {file = "pillow-12.1.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:565c986f4b45c020f5421a4cea13ef294dde9509a8577f29b2fc5edc7587fff8"},
+ {file = "pillow-12.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:43aca0a55ce1eefc0aefa6253661cb54571857b1a7b2964bd8a1e3ef4b729924"},
+ {file = "pillow-12.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0deedf2ea233722476b3a81e8cdfbad786f7adbed5d848469fa59fe52396e4ef"},
+ {file = "pillow-12.1.0-cp314-cp314t-win32.whl", hash = "sha256:b17fbdbe01c196e7e159aacb889e091f28e61020a8abeac07b68079b6e626988"},
+ {file = "pillow-12.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27b9baecb428899db6c0de572d6d305cfaf38ca1596b5c0542a5182e3e74e8c6"},
+ {file = "pillow-12.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f61333d817698bdcdd0f9d7793e365ac3d2a21c1f1eb02b32ad6aefb8d8ea831"},
+ {file = "pillow-12.1.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ca94b6aac0d7af2a10ba08c0f888b3d5114439b6b3ef39968378723622fed377"},
+ {file = "pillow-12.1.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:351889afef0f485b84078ea40fe33727a0492b9af3904661b0abbafee0355b72"},
+ {file = "pillow-12.1.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb0984b30e973f7e2884362b7d23d0a348c7143ee559f38ef3eaab640144204c"},
+ {file = "pillow-12.1.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:84cabc7095dd535ca934d57e9ce2a72ffd216e435a84acb06b2277b1de2689bd"},
+ {file = "pillow-12.1.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53d8b764726d3af1a138dd353116f774e3862ec7e3794e0c8781e30db0f35dfc"},
+ {file = "pillow-12.1.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5da841d81b1a05ef940a8567da92decaa15bc4d7dedb540a8c219ad83d91808a"},
+ {file = "pillow-12.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:75af0b4c229ac519b155028fa1be632d812a519abba9b46b20e50c6caa184f19"},
+ {file = "pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9"},
+]
+
+[package.extras]
+docs = ["furo", "olefile", "sphinx (>=8.2)", "sphinx-autobuild", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"]
+fpx = ["olefile"]
+mic = ["olefile"]
+test-arrow = ["arro3-compute", "arro3-core", "nanoarrow", "pyarrow"]
+tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma (>=5)", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "trove-classifiers (>=2024.10.12)"]
+xmp = ["defusedxml"]
+
[[package]]
name = "platformdirs"
version = "4.4.0"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "platformdirs-4.4.0-py3-none-any.whl", hash = "sha256:abd01743f24e5287cd7a5db3752faf1a2d65353f38ec26d98e25a6db65958c85"},
{file = "platformdirs-4.4.0.tar.gz", hash = "sha256:ca753cf4d81dc309bc67b0ea38fd15dc97bc30ce419a7f58d13eb3bf14c4febf"},
@@ -610,6 +1762,7 @@ version = "1.6.0"
description = "plugin and hook calling mechanisms for python"
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"},
{file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"},
@@ -625,6 +1778,7 @@ version = "3.8.0"
description = "A framework for managing and maintaining multi-language pre-commit hooks."
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f"},
{file = "pre_commit-3.8.0.tar.gz", hash = "sha256:8bb6494d4a20423842e198980c9ecf9f96607a07ea29549e180eef9ae80fe7af"},
@@ -643,6 +1797,7 @@ version = "6.33.4"
description = ""
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "protobuf-6.33.4-cp310-abi3-win32.whl", hash = "sha256:918966612c8232fc6c24c78e1cd89784307f5814ad7506c308ee3cf86662850d"},
{file = "protobuf-6.33.4-cp310-abi3-win_amd64.whl", hash = "sha256:8f11ffae31ec67fc2554c2ef891dcb561dae9a2a3ed941f9e134c2db06657dbc"},
@@ -662,6 +1817,7 @@ version = "0.10.9.7"
description = "Enables Python programs to dynamically access arbitrary Java objects"
optional = false
python-versions = "*"
+groups = ["main", "dev"]
files = [
{file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"},
{file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"},
@@ -673,6 +1829,7 @@ version = "21.0.0"
description = "Python library for Apache Arrow"
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e563271e2c5ff4d4a4cbeb2c83d5cf0d4938b891518e676025f7268c6fe5fe26"},
{file = "pyarrow-21.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fee33b0ca46f4c85443d6c450357101e47d53e6c3f008d658c27a2d020d44c79"},
@@ -728,6 +1885,7 @@ version = "2.19.2"
description = "Pygments is a syntax highlighting package written in Python."
optional = false
python-versions = ">=3.8"
+groups = ["main", "dev"]
files = [
{file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"},
{file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"},
@@ -736,12 +1894,28 @@ files = [
[package.extras]
windows-terminal = ["colorama (>=0.4.6)"]
+[[package]]
+name = "pyparsing"
+version = "3.3.1"
+description = "pyparsing - Classes and methods to define and execute parsing grammars"
+optional = false
+python-versions = ">=3.9"
+groups = ["main", "dev"]
+files = [
+ {file = "pyparsing-3.3.1-py3-none-any.whl", hash = "sha256:023b5e7e5520ad96642e2c6db4cb683d3970bd640cdf7115049a6e9c3682df82"},
+ {file = "pyparsing-3.3.1.tar.gz", hash = "sha256:47fad0f17ac1e2cad3de3b458570fbc9b03560aa029ed5e16ee5554da9a2251c"},
+]
+
+[package.extras]
+diagrams = ["jinja2", "railroad-diagrams"]
+
[[package]]
name = "pyspark"
version = "3.5.0"
description = "Apache Spark Python API"
optional = false
python-versions = ">=3.8"
+groups = ["main", "dev"]
files = [
{file = "pyspark-3.5.0.tar.gz", hash = "sha256:d41a9b76bd2aca370a6100d075c029e22ba44c5940927877e9435a3a9c566558"},
]
@@ -768,6 +1942,7 @@ version = "8.4.2"
description = "pytest: simple powerful testing with Python"
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79"},
{file = "pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01"},
@@ -791,6 +1966,7 @@ version = "4.1.0"
description = "Pytest plugin for measuring coverage."
optional = false
python-versions = ">=3.7"
+groups = ["main", "dev"]
files = [
{file = "pytest-cov-4.1.0.tar.gz", hash = "sha256:3904b13dfbfec47f003b8e77fd5b589cd11904a21ddf1ab38a64f204d6a10ef6"},
{file = "pytest_cov-4.1.0-py3-none-any.whl", hash = "sha256:6ba70b9e97e69fcc3fb45bfeab2d0a138fb65c4d0d6a41ef33983ad114be8c3a"},
@@ -809,6 +1985,7 @@ version = "14.0"
description = "pytest plugin to re-run tests to eliminate flaky failures"
optional = false
python-versions = ">=3.8"
+groups = ["main", "dev"]
files = [
{file = "pytest-rerunfailures-14.0.tar.gz", hash = "sha256:4a400bcbcd3c7a4ad151ab8afac123d90eca3abe27f98725dc4d9702887d2e92"},
{file = "pytest_rerunfailures-14.0-py3-none-any.whl", hash = "sha256:4197bdd2eaeffdbf50b5ea6e7236f47ff0e44d1def8dae08e409f536d84e7b32"},
@@ -824,6 +2001,7 @@ version = "2.9.0.post0"
description = "Extensions to the standard Python datetime module"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
+groups = ["main", "dev"]
files = [
{file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"},
{file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"},
@@ -838,6 +2016,7 @@ version = "2025.2"
description = "World timezone definitions, modern and historical"
optional = false
python-versions = "*"
+groups = ["main", "dev"]
files = [
{file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"},
{file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"},
@@ -849,6 +2028,7 @@ version = "6.0.3"
description = "YAML parser and emitter for Python"
optional = false
python-versions = ">=3.8"
+groups = ["main", "dev"]
files = [
{file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"},
{file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"},
@@ -931,19 +2111,20 @@ version = "80.9.0"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.9"
+groups = ["main"]
files = [
{file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"},
{file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"},
]
[package.extras]
-check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.8.0)"]
-core = ["importlib_metadata (>=6)", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"]
+check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""]
+core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"]
cover = ["pytest-cov"]
doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"]
enabler = ["pytest-enabler (>=2.2)"]
-test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
-type = ["importlib_metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.14.*)", "pytest-mypy"]
+test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"]
+type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"]
[[package]]
name = "six"
@@ -951,6 +2132,7 @@ version = "1.17.0"
description = "Python 2 and 3 compatibility utilities"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7"
+groups = ["main", "dev"]
files = [
{file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"},
{file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"},
@@ -962,6 +2144,8 @@ version = "2.4.0"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.8"
+groups = ["main", "dev"]
+markers = "python_version < \"3.11\""
files = [
{file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"},
{file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"},
@@ -1018,6 +2202,7 @@ version = "4.15.0"
description = "Backported and Experimental Type Hints for Python 3.9+"
optional = false
python-versions = ">=3.9"
+groups = ["main", "dev"]
files = [
{file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"},
{file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"},
@@ -1029,6 +2214,7 @@ version = "2025.3"
description = "Provider of IANA time zone data"
optional = false
python-versions = ">=2"
+groups = ["main", "dev"]
files = [
{file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"},
{file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"},
@@ -1040,6 +2226,7 @@ version = "20.36.1"
description = "Virtual Python Environment builder"
optional = false
python-versions = ">=3.8"
+groups = ["main", "dev"]
files = [
{file = "virtualenv-20.36.1-py3-none-any.whl", hash = "sha256:575a8d6b124ef88f6f51d56d656132389f961062a9177016a50e4f507bbcc19f"},
{file = "virtualenv-20.36.1.tar.gz", hash = "sha256:8befb5c81842c641f8ee658481e42641c68b5eab3521d8e092d18320902466ba"},
@@ -1056,9 +2243,36 @@ typing-extensions = {version = ">=4.13.2", markers = "python_version < \"3.11\""
[package.extras]
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
-test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
+test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"GraalVM\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""]
+
+[[package]]
+name = "zipp"
+version = "3.23.0"
+description = "Backport of pathlib-compatible object wrapper for zip files"
+optional = false
+python-versions = ">=3.9"
+groups = ["main", "dev"]
+markers = "python_version == \"3.9\""
+files = [
+ {file = "zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e"},
+ {file = "zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166"},
+]
+
+[package.extras]
+check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""]
+cover = ["pytest-cov"]
+doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
+enabler = ["pytest-enabler (>=2.2)"]
+test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
+type = ["pytest-mypy"]
+
+[extras]
+all = ["duckdb", "pyspark"]
+dev = ["black", "coverage", "duckdb", "matplotlib", "pre-commit", "pyspark", "pytest", "pytest-cov", "pytest-rerunfailures"]
+duckdb = ["duckdb"]
+spark = ["pyspark"]
[metadata]
-lock-version = "2.0"
+lock-version = "2.1"
python-versions = ">=3.9,<4"
-content-hash = "18db29f1829ab8baebdd68c486c74b5e7e4304a6d344a26773685b07b85fe7c3"
+content-hash = "7097bf5f307c1956c17cb9b53c57ed2adfa5a18836e3ab01a9beec01589fb20b"
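The `[extras]` table above is what the installation hints later in this diff resolve against (`pip install pydeequ[duckdb]`, `pydeequ[spark]`). A small runtime probe, as a sketch; the extra names come from the lock file, everything else is illustrative:

```python
# Sketch: probe which optional PyDeequ backends are importable at runtime.
# The extra names (duckdb, spark) come from the [extras] table above; the
# probing logic itself is illustrative, not part of this change.
import importlib.util

for extra, module in [("duckdb", "duckdb"), ("spark", "pyspark")]:
    if importlib.util.find_spec(module) is None:
        print(f"{extra}: missing (pip install pydeequ[{extra}])")
    else:
        print(f"{extra}: installed")
```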
diff --git a/pydeequ/__init__.py b/pydeequ/__init__.py
index 6d2202f..f31b47f 100644
--- a/pydeequ/__init__.py
+++ b/pydeequ/__init__.py
@@ -14,6 +14,15 @@
"""
PyDeequ - Python API for Deequ data quality library.
+For PyDeequ 2.0 with DuckDB (no Spark required):
+ import duckdb
+ import pydeequ
+ from pydeequ.v2.analyzers import Size, Completeness
+
+ con = duckdb.connect()
+ con.execute("CREATE TABLE test AS SELECT 1 as id")
+ engine = pydeequ.connect(con, table="test")
+
For PyDeequ 2.0 (Spark Connect), use:
from pydeequ.v2 import VerificationSuite, Check, CheckLevel
from pydeequ.v2.predicates import eq, gte
@@ -22,8 +31,52 @@
from pydeequ import deequ_maven_coord
from pydeequ.checks import Check, CheckLevel
"""
+from typing import Any, Optional
+
__version__ = "2.0.0b1"
+
+def connect(
+ connection: Any,
+ table: Optional[str] = None,
+ dataframe: Optional[Any] = None,
+):
+ """
+ Create an engine from a connection object with auto-detection.
+
+ This function inspects the connection type and creates the appropriate
+ engine backend. It supports:
+ - DuckDB connections (duckdb.DuckDBPyConnection) - runs locally
+ - Spark sessions (pyspark.sql.SparkSession) - uses Spark Connect
+
+ Args:
+ connection: A database connection or Spark session
+ table: Table name for SQL-based backends (required for DuckDB)
+ dataframe: DataFrame for Spark backend (alternative to table)
+
+ Returns:
+ An engine instance appropriate for the connection type
+
+ Raises:
+ ValueError: If connection type is not supported
+
+ Example:
+ # DuckDB (local, no Spark required)
+ import duckdb
+ import pydeequ
+
+ con = duckdb.connect()
+ con.execute("CREATE TABLE reviews AS SELECT * FROM 'reviews.csv'")
+ engine = pydeequ.connect(con, table="reviews")
+
+ # Spark Connect
+ from pyspark.sql import SparkSession
+ spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
+ engine = pydeequ.connect(spark, dataframe=df)
+ """
+ from pydeequ.engines import connect as engines_connect
+ return engines_connect(connection, table=table, dataframe=dataframe)
+
# Legacy imports are deferred to avoid requiring SPARK_VERSION for V2 users.
# V2 users should import from pydeequ.v2 directly.
diff --git a/pydeequ/configs.py b/pydeequ/configs.py
index e56c97d..ba5e378 100644
--- a/pydeequ/configs.py
+++ b/pydeequ/configs.py
@@ -41,4 +41,4 @@ def _get_deequ_maven_config():
SPARK_VERSION = _get_spark_version()
DEEQU_MAVEN_COORD = _get_deequ_maven_config()
-IS_DEEQU_V1 = re.search("com\.amazon\.deequ\:deequ\:1.*", DEEQU_MAVEN_COORD) is not None
+IS_DEEQU_V1 = re.search(r"com\.amazon\.deequ\:deequ\:1.*", DEEQU_MAVEN_COORD) is not None
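The raw-string fix above is behavior-preserving: `\.` and `\:` are unknown string escapes that Python passes through unchanged (with a DeprecationWarning since 3.6, a SyntaxWarning since 3.12), so the pattern text is identical either way and the raw prefix just silences the warning. A quick check:

```python
import re

# Both spellings denote the same pattern text, so matching is unchanged;
# the raw prefix only stops the parse-time warning about unknown escapes.
assert r"com\.amazon\.deequ\:deequ\:1.*" == "com\\.amazon\\.deequ\\:deequ\\:1.*"
assert re.search(r"com\.amazon\.deequ\:deequ\:1.*", "com.amazon.deequ:deequ:1.2.0")
```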
diff --git a/pydeequ/engines/__init__.py b/pydeequ/engines/__init__.py
new file mode 100644
index 0000000..63a8327
--- /dev/null
+++ b/pydeequ/engines/__init__.py
@@ -0,0 +1,410 @@
+# -*- coding: utf-8 -*-
+"""
+Engine abstraction for PyDeequ.
+
+This module provides the engine abstraction layer that enables PyDeequ
+to work with different execution backends (Spark, DuckDB, etc.).
+
+Key design principles (inspired by DuckDQ):
+1. State computation is engine-dependent (SQL queries, Spark jobs)
+2. State merging is engine-independent (pure Python)
+3. This separation enables incremental validation and easy backend additions
+
+Example usage:
+ import duckdb
+ import pydeequ
+
+ # Auto-detection from connection type
+ con = duckdb.connect()
+ con.execute("CREATE TABLE test AS SELECT 1 as id, 2 as value")
+ engine = pydeequ.connect(con, table="test")
+
+ # Direct import
+ from pydeequ.engines.duckdb import DuckDBEngine
+ engine = DuckDBEngine(con, table="test")
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from enum import Enum
+from typing import (
+ TYPE_CHECKING,
+ Any,
+ Dict,
+ List,
+ Optional,
+ Sequence,
+ Union,
+)
+
+import pandas as pd
+
+if TYPE_CHECKING:
+ from pydeequ.v2.analyzers import _ConnectAnalyzer
+ from pydeequ.v2.checks import Check
+
+
+class ConstraintStatus(Enum):
+ """Status of a constraint evaluation."""
+ SUCCESS = "Success"
+ FAILURE = "Failure"
+
+ # Aliases for backwards compatibility
+ Success = "Success"
+ Failure = "Failure"
+
+
+class CheckStatus(Enum):
+ """Status of a check evaluation."""
+ SUCCESS = "Success"
+ WARNING = "Warning"
+ ERROR = "Error"
+
+ # Aliases for backwards compatibility
+ Success = "Success"
+ Warning = "Warning"
+ Error = "Error"
+
+
+@dataclass
+class MetricResult:
+ """Result of computing a metric."""
+ name: str
+ instance: str
+ entity: str
+ value: Optional[float]
+ success: bool = True
+ message: Optional[str] = None
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert to dictionary for DataFrame creation."""
+ return {
+ "name": self.name,
+ "instance": self.instance,
+ "entity": self.entity,
+ "value": self.value,
+ }
+
+
+@dataclass
+class ConstraintResult:
+ """Result of evaluating a constraint."""
+ check_description: str
+ check_level: str
+ check_status: Union[str, "CheckStatus"]
+ constraint: str
+ constraint_status: Union[str, "ConstraintStatus"]
+ constraint_message: Optional[str] = None
+
+ def __post_init__(self):
+ """Convert string status values to enum values."""
+ # Handle check_status
+ if isinstance(self.check_status, str):
+ for status in CheckStatus:
+ if status.value == self.check_status:
+ self.check_status = status
+ break
+ # Handle constraint_status
+ if isinstance(self.constraint_status, str):
+ for status in ConstraintStatus:
+ if status.value == self.constraint_status:
+ self.constraint_status = status
+ break
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert to dictionary for DataFrame creation."""
+ check_status_val = self.check_status.value if isinstance(self.check_status, CheckStatus) else self.check_status
+ constraint_status_val = self.constraint_status.value if isinstance(self.constraint_status, ConstraintStatus) else self.constraint_status
+ return {
+ "check": self.check_description,
+ "check_level": self.check_level,
+ "check_status": check_status_val,
+ "constraint": self.constraint,
+ "constraint_status": constraint_status_val,
+ "constraint_message": self.constraint_message or "",
+ }
+
+
+@dataclass
+class ColumnProfile:
+ """Profile of a single column."""
+ column: str
+ completeness: float
+ approx_distinct_values: int
+ data_type: str
+ is_data_type_inferred: bool = True
+ type_counts: Optional[str] = None
+ histogram: Optional[str] = None
+ mean: Optional[float] = None
+ minimum: Optional[float] = None
+ maximum: Optional[float] = None
+ sum: Optional[float] = None
+ std_dev: Optional[float] = None
+ approx_percentiles: Optional[str] = None
+ kll_buckets: Optional[str] = None
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert to dictionary for DataFrame creation."""
+ return {
+ "column": self.column,
+ "completeness": self.completeness,
+ "approx_distinct_values": self.approx_distinct_values,
+ "data_type": self.data_type,
+ "is_data_type_inferred": self.is_data_type_inferred,
+ "type_counts": self.type_counts,
+ "histogram": self.histogram,
+ "mean": self.mean,
+ "minimum": self.minimum,
+ "maximum": self.maximum,
+ "sum": self.sum,
+ "std_dev": self.std_dev,
+ "approx_percentiles": self.approx_percentiles,
+ "kll_buckets": self.kll_buckets,
+ }
+
+
+@dataclass
+class ConstraintSuggestion:
+ """A suggested constraint."""
+ column_name: str
+ constraint_name: str
+ current_value: Optional[str]
+ description: str
+ suggesting_rule: str
+ code_for_constraint: str
+
+ def to_dict(self) -> Dict[str, Any]:
+ """Convert to dictionary for DataFrame creation."""
+ return {
+ "column_name": self.column_name,
+ "constraint_name": self.constraint_name,
+ "current_value": self.current_value,
+ "description": self.description,
+ "suggesting_rule": self.suggesting_rule,
+ "code_for_constraint": self.code_for_constraint,
+ }
+
+
+class BaseEngine(ABC):
+ """
+ Abstract base class for execution engines.
+
+ Engines are responsible for:
+ 1. Computing metrics from data (engine-dependent)
+ 2. Evaluating constraints against computed metrics
+ 3. Profiling columns
+ 4. Suggesting constraints
+
+ Subclasses must implement the core computation methods for their
+ specific backend (DuckDB, Spark, etc.).
+ """
+
+ @abstractmethod
+ def compute_metrics(
+ self, analyzers: Sequence["_ConnectAnalyzer"]
+ ) -> List[MetricResult]:
+ """
+ Compute metrics for the given analyzers.
+
+ Args:
+ analyzers: Sequence of analyzers to compute metrics for
+
+ Returns:
+ List of MetricResult objects
+ """
+ pass
+
+ @abstractmethod
+ def run_checks(self, checks: Sequence["Check"]) -> List[ConstraintResult]:
+ """
+ Run verification checks and return constraint results.
+
+ Args:
+ checks: Sequence of Check objects to evaluate
+
+ Returns:
+ List of ConstraintResult objects
+ """
+ pass
+
+ @abstractmethod
+ def profile_columns(
+ self,
+ columns: Optional[Sequence[str]] = None,
+ low_cardinality_threshold: int = 0,
+ ) -> List[ColumnProfile]:
+ """
+ Profile columns in the data source.
+
+ Args:
+ columns: Optional list of columns to profile. If None, profile all.
+ low_cardinality_threshold: Threshold for histogram computation
+
+ Returns:
+ List of ColumnProfile objects
+ """
+ pass
+
+ @abstractmethod
+ def suggest_constraints(
+ self,
+ columns: Optional[Sequence[str]] = None,
+ rules: Optional[Sequence[str]] = None,
+ ) -> List[ConstraintSuggestion]:
+ """
+ Suggest constraints based on data characteristics.
+
+ Args:
+ columns: Optional list of columns to analyze
+ rules: Optional list of rule sets to apply
+
+ Returns:
+ List of ConstraintSuggestion objects
+ """
+ pass
+
+ @abstractmethod
+ def get_schema(self) -> Dict[str, str]:
+ """
+ Get the schema of the data source.
+
+ Returns:
+ Dictionary mapping column names to data types
+ """
+ pass
+
+ def metrics_to_dataframe(self, metrics: List[MetricResult]) -> pd.DataFrame:
+ """Convert metrics to a pandas DataFrame."""
+ if not metrics:
+ return pd.DataFrame(columns=["name", "instance", "entity", "value"])
+ return pd.DataFrame([m.to_dict() for m in metrics])
+
+ def constraints_to_dataframe(
+ self, results: List[ConstraintResult]
+ ) -> pd.DataFrame:
+ """Convert constraint results to a pandas DataFrame."""
+ if not results:
+ return pd.DataFrame(
+ columns=[
+ "check", "check_level", "check_status",
+ "constraint", "constraint_status", "constraint_message"
+ ]
+ )
+ return pd.DataFrame([r.to_dict() for r in results])
+
+ def profiles_to_dataframe(self, profiles: List[ColumnProfile]) -> pd.DataFrame:
+ """Convert column profiles to a pandas DataFrame."""
+ if not profiles:
+            return pd.DataFrame(
+                columns=[
+                    "column", "completeness", "approx_distinct_values",
+                    "data_type", "is_data_type_inferred", "type_counts",
+                    "histogram", "mean", "minimum", "maximum", "sum",
+                    "std_dev", "approx_percentiles", "kll_buckets",
+                ]
+            )
+ return pd.DataFrame([p.to_dict() for p in profiles])
+
+ def suggestions_to_dataframe(
+ self, suggestions: List[ConstraintSuggestion]
+ ) -> pd.DataFrame:
+ """Convert suggestions to a pandas DataFrame."""
+ if not suggestions:
+ return pd.DataFrame(
+ columns=[
+ "column_name", "constraint_name", "current_value",
+ "description", "suggesting_rule", "code_for_constraint"
+ ]
+ )
+ return pd.DataFrame([s.to_dict() for s in suggestions])
+
+
+def connect(
+ connection: Any,
+ table: Optional[str] = None,
+ dataframe: Optional[Any] = None,
+) -> BaseEngine:
+ """
+ Create an engine from a connection object with auto-detection.
+
+ This function inspects the connection type and creates the appropriate
+ engine backend. It supports:
+ - DuckDB connections (duckdb.DuckDBPyConnection)
+ - Spark sessions (pyspark.sql.SparkSession) - wraps existing v2 API
+
+ Args:
+ connection: A database connection or Spark session
+ table: Table name for SQL-based backends
+ dataframe: DataFrame for Spark backend (alternative to table)
+
+ Returns:
+ An engine instance appropriate for the connection type
+
+ Raises:
+ ValueError: If connection type is not supported
+
+ Example:
+ import duckdb
+ import pydeequ
+
+ con = duckdb.connect()
+ con.execute("CREATE TABLE reviews AS SELECT * FROM 'reviews.csv'")
+ engine = pydeequ.connect(con, table="reviews")
+ """
+ connection_type = type(connection).__name__
+ connection_module = type(connection).__module__
+
+ # Try DuckDB
+ if "duckdb" in connection_module.lower():
+ try:
+ import duckdb
+ if isinstance(connection, duckdb.DuckDBPyConnection):
+ if table is None:
+ raise ValueError("table parameter is required for DuckDB connections")
+ from pydeequ.engines.duckdb import DuckDBEngine
+ return DuckDBEngine(connection, table)
+ except ImportError:
+ raise ImportError(
+ "DuckDB backend requires the 'duckdb' package. "
+ "Install it with: pip install pydeequ[duckdb]"
+ ) from None
+
+ # Try Spark
+ if "pyspark" in connection_module.lower() or "spark" in connection_type.lower():
+ try:
+ from pyspark.sql import SparkSession
+ if isinstance(connection, SparkSession):
+ from pydeequ.engines.spark import SparkEngine
+ return SparkEngine(connection, table=table, dataframe=dataframe)
+ except ImportError:
+ raise ImportError(
+ "Spark backend requires the 'pyspark' package. "
+ "Install it with: pip install pydeequ[spark]"
+ ) from None
+
+ raise ValueError(
+ f"Unsupported connection type: {connection_type}. "
+ "Supported types:\n"
+ " - duckdb.DuckDBPyConnection (pip install pydeequ[duckdb])\n"
+ " - pyspark.sql.SparkSession (pip install pydeequ[spark])"
+ )
+
+
+# Export public API
+__all__ = [
+ # Base classes
+ "BaseEngine",
+ # Result types
+ "MetricResult",
+ "ConstraintResult",
+ "ConstraintStatus",
+ "CheckStatus",
+ "ColumnProfile",
+ "ConstraintSuggestion",
+ # Factory function
+ "connect",
+]
+
+
+# Lazy import for DuckDB config to avoid import errors when duckdb is not installed
+def __getattr__(name: str) -> Any:
+ if name == "DuckDBEngineConfig":
+ from pydeequ.engines.duckdb_config import DuckDBEngineConfig
+ return DuckDBEngineConfig
+ raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
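A minimal round trip through this module, as a sketch. The `Size` and `Completeness` analyzers are named in the module docstring, but their constructor signatures are assumptions here, not taken from this diff:

```python
# Sketch: end-to-end use of the abstractions defined in this file.
import duckdb
import pydeequ
from pydeequ.v2.analyzers import Completeness, Size  # named in the module docstring

con = duckdb.connect()
con.execute("CREATE TABLE test AS SELECT 1 AS id, 2 AS value")
engine = pydeequ.connect(con, table="test")  # dispatches to DuckDBEngine

# compute_metrics is engine-dependent; the DataFrame helper is shared.
metrics = engine.compute_metrics([Size(), Completeness("id")])
print(engine.metrics_to_dataframe(metrics))

# The lowercase enum members really are aliases: Enum collapses same-valued
# members onto the first definition, so both spellings are the same object.
from pydeequ.engines import CheckStatus
assert CheckStatus.Success is CheckStatus.SUCCESS
```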
diff --git a/pydeequ/engines/constraints/__init__.py b/pydeequ/engines/constraints/__init__.py
new file mode 100644
index 0000000..a93b3bf
--- /dev/null
+++ b/pydeequ/engines/constraints/__init__.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+"""
+Constraint evaluator abstractions for data quality checks.
+
+This module provides a constraint evaluator pattern that:
+1. Encapsulates constraint evaluation logic in self-contained classes
+2. Separates value computation from assertion evaluation
+3. Provides consistent WHERE clause handling
+4. Enables easy addition of new constraint types
+
+Architecture:
+ Protocols (Contracts)
+ └── ConstraintEvaluatorProtocol - Defines evaluator contract
+
+ Base Classes (Hierarchy)
+ ├── BaseEvaluator - Base with WHERE clause and assertion handling
+ ├── RatioCheckEvaluator - For match/total ratio constraints
+ └── AnalyzerBasedEvaluator - Delegates to analyzer operators
+
+ Evaluator Implementations
+ ├── Analyzer-based (SizeEvaluator, CompletenessEvaluator, etc.)
+ ├── Ratio-check (IsPositiveEvaluator, IsContainedInEvaluator, etc.)
+ ├── Comparison (ColumnComparisonEvaluator)
+ └── Multi-column (MultiColumnCompletenessEvaluator)
+
+ Factory
+ └── ConstraintEvaluatorFactory - Creates evaluators from protobufs
+
+Example usage:
+ from pydeequ.engines.constraints import ConstraintEvaluatorFactory
+
+ # Create evaluator from constraint protobuf
+ evaluator = ConstraintEvaluatorFactory.create(constraint_proto)
+
+ if evaluator:
+ # Compute the metric value
+ value = evaluator.compute_value(table, execute_fn)
+
+ # Evaluate the assertion
+ passed = evaluator.evaluate(value)
+
+ # Get human-readable description
+ description = evaluator.to_string()
+"""
+
+from pydeequ.engines.constraints.base import (
+ AnalyzerBasedEvaluator,
+ BaseEvaluator,
+ RatioCheckEvaluator,
+)
+from pydeequ.engines.constraints.batch_evaluator import (
+ ConstraintBatchEvaluator,
+ SCAN_BASED_EVALUATORS,
+)
+from pydeequ.engines.constraints.evaluators import (
+ ApproxCountDistinctEvaluator,
+ ApproxQuantileEvaluator,
+ ColumnComparisonEvaluator,
+ CompletenessEvaluator,
+ ComplianceEvaluator,
+ ContainsCreditCardEvaluator,
+ ContainsEmailEvaluator,
+ ContainsSSNEvaluator,
+ ContainsURLEvaluator,
+ CorrelationEvaluator,
+ DistinctnessEvaluator,
+ EntropyEvaluator,
+ IsContainedInEvaluator,
+ IsNonNegativeEvaluator,
+ IsPositiveEvaluator,
+ MaximumEvaluator,
+ MaxLengthEvaluator,
+ MeanEvaluator,
+ MinimumEvaluator,
+ MinLengthEvaluator,
+ MultiColumnCompletenessEvaluator,
+ MutualInformationEvaluator,
+ PatternMatchEvaluator,
+ SizeEvaluator,
+ StandardDeviationEvaluator,
+ SumEvaluator,
+ UniquenessEvaluator,
+ UniqueValueRatioEvaluator,
+)
+from pydeequ.engines.constraints.factory import ConstraintEvaluatorFactory
+from pydeequ.engines.constraints.protocols import ConstraintEvaluatorProtocol
+
+__all__ = [
+ # Protocols
+ "ConstraintEvaluatorProtocol",
+ # Base classes
+ "BaseEvaluator",
+ "RatioCheckEvaluator",
+ "AnalyzerBasedEvaluator",
+ # Batch evaluator
+ "ConstraintBatchEvaluator",
+ "SCAN_BASED_EVALUATORS",
+ # Analyzer-based evaluators
+ "SizeEvaluator",
+ "CompletenessEvaluator",
+ "MeanEvaluator",
+ "MinimumEvaluator",
+ "MaximumEvaluator",
+ "SumEvaluator",
+ "StandardDeviationEvaluator",
+ "UniquenessEvaluator",
+ "DistinctnessEvaluator",
+ "UniqueValueRatioEvaluator",
+ "CorrelationEvaluator",
+ "EntropyEvaluator",
+ "MutualInformationEvaluator",
+ "PatternMatchEvaluator",
+ "MinLengthEvaluator",
+ "MaxLengthEvaluator",
+ "ApproxCountDistinctEvaluator",
+ "ApproxQuantileEvaluator",
+ "ComplianceEvaluator",
+ # Ratio-check evaluators
+ "IsPositiveEvaluator",
+ "IsNonNegativeEvaluator",
+ "IsContainedInEvaluator",
+ "ContainsEmailEvaluator",
+ "ContainsURLEvaluator",
+ "ContainsCreditCardEvaluator",
+ "ContainsSSNEvaluator",
+ # Comparison evaluators
+ "ColumnComparisonEvaluator",
+ # Multi-column evaluators
+ "MultiColumnCompletenessEvaluator",
+ # Factory
+ "ConstraintEvaluatorFactory",
+]
diff --git a/pydeequ/engines/constraints/base.py b/pydeequ/engines/constraints/base.py
new file mode 100644
index 0000000..abcd32e
--- /dev/null
+++ b/pydeequ/engines/constraints/base.py
@@ -0,0 +1,271 @@
+# -*- coding: utf-8 -*-
+"""
+Base classes for constraint evaluators.
+
+This module provides the abstract base classes that combine mixins
+to create the foundation for concrete evaluator implementations.
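+
+Example (minimal sketch of a custom ratio-check evaluator; the constraint
+type name is hypothetical):
+
+    class IsNonZeroEvaluator(RatioCheckEvaluator):
+        def get_condition(self) -> str:
+            return f"{self.column} != 0"
+
+        def to_string(self) -> str:
+            return f"isNonZero({self.column})"
+
+    # compute_value() and evaluate() are inherited from RatioCheckEvaluator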
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Callable, List, Optional
+
+from pydeequ.engines.operators.mixins import SafeExtractMixin, WhereClauseMixin
+
+if TYPE_CHECKING:
+ import pandas as pd
+ from pydeequ.v2.predicates import Predicate
+
+
+class BaseEvaluator(WhereClauseMixin, SafeExtractMixin, ABC):
+ """
+ Base class for all constraint evaluators.
+
+ Provides shared functionality for WHERE clause handling,
+ assertion parsing, and predicate evaluation.
+
+ Attributes:
+ column: Optional column name for single-column constraints
+ columns: List of column names for multi-column constraints
+ where: Optional SQL WHERE clause for filtering
+ assertion: Parsed predicate for evaluation
+ """
+
+ def __init__(self, constraint_proto):
+ """
+ Initialize evaluator from constraint protobuf.
+
+ Args:
+ constraint_proto: Protobuf message containing constraint definition
+ """
+ self.column = constraint_proto.column if constraint_proto.column else None
+ self.columns = list(constraint_proto.columns) if constraint_proto.columns else []
+ self.where = constraint_proto.where if constraint_proto.where else None
+ self.assertion = self._parse_assertion(constraint_proto)
+ self._constraint_type = constraint_proto.type
+
+ @property
+ def constraint_type(self) -> str:
+ """Return the constraint type identifier."""
+ return self._constraint_type
+
+ def _parse_assertion(self, constraint_proto) -> Optional["Predicate"]:
+ """
+ Parse assertion predicate from constraint protobuf.
+
+ Args:
+ constraint_proto: Protobuf message containing constraint definition
+
+ Returns:
+ Parsed predicate or None if no assertion specified
+ """
+ from pydeequ.v2.proto import deequ_connect_pb2 as proto
+
+ if not constraint_proto.HasField("assertion"):
+ return None
+
+ pred_msg = constraint_proto.assertion
+
+ if pred_msg.operator == proto.PredicateMessage.Operator.BETWEEN:
+ from pydeequ.v2.predicates import Between
+ return Between(pred_msg.lower_bound, pred_msg.upper_bound)
+ else:
+ from pydeequ.v2.predicates import Comparison
+ return Comparison(pred_msg.operator, pred_msg.value)
+
+ def _evaluate_predicate(self, value: float, assertion: "Predicate") -> bool:
+ """
+ Evaluate a predicate against a value.
+
+ Args:
+ value: The value to check
+ assertion: The predicate to evaluate
+
+ Returns:
+ True if the value satisfies the predicate
+ """
+ from pydeequ.v2.predicates import Between, Comparison
+ from pydeequ.v2.proto import deequ_connect_pb2 as proto
+
+ if isinstance(assertion, Comparison):
+ op = assertion.operator
+ target = assertion.value
+
+ if op == proto.PredicateMessage.Operator.EQ:
+ return abs(value - target) < 1e-9
+ elif op == proto.PredicateMessage.Operator.NE:
+ return abs(value - target) >= 1e-9
+ elif op == proto.PredicateMessage.Operator.GT:
+ return value > target
+ elif op == proto.PredicateMessage.Operator.GE:
+ return value >= target
+ elif op == proto.PredicateMessage.Operator.LT:
+ return value < target
+ elif op == proto.PredicateMessage.Operator.LE:
+ return value <= target
+
+ elif isinstance(assertion, Between):
+ return assertion.lower <= value <= assertion.upper
+
+ return False
+
+ def evaluate(self, value: Optional[float]) -> bool:
+ """
+ Evaluate whether the computed value satisfies the constraint.
+
+ Args:
+ value: The computed metric value
+
+ Returns:
+ True if the constraint is satisfied, False otherwise
+ """
+ if value is None:
+ return False
+
+ if self.assertion:
+ return self._evaluate_predicate(value, self.assertion)
+
+        # Default: value must equal 1.0 (for completeness-like constraints);
+        # use the same tolerance as the EQ predicate to avoid float noise
+        return abs(value - 1.0) < 1e-9
+
+ @abstractmethod
+ def compute_value(
+ self, table: str, execute_fn: Callable[[str], "pd.DataFrame"]
+ ) -> Optional[float]:
+ """
+ Compute the metric value for this constraint.
+
+ Args:
+ table: Name of the table to query
+ execute_fn: Function to execute SQL and return DataFrame
+
+ Returns:
+ Computed metric value, or None if computation fails
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def to_string(self) -> str:
+ """
+ Return a human-readable string representation of the constraint.
+
+ Returns:
+ Description of what the constraint checks
+ """
+ raise NotImplementedError
+
+
+class RatioCheckEvaluator(BaseEvaluator):
+ """
+ Base class for constraints that compute matches/total ratio.
+
+ These constraints check what fraction of rows satisfy some condition,
+ such as isPositive, isNonNegative, isContainedIn, etc.
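+
+    For example, isPositive on column price with no WHERE clause produces
+    a query of this shape:
+
+        SELECT
+            COUNT(*) as total,
+            SUM(CASE WHEN price > 0 THEN 1 ELSE 0 END) as matches
+        FROM <table>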
+ """
+
+ @abstractmethod
+ def get_condition(self) -> str:
+ """
+ Get the SQL condition that defines a 'match'.
+
+ Returns:
+ SQL boolean expression for the match condition
+ """
+ raise NotImplementedError
+
+ def compute_value(
+ self, table: str, execute_fn: Callable[[str], "pd.DataFrame"]
+ ) -> Optional[float]:
+ """
+ Compute the fraction of rows matching the condition.
+
+ Args:
+ table: Name of the table to query
+ execute_fn: Function to execute SQL and return DataFrame
+
+ Returns:
+ Ratio of matching rows to total rows
+ """
+ condition = self.get_condition()
+
+ if self.where:
+ query = f"""
+ SELECT
+ SUM(CASE WHEN {self.where} THEN 1 ELSE 0 END) as total,
+ SUM(CASE WHEN ({self.where}) AND ({condition}) THEN 1 ELSE 0 END) as matches
+ FROM {table}
+ """
+ else:
+ query = f"""
+ SELECT
+ COUNT(*) as total,
+ SUM(CASE WHEN {condition} THEN 1 ELSE 0 END) as matches
+ FROM {table}
+ """
+
+ result = execute_fn(query)
+ total = self.safe_float(result, "total") or 0
+ matches = self.safe_float(result, "matches") or 0
+
+ if total == 0:
+ return 1.0
+ return matches / total
+
+
+class AnalyzerBasedEvaluator(BaseEvaluator):
+ """
+ Base class for constraints that delegate to an analyzer operator.
+
+ These constraints compute their value by creating and running
+ the corresponding analyzer operator.
+ """
+
+ @abstractmethod
+ def get_operator(self):
+ """
+ Get the operator instance to compute the metric.
+
+ Returns:
+ Operator instance (ScanOperator or GroupingOperator)
+ """
+ raise NotImplementedError
+
+ def compute_value(
+ self, table: str, execute_fn: Callable[[str], "pd.DataFrame"]
+ ) -> Optional[float]:
+ """
+ Compute the metric value using the analyzer operator.
+
+ Args:
+ table: Name of the table to query
+ execute_fn: Function to execute SQL and return DataFrame
+
+ Returns:
+ Computed metric value
+ """
+ operator = self.get_operator()
+
+ # Check if it's a scan or grouping operator
+ if hasattr(operator, "get_aggregations"):
+ # Scan operator
+ aggregations = operator.get_aggregations()
+ query = f"SELECT {', '.join(aggregations)} FROM {table}"
+ result = execute_fn(query)
+ metric_result = operator.extract_result(result)
+ return metric_result.value
+ elif hasattr(operator, "build_query"):
+ # Grouping operator
+ query = operator.build_query(table)
+ result = execute_fn(query)
+ metric_result = operator.extract_result(result)
+ return metric_result.value
+
+ return None
+
+
+__all__ = [
+ "BaseEvaluator",
+ "RatioCheckEvaluator",
+ "AnalyzerBasedEvaluator",
+]
diff --git a/pydeequ/engines/constraints/batch_evaluator.py b/pydeequ/engines/constraints/batch_evaluator.py
new file mode 100644
index 0000000..14eea45
--- /dev/null
+++ b/pydeequ/engines/constraints/batch_evaluator.py
@@ -0,0 +1,298 @@
+# -*- coding: utf-8 -*-
+"""
+Constraint batch evaluation for DuckDB performance optimization.
+
+This module provides functionality to batch constraint evaluations that can
+share SQL queries, reducing the number of queries executed.
+
+Key optimizations:
+1. Scan-based constraints (Size, Mean, Completeness, etc.) can be batched
+ when they use scan operators with compatible aggregations.
+2. Ratio-check constraints (isPositive, isNonNegative, isContainedIn, etc.)
+ can be batched into a single query when they operate on the same table.
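+
+Example usage (sketch; assumes evaluators built via ConstraintEvaluatorFactory
+and an execute_fn that runs SQL against DuckDB and returns a DataFrame):
+
+    from pydeequ.engines.constraints import ConstraintBatchEvaluator
+
+    batcher = ConstraintBatchEvaluator(evaluators)
+    batcher.get_batch_info()  # e.g. {"scan_based": 3, "ratio_checks": 2, "other": 0}
+    values = batcher.execute("my_table", execute_fn)
+    passed = {e.to_string(): e.evaluate(v) for e, v in values.items()}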
+"""
+
+from __future__ import annotations
+
+from typing import Callable, Dict, List, Optional, Tuple, Type
+
+import pandas as pd
+
+from pydeequ.engines.constraints.base import (
+ AnalyzerBasedEvaluator,
+ BaseEvaluator,
+ RatioCheckEvaluator,
+)
+from pydeequ.engines.constraints.evaluators import (
+ CompletenessEvaluator,
+ MaximumEvaluator,
+ MeanEvaluator,
+ MinimumEvaluator,
+ SizeEvaluator,
+ StandardDeviationEvaluator,
+ SumEvaluator,
+)
+
+
+# Evaluators that use scan operators (can be batched via aggregations)
+SCAN_BASED_EVALUATORS: Tuple[Type[AnalyzerBasedEvaluator], ...] = (
+ SizeEvaluator,
+ CompletenessEvaluator,
+ MeanEvaluator,
+ MinimumEvaluator,
+ MaximumEvaluator,
+ SumEvaluator,
+ StandardDeviationEvaluator,
+)
+
+
+class ConstraintBatchEvaluator:
+ """
+ Batches constraint evaluations to minimize SQL queries.
+
+ This class groups constraints by their evaluation pattern and executes
+ them in batches where possible:
+ - Scan-based evaluators: batched into single aggregation queries
+ - Ratio-check evaluators: batched into single ratio queries
+ - Other evaluators: executed individually
+ """
+
+ def __init__(self, evaluators: List[BaseEvaluator]):
+ """
+ Initialize the batch evaluator.
+
+ Args:
+ evaluators: List of constraint evaluators
+ """
+ self.evaluators = evaluators
+ self._scan_based: List[AnalyzerBasedEvaluator] = []
+ self._ratio_checks: List[RatioCheckEvaluator] = []
+ self._other: List[BaseEvaluator] = []
+ self._analyze()
+
+ def _analyze(self) -> None:
+ """Categorize evaluators by type for batching."""
+ for evaluator in self.evaluators:
+ if isinstance(evaluator, SCAN_BASED_EVALUATORS):
+ self._scan_based.append(evaluator)
+ elif isinstance(evaluator, RatioCheckEvaluator):
+ self._ratio_checks.append(evaluator)
+ else:
+ self._other.append(evaluator)
+
+ def get_batch_info(self) -> Dict[str, int]:
+ """Return batch grouping information for debugging."""
+ return {
+ "scan_based": len(self._scan_based),
+ "ratio_checks": len(self._ratio_checks),
+ "other": len(self._other),
+ }
+
+ def execute(
+ self,
+ table: str,
+ execute_fn: Callable[[str], "pd.DataFrame"],
+ ) -> Dict[BaseEvaluator, Optional[float]]:
+ """
+ Execute all evaluators with batching optimization.
+
+ Args:
+ table: Name of the table to query
+ execute_fn: Function to execute SQL and return DataFrame
+
+ Returns:
+ Dictionary mapping evaluators to their computed values
+ """
+ results: Dict[BaseEvaluator, Optional[float]] = {}
+
+ # Batch scan-based evaluators
+ if self._scan_based:
+ scan_results = self._execute_scan_batch(table, execute_fn)
+ results.update(scan_results)
+
+ # Batch ratio-check evaluators
+ if self._ratio_checks:
+ ratio_results = self._execute_ratio_batch(table, execute_fn)
+ results.update(ratio_results)
+
+ # Execute other evaluators individually
+ for evaluator in self._other:
+ try:
+ value = evaluator.compute_value(table, execute_fn)
+ results[evaluator] = value
+ except Exception:
+ results[evaluator] = None
+
+ return results
+
+ def _execute_scan_batch(
+ self,
+ table: str,
+ execute_fn: Callable[[str], "pd.DataFrame"],
+ ) -> Dict[BaseEvaluator, Optional[float]]:
+ """
+ Execute scan-based evaluators in a single batched query.
+
+ Args:
+ table: Name of the table to query
+ execute_fn: Function to execute SQL and return DataFrame
+
+ Returns:
+ Dictionary mapping evaluators to their computed values
+ """
+ results: Dict[BaseEvaluator, Optional[float]] = {}
+
+ # Collect all aggregations from scan operators
+ operators = []
+ operator_to_evaluator = {}
+
+ for evaluator in self._scan_based:
+ operator = evaluator.get_operator()
+ if operator and hasattr(operator, "get_aggregations"):
+ operators.append(operator)
+ operator_to_evaluator[id(operator)] = evaluator
+
+ if not operators:
+ # Fall back to individual execution
+ for evaluator in self._scan_based:
+ try:
+ value = evaluator.compute_value(table, execute_fn)
+ results[evaluator] = value
+ except Exception:
+ results[evaluator] = None
+ return results
+
+ # Build batched query
+ aggregations = []
+ for operator in operators:
+ aggregations.extend(operator.get_aggregations())
+
+ query = f"SELECT {', '.join(aggregations)} FROM {table}"
+
+ try:
+ df = execute_fn(query)
+
+ # Extract results for each operator
+ for operator in operators:
+ evaluator = operator_to_evaluator[id(operator)]
+ try:
+ metric_result = operator.extract_result(df)
+ results[evaluator] = metric_result.value
+ except Exception:
+ results[evaluator] = None
+
+ except Exception:
+ # Fall back to individual execution on batch failure
+ for evaluator in self._scan_based:
+ try:
+ value = evaluator.compute_value(table, execute_fn)
+ results[evaluator] = value
+ except Exception:
+ results[evaluator] = None
+
+ return results
+
+ def _execute_ratio_batch(
+ self,
+ table: str,
+ execute_fn: Callable[[str], "pd.DataFrame"],
+ ) -> Dict[BaseEvaluator, Optional[float]]:
+ """
+ Execute ratio-check evaluators in a single batched query.
+
+ Args:
+ table: Name of the table to query
+ execute_fn: Function to execute SQL and return DataFrame
+
+ Returns:
+ Dictionary mapping evaluators to their computed values
+ """
+ results: Dict[BaseEvaluator, Optional[float]] = {}
+
+ # Group evaluators by WHERE clause for proper batching
+ where_groups: Dict[Optional[str], List[RatioCheckEvaluator]] = {}
+ for evaluator in self._ratio_checks:
+ where = getattr(evaluator, "where", None)
+ if where not in where_groups:
+ where_groups[where] = []
+ where_groups[where].append(evaluator)
+
+ # Execute each where-group as a batch
+ for where, group_evaluators in where_groups.items():
+ try:
+ group_results = self._execute_ratio_group(
+ table, execute_fn, group_evaluators, where
+ )
+ results.update(group_results)
+ except Exception:
+ # Fall back to individual execution
+ for evaluator in group_evaluators:
+ try:
+ value = evaluator.compute_value(table, execute_fn)
+ results[evaluator] = value
+ except Exception:
+ results[evaluator] = None
+
+ return results
+
+ def _execute_ratio_group(
+ self,
+ table: str,
+ execute_fn: Callable[[str], "pd.DataFrame"],
+ evaluators: List[RatioCheckEvaluator],
+ where: Optional[str],
+ ) -> Dict[BaseEvaluator, Optional[float]]:
+ """
+ Execute a group of ratio-check evaluators with the same WHERE clause.
+
+ Args:
+ table: Name of the table to query
+ execute_fn: Function to execute SQL and return DataFrame
+ evaluators: List of ratio-check evaluators
+ where: WHERE clause (None if no filter)
+
+ Returns:
+ Dictionary mapping evaluators to their computed values
+ """
+ results: Dict[BaseEvaluator, Optional[float]] = {}
+
+        # Build one ratio aggregation per evaluator; when a WHERE clause is
+        # present, both the total and each match count are restricted to the
+        # filtered rows.
+        cases = []
+        for i, evaluator in enumerate(evaluators):
+            condition = evaluator.get_condition()
+            if where:
+                cases.append(
+                    f"SUM(CASE WHEN ({where}) AND ({condition}) THEN 1 ELSE 0 END) as matches_{i}"
+                )
+            else:
+                cases.append(f"SUM(CASE WHEN {condition} THEN 1 ELSE 0 END) as matches_{i}")
+
+        total_expr = (
+            f"SUM(CASE WHEN {where} THEN 1 ELSE 0 END) as total"
+            if where
+            else "COUNT(*) as total"
+        )
+        query = f"SELECT {total_expr}, {', '.join(cases)} FROM {table}"
+
+        df = execute_fn(query)
+
+        # SUM over zero qualifying rows yields NULL (NaN in pandas); treat as 0
+        total_raw = df["total"].iloc[0]
+        total = float(total_raw) if pd.notna(total_raw) else 0.0
+
+        for i, evaluator in enumerate(evaluators):
+            match_raw = df[f"matches_{i}"].iloc[0]
+            matches = float(match_raw) if pd.notna(match_raw) else 0.0
+            if total == 0:
+                results[evaluator] = 1.0
+            else:
+                results[evaluator] = matches / total
+
+ return results
+
+
+__all__ = [
+ "ConstraintBatchEvaluator",
+ "SCAN_BASED_EVALUATORS",
+]
diff --git a/pydeequ/engines/constraints/evaluators.py b/pydeequ/engines/constraints/evaluators.py
new file mode 100644
index 0000000..2ac5650
--- /dev/null
+++ b/pydeequ/engines/constraints/evaluators.py
@@ -0,0 +1,494 @@
+# -*- coding: utf-8 -*-
+"""
+Constraint evaluator implementations.
+
+This module contains all concrete evaluator classes that implement
+specific constraint types.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable, Optional
+
+from pydeequ.engines.constraints.base import (
+ AnalyzerBasedEvaluator,
+ BaseEvaluator,
+ RatioCheckEvaluator,
+)
+from pydeequ.engines.operators import (
+ ApproxCountDistinctOperator,
+ ApproxQuantileOperator,
+ CompletenessOperator,
+ ComplianceOperator,
+ CorrelationOperator,
+ DistinctnessOperator,
+ EntropyOperator,
+ MaximumOperator,
+ MaxLengthOperator,
+ MeanOperator,
+ MinimumOperator,
+ MinLengthOperator,
+ MutualInformationOperator,
+ PatternMatchOperator,
+ SizeOperator,
+ StandardDeviationOperator,
+ SumOperator,
+ UniqueValueRatioOperator,
+ UniquenessOperator,
+)
+
+if TYPE_CHECKING:
+ import pandas as pd
+
+
+# =============================================================================
+# Analyzer-based evaluators
+# =============================================================================
+
+
+class SizeEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasSize constraint."""
+
+ def get_operator(self):
+ return SizeOperator(where=self.where)
+
+ def to_string(self) -> str:
+ if self.assertion:
+ return f"hasSize(assertion)"
+ return "hasSize()"
+
+
+class CompletenessEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for isComplete and hasCompleteness constraints."""
+
+ def get_operator(self):
+ return CompletenessOperator(self.column, where=self.where)
+
+ def to_string(self) -> str:
+ if self.assertion:
+ return f"hasCompleteness({self.column}, assertion)"
+ return f"isComplete({self.column})"
+
+
+class MeanEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasMean constraint."""
+
+ def get_operator(self):
+ return MeanOperator(self.column, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasMean({self.column}, assertion)"
+
+
+class MinimumEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasMin constraint."""
+
+ def get_operator(self):
+ return MinimumOperator(self.column, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasMin({self.column}, assertion)"
+
+
+class MaximumEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasMax constraint."""
+
+ def get_operator(self):
+ return MaximumOperator(self.column, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasMax({self.column}, assertion)"
+
+
+class SumEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasSum constraint."""
+
+ def get_operator(self):
+ return SumOperator(self.column, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasSum({self.column}, assertion)"
+
+
+class StandardDeviationEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasStandardDeviation constraint."""
+
+ def get_operator(self):
+ return StandardDeviationOperator(self.column, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasStandardDeviation({self.column}, assertion)"
+
+
+class UniquenessEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for isUnique and hasUniqueness constraints."""
+
+ def get_operator(self):
+ cols = self.columns if self.columns else [self.column]
+ return UniquenessOperator(cols, where=self.where)
+
+ def to_string(self) -> str:
+ cols = self.columns if self.columns else [self.column]
+ col_str = ", ".join(cols)
+ if self.assertion:
+ return f"hasUniqueness({col_str}, assertion)"
+ return f"isUnique({col_str})"
+
+
+class DistinctnessEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasDistinctness constraint."""
+
+ def get_operator(self):
+ cols = self.columns if self.columns else [self.column]
+ return DistinctnessOperator(cols, where=self.where)
+
+ def to_string(self) -> str:
+ cols = self.columns if self.columns else [self.column]
+ col_str = ", ".join(cols)
+ return f"hasDistinctness({col_str}, assertion)"
+
+
+class UniqueValueRatioEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasUniqueValueRatio constraint."""
+
+ def get_operator(self):
+ cols = self.columns if self.columns else [self.column]
+ return UniqueValueRatioOperator(cols, where=self.where)
+
+ def to_string(self) -> str:
+ cols = self.columns if self.columns else [self.column]
+ col_str = ", ".join(cols)
+ return f"hasUniqueValueRatio({col_str}, assertion)"
+
+
+class CorrelationEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasCorrelation constraint."""
+
+ def get_operator(self):
+ if len(self.columns) >= 2:
+ return CorrelationOperator(self.columns[0], self.columns[1], where=self.where)
+ return None
+
+ def compute_value(
+ self, table: str, execute_fn: Callable[[str], "pd.DataFrame"]
+ ) -> Optional[float]:
+ if len(self.columns) < 2:
+ return None
+ return super().compute_value(table, execute_fn)
+
+ def to_string(self) -> str:
+ if len(self.columns) >= 2:
+ return f"hasCorrelation({self.columns[0]}, {self.columns[1]}, assertion)"
+ return "hasCorrelation()"
+
+
+class EntropyEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasEntropy constraint."""
+
+ def get_operator(self):
+ return EntropyOperator(self.column, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasEntropy({self.column}, assertion)"
+
+
+class MutualInformationEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasMutualInformation constraint."""
+
+ def get_operator(self):
+ if len(self.columns) >= 2:
+ return MutualInformationOperator(self.columns, where=self.where)
+ return None
+
+ def compute_value(
+ self, table: str, execute_fn: Callable[[str], "pd.DataFrame"]
+ ) -> Optional[float]:
+ if len(self.columns) < 2:
+ return None
+ return super().compute_value(table, execute_fn)
+
+ def to_string(self) -> str:
+ col_str = ", ".join(self.columns)
+ return f"hasMutualInformation({col_str}, assertion)"
+
+
+class PatternMatchEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasPattern constraint."""
+
+ def __init__(self, constraint_proto):
+ super().__init__(constraint_proto)
+ self.pattern = constraint_proto.pattern if constraint_proto.pattern else ""
+
+ def get_operator(self):
+ return PatternMatchOperator(self.column, self.pattern, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasPattern({self.column}, '{self.pattern}')"
+
+
+class MinLengthEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasMinLength constraint."""
+
+ def get_operator(self):
+ return MinLengthOperator(self.column, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasMinLength({self.column}, assertion)"
+
+
+class MaxLengthEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasMaxLength constraint."""
+
+ def get_operator(self):
+ return MaxLengthOperator(self.column, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasMaxLength({self.column}, assertion)"
+
+
+class ApproxCountDistinctEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasApproxCountDistinct constraint."""
+
+ def get_operator(self):
+ return ApproxCountDistinctOperator(self.column, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasApproxCountDistinct({self.column}, assertion)"
+
+
+class ApproxQuantileEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for hasApproxQuantile constraint."""
+
+ def __init__(self, constraint_proto):
+ super().__init__(constraint_proto)
+ self.quantile = constraint_proto.quantile if constraint_proto.quantile else 0.5
+
+ def get_operator(self):
+ return ApproxQuantileOperator(self.column, self.quantile, where=self.where)
+
+ def to_string(self) -> str:
+ return f"hasApproxQuantile({self.column}, {self.quantile}, assertion)"
+
+
+class ComplianceEvaluator(AnalyzerBasedEvaluator):
+ """Evaluator for satisfies constraint."""
+
+ def __init__(self, constraint_proto):
+ super().__init__(constraint_proto)
+ self.predicate = constraint_proto.column_condition if constraint_proto.column_condition else ""
+ self.name = constraint_proto.constraint_name if constraint_proto.constraint_name else "satisfies"
+
+ def get_operator(self):
+ return ComplianceOperator(self.name, self.predicate, where=self.where)
+
+ def to_string(self) -> str:
+ return f"satisfies({self.name}, '{self.predicate}')"
+
+
+# =============================================================================
+# Ratio-check evaluators
+# =============================================================================
+
+
+class IsPositiveEvaluator(RatioCheckEvaluator):
+ """Evaluator for isPositive constraint."""
+
+ def get_condition(self) -> str:
+ return f"{self.column} > 0"
+
+ def to_string(self) -> str:
+ return f"isPositive({self.column})"
+
+
+class IsNonNegativeEvaluator(RatioCheckEvaluator):
+ """Evaluator for isNonNegative constraint."""
+
+ def get_condition(self) -> str:
+ return f"{self.column} >= 0"
+
+ def to_string(self) -> str:
+ return f"isNonNegative({self.column})"
+
+
+class IsContainedInEvaluator(RatioCheckEvaluator):
+ """Evaluator for isContainedIn constraint."""
+
+ def __init__(self, constraint_proto):
+ super().__init__(constraint_proto)
+ self.allowed_values = list(constraint_proto.allowed_values) if constraint_proto.allowed_values else []
+
+    def get_condition(self) -> str:
+        if not self.allowed_values:
+            # "IN ()" is invalid SQL; an empty allow-list matches nothing
+            return "1=0"
+        # Escape single quotes in values
+        escaped_values = [v.replace("'", "''") for v in self.allowed_values]
+        values_str = ", ".join([f"'{v}'" for v in escaped_values])
+        return f"{self.column} IN ({values_str})"
+
+ def to_string(self) -> str:
+ values_str = ", ".join([f"'{v}'" for v in self.allowed_values])
+ return f"isContainedIn({self.column}, [{values_str}])"
+
+
+class ContainsEmailEvaluator(RatioCheckEvaluator):
+ """Evaluator for containsEmail constraint."""
+
+ EMAIL_PATTERN = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
+
+ def get_condition(self) -> str:
+ return f"REGEXP_MATCHES({self.column}, '{self.EMAIL_PATTERN}')"
+
+ def to_string(self) -> str:
+ return f"containsEmail({self.column})"
+
+
+class ContainsURLEvaluator(RatioCheckEvaluator):
+ """Evaluator for containsURL constraint."""
+
+ URL_PATTERN = r"^https?://[^\s]+$"
+
+ def get_condition(self) -> str:
+ return f"REGEXP_MATCHES({self.column}, '{self.URL_PATTERN}')"
+
+ def to_string(self) -> str:
+ return f"containsURL({self.column})"
+
+
+class ContainsCreditCardEvaluator(RatioCheckEvaluator):
+ """Evaluator for containsCreditCardNumber constraint."""
+
+ CC_PATTERN = r"^\d{13,19}$"
+
+ def get_condition(self) -> str:
+ return f"REGEXP_MATCHES({self.column}, '{self.CC_PATTERN}')"
+
+ def to_string(self) -> str:
+ return f"containsCreditCardNumber({self.column})"
+
+
+class ContainsSSNEvaluator(RatioCheckEvaluator):
+ """Evaluator for containsSocialSecurityNumber constraint."""
+
+ SSN_PATTERN = r"^\d{3}-\d{2}-\d{4}$"
+
+ def get_condition(self) -> str:
+ return f"REGEXP_MATCHES({self.column}, '{self.SSN_PATTERN}')"
+
+ def to_string(self) -> str:
+ return f"containsSocialSecurityNumber({self.column})"
+
+
+# =============================================================================
+# Comparison evaluators
+# =============================================================================
+
+
+class ColumnComparisonEvaluator(RatioCheckEvaluator):
+ """Evaluator for column comparison constraints."""
+
+ def __init__(self, constraint_proto):
+ super().__init__(constraint_proto)
+ self._comparison_type = constraint_proto.type
+
+ def get_condition(self) -> str:
+ if len(self.columns) < 2:
+ return "1=0" # Always false if not enough columns
+
+ col_a, col_b = self.columns[0], self.columns[1]
+
+ if self._comparison_type == "isLessThan":
+ return f"{col_a} < {col_b}"
+ elif self._comparison_type == "isLessThanOrEqualTo":
+ return f"{col_a} <= {col_b}"
+ elif self._comparison_type == "isGreaterThan":
+ return f"{col_a} > {col_b}"
+ elif self._comparison_type == "isGreaterThanOrEqualTo":
+ return f"{col_a} >= {col_b}"
+
+ return "1=0"
+
+ def to_string(self) -> str:
+ if len(self.columns) >= 2:
+ return f"{self._comparison_type}({self.columns[0]}, {self.columns[1]})"
+ return f"{self._comparison_type}()"
+
+
+# =============================================================================
+# Multi-column evaluators
+# =============================================================================
+
+
+class MultiColumnCompletenessEvaluator(BaseEvaluator):
+ """Evaluator for areComplete and haveCompleteness constraints."""
+
+ def compute_value(
+ self, table: str, execute_fn: Callable[[str], "pd.DataFrame"]
+ ) -> Optional[float]:
+ if not self.columns:
+ return 1.0
+
+ # All columns must be non-null for a row to be "complete"
+ null_conditions = " OR ".join([f"{col} IS NULL" for col in self.columns])
+
+ if self.where:
+ query = f"""
+ SELECT
+ SUM(CASE WHEN {self.where} THEN 1 ELSE 0 END) as total,
+ SUM(CASE WHEN ({self.where}) AND ({null_conditions}) THEN 1 ELSE 0 END) as any_null
+ FROM {table}
+ """
+ else:
+ query = f"""
+ SELECT
+ COUNT(*) as total,
+ SUM(CASE WHEN {null_conditions} THEN 1 ELSE 0 END) as any_null
+ FROM {table}
+ """
+
+ result = execute_fn(query)
+ total = self.safe_float(result, "total") or 0
+ any_null = self.safe_float(result, "any_null") or 0
+
+ if total == 0:
+ return 1.0
+ return (total - any_null) / total
+
+ def to_string(self) -> str:
+ col_str = ", ".join(self.columns)
+ if self.assertion:
+ return f"haveCompleteness({col_str}, assertion)"
+ return f"areComplete({col_str})"
+
+
+__all__ = [
+ # Analyzer-based evaluators
+ "SizeEvaluator",
+ "CompletenessEvaluator",
+ "MeanEvaluator",
+ "MinimumEvaluator",
+ "MaximumEvaluator",
+ "SumEvaluator",
+ "StandardDeviationEvaluator",
+ "UniquenessEvaluator",
+ "DistinctnessEvaluator",
+ "UniqueValueRatioEvaluator",
+ "CorrelationEvaluator",
+ "EntropyEvaluator",
+ "MutualInformationEvaluator",
+ "PatternMatchEvaluator",
+ "MinLengthEvaluator",
+ "MaxLengthEvaluator",
+ "ApproxCountDistinctEvaluator",
+ "ApproxQuantileEvaluator",
+ "ComplianceEvaluator",
+ # Ratio-check evaluators
+ "IsPositiveEvaluator",
+ "IsNonNegativeEvaluator",
+ "IsContainedInEvaluator",
+ "ContainsEmailEvaluator",
+ "ContainsURLEvaluator",
+ "ContainsCreditCardEvaluator",
+ "ContainsSSNEvaluator",
+ # Comparison evaluators
+ "ColumnComparisonEvaluator",
+ # Multi-column evaluators
+ "MultiColumnCompletenessEvaluator",
+]
diff --git a/pydeequ/engines/constraints/factory.py b/pydeequ/engines/constraints/factory.py
new file mode 100644
index 0000000..c04e60c
--- /dev/null
+++ b/pydeequ/engines/constraints/factory.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+"""
+Factory for creating constraint evaluators.
+
+This module provides a registry-based factory pattern for creating
+evaluator instances from constraint protobufs.
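+
+Example usage:
+
+    from pydeequ.engines.constraints.factory import ConstraintEvaluatorFactory
+
+    if ConstraintEvaluatorFactory.is_supported("hasMean"):
+        evaluator = ConstraintEvaluatorFactory.create(constraint_proto)
+    # create() returns None for constraint types not in the registry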
+"""
+
+from __future__ import annotations
+
+from typing import Dict, Optional, Type
+
+from pydeequ.engines.constraints.base import BaseEvaluator
+from pydeequ.engines.constraints.evaluators import (
+ ApproxCountDistinctEvaluator,
+ ApproxQuantileEvaluator,
+ ColumnComparisonEvaluator,
+ CompletenessEvaluator,
+ ComplianceEvaluator,
+ ContainsCreditCardEvaluator,
+ ContainsEmailEvaluator,
+ ContainsSSNEvaluator,
+ ContainsURLEvaluator,
+ CorrelationEvaluator,
+ DistinctnessEvaluator,
+ EntropyEvaluator,
+ IsContainedInEvaluator,
+ IsNonNegativeEvaluator,
+ IsPositiveEvaluator,
+ MaximumEvaluator,
+ MaxLengthEvaluator,
+ MeanEvaluator,
+ MinimumEvaluator,
+ MinLengthEvaluator,
+ MultiColumnCompletenessEvaluator,
+ MutualInformationEvaluator,
+ PatternMatchEvaluator,
+ SizeEvaluator,
+ StandardDeviationEvaluator,
+ SumEvaluator,
+ UniquenessEvaluator,
+ UniqueValueRatioEvaluator,
+)
+
+
+class ConstraintEvaluatorFactory:
+ """
+ Factory for creating constraint evaluators from protobufs.
+
+ This factory uses a registry pattern to map constraint type strings
+ to their corresponding evaluator classes.
+ """
+
+ _registry: Dict[str, Type[BaseEvaluator]] = {
+ # Analyzer-based evaluators
+ "hasSize": SizeEvaluator,
+ "isComplete": CompletenessEvaluator,
+ "hasCompleteness": CompletenessEvaluator,
+ "hasMean": MeanEvaluator,
+ "hasMin": MinimumEvaluator,
+ "hasMax": MaximumEvaluator,
+ "hasSum": SumEvaluator,
+ "hasStandardDeviation": StandardDeviationEvaluator,
+ "isUnique": UniquenessEvaluator,
+ "hasUniqueness": UniquenessEvaluator,
+ "hasDistinctness": DistinctnessEvaluator,
+ "hasUniqueValueRatio": UniqueValueRatioEvaluator,
+ "hasCorrelation": CorrelationEvaluator,
+ "hasEntropy": EntropyEvaluator,
+ "hasMutualInformation": MutualInformationEvaluator,
+ "hasPattern": PatternMatchEvaluator,
+ "hasMinLength": MinLengthEvaluator,
+ "hasMaxLength": MaxLengthEvaluator,
+ "hasApproxCountDistinct": ApproxCountDistinctEvaluator,
+ "hasApproxQuantile": ApproxQuantileEvaluator,
+ "satisfies": ComplianceEvaluator,
+ # Ratio-check evaluators
+ "isPositive": IsPositiveEvaluator,
+ "isNonNegative": IsNonNegativeEvaluator,
+ "isContainedIn": IsContainedInEvaluator,
+ "containsEmail": ContainsEmailEvaluator,
+ "containsURL": ContainsURLEvaluator,
+ "containsCreditCardNumber": ContainsCreditCardEvaluator,
+ "containsSocialSecurityNumber": ContainsSSNEvaluator,
+ # Comparison evaluators
+ "isLessThan": ColumnComparisonEvaluator,
+ "isLessThanOrEqualTo": ColumnComparisonEvaluator,
+ "isGreaterThan": ColumnComparisonEvaluator,
+ "isGreaterThanOrEqualTo": ColumnComparisonEvaluator,
+ # Multi-column evaluators
+ "areComplete": MultiColumnCompletenessEvaluator,
+ "haveCompleteness": MultiColumnCompletenessEvaluator,
+ }
+
+ @classmethod
+ def create(cls, constraint_proto) -> Optional[BaseEvaluator]:
+ """
+ Create an evaluator instance from a constraint protobuf.
+
+ Args:
+ constraint_proto: Protobuf message containing constraint definition
+
+ Returns:
+ Evaluator instance or None if constraint type not supported
+ """
+ evaluator_class = cls._registry.get(constraint_proto.type)
+ if evaluator_class:
+ return evaluator_class(constraint_proto)
+ return None
+
+ @classmethod
+ def is_supported(cls, constraint_type: str) -> bool:
+ """
+ Check if a constraint type is supported by the factory.
+
+ Args:
+ constraint_type: The constraint type string to check
+
+ Returns:
+ True if the constraint type is supported
+ """
+ return constraint_type in cls._registry
+
+ @classmethod
+ def supported_types(cls) -> list:
+ """
+ Get list of all supported constraint types.
+
+ Returns:
+ List of supported constraint type strings
+ """
+ return list(cls._registry.keys())
+
+
+__all__ = [
+ "ConstraintEvaluatorFactory",
+]
diff --git a/pydeequ/engines/constraints/protocols.py b/pydeequ/engines/constraints/protocols.py
new file mode 100644
index 0000000..87a7e67
--- /dev/null
+++ b/pydeequ/engines/constraints/protocols.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+"""
+Protocol definitions for constraint evaluators.
+
+This module defines the structural typing contracts that all constraint
+evaluators must satisfy.
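+
+Because the protocol is decorated with @runtime_checkable, conformance can be
+checked structurally at runtime (method presence only, not signatures):
+
+    from pydeequ.engines.constraints.protocols import ConstraintEvaluatorProtocol
+
+    assert isinstance(evaluator, ConstraintEvaluatorProtocol)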
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable, Optional, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+
+@runtime_checkable
+class ConstraintEvaluatorProtocol(Protocol):
+ """
+ Contract for constraint evaluators.
+
+ Constraint evaluators compute values from data and evaluate
+ assertions against those values to determine pass/fail status.
+ """
+
+ @property
+ def constraint_type(self) -> str:
+ """Return the constraint type identifier."""
+ ...
+
+ def compute_value(
+ self, table: str, execute_fn: Callable[[str], "pd.DataFrame"]
+ ) -> Optional[float]:
+ """
+ Compute the metric value for this constraint.
+
+ Args:
+ table: Name of the table to query
+ execute_fn: Function to execute SQL and return DataFrame
+
+ Returns:
+ Computed metric value, or None if computation fails
+ """
+ ...
+
+    def evaluate(self, value: Optional[float]) -> bool:
+        """
+        Evaluate whether the computed value satisfies the constraint.
+
+        The assertion predicate is parsed from the constraint protobuf at
+        construction time, so only the computed value is passed here
+        (matching BaseEvaluator.evaluate).
+
+        Args:
+            value: The computed metric value
+
+        Returns:
+            True if the constraint is satisfied, False otherwise
+        """
+        ...
+
+ def to_string(self) -> str:
+ """
+ Return a human-readable string representation of the constraint.
+
+ Returns:
+ Description of what the constraint checks
+ """
+ ...
+
+
+__all__ = [
+ "ConstraintEvaluatorProtocol",
+]
diff --git a/pydeequ/engines/duckdb.py b/pydeequ/engines/duckdb.py
new file mode 100644
index 0000000..e4b0200
--- /dev/null
+++ b/pydeequ/engines/duckdb.py
@@ -0,0 +1,533 @@
+# -*- coding: utf-8 -*-
+"""
+DuckDB execution engine for PyDeequ.
+
+This module provides a DuckDB-based execution engine that runs data quality
+checks directly via SQL queries, without requiring a Spark cluster.
+
+Example usage:
+ import duckdb
+ from pydeequ.engines.duckdb import DuckDBEngine
+ from pydeequ.v2.analyzers import Size, Completeness, Mean
+
+ con = duckdb.connect()
+ con.execute("CREATE TABLE test AS SELECT 1 as id, 2 as value")
+
+ engine = DuckDBEngine(con, table="test")
+ metrics = engine.compute_metrics([Size(), Completeness("id"), Mean("value")])
+
+ # With profiling enabled
+ engine = DuckDBEngine(con, table="test", enable_profiling=True)
+ engine.compute_metrics([Size(), Completeness("id")])
+ stats = engine.get_query_stats()
+ print(f"Total queries: {engine.get_query_count()}")
+"""
+
+from __future__ import annotations
+
+import time
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence
+
+import pandas as pd
+
+from pydeequ.engines import (
+ BaseEngine,
+ ColumnProfile,
+ ConstraintResult,
+ ConstraintSuggestion,
+ ConstraintStatus,
+ CheckStatus,
+ MetricResult,
+)
+from pydeequ.engines.operators import GroupingOperatorBatcher, OperatorFactory
+
+if TYPE_CHECKING:
+ import duckdb
+ from pydeequ.engines.duckdb_config import DuckDBEngineConfig
+ from pydeequ.v2.analyzers import _ConnectAnalyzer
+ from pydeequ.v2.checks import Check
+
+
+class DuckDBEngine(BaseEngine):
+ """
+ DuckDB-based execution engine.
+
+ This engine executes data quality checks using DuckDB SQL queries.
+ It supports most analyzers through standard SQL aggregations.
+
+ Attributes:
+ con: DuckDB connection
+ table: Name of the table to analyze
+ enable_profiling: Whether to collect query timing statistics
+ config: Optional configuration for DuckDB optimization
+ """
+
+ def __init__(
+ self,
+ con: "duckdb.DuckDBPyConnection",
+ table: str,
+ enable_profiling: bool = False,
+ config: Optional["DuckDBEngineConfig"] = None,
+ ):
+ """
+ Create a new DuckDBEngine.
+
+ Args:
+ con: DuckDB connection object
+ table: Name of the table to analyze
+ enable_profiling: Whether to collect query timing statistics
+ config: Optional DuckDB configuration for optimization
+ """
+ self.con = con
+ self.table = table
+ self._schema: Optional[Dict[str, str]] = None
+ self._enable_profiling = enable_profiling
+ self._query_stats: List[Dict] = []
+
+ # Apply configuration if provided
+ if config is not None:
+ config.apply(con)
+
+ def get_schema(self) -> Dict[str, str]:
+ """Get the schema of the table."""
+ if self._schema is None:
+ df = self.con.execute(f"PRAGMA table_info('{self.table}')").fetchdf()
+ self._schema = {}
+ for _, row in df.iterrows():
+ # Normalize type names to uppercase for consistency
+ col_type = str(row["type"]).upper()
+ # Extract base type (e.g., "DECIMAL(10,2)" -> "DECIMAL")
+ base_type = col_type.split("(")[0]
+ self._schema[row["name"]] = base_type
+ return self._schema
+
+ def _execute_query(self, query: str) -> pd.DataFrame:
+ """Execute a SQL query and return results as DataFrame."""
+ if self._enable_profiling:
+ start = time.perf_counter()
+ result = self.con.execute(query).fetchdf()
+ elapsed = time.perf_counter() - start
+ self._query_stats.append({
+ 'query': query[:200] + ('...' if len(query) > 200 else ''),
+ 'time_ms': elapsed * 1000,
+ 'rows': len(result),
+ })
+ return result
+ return self.con.execute(query).fetchdf()
+
+ def get_query_stats(self) -> pd.DataFrame:
+ """Return profiling statistics as DataFrame."""
+ return pd.DataFrame(self._query_stats)
+
+ def get_query_count(self) -> int:
+ """Return number of queries executed."""
+ return len(self._query_stats)
+
+ def explain_query(self, query: str) -> str:
+ """Get DuckDB query plan with EXPLAIN ANALYZE."""
+ return self.con.execute(f"EXPLAIN ANALYZE {query}").fetchdf().to_string()
+
+ def reset_profiling(self) -> None:
+ """Reset profiling statistics."""
+ self._query_stats = []
+
+ def _get_row_count(self, where: Optional[str] = None) -> int:
+ """Get the row count, optionally filtered."""
+ if where:
+ query = f"SELECT COUNT(*) as cnt FROM {self.table} WHERE {where}"
+ else:
+ query = f"SELECT COUNT(*) as cnt FROM {self.table}"
+ result = self._execute_query(query)
+ return int(result["cnt"].iloc[0])
+
+ # =========================================================================
+ # Main compute_metrics implementation using operators
+ # =========================================================================
+
+ def compute_metrics(
+ self, analyzers: Sequence["_ConnectAnalyzer"]
+ ) -> List[MetricResult]:
+ """
+ Compute metrics for the given analyzers.
+
+ This method uses the operator abstraction to:
+ 1. Create operators from analyzers via OperatorFactory
+ 2. Batch scan operators into a single SQL query
+ 3. Execute grouping operators individually
+ 4. Handle metadata operators using schema access
+ 5. Extract results using operator-specific logic
+ """
+ results: List[MetricResult] = []
+
+ # Separate analyzers by operator type
+ scan_operators = []
+ grouping_operators = []
+ metadata_operators = []
+
+ for analyzer in analyzers:
+ if OperatorFactory.is_scan_operator(analyzer):
+ operator = OperatorFactory.create(analyzer)
+ if operator:
+ scan_operators.append(operator)
+ elif OperatorFactory.is_grouping_operator(analyzer):
+ operator = OperatorFactory.create(analyzer)
+ if operator:
+ grouping_operators.append(operator)
+ elif OperatorFactory.is_metadata_operator(analyzer):
+ operator = OperatorFactory.create(analyzer)
+ if operator:
+ metadata_operators.append(operator)
+ else:
+ # Unsupported analyzer
+ results.append(MetricResult(
+ name=type(analyzer).__name__,
+ instance=getattr(analyzer, 'column', '*'),
+ entity="Column" if hasattr(analyzer, 'column') else "Dataset",
+ value=None,
+ success=False,
+ message=f"Analyzer {type(analyzer).__name__} not implemented"
+ ))
+
+ # Execute batched scan query
+ if scan_operators:
+ try:
+ # Collect all aggregations
+ aggregations = []
+ for operator in scan_operators:
+ aggregations.extend(operator.get_aggregations())
+
+ # Build and execute single query
+ query = f"SELECT {', '.join(aggregations)} FROM {self.table}"
+ scan_result = self._execute_query(query)
+
+ # Extract results from each operator
+ for operator in scan_operators:
+ try:
+ result = operator.extract_result(scan_result)
+ results.append(result)
+ except Exception as e:
+ results.append(MetricResult(
+ name=operator.metric_name,
+ instance=operator.instance,
+ entity=operator.entity,
+ value=None,
+ success=False,
+ message=str(e)
+ ))
+
+ except Exception as e:
+ # If batch query fails, report error for all scan operators
+ for operator in scan_operators:
+ results.append(MetricResult(
+ name=operator.metric_name,
+ instance=operator.instance,
+ entity=operator.entity,
+ value=None,
+ success=False,
+ message=f"Batch query failed: {str(e)}"
+ ))
+
+ # Execute grouping operators with batching optimization
+ if grouping_operators:
+ batcher = GroupingOperatorBatcher(grouping_operators)
+
+ # Execute batched queries (fused operators with same columns/where)
+ try:
+ batched_results = batcher.execute_batched(
+ self.table, self._execute_query
+ )
+ results.extend(batched_results)
+ except Exception as e:
+ # If batched execution fails, fall back to individual execution
+ for operator in grouping_operators:
+ if operator not in batcher.get_unbatchable_operators():
+ results.append(MetricResult(
+ name=operator.metric_name,
+ instance=operator.instance,
+ entity=operator.entity,
+ value=None,
+ success=False,
+ message=f"Batched query failed: {str(e)}"
+ ))
+
+ # Execute unbatchable operators individually
+ for operator in batcher.get_unbatchable_operators():
+ try:
+ query = operator.build_query(self.table)
+ df = self._execute_query(query)
+ result = operator.extract_result(df)
+ results.append(result)
+ except Exception as e:
+ results.append(MetricResult(
+ name=operator.metric_name,
+ instance=operator.instance,
+ entity=operator.entity,
+ value=None,
+ success=False,
+ message=str(e)
+ ))
+
+ # Execute metadata operators using schema
+ schema = self.get_schema()
+ for operator in metadata_operators:
+ try:
+ result = operator.compute_from_schema(schema)
+ results.append(result)
+ except Exception as e:
+ results.append(MetricResult(
+ name=operator.metric_name,
+ instance=operator.instance,
+ entity=operator.entity,
+ value=None,
+ success=False,
+ message=str(e)
+ ))
+
+ return results
+
+ # =========================================================================
+ # Constraint checking
+ # =========================================================================
+
+ def run_checks(self, checks: Sequence["Check"]) -> List[ConstraintResult]:
+ """Run verification checks and return constraint results.
+
+ Uses ConstraintBatchEvaluator to batch compatible constraints,
+ reducing the number of SQL queries executed.
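+
+        Example (sketch; check is a pydeequ.v2.checks.Check instance):
+
+            results = engine.run_checks([check])
+            failures = [r for r in results
+                        if r.constraint_status != ConstraintStatus.SUCCESS.value]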
+ """
+ from pydeequ.v2.checks import CheckLevel
+ from pydeequ.engines.constraints import (
+ ConstraintBatchEvaluator,
+ ConstraintEvaluatorFactory,
+ )
+
+ results: List[ConstraintResult] = []
+
+ # Phase 1: Create all evaluators and collect metadata
+ all_evaluators = []
+ constraint_info = [] # (check, constraint, evaluator) tuples
+
+ for check in checks:
+ for constraint in check._constraints:
+ evaluator = ConstraintEvaluatorFactory.create(constraint)
+ if evaluator:
+ all_evaluators.append(evaluator)
+ constraint_info.append((check, constraint, evaluator))
+ else:
+ constraint_info.append((check, constraint, None))
+
+ # Phase 2: Batch execute all evaluators
+ computed_values: Dict = {}
+ if all_evaluators:
+ batcher = ConstraintBatchEvaluator(all_evaluators)
+ computed_values = batcher.execute(self.table, self._execute_query)
+
+ # Phase 3: Process results by check
+ info_idx = 0
+ for check in checks:
+ check_description = check.description
+ check_level = check.level.value
+ check_has_failure = False
+
+ for constraint in check._constraints:
+ _, _, evaluator = constraint_info[info_idx]
+ info_idx += 1
+
+ constraint_message = None
+ constraint_passed = False
+
+ try:
+ if evaluator:
+ # Get pre-computed value from batch execution
+ value = computed_values.get(evaluator)
+
+ # Evaluate the constraint
+ constraint_passed = evaluator.evaluate(value)
+
+ # Get constraint description
+ constraint_str = evaluator.to_string()
+
+ if not constraint_passed:
+ if value is not None:
+ constraint_message = f"Value: {value:.6g}"
+ else:
+ constraint_message = "Could not compute metric"
+ else:
+ constraint_str = constraint.type
+ constraint_message = f"Unknown constraint type: {constraint.type}"
+
+ except Exception as e:
+ constraint_str = constraint.type
+ constraint_message = f"Error: {str(e)}"
+ constraint_passed = False
+
+ if not constraint_passed:
+ check_has_failure = True
+
+ results.append(ConstraintResult(
+ check_description=check_description,
+ check_level=check_level,
+ check_status=CheckStatus.ERROR.value if check_has_failure else CheckStatus.SUCCESS.value,
+ constraint=constraint_str,
+ constraint_status=ConstraintStatus.SUCCESS.value if constraint_passed else ConstraintStatus.FAILURE.value,
+ constraint_message=constraint_message,
+ ))
+
+ # Update check status for all constraints in this check
+ final_status = CheckStatus.ERROR.value if check_has_failure else CheckStatus.SUCCESS.value
+ if check.level == CheckLevel.Warning and check_has_failure:
+ final_status = CheckStatus.WARNING.value
+
+ for i in range(len(results) - len(check._constraints), len(results)):
+ results[i] = ConstraintResult(
+ check_description=results[i].check_description,
+ check_level=results[i].check_level,
+ check_status=final_status,
+ constraint=results[i].constraint,
+ constraint_status=results[i].constraint_status,
+ constraint_message=results[i].constraint_message,
+ )
+
+ return results
+
+ # =========================================================================
+ # Column profiling
+ # =========================================================================
+
+ def profile_columns(
+ self,
+ columns: Optional[Sequence[str]] = None,
+ low_cardinality_threshold: int = 0,
+ ) -> List[ColumnProfile]:
+ """
+ Profile columns in the table.
+
+ Uses MultiColumnProfileOperator to batch profile statistics across
+ multiple columns, significantly reducing the number of SQL queries
+ from 2-3 per column to 2-3 total.
+
+ Args:
+ columns: Optional list of columns to profile. If None, profile all.
+ low_cardinality_threshold: Threshold for histogram computation.
+ If > 0 and distinct values <= threshold, compute histogram.
+
+ Returns:
+ List of ColumnProfile objects
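+
+        Example (sketch):
+
+            profiles = engine.profile_columns(
+                ["id", "price"], low_cardinality_threshold=50
+            )
+            by_column = {p.column: p for p in profiles}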
+ """
+ from pydeequ.engines.operators.profiling_operators import (
+ ColumnProfileOperator,
+ MultiColumnProfileOperator,
+ )
+
+ schema = self.get_schema()
+
+ # Determine which columns to profile
+ if columns:
+ cols_to_profile = [c for c in columns if c in schema]
+ else:
+ cols_to_profile = list(schema.keys())
+
+ if not cols_to_profile:
+ return []
+
+ # Use MultiColumnProfileOperator for batched profiling
+ operator = MultiColumnProfileOperator(cols_to_profile, schema)
+
+ # Query 1: Completeness and distinct counts for all columns
+ completeness_query = operator.build_completeness_query(self.table)
+ completeness_df = self._execute_query(completeness_query)
+
+ # Query 2: Numeric stats for all numeric columns (if any)
+ numeric_df = None
+ numeric_query = operator.build_numeric_stats_query(self.table)
+ if numeric_query:
+ numeric_df = self._execute_query(numeric_query)
+
+ # Query 3: Percentiles for all numeric columns (if any)
+ percentile_df = None
+ percentile_query = operator.build_percentile_query(self.table)
+ if percentile_query:
+ try:
+ percentile_df = self._execute_query(percentile_query)
+ except Exception:
+ # Percentile computation may fail for some types
+ pass
+
+ # Extract profiles from batched results
+ profiles = operator.extract_profiles(completeness_df, numeric_df, percentile_df)
+
+ # Add histograms for low cardinality columns (requires per-column queries)
+ if low_cardinality_threshold > 0:
+ for profile in profiles:
+ if profile.approx_distinct_values <= low_cardinality_threshold:
+ col_type = schema.get(profile.column, "VARCHAR")
+ col_operator = ColumnProfileOperator(
+ column=profile.column,
+ column_type=col_type,
+ compute_percentiles=False,
+ compute_histogram=True,
+ histogram_limit=low_cardinality_threshold,
+ )
+ hist_query = col_operator.build_histogram_query(self.table)
+ hist_result = self._execute_query(hist_query)
+ profile.histogram = col_operator.extract_histogram_result(hist_result)
+
+ return profiles
+
+ # =========================================================================
+ # Constraint suggestions
+ # =========================================================================
+
+ def suggest_constraints(
+ self,
+ columns: Optional[Sequence[str]] = None,
+ rules: Optional[Sequence[str]] = None,
+ ) -> List[ConstraintSuggestion]:
+ """
+ Suggest constraints based on data characteristics.
+
+ Uses the SuggestionRunner to apply modular suggestion rules against
+ column profiles. Rules are organized into sets:
+ - DEFAULT: completeness, non-negative, categorical
+ - NUMERICAL: min, max, mean
+ - STRING: min/max length
+ - COMMON: uniqueness
+ - EXTENDED: all rules
+
+ Args:
+ columns: Optional list of columns to analyze. If None, analyze all.
+ rules: Optional list of rule sets to apply. Defaults to ["DEFAULT"].
+
+ Returns:
+ List of ConstraintSuggestion objects
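+
+        Example (sketch):
+
+            suggestions = engine.suggest_constraints(rules=["DEFAULT", "NUMERICAL"])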
+ """
+ from pydeequ.engines.suggestions import SuggestionRunner
+ from pydeequ.v2.suggestions import Rules
+
+ # Default rules - normalize to strings for SuggestionRunner
+ if rules is None:
+ rule_strings = ["DEFAULT"]
+ else:
+ # Accept both Rules enum and string values
+ rule_strings = []
+ for rule in rules:
+ if isinstance(rule, Rules):
+ rule_strings.append(rule.value)
+ else:
+ rule_strings.append(rule)
+
+ # Profile columns with histograms for categorical detection
+ profiles = self.profile_columns(columns, low_cardinality_threshold=100)
+
+ # Get row count for uniqueness checks
+ row_count = self._get_row_count()
+
+ # Run suggestion rules
+ runner = SuggestionRunner(rule_sets=rule_strings)
+ return runner.run(
+ profiles,
+ execute_fn=self._execute_query,
+ table=self.table,
+ row_count=row_count,
+ )
diff --git a/pydeequ/engines/duckdb_config.py b/pydeequ/engines/duckdb_config.py
new file mode 100644
index 0000000..b470ae0
--- /dev/null
+++ b/pydeequ/engines/duckdb_config.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+"""
+DuckDB engine configuration for PyDeequ.
+
+This module provides configuration options to optimize DuckDB performance
+for analytical workloads like data quality checks.
+
+Example usage:
+ import duckdb
+ from pydeequ.engines.duckdb import DuckDBEngine
+ from pydeequ.engines.duckdb_config import DuckDBEngineConfig
+
+ # Create config with optimizations
+ config = DuckDBEngineConfig(
+ threads=8,
+ memory_limit="8GB",
+ preserve_insertion_order=False, # Better parallelism
+ )
+
+ con = duckdb.connect()
+ config.apply(con)
+
+ engine = DuckDBEngine(con, table="test")
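+
+    # Or start from a preset
+    config = DuckDBEngineConfig.high_performance()
+    config = DuckDBEngineConfig.memory_constrained("4GB")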
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Dict, Optional
+
+if TYPE_CHECKING:
+ import duckdb
+
+
+@dataclass
+class DuckDBEngineConfig:
+ """
+ Configuration for DuckDB engine optimization.
+
+ Attributes:
+ threads: Number of threads to use. None = auto (all cores).
+ memory_limit: Memory limit string (e.g., "8GB"). None = auto.
+ preserve_insertion_order: If False, allows better parallelism.
+ Set to False for read-only analytical workloads.
+ parquet_metadata_cache: Cache Parquet metadata for faster repeated reads.
+ enable_object_cache: Enable object caching for repeated queries.
+ enable_progress_bar: Show progress bar for long-running queries.
+ default_null_order: How to order NULLs (NULLS_FIRST, NULLS_LAST).
+ custom_settings: Additional DuckDB settings as key-value pairs.
+ """
+
+ threads: Optional[int] = None
+ memory_limit: Optional[str] = None
+ preserve_insertion_order: bool = False
+ parquet_metadata_cache: bool = True
+ enable_object_cache: bool = True
+ enable_progress_bar: bool = False
+ default_null_order: str = "NULLS_LAST"
+ custom_settings: Dict[str, str] = field(default_factory=dict)
+
+ def apply(self, con: "duckdb.DuckDBPyConnection") -> None:
+ """
+ Apply configuration settings to a DuckDB connection.
+
+ Args:
+ con: DuckDB connection to configure
+ """
+ # Thread configuration
+ if self.threads is not None:
+ con.execute(f"SET threads = {self.threads}")
+
+ # Memory configuration
+ if self.memory_limit is not None:
+ con.execute(f"SET memory_limit = '{self.memory_limit}'")
+
+ # Parallelism optimization
+ con.execute(
+ f"SET preserve_insertion_order = {str(self.preserve_insertion_order).lower()}"
+ )
+
+ # Caching optimization
+ if self.parquet_metadata_cache:
+ con.execute("SET parquet_metadata_cache = true")
+
+ if self.enable_object_cache:
+ con.execute("SET enable_object_cache = true")
+
+ # Progress bar (useful for debugging long queries)
+ con.execute(
+ f"SET enable_progress_bar = {str(self.enable_progress_bar).lower()}"
+ )
+
+ # NULL ordering
+ con.execute(f"SET default_null_order = '{self.default_null_order}'")
+
+ # Apply custom settings
+ for key, value in self.custom_settings.items():
+ # Determine if value needs quoting (strings vs numbers/booleans)
+ if value.lower() in ("true", "false") or value.isdigit():
+ con.execute(f"SET {key} = {value}")
+ else:
+ con.execute(f"SET {key} = '{value}'")
+
+ @classmethod
+ def default(cls) -> "DuckDBEngineConfig":
+ """Create a default configuration."""
+ return cls()
+
+ @classmethod
+ def high_performance(cls) -> "DuckDBEngineConfig":
+ """
+ Create a high-performance configuration for analytical workloads.
+
+        This configuration prioritizes analytical read performance over
+        result-ordering guarantees:
+ - Disables insertion order preservation for better parallelism
+ - Enables all caching options
+ - Uses all available cores
+ """
+ return cls(
+ threads=None, # Use all cores
+ preserve_insertion_order=False,
+ parquet_metadata_cache=True,
+ enable_object_cache=True,
+ )
+
+ @classmethod
+ def memory_constrained(cls, memory_limit: str = "4GB") -> "DuckDBEngineConfig":
+ """
+ Create a configuration for memory-constrained environments.
+
+ Args:
+ memory_limit: Memory limit string (e.g., "4GB")
+ """
+ return cls(
+ memory_limit=memory_limit,
+ enable_object_cache=False, # Reduce memory usage
+ )
+
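+# Usage sketch for the presets above (comments only, nothing runs on import;
+# the connections and limits shown are illustrative):
+#
+#     import duckdb
+#
+#     con = duckdb.connect()
+#     DuckDBEngineConfig.high_performance().apply(con)
+#
+#     small = duckdb.connect()
+#     DuckDBEngineConfig.memory_constrained("2GB").apply(small)
+#
+#     # Extra settings ride along via custom_settings
+#     DuckDBEngineConfig(custom_settings={"threads": "4"}).apply(con)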
+
+__all__ = ["DuckDBEngineConfig"]
diff --git a/pydeequ/engines/operators/__init__.py b/pydeequ/engines/operators/__init__.py
new file mode 100644
index 0000000..a76f8b9
--- /dev/null
+++ b/pydeequ/engines/operators/__init__.py
@@ -0,0 +1,139 @@
+# -*- coding: utf-8 -*-
+"""
+SQL Operator abstractions for data quality metrics.
+
+This module provides a hierarchical operator abstraction pattern that:
+1. Eliminates code duplication across analyzer implementations
+2. Separates SQL generation from result extraction
+3. Enables efficient batch execution of scan operators
+4. Provides consistent WHERE clause handling
+
+Architecture:
+ Protocols (Contracts)
+ ├── ScanOperatorProtocol - Single-pass aggregation operators
+ └── GroupingOperatorProtocol - GROUP BY-based operators
+
+ Mixins (Shared Behaviors)
+ ├── WhereClauseMixin - Conditional aggregation wrapping
+ ├── SafeExtractMixin - Safe value extraction from DataFrames
+ └── ColumnAliasMixin - Consistent alias generation
+
+ Base Classes (Hierarchy)
+ ├── ScanOperator - Base for single-pass operators
+ └── GroupingOperator - Base for GROUP BY operators
+
+ Factory
+ └── OperatorFactory - Creates operators from analyzers
+
+Example usage:
+    from pydeequ.engines.operators import OperatorFactory
+    from pydeequ.v2.analyzers import Mean
+
+ # Create operator from analyzer
+ operator = OperatorFactory.create(Mean("price"))
+
+ # Get SQL aggregations for scan operators
+ aggregations = operator.get_aggregations()
+ # ["AVG(price) AS mean_price"]
+
+ # Execute query and extract result
+ df = engine._execute_query(f"SELECT {', '.join(aggregations)} FROM table")
+ result = operator.extract_result(df)
+"""
+
+from pydeequ.engines.operators.base import GroupingOperator, ScanOperator
+from pydeequ.engines.operators.factory import OperatorFactory
+from pydeequ.engines.operators.grouping_batcher import (
+ GroupingOperatorBatcher,
+ BATCHABLE_OPERATORS,
+)
+from pydeequ.engines.operators.grouping_operators import (
+ DistinctnessOperator,
+ EntropyOperator,
+ HistogramOperator,
+ MutualInformationOperator,
+ UniqueValueRatioOperator,
+ UniquenessOperator,
+)
+from pydeequ.engines.operators.metadata_operators import (
+ DataTypeOperator,
+)
+from pydeequ.engines.operators.mixins import (
+    ColumnAliasMixin,
+    SafeExtractMixin,
+    WhereClauseMixin,
+)
+from pydeequ.engines.operators.profiling_operators import (
+    ColumnProfileOperator,
+    MultiColumnProfileOperator,
+    NUMERIC_TYPES,
+    STRING_TYPES,
+)
+from pydeequ.engines.operators.protocols import (
+ GroupingOperatorProtocol,
+ ScanOperatorProtocol,
+)
+from pydeequ.engines.operators.scan_operators import (
+ ApproxCountDistinctOperator,
+ ApproxQuantileOperator,
+ ComplianceOperator,
+ CompletenessOperator,
+ CorrelationOperator,
+ CountDistinctOperator,
+ MaximumOperator,
+ MaxLengthOperator,
+ MeanOperator,
+ MinimumOperator,
+ MinLengthOperator,
+ PatternMatchOperator,
+ SizeOperator,
+ StandardDeviationOperator,
+ SumOperator,
+)
+
+__all__ = [
+ # Protocols
+ "ScanOperatorProtocol",
+ "GroupingOperatorProtocol",
+ # Mixins
+ "WhereClauseMixin",
+ "SafeExtractMixin",
+ "ColumnAliasMixin",
+ # Base classes
+ "ScanOperator",
+ "GroupingOperator",
+ # Scan operators
+ "SizeOperator",
+ "CompletenessOperator",
+ "MeanOperator",
+ "SumOperator",
+ "MinimumOperator",
+ "MaximumOperator",
+ "StandardDeviationOperator",
+ "MaxLengthOperator",
+ "MinLengthOperator",
+ "PatternMatchOperator",
+ "ComplianceOperator",
+ "CorrelationOperator",
+ "CountDistinctOperator",
+ "ApproxCountDistinctOperator",
+ "ApproxQuantileOperator",
+ # Grouping operators
+ "DistinctnessOperator",
+ "UniquenessOperator",
+ "UniqueValueRatioOperator",
+ "EntropyOperator",
+ "MutualInformationOperator",
+ "HistogramOperator",
+ # Grouping operator batching
+ "GroupingOperatorBatcher",
+ "BATCHABLE_OPERATORS",
+ # Metadata operators
+ "DataTypeOperator",
+ # Profiling operators
+ "ColumnProfileOperator",
+ "MultiColumnProfileOperator",
+ "NUMERIC_TYPES",
+ "STRING_TYPES",
+ # Factory
+ "OperatorFactory",
+]
diff --git a/pydeequ/engines/operators/base.py b/pydeequ/engines/operators/base.py
new file mode 100644
index 0000000..88ef664
--- /dev/null
+++ b/pydeequ/engines/operators/base.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+"""
+Base classes for SQL operators.
+
+This module provides the abstract base classes that combine protocols
+and mixins to create the foundation for concrete operator implementations.
+"""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, List, Optional
+
+from pydeequ.engines.operators.mixins import (
+ ColumnAliasMixin,
+ SafeExtractMixin,
+ WhereClauseMixin,
+)
+
+if TYPE_CHECKING:
+ import pandas as pd
+ from pydeequ.engines import MetricResult
+
+
+class ScanOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin, ABC):
+ """
+ Base class for single-pass aggregation operators.
+
+ Scan operators compute metrics via SQL aggregations that can be
+ combined into a single SELECT statement. This enables efficient
+ batch execution where multiple metrics are computed in one query.
+
+ Subclasses must implement:
+ - get_aggregations(): Return SQL aggregation expressions
+ - extract_result(): Parse query results into MetricResult
+
+ Attributes:
+ column: Column name to analyze
+ where: Optional SQL WHERE clause for filtering
+ """
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ """
+ Initialize scan operator.
+
+ Args:
+ column: Column name to analyze
+ where: Optional SQL WHERE clause for filtering
+ """
+ self.column = column
+ self.where = where
+
+ @abstractmethod
+ def get_aggregations(self) -> List[str]:
+ """
+ Return SQL aggregation expressions.
+
+ Returns:
+ List of SQL aggregation expressions with AS alias clauses
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def extract_result(self, df: "pd.DataFrame") -> "MetricResult":
+ """
+ Extract metric from query result DataFrame.
+
+ Args:
+ df: DataFrame containing query results
+
+ Returns:
+ MetricResult with extracted value
+ """
+ raise NotImplementedError
+
+ @property
+ def instance(self) -> str:
+ """Return the instance identifier for this operator."""
+ return self.column
+
+ @property
+ def entity(self) -> str:
+ """Return the entity type for this operator."""
+ return "Column"
+
+ @property
+ @abstractmethod
+ def metric_name(self) -> str:
+ """Return the metric name for this operator."""
+ raise NotImplementedError
+
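+# A minimal subclass sketch (illustrative only; no VarianceOperator ships in
+# this package). A real implementation would import MetricResult at runtime,
+# as the concrete operators in scan_operators.py do:
+#
+#     class VarianceOperator(ScanOperator):
+#         def __init__(self, column, where=None):
+#             super().__init__(column, where)
+#             self.alias = self.make_alias("variance", column)
+#
+#         @property
+#         def metric_name(self) -> str:
+#             return "Variance"
+#
+#         def get_aggregations(self) -> List[str]:
+#             # VAR_POP is a standard SQL aggregate (supported by DuckDB)
+#             return [f"{self.wrap_agg_with_where('VAR_POP', self.column)} AS {self.alias}"]
+#
+#         def extract_result(self, df):
+#             return MetricResult(
+#                 name=self.metric_name,
+#                 instance=self.instance,
+#                 entity=self.entity,
+#                 value=self.safe_float(df, self.alias),
+#             )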
+
+class GroupingOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin, ABC):
+ """
+ Base class for operators requiring GROUP BY queries.
+
+ Grouping operators need to compute intermediate aggregations
+ via GROUP BY before computing the final metric. They cannot
+ be batched with scan operators and require separate queries.
+
+ Subclasses must implement:
+ - get_grouping_columns(): Return columns to GROUP BY
+ - build_query(): Build complete CTE-based query
+ - extract_result(): Parse query results into MetricResult
+
+ Attributes:
+ columns: Column name(s) to analyze
+ where: Optional SQL WHERE clause for filtering
+ """
+
+ def __init__(self, columns: List[str], where: Optional[str] = None):
+ """
+ Initialize grouping operator.
+
+ Args:
+ columns: Column name(s) to analyze
+ where: Optional SQL WHERE clause for filtering
+ """
+ self.columns = columns
+ self.where = where
+
+ @abstractmethod
+ def get_grouping_columns(self) -> List[str]:
+ """
+ Return columns to GROUP BY.
+
+ Returns:
+ List of column names for the GROUP BY clause
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def build_query(self, table: str) -> str:
+ """
+ Build complete CTE-based query.
+
+ Args:
+ table: Name of the table to query
+
+ Returns:
+ Complete SQL query string
+ """
+ raise NotImplementedError
+
+ @abstractmethod
+ def extract_result(self, df: "pd.DataFrame") -> "MetricResult":
+ """
+ Extract metric from query result DataFrame.
+
+ Args:
+ df: DataFrame containing query results
+
+ Returns:
+ MetricResult with extracted value
+ """
+ raise NotImplementedError
+
+ @property
+ def instance(self) -> str:
+ """Return the instance identifier for this operator."""
+ return ",".join(self.columns)
+
+ @property
+ def entity(self) -> str:
+ """Return the entity type for this operator."""
+ return "Multicolumn" if len(self.columns) > 1 else "Column"
+
+ @property
+ @abstractmethod
+ def metric_name(self) -> str:
+ """Return the metric name for this operator."""
+ raise NotImplementedError
+
+
+__all__ = [
+ "ScanOperator",
+ "GroupingOperator",
+]
diff --git a/pydeequ/engines/operators/factory.py b/pydeequ/engines/operators/factory.py
new file mode 100644
index 0000000..ed3c2ce
--- /dev/null
+++ b/pydeequ/engines/operators/factory.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+"""
+Operator factory for creating operators from analyzers.
+
+This module provides a registry-based factory pattern that eliminates
+isinstance() chains when creating operators from analyzers.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Dict, Optional, Type, Union
+
+from pydeequ.engines.operators.grouping_operators import (
+ DistinctnessOperator,
+ EntropyOperator,
+ HistogramOperator,
+ MutualInformationOperator,
+ UniqueValueRatioOperator,
+ UniquenessOperator,
+)
+from pydeequ.engines.operators.metadata_operators import (
+ DataTypeOperator,
+)
+from pydeequ.engines.operators.scan_operators import (
+ ApproxCountDistinctOperator,
+ ApproxQuantileOperator,
+ ComplianceOperator,
+ CompletenessOperator,
+ CorrelationOperator,
+ CountDistinctOperator,
+ MaximumOperator,
+ MaxLengthOperator,
+ MeanOperator,
+ MinimumOperator,
+ MinLengthOperator,
+ PatternMatchOperator,
+ SizeOperator,
+ StandardDeviationOperator,
+ SumOperator,
+)
+
+if TYPE_CHECKING:
+ from pydeequ.engines.operators.base import GroupingOperator, ScanOperator
+ from pydeequ.v2.analyzers import _ConnectAnalyzer
+
+# Type alias for operator types
+OperatorType = Union["ScanOperator", "GroupingOperator", "DataTypeOperator"]
+
+
+class OperatorFactory:
+ """
+ Creates operators from analyzers using registry pattern.
+
+ This factory eliminates isinstance() chains by mapping analyzer
+ types to their corresponding operator classes.
+ """
+
+    # Registries mapping analyzer class names to operator classes. They are
+    # populated with the built-in operators at the bottom of this module; the
+    # register_* decorators can extend them with custom operators at runtime.
+ _scan_registry: Dict[str, Type] = {}
+ _grouping_registry: Dict[str, Type] = {}
+ _metadata_registry: Dict[str, Type] = {}
+
+ @classmethod
+ def register_scan(cls, analyzer_name: str):
+ """
+ Decorator to register a scan operator for an analyzer type.
+
+ Args:
+ analyzer_name: Name of the analyzer class (e.g., "Mean", "Sum")
+
+ Returns:
+ Decorator function
+ """
+ def decorator(operator_class: Type):
+ cls._scan_registry[analyzer_name] = operator_class
+ return operator_class
+ return decorator
+
+ @classmethod
+ def register_grouping(cls, analyzer_name: str):
+ """
+ Decorator to register a grouping operator for an analyzer type.
+
+ Args:
+ analyzer_name: Name of the analyzer class
+
+ Returns:
+ Decorator function
+ """
+ def decorator(operator_class: Type):
+ cls._grouping_registry[analyzer_name] = operator_class
+ return operator_class
+ return decorator
+
+ @classmethod
+ def register_metadata(cls, analyzer_name: str):
+ """
+ Decorator to register a metadata operator for an analyzer type.
+
+ Metadata operators compute metrics from schema information rather
+ than SQL queries. They are used for type inference and similar
+ schema-based analysis.
+
+ Args:
+ analyzer_name: Name of the analyzer class
+
+ Returns:
+ Decorator function
+ """
+ def decorator(operator_class: Type):
+ cls._metadata_registry[analyzer_name] = operator_class
+ return operator_class
+ return decorator
+
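+    # Decorator usage sketch (illustrative; "Variance"/VarianceOperator are
+    # hypothetical names, not part of the registries assigned below):
+    #
+    #     @OperatorFactory.register_scan("Variance")
+    #     class VarianceOperator(ScanOperator):
+    #         ...
+    #
+    # Lookup is keyed on the analyzer class name, so create(Variance(...))
+    # would then resolve to VarianceOperator without touching the factory.
+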
+ @classmethod
+ def create(cls, analyzer: "_ConnectAnalyzer") -> Optional[OperatorType]:
+ """
+ Create operator instance for given analyzer.
+
+ Args:
+ analyzer: Analyzer instance to create operator for
+
+ Returns:
+ Operator instance or None if analyzer type not supported
+ """
+ analyzer_name = type(analyzer).__name__
+
+ # Try scan registry first
+ if analyzer_name in cls._scan_registry:
+ return cls._create_scan_operator(analyzer_name, analyzer)
+
+ # Try grouping registry
+ if analyzer_name in cls._grouping_registry:
+ return cls._create_grouping_operator(analyzer_name, analyzer)
+
+ # Try metadata registry
+ if analyzer_name in cls._metadata_registry:
+ return cls._create_metadata_operator(analyzer_name, analyzer)
+
+ return None
+
+ @classmethod
+ def _create_scan_operator(
+ cls, analyzer_name: str, analyzer: "_ConnectAnalyzer"
+ ) -> "ScanOperator":
+ """Create a scan operator from an analyzer."""
+ operator_class = cls._scan_registry[analyzer_name]
+
+ # Extract common attributes
+ column = getattr(analyzer, "column", None)
+ where = getattr(analyzer, "where", None)
+
+ # Handle special cases
+ if analyzer_name == "Size":
+ return operator_class(where=where)
+ elif analyzer_name == "Compliance":
+ instance = getattr(analyzer, "instance", "compliance")
+ predicate = getattr(analyzer, "predicate", "")
+ return operator_class(instance, predicate, where=where)
+ elif analyzer_name == "PatternMatch":
+ pattern = getattr(analyzer, "pattern", "")
+ return operator_class(column, pattern, where=where)
+ elif analyzer_name == "Correlation":
+ column1 = getattr(analyzer, "column1", "")
+ column2 = getattr(analyzer, "column2", "")
+ return operator_class(column1, column2, where=where)
+ elif analyzer_name == "CountDistinct":
+ columns = list(getattr(analyzer, "columns", []))
+ return operator_class(columns, where=where)
+ elif analyzer_name == "ApproxQuantile":
+ quantile = getattr(analyzer, "quantile", 0.5)
+ return operator_class(column, quantile, where=where)
+ else:
+ # Standard single-column operators
+ return operator_class(column, where=where)
+
+ @classmethod
+ def _create_grouping_operator(
+ cls, analyzer_name: str, analyzer: "_ConnectAnalyzer"
+ ) -> "GroupingOperator":
+ """Create a grouping operator from an analyzer."""
+ operator_class = cls._grouping_registry[analyzer_name]
+
+ where = getattr(analyzer, "where", None)
+
+ if analyzer_name == "Entropy":
+ column = getattr(analyzer, "column", "")
+ return operator_class(column, where=where)
+ elif analyzer_name == "Histogram":
+ column = getattr(analyzer, "column", "")
+ max_bins = getattr(analyzer, "max_detail_bins", 100) or 100
+ return operator_class(column, max_bins, where=where)
+ else:
+ # Multi-column operators (Distinctness, Uniqueness, etc.)
+ columns = getattr(analyzer, "columns", [])
+ if isinstance(columns, str):
+ columns = [columns]
+ return operator_class(list(columns), where=where)
+
+ @classmethod
+ def _create_metadata_operator(
+ cls, analyzer_name: str, analyzer: "_ConnectAnalyzer"
+ ) -> "DataTypeOperator":
+ """Create a metadata operator from an analyzer."""
+ operator_class = cls._metadata_registry[analyzer_name]
+
+ # Extract common attributes
+ column = getattr(analyzer, "column", None)
+ where = getattr(analyzer, "where", None)
+
+ # Standard single-column metadata operators
+ return operator_class(column, where=where)
+
+ @classmethod
+ def is_scan_operator(cls, analyzer: "_ConnectAnalyzer") -> bool:
+ """Check if analyzer maps to a scan operator."""
+ return type(analyzer).__name__ in cls._scan_registry
+
+ @classmethod
+ def is_grouping_operator(cls, analyzer: "_ConnectAnalyzer") -> bool:
+ """Check if analyzer maps to a grouping operator."""
+ return type(analyzer).__name__ in cls._grouping_registry
+
+ @classmethod
+ def is_metadata_operator(cls, analyzer: "_ConnectAnalyzer") -> bool:
+ """Check if analyzer maps to a metadata operator."""
+ return type(analyzer).__name__ in cls._metadata_registry
+
+ @classmethod
+ def is_supported(cls, analyzer: "_ConnectAnalyzer") -> bool:
+ """Check if analyzer type is supported by the factory."""
+ analyzer_name = type(analyzer).__name__
+ return (
+ analyzer_name in cls._scan_registry
+ or analyzer_name in cls._grouping_registry
+ or analyzer_name in cls._metadata_registry
+ )
+
+
+# Register all built-in scan operators (assigned at import time; the register_*
+# decorators above can still extend these registries afterwards)
+OperatorFactory._scan_registry = {
+ "Size": SizeOperator,
+ "Completeness": CompletenessOperator,
+ "Mean": MeanOperator,
+ "Sum": SumOperator,
+ "Minimum": MinimumOperator,
+ "Maximum": MaximumOperator,
+ "StandardDeviation": StandardDeviationOperator,
+ "MaxLength": MaxLengthOperator,
+ "MinLength": MinLengthOperator,
+ "PatternMatch": PatternMatchOperator,
+ "Compliance": ComplianceOperator,
+ "Correlation": CorrelationOperator,
+ "CountDistinct": CountDistinctOperator,
+ "ApproxCountDistinct": ApproxCountDistinctOperator,
+ "ApproxQuantile": ApproxQuantileOperator,
+}
+
+# Register all grouping operators
+OperatorFactory._grouping_registry = {
+ "Distinctness": DistinctnessOperator,
+ "Uniqueness": UniquenessOperator,
+ "UniqueValueRatio": UniqueValueRatioOperator,
+ "Entropy": EntropyOperator,
+ "MutualInformation": MutualInformationOperator,
+ "Histogram": HistogramOperator,
+}
+
+# Register all metadata operators
+OperatorFactory._metadata_registry = {
+ "DataType": DataTypeOperator,
+}
+
+
+__all__ = [
+ "OperatorFactory",
+]
diff --git a/pydeequ/engines/operators/grouping_batcher.py b/pydeequ/engines/operators/grouping_batcher.py
new file mode 100644
index 0000000..db75641
--- /dev/null
+++ b/pydeequ/engines/operators/grouping_batcher.py
@@ -0,0 +1,220 @@
+# -*- coding: utf-8 -*-
+"""
+Grouping operator batching for DuckDB performance optimization.
+
+This module provides functionality to batch grouping operators that share
+identical CTEs (same columns and where clause) into single queries.
+
+Key insight: DistinctnessOperator, UniquenessOperator, and UniqueValueRatioOperator
+all use the same frequency CTE:
+ WITH freq AS (SELECT cols, COUNT(*) AS cnt FROM table GROUP BY cols)
+
+By fusing operators with matching (columns, where_clause), we can:
+- Compute all metrics in a single query
+- Reduce the number of table scans
+- Improve performance by 20-40% for checks with multiple grouping operators
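+
+Example usage (a sketch; assumes a DuckDB connection `con` and a table `events`):
+
+    ops = [DistinctnessOperator(["user_id"]), UniquenessOperator(["user_id"])]
+    batcher = GroupingOperatorBatcher(ops)
+    results = batcher.execute_batched(
+        table="events",
+        execute_fn=lambda sql: con.execute(sql).df(),
+    )
+    # Both metrics come back from one fused query: len(results) == 2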
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple
+
+from pydeequ.engines import MetricResult
+from pydeequ.engines.operators.grouping_operators import (
+ DistinctnessOperator,
+ UniquenessOperator,
+ UniqueValueRatioOperator,
+)
+
+if TYPE_CHECKING:
+ import pandas as pd
+ from pydeequ.engines.operators.base import GroupingOperator
+
+
+# Operators that can be batched together (share same freq CTE structure)
+BATCHABLE_OPERATORS = (
+    DistinctnessOperator,
+    UniquenessOperator,
+    UniqueValueRatioOperator,
+)
+
+
+class GroupingOperatorBatcher:
+ """
+ Batches grouping operators with matching (columns, where) into single queries.
+
+ This class analyzes grouping operators and fuses compatible ones to reduce
+ the number of SQL queries executed.
+ """
+
+ def __init__(self, operators: List["GroupingOperator"]):
+ """
+ Initialize the batcher with operators to analyze.
+
+ Args:
+ operators: List of grouping operators
+ """
+ self.operators = operators
+ self._batched_groups: Dict[Tuple, List["GroupingOperator"]] = {}
+ self._unbatchable: List["GroupingOperator"] = []
+ self._analyze()
+
+ def _get_batch_key(self, operator: "GroupingOperator") -> Optional[Tuple]:
+ """
+ Get the batch key for an operator (columns tuple + where clause).
+
+ Returns None if the operator cannot be batched.
+ """
+ if not isinstance(operator, BATCHABLE_OPERATORS):
+ return None
+
+ # Create key from (columns tuple, where clause)
+ cols = tuple(operator.columns)
+ where = operator.where or ""
+ return (cols, where)
+
+ def _analyze(self) -> None:
+ """Analyze operators and group batchable ones by key."""
+ for operator in self.operators:
+ key = self._get_batch_key(operator)
+ if key is None:
+ self._unbatchable.append(operator)
+ else:
+ if key not in self._batched_groups:
+ self._batched_groups[key] = []
+ self._batched_groups[key].append(operator)
+
+ def get_unbatchable_operators(self) -> List["GroupingOperator"]:
+ """Return operators that cannot be batched."""
+ return self._unbatchable
+
+ def get_batch_count(self) -> int:
+ """Return the number of batched query groups."""
+ return len(self._batched_groups)
+
+    def execute_batched(
+        self,
+        table: str,
+        execute_fn: Callable[[str], "pd.DataFrame"],
+    ) -> List[MetricResult]:
+ """
+ Execute batched queries and return results.
+
+ Args:
+ table: Name of the table to query
+ execute_fn: Function to execute SQL and return DataFrame
+
+ Returns:
+ List of MetricResult objects for all batched operators
+ """
+ results: List[MetricResult] = []
+
+ for (cols, where), operators in self._batched_groups.items():
+ # Build fused query
+ query = self._build_fused_query(table, cols, where, operators)
+
+ # Execute query
+ df = execute_fn(query)
+
+ # Extract results for each operator
+ for operator in operators:
+ result = self._extract_result(df, operator)
+ results.append(result)
+
+ return results
+
+ def _build_fused_query(
+ self,
+ table: str,
+ cols: Tuple[str, ...],
+ where: str,
+ operators: List["GroupingOperator"],
+ ) -> str:
+ """
+ Build a fused query that computes metrics for all operators in a batch.
+
+ Args:
+ table: Name of the table to query
+ cols: Tuple of column names
+ where: WHERE clause (empty string if none)
+ operators: List of operators to fuse
+
+ Returns:
+ SQL query string
+ """
+ cols_str = ", ".join(cols)
+ where_clause = f"WHERE {where}" if where else ""
+
+ # Determine which metrics we need to compute
+ needs_distinct = any(isinstance(op, DistinctnessOperator) for op in operators)
+ needs_unique = any(isinstance(op, UniquenessOperator) for op in operators)
+ needs_unique_ratio = any(isinstance(op, UniqueValueRatioOperator) for op in operators)
+
+ # Build SELECT clause
+ select_parts = []
+
+        # total_count is needed for Distinctness and Uniqueness
+ if needs_distinct or needs_unique:
+ select_parts.append("SUM(cnt) AS total_count")
+
+ # distinct_count needed for Distinctness and UniqueValueRatio
+ if needs_distinct or needs_unique_ratio:
+ select_parts.append("COUNT(*) AS distinct_count")
+
+ # unique_count needed for Uniqueness and UniqueValueRatio
+ if needs_unique or needs_unique_ratio:
+ select_parts.append("SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count")
+
+ return f"""
+ WITH freq AS (
+ SELECT {cols_str}, COUNT(*) AS cnt
+ FROM {table}
+ {where_clause}
+ GROUP BY {cols_str}
+ )
+ SELECT {', '.join(select_parts)}
+ FROM freq
+ """
+
+ def _extract_result(
+ self,
+ df: "pd.DataFrame",
+ operator: "GroupingOperator",
+ ) -> MetricResult:
+ """
+ Extract the metric result for a specific operator from the fused query result.
+
+ Args:
+ df: DataFrame containing fused query results
+ operator: The operator to extract result for
+
+ Returns:
+ MetricResult for the operator
+ """
+ if isinstance(operator, DistinctnessOperator):
+ distinct = operator.safe_float(df, "distinct_count") or 0
+ total = operator.safe_float(df, "total_count") or 0
+ value = distinct / total if total > 0 else 0.0
+
+ elif isinstance(operator, UniquenessOperator):
+ unique = operator.safe_float(df, "unique_count") or 0
+ total = operator.safe_float(df, "total_count") or 0
+ value = unique / total if total > 0 else 0.0
+
+ elif isinstance(operator, UniqueValueRatioOperator):
+ distinct = operator.safe_float(df, "distinct_count") or 0
+ unique = operator.safe_float(df, "unique_count") or 0
+ value = unique / distinct if distinct > 0 else 0.0
+
+ else:
+ # Fallback (shouldn't happen for batchable operators)
+ value = 0.0
+
+ return MetricResult(
+ name=operator.metric_name,
+ instance=operator.instance,
+ entity=operator.entity,
+ value=value,
+ )
+
+
+__all__ = [
+ "GroupingOperatorBatcher",
+ "BATCHABLE_OPERATORS",
+]
diff --git a/pydeequ/engines/operators/grouping_operators.py b/pydeequ/engines/operators/grouping_operators.py
new file mode 100644
index 0000000..702cfac
--- /dev/null
+++ b/pydeequ/engines/operators/grouping_operators.py
@@ -0,0 +1,334 @@
+# -*- coding: utf-8 -*-
+"""
+Grouping operator implementations.
+
+Grouping operators require GROUP BY queries and cannot be batched with scan
+operators, so each runs as its own query (compatible ones can, however, be
+fused by GroupingOperatorBatcher; see grouping_batcher.py).
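+
+Example usage (sketch; assumes a DuckDB connection `con` and a table `t`):
+
+    op = UniquenessOperator(["user_id"])
+    df = con.execute(op.build_query("t")).df()
+    metric = op.extract_result(df)  # MetricResult(name="Uniqueness", ...)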
+"""
+
+from __future__ import annotations
+
+import json
+
+from typing import TYPE_CHECKING, List, Optional
+
+from pydeequ.engines import MetricResult
+from pydeequ.engines.operators.base import GroupingOperator
+
+if TYPE_CHECKING:
+ import pandas as pd
+
+
+class DistinctnessOperator(GroupingOperator):
+ """
+ Computes distinctness = count_distinct / total_count.
+
+ Distinctness measures what fraction of the total rows have
+ unique value combinations in the specified columns.
+ """
+
+ def __init__(self, columns: List[str], where: Optional[str] = None):
+ super().__init__(columns, where)
+
+ @property
+ def metric_name(self) -> str:
+ return "Distinctness"
+
+ def get_grouping_columns(self) -> List[str]:
+ return self.columns
+
+ def build_query(self, table: str) -> str:
+ cols_str = ", ".join(self.columns)
+ where_clause = self.get_where_clause()
+
+ return f"""
+ WITH freq AS (
+ SELECT {cols_str}, COUNT(*) AS cnt
+ FROM {table}
+ {where_clause}
+ GROUP BY {cols_str}
+ )
+ SELECT
+ COUNT(*) AS distinct_count,
+ SUM(cnt) AS total_count
+ FROM freq
+ """
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ distinct = self.safe_float(df, "distinct_count") or 0
+ total = self.safe_float(df, "total_count") or 0
+ value = distinct / total if total > 0 else 0.0
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class UniquenessOperator(GroupingOperator):
+ """
+ Computes uniqueness = count_unique (count=1) / total_count.
+
+ Uniqueness measures what fraction of the total rows have
+ value combinations that appear exactly once.
+ """
+
+ def __init__(self, columns: List[str], where: Optional[str] = None):
+ super().__init__(columns, where)
+
+ @property
+ def metric_name(self) -> str:
+ return "Uniqueness"
+
+ def get_grouping_columns(self) -> List[str]:
+ return self.columns
+
+ def build_query(self, table: str) -> str:
+ cols_str = ", ".join(self.columns)
+ where_clause = self.get_where_clause()
+
+ return f"""
+ WITH freq AS (
+ SELECT {cols_str}, COUNT(*) AS cnt
+ FROM {table}
+ {where_clause}
+ GROUP BY {cols_str}
+ )
+ SELECT
+ SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count,
+ SUM(cnt) AS total_count
+ FROM freq
+ """
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ unique = self.safe_float(df, "unique_count") or 0
+ total = self.safe_float(df, "total_count") or 0
+ value = unique / total if total > 0 else 0.0
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class UniqueValueRatioOperator(GroupingOperator):
+ """
+ Computes unique value ratio = count_unique / count_distinct.
+
+ This measures what fraction of distinct value combinations
+ appear exactly once.
+ """
+
+ def __init__(self, columns: List[str], where: Optional[str] = None):
+ super().__init__(columns, where)
+
+ @property
+ def metric_name(self) -> str:
+ return "UniqueValueRatio"
+
+ def get_grouping_columns(self) -> List[str]:
+ return self.columns
+
+ def build_query(self, table: str) -> str:
+ cols_str = ", ".join(self.columns)
+ where_clause = self.get_where_clause()
+
+ return f"""
+ WITH freq AS (
+ SELECT {cols_str}, COUNT(*) AS cnt
+ FROM {table}
+ {where_clause}
+ GROUP BY {cols_str}
+ )
+ SELECT
+ COUNT(*) AS distinct_count,
+ SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count
+ FROM freq
+ """
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ distinct = self.safe_float(df, "distinct_count") or 0
+ unique = self.safe_float(df, "unique_count") or 0
+ value = unique / distinct if distinct > 0 else 0.0
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class EntropyOperator(GroupingOperator):
+ """
+ Computes entropy = -SUM(p * ln(p)).
+
+ Entropy measures the information content of a column's
+ value distribution. Uses natural log (nats) for Spark parity.
+ """
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ super().__init__([column], where)
+ self.column = column
+
+ @property
+ def metric_name(self) -> str:
+ return "Entropy"
+
+ @property
+ def instance(self) -> str:
+ return self.column
+
+ @property
+ def entity(self) -> str:
+ return "Column"
+
+ def get_grouping_columns(self) -> List[str]:
+ return [self.column]
+
+ def build_query(self, table: str) -> str:
+ where_clause = self.get_where_clause()
+
+ return f"""
+ WITH freq AS (
+ SELECT {self.column}, COUNT(*) AS cnt
+ FROM {table}
+ {where_clause}
+ GROUP BY {self.column}
+ ),
+ total AS (
+ SELECT SUM(cnt) AS total_cnt FROM freq
+ )
+ SELECT
+ -SUM((cnt * 1.0 / total_cnt) * LN(cnt * 1.0 / total_cnt)) AS entropy
+ FROM freq, total
+ WHERE cnt > 0
+ """
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, "entropy")
+ if value is None:
+ value = 0.0
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class MutualInformationOperator(GroupingOperator):
+ """Computes mutual information between two columns."""
+
+ def __init__(self, columns: List[str], where: Optional[str] = None):
+ if len(columns) != 2:
+ raise ValueError("MutualInformation requires exactly 2 columns")
+ super().__init__(columns, where)
+
+ @property
+ def metric_name(self) -> str:
+ return "MutualInformation"
+
+ def get_grouping_columns(self) -> List[str]:
+ return self.columns
+
+ def build_query(self, table: str) -> str:
+ col1, col2 = self.columns
+ where_clause = self.get_where_clause()
+
+ return f"""
+ WITH
+ joint AS (
+ SELECT {col1}, {col2}, COUNT(*) AS cnt
+ FROM {table}
+ {where_clause}
+ GROUP BY {col1}, {col2}
+ ),
+ total AS (SELECT SUM(cnt) AS n FROM joint),
+ marginal1 AS (
+ SELECT {col1}, SUM(cnt) AS cnt1 FROM joint GROUP BY {col1}
+ ),
+ marginal2 AS (
+ SELECT {col2}, SUM(cnt) AS cnt2 FROM joint GROUP BY {col2}
+ )
+ SELECT SUM(
+ (j.cnt * 1.0 / t.n) *
+ LN((j.cnt * 1.0 / t.n) / ((m1.cnt1 * 1.0 / t.n) * (m2.cnt2 * 1.0 / t.n)))
+ ) AS mi
+ FROM joint j, total t, marginal1 m1, marginal2 m2
+ WHERE j.{col1} = m1.{col1} AND j.{col2} = m2.{col2} AND j.cnt > 0
+ """
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, "mi")
+ if value is None:
+ value = 0.0
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class HistogramOperator(GroupingOperator):
+ """
+ Computes histogram of value distribution in a column.
+
+ Returns a JSON-serialized dict mapping values to their counts.
+ """
+
+ def __init__(self, column: str, max_bins: int = 100, where: Optional[str] = None):
+ super().__init__([column], where)
+ self.column = column
+ self.max_bins = max_bins
+
+ @property
+ def metric_name(self) -> str:
+ return "Histogram"
+
+ @property
+ def instance(self) -> str:
+ return self.column
+
+ @property
+ def entity(self) -> str:
+ return "Column"
+
+ def get_grouping_columns(self) -> List[str]:
+ return [self.column]
+
+ def build_query(self, table: str) -> str:
+ where_clause = self.get_where_clause()
+ if where_clause:
+ where_clause += f" AND {self.column} IS NOT NULL"
+ else:
+ where_clause = f"WHERE {self.column} IS NOT NULL"
+
+ return f"""
+ SELECT {self.column} as value, COUNT(*) as count
+ FROM {table}
+ {where_clause}
+ GROUP BY {self.column}
+ ORDER BY count DESC
+ LIMIT {self.max_bins}
+ """
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ histogram = {str(row["value"]): int(row["count"]) for _, row in df.iterrows()}
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=json.dumps(histogram),
+ )
+
+
+__all__ = [
+ "DistinctnessOperator",
+ "UniquenessOperator",
+ "UniqueValueRatioOperator",
+ "EntropyOperator",
+ "MutualInformationOperator",
+ "HistogramOperator",
+]
diff --git a/pydeequ/engines/operators/metadata_operators.py b/pydeequ/engines/operators/metadata_operators.py
new file mode 100644
index 0000000..963a2a1
--- /dev/null
+++ b/pydeequ/engines/operators/metadata_operators.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+"""
+Metadata operator implementations.
+
+Metadata operators compute metrics using schema information rather than
+SQL aggregations. They are useful for type inference and other schema-based
+analyses that do not require scanning the data.
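+
+Example usage (sketch; the schema dict maps column names to SQL type strings):
+
+    op = DataTypeOperator("price")
+    result = op.compute_from_schema({"price": "DECIMAL(18,3)", "name": "VARCHAR"})
+    # result.value is JSON with dtype, mapped_type ("Fractional"), type_counts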
+"""
+
+from __future__ import annotations
+
+import json
+from typing import Dict, Optional
+
+from pydeequ.engines import MetricResult
+from pydeequ.engines.operators.mixins import (
+ ColumnAliasMixin,
+ SafeExtractMixin,
+)
+
+
+class DataTypeOperator(SafeExtractMixin, ColumnAliasMixin):
+ """
+ Computes data type information from schema metadata.
+
+ Unlike scan operators that require SQL queries, DataTypeOperator
+ infers type information directly from the table schema, making it
+ more efficient for type analysis.
+
+ Type Mapping:
+ DuckDB types are mapped to Deequ-compatible type categories:
+ - Integral: TINYINT, SMALLINT, INTEGER, BIGINT, HUGEINT, etc.
+ - Fractional: FLOAT, DOUBLE, REAL, DECIMAL, NUMERIC
+ - String: VARCHAR, CHAR, TEXT, etc.
+ - Boolean: BOOLEAN, BOOL
+
+ Attributes:
+ column: Column name to analyze
+ where: Optional WHERE clause (ignored for schema-based inference)
+ """
+
+ # Mapping from DuckDB SQL types to Deequ type categories
+ TYPE_MAPPING: Dict[str, str] = {
+ # Integral types
+ "TINYINT": "Integral",
+ "SMALLINT": "Integral",
+ "INTEGER": "Integral",
+ "BIGINT": "Integral",
+ "HUGEINT": "Integral",
+ "UTINYINT": "Integral",
+ "USMALLINT": "Integral",
+ "UINTEGER": "Integral",
+ "UBIGINT": "Integral",
+ "INT": "Integral",
+ "INT1": "Integral",
+ "INT2": "Integral",
+ "INT4": "Integral",
+ "INT8": "Integral",
+ # Fractional types
+ "FLOAT": "Fractional",
+ "DOUBLE": "Fractional",
+ "REAL": "Fractional",
+ "DECIMAL": "Fractional",
+ "NUMERIC": "Fractional",
+ "FLOAT4": "Fractional",
+ "FLOAT8": "Fractional",
+ # String types
+ "VARCHAR": "String",
+ "CHAR": "String",
+ "BPCHAR": "String",
+ "TEXT": "String",
+ "STRING": "String",
+ # Boolean types
+ "BOOLEAN": "Boolean",
+ "BOOL": "Boolean",
+ # Date/Time types (mapped to String for Deequ compatibility)
+ "DATE": "String",
+ "TIMESTAMP": "String",
+ "TIME": "String",
+        "TIMESTAMPTZ": "String",
+        "TIMESTAMP WITH TIME ZONE": "String",
+        "TIMETZ": "String",
+        "TIME WITH TIME ZONE": "String",
+ "INTERVAL": "String",
+ # Binary types
+ "BLOB": "Unknown",
+ "BYTEA": "Unknown",
+ # UUID
+ "UUID": "String",
+ }
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ """
+ Initialize DataTypeOperator.
+
+ Args:
+ column: Column name to analyze
+ where: Optional WHERE clause (ignored for schema-based inference)
+ """
+ self.column = column
+ self.where = where # Stored but ignored for schema-based type inference
+
+ @property
+ def metric_name(self) -> str:
+ """Return the metric name for this operator."""
+ return "DataType"
+
+ @property
+ def instance(self) -> str:
+ """Return the instance identifier for this operator."""
+ return self.column
+
+ @property
+ def entity(self) -> str:
+ """Return the entity type for this operator."""
+ return "Column"
+
+ def compute_from_schema(self, schema: Dict[str, str]) -> MetricResult:
+ """
+ Compute data type information from schema.
+
+ Args:
+ schema: Dictionary mapping column names to SQL type names
+
+ Returns:
+ MetricResult with JSON-encoded type information
+ """
+        sql_type = schema.get(self.column, "Unknown")
+        # Strip type parameters so e.g. "DECIMAL(18,3)" looks up as "DECIMAL"
+        base_type = sql_type.split("(")[0].strip().upper()
+        mapped_type = self.TYPE_MAPPING.get(base_type, "Unknown")
+
+ # Build result compatible with Spark Deequ format
+ result = {
+ "dtype": sql_type,
+ "mapped_type": mapped_type,
+ "type_counts": {mapped_type: 1.0} # DuckDB has strict typing
+ }
+
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=json.dumps(result),
+ )
+
+
+__all__ = [
+ "DataTypeOperator",
+]
diff --git a/pydeequ/engines/operators/mixins.py b/pydeequ/engines/operators/mixins.py
new file mode 100644
index 0000000..c68dfad
--- /dev/null
+++ b/pydeequ/engines/operators/mixins.py
@@ -0,0 +1,188 @@
+# -*- coding: utf-8 -*-
+"""
+Mixin classes providing shared behaviors for SQL operators.
+
+These mixins provide reusable functionality that eliminates code duplication
+across operator implementations.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+ import pandas as pd
+
+
+class WhereClauseMixin:
+ """
+ Provides WHERE clause wrapping for conditional aggregations.
+
+ This mixin eliminates the repeated if/else pattern for handling
+ optional WHERE clauses in aggregations. Expects the class to
+ have a `where` attribute.
+ """
+
+ where: Optional[str]
+
+ def wrap_agg_with_where(self, agg_func: str, column: str) -> str:
+ """
+ Wrap an aggregation with optional WHERE filter using CASE WHEN.
+
+ Args:
+ agg_func: SQL aggregation function name (e.g., "AVG", "SUM", "MIN")
+ column: Column name to aggregate
+
+ Returns:
+ SQL expression with conditional aggregation if where is set,
+ otherwise standard aggregation
+
+ Example:
+ >>> op = SomeOperator(column="price", where="status='active'")
+ >>> op.wrap_agg_with_where("AVG", "price")
+ "AVG(CASE WHEN status='active' THEN price ELSE NULL END)"
+ """
+ if self.where:
+ return f"{agg_func}(CASE WHEN {self.where} THEN {column} ELSE NULL END)"
+ return f"{agg_func}({column})"
+
+ def wrap_count_with_where(self, condition: str = "1") -> str:
+ """
+ Wrap COUNT with optional WHERE filter.
+
+ Args:
+ condition: SQL condition to count (default "1" counts all rows)
+
+ Returns:
+ SQL expression for conditional count
+
+ Example:
+ >>> op = SomeOperator(where="status='active'")
+ >>> op.wrap_count_with_where()
+ "SUM(CASE WHEN status='active' THEN 1 ELSE 0 END)"
+ >>> op.wrap_count_with_where("price > 0")
+ "SUM(CASE WHEN status='active' AND (price > 0) THEN 1 ELSE 0 END)"
+ """
+ if self.where:
+ if condition == "1":
+ return f"SUM(CASE WHEN {self.where} THEN 1 ELSE 0 END)"
+ return f"SUM(CASE WHEN ({self.where}) AND ({condition}) THEN 1 ELSE 0 END)"
+ if condition == "1":
+ return "COUNT(*)"
+ return f"SUM(CASE WHEN {condition} THEN 1 ELSE 0 END)"
+
+ def get_where_clause(self) -> str:
+ """
+ Get WHERE clause for standalone queries.
+
+ Returns:
+ "WHERE {condition}" if where is set, otherwise empty string
+ """
+ if self.where:
+ return f"WHERE {self.where}"
+ return ""
+
+
+class SafeExtractMixin:
+ """
+ Provides safe value extraction from DataFrames.
+
+ This mixin standardizes the pattern of safely extracting values
+ from query result DataFrames, handling NULL and NaN values.
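+
+    Example (SomeOperator stands in for any class using this mixin):
+        >>> import pandas as pd
+        >>> op = SomeOperator()
+        >>> df = pd.DataFrame({"mean_price": [float("nan")]})
+        >>> op.safe_float(df, "mean_price") is None  # NaN counts as missing
+        True
+        >>> op.safe_float(df, "absent_col") is None  # so do absent columns
+        True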
+ """
+
+ def safe_float(self, df: "pd.DataFrame", column: str) -> Optional[float]:
+ """
+ Extract float value from DataFrame, handling NULL/NaN.
+
+ Args:
+ df: DataFrame containing query results
+ column: Column name to extract
+
+ Returns:
+ Float value or None if not present/invalid
+ """
+ import pandas as pd
+
+ if column not in df.columns:
+ return None
+ val = df[column].iloc[0]
+ if val is not None and not pd.isna(val):
+ return float(val)
+ return None
+
+ def safe_int(self, df: "pd.DataFrame", column: str) -> Optional[int]:
+ """
+ Extract int value from DataFrame, handling NULL/NaN.
+
+ Args:
+ df: DataFrame containing query results
+ column: Column name to extract
+
+ Returns:
+ Integer value or None if not present/invalid
+ """
+ val = self.safe_float(df, column)
+ return int(val) if val is not None else None
+
+ def safe_string(self, df: "pd.DataFrame", column: str) -> Optional[str]:
+ """
+ Extract string value from DataFrame, handling NULL/NaN.
+
+ Args:
+ df: DataFrame containing query results
+ column: Column name to extract
+
+ Returns:
+ String value or None if not present/invalid
+ """
+ import pandas as pd
+
+ if column not in df.columns:
+ return None
+ val = df[column].iloc[0]
+ if val is not None and not pd.isna(val):
+ return str(val)
+ return None
+
+
+class ColumnAliasMixin:
+ """
+ Provides consistent column alias generation.
+
+ This mixin ensures all operators generate unique and predictable
+ column aliases for their SQL expressions.
+ """
+
+ def make_alias(self, prefix: str, *parts: str) -> str:
+ """
+ Generate unique column alias from prefix and parts.
+
+ Args:
+ prefix: Alias prefix (e.g., "mean", "count", "sum")
+ *parts: Additional parts to include (e.g., column names)
+
+ Returns:
+ Underscore-separated alias with sanitized column names
+
+ Example:
+ >>> op = SomeOperator()
+ >>> op.make_alias("mean", "price")
+ "mean_price"
+ >>> op.make_alias("corr", "price", "quantity")
+ "corr_price_quantity"
+ """
+ # Sanitize parts: replace dots and other special chars
+ sanitized = []
+ for p in parts:
+ if p:
+ sanitized.append(p.replace(".", "_").replace(" ", "_"))
+ suffix = "_".join(sanitized)
+ return f"{prefix}_{suffix}" if suffix else prefix
+
+
+__all__ = [
+ "WhereClauseMixin",
+ "SafeExtractMixin",
+ "ColumnAliasMixin",
+]
diff --git a/pydeequ/engines/operators/profiling_operators.py b/pydeequ/engines/operators/profiling_operators.py
new file mode 100644
index 0000000..9e4a892
--- /dev/null
+++ b/pydeequ/engines/operators/profiling_operators.py
@@ -0,0 +1,449 @@
+# -*- coding: utf-8 -*-
+"""
+Profiling operator implementations.
+
+Profiling operators compute column profile statistics including completeness,
+distinct values, min, max, mean, sum, stddev, percentiles, and histograms.
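+
+Example usage (sketch; assumes a DuckDB connection `con` and a table `t`):
+
+    op = ColumnProfileOperator("price", "DOUBLE")
+    base = op.extract_base_result(con.execute(op.build_base_query("t")).df())
+    pcts = op.extract_percentile_result(
+        con.execute(op.build_percentile_query("t")).df()
+    )
+    profile = op.build_profile(base, percentiles=pcts)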
+"""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, Dict, List, Optional, Set
+
+from pydeequ.engines import ColumnProfile
+from pydeequ.engines.operators.mixins import (
+ ColumnAliasMixin,
+ SafeExtractMixin,
+ WhereClauseMixin,
+)
+
+if TYPE_CHECKING:
+ import pandas as pd
+
+
+# SQL types that are considered numeric
+NUMERIC_TYPES: Set[str] = {
+ "TINYINT", "SMALLINT", "INTEGER", "BIGINT", "HUGEINT",
+ "UTINYINT", "USMALLINT", "UINTEGER", "UBIGINT",
+ "FLOAT", "DOUBLE", "REAL", "DECIMAL", "NUMERIC",
+ "INT", "INT1", "INT2", "INT4", "INT8",
+ "FLOAT4", "FLOAT8",
+}
+
+# SQL types that are considered string
+STRING_TYPES: Set[str] = {"VARCHAR", "CHAR", "BPCHAR", "TEXT", "STRING"}
+
+
+class ColumnProfileOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin):
+ """
+ Computes all profile statistics for a column.
+
+ This operator generates SQL queries to compute completeness, distinct values,
+ and (for numeric columns) min, max, mean, sum, stddev, and percentiles.
+
+ Attributes:
+ column: Column name to profile
+ column_type: SQL type of the column (e.g., "INTEGER", "VARCHAR")
+ is_numeric: Whether the column is a numeric type
+ compute_percentiles: Whether to compute percentile statistics
+ compute_histogram: Whether to compute value histogram
+ histogram_limit: Maximum number of histogram buckets
+ where: Optional WHERE clause for filtering
+ """
+
+ def __init__(
+ self,
+ column: str,
+ column_type: str,
+ compute_percentiles: bool = True,
+ compute_histogram: bool = False,
+ histogram_limit: int = 100,
+ where: Optional[str] = None,
+ ):
+ """
+ Initialize ColumnProfileOperator.
+
+ Args:
+ column: Column name to profile
+ column_type: SQL type of the column
+ compute_percentiles: Whether to compute percentile statistics
+ compute_histogram: Whether to compute value histogram
+ histogram_limit: Maximum number of histogram buckets
+ where: Optional WHERE clause for filtering
+ """
+ self.column = column
+ self.column_type = column_type
+ self.is_numeric = column_type in NUMERIC_TYPES
+ self.compute_percentiles = compute_percentiles and self.is_numeric
+ self.compute_histogram = compute_histogram
+ self.histogram_limit = histogram_limit
+ self.where = where
+
+ def build_base_query(self, table: str) -> str:
+ """
+ Build query for basic statistics.
+
+ Args:
+ table: Table name to query
+
+ Returns:
+ SQL query string for base statistics
+ """
+        col = self.column
+        where_clause = self.get_where_clause()
+        if self.is_numeric:
+            query = f"""
+            SELECT
+                COUNT(*) as total,
+                SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) as null_count,
+                APPROX_COUNT_DISTINCT({col}) as distinct_count,
+                MIN({col}) as min_val,
+                MAX({col}) as max_val,
+                AVG({col}) as mean_val,
+                SUM({col}) as sum_val,
+                STDDEV_POP({col}) as stddev_val
+            FROM {table}
+            {where_clause}
+            """
+        else:
+            query = f"""
+            SELECT
+                COUNT(*) as total,
+                SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) as null_count,
+                APPROX_COUNT_DISTINCT({col}) as distinct_count
+            FROM {table}
+            {where_clause}
+            """
+        return query.strip()
+
+ def build_percentile_query(self, table: str) -> str:
+ """
+ Build query for percentiles (separate query).
+
+ Args:
+ table: Table name to query
+
+ Returns:
+ SQL query string for percentile statistics
+ """
+        col = self.column
+        return f"""
+        SELECT
+            QUANTILE_CONT({col}, 0.25) as p25,
+            QUANTILE_CONT({col}, 0.50) as p50,
+            QUANTILE_CONT({col}, 0.75) as p75
+        FROM {table}
+        {self.get_where_clause()}
+        """.strip()
+
+ def build_histogram_query(self, table: str) -> str:
+ """
+ Build query for histogram (separate query).
+
+ Args:
+ table: Table name to query
+
+ Returns:
+ SQL query string for histogram
+ """
+        col = self.column
+        # Combine the optional row filter with the NOT NULL guard
+        not_null = f"{col} IS NOT NULL"
+        if self.where:
+            where_clause = f"WHERE ({self.where}) AND {not_null}"
+        else:
+            where_clause = f"WHERE {not_null}"
+        return f"""
+        SELECT {col} as value, COUNT(*) as count
+        FROM {table}
+        {where_clause}
+        GROUP BY {col}
+        ORDER BY count DESC
+        LIMIT {self.histogram_limit}
+        """.strip()
+
+ def extract_base_result(self, df: "pd.DataFrame") -> Dict:
+ """
+ Extract base statistics from query result.
+
+ Args:
+ df: DataFrame containing query results
+
+ Returns:
+ Dictionary of extracted statistics
+ """
+ import pandas as pd
+
+ total = int(df["total"].iloc[0])
+
+ # Handle NaN for empty datasets
+ null_count_raw = df["null_count"].iloc[0]
+ null_count = int(null_count_raw) if not pd.isna(null_count_raw) else 0
+
+ distinct_count_raw = df["distinct_count"].iloc[0]
+ distinct_count = int(distinct_count_raw) if not pd.isna(distinct_count_raw) else 0
+
+ completeness = (total - null_count) / total if total > 0 else 1.0
+
+ result = {
+ "total": total,
+ "null_count": null_count,
+ "distinct_count": distinct_count,
+ "completeness": completeness,
+ }
+
+ if self.is_numeric:
+ result["minimum"] = self.safe_float(df, "min_val")
+ result["maximum"] = self.safe_float(df, "max_val")
+ result["mean"] = self.safe_float(df, "mean_val")
+ result["sum"] = self.safe_float(df, "sum_val")
+ result["std_dev"] = self.safe_float(df, "stddev_val")
+
+ return result
+
+ def extract_percentile_result(self, df: "pd.DataFrame") -> Optional[str]:
+ """
+ Extract percentile statistics from query result.
+
+ Args:
+ df: DataFrame containing percentile query results
+
+ Returns:
+            JSON string of the 0.25/0.50/0.75 percentile values (individual
+            entries may be null when a statistic cannot be computed)
+ """
+ p25 = self.safe_float(df, "p25")
+ p50 = self.safe_float(df, "p50")
+ p75 = self.safe_float(df, "p75")
+
+ return json.dumps({
+ "0.25": p25,
+ "0.50": p50,
+ "0.75": p75,
+ })
+
+ def extract_histogram_result(self, df: "pd.DataFrame") -> Optional[str]:
+ """
+ Extract histogram from query result.
+
+ Args:
+ df: DataFrame containing histogram query results
+
+ Returns:
+            JSON string mapping each value to its count
+ """
+ histogram = {
+ str(row["value"]): int(row["count"])
+ for _, row in df.iterrows()
+ }
+ return json.dumps(histogram)
+
+ def build_profile(
+ self,
+ base_stats: Dict,
+ percentiles: Optional[str] = None,
+ histogram: Optional[str] = None,
+ ) -> ColumnProfile:
+ """
+ Build ColumnProfile from extracted statistics.
+
+ Args:
+ base_stats: Dictionary of base statistics
+ percentiles: JSON string of percentile values
+ histogram: JSON string of histogram
+
+ Returns:
+ ColumnProfile object
+ """
+ profile = ColumnProfile(
+ column=self.column,
+ completeness=base_stats["completeness"],
+ approx_distinct_values=base_stats["distinct_count"],
+ data_type=self.column_type,
+ is_data_type_inferred=True,
+ )
+
+ if self.is_numeric:
+ profile.minimum = base_stats.get("minimum")
+ profile.maximum = base_stats.get("maximum")
+ profile.mean = base_stats.get("mean")
+ profile.sum = base_stats.get("sum")
+ profile.std_dev = base_stats.get("std_dev")
+
+ if percentiles:
+ profile.approx_percentiles = percentiles
+
+ if histogram:
+ profile.histogram = histogram
+
+ return profile
+
+
+class MultiColumnProfileOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin):
+ """
+ Profiles multiple columns in minimal queries.
+
+ This operator batches profile statistics for multiple columns to reduce
+ the number of SQL queries needed for profiling.
+
+ Attributes:
+ columns: List of column names to profile
+ schema: Dictionary mapping column names to SQL types
+ numeric_columns: List of numeric column names
+ string_columns: List of string column names
+ where: Optional WHERE clause for filtering
+ """
+
+ def __init__(
+ self,
+ columns: List[str],
+ schema: Dict[str, str],
+ where: Optional[str] = None,
+ ):
+ """
+ Initialize MultiColumnProfileOperator.
+
+ Args:
+ columns: List of column names to profile
+ schema: Dictionary mapping column names to SQL types
+ where: Optional WHERE clause for filtering
+ """
+ self.columns = columns
+ self.schema = schema
+ self.where = where
+
+ # Categorize columns by type
+ self.numeric_columns = [c for c in columns if schema.get(c) in NUMERIC_TYPES]
+ self.string_columns = [c for c in columns if schema.get(c) in STRING_TYPES]
+ self.other_columns = [
+ c for c in columns
+ if c not in self.numeric_columns and c not in self.string_columns
+ ]
+
+ def build_completeness_query(self, table: str) -> str:
+ """
+ Build query for completeness of all columns.
+
+ Args:
+ table: Table name to query
+
+ Returns:
+ SQL query string
+ """
+ aggregations = ["COUNT(*) as total"]
+ for col in self.columns:
+ aggregations.append(
+ f"SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) as null_{col}"
+ )
+ aggregations.append(f"APPROX_COUNT_DISTINCT({col}) as distinct_{col}")
+
+        return f"SELECT {', '.join(aggregations)} FROM {table} {self.get_where_clause()}".strip()
+
+ def build_numeric_stats_query(self, table: str) -> str:
+ """
+ Build query for numeric column statistics.
+
+ Args:
+ table: Table name to query
+
+ Returns:
+ SQL query string
+ """
+ if not self.numeric_columns:
+ return ""
+
+ aggregations = []
+ for col in self.numeric_columns:
+ aggregations.extend([
+ f"MIN({col}) as min_{col}",
+ f"MAX({col}) as max_{col}",
+ f"AVG({col}) as mean_{col}",
+ f"SUM({col}) as sum_{col}",
+ f"STDDEV_POP({col}) as stddev_{col}",
+ ])
+
+        return f"SELECT {', '.join(aggregations)} FROM {table} {self.get_where_clause()}".strip()
+
+ def build_percentile_query(self, table: str) -> str:
+ """
+ Build query for percentiles of all numeric columns.
+
+ Args:
+ table: Table name to query
+
+ Returns:
+ SQL query string (empty if no numeric columns)
+ """
+ if not self.numeric_columns:
+ return ""
+
+ aggregations = []
+ for col in self.numeric_columns:
+ aggregations.extend([
+ f"QUANTILE_CONT({col}, 0.25) as p25_{col}",
+ f"QUANTILE_CONT({col}, 0.50) as p50_{col}",
+ f"QUANTILE_CONT({col}, 0.75) as p75_{col}",
+ ])
+
+        return f"SELECT {', '.join(aggregations)} FROM {table} {self.get_where_clause()}".strip()
+
+ def extract_profiles(
+ self,
+ completeness_df: "pd.DataFrame",
+ numeric_df: Optional["pd.DataFrame"] = None,
+ percentile_df: Optional["pd.DataFrame"] = None,
+ ) -> List[ColumnProfile]:
+ """
+ Extract column profiles from query results.
+
+ Args:
+ completeness_df: DataFrame with completeness statistics
+ numeric_df: DataFrame with numeric statistics (optional)
+ percentile_df: DataFrame with percentile statistics (optional)
+
+ Returns:
+ List of ColumnProfile objects
+ """
+ import pandas as pd
+
+ profiles = []
+ total = int(completeness_df["total"].iloc[0])
+
+ for col in self.columns:
+ # Extract completeness stats
+ null_count_raw = completeness_df[f"null_{col}"].iloc[0]
+ null_count = int(null_count_raw) if not pd.isna(null_count_raw) else 0
+
+ distinct_count_raw = completeness_df[f"distinct_{col}"].iloc[0]
+ distinct_count = int(distinct_count_raw) if not pd.isna(distinct_count_raw) else 0
+
+ completeness = (total - null_count) / total if total > 0 else 1.0
+
+ profile = ColumnProfile(
+ column=col,
+ completeness=completeness,
+ approx_distinct_values=distinct_count,
+ data_type=self.schema.get(col, "Unknown"),
+ is_data_type_inferred=True,
+ )
+
+ # Add numeric stats if applicable
+ if col in self.numeric_columns and numeric_df is not None:
+ profile.minimum = self.safe_float(numeric_df, f"min_{col}")
+ profile.maximum = self.safe_float(numeric_df, f"max_{col}")
+ profile.mean = self.safe_float(numeric_df, f"mean_{col}")
+ profile.sum = self.safe_float(numeric_df, f"sum_{col}")
+ profile.std_dev = self.safe_float(numeric_df, f"stddev_{col}")
+
+ # Add percentiles if applicable
+ if col in self.numeric_columns and percentile_df is not None:
+ p25 = self.safe_float(percentile_df, f"p25_{col}")
+ p50 = self.safe_float(percentile_df, f"p50_{col}")
+ p75 = self.safe_float(percentile_df, f"p75_{col}")
+ profile.approx_percentiles = json.dumps({
+ "0.25": p25,
+ "0.50": p50,
+ "0.75": p75,
+ })
+
+ profiles.append(profile)
+
+ return profiles
+
+
+__all__ = [
+ "ColumnProfileOperator",
+ "MultiColumnProfileOperator",
+ "NUMERIC_TYPES",
+ "STRING_TYPES",
+]
diff --git a/pydeequ/engines/operators/protocols.py b/pydeequ/engines/operators/protocols.py
new file mode 100644
index 0000000..3d36c5a
--- /dev/null
+++ b/pydeequ/engines/operators/protocols.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+"""
+Protocol definitions for SQL operators.
+
+This module defines the structural typing contracts that operators must
+implement. Using Protocol from typing allows for duck typing while still
+providing IDE support and type checking.
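+
+Example (structural check, no inheritance needed; MeanOperator is one of the
+concrete operators in scan_operators.py):
+
+    from pydeequ.engines.operators import MeanOperator
+
+    op = MeanOperator("price")
+    isinstance(op, ScanOperatorProtocol)  # True, via @runtime_checkable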
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, List, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+ import pandas as pd
+ from pydeequ.engines import MetricResult
+
+
+@runtime_checkable
+class ScanOperatorProtocol(Protocol):
+ """
+ Contract for single-pass aggregation operators.
+
+ Scan operators compute metrics via SQL aggregations that can be
+ combined into a single SELECT statement, enabling efficient
+ batch execution.
+ """
+
+ def get_aggregations(self) -> List[str]:
+ """
+ Return SQL aggregation expressions.
+
+ Returns:
+ List of SQL aggregation expressions with AS alias clauses,
+ e.g., ["AVG(col) AS mean_col", "COUNT(*) AS count_col"]
+ """
+ ...
+
+ def extract_result(self, df: "pd.DataFrame") -> "MetricResult":
+ """
+ Extract metric from query result DataFrame.
+
+ Args:
+ df: DataFrame containing query results with columns
+ matching the aliases from get_aggregations()
+
+ Returns:
+ MetricResult with extracted value
+ """
+ ...
+
+
+@runtime_checkable
+class GroupingOperatorProtocol(Protocol):
+ """
+ Contract for operators requiring GROUP BY queries.
+
+ Grouping operators need to compute intermediate aggregations
+ via GROUP BY before computing the final metric. They cannot
+ be batched with scan operators and require separate queries.
+ """
+
+ def get_grouping_columns(self) -> List[str]:
+ """
+ Return columns to GROUP BY.
+
+ Returns:
+ List of column names for the GROUP BY clause
+ """
+ ...
+
+ def build_query(self, table: str) -> str:
+ """
+ Build complete CTE-based query.
+
+ Args:
+ table: Name of the table to query
+
+ Returns:
+ Complete SQL query string with CTEs as needed
+ """
+ ...
+
+ def extract_result(self, df: "pd.DataFrame") -> "MetricResult":
+ """
+ Extract metric from query result DataFrame.
+
+ Args:
+ df: DataFrame containing query results
+
+ Returns:
+ MetricResult with extracted value
+ """
+ ...
+
+
+__all__ = [
+ "ScanOperatorProtocol",
+ "GroupingOperatorProtocol",
+]
diff --git a/pydeequ/engines/operators/scan_operators.py b/pydeequ/engines/operators/scan_operators.py
new file mode 100644
index 0000000..b9a7f22
--- /dev/null
+++ b/pydeequ/engines/operators/scan_operators.py
@@ -0,0 +1,502 @@
+# -*- coding: utf-8 -*-
+"""
+Scan operator implementations.
+
+Scan operators compute metrics via SQL aggregations that can be combined
+into a single SELECT statement, enabling efficient batch execution.
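+
+Example (sketch; two operators batched into one query, assuming a DuckDB
+connection `con` and a table `t`):
+
+    ops = [CompletenessOperator("email"), MeanOperator("price")]
+    aggs = [agg for op in ops for agg in op.get_aggregations()]
+    df = con.execute(f"SELECT {', '.join(aggs)} FROM t").df()
+    metrics = [op.extract_result(df) for op in ops]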
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, List, Optional
+
+from pydeequ.engines import MetricResult
+from pydeequ.engines.operators.base import ScanOperator
+from pydeequ.engines.operators.mixins import (
+ ColumnAliasMixin,
+ SafeExtractMixin,
+ WhereClauseMixin,
+)
+
+if TYPE_CHECKING:
+ import pandas as pd
+
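+# Batching sketch (illustrative, engine-side): every scan operator's
+# aggregations can be fused into one SELECT; `execute`, `scan_ops`, and
+# `table` are assumed names, not defined in this module.
+#
+#     aggs = [a for op in scan_ops for a in op.get_aggregations()]
+#     df = execute(f"SELECT {', '.join(aggs)} FROM {table}")
+#     results = [op.extract_result(df) for op in scan_ops]
+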
+
+class SizeOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin):
+ """
+ Computes the number of rows in a table.
+
+ Unlike other scan operators, Size operates on the dataset level
+ rather than a specific column.
+ """
+
+ def __init__(self, where: Optional[str] = None):
+ self.where = where
+ self.alias = "size_value"
+
+ @property
+ def metric_name(self) -> str:
+ return "Size"
+
+ @property
+ def instance(self) -> str:
+ return "*"
+
+ @property
+ def entity(self) -> str:
+ return "Dataset"
+
+ def get_aggregations(self) -> List[str]:
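+ # With a WHERE predicate the filter is inlined as a conditional sum,
+ # e.g. SUM(CASE WHEN x > 0 THEN 1 ELSE 0 END) AS size_value, so the
+ # operator still batches into the shared single-pass SELECT.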
+ if self.where:
+ sql = f"SUM(CASE WHEN {self.where} THEN 1 ELSE 0 END) AS {self.alias}"
+ else:
+ sql = f"COUNT(*) AS {self.alias}"
+ return [sql]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class CompletenessOperator(ScanOperator):
+ """Computes the fraction of non-null values in a column."""
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.count_alias = self.make_alias("count", column)
+ self.null_alias = self.make_alias("null_count", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "Completeness"
+
+ def get_aggregations(self) -> List[str]:
+ count_sql = self.wrap_count_with_where("1")
+ null_sql = self.wrap_count_with_where(f"{self.column} IS NULL")
+ return [
+ f"{count_sql} AS {self.count_alias}",
+ f"{null_sql} AS {self.null_alias}",
+ ]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ total = self.safe_float(df, self.count_alias) or 0
+ nulls = self.safe_float(df, self.null_alias) or 0
+ value = (total - nulls) / total if total > 0 else 1.0
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class MeanOperator(ScanOperator):
+ """Computes the average of a numeric column."""
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.alias = self.make_alias("mean", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "Mean"
+
+ def get_aggregations(self) -> List[str]:
+ agg = self.wrap_agg_with_where("AVG", self.column)
+ return [f"{agg} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class SumOperator(ScanOperator):
+ """Computes the sum of a numeric column."""
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.alias = self.make_alias("sum", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "Sum"
+
+ def get_aggregations(self) -> List[str]:
+ agg = self.wrap_agg_with_where("SUM", self.column)
+ return [f"{agg} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class MinimumOperator(ScanOperator):
+ """Computes the minimum value of a numeric column."""
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.alias = self.make_alias("min", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "Minimum"
+
+ def get_aggregations(self) -> List[str]:
+ agg = self.wrap_agg_with_where("MIN", self.column)
+ return [f"{agg} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class MaximumOperator(ScanOperator):
+ """Computes the maximum value of a numeric column."""
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.alias = self.make_alias("max", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "Maximum"
+
+ def get_aggregations(self) -> List[str]:
+ agg = self.wrap_agg_with_where("MAX", self.column)
+ return [f"{agg} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class StandardDeviationOperator(ScanOperator):
+ """Computes the standard deviation of a numeric column."""
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.alias = self.make_alias("stddev", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "StandardDeviation"
+
+ def get_aggregations(self) -> List[str]:
+ agg = self.wrap_agg_with_where("STDDEV_POP", self.column)
+ return [f"{agg} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class MaxLengthOperator(ScanOperator):
+ """Computes the maximum string length in a column."""
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.alias = self.make_alias("max_length", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "MaxLength"
+
+ def get_aggregations(self) -> List[str]:
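+ # Rows excluded by WHERE are mapped to NULL, and SQL aggregates ignore
+ # NULLs, so the CASE expression acts as a per-row filter.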
+ if self.where:
+ sql = f"MAX(CASE WHEN {self.where} THEN LENGTH({self.column}) ELSE NULL END)"
+ else:
+ sql = f"MAX(LENGTH({self.column}))"
+ return [f"{sql} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class MinLengthOperator(ScanOperator):
+ """Computes the minimum string length in a column."""
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.alias = self.make_alias("min_length", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "MinLength"
+
+ def get_aggregations(self) -> List[str]:
+ if self.where:
+ sql = f"MIN(CASE WHEN {self.where} THEN LENGTH({self.column}) ELSE NULL END)"
+ else:
+ sql = f"MIN(LENGTH({self.column}))"
+ return [f"{sql} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class PatternMatchOperator(ScanOperator):
+ """Computes the fraction of values matching a regex pattern."""
+
+ def __init__(self, column: str, pattern: str, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.pattern = pattern.replace("'", "''") # Escape single quotes
+ self.count_alias = self.make_alias("count", column)
+ self.match_alias = self.make_alias("pattern_match", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "PatternMatch"
+
+ def get_aggregations(self) -> List[str]:
+ count_sql = self.wrap_count_with_where("1")
+ match_cond = f"REGEXP_MATCHES({self.column}, '{self.pattern}')"
+ match_sql = self.wrap_count_with_where(match_cond)
+ return [
+ f"{count_sql} AS {self.count_alias}",
+ f"{match_sql} AS {self.match_alias}",
+ ]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ total = self.safe_float(df, self.count_alias) or 0
+ matches = self.safe_float(df, self.match_alias) or 0
+ value = matches / total if total > 0 else 1.0
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class ComplianceOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin):
+ """
+ Computes the fraction of rows satisfying a SQL condition.
+
+ Unlike other scan operators, Compliance operates on a predicate
+ rather than a specific column.
+ """
+
+ def __init__(self, instance: str, predicate: str, where: Optional[str] = None):
+ self.instance_name = instance
+ self.predicate = predicate
+ self.where = where
+ self.count_alias = "compliance_count"
+ self.match_alias = self.make_alias("compliance_match", instance)
+
+ @property
+ def metric_name(self) -> str:
+ return "Compliance"
+
+ @property
+ def instance(self) -> str:
+ return self.instance_name
+
+ @property
+ def entity(self) -> str:
+ return "Dataset"
+
+ def get_aggregations(self) -> List[str]:
+ count_sql = self.wrap_count_with_where("1")
+ match_sql = self.wrap_count_with_where(f"({self.predicate})")
+ return [
+ f"{count_sql} AS {self.count_alias}",
+ f"{match_sql} AS {self.match_alias}",
+ ]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ total = self.safe_float(df, self.count_alias) or 0
+ matches = self.safe_float(df, self.match_alias) or 0
+ value = matches / total if total > 0 else 1.0
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class CorrelationOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin):
+ """Computes Pearson correlation between two columns."""
+
+ def __init__(self, column1: str, column2: str, where: Optional[str] = None):
+ self.column1 = column1
+ self.column2 = column2
+ self.where = where
+ self.alias = self.make_alias("corr", column1, column2)
+
+ @property
+ def metric_name(self) -> str:
+ return "Correlation"
+
+ @property
+ def instance(self) -> str:
+ return f"{self.column1},{self.column2}"
+
+ @property
+ def entity(self) -> str:
+ return "Multicolumn"
+
+ def get_aggregations(self) -> List[str]:
+ # Note: the WHERE predicate is not inlined here; a two-argument
+ # aggregate would need both inputs wrapped in CASE expressions.
+ # The engine is expected to apply any WHERE clause to the whole query.
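placeholder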
+ sql = f"CORR({self.column1}, {self.column2})"
+ return [f"{sql} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class CountDistinctOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin):
+ """Computes the count of distinct values in column(s)."""
+
+ def __init__(self, columns: List[str], where: Optional[str] = None):
+ self.columns = columns
+ self.where = where
+ self.alias = self.make_alias("count_distinct", *columns)
+
+ @property
+ def metric_name(self) -> str:
+ return "CountDistinct"
+
+ @property
+ def instance(self) -> str:
+ return ",".join(self.columns)
+
+ @property
+ def entity(self) -> str:
+ return "Multicolumn" if len(self.columns) > 1 else "Column"
+
+ def get_aggregations(self) -> List[str]:
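+ # Multiple columns are combined into a parenthesized row value,
+ # producing e.g. COUNT(DISTINCT (a, b)) for a two-column count.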
+ cols_str = ", ".join(self.columns)
+ sql = f"COUNT(DISTINCT ({cols_str}))"
+ return [f"{sql} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class ApproxCountDistinctOperator(ScanOperator):
+ """Computes approximate count distinct using HyperLogLog."""
+
+ def __init__(self, column: str, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.alias = self.make_alias("approx_count_distinct", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "ApproxCountDistinct"
+
+ def get_aggregations(self) -> List[str]:
+ agg = self.wrap_agg_with_where("APPROX_COUNT_DISTINCT", self.column)
+ return [f"{agg} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+class ApproxQuantileOperator(ScanOperator):
+ """Computes approximate quantile using QUANTILE_CONT."""
+
+ def __init__(self, column: str, quantile: float = 0.5, where: Optional[str] = None):
+ super().__init__(column, where)
+ self.quantile = quantile
+ self.alias = self.make_alias("approx_quantile", column)
+
+ @property
+ def metric_name(self) -> str:
+ return "ApproxQuantile"
+
+ def get_aggregations(self) -> List[str]:
+ if self.where:
+ agg = f"QUANTILE_CONT(CASE WHEN {self.where} THEN {self.column} ELSE NULL END, {self.quantile})"
+ else:
+ agg = f"QUANTILE_CONT({self.column}, {self.quantile})"
+ return [f"{agg} AS {self.alias}"]
+
+ def extract_result(self, df: "pd.DataFrame") -> MetricResult:
+ value = self.safe_float(df, self.alias)
+ return MetricResult(
+ name=self.metric_name,
+ instance=self.instance,
+ entity=self.entity,
+ value=value,
+ )
+
+
+__all__ = [
+ "SizeOperator",
+ "CompletenessOperator",
+ "MeanOperator",
+ "SumOperator",
+ "MinimumOperator",
+ "MaximumOperator",
+ "StandardDeviationOperator",
+ "MaxLengthOperator",
+ "MinLengthOperator",
+ "PatternMatchOperator",
+ "ComplianceOperator",
+ "CorrelationOperator",
+ "CountDistinctOperator",
+ "ApproxCountDistinctOperator",
+ "ApproxQuantileOperator",
+]
diff --git a/pydeequ/engines/spark.py b/pydeequ/engines/spark.py
new file mode 100644
index 0000000..2072bac
--- /dev/null
+++ b/pydeequ/engines/spark.py
@@ -0,0 +1,264 @@
+# -*- coding: utf-8 -*-
+"""
+Spark execution engine for PyDeequ.
+
+This module provides a Spark-based execution engine that wraps the existing
+v2 Spark Connect API, providing a unified engine interface.
+
+Example usage:
+ from pyspark.sql import SparkSession
+ from pydeequ.engines.spark import SparkEngine
+ from pydeequ.v2.analyzers import Size, Completeness
+
+ spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
+ df = spark.createDataFrame([(1, 2), (3, 4)], ["a", "b"])
+
+ engine = SparkEngine(spark, dataframe=df)
+ metrics = engine.compute_metrics([Size(), Completeness("a")])
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence
+
+from pydeequ.engines import (
+ BaseEngine,
+ ColumnProfile,
+ ConstraintResult,
+ ConstraintSuggestion,
+ MetricResult,
+)
+
+if TYPE_CHECKING:
+ from pyspark.sql import DataFrame, SparkSession
+ from pydeequ.v2.analyzers import _ConnectAnalyzer
+ from pydeequ.v2.checks import Check
+
+
+class SparkEngine(BaseEngine):
+ """
+ Spark-based execution engine.
+
+ This engine wraps the existing v2 Spark Connect API to provide
+ a unified engine interface. It delegates execution to the
+ Deequ plugin running on the Spark cluster.
+
+ Attributes:
+ spark: SparkSession
+ table: Optional table name
+ dataframe: Optional DataFrame to analyze
+ """
+
+ def __init__(
+ self,
+ spark: "SparkSession",
+ table: Optional[str] = None,
+ dataframe: Optional["DataFrame"] = None,
+ ):
+ """
+ Create a new SparkEngine.
+
+ Args:
+ spark: SparkSession (Spark Connect)
+ table: Optional table name to analyze
+ dataframe: Optional DataFrame to analyze (preferred over table)
+ """
+ self.spark = spark
+ self.table = table
+ self._dataframe = dataframe
+
+ def _get_dataframe(self) -> "DataFrame":
+ """Get the DataFrame to analyze."""
+ if self._dataframe is not None:
+ return self._dataframe
+ if self.table:
+ return self.spark.table(self.table)
+ raise ValueError("Either dataframe or table must be provided")
+
+ def get_schema(self) -> Dict[str, str]:
+ """Get the schema of the data source."""
+ df = self._get_dataframe()
+ return {field.name: str(field.dataType) for field in df.schema.fields}
+
+ def compute_metrics(
+ self, analyzers: Sequence["_ConnectAnalyzer"]
+ ) -> List[MetricResult]:
+ """
+ Compute metrics using the Spark Connect Deequ plugin.
+
+ Args:
+ analyzers: Sequence of analyzers to compute metrics for
+
+ Returns:
+ List of MetricResult objects
+ """
+ from pydeequ.v2.verification import AnalysisRunner
+
+ df = self._get_dataframe()
+
+ # Build and run the analysis
+ runner = AnalysisRunner(self.spark).onData(df)
+ for analyzer in analyzers:
+ runner = runner.addAnalyzer(analyzer)
+
+ result_df = runner.run()
+
+ # Convert Spark DataFrame result to MetricResult objects
+ results: List[MetricResult] = []
+ for row in result_df.collect():
+ results.append(MetricResult(
+ name=row["name"],
+ instance=row["instance"],
+ entity=row["entity"],
+ value=float(row["value"]) if row["value"] is not None else None,
+ ))
+
+ return results
+
+ def run_checks(self, checks: Sequence["Check"]) -> List[ConstraintResult]:
+ """
+ Run verification checks using the Spark Connect Deequ plugin.
+
+ Args:
+ checks: Sequence of Check objects to evaluate
+
+ Returns:
+ List of ConstraintResult objects
+ """
+ from pydeequ.v2.verification import VerificationSuite
+
+ df = self._get_dataframe()
+
+ # Build and run the verification
+ suite = VerificationSuite(self.spark).onData(df)
+ for check in checks:
+ suite = suite.addCheck(check)
+
+ result_df = suite.run()
+
+ # Convert Spark DataFrame result to ConstraintResult objects
+ results: List[ConstraintResult] = []
+ for row in result_df.collect():
+ results.append(ConstraintResult(
+ check_description=row["check"],
+ check_level=row["check_level"],
+ check_status=row["check_status"],
+ constraint=row["constraint"],
+ constraint_status=row["constraint_status"],
+ constraint_message=row["constraint_message"],
+ ))
+
+ return results
+
+ def profile_columns(
+ self,
+ columns: Optional[Sequence[str]] = None,
+ low_cardinality_threshold: int = 0,
+ ) -> List[ColumnProfile]:
+ """
+ Profile columns using the Spark Connect Deequ plugin.
+
+ Args:
+ columns: Optional list of columns to profile
+ low_cardinality_threshold: Threshold for histogram computation
+
+ Returns:
+ List of ColumnProfile objects
+ """
+ from pydeequ.v2.profiles import ColumnProfilerRunner
+
+ df = self._get_dataframe()
+
+ # Build and run the profiler
+ runner = ColumnProfilerRunner(self.spark).onData(df)
+
+ if columns:
+ runner = runner.restrictToColumns(columns)
+
+ if low_cardinality_threshold > 0:
+ runner = runner.withLowCardinalityHistogramThreshold(low_cardinality_threshold)
+
+ result_df = runner.run()
+
+ # Convert Spark DataFrame result to ColumnProfile objects
+ profiles: List[ColumnProfile] = []
+ for row in result_df.collect():
+ profiles.append(ColumnProfile(
+ column=row["column"],
+ completeness=float(row["completeness"]) if row["completeness"] is not None else 0.0,
+ approx_distinct_values=int(row["approx_distinct_values"]) if row["approx_distinct_values"] is not None else 0,
+ data_type=row["data_type"] if row["data_type"] else "Unknown",
+ is_data_type_inferred=bool(row["is_data_type_inferred"]) if "is_data_type_inferred" in row else True,
+ type_counts=row["type_counts"] if "type_counts" in row else None,
+ histogram=row["histogram"] if "histogram" in row else None,
+ mean=float(row["mean"]) if "mean" in row and row["mean"] is not None else None,
+ minimum=float(row["minimum"]) if "minimum" in row and row["minimum"] is not None else None,
+ maximum=float(row["maximum"]) if "maximum" in row and row["maximum"] is not None else None,
+ sum=float(row["sum"]) if "sum" in row and row["sum"] is not None else None,
+ std_dev=float(row["std_dev"]) if "std_dev" in row and row["std_dev"] is not None else None,
+ ))
+
+ return profiles
+
+ def suggest_constraints(
+ self,
+ columns: Optional[Sequence[str]] = None,
+ rules: Optional[Sequence[str]] = None,
+ ) -> List[ConstraintSuggestion]:
+ """
+ Suggest constraints using the Spark Connect Deequ plugin.
+
+ Args:
+ columns: Optional list of columns to analyze
+ rules: Optional list of rule sets to apply
+
+ Returns:
+ List of ConstraintSuggestion objects
+ """
+ from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules
+
+ df = self._get_dataframe()
+
+ # Build and run the suggestion runner
+ runner = ConstraintSuggestionRunner(self.spark).onData(df)
+
+ if columns:
+ runner = runner.restrictToColumns(columns)
+
+ # Map rule strings to Rules enum (accept both strings and enum values)
+ if rules:
+ rule_map = {
+ "DEFAULT": Rules.DEFAULT,
+ "STRING": Rules.STRING,
+ "NUMERICAL": Rules.NUMERICAL,
+ "COMMON": Rules.COMMON,
+ "EXTENDED": Rules.EXTENDED,
+ }
+ for rule in rules:
+ # Accept both Rules enum and string values
+ if isinstance(rule, Rules):
+ runner = runner.addConstraintRules(rule)
+ elif rule in rule_map:
+ runner = runner.addConstraintRules(rule_map[rule])
+ else:
+ runner = runner.addConstraintRules(Rules.DEFAULT)
+
+ result_df = runner.run()
+
+ # Convert Spark DataFrame result to ConstraintSuggestion objects
+ suggestions: List[ConstraintSuggestion] = []
+ for row in result_df.collect():
+ suggestions.append(ConstraintSuggestion(
+ column_name=row["column_name"],
+ constraint_name=row["constraint_name"],
+ current_value=row["current_value"] if "current_value" in row else None,
+ description=row["description"],
+ suggesting_rule=row["suggesting_rule"],
+ code_for_constraint=row["code_for_constraint"],
+ ))
+
+ return suggestions
diff --git a/pydeequ/engines/suggestions/__init__.py b/pydeequ/engines/suggestions/__init__.py
new file mode 100644
index 0000000..6303692
--- /dev/null
+++ b/pydeequ/engines/suggestions/__init__.py
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+"""
+Constraint suggestion module.
+
+This module provides a modular, rule-based system for suggesting data quality
+constraints based on column profiles.
+
+Architecture:
+ rules.py - SuggestionRule base class and 10 rule implementations
+ registry.py - RuleRegistry for organizing rules by rule set
+ runner.py - SuggestionRunner for orchestrating rule execution
+
+Available Rule Sets:
+ - DEFAULT: Basic rules (completeness, non-negative, categorical)
+ - NUMERICAL: Rules for numeric columns (min, max, mean)
+ - STRING: Rules for string columns (min/max length)
+ - COMMON: General rules (uniqueness)
+ - EXTENDED: All rules combined
+
+Example usage:
+ from pydeequ.engines.suggestions import SuggestionRunner
+
+ # Run default rules
+ runner = SuggestionRunner(rule_sets=["DEFAULT"])
+ suggestions = runner.run(profiles, execute_fn=engine._execute_query, table="my_table")
+
+ # Run multiple rule sets
+ runner = SuggestionRunner(rule_sets=["DEFAULT", "NUMERICAL", "STRING"])
+ suggestions = runner.run(profiles, execute_fn=engine._execute_query, table="my_table")
+"""
+
+from pydeequ.engines.suggestions.registry import RuleRegistry
+from pydeequ.engines.suggestions.rules import (
+ SuggestionRule,
+ CompleteIfCompleteRule,
+ RetainCompletenessRule,
+ NonNegativeNumbersRule,
+ CategoricalRangeRule,
+ HasMinRule,
+ HasMaxRule,
+ HasMeanRule,
+ HasMinLengthRule,
+ HasMaxLengthRule,
+ UniqueIfApproximatelyUniqueRule,
+)
+from pydeequ.engines.suggestions.runner import SuggestionRunner
+
+
+__all__ = [
+ # Registry
+ "RuleRegistry",
+ # Runner
+ "SuggestionRunner",
+ # Base class
+ "SuggestionRule",
+ # Rules
+ "CompleteIfCompleteRule",
+ "RetainCompletenessRule",
+ "NonNegativeNumbersRule",
+ "CategoricalRangeRule",
+ "HasMinRule",
+ "HasMaxRule",
+ "HasMeanRule",
+ "HasMinLengthRule",
+ "HasMaxLengthRule",
+ "UniqueIfApproximatelyUniqueRule",
+]
diff --git a/pydeequ/engines/suggestions/registry.py b/pydeequ/engines/suggestions/registry.py
new file mode 100644
index 0000000..f762845
--- /dev/null
+++ b/pydeequ/engines/suggestions/registry.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+"""
+Suggestion rule registry.
+
+This module provides a registry for suggestion rules, allowing rules to be
+organized by rule sets (DEFAULT, NUMERICAL, STRING, COMMON, EXTENDED).
+"""
+
+from __future__ import annotations
+
+from typing import List
+
+from pydeequ.engines.suggestions.rules import (
+ SuggestionRule,
+ CompleteIfCompleteRule,
+ RetainCompletenessRule,
+ NonNegativeNumbersRule,
+ CategoricalRangeRule,
+ HasMinRule,
+ HasMaxRule,
+ HasMeanRule,
+ HasMinLengthRule,
+ HasMaxLengthRule,
+ UniqueIfApproximatelyUniqueRule,
+)
+
+
+class RuleRegistry:
+ """
+ Registry of suggestion rules by rule set.
+
+ Provides centralized management of suggestion rules and retrieval
+ by rule set names.
+ """
+
+ _rules: List[SuggestionRule] = []
+
+ @classmethod
+ def register(cls, rule: SuggestionRule) -> None:
+ """
+ Register a suggestion rule.
+
+ Args:
+ rule: SuggestionRule instance to register
+ """
+ cls._rules.append(rule)
+
+ @classmethod
+ def get_rules_for_sets(cls, rule_sets: List[str]) -> List[SuggestionRule]:
+ """
+ Get all rules that belong to any of the specified rule sets.
+
+ Args:
+ rule_sets: List of rule set names (e.g., ["DEFAULT", "NUMERICAL"])
+
+ Returns:
+ List of rules that belong to any of the specified sets
+ """
+ return [r for r in cls._rules if any(s in r.rule_sets for s in rule_sets)]
+
+ @classmethod
+ def get_all_rules(cls) -> List[SuggestionRule]:
+ """
+ Get all registered rules.
+
+ Returns:
+ List of all registered rules
+ """
+ return cls._rules.copy()
+
+ @classmethod
+ def clear(cls) -> None:
+ """Clear all registered rules (mainly for testing)."""
+ cls._rules = []
+
+
+# Auto-register all default rules
+def _register_default_rules() -> None:
+ """Register all built-in suggestion rules."""
+ RuleRegistry.register(CompleteIfCompleteRule())
+ RuleRegistry.register(RetainCompletenessRule())
+ RuleRegistry.register(NonNegativeNumbersRule())
+ RuleRegistry.register(CategoricalRangeRule())
+ RuleRegistry.register(HasMinRule())
+ RuleRegistry.register(HasMaxRule())
+ RuleRegistry.register(HasMeanRule())
+ RuleRegistry.register(HasMinLengthRule())
+ RuleRegistry.register(HasMaxLengthRule())
+ RuleRegistry.register(UniqueIfApproximatelyUniqueRule())
+
+
+# Register rules on module load
+_register_default_rules()
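+
+# Extension sketch (hypothetical rule name): third-party rules can be
+# registered the same way and are then returned alongside the built-ins:
+#
+#     RuleRegistry.register(NoFutureDatesRule())
+#     rules = RuleRegistry.get_rules_for_sets(["DEFAULT", "EXTENDED"])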
+
+
+__all__ = [
+ "RuleRegistry",
+]
diff --git a/pydeequ/engines/suggestions/rules.py b/pydeequ/engines/suggestions/rules.py
new file mode 100644
index 0000000..40a7ebf
--- /dev/null
+++ b/pydeequ/engines/suggestions/rules.py
@@ -0,0 +1,380 @@
+# -*- coding: utf-8 -*-
+"""
+Suggestion rule implementations.
+
+This module provides the base class and implementations for constraint
+suggestion rules. Each rule analyzes column profiles and generates
+appropriate constraint suggestions.
+"""
+
+from __future__ import annotations
+
+import json
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, List, Optional, Set
+
+if TYPE_CHECKING:
+ from pydeequ.engines import ColumnProfile, ConstraintSuggestion
+
+
+# SQL types that are considered string
+STRING_TYPES: Set[str] = {"VARCHAR", "CHAR", "BPCHAR", "TEXT", "STRING"}
+
+
+class SuggestionRule(ABC):
+ """
+ Base class for constraint suggestion rules.
+
+ Each rule examines column profiles and generates appropriate
+ constraint suggestions based on data characteristics.
+ """
+
+ @property
+ @abstractmethod
+ def name(self) -> str:
+ """Rule name for identification."""
+ pass
+
+ @property
+ @abstractmethod
+ def rule_sets(self) -> List[str]:
+ """Which rule sets this rule belongs to (DEFAULT, NUMERICAL, etc)."""
+ pass
+
+ @abstractmethod
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ """Whether this rule applies to the given column profile."""
+ pass
+
+ @abstractmethod
+ def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+ """Generate a suggestion if applicable, or None."""
+ pass
+
+
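+# Minimal custom-rule sketch (illustrative, not shipped with this module):
+#
+#     class NonEmptyStringsRule(SuggestionRule):
+#         @property
+#         def name(self) -> str: return "NonEmptyStrings"
+#         @property
+#         def rule_sets(self) -> List[str]: return ["STRING", "EXTENDED"]
+#         def applies_to(self, profile): return profile.data_type in STRING_TYPES
+#         def generate(self, profile): ...  # build a ConstraintSuggestion here
+
+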
+class CompleteIfCompleteRule(SuggestionRule):
+ """Suggests isComplete() constraint for fully complete columns."""
+
+ @property
+ def name(self) -> str:
+ return "CompleteIfComplete"
+
+ @property
+ def rule_sets(self) -> List[str]:
+ return ["DEFAULT", "EXTENDED"]
+
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ return profile.completeness == 1.0
+
+ def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+ from pydeequ.engines import ConstraintSuggestion
+
+ return ConstraintSuggestion(
+ column_name=profile.column,
+ constraint_name="Completeness",
+ current_value="1.0",
+ description=f"'{profile.column}' is complete",
+ suggesting_rule=self.name,
+ code_for_constraint=f'.isComplete("{profile.column}")',
+ )
+
+
+class RetainCompletenessRule(SuggestionRule):
+ """Suggests hasCompleteness() constraint for highly complete columns."""
+
+ THRESHOLD = 0.9 # Minimum completeness to suggest retaining
+
+ @property
+ def name(self) -> str:
+ return "RetainCompleteness"
+
+ @property
+ def rule_sets(self) -> List[str]:
+ return ["DEFAULT", "EXTENDED"]
+
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ # Apply only if not fully complete but >= threshold
+ return self.THRESHOLD <= profile.completeness < 1.0
+
+ def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+ from pydeequ.engines import ConstraintSuggestion
+
+ return ConstraintSuggestion(
+ column_name=profile.column,
+ constraint_name="Completeness",
+ current_value=f"{profile.completeness:.4f}",
+ description=f"'{profile.column}' has completeness {profile.completeness:.2%}",
+ suggesting_rule=self.name,
+ code_for_constraint=f'.hasCompleteness("{profile.column}", gte({profile.completeness:.2f}))',
+ )
+
+
+class NonNegativeNumbersRule(SuggestionRule):
+ """Suggests isNonNegative() constraint for columns with no negative values."""
+
+ @property
+ def name(self) -> str:
+ return "NonNegativeNumbers"
+
+ @property
+ def rule_sets(self) -> List[str]:
+ return ["DEFAULT", "EXTENDED"]
+
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ return profile.minimum is not None and profile.minimum >= 0
+
+ def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+ from pydeequ.engines import ConstraintSuggestion
+
+ return ConstraintSuggestion(
+ column_name=profile.column,
+ constraint_name="NonNegative",
+ current_value=f"{profile.minimum:.2f}",
+ description=f"'{profile.column}' has no negative values",
+ suggesting_rule=self.name,
+ code_for_constraint=f'.isNonNegative("{profile.column}")',
+ )
+
+
+class CategoricalRangeRule(SuggestionRule):
+ """Suggests isContainedIn() constraint for low cardinality categorical columns."""
+
+ MAX_CATEGORIES = 10 # Maximum distinct values to suggest containment
+
+ @property
+ def name(self) -> str:
+ return "CategoricalRangeRule"
+
+ @property
+ def rule_sets(self) -> List[str]:
+ return ["DEFAULT", "EXTENDED"]
+
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ if not profile.histogram:
+ return False
+ hist = json.loads(profile.histogram)
+ return len(hist) <= self.MAX_CATEGORIES
+
+ def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+ from pydeequ.engines import ConstraintSuggestion
+
+ hist = json.loads(profile.histogram)
+ values = list(hist.keys())
+ values_str = ", ".join([f'"{v}"' for v in values])
+
+ return ConstraintSuggestion(
+ column_name=profile.column,
+ constraint_name="Compliance",
+ current_value=f"{len(values)} distinct values",
+ description=f"'{profile.column}' has categorical values",
+ suggesting_rule=self.name,
+ code_for_constraint=f'.isContainedIn("{profile.column}", [{values_str}])',
+ )
+
+
+class HasMinRule(SuggestionRule):
+ """Suggests hasMin() constraint for numeric columns."""
+
+ @property
+ def name(self) -> str:
+ return "HasMin"
+
+ @property
+ def rule_sets(self) -> List[str]:
+ return ["NUMERICAL", "EXTENDED"]
+
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ return profile.minimum is not None and profile.mean is not None
+
+ def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+ from pydeequ.engines import ConstraintSuggestion
+
+ return ConstraintSuggestion(
+ column_name=profile.column,
+ constraint_name="Minimum",
+ current_value=f"{profile.minimum:.2f}",
+ description=f"'{profile.column}' has minimum {profile.minimum:.2f}",
+ suggesting_rule=self.name,
+ code_for_constraint=f'.hasMin("{profile.column}", gte({profile.minimum:.2f}))',
+ )
+
+
+class HasMaxRule(SuggestionRule):
+ """Suggests hasMax() constraint for numeric columns."""
+
+ @property
+ def name(self) -> str:
+ return "HasMax"
+
+ @property
+ def rule_sets(self) -> List[str]:
+ return ["NUMERICAL", "EXTENDED"]
+
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ return profile.maximum is not None and profile.mean is not None
+
+ def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+ from pydeequ.engines import ConstraintSuggestion
+
+ return ConstraintSuggestion(
+ column_name=profile.column,
+ constraint_name="Maximum",
+ current_value=f"{profile.maximum:.2f}",
+ description=f"'{profile.column}' has maximum {profile.maximum:.2f}",
+ suggesting_rule=self.name,
+ code_for_constraint=f'.hasMax("{profile.column}", lte({profile.maximum:.2f}))',
+ )
+
+
+class HasMeanRule(SuggestionRule):
+ """Suggests hasMean() constraint for numeric columns."""
+
+ @property
+ def name(self) -> str:
+ return "HasMean"
+
+ @property
+ def rule_sets(self) -> List[str]:
+ return ["NUMERICAL", "EXTENDED"]
+
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ return profile.mean is not None
+
+ def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+ from pydeequ.engines import ConstraintSuggestion
+
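+ # Heuristic: suggest a +/-10% tolerance band around the observed mean.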
+ lower = profile.mean * 0.9
+ upper = profile.mean * 1.1
+
+ return ConstraintSuggestion(
+ column_name=profile.column,
+ constraint_name="Mean",
+ current_value=f"{profile.mean:.2f}",
+ description=f"'{profile.column}' has mean {profile.mean:.2f}",
+ suggesting_rule=self.name,
+ code_for_constraint=f'.hasMean("{profile.column}", between({lower:.2f}, {upper:.2f}))',
+ )
+
+
+class HasMinLengthRule(SuggestionRule):
+ """Suggests hasMinLength() constraint for string columns."""
+
+ @property
+ def name(self) -> str:
+ return "HasMinLength"
+
+ @property
+ def rule_sets(self) -> List[str]:
+ return ["STRING", "EXTENDED"]
+
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ return profile.data_type in STRING_TYPES
+
+ def generate(
+ self,
+ profile: "ColumnProfile",
+ min_length: Optional[int] = None,
+ ) -> Optional["ConstraintSuggestion"]:
+ from pydeequ.engines import ConstraintSuggestion
+
+ if min_length is None or min_length <= 0:
+ return None
+
+ return ConstraintSuggestion(
+ column_name=profile.column,
+ constraint_name="MinLength",
+ current_value=str(min_length),
+ description=f"'{profile.column}' has minimum length {min_length}",
+ suggesting_rule=self.name,
+ code_for_constraint=f'.hasMinLength("{profile.column}", gte({min_length}))',
+ )
+
+
+class HasMaxLengthRule(SuggestionRule):
+ """Suggests hasMaxLength() constraint for string columns."""
+
+ @property
+ def name(self) -> str:
+ return "HasMaxLength"
+
+ @property
+ def rule_sets(self) -> List[str]:
+ return ["STRING", "EXTENDED"]
+
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ return profile.data_type in STRING_TYPES
+
+ def generate(
+ self,
+ profile: "ColumnProfile",
+ max_length: Optional[int] = None,
+ ) -> Optional["ConstraintSuggestion"]:
+ from pydeequ.engines import ConstraintSuggestion
+
+ if max_length is None or max_length <= 0:
+ return None
+
+ return ConstraintSuggestion(
+ column_name=profile.column,
+ constraint_name="MaxLength",
+ current_value=str(max_length),
+ description=f"'{profile.column}' has maximum length {max_length}",
+ suggesting_rule=self.name,
+ code_for_constraint=f'.hasMaxLength("{profile.column}", lte({max_length}))',
+ )
+
+
+class UniqueIfApproximatelyUniqueRule(SuggestionRule):
+ """Suggests isUnique() constraint for approximately unique columns."""
+
+ UNIQUENESS_THRESHOLD = 0.99 # Minimum distinct ratio to consider unique
+
+ @property
+ def name(self) -> str:
+ return "UniqueIfApproximatelyUnique"
+
+ @property
+ def rule_sets(self) -> List[str]:
+ return ["COMMON", "EXTENDED"]
+
+ def applies_to(self, profile: "ColumnProfile") -> bool:
+ # Need total row count to determine uniqueness
+ return True # Check is done in generate with row_count
+
+ def generate(
+ self,
+ profile: "ColumnProfile",
+ row_count: Optional[int] = None,
+ ) -> Optional["ConstraintSuggestion"]:
+ from pydeequ.engines import ConstraintSuggestion
+
+ if row_count is None or row_count <= 0:
+ return None
+
+ if profile.approx_distinct_values >= row_count * self.UNIQUENESS_THRESHOLD:
+ return ConstraintSuggestion(
+ column_name=profile.column,
+ constraint_name="Uniqueness",
+ current_value="~1.0",
+ description=f"'{profile.column}' appears to be unique",
+ suggesting_rule=self.name,
+ code_for_constraint=f'.isUnique("{profile.column}")',
+ )
+ return None
+
+
+# Export all rule classes
+__all__ = [
+ "SuggestionRule",
+ "CompleteIfCompleteRule",
+ "RetainCompletenessRule",
+ "NonNegativeNumbersRule",
+ "CategoricalRangeRule",
+ "HasMinRule",
+ "HasMaxRule",
+ "HasMeanRule",
+ "HasMinLengthRule",
+ "HasMaxLengthRule",
+ "UniqueIfApproximatelyUniqueRule",
+ "STRING_TYPES",
+]
diff --git a/pydeequ/engines/suggestions/runner.py b/pydeequ/engines/suggestions/runner.py
new file mode 100644
index 0000000..f9b1479
--- /dev/null
+++ b/pydeequ/engines/suggestions/runner.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+"""
+Suggestion runner for executing rules against column profiles.
+
+This module provides the SuggestionRunner class that orchestrates
+running suggestion rules against column profiles.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable, List, Optional
+
+from pydeequ.engines.suggestions.registry import RuleRegistry
+from pydeequ.engines.suggestions.rules import (
+ HasMinLengthRule,
+ HasMaxLengthRule,
+ UniqueIfApproximatelyUniqueRule,
+)
+
+if TYPE_CHECKING:
+ import pandas as pd
+ from pydeequ.engines import ColumnProfile, ConstraintSuggestion
+
+
+class SuggestionRunner:
+ """
+ Runs suggestion rules against column profiles.
+
+ The runner retrieves rules from the registry based on the specified
+ rule sets and executes them against each column profile.
+
+ Attributes:
+ rule_sets: List of rule set names to apply
+ """
+
+ def __init__(self, rule_sets: Optional[List[str]] = None):
+ """
+ Initialize SuggestionRunner.
+
+ Args:
+ rule_sets: List of rule set names (e.g., ["DEFAULT", "NUMERICAL"]).
+ If None, defaults to ["DEFAULT"].
+ """
+ self.rule_sets = rule_sets or ["DEFAULT"]
+
+ def run(
+ self,
+ profiles: List["ColumnProfile"],
+ execute_fn: Optional[Callable[[str], "pd.DataFrame"]] = None,
+ table: Optional[str] = None,
+ row_count: Optional[int] = None,
+ ) -> List["ConstraintSuggestion"]:
+ """
+ Run suggestion rules against column profiles.
+
+ Args:
+ profiles: List of column profiles to analyze
+ execute_fn: Optional function to execute SQL queries (for rules
+ that need additional data like string lengths)
+ table: Optional table name for queries
+ row_count: Optional total row count for uniqueness checks
+
+ Returns:
+ List of constraint suggestions
+ """
+ rules = RuleRegistry.get_rules_for_sets(self.rule_sets)
+ suggestions: List["ConstraintSuggestion"] = []
+
+ for profile in profiles:
+ for rule in rules:
+ suggestion = self._apply_rule(
+ rule, profile, execute_fn, table, row_count
+ )
+ if suggestion:
+ suggestions.append(suggestion)
+
+ return suggestions
+
+ def _apply_rule(
+ self,
+ rule,
+ profile: "ColumnProfile",
+ execute_fn: Optional[Callable[[str], "pd.DataFrame"]],
+ table: Optional[str],
+ row_count: Optional[int],
+ ) -> Optional["ConstraintSuggestion"]:
+ """
+ Apply a single rule to a profile.
+
+ Some rules require special handling (e.g., string length rules need
+ to query the database, uniqueness rules need row count).
+
+ Args:
+ rule: The rule to apply
+ profile: Column profile to analyze
+ execute_fn: Optional SQL execution function
+ table: Optional table name
+ row_count: Optional row count
+
+ Returns:
+ Constraint suggestion or None
+ """
+ # Handle HasMinLengthRule - needs string length from query
+ if isinstance(rule, HasMinLengthRule):
+ return self._handle_string_length_rule(
+ rule, profile, execute_fn, table, is_min=True
+ )
+
+ # Handle HasMaxLengthRule - needs string length from query
+ if isinstance(rule, HasMaxLengthRule):
+ return self._handle_string_length_rule(
+ rule, profile, execute_fn, table, is_min=False
+ )
+
+ # Handle UniqueIfApproximatelyUniqueRule - needs row count
+ if isinstance(rule, UniqueIfApproximatelyUniqueRule):
+ if rule.applies_to(profile):
+ return rule.generate(profile, row_count=row_count)
+ return None
+
+ # Standard rule handling
+ if rule.applies_to(profile):
+ return rule.generate(profile)
+ return None
+
+ def _handle_string_length_rule(
+ self,
+ rule,
+ profile: "ColumnProfile",
+ execute_fn: Optional[Callable[[str], "pd.DataFrame"]],
+ table: Optional[str],
+ is_min: bool,
+ ) -> Optional["ConstraintSuggestion"]:
+ """
+ Handle string length rules that need database queries.
+
+ Args:
+ rule: HasMinLengthRule or HasMaxLengthRule
+ profile: Column profile
+ execute_fn: SQL execution function
+ table: Table name
+ is_min: True for min length, False for max length
+
+ Returns:
+ Constraint suggestion or None
+ """
+ import pandas as pd
+
+ if not rule.applies_to(profile):
+ return None
+
+ if execute_fn is None or table is None:
+ return None
+
+ col = profile.column
+ agg_func = "MIN" if is_min else "MAX"
+ query = f"SELECT {agg_func}(LENGTH({col})) as len FROM {table} WHERE {col} IS NOT NULL"
+
+ try:
+ result = execute_fn(query)
+ length = result["len"].iloc[0]
+ if length is not None and not pd.isna(length):
+ length = int(length)
+ if length > 0:
+ if is_min:
+ return rule.generate(profile, min_length=length)
+ else:
+ return rule.generate(profile, max_length=length)
+ except Exception:
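+ # A failed length lookup yields no suggestion rather than aborting
+ # the whole suggestion run.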
+ pass
+
+ return None
+
+
+__all__ = [
+ "SuggestionRunner",
+]
diff --git a/pydeequ/v2/analyzers.py b/pydeequ/v2/analyzers.py
index 53a979c..b796f12 100644
--- a/pydeequ/v2/analyzers.py
+++ b/pydeequ/v2/analyzers.py
@@ -25,7 +25,7 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
-from typing import List, Optional, Sequence, Union
+from typing import Optional, Sequence, Union
from pydeequ.v2.proto import deequ_connect_pb2 as proto
diff --git a/pydeequ/v2/checks.py b/pydeequ/v2/checks.py
index 2a86ba8..c788f6c 100644
--- a/pydeequ/v2/checks.py
+++ b/pydeequ/v2/checks.py
@@ -19,7 +19,7 @@
from __future__ import annotations
from enum import Enum
-from typing import List, Optional, Sequence, Union
+from typing import List, Optional, Sequence
from pydeequ.v2.predicates import Predicate, is_one
from pydeequ.v2.proto import deequ_connect_pb2 as proto
diff --git a/pydeequ/v2/predicates.py b/pydeequ/v2/predicates.py
index adaf23d..3f2c06c 100644
--- a/pydeequ/v2/predicates.py
+++ b/pydeequ/v2/predicates.py
@@ -40,6 +40,26 @@ def to_proto(self) -> proto.PredicateMessage:
def __repr__(self) -> str:
raise NotImplementedError
+ @abstractmethod
+ def to_callable(self):
+ """
+ Convert predicate to a callable function.
+
+ Returns:
+ A callable that takes a value and returns True/False
+
+ Example:
+ pred = gte(0.95)
+ func = pred.to_callable()
+ assert func(0.96) == True
+ assert func(0.90) == False
+ """
+ raise NotImplementedError
+
+ def __call__(self, value: float) -> bool:
+ """Allow predicates to be called directly like functions."""
+ return self.to_callable()(value)
+
@dataclass
class Comparison(Predicate):
@@ -62,6 +82,26 @@ def __repr__(self) -> str:
}
return f"x {op_map.get(self.operator, '?')} {self.value}"
+ def to_callable(self):
+ """Convert to a callable function."""
+ op = self.operator
+ target = self.value
+
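+ # EQ/NE use a small absolute tolerance (1e-9) rather than exact
+ # floating-point equality.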
+ if op == proto.PredicateMessage.Operator.EQ:
+ return lambda x: abs(x - target) < 1e-9 if x is not None else False
+ elif op == proto.PredicateMessage.Operator.NE:
+ return lambda x: abs(x - target) >= 1e-9 if x is not None else False
+ elif op == proto.PredicateMessage.Operator.GT:
+ return lambda x: x > target if x is not None else False
+ elif op == proto.PredicateMessage.Operator.GE:
+ return lambda x: x >= target if x is not None else False
+ elif op == proto.PredicateMessage.Operator.LT:
+ return lambda x: x < target if x is not None else False
+ elif op == proto.PredicateMessage.Operator.LE:
+ return lambda x: x <= target if x is not None else False
+ else:
+ return lambda x: False
+
@dataclass
class Between(Predicate):
@@ -80,6 +120,12 @@ def to_proto(self) -> proto.PredicateMessage:
def __repr__(self) -> str:
return f"{self.lower} <= x <= {self.upper}"
+ def to_callable(self):
+ """Convert to a callable function."""
+ lower = self.lower
+ upper = self.upper
+ return lambda x: lower <= x <= upper if x is not None else False
+
# ============================================================================
# Factory Functions - Convenient way to create predicates
diff --git a/pydeequ/v2/profiles.py b/pydeequ/v2/profiles.py
index 97f71ef..1e2a373 100644
--- a/pydeequ/v2/profiles.py
+++ b/pydeequ/v2/profiles.py
@@ -1,12 +1,25 @@
# -*- coding: utf-8 -*-
"""
-Column Profiler for Deequ Spark Connect.
+Column Profiler for PyDeequ v2.
This module provides column profiling capabilities that analyze DataFrame columns
to compute statistics like completeness, data type distribution, and optional
KLL sketch-based quantile estimation.
-Example usage:
+Example usage with DuckDB:
+ import duckdb
+ import pydeequ
+ from pydeequ.v2.profiles import ColumnProfilerRunner
+
+ con = duckdb.connect()
+ con.execute("CREATE TABLE test AS SELECT 1 as id, 'foo' as name")
+ engine = pydeequ.connect(con, table="test")
+
+ profiles = (ColumnProfilerRunner()
+ .on_engine(engine)
+ .run())
+
+Example usage with Spark Connect:
from pyspark.sql import SparkSession
from pydeequ.v2.profiles import ColumnProfilerRunner, KLLParameters
@@ -30,8 +43,9 @@
from __future__ import annotations
from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, Optional, Sequence
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence
+import pandas as pd
from google.protobuf import any_pb2
from pydeequ.v2.proto import deequ_connect_pb2 as proto
@@ -39,6 +53,7 @@
if TYPE_CHECKING:
from pyspark.sql import DataFrame, SparkSession
+ from pydeequ.engines import BaseEngine
@dataclass
@@ -74,9 +89,15 @@ class ColumnProfilerRunner:
ColumnProfilerRunner analyzes DataFrame columns to compute statistics
including completeness, data type, distinct values, and optionally
- KLL sketches for numeric columns.
+ KLL sketches for numeric columns. Supports both engine-based and Spark-based execution.
- Example:
+ Example (Engine-based with DuckDB):
+ profiles = (ColumnProfilerRunner()
+ .on_engine(engine)
+ .restrictToColumns(["col1", "col2"])
+ .run())
+
+ Example (Spark Connect):
profiles = (ColumnProfilerRunner(spark)
.onData(df)
.restrictToColumns(["col1", "col2"])
@@ -84,27 +105,49 @@ class ColumnProfilerRunner:
.run())
"""
- def __init__(self, spark: "SparkSession"):
+ def __init__(self, spark: Optional["SparkSession"] = None):
"""
Create a new ColumnProfilerRunner.
Args:
- spark: SparkSession (can be either local or Spark Connect)
+ spark: Optional SparkSession for Spark Connect mode.
+ Not required for engine-based execution.
"""
self._spark = spark
def onData(self, df: "DataFrame") -> "ColumnProfilerRunBuilder":
"""
- Specify the DataFrame to profile.
+ Specify the DataFrame to profile (Spark mode).
Args:
df: DataFrame to profile
Returns:
ColumnProfilerRunBuilder for method chaining
+
+ Raises:
+ ValueError: If SparkSession was not provided in constructor
"""
+ if self._spark is None:
+ raise ValueError(
+ "SparkSession required for onData(). "
+ "Use ColumnProfilerRunner(spark).onData(df) or "
+ "ColumnProfilerRunner().on_engine(engine) for engine-based execution."
+ )
return ColumnProfilerRunBuilder(self._spark, df)
+ def on_engine(self, engine: "BaseEngine") -> "EngineColumnProfilerRunBuilder":
+ """
+ Specify the engine to run profiling on (Engine mode).
+
+ Args:
+ engine: BaseEngine instance (e.g., DuckDBEngine)
+
+ Returns:
+ EngineColumnProfilerRunBuilder for method chaining
+ """
+ return EngineColumnProfilerRunBuilder(engine)
+
class ColumnProfilerRunBuilder:
"""
@@ -274,9 +317,80 @@ def _run_via_spark_connect(
return dataframe_from_plan(plan, self._spark)
+class EngineColumnProfilerRunBuilder:
+ """
+ Builder for configuring and executing engine-based column profiling.
+
+ This class works with DuckDB and other SQL backends via the engine abstraction.
+ """
+
+ def __init__(self, engine: "BaseEngine"):
+ """
+ Create a new EngineColumnProfilerRunBuilder.
+
+ Args:
+ engine: BaseEngine instance (e.g., DuckDBEngine)
+ """
+ self._engine = engine
+ self._restrict_to_columns: Optional[Sequence[str]] = None
+ self._low_cardinality_threshold: int = 0
+
+ def restrictToColumns(self, columns: Sequence[str]) -> "EngineColumnProfilerRunBuilder":
+ """
+ Restrict profiling to specific columns.
+
+ Args:
+ columns: List of column names to profile
+
+ Returns:
+ self for method chaining
+ """
+ self._restrict_to_columns = columns
+ return self
+
+ def withLowCardinalityHistogramThreshold(
+ self, threshold: int
+ ) -> "EngineColumnProfilerRunBuilder":
+ """
+ Set threshold for computing histograms.
+
+ Columns with distinct values <= threshold will have histograms computed.
+
+ Args:
+ threshold: Maximum distinct values for histogram computation
+
+ Returns:
+ self for method chaining
+ """
+ self._low_cardinality_threshold = threshold
+ return self
+
+ def run(self) -> pd.DataFrame:
+ """
+ Execute the profiling and return results as a pandas DataFrame.
+
+ The result DataFrame contains columns:
+ - column: Column name
+ - completeness: Non-null ratio (0.0-1.0)
+ - approx_distinct_values: Approximate cardinality
+ - data_type: Detected/provided type
+ - mean, minimum, maximum, sum, std_dev: Numeric stats (null for non-numeric)
+ - histogram: JSON string of histogram (or null)
+
+ Returns:
+ pandas DataFrame with profiling results (one row per column)
+ """
+ profiles = self._engine.profile_columns(
+ columns=self._restrict_to_columns,
+ low_cardinality_threshold=self._low_cardinality_threshold,
+ )
+ return self._engine.profiles_to_dataframe(profiles)
+
+
# Export all public symbols
__all__ = [
"ColumnProfilerRunner",
"ColumnProfilerRunBuilder",
+ "EngineColumnProfilerRunBuilder",
"KLLParameters",
]
diff --git a/pydeequ/v2/suggestions.py b/pydeequ/v2/suggestions.py
index b89b07b..5d6c371 100644
--- a/pydeequ/v2/suggestions.py
+++ b/pydeequ/v2/suggestions.py
@@ -1,12 +1,26 @@
# -*- coding: utf-8 -*-
"""
-Constraint Suggestions for Deequ Spark Connect.
+Constraint Suggestions for PyDeequ v2.
This module provides automatic constraint suggestion capabilities that analyze
DataFrame columns and suggest appropriate data quality constraints based on
the data characteristics.
-Example usage:
+Example usage with DuckDB:
+ import duckdb
+ import pydeequ
+ from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules
+
+ con = duckdb.connect()
+ con.execute("CREATE TABLE test AS SELECT 1 as id, 'foo' as name")
+ engine = pydeequ.connect(con, table="test")
+
+ suggestions = (ConstraintSuggestionRunner()
+ .on_engine(engine)
+ .addConstraintRules(Rules.DEFAULT)
+ .run())
+
+Example usage with Spark Connect:
from pyspark.sql import SparkSession
from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules
@@ -33,6 +47,7 @@
from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, Sequence
+import pandas as pd
from google.protobuf import any_pb2
from pydeequ.v2.profiles import KLLParameters
@@ -41,6 +56,7 @@
if TYPE_CHECKING:
from pyspark.sql import DataFrame, SparkSession
+ from pydeequ.engines import BaseEngine
class Rules(Enum):
@@ -79,35 +95,64 @@ class ConstraintSuggestionRunner:
ConstraintSuggestionRunner analyzes DataFrame columns to suggest
appropriate data quality constraints based on the data characteristics.
+ Supports both engine-based and Spark-based execution.
- Example:
+ Example (Engine-based with DuckDB):
+ suggestions = (ConstraintSuggestionRunner()
+ .on_engine(engine)
+ .addConstraintRules(Rules.DEFAULT)
+ .run())
+
+ Example (Spark Connect):
suggestions = (ConstraintSuggestionRunner(spark)
.onData(df)
.addConstraintRules(Rules.DEFAULT)
.run())
"""
- def __init__(self, spark: "SparkSession"):
+ def __init__(self, spark: Optional["SparkSession"] = None):
"""
Create a new ConstraintSuggestionRunner.
Args:
- spark: SparkSession (can be either local or Spark Connect)
+ spark: Optional SparkSession for Spark Connect mode.
+ Not required for engine-based execution.
"""
self._spark = spark
def onData(self, df: "DataFrame") -> "ConstraintSuggestionRunBuilder":
"""
- Specify the DataFrame to analyze.
+ Specify the DataFrame to analyze (Spark mode).
Args:
df: DataFrame to analyze for constraint suggestions
Returns:
ConstraintSuggestionRunBuilder for method chaining
+
+ Raises:
+ ValueError: If SparkSession was not provided in constructor
"""
+ if self._spark is None:
+ raise ValueError(
+ "SparkSession required for onData(). "
+ "Use ConstraintSuggestionRunner(spark).onData(df) or "
+ "ConstraintSuggestionRunner().on_engine(engine) for engine-based execution."
+ )
return ConstraintSuggestionRunBuilder(self._spark, df)
+ def on_engine(self, engine: "BaseEngine") -> "EngineConstraintSuggestionRunBuilder":
+ """
+ Specify the engine to run suggestion analysis on (Engine mode).
+
+ Args:
+ engine: BaseEngine instance (e.g., DuckDBEngine)
+
+ Returns:
+ EngineConstraintSuggestionRunBuilder for method chaining
+ """
+ return EngineConstraintSuggestionRunBuilder(engine)
+
class ConstraintSuggestionRunBuilder:
"""
@@ -332,9 +377,92 @@ def _run_via_spark_connect(
return dataframe_from_plan(plan, self._spark)
+class EngineConstraintSuggestionRunBuilder:
+ """
+ Builder for configuring and executing engine-based constraint suggestions.
+
+ This class works with DuckDB and other SQL backends via the engine abstraction.
+ """
+
+ def __init__(self, engine: "BaseEngine"):
+ """
+ Create a new EngineConstraintSuggestionRunBuilder.
+
+ Args:
+ engine: BaseEngine instance (e.g., DuckDBEngine)
+ """
+ self._engine = engine
+ self._rules: List[Rules] = []
+ self._restrict_to_columns: Optional[Sequence[str]] = None
+
+ def addConstraintRules(self, rules: Rules) -> "EngineConstraintSuggestionRunBuilder":
+ """
+ Add a constraint rule set.
+
+ Can be called multiple times to add multiple rule sets.
+
+ Args:
+ rules: Rules enum value specifying which rules to use
+
+ Returns:
+ self for method chaining
+ """
+ self._rules.append(rules)
+ return self
+
+ def restrictToColumns(
+ self, columns: Sequence[str]
+ ) -> "EngineConstraintSuggestionRunBuilder":
+ """
+ Restrict suggestions to specific columns.
+
+ Args:
+ columns: List of column names to analyze
+
+ Returns:
+ self for method chaining
+ """
+ self._restrict_to_columns = columns
+ return self
+
+ def run(self) -> pd.DataFrame:
+ """
+ Execute the suggestion analysis and return results as a pandas DataFrame.
+
+ The result DataFrame contains columns:
+ - column_name: Column the constraint applies to
+ - constraint_name: Type of constraint (e.g., "Completeness", "Compliance")
+ - current_value: Current metric value that triggered suggestion
+ - description: Human-readable description
+ - suggesting_rule: Rule that generated this suggestion
+ - code_for_constraint: Python code snippet for the constraint
+
+ Returns:
+ pandas DataFrame with constraint suggestions
+
+ Raises:
+ ValueError: If no rules have been added
+ """
+ if not self._rules:
+ raise ValueError(
+ "At least one constraint rule set must be added. "
+ "Use .addConstraintRules(Rules.DEFAULT) to add rules."
+ )
+
+ # Convert Rules enum to string list
+ rule_strs = [r.value for r in self._rules]
+
+ suggestions = self._engine.suggest_constraints(
+ columns=self._restrict_to_columns,
+ rules=rule_strs,
+ )
+ return self._engine.suggestions_to_dataframe(suggestions)
+
+
# Export all public symbols
__all__ = [
"ConstraintSuggestionRunner",
"ConstraintSuggestionRunBuilder",
+ "EngineConstraintSuggestionRunBuilder",
"Rules",
]
diff --git a/pydeequ/v2/verification.py b/pydeequ/v2/verification.py
index c6d8d2f..10e43d0 100644
--- a/pydeequ/v2/verification.py
+++ b/pydeequ/v2/verification.py
@@ -1,12 +1,31 @@
# -*- coding: utf-8 -*-
"""
-VerificationSuite for Deequ Spark Connect.
+VerificationSuite for PyDeequ v2.
-This module provides the main entry point for running data quality checks
-via Spark Connect. It builds protobuf messages and sends them to the
-server-side Deequ plugin.
+This module provides the main entry point for running data quality checks.
+It supports two execution modes:
-Example usage:
+1. Engine-based (DuckDB, etc.) - uses pydeequ.connect()
+2. Spark Connect - uses SparkSession with Deequ plugin
+
+Example usage with DuckDB:
+ import duckdb
+ import pydeequ
+ from pydeequ.v2.verification import VerificationSuite, AnalysisRunner
+ from pydeequ.v2.checks import Check, CheckLevel
+ from pydeequ.v2.predicates import gte, eq
+
+ con = duckdb.connect()
+ con.execute("CREATE TABLE test AS SELECT 1 as id, 'foo@bar.com' as email")
+ engine = pydeequ.connect(con, table="test")
+
+ check = (Check(CheckLevel.Error, "Data quality check")
+ .isComplete("id")
+ .hasCompleteness("email", gte(0.95)))
+
+ result = VerificationSuite().on_engine(engine).addCheck(check).run()
+
+Example usage with Spark Connect:
from pyspark.sql import SparkSession
from pydeequ.v2.verification import VerificationSuite
from pydeequ.v2.checks import Check, CheckLevel
@@ -28,8 +47,9 @@
from __future__ import annotations
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional
+import pandas as pd
from google.protobuf import any_pb2
from pydeequ.v2.analyzers import _ConnectAnalyzer
@@ -39,18 +59,27 @@
if TYPE_CHECKING:
from pyspark.sql import DataFrame, SparkSession
+ from pydeequ.engines import BaseEngine
class VerificationSuite:
"""
Main entry point for running data quality verification.
- VerificationSuite allows you to define checks and analyzers to run
- on a DataFrame. When run() is called, the checks and analyzers are
- serialized to protobuf and sent to the Spark Connect server where
- the Deequ plugin executes them.
+ VerificationSuite allows you to define checks and analyzers to run.
+ It supports two execution modes:
+
+ 1. Engine-based: Use on_engine() for DuckDB and other SQL backends
+ 2. Spark-based: Use onData() for Spark Connect
+
+ Example (Engine-based):
+ suite = VerificationSuite()
+ result = (suite
+ .on_engine(engine)
+ .addCheck(check)
+ .run())
- Example:
+ Example (Spark-based):
suite = VerificationSuite(spark)
result = (suite
.onData(df)
@@ -58,27 +87,66 @@ class VerificationSuite:
.run())
"""
- def __init__(self, spark: "SparkSession"):
+ def __init__(self, spark: Optional["SparkSession"] = None):
"""
Create a new VerificationSuite.
Args:
- spark: SparkSession connected via Spark Connect
+ spark: Optional SparkSession for Spark Connect mode.
+ Not required for engine-based execution.
"""
self._spark = spark
def onData(self, df: "DataFrame") -> "VerificationRunBuilder":
"""
- Specify the DataFrame to run verification on.
+ Specify the DataFrame to run verification on (Spark mode).
Args:
df: DataFrame to verify
Returns:
VerificationRunBuilder for method chaining
+
+ Raises:
+ ValueError: If SparkSession was not provided in constructor
"""
+ if self._spark is None:
+ raise ValueError(
+ "SparkSession required for onData(). "
+ "Use VerificationSuite(spark).onData(df) or "
+ "VerificationSuite().on_engine(engine) for engine-based execution."
+ )
return VerificationRunBuilder(self._spark, df)
+ def on_engine(self, engine: "BaseEngine") -> "EngineVerificationRunBuilder":
+ """
+ Specify the engine to run verification on (Engine mode).
+
+ Args:
+ engine: BaseEngine instance (e.g., DuckDBEngine)
+
+ Returns:
+ EngineVerificationRunBuilder for method chaining
+ """
+ return EngineVerificationRunBuilder(engine)
+
+ # Guard method: engine-based table access must go through on_engine()
+ def on_table(self, table: str) -> "EngineVerificationRunBuilder":
+ """
+ Specify a table name for engine-based verification.
+
+ Note: This method always raises. For direct table access, use:
+ pydeequ.connect(con, table="my_table")
+
+ Raises:
+ ValueError: Always raised; use on_engine() with pydeequ.connect() instead
+ """
+ raise ValueError(
+ "on_table() requires an engine. Use: "
+ "VerificationSuite().on_engine(pydeequ.connect(con, table='my_table'))"
+ )
+
class VerificationRunBuilder:
"""
@@ -173,14 +241,89 @@ def run(self) -> "DataFrame":
return dataframe_from_plan(plan, self._spark)
+class EngineVerificationRunBuilder:
+ """
+ Builder for configuring and executing engine-based verification.
+
+ This class works with DuckDB and other SQL backends via the engine abstraction.
+ """
+
+ def __init__(self, engine: "BaseEngine"):
+ """
+ Create a new EngineVerificationRunBuilder.
+
+ Args:
+ engine: BaseEngine instance (e.g., DuckDBEngine)
+ """
+ self._engine = engine
+ self._checks: List[Check] = []
+ self._analyzers: List[_ConnectAnalyzer] = []
+
+ def addCheck(self, check: Check) -> "EngineVerificationRunBuilder":
+ """
+ Add a check to run.
+
+ Args:
+ check: Check to add
+
+ Returns:
+ self for method chaining
+ """
+ self._checks.append(check)
+ return self
+
+ def addAnalyzer(self, analyzer: _ConnectAnalyzer) -> "EngineVerificationRunBuilder":
+ """
+ Add an analyzer to run (in addition to those required by checks).
+
+ Args:
+ analyzer: Analyzer to add
+
+ Returns:
+ self for method chaining
+ """
+ self._analyzers.append(analyzer)
+ return self
+
+ def run(self) -> pd.DataFrame:
+ """
+ Execute the verification and return results as a pandas DataFrame.
+
+ The result DataFrame contains columns:
+ - check: Check description
+ - check_level: Error or Warning
+ - check_status: Success, Warning, or Error
+ - constraint: Constraint description
+ - constraint_status: Success or Failure
+ - constraint_message: Details about failures
+
+ Returns:
+ pandas DataFrame with verification results
+ """
+ # Run checks via engine
+ results = self._engine.run_checks(self._checks)
+ return self._engine.constraints_to_dataframe(results)
+
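+# Result-handling sketch (hedged): the column names follow the run() docstring
+# above; the pandas filtering itself is only an illustration.
+#
+#     result = VerificationSuite().on_engine(engine).addCheck(check).run()
+#     failures = result[result["constraint_status"] != "Success"]
+#     if not failures.empty:
+#         print(failures[["constraint", "constraint_message"]])
+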
+
class AnalysisRunner:
"""
Entry point for running analyzers without checks.
Use this when you want to compute metrics without defining
- pass/fail constraints.
+ pass/fail constraints. Supports both engine-based and Spark-based execution.
+
+ Example (Engine-based with DuckDB):
+ from pydeequ.v2.analyzers import Size, Completeness, Mean
+ import pydeequ
+
+ engine = pydeequ.connect(con, table="my_table")
+ result = (AnalysisRunner()
+ .on_engine(engine)
+ .addAnalyzer(Size())
+ .addAnalyzer(Completeness("email"))
+ .run())
- Example:
+ Example (Spark Connect):
from pydeequ.v2.analyzers import Size, Completeness, Mean
result = (AnalysisRunner(spark)
@@ -191,27 +334,61 @@ class AnalysisRunner:
.run())
"""
- def __init__(self, spark: "SparkSession"):
+ def __init__(self, spark: Optional["SparkSession"] = None):
"""
Create a new AnalysisRunner.
Args:
- spark: SparkSession connected via Spark Connect
+ spark: Optional SparkSession for Spark Connect mode.
+ Not required for engine-based execution.
"""
self._spark = spark
def onData(self, df: "DataFrame") -> "AnalysisRunBuilder":
"""
- Specify the DataFrame to analyze.
+ Specify the DataFrame to analyze (Spark mode).
Args:
df: DataFrame to analyze
Returns:
AnalysisRunBuilder for method chaining
+
+ Raises:
+ ValueError: If SparkSession was not provided in constructor
"""
+ if self._spark is None:
+ raise ValueError(
+ "SparkSession required for onData(). "
+ "Use AnalysisRunner(spark).onData(df) or "
+ "AnalysisRunner().on_engine(engine) for engine-based execution."
+ )
return AnalysisRunBuilder(self._spark, df)
+ def on_engine(self, engine: "BaseEngine") -> "EngineAnalysisRunBuilder":
+ """
+ Specify the engine to run analysis on (Engine mode).
+
+ Args:
+ engine: BaseEngine instance (e.g., DuckDBEngine)
+
+ Returns:
+ EngineAnalysisRunBuilder for method chaining
+ """
+ return EngineAnalysisRunBuilder(engine)
+
+ def on_table(self, table: str) -> "EngineAnalysisRunBuilder":
+ """
+ Specify a table name for engine-based analysis.
+
+ Raises:
+ ValueError: Always raised; use on_engine() with pydeequ.connect() instead
+ """
+ raise ValueError(
+ "on_table() requires an engine. Use: "
+ "AnalysisRunner().on_engine(pydeequ.connect(con, table='my_table'))"
+ )
+
class AnalysisRunBuilder:
"""Builder for configuring and executing an analysis run."""
@@ -270,10 +447,49 @@ def run(self) -> "DataFrame":
return dataframe_from_plan(plan, self._spark)
+class EngineAnalysisRunBuilder:
+ """Builder for configuring and executing engine-based analysis."""
+
+ def __init__(self, engine: "BaseEngine"):
+ """
+ Create a new EngineAnalysisRunBuilder.
+
+ Args:
+ engine: BaseEngine instance (e.g., DuckDBEngine)
+ """
+ self._engine = engine
+ self._analyzers: List[_ConnectAnalyzer] = []
+
+ def addAnalyzer(self, analyzer: _ConnectAnalyzer) -> "EngineAnalysisRunBuilder":
+ """
+ Add an analyzer to run.
+
+ Args:
+ analyzer: Analyzer to add
+
+ Returns:
+ self for method chaining
+ """
+ self._analyzers.append(analyzer)
+ return self
+
+ def run(self) -> pd.DataFrame:
+ """
+ Execute the analysis and return metrics as pandas DataFrame.
+
+ Returns:
+ pandas DataFrame with computed metrics
+ """
+ results = self._engine.compute_metrics(self._analyzers)
+ return self._engine.metrics_to_dataframe(results)
+
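+# Metrics-handling sketch (hedged): the exact columns of the metrics frame are
+# defined by the engine's metrics_to_dataframe(); the "name" and "value"
+# columns used below mirror classic Deequ metric frames and are assumptions.
+#
+#     metrics = (AnalysisRunner()
+#                .on_engine(engine)
+#                .addAnalyzer(Size())
+#                .run())
+#     size_value = metrics.loc[metrics["name"] == "Size", "value"]
+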
+
# Export all public symbols
__all__ = [
"VerificationSuite",
"VerificationRunBuilder",
+ "EngineVerificationRunBuilder",
"AnalysisRunner",
"AnalysisRunBuilder",
+ "EngineAnalysisRunBuilder",
]
diff --git a/pyproject.toml b/pyproject.toml
index 8168444..26b5237 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,14 +1,18 @@
-[tool.poetry]
+[project]
name = "pydeequ"
version = "2.0.0b1"
description = "PyDeequ - Unit Tests for Data"
-authors = ["Chenyang Liu ", "Rahul Sharma "]
-maintainers = ["Chenyang Liu ","Rahul Sharma "]
-license = "Apache-2.0"
readme = "README.md"
-homepage = "https://pydeequ.readthedocs.io"
-repository = "https://github.com/awslabs/python-deequ"
-documentation = "https://pydeequ.readthedocs.io"
+license = {text = "Apache-2.0"}
+requires-python = ">=3.9,<4"
+authors = [
+ {name = "Chenyang Liu", email = "peterl@amazon.com"},
+ {name = "Rahul Sharma", email = "rdsharma@amazon.com"},
+]
+maintainers = [
+ {name = "Chenyang Liu", email = "peterl@amazon.com"},
+ {name = "Rahul Sharma", email = "rdsharma@amazon.com"},
+]
keywords = [
"deequ",
"pydeequ",
@@ -23,17 +27,64 @@ keywords = [
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python :: 3",
- "License :: OSI Approved :: Apache Software License"
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
+ "License :: OSI Approved :: Apache Software License",
+ "Operating System :: OS Independent",
+ "Topic :: Database",
+ "Topic :: Software Development :: Quality Assurance",
+ "Topic :: Software Development :: Testing",
+]
+
+# Core dependencies - minimal set required for the base package
+dependencies = [
+ "numpy>=1.23.0",
+ "pandas>=1.5.0",
+ "protobuf>=4.21.0",
+ "setuptools>=69.0.0", # Required for Python 3.12+ (distutils removed)
+]
+
+[project.optional-dependencies]
+# DuckDB backend - lightweight, no JVM required
+duckdb = ["duckdb>=0.9.0"]
+
+# Spark backend - requires Spark Connect server
+spark = ["pyspark[connect]>=3.5.0"]
+
+# All backends
+all = [
+ "duckdb>=0.9.0",
+ "pyspark[connect]>=3.5.0",
]
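+
+# Install sketch (assumes the package is published to PyPI as "pydeequ"):
+#   pip install "pydeequ[duckdb]"   # DuckDB backend only, no JVM
+#   pip install "pydeequ[spark]"    # Spark Connect client
+#   pip install "pydeequ[all]"      # both backends
+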
+# Development dependencies
+dev = [
+ "pytest>=8.0.0",
+ "pytest-cov>=4.1.0",
+ "coverage>=7.4.0",
+ "black>=24.0.0",
+ "pre-commit>=3.6.0",
+ "pytest-rerunfailures>=14.0",
+ "matplotlib>=3.8.0",
+ "duckdb>=0.9.0",
+ "pyspark[connect]>=3.5.0",
+]
-[tool.poetry.dependencies]
-python = ">=3.9,<4"
-numpy = ">=1.23.0"
-pandas = ">=1.5.0"
-protobuf = ">=4.21.0"
-setuptools = ">=69.0.0" # Required for Python 3.12+ (distutils removed)
-pyspark = {version = "3.5.0", extras = ["connect"]}
+[project.urls]
+Homepage = "https://pydeequ.readthedocs.io"
+Repository = "https://github.com/awslabs/python-deequ"
+Documentation = "https://pydeequ.readthedocs.io"
+Issues = "https://github.com/awslabs/python-deequ/issues"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.poetry]
+# Poetry-specific settings (for poetry install compatibility)
+packages = [{include = "pydeequ"}]
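+
+# Note (sketch; Poetry >=1.2 group semantics): the dev group below is installed
+# by a plain `poetry install`; pass `--without dev` to skip it.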
[tool.poetry.group.dev.dependencies]
pytest = "^8.0.0"
@@ -42,12 +93,9 @@ coverage = "^7.4.0"
black = "^24.0.0"
pre-commit = "^3.6.0"
pytest-rerunfailures = "^14.0"
-
-[tool.poetry.extras]
-
-[build-system]
-requires = ["poetry-core>=1.0.0"]
-build-backend = "poetry.core.masonry.api"
+matplotlib = "^3.8.0"
+duckdb = ">=0.9.0"
+pyspark = {version = ">=3.5.0", extras = ["connect"]}
[tool.black]
# https://github.com/psf/black
@@ -57,7 +105,7 @@ include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
ensure_newline_before_comments = true
-target_version = ['py38']
+target_version = ['py39']
include = '\.pyi?$'
exclude = '''
/(
@@ -89,27 +137,10 @@ indent = ' '
multi_line_output = 3
include_trailing_comma = true
skip_glob = ['__init__.py']
-#force_grid_wrap = 0
atomic = true
-#lines_after_imports = 2
-#lines_between_types = 1
-#src_paths=isort,test
-
-# [mypy]
-# python_version = 3.8
-#warn_return_any = True
-#warn_unused_configs = True
-
-#[mypy-pyspark.*]
-#ignore_missing_imports = True
-# pytest -n 2 --reruns 3 --reruns-delay 5 --dist loadscope --tx 2*popen//python=python
-[pytest]
-testpaths = "tests"
-norecursedirs = ".git .* *.egg* old docs dist build"
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+norecursedirs = [".git", ".*", "*.egg*", "old", "docs", "dist", "build"]
cache_dir = "./.pytest_cache"
python_files = "*test_*.py"
-looponfailroots = "pydeequ tests"
-# addopts = "-n3 --reruns 3 --reruns-delay 5 --dist loadscope"
-# rsyncdirs = . mypkg helperpkg
-# rsyncignore = .hg
diff --git a/tests/conftest.py b/tests/conftest.py
index 543a27e..077afec 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,8 +2,7 @@
"""
Pytest configuration for PyDeequ tests using Spark Connect.
-All tests use the Spark Connect server which must be running before tests.
-Start it with: scripts/start-spark-connect.sh
+The Spark Connect server is automatically started by the spark_connect_server fixture.
"""
import os
@@ -15,6 +14,33 @@
os.environ.setdefault("SPARK_VERSION", "3.5")
+@pytest.fixture(scope="session")
+def spark_connect_server():
+ """Session-scoped fixture to start Spark Connect server.
+
+ Automatically starts the Spark Connect server if not already running.
+ The server is NOT stopped after tests complete (to allow reuse across test runs).
+ """
+ from tests.helpers.spark_server import SparkConnectServer, SparkServerConfig
+
+ config = SparkServerConfig()
+ server = SparkConnectServer(config)
+
+ if not server.is_running():
+ print("\nStarting Spark Connect server for tests...")
+ server.start()
+ print("Spark Connect server started.")
+ else:
+ print("\nSpark Connect server already running.")
+
+ # Set SPARK_REMOTE if not already set
+ if not os.environ.get("SPARK_REMOTE"):
+ os.environ["SPARK_REMOTE"] = f"sc://localhost:{config.port}"
+
+ yield server
+ # Note: We don't stop the server here to allow reuse across test runs
+
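+# Example (sketch): a test can guarantee the server is up by depending on the
+# fixture directly:
+#
+#     def test_something(spark_connect_server, spark):
+#         ...
+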
+
def create_spark_connect_session() -> SparkSession:
"""
Create a Spark Connect session for testing.
@@ -29,11 +55,12 @@ def create_spark_connect_session() -> SparkSession:
@pytest.fixture(scope="module")
-def spark() -> SparkSession:
+def spark(spark_connect_server) -> SparkSession:
"""
Pytest fixture providing a Spark Connect session.
The session is shared within each test module for efficiency.
+ Depends on spark_connect_server to ensure server is running.
Yields:
SparkSession for testing
@@ -75,6 +102,6 @@ def config(self, key, value):
return self
def getOrCreate(self):
- return get_spark_connect_session()
+ return create_spark_connect_session()
return SparkConnectBuilder()
diff --git a/tests/engines/__init__.py b/tests/engines/__init__.py
new file mode 100644
index 0000000..2d41719
--- /dev/null
+++ b/tests/engines/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Engine correctness testing module.
+
+This module contains tests for validating the DuckDB engine implementation
+and comparing it against the Spark engine baseline.
+"""
diff --git a/tests/engines/comparison/__init__.py b/tests/engines/comparison/__init__.py
new file mode 100644
index 0000000..6b039c7
--- /dev/null
+++ b/tests/engines/comparison/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Cross-engine comparison tests.
+
+Contains tests that compare DuckDB engine results against Spark engine
+to validate correctness and functional parity.
+"""
diff --git a/tests/engines/comparison/conftest.py b/tests/engines/comparison/conftest.py
new file mode 100644
index 0000000..b8fc389
--- /dev/null
+++ b/tests/engines/comparison/conftest.py
@@ -0,0 +1,230 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Dual-engine test fixtures for cross-engine comparison.
+
+Provides fixtures for creating both Spark and DuckDB engines with
+identical data for parity testing. The Spark Connect server is
+automatically started if not already running.
+"""
+
+import os
+from dataclasses import dataclass
+from typing import Callable, Generator
+import pytest
+import duckdb
+
+from pydeequ.engines import BaseEngine
+from pydeequ.engines.duckdb import DuckDBEngine
+from tests.engines.fixtures.datasets import DATASET_FACTORIES
+
+
+# Marker for tests requiring Spark - uses the spark_connect_server fixture
+# from the top-level conftest.py which automatically starts the server
+requires_spark = pytest.mark.usefixtures("spark_connect_server")
+
+
+@dataclass
+class DualEngines:
+ """Container for both Spark and DuckDB engines with same data."""
+ spark_engine: BaseEngine
+ duckdb_engine: BaseEngine
+ dataset_name: str
+
+
+@pytest.fixture(scope="module")
+def spark_session(spark_connect_server):
+ """Create a module-scoped Spark Connect session.
+
+ Depends on spark_connect_server fixture to ensure server is running.
+ """
+ from pyspark.sql import SparkSession
+ spark_remote = os.environ.get("SPARK_REMOTE", "sc://localhost:15002")
+ spark = SparkSession.builder.remote(spark_remote).getOrCreate()
+ yield spark
+ spark.stop()
+
+
+@pytest.fixture(scope="module")
+def duckdb_connection() -> Generator[duckdb.DuckDBPyConnection, None, None]:
+ """Create a module-scoped DuckDB connection."""
+ conn = duckdb.connect(":memory:")
+ yield conn
+ conn.close()
+
+
+@pytest.fixture(scope="function")
+def dual_engine_factory(
+ spark_session,
+ duckdb_connection: duckdb.DuckDBPyConnection
+) -> Callable[[str], DualEngines]:
+ """Factory fixture to create both Spark and DuckDB engines with same data.
+
+ Usage:
+ def test_comparison(dual_engine_factory):
+ engines = dual_engine_factory("df_full")
+ spark_metrics = engines.spark_engine.compute_metrics([Size()])
+ duckdb_metrics = engines.duckdb_engine.compute_metrics([Size()])
+ assert_metrics_match(spark_metrics, duckdb_metrics)
+ """
+ tables_created = []
+
+ def factory(dataset_name: str) -> DualEngines:
+ if dataset_name not in DATASET_FACTORIES:
+ raise ValueError(f"Unknown dataset: {dataset_name}")
+
+ # Get the pandas DataFrame
+ pdf = DATASET_FACTORIES[dataset_name]()
+ table_name = f"test_{dataset_name}"
+
+ # Create DuckDB engine
+ try:
+ duckdb_connection.unregister(table_name)
+ except Exception:
+ pass
+ duckdb_connection.register(table_name, pdf)
+ duckdb_engine = DuckDBEngine(duckdb_connection, table_name)
+ tables_created.append(table_name)
+
+ # Create Spark engine
+ from pydeequ.engines.spark import SparkEngine
+ spark_df = spark_session.createDataFrame(pdf)
+ spark_engine = SparkEngine(spark_session, dataframe=spark_df)
+
+ return DualEngines(
+ spark_engine=spark_engine,
+ duckdb_engine=duckdb_engine,
+ dataset_name=dataset_name
+ )
+
+ yield factory
+
+ # Cleanup DuckDB tables
+ for table_name in tables_created:
+ try:
+ duckdb_connection.unregister(table_name)
+ except Exception:
+ pass
+
+
+# Convenience fixtures for common datasets
+
+
+@pytest.fixture(scope="function")
+def dual_engines_full(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_full dataset."""
+ return dual_engine_factory("df_full")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_missing(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_missing dataset."""
+ return dual_engine_factory("df_missing")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_numeric(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_numeric dataset."""
+ return dual_engine_factory("df_numeric")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_unique(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_unique dataset."""
+ return dual_engine_factory("df_unique")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_distinct(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_distinct dataset."""
+ return dual_engine_factory("df_distinct")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_string_lengths(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_string_lengths dataset."""
+ return dual_engine_factory("df_string_lengths")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_correlation(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_correlation dataset."""
+ return dual_engine_factory("df_correlation")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_entropy(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_entropy dataset."""
+ return dual_engine_factory("df_entropy")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_compliance(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_compliance dataset."""
+ return dual_engine_factory("df_compliance")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_pattern(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_pattern dataset."""
+ return dual_engine_factory("df_pattern")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_quantile(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_quantile dataset."""
+ return dual_engine_factory("df_quantile")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_contained_in(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_contained_in dataset."""
+ return dual_engine_factory("df_contained_in")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_histogram(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_histogram dataset."""
+ return dual_engine_factory("df_histogram")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_mutual_info(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_mutual_info dataset."""
+ return dual_engine_factory("df_mutual_info")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_where(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_where dataset."""
+ return dual_engine_factory("df_where")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_all_null(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_all_null dataset."""
+ return dual_engine_factory("df_all_null")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_single(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_single dataset."""
+ return dual_engine_factory("df_single")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_empty(dual_engine_factory) -> DualEngines:
+ """Dual engines with df_empty dataset."""
+ return dual_engine_factory("df_empty")
diff --git a/tests/engines/comparison/test_analyzer_parity.py b/tests/engines/comparison/test_analyzer_parity.py
new file mode 100644
index 0000000..8db0d2e
--- /dev/null
+++ b/tests/engines/comparison/test_analyzer_parity.py
@@ -0,0 +1,393 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Cross-engine analyzer parity tests.
+
+Tests that verify the DuckDB engine produces the same analyzer results
+as the Spark engine baseline. Requires Spark Connect to be running.
+"""
+
+import pytest
+
+from pydeequ.v2.analyzers import (
+ Size,
+ Completeness,
+ Mean,
+ Sum,
+ Maximum,
+ Minimum,
+ StandardDeviation,
+ Distinctness,
+ Uniqueness,
+ UniqueValueRatio,
+ CountDistinct,
+ ApproxCountDistinct,
+ ApproxQuantile,
+ Correlation,
+ MutualInformation,
+ MaxLength,
+ MinLength,
+ PatternMatch,
+ Compliance,
+ Entropy,
+ Histogram,
+ DataType,
+)
+
+from tests.engines.comparison.conftest import requires_spark, DualEngines
+from tests.engines.comparison.utils import assert_metrics_match
+
+
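+# Shared test pattern (sketch): each parity test computes identical analyzers
+# on both engines and delegates the comparison to assert_metrics_match, which
+# takes the two metric collections plus a label and, per the comments below,
+# applies a looser tolerance for approximate analyzers:
+#
+#     analyzers = [Size(), Completeness("att1")]
+#     spark_metrics = engines.spark_engine.compute_metrics(analyzers)
+#     duckdb_metrics = engines.duckdb_engine.compute_metrics(analyzers)
+#     assert_metrics_match(spark_metrics, duckdb_metrics, "label")
+
+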
+@requires_spark
+class TestSizeAnalyzerParity:
+ """Parity tests for Size analyzer."""
+
+ def test_size_basic(self, dual_engines_full: DualEngines):
+ """Size produces same result on both engines."""
+ analyzers = [Size()]
+ spark_metrics = dual_engines_full.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_full.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Size basic")
+
+ def test_size_with_nulls(self, dual_engines_missing: DualEngines):
+ """Size counts all rows regardless of NULLs on both engines."""
+ analyzers = [Size()]
+ spark_metrics = dual_engines_missing.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_missing.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Size with nulls")
+
+
+@requires_spark
+class TestCompletenessAnalyzerParity:
+ """Parity tests for Completeness analyzer."""
+
+ def test_completeness_full(self, dual_engines_full: DualEngines):
+ """Completeness produces same result for complete columns."""
+ analyzers = [Completeness("att1"), Completeness("att2")]
+ spark_metrics = dual_engines_full.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_full.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Completeness full")
+
+ def test_completeness_partial(self, dual_engines_missing: DualEngines):
+ """Completeness produces same result for partial columns."""
+ analyzers = [Completeness("att1"), Completeness("att2")]
+ spark_metrics = dual_engines_missing.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_missing.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Completeness partial")
+
+ def test_completeness_all_null(self, dual_engines_all_null: DualEngines):
+ """Completeness produces same result for all-NULL columns."""
+ analyzers = [Completeness("value")]
+ spark_metrics = dual_engines_all_null.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_all_null.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Completeness all null")
+
+
+@requires_spark
+class TestStatisticalAnalyzerParity:
+ """Parity tests for statistical analyzers."""
+
+ def test_mean(self, dual_engines_numeric: DualEngines):
+ """Mean produces same result on both engines."""
+ analyzers = [Mean("att1"), Mean("att2")]
+ spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Mean")
+
+ def test_sum(self, dual_engines_numeric: DualEngines):
+ """Sum produces same result on both engines."""
+ analyzers = [Sum("att1"), Sum("att2")]
+ spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Sum")
+
+ def test_minimum(self, dual_engines_numeric: DualEngines):
+ """Minimum produces same result on both engines."""
+ analyzers = [Minimum("att1"), Minimum("att2")]
+ spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Minimum")
+
+ def test_maximum(self, dual_engines_numeric: DualEngines):
+ """Maximum produces same result on both engines."""
+ analyzers = [Maximum("att1"), Maximum("att2")]
+ spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Maximum")
+
+ def test_standard_deviation(self, dual_engines_numeric: DualEngines):
+ """StandardDeviation produces same result on both engines."""
+ analyzers = [StandardDeviation("att1")]
+ spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "StandardDeviation")
+
+
+@requires_spark
+class TestUniquenessAnalyzerParity:
+ """Parity tests for uniqueness-related analyzers."""
+
+ def test_distinctness(self, dual_engines_distinct: DualEngines):
+ """Distinctness produces same result on both engines."""
+ analyzers = [Distinctness(["att1"]), Distinctness(["att2"])]
+ spark_metrics = dual_engines_distinct.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_distinct.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Distinctness")
+
+ def test_uniqueness(self, dual_engines_distinct: DualEngines):
+ """Uniqueness produces same result on both engines."""
+ analyzers = [Uniqueness(["att1"]), Uniqueness(["att2"])]
+ spark_metrics = dual_engines_distinct.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_distinct.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Uniqueness")
+
+ def test_unique_value_ratio(self, dual_engines_distinct: DualEngines):
+ """UniqueValueRatio produces same result on both engines."""
+ analyzers = [UniqueValueRatio(["att1"]), UniqueValueRatio(["att2"])]
+ spark_metrics = dual_engines_distinct.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_distinct.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "UniqueValueRatio")
+
+ def test_count_distinct(self, dual_engines_distinct: DualEngines):
+ """CountDistinct produces same result on both engines."""
+ analyzers = [CountDistinct(["att1"]), CountDistinct(["att2"])]
+ spark_metrics = dual_engines_distinct.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_distinct.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "CountDistinct")
+
+ def test_approx_count_distinct(self, dual_engines_distinct: DualEngines):
+ """ApproxCountDistinct produces approximately same result."""
+ analyzers = [ApproxCountDistinct("att1"), ApproxCountDistinct("att2")]
+ spark_metrics = dual_engines_distinct.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_distinct.duckdb_engine.compute_metrics(analyzers)
+ # Uses APPROX_TOLERANCE (10%) for approximate algorithms
+ assert_metrics_match(spark_metrics, duckdb_metrics, "ApproxCountDistinct")
+
+
+@requires_spark
+class TestStringAnalyzerParity:
+ """Parity tests for string analyzers."""
+
+ def test_min_length(self, dual_engines_string_lengths: DualEngines):
+ """MinLength produces same result on both engines."""
+ analyzers = [MinLength("att1"), MinLength("att2")]
+ spark_metrics = dual_engines_string_lengths.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_string_lengths.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "MinLength")
+
+ def test_max_length(self, dual_engines_string_lengths: DualEngines):
+ """MaxLength produces same result on both engines."""
+ analyzers = [MaxLength("att1"), MaxLength("att2")]
+ spark_metrics = dual_engines_string_lengths.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_string_lengths.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "MaxLength")
+
+ def test_pattern_match(self, dual_engines_pattern: DualEngines):
+ """PatternMatch produces same result on both engines."""
+ analyzers = [PatternMatch("email", r".*@.*\..*")]
+ spark_metrics = dual_engines_pattern.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_pattern.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "PatternMatch")
+
+
+@requires_spark
+class TestCorrelationAnalyzerParity:
+ """Parity tests for Correlation analyzer."""
+
+ def test_correlation_positive(self, dual_engines_correlation: DualEngines):
+ """Correlation produces same result for positively correlated columns."""
+ analyzers = [Correlation("x", "y")]
+ spark_metrics = dual_engines_correlation.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_correlation.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Correlation positive")
+
+ def test_correlation_negative(self, dual_engines_correlation: DualEngines):
+ """Correlation produces same result for negatively correlated columns."""
+ analyzers = [Correlation("x", "z")]
+ spark_metrics = dual_engines_correlation.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_correlation.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Correlation negative")
+
+
+@requires_spark
+class TestEntropyAnalyzerParity:
+ """Parity tests for Entropy analyzer."""
+
+ def test_entropy_uniform(self, dual_engines_entropy: DualEngines):
+ """Entropy produces same result for uniform distribution."""
+ analyzers = [Entropy("uniform")]
+ spark_metrics = dual_engines_entropy.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_entropy.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Entropy uniform")
+
+ def test_entropy_constant(self, dual_engines_entropy: DualEngines):
+ """Entropy produces same result for constant column."""
+ analyzers = [Entropy("constant")]
+ spark_metrics = dual_engines_entropy.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_entropy.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Entropy constant")
+
+
+@requires_spark
+class TestMutualInformationAnalyzerParity:
+ """Parity tests for MutualInformation analyzer."""
+
+ def test_mutual_information(self, dual_engines_mutual_info: DualEngines):
+ """MutualInformation produces same result on both engines."""
+ analyzers = [MutualInformation(["x", "y_dependent"])]
+ spark_metrics = dual_engines_mutual_info.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_mutual_info.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "MutualInformation")
+
+
+@requires_spark
+class TestComplianceAnalyzerParity:
+ """Parity tests for Compliance analyzer."""
+
+ def test_compliance(self, dual_engines_compliance: DualEngines):
+ """Compliance produces same result on both engines."""
+ analyzers = [
+ Compliance("positive_check", "positive > 0"),
+ Compliance("mixed_check", "mixed > 0"),
+ ]
+ spark_metrics = dual_engines_compliance.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_compliance.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Compliance")
+
+
+@requires_spark
+class TestQuantileAnalyzerParity:
+ """Parity tests for ApproxQuantile analyzer."""
+
+ def test_approx_quantile_median(self, dual_engines_quantile: DualEngines):
+ """ApproxQuantile produces same result for median."""
+ analyzers = [ApproxQuantile("value", 0.5)]
+ spark_metrics = dual_engines_quantile.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_quantile.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "ApproxQuantile median")
+
+ def test_approx_quantile_quartiles(self, dual_engines_quantile: DualEngines):
+ """ApproxQuantile produces same result for quartiles."""
+ analyzers = [
+ ApproxQuantile("value", 0.25),
+ ApproxQuantile("value", 0.75),
+ ]
+ spark_metrics = dual_engines_quantile.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_quantile.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "ApproxQuantile quartiles")
+
+
+@requires_spark
+class TestHistogramAnalyzerParity:
+ """Parity tests for Histogram analyzer."""
+
+ def test_histogram(self, dual_engines_histogram: DualEngines):
+ """Histogram produces consistent results on both engines."""
+ analyzers = [Histogram("category")]
+ spark_metrics = dual_engines_histogram.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_histogram.duckdb_engine.compute_metrics(analyzers)
+ # Histogram structure may differ across engines, so just check that both return results
+ assert len(spark_metrics) > 0
+ assert len(duckdb_metrics) > 0
+
+
+@requires_spark
+class TestDataTypeAnalyzerParity:
+ """Parity tests for DataType analyzer."""
+
+ def test_data_type(self, dual_engines_full: DualEngines):
+ """DataType produces consistent results on both engines."""
+ analyzers = [DataType("att1")]
+ spark_metrics = dual_engines_full.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_full.duckdb_engine.compute_metrics(analyzers)
+ # DataType format may differ across engines, so just check that both return results
+ assert len(spark_metrics) > 0
+ assert len(duckdb_metrics) > 0
+
+
+@requires_spark
+class TestAnalyzersWithWhereParity:
+ """Parity tests for analyzers with WHERE clause."""
+
+ def test_size_with_where(self, dual_engines_where: DualEngines):
+ """Size with WHERE produces same result on both engines."""
+ analyzers = [Size(where="category = 'A'")]
+ spark_metrics = dual_engines_where.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_where.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Size with WHERE")
+
+ def test_completeness_with_where(self, dual_engines_where: DualEngines):
+ """Completeness with WHERE produces same result on both engines."""
+ analyzers = [Completeness("att1", where="category = 'A'")]
+ spark_metrics = dual_engines_where.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_where.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Completeness with WHERE")
+
+ def test_mean_with_where(self, dual_engines_where: DualEngines):
+ """Mean with WHERE produces same result on both engines."""
+ analyzers = [Mean("value", where="category = 'A'")]
+ spark_metrics = dual_engines_where.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_where.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Mean with WHERE")
+
+
+@requires_spark
+class TestMultipleAnalyzersParity:
+ """Parity tests for running multiple analyzers together."""
+
+ def test_all_basic_analyzers(self, dual_engines_numeric: DualEngines):
+ """All basic analyzers produce same results on both engines."""
+ analyzers = [
+ Size(),
+ Completeness("att1"),
+ Mean("att1"),
+ Sum("att1"),
+ Minimum("att1"),
+ Maximum("att1"),
+ StandardDeviation("att1"),
+ ]
+ spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "All basic analyzers")
+
+ def test_mixed_analyzer_types(self, dual_engines_full: DualEngines):
+ """Mixed analyzer types produce same results on both engines."""
+ analyzers = [
+ Size(),
+ Completeness("att1"),
+ CountDistinct(["att1"]),
+ MaxLength("att1"),
+ MinLength("att1"),
+ ]
+ spark_metrics = dual_engines_full.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_full.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Mixed analyzer types")
+
+
+@requires_spark
+class TestEdgeCasesParity:
+ """Parity tests for edge cases."""
+
+ def test_single_row(self, dual_engines_single: DualEngines):
+ """Analyzers produce same results for single-row dataset."""
+ analyzers = [
+ Size(),
+ Completeness("att1"),
+ Mean("item"),
+ Maximum("item"),
+ Minimum("item"),
+ ]
+ spark_metrics = dual_engines_single.spark_engine.compute_metrics(analyzers)
+ duckdb_metrics = dual_engines_single.duckdb_engine.compute_metrics(analyzers)
+ assert_metrics_match(spark_metrics, duckdb_metrics, "Single row")
diff --git a/tests/engines/comparison/test_constraint_parity.py b/tests/engines/comparison/test_constraint_parity.py
new file mode 100644
index 0000000..0e32232
--- /dev/null
+++ b/tests/engines/comparison/test_constraint_parity.py
@@ -0,0 +1,334 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Cross-engine constraint parity tests.
+
+Tests that verify the DuckDB engine produces the same constraint evaluation
+results as the Spark engine baseline. Requires Spark Connect to be running.
+"""
+
+import pytest
+
+from pydeequ.v2.checks import Check, CheckLevel
+from pydeequ.v2.predicates import eq, gt, gte, lt, lte, between, is_one
+
+from tests.engines.comparison.conftest import requires_spark, DualEngines
+from tests.engines.comparison.utils import assert_constraints_match
+
+
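+# Predicate note (sketch, inferred from usage in this module): eq(4) asserts
+# the metric equals 4, gte(0.5) that it is at least 0.5, between(a, b) that it
+# lies within [a, b], and is_one() that it equals 1.0.
+
+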
+@requires_spark
+class TestSizeConstraintParity:
+ """Parity tests for size constraints."""
+
+ def test_has_size_success(self, dual_engines_full: DualEngines):
+ """hasSize produces same result on both engines when passing."""
+ check = Check(CheckLevel.Error, "size check").hasSize(eq(4))
+ spark_results = dual_engines_full.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasSize success")
+
+ def test_has_size_failure(self, dual_engines_full: DualEngines):
+ """hasSize produces same result on both engines when failing."""
+ check = Check(CheckLevel.Error, "size check").hasSize(eq(100))
+ spark_results = dual_engines_full.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasSize failure")
+
+
+@requires_spark
+class TestCompletenessConstraintParity:
+ """Parity tests for completeness constraints."""
+
+ def test_is_complete_success(self, dual_engines_full: DualEngines):
+ """isComplete produces same result on both engines when passing."""
+ check = Check(CheckLevel.Error, "complete").isComplete("att1")
+ spark_results = dual_engines_full.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "isComplete success")
+
+ def test_is_complete_failure(self, dual_engines_missing: DualEngines):
+ """isComplete produces same result on both engines when failing."""
+ check = Check(CheckLevel.Error, "complete").isComplete("att1")
+ spark_results = dual_engines_missing.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_missing.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "isComplete failure")
+
+ def test_has_completeness(self, dual_engines_missing: DualEngines):
+ """hasCompleteness produces same result on both engines."""
+ check = Check(CheckLevel.Error, "threshold").hasCompleteness("att1", gte(0.5))
+ spark_results = dual_engines_missing.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_missing.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasCompleteness")
+
+ def test_are_complete(self, dual_engines_full: DualEngines):
+ """areComplete produces same result on both engines."""
+ check = Check(CheckLevel.Error, "multi").areComplete(["att1", "att2"])
+ spark_results = dual_engines_full.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "areComplete")
+
+
+@requires_spark
+class TestUniquenessConstraintParity:
+ """Parity tests for uniqueness constraints."""
+
+ def test_is_unique_success(self, dual_engines_unique: DualEngines):
+ """isUnique produces same result on both engines when passing."""
+ check = Check(CheckLevel.Error, "unique").isUnique("unique_col")
+ spark_results = dual_engines_unique.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_unique.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "isUnique success")
+
+ def test_is_unique_failure(self, dual_engines_unique: DualEngines):
+ """isUnique produces same result on both engines when failing."""
+ check = Check(CheckLevel.Error, "not unique").isUnique("non_unique")
+ spark_results = dual_engines_unique.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_unique.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "isUnique failure")
+
+ def test_has_uniqueness(self, dual_engines_distinct: DualEngines):
+ """hasUniqueness produces same result on both engines."""
+ check = Check(CheckLevel.Error, "uniqueness").hasUniqueness(["att2"], is_one())
+ spark_results = dual_engines_distinct.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_distinct.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasUniqueness")
+
+ def test_has_distinctness(self, dual_engines_distinct: DualEngines):
+ """hasDistinctness produces same result on both engines."""
+ check = Check(CheckLevel.Error, "distinct").hasDistinctness(["att1"], gte(0.5))
+ spark_results = dual_engines_distinct.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_distinct.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasDistinctness")
+
+
+@requires_spark
+class TestStatisticalConstraintParity:
+ """Parity tests for statistical constraints."""
+
+ def test_has_min(self, dual_engines_numeric: DualEngines):
+ """hasMin produces same result on both engines."""
+ check = Check(CheckLevel.Error, "min").hasMin("att1", eq(1))
+ spark_results = dual_engines_numeric.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_numeric.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasMin")
+
+ def test_has_max(self, dual_engines_numeric: DualEngines):
+ """hasMax produces same result on both engines."""
+ check = Check(CheckLevel.Error, "max").hasMax("att1", eq(6))
+ spark_results = dual_engines_numeric.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_numeric.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasMax")
+
+ def test_has_mean(self, dual_engines_numeric: DualEngines):
+ """hasMean produces same result on both engines."""
+ check = Check(CheckLevel.Error, "mean").hasMean("att1", eq(3.5))
+ spark_results = dual_engines_numeric.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_numeric.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasMean")
+
+ def test_has_sum(self, dual_engines_numeric: DualEngines):
+ """hasSum produces same result on both engines."""
+ check = Check(CheckLevel.Error, "sum").hasSum("att1", eq(21))
+ spark_results = dual_engines_numeric.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_numeric.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasSum")
+
+ def test_has_standard_deviation(self, dual_engines_numeric: DualEngines):
+ """hasStandardDeviation produces same result on both engines."""
+ check = Check(CheckLevel.Error, "stddev").hasStandardDeviation("att1", between(1.5, 2.0))
+ spark_results = dual_engines_numeric.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_numeric.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasStandardDeviation")
+
+
+@requires_spark
+class TestCorrelationConstraintParity:
+ """Parity tests for correlation constraints."""
+
+ def test_has_correlation(self, dual_engines_correlation: DualEngines):
+ """hasCorrelation produces same result on both engines."""
+ check = Check(CheckLevel.Error, "corr").hasCorrelation("x", "y", is_one())
+ spark_results = dual_engines_correlation.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_correlation.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasCorrelation")
+
+
+@requires_spark
+class TestEntropyConstraintParity:
+ """Parity tests for entropy constraints."""
+
+ def test_has_entropy(self, dual_engines_entropy: DualEngines):
+ """hasEntropy produces same result on both engines."""
+ check = Check(CheckLevel.Error, "entropy").hasEntropy("uniform", eq(2.0))
+ spark_results = dual_engines_entropy.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_entropy.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasEntropy")
+
+
+@requires_spark
+class TestStringConstraintParity:
+ """Parity tests for string constraints."""
+
+ def test_has_min_length(self, dual_engines_string_lengths: DualEngines):
+ """hasMinLength produces same result on both engines."""
+ check = Check(CheckLevel.Error, "min len").hasMinLength("att1", eq(0))
+ spark_results = dual_engines_string_lengths.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_string_lengths.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasMinLength")
+
+ def test_has_max_length(self, dual_engines_string_lengths: DualEngines):
+ """hasMaxLength produces same result on both engines."""
+ check = Check(CheckLevel.Error, "max len").hasMaxLength("att1", lte(5))
+ spark_results = dual_engines_string_lengths.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_string_lengths.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasMaxLength")
+
+ def test_has_pattern(self, dual_engines_pattern: DualEngines):
+ """hasPattern produces same result on both engines."""
+ check = Check(CheckLevel.Error, "pattern").hasPattern("email", r".*@.*\..*", gte(0.5))
+ spark_results = dual_engines_pattern.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_pattern.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "hasPattern")
+
+
+@requires_spark
+class TestNumericConstraintParity:
+ """Parity tests for numeric value constraints."""
+
+ def test_is_positive(self, dual_engines_compliance: DualEngines):
+ """isPositive produces same result on both engines."""
+ check = Check(CheckLevel.Error, "positive").isPositive("positive", is_one())
+ spark_results = dual_engines_compliance.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_compliance.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "isPositive")
+
+ def test_is_non_negative(self, dual_engines_compliance: DualEngines):
+ """isNonNegative produces same result on both engines."""
+ check = Check(CheckLevel.Error, "non-neg").isNonNegative("positive", is_one())
+ spark_results = dual_engines_compliance.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_compliance.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "isNonNegative")
+
+
+@requires_spark
+class TestColumnComparisonConstraintParity:
+ """Parity tests for column comparison constraints."""
+
+ def test_is_less_than(self, dual_engines_correlation: DualEngines):
+ """isLessThan produces same result on both engines."""
+ check = Check(CheckLevel.Error, "less").isLessThan("x", "y")
+ spark_results = dual_engines_correlation.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_correlation.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "isLessThan")
+
+ def test_is_greater_than(self, dual_engines_correlation: DualEngines):
+ """isGreaterThan produces same result on both engines."""
+ check = Check(CheckLevel.Error, "greater").isGreaterThan("y", "x")
+ spark_results = dual_engines_correlation.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_correlation.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "isGreaterThan")
+
+
+@requires_spark
+class TestContainedInConstraintParity:
+ """Parity tests for isContainedIn constraint."""
+
+ def test_is_contained_in_success(self, dual_engines_contained_in: DualEngines):
+ """isContainedIn produces same result on both engines when passing."""
+ check = Check(CheckLevel.Error, "contained").isContainedIn(
+ "status", ["active", "inactive", "pending"], is_one()
+ )
+ spark_results = dual_engines_contained_in.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_contained_in.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "isContainedIn success")
+
+ def test_is_contained_in_failure(self, dual_engines_contained_in: DualEngines):
+ """isContainedIn produces same result on both engines when failing."""
+ check = Check(CheckLevel.Error, "not contained").isContainedIn(
+ "category", ["A", "B", "C"], is_one()
+ )
+ spark_results = dual_engines_contained_in.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_contained_in.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "isContainedIn failure")
+
+
+@requires_spark
+class TestSatisfiesConstraintParity:
+ """Parity tests for satisfies constraint."""
+
+ def test_satisfies(self, dual_engines_compliance: DualEngines):
+ """satisfies produces same result on both engines."""
+ check = Check(CheckLevel.Error, "satisfies").satisfies(
+ "positive > 0", "positive_check", assertion=is_one()
+ )
+ spark_results = dual_engines_compliance.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_compliance.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "satisfies")
+
+
+@requires_spark
+class TestMultipleConstraintsParity:
+ """Parity tests for multiple constraints."""
+
+ def test_multiple_constraints_all_pass(self, dual_engines_full: DualEngines):
+ """Multiple passing constraints produce same results."""
+ check = (Check(CheckLevel.Error, "multi pass")
+ .hasSize(eq(4))
+ .isComplete("att1")
+ .isComplete("att2"))
+ spark_results = dual_engines_full.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "Multiple pass")
+
+ def test_multiple_constraints_some_fail(self, dual_engines_missing: DualEngines):
+ """Mixed pass/fail constraints produce same results."""
+ check = (Check(CheckLevel.Error, "multi mixed")
+ .hasSize(eq(12)) # Pass
+ .isComplete("att1")) # Fail
+ spark_results = dual_engines_missing.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_missing.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "Multiple mixed")
+
+
+@requires_spark
+class TestCheckLevelsParity:
+ """Parity tests for check levels."""
+
+ def test_error_level(self, dual_engines_full: DualEngines):
+ """Error level produces same results on both engines."""
+ check = Check(CheckLevel.Error, "error").hasSize(eq(100))
+ spark_results = dual_engines_full.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "Error level")
+
+ def test_warning_level(self, dual_engines_full: DualEngines):
+ """Warning level produces same results on both engines."""
+ check = Check(CheckLevel.Warning, "warning").hasSize(eq(100))
+ spark_results = dual_engines_full.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "Warning level")
+
+
+@requires_spark
+class TestConstraintsWithWhereParity:
+ """Parity tests for constraints with WHERE clause."""
+
+ def test_completeness_with_where(self, dual_engines_where: DualEngines):
+ """Completeness with WHERE produces same result on both engines."""
+ check = Check(CheckLevel.Error, "filtered").hasCompleteness(
+ "att1", is_one()
+ ).where("category = 'A'")
+ spark_results = dual_engines_where.spark_engine.run_checks([check])
+ duckdb_results = dual_engines_where.duckdb_engine.run_checks([check])
+ assert_constraints_match(spark_results, duckdb_results, "Completeness with WHERE")
diff --git a/tests/engines/comparison/test_profile_parity.py b/tests/engines/comparison/test_profile_parity.py
new file mode 100644
index 0000000..c980d34
--- /dev/null
+++ b/tests/engines/comparison/test_profile_parity.py
@@ -0,0 +1,142 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Cross-engine column profiling parity tests.
+
+Tests that verify the DuckDB engine produces the same column profiling
+results as the Spark engine baseline. Requires Spark Connect to be running.
+"""
+
+from tests.engines.comparison.conftest import requires_spark, DualEngines
+from tests.engines.comparison.utils import assert_profiles_match
+
+
+def get_profile_by_column(profiles, column_name: str):
+ """Find a column profile by column name."""
+ for p in profiles:
+ if p.column == column_name:
+ return p
+ return None
+
+
+@requires_spark
+class TestBasicProfilingParity:
+ """Parity tests for basic profiling functionality."""
+
+ def test_profile_all_columns(self, dual_engines_full: DualEngines):
+ """Profile all columns produces same results on both engines."""
+ spark_profiles = dual_engines_full.spark_engine.profile_columns()
+ duckdb_profiles = dual_engines_full.duckdb_engine.profile_columns()
+ assert_profiles_match(spark_profiles, duckdb_profiles, "All columns")
+
+ def test_profile_specific_columns(self, dual_engines_full: DualEngines):
+ """Profile specific columns produces same results."""
+ spark_profiles = dual_engines_full.spark_engine.profile_columns(columns=["att1", "item"])
+ duckdb_profiles = dual_engines_full.duckdb_engine.profile_columns(columns=["att1", "item"])
+ assert_profiles_match(spark_profiles, duckdb_profiles, "Specific columns")
+
+
+@requires_spark
+class TestCompletenessProfilingParity:
+ """Parity tests for completeness in profiles."""
+
+ def test_completeness_full(self, dual_engines_full: DualEngines):
+ """Completeness is same for complete columns on both engines."""
+ spark_profiles = dual_engines_full.spark_engine.profile_columns(columns=["att1"])
+ duckdb_profiles = dual_engines_full.duckdb_engine.profile_columns(columns=["att1"])
+ assert_profiles_match(spark_profiles, duckdb_profiles, "Completeness full")
+
+ def test_completeness_partial(self, dual_engines_missing: DualEngines):
+ """Completeness is same for partial columns on both engines."""
+ spark_profiles = dual_engines_missing.spark_engine.profile_columns(columns=["att1", "att2"])
+ duckdb_profiles = dual_engines_missing.duckdb_engine.profile_columns(columns=["att1", "att2"])
+ assert_profiles_match(spark_profiles, duckdb_profiles, "Completeness partial")
+
+ def test_completeness_all_null(self, dual_engines_all_null: DualEngines):
+ """Completeness is same for all-NULL column on both engines."""
+ spark_profiles = dual_engines_all_null.spark_engine.profile_columns(columns=["value"])
+ duckdb_profiles = dual_engines_all_null.duckdb_engine.profile_columns(columns=["value"])
+ assert_profiles_match(spark_profiles, duckdb_profiles, "Completeness all null")
+
+
+@requires_spark
+class TestDistinctValuesProfilingParity:
+ """Parity tests for distinct values in profiles."""
+
+ def test_distinct_values(self, dual_engines_distinct: DualEngines):
+ """Distinct value counts are same on both engines."""
+ spark_profiles = dual_engines_distinct.spark_engine.profile_columns(columns=["att1", "att2"])
+ duckdb_profiles = dual_engines_distinct.duckdb_engine.profile_columns(columns=["att1", "att2"])
+ assert_profiles_match(spark_profiles, duckdb_profiles, "Distinct values")
+
+
+@requires_spark
+class TestNumericProfilingParity:
+ """Parity tests for numeric column profiling."""
+
+ def test_numeric_statistics(self, dual_engines_numeric: DualEngines):
+ """Numeric statistics are same on both engines."""
+ spark_profiles = dual_engines_numeric.spark_engine.profile_columns(columns=["att1"])
+ duckdb_profiles = dual_engines_numeric.duckdb_engine.profile_columns(columns=["att1"])
+ assert_profiles_match(spark_profiles, duckdb_profiles, "Numeric statistics")
+
+ def test_numeric_with_nulls(self, dual_engines_numeric: DualEngines):
+ """Numeric statistics handle NULLs same way on both engines."""
+ spark_profiles = dual_engines_numeric.spark_engine.profile_columns(columns=["att2"])
+ duckdb_profiles = dual_engines_numeric.duckdb_engine.profile_columns(columns=["att2"])
+ assert_profiles_match(spark_profiles, duckdb_profiles, "Numeric with nulls")
+
+
+@requires_spark
+class TestHistogramProfilingParity:
+ """Parity tests for histogram profiling."""
+
+ def test_histogram(self, dual_engines_histogram: DualEngines):
+ """Histogram profiling produces consistent results."""
+ spark_profiles = dual_engines_histogram.spark_engine.profile_columns(
+ columns=["category"],
+ low_cardinality_threshold=10
+ )
+ duckdb_profiles = dual_engines_histogram.duckdb_engine.profile_columns(
+ columns=["category"],
+ low_cardinality_threshold=10
+ )
+ # Check profiles exist and have histogram data
+ # (exact histogram format may differ)
+ assert len(spark_profiles) > 0
+ assert len(duckdb_profiles) > 0
+
+
+@requires_spark
+class TestEdgeCaseProfilingParity:
+ """Parity tests for edge cases in profiling."""
+
+ def test_single_row(self, dual_engines_single: DualEngines):
+ """Single-row profiling produces same results."""
+ spark_profiles = dual_engines_single.spark_engine.profile_columns()
+ duckdb_profiles = dual_engines_single.duckdb_engine.profile_columns()
+ assert_profiles_match(spark_profiles, duckdb_profiles, "Single row")
+
+
+@requires_spark
+class TestMixedTypeProfilingParity:
+ """Parity tests for mixed column types."""
+
+ def test_mixed_types(self, dual_engines_full: DualEngines):
+ """Mixed column types produce same results on both engines."""
+ spark_profiles = dual_engines_full.spark_engine.profile_columns()
+ duckdb_profiles = dual_engines_full.duckdb_engine.profile_columns()
+ assert_profiles_match(spark_profiles, duckdb_profiles, "Mixed types")
diff --git a/tests/engines/comparison/test_suggestion_parity.py b/tests/engines/comparison/test_suggestion_parity.py
new file mode 100644
index 0000000..510e17d
--- /dev/null
+++ b/tests/engines/comparison/test_suggestion_parity.py
@@ -0,0 +1,222 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Cross-engine constraint suggestion parity tests.
+
+Tests that verify the DuckDB engine produces the same constraint suggestions
+as the Spark engine baseline. Requires Spark Connect to be running.
+
+Note: Suggestions may differ between engines due to different profiling
+algorithms. These tests focus on structural consistency rather than
+exact match.
+"""
+
+from pydeequ.v2.suggestions import Rules
+
+from tests.engines.comparison.conftest import requires_spark, DualEngines
+
+
+def get_suggestions_for_column(suggestions, column_name: str):
+ """Get all suggestions for a specific column."""
+ return [s for s in suggestions if s.column_name == column_name]
+
+
+def get_suggestions_by_constraint(suggestions, constraint_name: str):
+ """Get all suggestions matching a constraint type."""
+ return [s for s in suggestions if constraint_name in s.constraint_name]
+
+
+@requires_spark
+class TestSuggestionStructureParity:
+ """Parity tests for suggestion structure consistency."""
+
+ def test_default_rules_structure(self, dual_engines_full: DualEngines):
+ """DEFAULT rules produce structurally similar suggestions."""
+ spark_suggestions = dual_engines_full.spark_engine.suggest_constraints(rules=[Rules.DEFAULT])
+ duckdb_suggestions = dual_engines_full.duckdb_engine.suggest_constraints(rules=[Rules.DEFAULT])
+
+ # Both should return a list of suggestions
+ assert isinstance(spark_suggestions, list)
+ assert isinstance(duckdb_suggestions, list)
+
+ # Both should have required fields
+ if spark_suggestions:
+ s = spark_suggestions[0]
+ assert hasattr(s, 'column_name')
+ assert hasattr(s, 'constraint_name')
+ assert hasattr(s, 'description')
+
+ if duckdb_suggestions:
+ s = duckdb_suggestions[0]
+ assert hasattr(s, 'column_name')
+ assert hasattr(s, 'constraint_name')
+ assert hasattr(s, 'description')
+
+ def test_numerical_rules_structure(self, dual_engines_numeric: DualEngines):
+ """NUMERICAL rules produce structurally similar suggestions."""
+ spark_suggestions = dual_engines_numeric.spark_engine.suggest_constraints(rules=[Rules.NUMERICAL])
+ duckdb_suggestions = dual_engines_numeric.duckdb_engine.suggest_constraints(rules=[Rules.NUMERICAL])
+
+ assert isinstance(spark_suggestions, list)
+ assert isinstance(duckdb_suggestions, list)
+
+ def test_string_rules_structure(self, dual_engines_string_lengths: DualEngines):
+ """STRING rules produce structurally similar suggestions."""
+ spark_suggestions = dual_engines_string_lengths.spark_engine.suggest_constraints(rules=[Rules.STRING])
+ duckdb_suggestions = dual_engines_string_lengths.duckdb_engine.suggest_constraints(rules=[Rules.STRING])
+
+ assert isinstance(spark_suggestions, list)
+ assert isinstance(duckdb_suggestions, list)
+
+
+@requires_spark
+class TestSuggestionColumnCoverage:
+ """Parity tests for column coverage in suggestions."""
+
+ def test_complete_column_suggestions(self, dual_engines_full: DualEngines):
+ """Both engines suggest constraints for complete columns."""
+ spark_suggestions = dual_engines_full.spark_engine.suggest_constraints(
+ columns=["att1"],
+ rules=[Rules.DEFAULT]
+ )
+ duckdb_suggestions = dual_engines_full.duckdb_engine.suggest_constraints(
+ columns=["att1"],
+ rules=[Rules.DEFAULT]
+ )
+
+ # Both should return results (may differ in content)
+ assert isinstance(spark_suggestions, list)
+ assert isinstance(duckdb_suggestions, list)
+
+ def test_numeric_column_suggestions(self, dual_engines_numeric: DualEngines):
+ """Both engines suggest constraints for numeric columns."""
+ spark_suggestions = dual_engines_numeric.spark_engine.suggest_constraints(
+ columns=["att1"],
+ rules=[Rules.NUMERICAL]
+ )
+ duckdb_suggestions = dual_engines_numeric.duckdb_engine.suggest_constraints(
+ columns=["att1"],
+ rules=[Rules.NUMERICAL]
+ )
+
+ assert isinstance(spark_suggestions, list)
+ assert isinstance(duckdb_suggestions, list)
+
+
+@requires_spark
+class TestSuggestionConstraintTypes:
+ """Parity tests for suggested constraint types."""
+
+ def test_completeness_suggestions(self, dual_engines_full: DualEngines):
+ """Both engines may suggest completeness constraints."""
+ spark_suggestions = dual_engines_full.spark_engine.suggest_constraints(rules=[Rules.DEFAULT])
+ duckdb_suggestions = dual_engines_full.duckdb_engine.suggest_constraints(rules=[Rules.DEFAULT])
+
+ spark_completeness = get_suggestions_by_constraint(spark_suggestions, "Complete")
+ duckdb_completeness = get_suggestions_by_constraint(duckdb_suggestions, "Complete")
+
+        # Both might suggest completeness (or not, depending on the data);
+        # just verify that the structure is consistent.
+ assert isinstance(spark_completeness, list)
+ assert isinstance(duckdb_completeness, list)
+
+ def test_uniqueness_suggestions(self, dual_engines_unique: DualEngines):
+ """Both engines may suggest uniqueness constraints."""
+ spark_suggestions = dual_engines_unique.spark_engine.suggest_constraints(rules=[Rules.COMMON])
+ duckdb_suggestions = dual_engines_unique.duckdb_engine.suggest_constraints(rules=[Rules.COMMON])
+
+ assert isinstance(spark_suggestions, list)
+ assert isinstance(duckdb_suggestions, list)
+
+
+@requires_spark
+class TestSuggestionRuleSetsParity:
+ """Parity tests for different rule sets."""
+
+ def test_extended_rules(self, dual_engines_full: DualEngines):
+ """EXTENDED rules produce consistent suggestions on both engines."""
+ spark_suggestions = dual_engines_full.spark_engine.suggest_constraints(rules=[Rules.EXTENDED])
+ duckdb_suggestions = dual_engines_full.duckdb_engine.suggest_constraints(rules=[Rules.EXTENDED])
+
+ assert isinstance(spark_suggestions, list)
+ assert isinstance(duckdb_suggestions, list)
+
+ def test_multiple_rule_sets(self, dual_engines_numeric: DualEngines):
+ """Multiple rule sets produce consistent suggestions."""
+ spark_suggestions = dual_engines_numeric.spark_engine.suggest_constraints(
+ rules=[Rules.DEFAULT, Rules.NUMERICAL]
+ )
+ duckdb_suggestions = dual_engines_numeric.duckdb_engine.suggest_constraints(
+ rules=[Rules.DEFAULT, Rules.NUMERICAL]
+ )
+
+ assert isinstance(spark_suggestions, list)
+ assert isinstance(duckdb_suggestions, list)
+
+
+@requires_spark
+class TestSuggestionEdgeCases:
+ """Parity tests for edge cases in suggestions."""
+
+ def test_single_row_suggestions(self, dual_engines_single: DualEngines):
+ """Single-row dataset produces consistent suggestions."""
+ spark_suggestions = dual_engines_single.spark_engine.suggest_constraints(rules=[Rules.DEFAULT])
+ duckdb_suggestions = dual_engines_single.duckdb_engine.suggest_constraints(rules=[Rules.DEFAULT])
+
+ assert isinstance(spark_suggestions, list)
+ assert isinstance(duckdb_suggestions, list)
+
+ def test_all_null_column_suggestions(self, dual_engines_all_null: DualEngines):
+ """All-NULL column produces consistent suggestions."""
+ spark_suggestions = dual_engines_all_null.spark_engine.suggest_constraints(
+ columns=["value"],
+ rules=[Rules.DEFAULT]
+ )
+ duckdb_suggestions = dual_engines_all_null.duckdb_engine.suggest_constraints(
+ columns=["value"],
+ rules=[Rules.DEFAULT]
+ )
+
+ assert isinstance(spark_suggestions, list)
+ assert isinstance(duckdb_suggestions, list)
+
+
+@requires_spark
+class TestSuggestionColumnRestriction:
+ """Parity tests for column restriction in suggestions."""
+
+ def test_restrict_to_columns(self, dual_engines_full: DualEngines):
+ """Column restriction produces consistent suggestions."""
+ spark_suggestions = dual_engines_full.spark_engine.suggest_constraints(
+ columns=["att1", "att2"],
+ rules=[Rules.DEFAULT]
+ )
+ duckdb_suggestions = dual_engines_full.duckdb_engine.suggest_constraints(
+ columns=["att1", "att2"],
+ rules=[Rules.DEFAULT]
+ )
+
+        # Check that suggestions target only the restricted columns.
+        # Dataset-level suggestions carry no column_name and are filtered
+        # out when the sets are built.
+        spark_columns = {s.column_name for s in spark_suggestions if s.column_name}
+        duckdb_columns = {s.column_name for s in duckdb_suggestions if s.column_name}
+
+        assert spark_columns <= {"att1", "att2"}
+        assert duckdb_columns <= {"att1", "att2"}
diff --git a/tests/engines/comparison/utils.py b/tests/engines/comparison/utils.py
new file mode 100644
index 0000000..78be3df
--- /dev/null
+++ b/tests/engines/comparison/utils.py
@@ -0,0 +1,434 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Comparison utilities for cross-engine testing.
+
+Provides utilities for comparing results between DuckDB and Spark engines
+with appropriate tolerance levels for different metric types.
+"""
+
+import json
+import math
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
+from pydeequ.engines import MetricResult, ConstraintResult, ColumnProfile
+
+
+# Tolerance levels for comparing floating-point results
+FLOAT_EPSILON = 1e-9 # Exact comparisons: Size, Completeness, Uniqueness
+FLOAT_TOLERANCE = 1e-6 # Statistical: Mean, StdDev, Correlation
+APPROX_TOLERANCE = 0.1  # Approximate algorithms: ApproxCountDistinct, ApproxQuantile (10% relative)
+ENTROPY_TOLERANCE = 1e-4 # Information theory metrics: Entropy, MutualInformation
+
+
+# Mapping of analyzer types to their expected tolerance
+ANALYZER_TOLERANCES: Dict[str, float] = {
+ # Exact metrics
+ "Size": FLOAT_EPSILON,
+ "Completeness": FLOAT_EPSILON,
+ "Uniqueness": FLOAT_EPSILON,
+ "Distinctness": FLOAT_EPSILON,
+ "UniqueValueRatio": FLOAT_EPSILON,
+ "CountDistinct": FLOAT_EPSILON,
+ "MinLength": FLOAT_EPSILON,
+ "MaxLength": FLOAT_EPSILON,
+ "PatternMatch": FLOAT_EPSILON,
+ "Compliance": FLOAT_EPSILON,
+
+ # Statistical metrics
+ "Mean": FLOAT_TOLERANCE,
+ "Sum": FLOAT_TOLERANCE,
+ "Minimum": FLOAT_TOLERANCE,
+ "Maximum": FLOAT_TOLERANCE,
+ "StandardDeviation": FLOAT_TOLERANCE,
+ "Correlation": FLOAT_TOLERANCE,
+ "Entropy": ENTROPY_TOLERANCE,
+ "MutualInformation": ENTROPY_TOLERANCE,
+ "ApproxQuantile": APPROX_TOLERANCE,
+
+ # Approximate metrics
+ "ApproxCountDistinct": APPROX_TOLERANCE,
+}
+
+
+def get_tolerance(analyzer_name: str) -> float:
+ """Get the appropriate tolerance for an analyzer type."""
+ return ANALYZER_TOLERANCES.get(analyzer_name, FLOAT_TOLERANCE)
+
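+# Illustrative lookups (doctest-style sketch; values follow the table above):
+# >>> get_tolerance("Size")              # exact metric
+# 1e-09
+# >>> get_tolerance("BrandNewAnalyzer")  # unknown analyzers fall back
+# 1e-06
+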
+
+def values_equal(
+ actual: Any,
+ expected: Any,
+ tolerance: float = FLOAT_TOLERANCE
+) -> bool:
+ """Check if two values are equal within tolerance.
+
+ Handles None, NaN, strings, and numeric values appropriately.
+
+ Args:
+ actual: The actual value from DuckDB
+ expected: The expected value from Spark
+ tolerance: The tolerance for numeric comparison
+
+ Returns:
+ True if values are considered equal
+ """
+ # Handle None/null
+ if actual is None and expected is None:
+ return True
+ if actual is None or expected is None:
+ return False
+
+ # Handle NaN
+ if isinstance(actual, float) and isinstance(expected, float):
+ if math.isnan(actual) and math.isnan(expected):
+ return True
+ if math.isnan(actual) or math.isnan(expected):
+ return False
+
+ # Handle JSON strings vs dicts (for Histogram, DataType)
+ if isinstance(actual, str) and not isinstance(expected, str):
+ try:
+ actual = json.loads(actual)
+ except (json.JSONDecodeError, TypeError):
+ pass
+ if isinstance(expected, str) and not isinstance(actual, str):
+ try:
+ expected = json.loads(expected)
+ except (json.JSONDecodeError, TypeError):
+ pass
+
+ # Handle strings
+ if isinstance(actual, str) and isinstance(expected, str):
+ return actual == expected
+
+ # Handle numeric values
+ try:
+ actual_float = float(actual)
+ expected_float = float(expected)
+
+ if tolerance >= APPROX_TOLERANCE:
+ # Relative tolerance for approximate algorithms
+ if expected_float == 0:
+ return abs(actual_float) < tolerance
+ return abs(actual_float - expected_float) / abs(expected_float) < tolerance
+ else:
+ # Absolute tolerance for exact/statistical metrics
+ return abs(actual_float - expected_float) < tolerance
+ except (TypeError, ValueError):
+ # Fall back to exact equality
+ return actual == expected
+
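+# Example behaviour (doctest-style sketch of the comparison modes):
+# >>> values_equal(0.5, 0.5 + 1e-12, FLOAT_EPSILON)   # absolute: 1e-12 < 1e-9
+# True
+# >>> values_equal(105.0, 100.0, APPROX_TOLERANCE)    # relative: 0.05 < 0.1
+# True
+# >>> values_equal(float("nan"), float("nan"))        # NaN == NaN here
+# True
+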
+
+@dataclass
+class MetricDifference:
+ """Represents a difference between two metric results."""
+ name: str
+ instance: Optional[str]
+ spark_value: Any
+ duckdb_value: Any
+ tolerance: float
+ is_match: bool
+ message: str = ""
+
+
+@dataclass
+class ComparisonReport:
+ """Report comparing results from two engines."""
+ total_metrics: int = 0
+ matching_metrics: int = 0
+ differing_metrics: int = 0
+ spark_only_metrics: int = 0
+ duckdb_only_metrics: int = 0
+ differences: List[MetricDifference] = field(default_factory=list)
+
+ @property
+ def success(self) -> bool:
+ """True if all metrics match within tolerance."""
+        return (self.differing_metrics == 0
+                and self.spark_only_metrics == 0
+                and self.duckdb_only_metrics == 0)
+
+ def summary(self) -> str:
+ """Generate a summary string."""
+ lines = [
+ f"Comparison Report:",
+ f" Total metrics: {self.total_metrics}",
+ f" Matching: {self.matching_metrics}",
+ f" Differing: {self.differing_metrics}",
+ f" Spark-only: {self.spark_only_metrics}",
+ f" DuckDB-only: {self.duckdb_only_metrics}",
+ ]
+ if self.differences:
+ lines.append(" Differences:")
+ for diff in self.differences:
+ lines.append(f" - {diff.name}({diff.instance}): Spark={diff.spark_value}, DuckDB={diff.duckdb_value}")
+ return "\n".join(lines)
+
+
+def index_metrics(metrics: List[MetricResult]) -> Dict[Tuple[str, str], MetricResult]:
+ """Index metrics by (name, instance) tuple for efficient lookup."""
+ return {(m.name, m.instance or ""): m for m in metrics}
+
+
+def compare_metrics(
+ spark_metrics: List[MetricResult],
+ duckdb_metrics: List[MetricResult]
+) -> ComparisonReport:
+ """Compare metric results from Spark and DuckDB engines.
+
+ Args:
+ spark_metrics: Metrics computed by Spark engine
+ duckdb_metrics: Metrics computed by DuckDB engine
+
+ Returns:
+ ComparisonReport with detailed comparison results
+ """
+ report = ComparisonReport()
+
+ # Index by (name, instance)
+ spark_index = index_metrics(spark_metrics)
+ duckdb_index = index_metrics(duckdb_metrics)
+
+ all_keys = set(spark_index.keys()) | set(duckdb_index.keys())
+ report.total_metrics = len(all_keys)
+
+ for key in all_keys:
+ name, instance = key
+ tolerance = get_tolerance(name)
+
+ spark_metric = spark_index.get(key)
+ duckdb_metric = duckdb_index.get(key)
+
+ if spark_metric is None:
+ report.duckdb_only_metrics += 1
+ report.differences.append(MetricDifference(
+ name=name,
+ instance=instance,
+ spark_value=None,
+ duckdb_value=duckdb_metric.value if duckdb_metric else None,
+ tolerance=tolerance,
+ is_match=False,
+ message="Metric only in DuckDB"
+ ))
+ elif duckdb_metric is None:
+ report.spark_only_metrics += 1
+ report.differences.append(MetricDifference(
+ name=name,
+ instance=instance,
+ spark_value=spark_metric.value,
+ duckdb_value=None,
+ tolerance=tolerance,
+ is_match=False,
+ message="Metric only in Spark"
+ ))
+ else:
+ is_match = values_equal(spark_metric.value, duckdb_metric.value, tolerance)
+ if is_match:
+ report.matching_metrics += 1
+ else:
+ report.differing_metrics += 1
+ report.differences.append(MetricDifference(
+ name=name,
+ instance=instance,
+ spark_value=spark_metric.value,
+ duckdb_value=duckdb_metric.value,
+ tolerance=tolerance,
+ is_match=False,
+ message=f"Values differ (tolerance={tolerance})"
+ ))
+
+ return report
+
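+# Typical assertion flow (a sketch; assert_metrics_match below wraps this):
+# report = compare_metrics(spark_metrics, duckdb_metrics)
+# assert report.success, report.summary()
+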
+
+def compare_constraint_results(
+ spark_results: List[ConstraintResult],
+ duckdb_results: List[ConstraintResult]
+) -> ComparisonReport:
+ """Compare constraint results from Spark and DuckDB engines.
+
+ Comparison is done by position within each check group, since constraint
+ names may differ between engines (e.g., Spark uses 'SizeConstraint(Size(None))'
+ while DuckDB uses 'hasSize(assertion)').
+
+ Args:
+ spark_results: Constraint results from Spark engine
+ duckdb_results: Constraint results from DuckDB engine
+
+ Returns:
+ ComparisonReport with detailed comparison results
+ """
+ report = ComparisonReport()
+
+ # Group results by check_description to maintain ordering within checks
+ def group_by_check(results: List[ConstraintResult]) -> Dict[str, List[ConstraintResult]]:
+ groups: Dict[str, List[ConstraintResult]] = {}
+ for r in results:
+ key = r.check_description
+ if key not in groups:
+ groups[key] = []
+ groups[key].append(r)
+ return groups
+
+ spark_groups = group_by_check(spark_results)
+ duckdb_groups = group_by_check(duckdb_results)
+
+ all_checks = set(spark_groups.keys()) | set(duckdb_groups.keys())
+
+ for check_desc in all_checks:
+ spark_list = spark_groups.get(check_desc, [])
+ duckdb_list = duckdb_groups.get(check_desc, [])
+
+ # Compare by position within each check
+ max_len = max(len(spark_list), len(duckdb_list))
+ report.total_metrics += max_len
+
+ for i in range(max_len):
+ spark_result = spark_list[i] if i < len(spark_list) else None
+ duckdb_result = duckdb_list[i] if i < len(duckdb_list) else None
+
+ if spark_result is None:
+ report.duckdb_only_metrics += 1
+ elif duckdb_result is None:
+ report.spark_only_metrics += 1
+ else:
+ # Compare constraint status
+ spark_status = spark_result.constraint_status
+ duckdb_status = duckdb_result.constraint_status
+
+ if spark_status == duckdb_status:
+ report.matching_metrics += 1
+ else:
+ report.differing_metrics += 1
+ report.differences.append(MetricDifference(
+ name="ConstraintStatus",
+ instance=f"{check_desc}[{i}]",
+ spark_value=str(spark_status),
+ duckdb_value=str(duckdb_status),
+ tolerance=0,
+ is_match=False,
+ message="Constraint status differs"
+ ))
+
+ return report
+
+
+def compare_profiles(
+ spark_profiles: List[ColumnProfile],
+ duckdb_profiles: List[ColumnProfile]
+) -> ComparisonReport:
+ """Compare column profiles from Spark and DuckDB engines.
+
+ Args:
+ spark_profiles: Column profiles from Spark engine
+ duckdb_profiles: Column profiles from DuckDB engine
+
+ Returns:
+ ComparisonReport with detailed comparison results
+ """
+ report = ComparisonReport()
+
+ # Index by column name
+ spark_index = {p.column: p for p in spark_profiles}
+ duckdb_index = {p.column: p for p in duckdb_profiles}
+
+ all_columns = set(spark_index.keys()) | set(duckdb_index.keys())
+
+ for column in all_columns:
+ spark_profile = spark_index.get(column)
+ duckdb_profile = duckdb_index.get(column)
+
+ if spark_profile is None:
+ report.duckdb_only_metrics += 1
+ continue
+ if duckdb_profile is None:
+ report.spark_only_metrics += 1
+ continue
+
+ # Compare profile attributes
+ attrs_to_compare = [
+ ("completeness", FLOAT_EPSILON),
+ ("approx_distinct_values", APPROX_TOLERANCE),
+ ("mean", FLOAT_TOLERANCE),
+ ("minimum", FLOAT_TOLERANCE),
+ ("maximum", FLOAT_TOLERANCE),
+ ("sum", FLOAT_TOLERANCE),
+ ("std_dev", APPROX_TOLERANCE), # Use relative tolerance for sample vs pop
+ ]
+
+ for attr, tolerance in attrs_to_compare:
+ spark_val = getattr(spark_profile, attr, None)
+ duckdb_val = getattr(duckdb_profile, attr, None)
+
+ report.total_metrics += 1
+
+ if values_equal(spark_val, duckdb_val, tolerance):
+ report.matching_metrics += 1
+ else:
+ report.differing_metrics += 1
+ report.differences.append(MetricDifference(
+ name=attr,
+ instance=column,
+ spark_value=spark_val,
+ duckdb_value=duckdb_val,
+ tolerance=tolerance,
+ is_match=False,
+ message=f"Profile attribute {attr} differs"
+ ))
+
+ return report
+
+
+def assert_metrics_match(
+ spark_metrics: List[MetricResult],
+ duckdb_metrics: List[MetricResult],
+ msg: str = ""
+) -> None:
+ """Assert that metrics from both engines match within tolerance.
+
+ Raises:
+ AssertionError: If metrics don't match
+ """
+ report = compare_metrics(spark_metrics, duckdb_metrics)
+ if not report.success:
+ raise AssertionError(f"{msg}\n{report.summary()}")
+
+
+def assert_constraints_match(
+ spark_results: List[ConstraintResult],
+ duckdb_results: List[ConstraintResult],
+ msg: str = ""
+) -> None:
+ """Assert that constraint results from both engines match.
+
+ Raises:
+ AssertionError: If results don't match
+ """
+ report = compare_constraint_results(spark_results, duckdb_results)
+ if not report.success:
+ raise AssertionError(f"{msg}\n{report.summary()}")
+
+
+def assert_profiles_match(
+ spark_profiles: List[ColumnProfile],
+ duckdb_profiles: List[ColumnProfile],
+ msg: str = ""
+) -> None:
+ """Assert that profiles from both engines match within tolerance.
+
+ Raises:
+ AssertionError: If profiles don't match
+ """
+ report = compare_profiles(spark_profiles, duckdb_profiles)
+ if not report.success:
+ raise AssertionError(f"{msg}\n{report.summary()}")
diff --git a/tests/engines/conftest.py b/tests/engines/conftest.py
new file mode 100644
index 0000000..5a53f9d
--- /dev/null
+++ b/tests/engines/conftest.py
@@ -0,0 +1,330 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""DuckDB engine test fixtures.
+
+Provides fixtures for creating DuckDB engines with various test datasets.
+These fixtures are used by DuckDB-only tests that don't require Spark.
+"""
+
+from typing import Callable, Generator, Optional
+import pytest
+import duckdb
+import pandas as pd
+
+from pydeequ.engines.duckdb import DuckDBEngine
+from tests.engines.fixtures.datasets import (
+ create_df_full,
+ create_df_missing,
+ create_df_numeric,
+ create_df_unique,
+ create_df_distinct,
+ create_df_string_lengths,
+ create_df_empty,
+ create_df_single,
+ create_df_all_null,
+ create_df_escape,
+ create_df_correlation,
+ create_df_entropy,
+ create_df_where,
+ create_df_pattern,
+ create_df_compliance,
+ create_df_quantile,
+ create_df_contained_in,
+ create_df_histogram,
+ create_df_mutual_info,
+ create_df_data_type,
+ DATASET_FACTORIES,
+)
+
+
+@pytest.fixture(scope="module")
+def duckdb_connection() -> Generator[duckdb.DuckDBPyConnection, None, None]:
+ """Create a module-scoped DuckDB connection."""
+ conn = duckdb.connect(":memory:")
+ yield conn
+ conn.close()
+
+
+def _create_engine_from_df(
+ conn: duckdb.DuckDBPyConnection,
+ df: pd.DataFrame,
+ table_name: str
+) -> DuckDBEngine:
+ """Helper to create a DuckDB engine from a pandas DataFrame."""
+ # Register the DataFrame as a table
+ conn.register(table_name, df)
+ # Create engine pointing to the table
+ return DuckDBEngine(conn, table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_factory(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[Callable[[str], DuckDBEngine], None, None]:
+ """Factory fixture to create DuckDB engines for any dataset.
+
+ Usage:
+ def test_something(engine_factory):
+ engine = engine_factory("df_full")
+ results = engine.compute_metrics([Size()])
+ """
+ tables_created = []
+
+ def factory(dataset_name: str) -> DuckDBEngine:
+ if dataset_name not in DATASET_FACTORIES:
+ raise ValueError(f"Unknown dataset: {dataset_name}")
+
+ table_name = f"test_{dataset_name}"
+ df = DATASET_FACTORIES[dataset_name]()
+
+ # Unregister if already exists (for reuse in same test)
+ try:
+ duckdb_connection.unregister(table_name)
+ except Exception:
+ pass
+
+ duckdb_connection.register(table_name, df)
+ tables_created.append(table_name)
+
+ return DuckDBEngine(duckdb_connection, table_name)
+
+ yield factory
+
+ # Cleanup: unregister all tables
+ for table_name in tables_created:
+ try:
+ duckdb_connection.unregister(table_name)
+ except Exception:
+ pass
+
+
+# Individual dataset fixtures for convenience
+
+
+@pytest.fixture(scope="function")
+def engine_full(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_full dataset."""
+ table_name = "test_df_full"
+ df = create_df_full()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_missing(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_missing dataset."""
+ table_name = "test_df_missing"
+ df = create_df_missing()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_numeric(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_numeric dataset."""
+ table_name = "test_df_numeric"
+ df = create_df_numeric()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_unique(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_unique dataset."""
+ table_name = "test_df_unique"
+ df = create_df_unique()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_distinct(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_distinct dataset."""
+ table_name = "test_df_distinct"
+ df = create_df_distinct()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_string_lengths(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_string_lengths dataset."""
+ table_name = "test_df_string_lengths"
+ df = create_df_string_lengths()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_empty(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_empty dataset."""
+ table_name = "test_df_empty"
+ df = create_df_empty()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_single(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_single dataset."""
+ table_name = "test_df_single"
+ df = create_df_single()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_all_null(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_all_null dataset."""
+ table_name = "test_df_all_null"
+ df = create_df_all_null()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_escape(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_escape dataset."""
+ table_name = "test_df_escape"
+ df = create_df_escape()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_correlation(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_correlation dataset."""
+ table_name = "test_df_correlation"
+ df = create_df_correlation()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_entropy(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_entropy dataset."""
+ table_name = "test_df_entropy"
+ df = create_df_entropy()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_where(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_where dataset."""
+ table_name = "test_df_where"
+ df = create_df_where()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_pattern(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_pattern dataset."""
+ table_name = "test_df_pattern"
+ df = create_df_pattern()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_compliance(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_compliance dataset."""
+ table_name = "test_df_compliance"
+ df = create_df_compliance()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_quantile(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_quantile dataset."""
+ table_name = "test_df_quantile"
+ df = create_df_quantile()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_contained_in(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_contained_in dataset."""
+ table_name = "test_df_contained_in"
+ df = create_df_contained_in()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_histogram(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_histogram dataset."""
+ table_name = "test_df_histogram"
+ df = create_df_histogram()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_mutual_info(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_mutual_info dataset."""
+ table_name = "test_df_mutual_info"
+ df = create_df_mutual_info()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+@pytest.fixture(scope="function")
+def engine_data_type(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]:
+ """DuckDB engine with df_data_type dataset."""
+ table_name = "test_df_data_type"
+ df = create_df_data_type()
+ duckdb_connection.register(table_name, df)
+ yield DuckDBEngine(duckdb_connection, table_name)
+ duckdb_connection.unregister(table_name)
+
+
+# Helper function for metric lookup
+def get_metric_value(metrics, name: str, instance: Optional[str] = None) -> Optional[float]:
+ """Extract a metric value from results by name and optionally instance."""
+ for m in metrics:
+ if m.name == name:
+ if instance is None or m.instance == instance:
+ return m.value
+ return None
+
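+# e.g. get_metric_value(results, "Completeness", "att1") would return 0.5
+# for the df_missing dataset (6 of 12 values present).
+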
+
+def get_metric(metrics, name: str, instance: Optional[str] = None):
+ """Extract a metric result from results by name and optionally instance."""
+ for m in metrics:
+ if m.name == name:
+ if instance is None or m.instance == instance:
+ return m
+ return None
diff --git a/tests/engines/fixtures/__init__.py b/tests/engines/fixtures/__init__.py
new file mode 100644
index 0000000..73c4067
--- /dev/null
+++ b/tests/engines/fixtures/__init__.py
@@ -0,0 +1,36 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test fixtures module.
+
+Contains test dataset definitions ported from the original Deequ Scala
+implementation for comprehensive edge case coverage.
+"""
+
+from .datasets import (
+ create_df_full,
+ create_df_missing,
+ create_df_numeric,
+ create_df_unique,
+ create_df_distinct,
+ create_df_string_lengths,
+ create_df_empty,
+ create_df_single,
+ create_df_all_null,
+ create_df_escape,
+ create_df_correlation,
+ create_df_entropy,
+ create_df_where,
+ EXPECTED_VALUES,
+)
diff --git a/tests/engines/fixtures/datasets.py b/tests/engines/fixtures/datasets.py
new file mode 100644
index 0000000..af002e8
--- /dev/null
+++ b/tests/engines/fixtures/datasets.py
@@ -0,0 +1,529 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test dataset definitions ported from Deequ Scala FixtureSupport.
+
+These datasets provide comprehensive edge case coverage for testing
+analyzers, constraints, profiles, and suggestions.
+"""
+
+from typing import Any, Dict, List, Tuple
+import math
+
+import pandas as pd
+
+
+def create_df_full() -> pd.DataFrame:
+ """Basic complete data with no nulls (4 rows).
+
+ Purpose: Basic complete data testing
+ Edge cases: No nulls, simple values
+ """
+ return pd.DataFrame({
+ "att1": ["a", "b", "c", "a"],
+ "att2": ["d", "e", "f", "d"],
+ "item": [1, 2, 3, 4],
+ "price": [10.0, 20.0, 30.0, 40.0],
+ })
+
+
+def create_df_missing() -> pd.DataFrame:
+ """Dataset with NULL handling patterns (12 rows).
+
+ Purpose: NULL handling tests
+ Edge cases: att1: 50% complete, att2: 75% complete
+ """
+ return pd.DataFrame({
+ "att1": ["a", "b", None, None, "e", "f", None, None, "i", "j", None, None],
+ "att2": ["d", "e", "f", None, "h", "i", "j", None, "l", "m", "n", None],
+ "item": list(range(1, 13)),
+ })
+
+
+def create_df_numeric() -> pd.DataFrame:
+ """Dataset for statistical tests (6 rows).
+
+ Purpose: Statistical analyzer tests (Mean, Sum, Min, Max, StdDev)
+ Edge cases: Mean=3.5, includes NULL column
+ Values: 1, 2, 3, 4, 5, 6 -> Mean=3.5, Sum=21, Min=1, Max=6
+ StdDev (population) = sqrt(17.5/6) ≈ 1.7078
+ """
+ return pd.DataFrame({
+ "att1": [1, 2, 3, 4, 5, 6],
+ "att2": [1.0, 2.0, 3.0, 4.0, 5.0, None], # One NULL
+ "item": ["a", "b", "c", "d", "e", "f"],
+ })
+
+
+def create_df_unique() -> pd.DataFrame:
+ """Dataset for uniqueness pattern tests (6 rows).
+
+ Purpose: Uniqueness analyzer tests
+ Edge cases: Various uniqueness scenarios
+ - unique_col: All unique values (uniqueness=1.0)
+ - half_null: 50% null (completeness=0.5)
+ - non_unique: Duplicates present
+ """
+ return pd.DataFrame({
+ "unique_col": [1, 2, 3, 4, 5, 6],
+ "half_null": [1, None, 3, None, 5, None],
+ "non_unique": [1, 1, 2, 2, 3, 3],
+ "all_same": [1, 1, 1, 1, 1, 1],
+ })
+
+
+def create_df_distinct() -> pd.DataFrame:
+ """Dataset for distinctness testing with duplicates (6 rows).
+
+ Purpose: Distinctness and uniqueness ratio testing
+ Edge cases: 3 distinct values in att1 with duplicates
+ - att1: ["a", "a", "b", "b", "c", "c"] -> 3 distinct, 0 unique
+ - att2: ["x", "y", "z", "w", "v", "u"] -> 6 distinct, 6 unique
+ """
+ return pd.DataFrame({
+ "att1": ["a", "a", "b", "b", "c", "c"],
+ "att2": ["x", "y", "z", "w", "v", "u"],
+ "item": [1, 2, 3, 4, 5, 6],
+ })
+
+
+def create_df_string_lengths() -> pd.DataFrame:
+ """Dataset for string length edge cases (5 rows).
+
+ Purpose: MinLength and MaxLength analyzer tests
+ Edge cases: Empty string (""), varying lengths
+ Lengths: 0, 1, 2, 3, 4
+ """
+ return pd.DataFrame({
+ "att1": ["", "a", "bb", "ccc", "dddd"],
+ "att2": ["hello", "world", "test", "data", "value"],
+ "item": [1, 2, 3, 4, 5],
+ })
+
+
+def create_df_empty() -> pd.DataFrame:
+ """Empty dataset with schema (0 rows).
+
+ Purpose: Edge case testing with zero rows
+ Edge cases: Size=0, Completeness=1.0 (vacuously true for empty)
+ """
+ return pd.DataFrame({
+ "att1": pd.Series([], dtype="object"),
+ "att2": pd.Series([], dtype="object"),
+ "item": pd.Series([], dtype="int64"),
+ })
+
+
+def create_df_single() -> pd.DataFrame:
+ """Minimal dataset with single row (1 row).
+
+ Purpose: Minimal dataset edge case testing
+ Edge cases: StdDev undefined/NaN, Uniqueness=1.0
+ """
+ return pd.DataFrame({
+ "att1": ["a"],
+ "att2": ["d"],
+ "item": [1],
+ "price": [10.0],
+ })
+
+
+def create_df_all_null() -> pd.DataFrame:
+ """Dataset with all-NULL column (3 rows).
+
+ Purpose: 0% completeness edge case testing
+ Edge cases: Completeness=0, Mean=NULL
+ """
+ return pd.DataFrame({
+ "value": [None, None, None],
+ "item": [1, 2, 3],
+ })
+
+
+def create_df_escape() -> pd.DataFrame:
+ """Dataset with special characters (8 rows).
+
+ Purpose: Special character and regex escaping tests
+ Edge cases: Quotes, special characters (@#$%^&)
+ """
+ return pd.DataFrame({
+ "att1": [
+ 'hello "world"',
+ "it's working",
+ "test@example.com",
+ "#hashtag",
+ "$money$",
+ "%percent%",
+ "^caret^",
+ "&ersand&",
+ ],
+ "att2": ["normal", "values", "here", "for", "comparison", "testing", "edge", "cases"],
+ "item": list(range(1, 9)),
+ })
+
+
+def create_df_correlation() -> pd.DataFrame:
+ """Dataset for correlation testing (5 rows).
+
+ Purpose: Correlation analyzer tests
+ Edge cases: Perfect +1.0 and -1.0 correlation
+ - x and y: perfectly positively correlated (1.0)
+ - x and z: perfectly negatively correlated (-1.0)
+ """
+ return pd.DataFrame({
+ "x": [1.0, 2.0, 3.0, 4.0, 5.0],
+ "y": [2.0, 4.0, 6.0, 8.0, 10.0], # y = 2x, correlation = 1.0
+ "z": [5.0, 4.0, 3.0, 2.0, 1.0], # z = 6-x, correlation = -1.0
+ "w": [1.0, 1.0, 1.0, 1.0, 1.0], # constant, correlation undefined
+ })
+
+
+def create_df_entropy() -> pd.DataFrame:
+ """Dataset for entropy testing (4 rows).
+
+ Purpose: Entropy analyzer tests
+ Edge cases: Uniform vs skewed distribution
+ - uniform: 4 distinct values each appearing once -> entropy = ln(4) ≈ 1.386
+ - skewed: 1 value appearing 3 times, 1 appearing once -> entropy < 1.386
+ """
+ return pd.DataFrame({
+ "uniform": ["a", "b", "c", "d"], # Entropy = ln(4) ≈ 1.386
+ "skewed": ["a", "a", "a", "b"], # Entropy = -(3/4)ln(3/4) - (1/4)ln(1/4) ≈ 0.562
+ "constant": ["x", "x", "x", "x"], # Entropy = 0 (single value)
+ "item": [1, 2, 3, 4],
+ })
+
+
+def create_df_where() -> pd.DataFrame:
+ """Dataset for WHERE clause filtering tests (4 rows).
+
+ Purpose: WHERE clause filter testing
+ Edge cases: Mixed completeness by filter
+ - When filtered by category='A': att1 is complete
+ - When filtered by category='B': att1 has nulls
+ """
+ return pd.DataFrame({
+ "category": ["A", "A", "B", "B"],
+ "att1": ["x", "y", None, "w"], # A: 2/2 complete, B: 1/2 complete
+ "att2": [1, None, 3, 4], # A: 1/2 complete, B: 2/2 complete
+ "value": [10.0, 20.0, 30.0, 40.0],
+ })
+
+
+def create_df_pattern() -> pd.DataFrame:
+ """Dataset for pattern matching tests (6 rows).
+
+ Purpose: PatternMatch analyzer and regex compliance tests
+ Edge cases: Email patterns, phone patterns, mixed valid/invalid
+ """
+ return pd.DataFrame({
+ "email": [
+ "test@example.com",
+ "user@domain.org",
+ "invalid-email",
+ "another@test.co.uk",
+ "bad@",
+ "good.name@company.com",
+ ],
+ "phone": [
+ "123-456-7890",
+ "987-654-3210",
+ "invalid",
+ "555-123-4567",
+ "1234567890",
+ "800-555-1234",
+ ],
+ "code": ["ABC123", "DEF456", "xyz789", "GHI012", "JKL345", "mno678"],
+ "item": list(range(1, 7)),
+ })
+
+
+def create_df_compliance() -> pd.DataFrame:
+ """Dataset for compliance predicate tests (6 rows).
+
+ Purpose: Compliance and satisfies constraint tests
+ Edge cases: Positive/negative numbers, boundary conditions
+ """
+ return pd.DataFrame({
+ "positive": [1, 2, 3, 4, 5, 6],
+ "negative": [-1, -2, -3, -4, -5, -6],
+ "mixed": [-2, -1, 0, 1, 2, 3],
+ "with_null": [1, 2, None, 4, 5, None],
+ "item": list(range(1, 7)),
+ })
+
+
+def create_df_quantile() -> pd.DataFrame:
+ """Dataset for quantile testing (10 rows).
+
+ Purpose: ApproxQuantile analyzer tests
+ Edge cases: Sorted values for predictable quantiles
+ Values: 1-10, Median (50th percentile) = 5.5
+ """
+ return pd.DataFrame({
+ "value": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+ "item": list(range(1, 11)),
+ })
+
+
+def create_df_contained_in() -> pd.DataFrame:
+ """Dataset for isContainedIn constraint tests (6 rows).
+
+ Purpose: Testing containment in allowed value sets
+ Edge cases: All valid, some invalid, NULL handling
+ """
+ return pd.DataFrame({
+ "status": ["active", "inactive", "pending", "active", "inactive", "active"],
+ "category": ["A", "B", "C", "A", "B", "D"], # D is not in typical allowed set
+ "priority": [1, 2, 3, 1, 2, 4], # 4 might be outside allowed range
+ "item": list(range(1, 7)),
+ })
+
+
+def create_df_histogram() -> pd.DataFrame:
+ """Dataset for histogram testing (10 rows).
+
+ Purpose: Histogram analyzer tests
+ Edge cases: Low cardinality categorical data
+ """
+ return pd.DataFrame({
+ "category": ["A", "A", "A", "B", "B", "C", "C", "C", "C", "D"],
+ "status": ["active", "active", "inactive", "active", "inactive",
+ "active", "active", "inactive", "inactive", "active"],
+ "item": list(range(1, 11)),
+ })
+
+
+def create_df_mutual_info() -> pd.DataFrame:
+ """Dataset for mutual information testing (8 rows).
+
+ Purpose: MutualInformation analyzer tests
+    Edge cases: Perfectly dependent vs. only partially dependent columns
+ """
+ return pd.DataFrame({
+ "x": ["a", "a", "b", "b", "c", "c", "d", "d"],
+ "y_dependent": ["a", "a", "b", "b", "c", "c", "d", "d"], # Perfectly dependent on x
+ "y_independent": ["p", "q", "r", "s", "p", "q", "r", "s"], # Less dependent
+ "item": list(range(1, 9)),
+ })
+
+
+def create_df_data_type() -> pd.DataFrame:
+ """Dataset for DataType analyzer testing.
+
+ Purpose: Testing data type inference
+ Edge cases: Mixed numeric strings, pure numeric, non-numeric
+ """
+ return pd.DataFrame({
+ "numeric_strings": ["1", "2", "3", "4", "5"],
+ "mixed": ["1", "2", "three", "4", "five"],
+ "pure_numeric": [1.0, 2.0, 3.0, 4.0, 5.0],
+ "strings": ["a", "b", "c", "d", "e"],
+ "item": list(range(1, 6)),
+ })
+
+
+# Expected values registry for DuckDB-only tests
+# Key: (dataset_name, analyzer_name, instance) -> expected_value
+# instance can be a column name, tuple of columns, or None for dataset-level metrics
+EXPECTED_VALUES: Dict[Tuple[str, str, Any], float] = {
+ # Size analyzer
+ ("df_full", "Size", None): 4.0,
+ ("df_missing", "Size", None): 12.0,
+ ("df_numeric", "Size", None): 6.0,
+ ("df_empty", "Size", None): 0.0,
+ ("df_single", "Size", None): 1.0,
+
+ # Completeness analyzer
+ ("df_full", "Completeness", "att1"): 1.0,
+ ("df_full", "Completeness", "att2"): 1.0,
+ ("df_missing", "Completeness", "att1"): 0.5, # 6/12
+ ("df_missing", "Completeness", "att2"): 0.75, # 9/12
+ ("df_all_null", "Completeness", "value"): 0.0,
+ ("df_single", "Completeness", "att1"): 1.0,
+ ("df_unique", "Completeness", "unique_col"): 1.0,
+ ("df_unique", "Completeness", "half_null"): 0.5, # 3/6
+
+ # Mean analyzer
+ ("df_numeric", "Mean", "att1"): 3.5, # (1+2+3+4+5+6)/6
+ ("df_numeric", "Mean", "att2"): 3.0, # (1+2+3+4+5)/5, NULL excluded
+ ("df_single", "Mean", "item"): 1.0,
+ ("df_single", "Mean", "price"): 10.0,
+
+ # Sum analyzer
+ ("df_numeric", "Sum", "att1"): 21.0, # 1+2+3+4+5+6
+ ("df_numeric", "Sum", "att2"): 15.0, # 1+2+3+4+5, NULL excluded
+ ("df_single", "Sum", "item"): 1.0,
+ ("df_single", "Sum", "price"): 10.0,
+
+ # Minimum analyzer
+ ("df_numeric", "Minimum", "att1"): 1.0,
+ ("df_numeric", "Minimum", "att2"): 1.0,
+ ("df_single", "Minimum", "item"): 1.0,
+ ("df_single", "Minimum", "price"): 10.0,
+
+ # Maximum analyzer
+ ("df_numeric", "Maximum", "att1"): 6.0,
+ ("df_numeric", "Maximum", "att2"): 5.0, # 6 is NULL position
+ ("df_single", "Maximum", "item"): 1.0,
+ ("df_single", "Maximum", "price"): 10.0,
+
+ # StandardDeviation analyzer (population stddev)
+ ("df_numeric", "StandardDeviation", "att1"): 1.7078251276599330, # sqrt(17.5/6)
+
+ # String length analyzers
+ ("df_string_lengths", "MinLength", "att1"): 0.0, # Empty string
+ ("df_string_lengths", "MaxLength", "att1"): 4.0, # "dddd"
+ ("df_string_lengths", "MinLength", "att2"): 4.0, # "test", "data"
+ ("df_string_lengths", "MaxLength", "att2"): 5.0, # "hello", "world", "value"
+
+ # Distinctness analyzer (distinct values / total rows)
+ ("df_distinct", "Distinctness", "att1"): 0.5, # 3 distinct / 6 rows
+ ("df_distinct", "Distinctness", "att2"): 1.0, # 6 distinct / 6 rows
+ ("df_unique", "Distinctness", "all_same"): 1/6, # 1 distinct / 6 rows
+
+ # Uniqueness analyzer (rows with unique values / total rows)
+ ("df_distinct", "Uniqueness", "att1"): 0.0, # No unique values (all duplicated)
+ ("df_distinct", "Uniqueness", "att2"): 1.0, # All values are unique
+ ("df_unique", "Uniqueness", "unique_col"): 1.0, # All values unique
+ ("df_unique", "Uniqueness", "non_unique"): 0.0, # All values duplicated
+
+ # UniqueValueRatio analyzer (unique values / distinct values)
+ ("df_distinct", "UniqueValueRatio", "att1"): 0.0, # 0 unique / 3 distinct
+ ("df_distinct", "UniqueValueRatio", "att2"): 1.0, # 6 unique / 6 distinct
+
+ # Correlation analyzer
+ ("df_correlation", "Correlation", ("x", "y")): 1.0, # Perfect positive
+ ("df_correlation", "Correlation", ("x", "z")): -1.0, # Perfect negative
+
+ # Entropy analyzer
+ ("df_entropy", "Entropy", "uniform"): 1.3862943611198906, # ln(4) for 4 uniform values
+ ("df_entropy", "Entropy", "constant"): 0.0, # Single value = 0 entropy
+
+ # ApproxCountDistinct analyzer
+ ("df_full", "ApproxCountDistinct", "att1"): 3.0, # "a", "b", "c" (a appears twice)
+ ("df_full", "ApproxCountDistinct", "item"): 4.0, # 1, 2, 3, 4
+ ("df_distinct", "ApproxCountDistinct", "att1"): 3.0, # "a", "b", "c"
+ ("df_distinct", "ApproxCountDistinct", "att2"): 6.0, # All distinct
+
+ # CountDistinct analyzer
+ ("df_full", "CountDistinct", "att1"): 3.0,
+ ("df_distinct", "CountDistinct", "att1"): 3.0,
+ ("df_distinct", "CountDistinct", "att2"): 6.0,
+
+ # PatternMatch analyzer (fraction of rows matching pattern)
+ # These will be tested with specific patterns in the tests
+
+ # Compliance analyzer (fraction of rows satisfying predicate)
+ ("df_compliance", "Compliance", "positive > 0"): 1.0, # All positive
+ ("df_compliance", "Compliance", "negative < 0"): 1.0, # All negative
+ ("df_compliance", "Compliance", "mixed > 0"): 0.5, # 3/6 > 0
+
+ # Quantile analyzer (approximate)
+ ("df_quantile", "ApproxQuantile", ("value", 0.5)): 5.5, # Median
+ ("df_quantile", "ApproxQuantile", ("value", 0.25)): 3.0, # 25th percentile (approx)
+ ("df_quantile", "ApproxQuantile", ("value", 0.75)): 8.0, # 75th percentile (approx)
+}
+
+
+# Tolerance levels for comparing floating-point results
+FLOAT_EPSILON = 1e-9 # Exact comparisons: Size, Completeness, Uniqueness
+FLOAT_TOLERANCE = 1e-6 # Statistical: Mean, StdDev, Correlation
+APPROX_TOLERANCE = 0.1 # Approximate algorithms: ApproxCountDistinct (10% relative)
+
+
+# Mapping of analyzer types to their expected tolerance
+ANALYZER_TOLERANCES: Dict[str, float] = {
+ # Exact metrics
+ "Size": FLOAT_EPSILON,
+ "Completeness": FLOAT_EPSILON,
+ "Uniqueness": FLOAT_EPSILON,
+ "Distinctness": FLOAT_EPSILON,
+ "UniqueValueRatio": FLOAT_EPSILON,
+ "CountDistinct": FLOAT_EPSILON,
+ "MinLength": FLOAT_EPSILON,
+ "MaxLength": FLOAT_EPSILON,
+ "PatternMatch": FLOAT_EPSILON,
+ "Compliance": FLOAT_EPSILON,
+
+ # Statistical metrics
+ "Mean": FLOAT_TOLERANCE,
+ "Sum": FLOAT_TOLERANCE,
+ "Minimum": FLOAT_TOLERANCE,
+ "Maximum": FLOAT_TOLERANCE,
+ "StandardDeviation": FLOAT_TOLERANCE,
+ "Correlation": FLOAT_TOLERANCE,
+ "Entropy": FLOAT_TOLERANCE,
+ "MutualInformation": FLOAT_TOLERANCE,
+ "ApproxQuantile": FLOAT_TOLERANCE,
+
+ # Approximate metrics
+ "ApproxCountDistinct": APPROX_TOLERANCE,
+}
+
+
+def get_tolerance(analyzer_name: str) -> float:
+ """Get the appropriate tolerance for an analyzer type."""
+ return ANALYZER_TOLERANCES.get(analyzer_name, FLOAT_TOLERANCE)
+
+
+def is_close(actual: float, expected: float, tolerance: float) -> bool:
+ """Check if two values are close within tolerance.
+
+ For APPROX_TOLERANCE, uses relative comparison.
+ For smaller tolerances, uses absolute comparison.
+ """
+ if expected is None or actual is None:
+ return expected is None and actual is None
+
+ if tolerance >= APPROX_TOLERANCE:
+ # Relative tolerance for approximate algorithms
+ if expected == 0:
+ return abs(actual) < tolerance
+ return abs(actual - expected) / abs(expected) < tolerance
+ else:
+ # Absolute tolerance for exact/statistical metrics
+ return abs(actual - expected) < tolerance
+
+
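+# Illustrative check of the two comparison modes (the numeric values here
+# are examples, not fixtures):
+#
+#   >>> is_close(actual=105.0, expected=100.0, tolerance=APPROX_TOLERANCE)
+#   True   # relative: |105 - 100| / 100 = 0.05 < 0.1
+#   >>> is_close(actual=100.5, expected=100.0, tolerance=FLOAT_TOLERANCE)
+#   False  # absolute: 0.5 is not < 1e-6
+
+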
+# Dataset factory registry
+DATASET_FACTORIES = {
+ "df_full": create_df_full,
+ "df_missing": create_df_missing,
+ "df_numeric": create_df_numeric,
+ "df_unique": create_df_unique,
+ "df_distinct": create_df_distinct,
+ "df_string_lengths": create_df_string_lengths,
+ "df_empty": create_df_empty,
+ "df_single": create_df_single,
+ "df_all_null": create_df_all_null,
+ "df_escape": create_df_escape,
+ "df_correlation": create_df_correlation,
+ "df_entropy": create_df_entropy,
+ "df_where": create_df_where,
+ "df_pattern": create_df_pattern,
+ "df_compliance": create_df_compliance,
+ "df_quantile": create_df_quantile,
+ "df_contained_in": create_df_contained_in,
+ "df_histogram": create_df_histogram,
+ "df_mutual_info": create_df_mutual_info,
+ "df_data_type": create_df_data_type,
+}
+
+
+def get_dataset(name: str) -> pd.DataFrame:
+ """Get a dataset by name."""
+ if name not in DATASET_FACTORIES:
+ raise ValueError(f"Unknown dataset: {name}. Available: {list(DATASET_FACTORIES.keys())}")
+ return DATASET_FACTORIES[name]()
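+
+
+# Minimal driver sketch showing how the helpers compose (illustrative:
+# `compute_mean` is a hypothetical stand-in for whichever engine computes
+# the metric; it is not defined in this module):
+#
+#   df = get_dataset("df_numeric")
+#   expected = EXPECTED_VALUES[("df_numeric", "Mean", "att1")]  # 3.5
+#   assert is_close(compute_mean(df, "att1"), expected, get_tolerance("Mean"))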
diff --git a/tests/engines/test_constraint_evaluators.py b/tests/engines/test_constraint_evaluators.py
new file mode 100644
index 0000000..2b67027
--- /dev/null
+++ b/tests/engines/test_constraint_evaluators.py
@@ -0,0 +1,473 @@
+# -*- coding: utf-8 -*-
+"""
+Unit tests for constraint evaluators.
+
+These tests verify the constraint evaluator abstractions work correctly
+in isolation, testing SQL generation and evaluation logic.
+"""
+
+import pandas as pd
+import pytest
+
+from pydeequ.engines.constraints import (
+ ConstraintEvaluatorFactory,
+ BaseEvaluator,
+ RatioCheckEvaluator,
+ AnalyzerBasedEvaluator,
+ # Analyzer-based evaluators
+ SizeEvaluator,
+ CompletenessEvaluator,
+ MeanEvaluator,
+ MinimumEvaluator,
+ MaximumEvaluator,
+ SumEvaluator,
+ StandardDeviationEvaluator,
+ UniquenessEvaluator,
+ DistinctnessEvaluator,
+ UniqueValueRatioEvaluator,
+ CorrelationEvaluator,
+ EntropyEvaluator,
+ MutualInformationEvaluator,
+ PatternMatchEvaluator,
+ MinLengthEvaluator,
+ MaxLengthEvaluator,
+ ApproxCountDistinctEvaluator,
+ ApproxQuantileEvaluator,
+ ComplianceEvaluator,
+ # Ratio-check evaluators
+ IsPositiveEvaluator,
+ IsNonNegativeEvaluator,
+ IsContainedInEvaluator,
+ ContainsEmailEvaluator,
+ ContainsURLEvaluator,
+ ContainsCreditCardEvaluator,
+ ContainsSSNEvaluator,
+ # Comparison evaluators
+ ColumnComparisonEvaluator,
+ # Multi-column evaluators
+ MultiColumnCompletenessEvaluator,
+)
+
+
+class MockConstraintProto:
+ """Mock constraint protobuf for testing."""
+
+ def __init__(
+ self,
+ type: str = "test",
+ column: str = "",
+ columns: list = None,
+ where: str = "",
+ pattern: str = "",
+ column_condition: str = "",
+ constraint_name: str = "",
+ allowed_values: list = None,
+ quantile: float = 0.5,
+ assertion=None,
+ ):
+ self.type = type
+ self.column = column
+ self.columns = columns or []
+ self.where = where
+ self.pattern = pattern
+ self.column_condition = column_condition
+ self.constraint_name = constraint_name
+ self.allowed_values = allowed_values or []
+ self.quantile = quantile
+ self._assertion = assertion
+
+ def HasField(self, field_name):
+ if field_name == "assertion":
+ return self._assertion is not None
+ return False
+
+ @property
+ def assertion(self):
+ return self._assertion
+
+
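+# Quick illustration of the mock's protobuf-style presence check (values
+# are illustrative):
+#
+#   proto = MockConstraintProto(type="hasSize")
+#   proto.HasField("assertion")  # -> False until an assertion is supplied
+
+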
+class TestConstraintEvaluatorFactory:
+ """Tests for ConstraintEvaluatorFactory."""
+
+ def test_create_size_evaluator(self):
+ """Test creating SizeEvaluator."""
+ proto = MockConstraintProto(type="hasSize")
+ evaluator = ConstraintEvaluatorFactory.create(proto)
+ assert evaluator is not None
+ assert isinstance(evaluator, SizeEvaluator)
+
+ def test_create_completeness_evaluator(self):
+ """Test creating CompletenessEvaluator for isComplete."""
+ proto = MockConstraintProto(type="isComplete", column="col1")
+ evaluator = ConstraintEvaluatorFactory.create(proto)
+ assert evaluator is not None
+ assert isinstance(evaluator, CompletenessEvaluator)
+
+ def test_create_completeness_evaluator_has(self):
+ """Test creating CompletenessEvaluator for hasCompleteness."""
+ proto = MockConstraintProto(type="hasCompleteness", column="col1")
+ evaluator = ConstraintEvaluatorFactory.create(proto)
+ assert evaluator is not None
+ assert isinstance(evaluator, CompletenessEvaluator)
+
+ def test_create_is_positive_evaluator(self):
+ """Test creating IsPositiveEvaluator."""
+ proto = MockConstraintProto(type="isPositive", column="col1")
+ evaluator = ConstraintEvaluatorFactory.create(proto)
+ assert evaluator is not None
+ assert isinstance(evaluator, IsPositiveEvaluator)
+
+ def test_create_is_contained_in_evaluator(self):
+ """Test creating IsContainedInEvaluator."""
+ proto = MockConstraintProto(
+ type="isContainedIn",
+ column="col1",
+ allowed_values=["a", "b", "c"]
+ )
+ evaluator = ConstraintEvaluatorFactory.create(proto)
+ assert evaluator is not None
+ assert isinstance(evaluator, IsContainedInEvaluator)
+
+ def test_create_column_comparison_evaluator(self):
+ """Test creating ColumnComparisonEvaluator for isLessThan."""
+ proto = MockConstraintProto(type="isLessThan", columns=["col1", "col2"])
+ evaluator = ConstraintEvaluatorFactory.create(proto)
+ assert evaluator is not None
+ assert isinstance(evaluator, ColumnComparisonEvaluator)
+
+ def test_create_multi_column_completeness_evaluator(self):
+ """Test creating MultiColumnCompletenessEvaluator."""
+ proto = MockConstraintProto(type="areComplete", columns=["col1", "col2"])
+ evaluator = ConstraintEvaluatorFactory.create(proto)
+ assert evaluator is not None
+ assert isinstance(evaluator, MultiColumnCompletenessEvaluator)
+
+ def test_create_unknown_type_returns_none(self):
+ """Test that unknown constraint types return None."""
+ proto = MockConstraintProto(type="unknownType")
+ evaluator = ConstraintEvaluatorFactory.create(proto)
+ assert evaluator is None
+
+ def test_is_supported(self):
+ """Test is_supported method."""
+ assert ConstraintEvaluatorFactory.is_supported("hasSize")
+ assert ConstraintEvaluatorFactory.is_supported("isComplete")
+ assert ConstraintEvaluatorFactory.is_supported("isPositive")
+ assert not ConstraintEvaluatorFactory.is_supported("unknownType")
+
+ def test_supported_types(self):
+ """Test supported_types method returns all registered types."""
+ types = ConstraintEvaluatorFactory.supported_types()
+ assert "hasSize" in types
+ assert "isComplete" in types
+ assert "hasCompleteness" in types
+ assert "isPositive" in types
+ assert "isNonNegative" in types
+ assert "isContainedIn" in types
+ assert "isLessThan" in types
+ assert "areComplete" in types
+
+
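+# The factory is exercised above as a type-string -> evaluator-class
+# dispatch. A minimal sketch of that pattern (illustrative only; the real
+# registry lives in pydeequ.engines.constraints and covers many more types):
+#
+#   _REGISTRY = {"hasSize": SizeEvaluator, "isComplete": CompletenessEvaluator}
+#
+#   def create(proto):
+#       evaluator_cls = _REGISTRY.get(proto.type)
+#       return evaluator_cls(proto) if evaluator_cls else None
+
+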
+class TestRatioCheckEvaluators:
+ """Tests for ratio-check evaluator condition generation."""
+
+ def test_is_positive_condition(self):
+ """Test IsPositiveEvaluator generates correct condition."""
+ proto = MockConstraintProto(type="isPositive", column="price")
+ evaluator = IsPositiveEvaluator(proto)
+ assert evaluator.get_condition() == "price > 0"
+
+ def test_is_non_negative_condition(self):
+ """Test IsNonNegativeEvaluator generates correct condition."""
+ proto = MockConstraintProto(type="isNonNegative", column="count")
+ evaluator = IsNonNegativeEvaluator(proto)
+ assert evaluator.get_condition() == "count >= 0"
+
+ def test_is_contained_in_condition(self):
+ """Test IsContainedInEvaluator generates correct condition."""
+ proto = MockConstraintProto(
+ type="isContainedIn",
+ column="status",
+ allowed_values=["active", "pending"]
+ )
+ evaluator = IsContainedInEvaluator(proto)
+ condition = evaluator.get_condition()
+ assert "status IN" in condition
+ assert "'active'" in condition
+ assert "'pending'" in condition
+
+ def test_is_contained_in_escapes_quotes(self):
+ """Test IsContainedInEvaluator properly escapes single quotes."""
+ proto = MockConstraintProto(
+ type="isContainedIn",
+ column="name",
+ allowed_values=["O'Brien", "D'Angelo"]
+ )
+ evaluator = IsContainedInEvaluator(proto)
+ condition = evaluator.get_condition()
+ assert "O''Brien" in condition
+ assert "D''Angelo" in condition
+
+ def test_contains_email_pattern(self):
+ """Test ContainsEmailEvaluator uses email regex pattern."""
+ proto = MockConstraintProto(type="containsEmail", column="email")
+ evaluator = ContainsEmailEvaluator(proto)
+ condition = evaluator.get_condition()
+ assert "REGEXP_MATCHES" in condition
+ assert "email" in condition
+
+ def test_contains_url_pattern(self):
+ """Test ContainsURLEvaluator uses URL regex pattern."""
+ proto = MockConstraintProto(type="containsURL", column="website")
+ evaluator = ContainsURLEvaluator(proto)
+ condition = evaluator.get_condition()
+ assert "REGEXP_MATCHES" in condition
+ assert "website" in condition
+
+ def test_column_comparison_less_than(self):
+ """Test ColumnComparisonEvaluator for isLessThan."""
+ proto = MockConstraintProto(type="isLessThan", columns=["col_a", "col_b"])
+ evaluator = ColumnComparisonEvaluator(proto)
+ assert evaluator.get_condition() == "col_a < col_b"
+
+ def test_column_comparison_greater_than(self):
+ """Test ColumnComparisonEvaluator for isGreaterThan."""
+ proto = MockConstraintProto(type="isGreaterThan", columns=["col_a", "col_b"])
+ evaluator = ColumnComparisonEvaluator(proto)
+ assert evaluator.get_condition() == "col_a > col_b"
+
+ def test_column_comparison_less_than_or_equal(self):
+ """Test ColumnComparisonEvaluator for isLessThanOrEqualTo."""
+ proto = MockConstraintProto(type="isLessThanOrEqualTo", columns=["col_a", "col_b"])
+ evaluator = ColumnComparisonEvaluator(proto)
+ assert evaluator.get_condition() == "col_a <= col_b"
+
+ def test_column_comparison_greater_than_or_equal(self):
+ """Test ColumnComparisonEvaluator for isGreaterThanOrEqualTo."""
+ proto = MockConstraintProto(type="isGreaterThanOrEqualTo", columns=["col_a", "col_b"])
+ evaluator = ColumnComparisonEvaluator(proto)
+ assert evaluator.get_condition() == "col_a >= col_b"
+
+
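+# The conditions above are SQL fragments. One plausible evaluation query
+# (an illustrative assumption about the engine's query shape, not its exact
+# SQL) turns a condition into a passing-row ratio in a single scan:
+#
+#   SELECT SUM(CASE WHEN price > 0 THEN 1 ELSE 0 END) * 1.0 / COUNT(*)
+#   FROM data
+#   WHERE status = 'active'  -- optional filter, see TestWhereClauseHandling
+
+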
+class TestEvaluatorToString:
+ """Tests for evaluator to_string methods."""
+
+ def test_size_evaluator_to_string(self):
+ """Test SizeEvaluator to_string."""
+ proto = MockConstraintProto(type="hasSize")
+ evaluator = SizeEvaluator(proto)
+ assert "hasSize" in evaluator.to_string()
+
+ def test_completeness_evaluator_to_string_is_complete(self):
+ """Test CompletenessEvaluator to_string for isComplete."""
+ proto = MockConstraintProto(type="isComplete", column="col1")
+ evaluator = CompletenessEvaluator(proto)
+ result = evaluator.to_string()
+ assert "Complete" in result
+ assert "col1" in result
+
+ def test_is_positive_evaluator_to_string(self):
+ """Test IsPositiveEvaluator to_string."""
+ proto = MockConstraintProto(type="isPositive", column="price")
+ evaluator = IsPositiveEvaluator(proto)
+ result = evaluator.to_string()
+ assert "isPositive" in result
+ assert "price" in result
+
+ def test_is_contained_in_evaluator_to_string(self):
+ """Test IsContainedInEvaluator to_string."""
+ proto = MockConstraintProto(
+ type="isContainedIn",
+ column="status",
+ allowed_values=["a", "b"]
+ )
+ evaluator = IsContainedInEvaluator(proto)
+ result = evaluator.to_string()
+ assert "isContainedIn" in result
+ assert "status" in result
+
+ def test_column_comparison_to_string(self):
+ """Test ColumnComparisonEvaluator to_string."""
+ proto = MockConstraintProto(type="isLessThan", columns=["a", "b"])
+ evaluator = ColumnComparisonEvaluator(proto)
+ result = evaluator.to_string()
+ assert "isLessThan" in result
+ assert "a" in result
+ assert "b" in result
+
+ def test_multi_column_completeness_to_string(self):
+ """Test MultiColumnCompletenessEvaluator to_string."""
+ proto = MockConstraintProto(type="areComplete", columns=["col1", "col2"])
+ evaluator = MultiColumnCompletenessEvaluator(proto)
+ result = evaluator.to_string()
+ assert "Complete" in result
+ assert "col1" in result
+ assert "col2" in result
+
+
+class TestEvaluatorEvaluation:
+ """Tests for evaluator evaluation logic."""
+
+ def test_evaluate_none_value_returns_false(self):
+ """Test that evaluating None value returns False."""
+ proto = MockConstraintProto(type="hasSize")
+ evaluator = SizeEvaluator(proto)
+ assert evaluator.evaluate(None) is False
+
+ def test_evaluate_1_0_without_assertion_returns_true(self):
+ """Test that evaluating 1.0 without assertion returns True."""
+ proto = MockConstraintProto(type="isComplete", column="col1")
+ evaluator = CompletenessEvaluator(proto)
+ assert evaluator.evaluate(1.0) is True
+
+ def test_evaluate_less_than_1_without_assertion_returns_false(self):
+ """Test that evaluating < 1.0 without assertion returns False."""
+ proto = MockConstraintProto(type="isComplete", column="col1")
+ evaluator = CompletenessEvaluator(proto)
+ assert evaluator.evaluate(0.5) is False
+
+
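+# Default semantics exercised above (an inference from these tests, not a
+# quote of BaseEvaluator): with no assertion attached, a metric passes only
+# when it is exactly 1.0, i.e. roughly
+#
+#   def evaluate(value):
+#       return value is not None and value == 1.0
+
+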
+class TestAnalyzerBasedEvaluators:
+ """Tests for analyzer-based evaluator operator generation."""
+
+ def test_completeness_evaluator_get_operator(self):
+ """Test CompletenessEvaluator creates correct operator."""
+ from pydeequ.engines.operators import CompletenessOperator
+
+ proto = MockConstraintProto(type="isComplete", column="col1")
+ evaluator = CompletenessEvaluator(proto)
+ operator = evaluator.get_operator()
+ assert isinstance(operator, CompletenessOperator)
+ assert operator.column == "col1"
+
+ def test_mean_evaluator_get_operator(self):
+ """Test MeanEvaluator creates correct operator."""
+ from pydeequ.engines.operators import MeanOperator
+
+ proto = MockConstraintProto(type="hasMean", column="value")
+ evaluator = MeanEvaluator(proto)
+ operator = evaluator.get_operator()
+ assert isinstance(operator, MeanOperator)
+ assert operator.column == "value"
+
+ def test_uniqueness_evaluator_get_operator(self):
+ """Test UniquenessEvaluator creates correct operator."""
+ from pydeequ.engines.operators import UniquenessOperator
+
+ proto = MockConstraintProto(type="isUnique", column="id")
+ evaluator = UniquenessEvaluator(proto)
+ operator = evaluator.get_operator()
+ assert isinstance(operator, UniquenessOperator)
+
+ def test_pattern_match_evaluator_get_operator(self):
+ """Test PatternMatchEvaluator creates correct operator."""
+ from pydeequ.engines.operators import PatternMatchOperator
+
+ proto = MockConstraintProto(type="hasPattern", column="email", pattern="^.*@.*$")
+ evaluator = PatternMatchEvaluator(proto)
+ operator = evaluator.get_operator()
+ assert isinstance(operator, PatternMatchOperator)
+ assert operator.column == "email"
+
+ def test_approx_quantile_evaluator_get_operator(self):
+ """Test ApproxQuantileEvaluator creates correct operator."""
+ from pydeequ.engines.operators import ApproxQuantileOperator
+
+ proto = MockConstraintProto(type="hasApproxQuantile", column="value", quantile=0.75)
+ evaluator = ApproxQuantileEvaluator(proto)
+ operator = evaluator.get_operator()
+ assert isinstance(operator, ApproxQuantileOperator)
+ assert operator.quantile == 0.75
+
+
+class TestWhereClauseHandling:
+ """Tests for WHERE clause handling in evaluators."""
+
+ def test_ratio_evaluator_with_where_clause(self):
+ """Test ratio evaluator includes WHERE in query."""
+ proto = MockConstraintProto(type="isPositive", column="price", where="status='active'")
+ evaluator = IsPositiveEvaluator(proto)
+ assert evaluator.where == "status='active'"
+
+ def test_analyzer_evaluator_with_where_clause(self):
+ """Test analyzer evaluator passes WHERE to operator."""
+ proto = MockConstraintProto(type="hasMean", column="value", where="status='active'")
+ evaluator = MeanEvaluator(proto)
+ operator = evaluator.get_operator()
+ assert operator.where == "status='active'"
+
+
+class TestSpecialConstraintTypes:
+ """Tests for special constraint types with extra parameters."""
+
+ def test_compliance_evaluator(self):
+ """Test ComplianceEvaluator with column_condition."""
+ proto = MockConstraintProto(
+ type="satisfies",
+ column_condition="price > 0 AND quantity > 0",
+ constraint_name="valid_order"
+ )
+ evaluator = ComplianceEvaluator(proto)
+ assert evaluator.predicate == "price > 0 AND quantity > 0"
+ assert evaluator.name == "valid_order"
+ result = evaluator.to_string()
+ assert "satisfies" in result
+
+ def test_correlation_evaluator_requires_two_columns(self):
+ """Test CorrelationEvaluator handles missing columns."""
+ proto = MockConstraintProto(type="hasCorrelation", columns=["col1"])
+ evaluator = CorrelationEvaluator(proto)
+ # compute_value should return None when fewer than two columns are given
+ result = evaluator.compute_value("test_table", lambda q: pd.DataFrame())
+ assert result is None
+
+ def test_mutual_information_evaluator_requires_two_columns(self):
+ """Test MutualInformationEvaluator handles missing columns."""
+ proto = MockConstraintProto(type="hasMutualInformation", columns=["col1"])
+ evaluator = MutualInformationEvaluator(proto)
+ result = evaluator.compute_value("test_table", lambda q: pd.DataFrame())
+ assert result is None
+
+
+class TestAllConstraintTypesSupported:
+ """Verify all constraint types have evaluators."""
+
+ @pytest.mark.parametrize("constraint_type", [
+ "hasSize",
+ "isComplete",
+ "hasCompleteness",
+ "hasMean",
+ "hasMin",
+ "hasMax",
+ "hasSum",
+ "hasStandardDeviation",
+ "isUnique",
+ "hasUniqueness",
+ "hasDistinctness",
+ "hasUniqueValueRatio",
+ "hasCorrelation",
+ "hasEntropy",
+ "hasMutualInformation",
+ "hasPattern",
+ "hasMinLength",
+ "hasMaxLength",
+ "hasApproxCountDistinct",
+ "hasApproxQuantile",
+ "satisfies",
+ "isPositive",
+ "isNonNegative",
+ "isContainedIn",
+ "containsEmail",
+ "containsURL",
+ "containsCreditCardNumber",
+ "containsSocialSecurityNumber",
+ "isLessThan",
+ "isLessThanOrEqualTo",
+ "isGreaterThan",
+ "isGreaterThanOrEqualTo",
+ "areComplete",
+ "haveCompleteness",
+ ])
+ def test_constraint_type_has_evaluator(self, constraint_type):
+ """Verify each constraint type maps to an evaluator."""
+ assert ConstraintEvaluatorFactory.is_supported(constraint_type)
diff --git a/tests/engines/test_duckdb_analyzers.py b/tests/engines/test_duckdb_analyzers.py
new file mode 100644
index 0000000..189c80f
--- /dev/null
+++ b/tests/engines/test_duckdb_analyzers.py
@@ -0,0 +1,650 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""DuckDB-only analyzer tests.
+
+Tests all 22 analyzers against known expected values from the test datasets.
+These tests do not require Spark and can run quickly in CI.
+"""
+
+import math
+import pytest
+
+from pydeequ.v2.analyzers import (
+ Size,
+ Completeness,
+ Mean,
+ Sum,
+ Maximum,
+ Minimum,
+ StandardDeviation,
+ Distinctness,
+ Uniqueness,
+ UniqueValueRatio,
+ CountDistinct,
+ ApproxCountDistinct,
+ ApproxQuantile,
+ Correlation,
+ MutualInformation,
+ MaxLength,
+ MinLength,
+ PatternMatch,
+ Compliance,
+ Entropy,
+ Histogram,
+ DataType,
+)
+
+from tests.engines.conftest import get_metric_value, get_metric
+from tests.engines.fixtures.datasets import (
+ EXPECTED_VALUES,
+ FLOAT_EPSILON,
+ FLOAT_TOLERANCE,
+ APPROX_TOLERANCE,
+ is_close,
+ get_tolerance,
+)
+
+
+class TestSizeAnalyzer:
+ """Tests for the Size analyzer."""
+
+ def test_size_basic(self, engine_full):
+ """Size returns correct row count for basic dataset."""
+ metrics = engine_full.compute_metrics([Size()])
+ value = get_metric_value(metrics, "Size")
+ assert value == 4.0
+
+ def test_size_empty(self, engine_empty):
+ """Size returns 0 for empty dataset."""
+ metrics = engine_empty.compute_metrics([Size()])
+ value = get_metric_value(metrics, "Size")
+ assert value == 0.0
+
+ def test_size_single(self, engine_single):
+ """Size returns 1 for single-row dataset."""
+ metrics = engine_single.compute_metrics([Size()])
+ value = get_metric_value(metrics, "Size")
+ assert value == 1.0
+
+ def test_size_missing(self, engine_missing):
+ """Size counts all rows regardless of NULLs."""
+ metrics = engine_missing.compute_metrics([Size()])
+ value = get_metric_value(metrics, "Size")
+ assert value == 12.0
+
+ def test_size_with_where(self, engine_where):
+ """Size respects WHERE clause."""
+ metrics = engine_where.compute_metrics([Size(where="category = 'A'")])
+ value = get_metric_value(metrics, "Size")
+ assert value == 2.0
+
+
+class TestCompletenessAnalyzer:
+ """Tests for the Completeness analyzer."""
+
+ def test_completeness_full(self, engine_full):
+ """Completeness is 1.0 for columns with no NULLs."""
+ metrics = engine_full.compute_metrics([Completeness("att1")])
+ value = get_metric_value(metrics, "Completeness", "att1")
+ assert is_close(value, 1.0, FLOAT_EPSILON)
+
+ def test_completeness_partial(self, engine_missing):
+ """Completeness reflects NULL ratio correctly."""
+ metrics = engine_missing.compute_metrics([
+ Completeness("att1"),
+ Completeness("att2"),
+ ])
+ att1_value = get_metric_value(metrics, "Completeness", "att1")
+ att2_value = get_metric_value(metrics, "Completeness", "att2")
+ assert is_close(att1_value, 0.5, FLOAT_EPSILON) # 6/12
+ assert is_close(att2_value, 0.75, FLOAT_EPSILON) # 9/12
+
+ def test_completeness_all_null(self, engine_all_null):
+ """Completeness is 0.0 for all-NULL column."""
+ metrics = engine_all_null.compute_metrics([Completeness("value")])
+ value = get_metric_value(metrics, "Completeness", "value")
+ assert is_close(value, 0.0, FLOAT_EPSILON)
+
+ def test_completeness_empty(self, engine_empty):
+ """Completeness is 1.0 for empty dataset (vacuously true)."""
+ metrics = engine_empty.compute_metrics([Completeness("att1")])
+ value = get_metric_value(metrics, "Completeness", "att1")
+ # Empty dataset: 0/0 is engine-dependent, so accept 1.0 (vacuously
+ # complete), NaN, or a missing metric
+ assert value is None or is_close(value, 1.0, FLOAT_EPSILON) or math.isnan(value)
+
+ def test_completeness_with_where(self, engine_where):
+ """Completeness respects WHERE clause."""
+ # category='A': att1 has values "x", "y" (2/2 complete)
+ metrics = engine_where.compute_metrics([
+ Completeness("att1", where="category = 'A'")
+ ])
+ value = get_metric_value(metrics, "Completeness", "att1")
+ assert is_close(value, 1.0, FLOAT_EPSILON)
+
+
+class TestMeanAnalyzer:
+ """Tests for the Mean analyzer."""
+
+ def test_mean_basic(self, engine_numeric):
+ """Mean calculates correctly for numeric column."""
+ metrics = engine_numeric.compute_metrics([Mean("att1")])
+ value = get_metric_value(metrics, "Mean", "att1")
+ assert is_close(value, 3.5, FLOAT_TOLERANCE)
+
+ def test_mean_with_nulls(self, engine_numeric):
+ """Mean excludes NULL values in calculation."""
+ metrics = engine_numeric.compute_metrics([Mean("att2")])
+ value = get_metric_value(metrics, "Mean", "att2")
+ assert is_close(value, 3.0, FLOAT_TOLERANCE) # (1+2+3+4+5)/5
+
+ def test_mean_single(self, engine_single):
+ """Mean works for single row."""
+ metrics = engine_single.compute_metrics([Mean("price")])
+ value = get_metric_value(metrics, "Mean", "price")
+ assert is_close(value, 10.0, FLOAT_TOLERANCE)
+
+ def test_mean_with_where(self, engine_where):
+ """Mean respects WHERE clause."""
+ metrics = engine_where.compute_metrics([Mean("value", where="category = 'A'")])
+ value = get_metric_value(metrics, "Mean", "value")
+ assert is_close(value, 15.0, FLOAT_TOLERANCE) # (10+20)/2
+
+
+class TestSumAnalyzer:
+ """Tests for the Sum analyzer."""
+
+ def test_sum_basic(self, engine_numeric):
+ """Sum calculates correctly for numeric column."""
+ metrics = engine_numeric.compute_metrics([Sum("att1")])
+ value = get_metric_value(metrics, "Sum", "att1")
+ assert is_close(value, 21.0, FLOAT_TOLERANCE)
+
+ def test_sum_with_nulls(self, engine_numeric):
+ """Sum excludes NULL values."""
+ metrics = engine_numeric.compute_metrics([Sum("att2")])
+ value = get_metric_value(metrics, "Sum", "att2")
+ assert is_close(value, 15.0, FLOAT_TOLERANCE)
+
+ def test_sum_single(self, engine_single):
+ """Sum works for single row."""
+ metrics = engine_single.compute_metrics([Sum("price")])
+ value = get_metric_value(metrics, "Sum", "price")
+ assert is_close(value, 10.0, FLOAT_TOLERANCE)
+
+
+class TestMinimumAnalyzer:
+ """Tests for the Minimum analyzer."""
+
+ def test_minimum_basic(self, engine_numeric):
+ """Minimum finds smallest value."""
+ metrics = engine_numeric.compute_metrics([Minimum("att1")])
+ value = get_metric_value(metrics, "Minimum", "att1")
+ assert is_close(value, 1.0, FLOAT_TOLERANCE)
+
+ def test_minimum_with_nulls(self, engine_numeric):
+ """Minimum ignores NULL values."""
+ metrics = engine_numeric.compute_metrics([Minimum("att2")])
+ value = get_metric_value(metrics, "Minimum", "att2")
+ assert is_close(value, 1.0, FLOAT_TOLERANCE)
+
+ def test_minimum_single(self, engine_single):
+ """Minimum works for single row."""
+ metrics = engine_single.compute_metrics([Minimum("price")])
+ value = get_metric_value(metrics, "Minimum", "price")
+ assert is_close(value, 10.0, FLOAT_TOLERANCE)
+
+
+class TestMaximumAnalyzer:
+ """Tests for the Maximum analyzer."""
+
+ def test_maximum_basic(self, engine_numeric):
+ """Maximum finds largest value."""
+ metrics = engine_numeric.compute_metrics([Maximum("att1")])
+ value = get_metric_value(metrics, "Maximum", "att1")
+ assert is_close(value, 6.0, FLOAT_TOLERANCE)
+
+ def test_maximum_with_nulls(self, engine_numeric):
+ """Maximum ignores NULL values."""
+ metrics = engine_numeric.compute_metrics([Maximum("att2")])
+ value = get_metric_value(metrics, "Maximum", "att2")
+ assert is_close(value, 5.0, FLOAT_TOLERANCE)
+
+ def test_maximum_single(self, engine_single):
+ """Maximum works for single row."""
+ metrics = engine_single.compute_metrics([Maximum("price")])
+ value = get_metric_value(metrics, "Maximum", "price")
+ assert is_close(value, 10.0, FLOAT_TOLERANCE)
+
+
+class TestStandardDeviationAnalyzer:
+ """Tests for the StandardDeviation analyzer."""
+
+ def test_stddev_basic(self, engine_numeric):
+ """StandardDeviation calculates population stddev correctly."""
+ metrics = engine_numeric.compute_metrics([StandardDeviation("att1")])
+ value = get_metric_value(metrics, "StandardDeviation", "att1")
+ # Population stddev of [1,2,3,4,5,6] = sqrt(17.5/6) ≈ 1.7078 (matches Spark)
+ assert is_close(value, 1.7078251276599330, FLOAT_TOLERANCE)
+
+ def test_stddev_single_row(self, engine_single):
+ """StandardDeviation for single row is NaN or 0."""
+ metrics = engine_single.compute_metrics([StandardDeviation("price")])
+ value = get_metric_value(metrics, "StandardDeviation", "price")
+ # Single value: stddev is undefined (NaN) or 0
+ assert value is None or math.isnan(value) or value == 0.0
+
+
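+# Worked arithmetic behind the expected value above (population stddev of
+# [1, 2, 3, 4, 5, 6], mean 3.5):
+#
+#   sum((x - 3.5)^2) = 2 * (2.5^2 + 1.5^2 + 0.5^2) = 2 * 8.75 = 17.5
+#   population variance = 17.5 / 6 ≈ 2.91667
+#   population stddev   = sqrt(17.5 / 6) ≈ 1.7078251276599330
+
+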
+class TestDistinctnessAnalyzer:
+ """Tests for the Distinctness analyzer."""
+
+ def test_distinctness_basic(self, engine_distinct):
+ """Distinctness = distinct values / total rows."""
+ metrics = engine_distinct.compute_metrics([Distinctness(["att1"])])
+ value = get_metric_value(metrics, "Distinctness", "att1")
+ # 3 distinct values / 6 rows = 0.5
+ assert is_close(value, 0.5, FLOAT_EPSILON)
+
+ def test_distinctness_all_unique(self, engine_distinct):
+ """Distinctness is 1.0 when all values are distinct."""
+ metrics = engine_distinct.compute_metrics([Distinctness(["att2"])])
+ value = get_metric_value(metrics, "Distinctness", "att2")
+ assert is_close(value, 1.0, FLOAT_EPSILON)
+
+ def test_distinctness_all_same(self, engine_unique):
+ """Distinctness is 1/n when all values are the same."""
+ metrics = engine_unique.compute_metrics([Distinctness(["all_same"])])
+ value = get_metric_value(metrics, "Distinctness", "all_same")
+ # 1 distinct / 6 rows ≈ 0.167
+ assert is_close(value, 1/6, FLOAT_EPSILON)
+
+
+class TestUniquenessAnalyzer:
+ """Tests for the Uniqueness analyzer."""
+
+ def test_uniqueness_all_unique(self, engine_unique):
+ """Uniqueness is 1.0 when all values appear exactly once."""
+ metrics = engine_unique.compute_metrics([Uniqueness(["unique_col"])])
+ value = get_metric_value(metrics, "Uniqueness", "unique_col")
+ assert is_close(value, 1.0, FLOAT_EPSILON)
+
+ def test_uniqueness_all_duplicated(self, engine_distinct):
+ """Uniqueness is 0.0 when all values are duplicated."""
+ metrics = engine_distinct.compute_metrics([Uniqueness(["att1"])])
+ value = get_metric_value(metrics, "Uniqueness", "att1")
+ # All values in att1 appear twice, so 0 unique
+ assert is_close(value, 0.0, FLOAT_EPSILON)
+
+ def test_uniqueness_mixed(self, engine_unique):
+ """Uniqueness handles mixed case correctly."""
+ metrics = engine_unique.compute_metrics([Uniqueness(["non_unique"])])
+ value = get_metric_value(metrics, "Uniqueness", "non_unique")
+ # [1,1,2,2,3,3] - all duplicated, uniqueness = 0
+ assert is_close(value, 0.0, FLOAT_EPSILON)
+
+
+class TestUniqueValueRatioAnalyzer:
+ """Tests for the UniqueValueRatio analyzer."""
+
+ def test_unique_value_ratio_all_unique(self, engine_distinct):
+ """UniqueValueRatio is 1.0 when unique count = distinct count."""
+ metrics = engine_distinct.compute_metrics([UniqueValueRatio(["att2"])])
+ value = get_metric_value(metrics, "UniqueValueRatio", "att2")
+ # 6 unique / 6 distinct = 1.0
+ assert is_close(value, 1.0, FLOAT_EPSILON)
+
+ def test_unique_value_ratio_no_unique(self, engine_distinct):
+ """UniqueValueRatio is 0.0 when no values are unique."""
+ metrics = engine_distinct.compute_metrics([UniqueValueRatio(["att1"])])
+ value = get_metric_value(metrics, "UniqueValueRatio", "att1")
+ # 0 unique / 3 distinct = 0.0
+ assert is_close(value, 0.0, FLOAT_EPSILON)
+
+
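+# Worked example tying together the three duplication metrics tested above,
+# on att1 = ["a", "a", "b", "b", "c", "c"] (6 rows, 3 distinct, 0 unique):
+#
+#   Distinctness     = distinct / rows     = 3 / 6 = 0.5
+#   Uniqueness       = unique   / rows     = 0 / 6 = 0.0
+#   UniqueValueRatio = unique   / distinct = 0 / 3 = 0.0
+
+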
+class TestCountDistinctAnalyzer:
+ """Tests for the CountDistinct analyzer."""
+
+ def test_count_distinct_basic(self, engine_full):
+ """CountDistinct counts unique values correctly."""
+ metrics = engine_full.compute_metrics([CountDistinct(["att1"])])
+ value = get_metric_value(metrics, "CountDistinct", "att1")
+ # "a", "b", "c" (a appears twice) = 3 distinct
+ assert is_close(value, 3.0, FLOAT_EPSILON)
+
+ def test_count_distinct_all_unique(self, engine_distinct):
+ """CountDistinct equals row count when all values are distinct."""
+ metrics = engine_distinct.compute_metrics([CountDistinct(["att2"])])
+ value = get_metric_value(metrics, "CountDistinct", "att2")
+ assert is_close(value, 6.0, FLOAT_EPSILON)
+
+ def test_count_distinct_with_duplicates(self, engine_distinct):
+ """CountDistinct counts only distinct values."""
+ metrics = engine_distinct.compute_metrics([CountDistinct(["att1"])])
+ value = get_metric_value(metrics, "CountDistinct", "att1")
+ assert is_close(value, 3.0, FLOAT_EPSILON)
+
+
+class TestApproxCountDistinctAnalyzer:
+ """Tests for the ApproxCountDistinct analyzer."""
+
+ def test_approx_count_distinct_basic(self, engine_full):
+ """ApproxCountDistinct approximates distinct count."""
+ metrics = engine_full.compute_metrics([ApproxCountDistinct("att1")])
+ value = get_metric_value(metrics, "ApproxCountDistinct", "att1")
+ # Should be approximately 3
+ assert is_close(value, 3.0, APPROX_TOLERANCE)
+
+ def test_approx_count_distinct_all_unique(self, engine_distinct):
+ """ApproxCountDistinct handles all-unique column."""
+ metrics = engine_distinct.compute_metrics([ApproxCountDistinct("att2")])
+ value = get_metric_value(metrics, "ApproxCountDistinct", "att2")
+ # HyperLogLog can have higher variance on small datasets (up to 20% error)
+ assert is_close(value, 6.0, 0.2)
+
+
+class TestApproxQuantileAnalyzer:
+ """Tests for the ApproxQuantile analyzer."""
+
+ def test_approx_quantile_median(self, engine_quantile):
+ """ApproxQuantile calculates median correctly."""
+ metrics = engine_quantile.compute_metrics([ApproxQuantile("value", 0.5)])
+ value = get_metric_value(metrics, "ApproxQuantile", "value")
+ # Median of [1,2,3,4,5,6,7,8,9,10] = 5.5
+ assert is_close(value, 5.5, FLOAT_TOLERANCE)
+
+ def test_approx_quantile_quartiles(self, engine_quantile):
+ """ApproxQuantile calculates quartiles."""
+ metrics = engine_quantile.compute_metrics([
+ ApproxQuantile("value", 0.25),
+ ApproxQuantile("value", 0.75),
+ ])
+ # Both quantiles report instance "value", so the lookup cannot tell them
+ # apart; small-dataset estimates also vary by interpolation method
+ # (DuckDB uses QUANTILE_CONT, which interpolates), so assert presence only
+ quantile_value = get_metric_value(metrics, "ApproxQuantile", "value")
+ assert quantile_value is not None
+
+
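+# Interpolation note for the quartile test above: under QUANTILE_CONT-style
+# linear interpolation over [1..10], the q-th quantile sits at index
+# q * (n - 1), so q=0.50 -> index 4.50 -> 5.5, q=0.25 -> index 2.25 -> 3.25,
+# and q=0.75 -> index 6.75 -> 7.75. Approximate algorithms may land near
+# 3.0 / 8.0 instead (see EXPECTED_VALUES), hence the presence-only assertion.
+
+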
+class TestCorrelationAnalyzer:
+ """Tests for the Correlation analyzer."""
+
+ def test_correlation_positive(self, engine_correlation):
+ """Correlation is 1.0 for perfectly positively correlated columns."""
+ metrics = engine_correlation.compute_metrics([Correlation("x", "y")])
+ value = get_metric_value(metrics, "Correlation", "x,y")
+ assert is_close(value, 1.0, FLOAT_TOLERANCE)
+
+ def test_correlation_negative(self, engine_correlation):
+ """Correlation is -1.0 for perfectly negatively correlated columns."""
+ metrics = engine_correlation.compute_metrics([Correlation("x", "z")])
+ value = get_metric_value(metrics, "Correlation", "x,z")
+ assert is_close(value, -1.0, FLOAT_TOLERANCE)
+
+
+class TestMutualInformationAnalyzer:
+ """Tests for the MutualInformation analyzer."""
+
+ def test_mutual_information_dependent(self, engine_mutual_info):
+ """MutualInformation is high for perfectly dependent columns."""
+ metrics = engine_mutual_info.compute_metrics([
+ MutualInformation(["x", "y_dependent"])
+ ])
+ value = get_metric_value(metrics, "MutualInformation", "x,y_dependent")
+ # Perfect dependency should have high MI (equal to entropy of x)
+ assert value is not None and value > 0
+
+
+class TestMaxLengthAnalyzer:
+ """Tests for the MaxLength analyzer."""
+
+ def test_maxlength_basic(self, engine_string_lengths):
+ """MaxLength finds longest string."""
+ metrics = engine_string_lengths.compute_metrics([MaxLength("att1")])
+ value = get_metric_value(metrics, "MaxLength", "att1")
+ assert is_close(value, 4.0, FLOAT_EPSILON) # "dddd"
+
+ def test_maxlength_varied(self, engine_string_lengths):
+ """MaxLength works with varying lengths."""
+ metrics = engine_string_lengths.compute_metrics([MaxLength("att2")])
+ value = get_metric_value(metrics, "MaxLength", "att2")
+ assert is_close(value, 5.0, FLOAT_EPSILON) # "hello", "world", "value"
+
+
+class TestMinLengthAnalyzer:
+ """Tests for the MinLength analyzer."""
+
+ def test_minlength_empty_string(self, engine_string_lengths):
+ """MinLength handles empty string (length 0)."""
+ metrics = engine_string_lengths.compute_metrics([MinLength("att1")])
+ value = get_metric_value(metrics, "MinLength", "att1")
+ assert is_close(value, 0.0, FLOAT_EPSILON) # ""
+
+ def test_minlength_basic(self, engine_string_lengths):
+ """MinLength finds shortest string."""
+ metrics = engine_string_lengths.compute_metrics([MinLength("att2")])
+ value = get_metric_value(metrics, "MinLength", "att2")
+ assert is_close(value, 4.0, FLOAT_EPSILON) # "test", "data"
+
+
+class TestPatternMatchAnalyzer:
+ """Tests for the PatternMatch analyzer."""
+
+ def test_pattern_match_email(self, engine_pattern):
+ """PatternMatch detects email pattern."""
+ # Simple email regex
+ metrics = engine_pattern.compute_metrics([
+ PatternMatch("email", r".*@.*\..*")
+ ])
+ value = get_metric_value(metrics, "PatternMatch", "email")
+ # 4 valid emails out of 6
+ assert is_close(value, 4/6, FLOAT_TOLERANCE)
+
+ def test_pattern_match_all_match(self, engine_full):
+ """PatternMatch returns 1.0 when all rows match."""
+ metrics = engine_full.compute_metrics([
+ PatternMatch("att1", r"^[a-c]$")
+ ])
+ value = get_metric_value(metrics, "PatternMatch", "att1")
+ # "a", "b", "c", "a" all match
+ assert is_close(value, 1.0, FLOAT_TOLERANCE)
+
+
+class TestComplianceAnalyzer:
+ """Tests for the Compliance analyzer."""
+
+ def test_compliance_all_positive(self, engine_compliance):
+ """Compliance is 1.0 when all rows satisfy predicate."""
+ metrics = engine_compliance.compute_metrics([
+ Compliance("positive_check", "positive > 0")
+ ])
+ value = get_metric_value(metrics, "Compliance", "positive_check")
+ assert is_close(value, 1.0, FLOAT_TOLERANCE)
+
+ def test_compliance_partial(self, engine_compliance):
+ """Compliance reflects fraction satisfying predicate."""
+ metrics = engine_compliance.compute_metrics([
+ Compliance("mixed_check", "mixed > 0")
+ ])
+ value = get_metric_value(metrics, "Compliance", "mixed_check")
+ # [-2,-1,0,1,2,3] -> 3 values > 0
+ assert is_close(value, 0.5, FLOAT_TOLERANCE)
+
+ def test_compliance_none(self, engine_compliance):
+ """Compliance is 0.0 when no rows satisfy predicate."""
+ metrics = engine_compliance.compute_metrics([
+ Compliance("negative_positive", "negative > 0")
+ ])
+ value = get_metric_value(metrics, "Compliance", "negative_positive")
+ assert is_close(value, 0.0, FLOAT_TOLERANCE)
+
+
+class TestEntropyAnalyzer:
+ """Tests for the Entropy analyzer."""
+
+ def test_entropy_uniform(self, engine_entropy):
+ """Entropy is ln(n) for uniform distribution."""
+ metrics = engine_entropy.compute_metrics([Entropy("uniform")])
+ value = get_metric_value(metrics, "Entropy", "uniform")
+ # 4 equally distributed values: entropy = ln(4) ≈ 1.386 (matches Spark)
+ assert is_close(value, 1.3862943611198906, FLOAT_TOLERANCE)
+
+ def test_entropy_constant(self, engine_entropy):
+ """Entropy is 0 for constant column."""
+ metrics = engine_entropy.compute_metrics([Entropy("constant")])
+ value = get_metric_value(metrics, "Entropy", "constant")
+ assert is_close(value, 0.0, FLOAT_TOLERANCE)
+
+ def test_entropy_skewed(self, engine_entropy):
+ """Entropy is between 0 and max for skewed distribution."""
+ metrics = engine_entropy.compute_metrics([Entropy("skewed")])
+ value = get_metric_value(metrics, "Entropy", "skewed")
+ # Skewed distribution: 0 < entropy < ln(4) ≈ 1.386
+ assert 0.0 < value < 1.3862943611198906
+
+
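+# Worked derivation for the uniform case above: four equally likely values
+# give p = 1/4 each, so
+#
+#   H = -sum(p * ln(p)) = -4 * (1/4) * ln(1/4) = ln(4) ≈ 1.3862943611198906
+
+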
+class TestHistogramAnalyzer:
+ """Tests for the Histogram analyzer."""
+
+ def test_histogram_basic(self, engine_histogram):
+ """Histogram returns value distribution."""
+ metrics = engine_histogram.compute_metrics([Histogram("category")])
+ result = get_metric(metrics, "Histogram", "category")
+ assert result is not None
+ # Histogram value should be non-null (JSON or dict)
+
+
+class TestDataTypeAnalyzer:
+ """Tests for the DataType analyzer."""
+
+ def test_datatype_numeric(self, engine_data_type):
+ """DataType identifies numeric columns."""
+ metrics = engine_data_type.compute_metrics([DataType("pure_numeric")])
+ result = get_metric(metrics, "DataType", "pure_numeric")
+ assert result is not None
+
+ def test_datatype_string(self, engine_data_type):
+ """DataType identifies string columns."""
+ metrics = engine_data_type.compute_metrics([DataType("strings")])
+ result = get_metric(metrics, "DataType", "strings")
+ assert result is not None
+
+
+class TestMultipleAnalyzers:
+ """Tests for running multiple analyzers together."""
+
+ def test_multiple_basic_analyzers(self, engine_numeric):
+ """Multiple analyzers can be computed in one call."""
+ metrics = engine_numeric.compute_metrics([
+ Size(),
+ Mean("att1"),
+ Sum("att1"),
+ Minimum("att1"),
+ Maximum("att1"),
+ ])
+
+ assert len(metrics) >= 5
+ assert get_metric_value(metrics, "Size") == 6.0
+ assert is_close(get_metric_value(metrics, "Mean", "att1"), 3.5, FLOAT_TOLERANCE)
+ assert is_close(get_metric_value(metrics, "Sum", "att1"), 21.0, FLOAT_TOLERANCE)
+ assert is_close(get_metric_value(metrics, "Minimum", "att1"), 1.0, FLOAT_TOLERANCE)
+ assert is_close(get_metric_value(metrics, "Maximum", "att1"), 6.0, FLOAT_TOLERANCE)
+
+ def test_multiple_columns_same_analyzer(self, engine_full):
+ """Same analyzer type on multiple columns."""
+ metrics = engine_full.compute_metrics([
+ Completeness("att1"),
+ Completeness("att2"),
+ Completeness("item"),
+ ])
+
+ assert len(metrics) >= 3
+ assert is_close(get_metric_value(metrics, "Completeness", "att1"), 1.0, FLOAT_EPSILON)
+ assert is_close(get_metric_value(metrics, "Completeness", "att2"), 1.0, FLOAT_EPSILON)
+ assert is_close(get_metric_value(metrics, "Completeness", "item"), 1.0, FLOAT_EPSILON)
+
+ def test_mixed_analyzer_types(self, engine_full):
+ """Mix of different analyzer categories."""
+ metrics = engine_full.compute_metrics([
+ Size(),
+ Completeness("att1"),
+ CountDistinct(["att1"]),
+ MaxLength("att1"),
+ ])
+
+ assert get_metric_value(metrics, "Size") == 4.0
+ assert is_close(get_metric_value(metrics, "Completeness", "att1"), 1.0, FLOAT_EPSILON)
+ assert get_metric_value(metrics, "CountDistinct", "att1") == 3.0
+ assert get_metric_value(metrics, "MaxLength", "att1") == 1.0 # "a", "b", "c"
+
+
+class TestAnalyzersWithWhere:
+ """Tests for analyzers with WHERE clause filtering."""
+
+ def test_size_where_a(self, engine_where):
+ """Size with WHERE filters correctly."""
+ metrics = engine_where.compute_metrics([
+ Size(where="category = 'A'"),
+ Size(where="category = 'B'"),
+ ])
+ assert get_metric_value(metrics, "Size") == 2.0 # Both return 2
+
+ def test_completeness_where(self, engine_where):
+ """Completeness varies by WHERE filter."""
+ metrics = engine_where.compute_metrics([
+ Completeness("att1", where="category = 'A'"),
+ Completeness("att1", where="category = 'B'"),
+ ])
+ # Category A: att1 = ["x", "y"] -> 2/2 complete
+ # Category B: att1 = [None, "w"] -> 1/2 complete
+ # Both metrics share instance "att1", so assert presence only
+ a_completeness = get_metric_value(metrics, "Completeness", "att1")
+ assert a_completeness is not None
+
+ def test_mean_where(self, engine_where):
+ """Mean varies by WHERE filter."""
+ metrics = engine_where.compute_metrics([
+ Mean("value", where="category = 'A'"),
+ ])
+ # Category A: value = [10, 20] -> mean = 15
+ value = get_metric_value(metrics, "Mean", "value")
+ assert is_close(value, 15.0, FLOAT_TOLERANCE)
+
+
+class TestEdgeCases:
+ """Tests for edge cases and boundary conditions."""
+
+ def test_empty_dataset_all_analyzers(self, engine_empty):
+ """Empty dataset handles gracefully."""
+ metrics = engine_empty.compute_metrics([
+ Size(),
+ Completeness("att1"),
+ ])
+ assert get_metric_value(metrics, "Size") == 0.0
+
+ def test_all_null_column_stats(self, engine_all_null):
+ """All-NULL column returns appropriate values."""
+ metrics = engine_all_null.compute_metrics([
+ Completeness("value"),
+ Size(),
+ ])
+ assert is_close(get_metric_value(metrics, "Completeness", "value"), 0.0, FLOAT_EPSILON)
+ assert get_metric_value(metrics, "Size") == 3.0
+
+ def test_special_characters(self, engine_escape):
+ """Special characters in data are handled."""
+ metrics = engine_escape.compute_metrics([
+ Size(),
+ Completeness("att1"),
+ MaxLength("att1"),
+ ])
+ assert get_metric_value(metrics, "Size") == 8.0
+ assert is_close(get_metric_value(metrics, "Completeness", "att1"), 1.0, FLOAT_EPSILON)
diff --git a/tests/engines/test_duckdb_constraints.py b/tests/engines/test_duckdb_constraints.py
new file mode 100644
index 0000000..a1124ed
--- /dev/null
+++ b/tests/engines/test_duckdb_constraints.py
@@ -0,0 +1,641 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""DuckDB-only constraint tests.
+
+Tests all 32 constraint types against known expected values from the test datasets.
+These tests do not require Spark and can run quickly in CI.
+"""
+
+import pytest
+
+from pydeequ.v2.checks import Check, CheckLevel
+from pydeequ.v2.predicates import eq, gt, gte, lt, lte, between, is_one
+from pydeequ.v2.verification import VerificationSuite
+from pydeequ.engines import ConstraintStatus, CheckStatus
+
+
+def get_constraint_result(results, constraint_substring: str):
+ """Find a constraint result by substring match on constraint name."""
+ for r in results:
+ if constraint_substring in r.constraint:
+ return r
+ return None
+
+
+def get_check_result(results, check_description: str):
+ """Find results for a specific check by description."""
+ return [r for r in results if r.check_description == check_description]
+
+
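+# Typical lookup pattern with the helpers above (illustrative; `results`
+# comes from an engine fixture's run_checks, as in the tests below):
+#
+#   results = engine_full.run_checks([check])
+#   size_result = get_constraint_result(results, "Size")
+#   assert size_result is not None
+#   assert size_result.constraint_status == ConstraintStatus.Success
+
+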
+class TestSizeConstraint:
+ """Tests for hasSize constraint."""
+
+ def test_has_size_success(self, engine_full):
+ """hasSize succeeds when size equals expected."""
+ check = Check(CheckLevel.Error, "size check").hasSize(eq(4))
+ results = engine_full.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_size_failure(self, engine_full):
+ """hasSize fails when size doesn't match."""
+ check = Check(CheckLevel.Error, "size check").hasSize(eq(10))
+ results = engine_full.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_has_size_range(self, engine_full):
+ """hasSize with between predicate."""
+ check = Check(CheckLevel.Error, "size range").hasSize(between(3, 5))
+ results = engine_full.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_size_empty(self, engine_empty):
+ """hasSize correctly reports 0 for empty dataset."""
+ check = Check(CheckLevel.Error, "empty size").hasSize(eq(0))
+ results = engine_empty.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestCompletenessConstraints:
+ """Tests for completeness-related constraints."""
+
+ def test_is_complete_success(self, engine_full):
+ """isComplete succeeds for non-NULL column."""
+ check = Check(CheckLevel.Error, "complete").isComplete("att1")
+ results = engine_full.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_is_complete_failure(self, engine_missing):
+ """isComplete fails for column with NULLs."""
+ check = Check(CheckLevel.Error, "complete").isComplete("att1")
+ results = engine_missing.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_has_completeness_success(self, engine_missing):
+ """hasCompleteness succeeds when threshold is met."""
+ # att1 is 50% complete, check for >= 50%
+ check = Check(CheckLevel.Error, "partial complete").hasCompleteness("att1", gte(0.5))
+ results = engine_missing.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_completeness_failure(self, engine_missing):
+ """hasCompleteness fails when threshold not met."""
+ # att1 is 50% complete, check for >= 90%
+ check = Check(CheckLevel.Error, "high threshold").hasCompleteness("att1", gte(0.9))
+ results = engine_missing.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_are_complete_success(self, engine_full):
+ """areComplete succeeds when all columns are complete."""
+ check = Check(CheckLevel.Error, "multi complete").areComplete(["att1", "att2"])
+ results = engine_full.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_are_complete_failure(self, engine_missing):
+ """areComplete fails when any column has NULLs."""
+ check = Check(CheckLevel.Error, "multi complete").areComplete(["att1", "att2"])
+ results = engine_missing.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_have_completeness_success(self, engine_missing):
+ """haveCompleteness succeeds for combined column threshold."""
+ check = Check(CheckLevel.Error, "combined").haveCompleteness(["att1", "att2"], gte(0.5))
+ results = engine_missing.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestUniquenessConstraints:
+ """Tests for uniqueness-related constraints."""
+
+ def test_is_unique_success(self, engine_unique):
+ """isUnique succeeds when all values are unique."""
+ check = Check(CheckLevel.Error, "unique").isUnique("unique_col")
+ results = engine_unique.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_is_unique_failure(self, engine_unique):
+ """isUnique fails when there are duplicates."""
+ check = Check(CheckLevel.Error, "not unique").isUnique("non_unique")
+ results = engine_unique.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_has_uniqueness_success(self, engine_unique):
+ """hasUniqueness succeeds when threshold met."""
+ check = Check(CheckLevel.Error, "uniqueness").hasUniqueness(["unique_col"], is_one())
+ results = engine_unique.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_uniqueness_failure(self, engine_distinct):
+ """hasUniqueness fails when uniqueness is below threshold."""
+ # att1 has all duplicates, uniqueness = 0
+ check = Check(CheckLevel.Error, "low uniqueness").hasUniqueness(["att1"], gte(0.5))
+ results = engine_distinct.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_has_distinctness_success(self, engine_distinct):
+ """hasDistinctness succeeds when threshold met."""
+ # att2 has 6 distinct / 6 rows = 1.0
+ check = Check(CheckLevel.Error, "distinct").hasDistinctness(["att2"], is_one())
+ results = engine_distinct.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_distinctness_partial(self, engine_distinct):
+ """hasDistinctness with partial distinctness."""
+ # att1 has 3 distinct / 6 rows = 0.5
+ check = Check(CheckLevel.Error, "partial distinct").hasDistinctness(["att1"], gte(0.5))
+ results = engine_distinct.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_unique_value_ratio_success(self, engine_distinct):
+ """hasUniqueValueRatio succeeds for all-unique column."""
+ check = Check(CheckLevel.Error, "uvr").hasUniqueValueRatio(["att2"], is_one())
+ results = engine_distinct.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_unique_value_ratio_zero(self, engine_distinct):
+ """hasUniqueValueRatio for all-duplicated column."""
+ # att1: 0 unique / 3 distinct = 0
+ check = Check(CheckLevel.Error, "uvr zero").hasUniqueValueRatio(["att1"], eq(0))
+ results = engine_distinct.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestStatisticalConstraints:
+ """Tests for statistical constraints."""
+
+ def test_has_min_success(self, engine_numeric):
+ """hasMin succeeds when minimum matches."""
+ check = Check(CheckLevel.Error, "min").hasMin("att1", eq(1))
+ results = engine_numeric.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_min_failure(self, engine_numeric):
+ """hasMin fails when minimum doesn't match."""
+ check = Check(CheckLevel.Error, "min fail").hasMin("att1", eq(5))
+ results = engine_numeric.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_has_max_success(self, engine_numeric):
+ """hasMax succeeds when maximum matches."""
+ check = Check(CheckLevel.Error, "max").hasMax("att1", eq(6))
+ results = engine_numeric.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_max_failure(self, engine_numeric):
+ """hasMax fails when maximum doesn't match."""
+ check = Check(CheckLevel.Error, "max fail").hasMax("att1", eq(100))
+ results = engine_numeric.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_has_mean_success(self, engine_numeric):
+ """hasMean succeeds when mean matches."""
+ check = Check(CheckLevel.Error, "mean").hasMean("att1", eq(3.5))
+ results = engine_numeric.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_mean_range(self, engine_numeric):
+ """hasMean with range predicate."""
+ check = Check(CheckLevel.Error, "mean range").hasMean("att1", between(3.0, 4.0))
+ results = engine_numeric.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_sum_success(self, engine_numeric):
+ """hasSum succeeds when sum matches."""
+ check = Check(CheckLevel.Error, "sum").hasSum("att1", eq(21))
+ results = engine_numeric.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_standard_deviation_success(self, engine_numeric):
+ """hasStandardDeviation with range check."""
+ check = Check(CheckLevel.Error, "stddev").hasStandardDeviation("att1", between(1.5, 2.0))
+ results = engine_numeric.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_approx_count_distinct_success(self, engine_full):
+ """hasApproxCountDistinct succeeds when count is approximately correct."""
+ check = Check(CheckLevel.Error, "approx distinct").hasApproxCountDistinct("att1", between(2, 4))
+ results = engine_full.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestQuantileConstraints:
+ """Tests for quantile constraints."""
+
+ def test_has_approx_quantile_median(self, engine_quantile):
+ """hasApproxQuantile for median."""
+ check = Check(CheckLevel.Error, "median").hasApproxQuantile("value", 0.5, between(5.0, 6.0))
+ results = engine_quantile.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestCorrelationConstraints:
+ """Tests for correlation constraints."""
+
+ def test_has_correlation_positive(self, engine_correlation):
+ """hasCorrelation for perfectly correlated columns."""
+ check = Check(CheckLevel.Error, "positive corr").hasCorrelation("x", "y", is_one())
+ results = engine_correlation.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_correlation_negative(self, engine_correlation):
+ """hasCorrelation for negative correlation."""
+ check = Check(CheckLevel.Error, "negative corr").hasCorrelation("x", "z", eq(-1))
+ results = engine_correlation.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestEntropyConstraints:
+ """Tests for entropy constraints."""
+
+ def test_has_entropy_uniform(self, engine_entropy):
+ """hasEntropy for uniform distribution."""
+ # ln(4) ≈ 1.386 (matches Spark's natural log convention)
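+ # (uniform over 4 values: H = -sum(p * ln p) = -4 * (1/4) * ln(1/4) = ln(4))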
+ check = Check(CheckLevel.Error, "entropy").hasEntropy("uniform", between(1.38, 1.39))
+ results = engine_entropy.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_entropy_constant(self, engine_entropy):
+ """hasEntropy for constant column (entropy=0)."""
+ check = Check(CheckLevel.Error, "zero entropy").hasEntropy("constant", eq(0))
+ results = engine_entropy.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestMutualInformationConstraints:
+ """Tests for mutual information constraints."""
+
+ def test_has_mutual_information(self, engine_mutual_info):
+ """hasMutualInformation for dependent columns."""
+ check = Check(CheckLevel.Error, "mi").hasMutualInformation("x", "y_dependent", gt(0))
+ results = engine_mutual_info.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestStringLengthConstraints:
+ """Tests for string length constraints."""
+
+ def test_has_min_length_success(self, engine_string_lengths):
+ """hasMinLength for empty string (0)."""
+ check = Check(CheckLevel.Error, "min length").hasMinLength("att1", eq(0))
+ results = engine_string_lengths.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_min_length_failure(self, engine_string_lengths):
+ """hasMinLength fails when min length is higher."""
+ check = Check(CheckLevel.Error, "min length fail").hasMinLength("att1", gte(2))
+ results = engine_string_lengths.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_has_max_length_success(self, engine_string_lengths):
+ """hasMaxLength succeeds when max is correct."""
+ check = Check(CheckLevel.Error, "max length").hasMaxLength("att1", eq(4))
+ results = engine_string_lengths.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_max_length_bound(self, engine_string_lengths):
+ """hasMaxLength with upper bound."""
+ check = Check(CheckLevel.Error, "max bound").hasMaxLength("att1", lte(5))
+ results = engine_string_lengths.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestPatternConstraints:
+ """Tests for pattern matching constraints."""
+
+ def test_has_pattern_success(self, engine_full):
+ """hasPattern succeeds when pattern matches all rows."""
+ check = Check(CheckLevel.Error, "pattern").hasPattern("att1", r"^[a-c]$", is_one())
+ results = engine_full.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_pattern_partial(self, engine_pattern):
+ """hasPattern with partial match threshold."""
+ # Email pattern matches 4/6 rows
+ check = Check(CheckLevel.Error, "email pattern").hasPattern("email", r".*@.*\..*", gte(0.5))
+ results = engine_pattern.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_has_pattern_failure(self, engine_pattern):
+ """hasPattern fails when match rate is below threshold."""
+ check = Check(CheckLevel.Error, "strict pattern").hasPattern("email", r".*@.*\..*", is_one())
+ results = engine_pattern.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+
+class TestEmailUrlConstraints:
+ """Tests for email and URL pattern constraints."""
+
+ def test_contains_email_success(self, engine_pattern):
+ """containsEmail with threshold."""
+ check = Check(CheckLevel.Error, "email").containsEmail("email", gte(0.5))
+ results = engine_pattern.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_contains_url_failure(self, engine_pattern):
+ """containsURL fails for non-URL column."""
+ check = Check(CheckLevel.Error, "url").containsURL("email", gte(0.5))
+ results = engine_pattern.run_checks([check])
+ result = results[0]
+ # No URLs in email column
+ assert result.constraint_status == ConstraintStatus.Failure
+
+
+class TestNumericConstraints:
+ """Tests for numeric value constraints."""
+
+ def test_is_positive_success(self, engine_compliance):
+ """isPositive succeeds for all-positive column."""
+ check = Check(CheckLevel.Error, "positive").isPositive("positive", is_one())
+ results = engine_compliance.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_is_positive_failure(self, engine_compliance):
+ """isPositive fails for negative column."""
+ check = Check(CheckLevel.Error, "not positive").isPositive("negative", gte(0.5))
+ results = engine_compliance.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_is_non_negative_success(self, engine_compliance):
+ """isNonNegative for positive column."""
+ check = Check(CheckLevel.Error, "non-neg").isNonNegative("positive", is_one())
+ results = engine_compliance.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_is_non_negative_partial(self, engine_compliance):
+ """isNonNegative with partial compliance."""
+ # mixed: [-2,-1,0,1,2,3] -> 4/6 non-negative
+ check = Check(CheckLevel.Error, "partial non-neg").isNonNegative("mixed", gte(0.5))
+ results = engine_compliance.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestColumnComparisonConstraints:
+ """Tests for column comparison constraints."""
+
+ def test_is_less_than(self, engine_correlation):
+ """isLessThan for ordered columns."""
+ # x = [1,2,3,4,5], y = [2,4,6,8,10], so x < y always
+ check = Check(CheckLevel.Error, "less than").isLessThan("x", "y")
+ results = engine_correlation.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_is_less_than_or_equal_to(self, engine_correlation):
+ """isLessThanOrEqualTo for ordered columns."""
+ check = Check(CheckLevel.Error, "lte").isLessThanOrEqualTo("x", "y")
+ results = engine_correlation.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_is_greater_than(self, engine_correlation):
+ """isGreaterThan for reverse-ordered columns."""
+ # y > x always
+ check = Check(CheckLevel.Error, "greater than").isGreaterThan("y", "x")
+ results = engine_correlation.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_is_greater_than_or_equal_to(self, engine_correlation):
+ """isGreaterThanOrEqualTo for ordered columns."""
+ check = Check(CheckLevel.Error, "gte").isGreaterThanOrEqualTo("y", "x")
+ results = engine_correlation.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestContainedInConstraint:
+ """Tests for isContainedIn constraint."""
+
+ def test_is_contained_in_success(self, engine_contained_in):
+ """isContainedIn succeeds when all values are in allowed set."""
+ check = Check(CheckLevel.Error, "contained").isContainedIn(
+ "status", ["active", "inactive", "pending"], is_one()
+ )
+ results = engine_contained_in.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_is_contained_in_failure(self, engine_contained_in):
+ """isContainedIn fails when some values are not in set."""
+ check = Check(CheckLevel.Error, "not contained").isContainedIn(
+ "category", ["A", "B", "C"], is_one()
+ )
+ results = engine_contained_in.run_checks([check])
+ result = results[0]
+ # "D" is not in the allowed set
+ assert result.constraint_status == ConstraintStatus.Failure
+
+ def test_is_contained_in_partial(self, engine_contained_in):
+ """isContainedIn with threshold for partial match."""
+ check = Check(CheckLevel.Error, "partial contained").isContainedIn(
+ "category", ["A", "B", "C"], gte(0.8)
+ )
+ results = engine_contained_in.run_checks([check])
+ result = results[0]
+ # 5/6 = 0.833 in allowed set
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestSatisfiesConstraint:
+ """Tests for satisfies constraint."""
+
+ def test_satisfies_simple(self, engine_compliance):
+ """satisfies with simple predicate."""
+ check = Check(CheckLevel.Error, "satisfies").satisfies("positive > 0", "positive_check", is_one())
+ results = engine_compliance.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_satisfies_complex(self, engine_compliance):
+ """satisfies with complex predicate."""
+ check = Check(CheckLevel.Error, "complex").satisfies(
+ "mixed >= -2 AND mixed <= 3", "range_check", is_one()
+ )
+ results = engine_compliance.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_satisfies_partial(self, engine_compliance):
+ """satisfies with partial compliance."""
+ check = Check(CheckLevel.Error, "partial satisfies").satisfies("mixed > 0", "partial_check", gte(0.4))
+ results = engine_compliance.run_checks([check])
+ result = results[0]
+ # 3/6 = 0.5 > 0.4
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestCheckLevels:
+ """Tests for check levels (Error vs Warning)."""
+
+ def test_error_level_failure(self, engine_full):
+ """Error level check results in Error status on failure."""
+ check = Check(CheckLevel.Error, "error check").hasSize(eq(100))
+ results = engine_full.run_checks([check])
+ result = results[0]
+ assert result.check_level == "Error"
+ assert result.check_status == CheckStatus.Error
+
+ def test_warning_level_failure(self, engine_full):
+ """Warning level check results in Warning status on failure."""
+ check = Check(CheckLevel.Warning, "warning check").hasSize(eq(100))
+ results = engine_full.run_checks([check])
+ result = results[0]
+ assert result.check_level == "Warning"
+ assert result.check_status == CheckStatus.Warning
+
+ def test_error_level_success(self, engine_full):
+ """Error level check results in Success status on pass."""
+ check = Check(CheckLevel.Error, "pass check").hasSize(eq(4))
+ results = engine_full.run_checks([check])
+ result = results[0]
+ assert result.check_status == CheckStatus.Success
+
+
+class TestMultipleConstraints:
+ """Tests for multiple constraints in one check."""
+
+ def test_all_pass(self, engine_full):
+ """All constraints pass results in Success."""
+ check = (Check(CheckLevel.Error, "all pass")
+ .hasSize(eq(4))
+ .isComplete("att1")
+ .isComplete("att2"))
+ results = engine_full.run_checks([check])
+ assert all(r.constraint_status == ConstraintStatus.Success for r in results)
+ assert results[0].check_status == CheckStatus.Success
+
+ def test_some_fail(self, engine_missing):
+ """Some constraints fail results in overall failure."""
+ check = (Check(CheckLevel.Error, "some fail")
+ .hasSize(eq(12)) # Pass
+ .isComplete("att1") # Fail
+ .hasCompleteness("att2", gte(0.5))) # Pass
+ results = engine_missing.run_checks([check])
+ # Check that at least one constraint failed
+ failed = [r for r in results if r.constraint_status == ConstraintStatus.Failure]
+ assert len(failed) >= 1
+ # Overall check should fail
+ assert results[0].check_status == CheckStatus.Error
+
+ def test_multiple_checks(self, engine_numeric):
+ """Multiple checks can be run together."""
+ check1 = Check(CheckLevel.Error, "size check").hasSize(eq(6))
+ check2 = Check(CheckLevel.Error, "mean check").hasMean("att1", eq(3.5))
+ check3 = Check(CheckLevel.Warning, "sum check").hasSum("att1", eq(21))
+
+ results = engine_numeric.run_checks([check1, check2, check3])
+ # All should pass
+ assert len(results) == 3
+ assert all(r.constraint_status == ConstraintStatus.Success for r in results)
+
+
+class TestConstraintsWithWhere:
+ """Tests for constraints with WHERE clause filtering."""
+
+ @pytest.mark.skip(reason="WHERE clause support not yet implemented in Check API")
+ def test_completeness_where(self, engine_where):
+ """Completeness constraint with WHERE filter."""
+ check = Check(CheckLevel.Error, "filtered completeness").hasCompleteness(
+ "att1", is_one(), where="category = 'A'"
+ )
+ results = engine_where.run_checks([check])
+ result = results[0]
+ # Category A: att1 is complete
+ assert result.constraint_status == ConstraintStatus.Success
+
+ @pytest.mark.skip(reason="WHERE clause support not yet implemented in Check API")
+ def test_size_where(self, engine_where):
+ """Size constraint with WHERE filter."""
+ check = Check(CheckLevel.Error, "filtered size").hasSize(
+ eq(2), where="category = 'A'"
+ )
+ results = engine_where.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestEdgeCases:
+ """Tests for edge cases and boundary conditions."""
+
+ def test_empty_dataset(self, engine_empty):
+ """Constraints on empty dataset."""
+ check = (Check(CheckLevel.Error, "empty check")
+ .hasSize(eq(0)))
+ results = engine_empty.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
+
+ def test_single_row(self, engine_single):
+ """Constraints on single-row dataset."""
+ check = (Check(CheckLevel.Error, "single row")
+ .hasSize(eq(1))
+ .isComplete("att1")
+ .hasMin("item", eq(1))
+ .hasMax("item", eq(1)))
+ results = engine_single.run_checks([check])
+ assert all(r.constraint_status == ConstraintStatus.Success for r in results)
+
+ def test_all_null_column(self, engine_all_null):
+ """Constraints on all-NULL column."""
+ check = (Check(CheckLevel.Error, "all null")
+ .hasCompleteness("value", eq(0)))
+ results = engine_all_null.run_checks([check])
+ result = results[0]
+ assert result.constraint_status == ConstraintStatus.Success
diff --git a/tests/engines/test_duckdb_profiles.py b/tests/engines/test_duckdb_profiles.py
new file mode 100644
index 0000000..f105fa1
--- /dev/null
+++ b/tests/engines/test_duckdb_profiles.py
@@ -0,0 +1,267 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""DuckDB-only profiling tests.
+
+Tests the column profiling functionality of the DuckDB engine.
+"""
+
+import math
+import pytest
+
+from tests.engines.fixtures.datasets import (
+ FLOAT_EPSILON,
+ FLOAT_TOLERANCE,
+ is_close,
+)
+
+
+def get_profile_by_column(profiles, column_name: str):
+ """Find a column profile by column name."""
+ for p in profiles:
+ if p.column == column_name:
+ return p
+ return None
+
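+ # The profiling pattern exercised throughout this module, as a sketch (the
+ # engine_* fixtures are provided by the shared test fixtures for tests/engines):
+ #
+ # profiles = engine.profile_columns(columns=["att1"]) # -> list of ColumnProfile
+ # profile = get_profile_by_column(profiles, "att1")
+ # profile.completeness, profile.approx_distinct_values, profile.mean, ...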
+
+class TestBasicProfiling:
+ """Tests for basic profiling functionality."""
+
+ def test_profile_all_columns(self, engine_full):
+ """Profile returns data for all columns."""
+ profiles = engine_full.profile_columns()
+ assert len(profiles) >= 4 # att1, att2, item, price
+
+ def test_profile_specific_columns(self, engine_full):
+ """Profile can be restricted to specific columns."""
+ profiles = engine_full.profile_columns(columns=["att1", "item"])
+ column_names = [p.column for p in profiles]
+ assert "att1" in column_names
+ assert "item" in column_names
+
+ def test_profile_column_name(self, engine_full):
+ """Profile contains correct column names."""
+ profiles = engine_full.profile_columns()
+ column_names = [p.column for p in profiles]
+ assert "att1" in column_names
+ assert "att2" in column_names
+
+
+class TestCompletenessProfile:
+ """Tests for completeness in profiles."""
+
+ def test_completeness_full(self, engine_full):
+ """Completeness is 1.0 for complete columns."""
+ profiles = engine_full.profile_columns(columns=["att1"])
+ profile = get_profile_by_column(profiles, "att1")
+ assert is_close(profile.completeness, 1.0, FLOAT_EPSILON)
+
+ def test_completeness_partial(self, engine_missing):
+ """Completeness reflects NULL ratio."""
+ profiles = engine_missing.profile_columns(columns=["att1", "att2"])
+ att1_profile = get_profile_by_column(profiles, "att1")
+ att2_profile = get_profile_by_column(profiles, "att2")
+ assert is_close(att1_profile.completeness, 0.5, FLOAT_EPSILON) # 6/12
+ assert is_close(att2_profile.completeness, 0.75, FLOAT_EPSILON) # 9/12
+
+ def test_completeness_all_null(self, engine_all_null):
+ """Completeness is 0 for all-NULL column."""
+ profiles = engine_all_null.profile_columns(columns=["value"])
+ profile = get_profile_by_column(profiles, "value")
+ assert is_close(profile.completeness, 0.0, FLOAT_EPSILON)
+
+
+class TestDistinctValuesProfile:
+ """Tests for approximate distinct values in profiles."""
+
+ def test_distinct_values_unique(self, engine_unique):
+ """Distinct count for unique column."""
+ profiles = engine_unique.profile_columns(columns=["unique_col"])
+ profile = get_profile_by_column(profiles, "unique_col")
+ assert profile.approx_distinct_values == 6
+
+ def test_distinct_values_duplicates(self, engine_distinct):
+ """Distinct count handles duplicates correctly."""
+ profiles = engine_distinct.profile_columns(columns=["att1"])
+ profile = get_profile_by_column(profiles, "att1")
+ # att1: ["a", "a", "b", "b", "c", "c"] -> 3 distinct
+ assert profile.approx_distinct_values == 3
+
+
+class TestDataTypeProfile:
+ """Tests for data type detection in profiles."""
+
+ def test_data_type_string(self, engine_full):
+ """Data type detection for string column."""
+ profiles = engine_full.profile_columns(columns=["att1"])
+ profile = get_profile_by_column(profiles, "att1")
+ assert profile.data_type is not None
+ # Should be some form of string type
+ assert "str" in profile.data_type.lower() or "char" in profile.data_type.lower() or "text" in profile.data_type.lower() or "object" in profile.data_type.lower()
+
+ def test_data_type_numeric(self, engine_numeric):
+ """Data type detection for numeric column."""
+ profiles = engine_numeric.profile_columns(columns=["att1"])
+ profile = get_profile_by_column(profiles, "att1")
+ assert profile.data_type is not None
+
+
+class TestNumericProfileStatistics:
+ """Tests for numeric statistics in profiles."""
+
+ def test_mean_numeric(self, engine_numeric):
+ """Mean is calculated for numeric columns."""
+ profiles = engine_numeric.profile_columns(columns=["att1"])
+ profile = get_profile_by_column(profiles, "att1")
+ assert profile.mean is not None
+ assert is_close(profile.mean, 3.5, FLOAT_TOLERANCE)
+
+ def test_min_numeric(self, engine_numeric):
+ """Minimum is calculated for numeric columns."""
+ profiles = engine_numeric.profile_columns(columns=["att1"])
+ profile = get_profile_by_column(profiles, "att1")
+ assert profile.minimum is not None
+ assert is_close(profile.minimum, 1.0, FLOAT_TOLERANCE)
+
+ def test_max_numeric(self, engine_numeric):
+ """Maximum is calculated for numeric columns."""
+ profiles = engine_numeric.profile_columns(columns=["att1"])
+ profile = get_profile_by_column(profiles, "att1")
+ assert profile.maximum is not None
+ assert is_close(profile.maximum, 6.0, FLOAT_TOLERANCE)
+
+ def test_sum_numeric(self, engine_numeric):
+ """Sum is calculated for numeric columns."""
+ profiles = engine_numeric.profile_columns(columns=["att1"])
+ profile = get_profile_by_column(profiles, "att1")
+ assert profile.sum is not None
+ assert is_close(profile.sum, 21.0, FLOAT_TOLERANCE)
+
+ def test_stddev_numeric(self, engine_numeric):
+ """Standard deviation is calculated for numeric columns."""
+ profiles = engine_numeric.profile_columns(columns=["att1"])
+ profile = get_profile_by_column(profiles, "att1")
+ if profile.std_dev is not None:
+ # Population stddev (matches Spark)
+ assert is_close(profile.std_dev, 1.7078251276599330, FLOAT_TOLERANCE)
+
+ def test_numeric_with_nulls(self, engine_numeric):
+ """Numeric statistics handle NULLs correctly."""
+ profiles = engine_numeric.profile_columns(columns=["att2"])
+ profile = get_profile_by_column(profiles, "att2")
+ # att2 has values [1,2,3,4,5,NULL]
+ if profile.mean is not None:
+ assert is_close(profile.mean, 3.0, FLOAT_TOLERANCE) # (1+2+3+4+5)/5
+
+
+class TestStringProfileStatistics:
+ """Tests for string column profiles."""
+
+ def test_string_column_no_numeric_stats(self, engine_full):
+ """String columns don't have numeric statistics."""
+ profiles = engine_full.profile_columns(columns=["att1"])
+ profile = get_profile_by_column(profiles, "att1")
+ # Numeric statistics for a string column are implementation dependent
+ # (typically None), so just verify a profile comes back
+ assert profile is not None
+ assert profile.completeness is not None
+
+
+class TestHistogramProfile:
+ """Tests for histogram in profiles."""
+
+ def test_histogram_low_cardinality(self, engine_histogram):
+ """Histogram is generated for low cardinality columns."""
+ profiles = engine_histogram.profile_columns(
+ columns=["category"],
+ low_cardinality_threshold=10
+ )
+ profile = get_profile_by_column(profiles, "category")
+ # Should have histogram for 4-value column with threshold 10
+ if profile.histogram is not None:
+ assert len(profile.histogram) > 0
+
+ def test_histogram_high_cardinality(self, engine_unique):
+ """Histogram might not be generated for high cardinality columns."""
+ profiles = engine_unique.profile_columns(
+ columns=["unique_col"],
+ low_cardinality_threshold=3
+ )
+ profile = get_profile_by_column(profiles, "unique_col")
+ # With 6 distinct and threshold 3, might skip histogram
+ assert profile is not None
+
+
+class TestQuantileProfile:
+ """Tests for quantile/percentile information in profiles."""
+
+ def test_percentiles_numeric(self, engine_quantile):
+ """Percentiles are calculated for numeric columns."""
+ profiles = engine_quantile.profile_columns(columns=["value"])
+ profile = get_profile_by_column(profiles, "value")
+ # Check for percentile attributes if present
+ if hasattr(profile, 'approx_percentiles') and profile.approx_percentiles:
+ # Should have some percentile data
+ assert len(profile.approx_percentiles) > 0
+
+
+class TestEdgeCases:
+ """Tests for edge cases in profiling."""
+
+ def test_empty_dataset(self, engine_empty):
+ """Profiling empty dataset."""
+ profiles = engine_empty.profile_columns()
+ # Should return profiles (possibly with default/None values)
+ assert isinstance(profiles, list)
+
+ def test_single_row(self, engine_single):
+ """Profiling single-row dataset."""
+ profiles = engine_single.profile_columns(columns=["att1", "item"])
+ att1_profile = get_profile_by_column(profiles, "att1")
+ item_profile = get_profile_by_column(profiles, "item")
+
+ assert att1_profile.completeness == 1.0
+ assert att1_profile.approx_distinct_values == 1
+
+ if item_profile.mean is not None:
+ assert item_profile.mean == 1.0
+ if item_profile.minimum is not None:
+ assert item_profile.minimum == 1.0
+ if item_profile.maximum is not None:
+ assert item_profile.maximum == 1.0
+
+ def test_all_null_column(self, engine_all_null):
+ """Profiling all-NULL column."""
+ profiles = engine_all_null.profile_columns(columns=["value"])
+ profile = get_profile_by_column(profiles, "value")
+ assert profile.completeness == 0.0
+ # Statistics should be None or NaN for an all-NULL column
+ # (some implementations may return 0 instead)
+ assert profile.mean is None or math.isnan(profile.mean) or profile.mean == 0.0
+
+
+class TestProfileDataFrame:
+ """Tests for profile to DataFrame conversion."""
+
+ def test_profiles_to_dataframe(self, engine_full):
+ """Profiles can be converted to DataFrame."""
+ profiles = engine_full.profile_columns()
+ df = engine_full.profiles_to_dataframe(profiles)
+
+ assert df is not None
+ assert len(df) > 0
+ assert "column" in df.columns
+ assert "completeness" in df.columns
diff --git a/tests/engines/test_duckdb_suggestions.py b/tests/engines/test_duckdb_suggestions.py
new file mode 100644
index 0000000..3da9a51
--- /dev/null
+++ b/tests/engines/test_duckdb_suggestions.py
@@ -0,0 +1,287 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""DuckDB-only constraint suggestion tests.
+
+Tests the constraint suggestion functionality of the DuckDB engine.
+"""
+
+import pytest
+
+from pydeequ.v2.suggestions import Rules
+
+
+def get_suggestions_for_column(suggestions, column_name: str):
+ """Get all suggestions for a specific column."""
+ return [s for s in suggestions if s.column_name == column_name]
+
+
+def get_suggestions_by_constraint(suggestions, constraint_name: str):
+ """Get all suggestions for a specific constraint type."""
+ return [s for s in suggestions if constraint_name in s.constraint_name]
+
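+ # The suggestion flow exercised below, as a sketch (Rules comes from
+ # pydeequ.v2.suggestions; the engine_* fixtures are shared test fixtures):
+ #
+ # suggestions = engine.suggest_constraints(columns=["att1"], rules=[Rules.DEFAULT])
+ # for s in suggestions:
+ # s.column_name, s.constraint_name, s.description, s.code_for_constraint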
+
+class TestBasicSuggestions:
+ """Tests for basic suggestion functionality."""
+
+ def test_default_rules_generate_suggestions(self, engine_full):
+ """DEFAULT rules generate suggestions for complete columns."""
+ suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT])
+ # Should generate some suggestions for complete data
+ assert isinstance(suggestions, list)
+
+ def test_suggestions_have_required_fields(self, engine_full):
+ """Suggestions contain all required fields."""
+ suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT])
+ if suggestions:
+ suggestion = suggestions[0]
+ assert hasattr(suggestion, 'column_name')
+ assert hasattr(suggestion, 'constraint_name')
+ assert hasattr(suggestion, 'description')
+ assert hasattr(suggestion, 'suggesting_rule')
+
+ def test_restrict_to_columns(self, engine_full):
+ """Suggestions can be restricted to specific columns."""
+ suggestions = engine_full.suggest_constraints(
+ columns=["att1"],
+ rules=[Rules.DEFAULT]
+ )
+ # All suggestions should be for att1 (or dataset-level)
+ column_suggestions = [s for s in suggestions if s.column_name]
+ for s in column_suggestions:
+ assert s.column_name == "att1" or s.column_name is None
+
+
+class TestCompletenessRuleSuggestions:
+ """Tests for completeness-related suggestions."""
+
+ def test_complete_column_suggestions(self, engine_full):
+ """Complete columns get completeness suggestions."""
+ suggestions = engine_full.suggest_constraints(
+ columns=["att1"],
+ rules=[Rules.DEFAULT]
+ )
+ # Should suggest isComplete or hasCompleteness for a complete column,
+ # though whether the rule fires is implementation dependent
+ completeness_suggestions = get_suggestions_by_constraint(suggestions, "Complete")
+ assert isinstance(completeness_suggestions, list)
+
+ def test_incomplete_column_suggestions(self, engine_missing):
+ """Incomplete columns may get retain completeness suggestions."""
+ suggestions = engine_missing.suggest_constraints(
+ columns=["att1"],
+ rules=[Rules.DEFAULT]
+ )
+ # att1 is 50% complete - might suggest retaining that level
+ assert isinstance(suggestions, list)
+
+
+class TestUniquenessRuleSuggestions:
+ """Tests for uniqueness-related suggestions."""
+
+ def test_unique_column_suggestions(self, engine_unique):
+ """Unique columns get uniqueness suggestions with COMMON rules."""
+ suggestions = engine_unique.suggest_constraints(
+ columns=["unique_col"],
+ rules=[Rules.COMMON]
+ )
+ # Should suggest isUnique or hasUniqueness for a unique column,
+ # though whether the rule fires is implementation dependent
+ uniqueness_suggestions = get_suggestions_by_constraint(suggestions, "Unique")
+ assert isinstance(uniqueness_suggestions, list)
+
+
+class TestNumericalRuleSuggestions:
+ """Tests for numerical constraint suggestions."""
+
+ def test_numeric_column_suggestions(self, engine_numeric):
+ """Numeric columns get statistical suggestions with NUMERICAL rules."""
+ suggestions = engine_numeric.suggest_constraints(
+ columns=["att1"],
+ rules=[Rules.NUMERICAL]
+ )
+ # Should suggest hasMin, hasMax, hasMean for numeric column
+ assert isinstance(suggestions, list)
+
+ def test_min_max_suggestions(self, engine_numeric):
+ """Numeric columns may get min/max suggestions."""
+ suggestions = engine_numeric.suggest_constraints(
+ columns=["att1"],
+ rules=[Rules.NUMERICAL]
+ )
+ min_suggestions = get_suggestions_by_constraint(suggestions, "Min")
+ max_suggestions = get_suggestions_by_constraint(suggestions, "Max")
+ # Whether min/max suggestions are produced is implementation dependent
+ assert isinstance(min_suggestions, list)
+ assert isinstance(max_suggestions, list)
+
+
+class TestStringRuleSuggestions:
+ """Tests for string-related suggestions."""
+
+ def test_string_column_suggestions(self, engine_string_lengths):
+ """String columns get length suggestions with STRING rules."""
+ suggestions = engine_string_lengths.suggest_constraints(
+ columns=["att1"],
+ rules=[Rules.STRING]
+ )
+ # Should suggest hasMinLength, hasMaxLength for string column
+ assert isinstance(suggestions, list)
+
+
+class TestCategoricalRuleSuggestions:
+ """Tests for categorical constraint suggestions."""
+
+ def test_categorical_column_suggestions(self, engine_contained_in):
+ """Low-cardinality columns may get containment suggestions."""
+ suggestions = engine_contained_in.suggest_constraints(
+ columns=["status"],
+ rules=[Rules.DEFAULT]
+ )
+ # May suggest isContainedIn for categorical column
+ assert isinstance(suggestions, list)
+
+
+class TestMultipleRules:
+ """Tests for combining multiple rule sets."""
+
+ def test_extended_rules(self, engine_numeric):
+ """EXTENDED rules combine all rule sets."""
+ suggestions = engine_numeric.suggest_constraints(
+ rules=[Rules.EXTENDED]
+ )
+ # Should get suggestions from all rule categories
+ assert isinstance(suggestions, list)
+
+ def test_multiple_rule_sets(self, engine_numeric):
+ """Multiple rule sets can be combined."""
+ suggestions = engine_numeric.suggest_constraints(
+ rules=[Rules.DEFAULT, Rules.NUMERICAL]
+ )
+ assert isinstance(suggestions, list)
+
+
+class TestSuggestionContent:
+ """Tests for suggestion content quality."""
+
+ def test_suggestion_has_description(self, engine_full):
+ """Suggestions include human-readable descriptions."""
+ suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT])
+ if suggestions:
+ for s in suggestions:
+ assert s.description is not None
+ assert len(s.description) > 0
+
+ def test_suggestion_has_rule_name(self, engine_full):
+ """Suggestions identify the suggesting rule."""
+ suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT])
+ if suggestions:
+ for s in suggestions:
+ assert s.suggesting_rule is not None
+
+ def test_suggestion_has_current_value(self, engine_full):
+ """Suggestions include current metric value."""
+ suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT])
+ if suggestions:
+ for s in suggestions:
+ # current_value may be present
+ assert hasattr(s, 'current_value')
+
+ def test_suggestion_has_code_snippet(self, engine_full):
+ """Suggestions may include code for constraint."""
+ suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT])
+ if suggestions:
+ for s in suggestions:
+ # code_for_constraint may be present
+ assert hasattr(s, 'code_for_constraint')
+
+
+class TestEdgeCases:
+ """Tests for edge cases in suggestions."""
+
+ def test_empty_dataset_suggestions(self, engine_empty):
+ """Suggestions on empty dataset."""
+ suggestions = engine_empty.suggest_constraints(rules=[Rules.DEFAULT])
+ # Should handle gracefully
+ assert isinstance(suggestions, list)
+
+ def test_single_row_suggestions(self, engine_single):
+ """Suggestions on single-row dataset."""
+ suggestions = engine_single.suggest_constraints(rules=[Rules.DEFAULT])
+ assert isinstance(suggestions, list)
+
+ def test_all_null_column_suggestions(self, engine_all_null):
+ """Suggestions on all-NULL column."""
+ suggestions = engine_all_null.suggest_constraints(
+ columns=["value"],
+ rules=[Rules.DEFAULT]
+ )
+ # Should handle all-NULL gracefully
+ assert isinstance(suggestions, list)
+
+
+class TestSuggestionDataFrame:
+ """Tests for suggestion to DataFrame conversion."""
+
+ def test_suggestions_to_dataframe(self, engine_full):
+ """Suggestions can be converted to DataFrame."""
+ suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT])
+ df = engine_full.suggestions_to_dataframe(suggestions)
+
+ assert df is not None
+ if len(suggestions) > 0:
+ assert len(df) > 0
+ assert "column_name" in df.columns
+ assert "constraint_name" in df.columns
+
+
+class TestDatasetSpecificSuggestions:
+ """Tests for suggestions on specific dataset types."""
+
+ def test_numeric_dataset(self, engine_numeric):
+ """Numeric dataset gets appropriate suggestions."""
+ suggestions = engine_numeric.suggest_constraints(
+ rules=[Rules.DEFAULT, Rules.NUMERICAL]
+ )
+ # Should have suggestions for numeric columns
+ numeric_suggestions = get_suggestions_for_column(suggestions, "att1")
+ assert isinstance(numeric_suggestions, list)
+
+ def test_string_dataset(self, engine_string_lengths):
+ """String dataset gets appropriate suggestions."""
+ suggestions = engine_string_lengths.suggest_constraints(
+ rules=[Rules.DEFAULT, Rules.STRING]
+ )
+ string_suggestions = get_suggestions_for_column(suggestions, "att1")
+ assert isinstance(string_suggestions, list)
+
+ def test_mixed_type_dataset(self, engine_full):
+ """Mixed-type dataset handles all columns."""
+ suggestions = engine_full.suggest_constraints(
+ rules=[Rules.EXTENDED]
+ )
+ # Should have suggestions for different column types
+ assert isinstance(suggestions, list)
+
+
+class TestNonNegativeRuleSuggestions:
+ """Tests for non-negative number suggestions."""
+
+ def test_positive_column_suggestions(self, engine_compliance):
+ """All-positive columns may get non-negative suggestions."""
+ suggestions = engine_compliance.suggest_constraints(
+ columns=["positive"],
+ rules=[Rules.DEFAULT]
+ )
+ # May suggest isNonNegative for positive column
+ assert isinstance(suggestions, list)
diff --git a/tests/engines/test_operators.py b/tests/engines/test_operators.py
new file mode 100644
index 0000000..62ccd24
--- /dev/null
+++ b/tests/engines/test_operators.py
@@ -0,0 +1,484 @@
+# -*- coding: utf-8 -*-
+"""
+Unit tests for SQL operators.
+
+These tests verify the operator abstractions work correctly in isolation,
+testing SQL generation and result extraction separately from actual
+database execution.
+"""
+
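+ # The split under test, as a sketch: an operator emits SQL aggregation
+ # fragments, the caller executes them (here against DuckDB), and the operator
+ # turns the resulting row back into a MetricResult:
+ #
+ # op = MeanOperator("price")
+ # aggs = op.get_aggregations() # e.g. ["AVG(price) AS mean_price"]
+ # df = conn.execute(f"SELECT {', '.join(aggs)} FROM t").fetchdf()
+ # metric = op.extract_result(df) # MetricResult(name="Mean", value=...)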
+import pandas as pd
+import pytest
+
+from pydeequ.engines import MetricResult
+from pydeequ.engines.operators import (
+ # Scan operators
+ SizeOperator,
+ CompletenessOperator,
+ MeanOperator,
+ SumOperator,
+ MinimumOperator,
+ MaximumOperator,
+ StandardDeviationOperator,
+ MaxLengthOperator,
+ MinLengthOperator,
+ PatternMatchOperator,
+ ComplianceOperator,
+ CorrelationOperator,
+ CountDistinctOperator,
+ ApproxCountDistinctOperator,
+ # Grouping operators
+ DistinctnessOperator,
+ UniquenessOperator,
+ UniqueValueRatioOperator,
+ EntropyOperator,
+ MutualInformationOperator,
+ # Factory
+ OperatorFactory,
+ # Mixins
+ WhereClauseMixin,
+ SafeExtractMixin,
+ ColumnAliasMixin,
+)
+
+
+class TestWhereClauseMixin:
+ """Tests for WhereClauseMixin."""
+
+ def test_wrap_agg_with_where_no_condition(self):
+ """Test wrapping aggregation without WHERE clause."""
+ class TestClass(WhereClauseMixin):
+ where = None
+
+ obj = TestClass()
+ result = obj.wrap_agg_with_where("AVG", "price")
+ assert result == "AVG(price)"
+
+ def test_wrap_agg_with_where_with_condition(self):
+ """Test wrapping aggregation with WHERE clause."""
+ class TestClass(WhereClauseMixin):
+ where = "status = 'active'"
+
+ obj = TestClass()
+ result = obj.wrap_agg_with_where("AVG", "price")
+ assert result == "AVG(CASE WHEN status = 'active' THEN price ELSE NULL END)"
+
+ def test_wrap_count_with_where_no_condition(self):
+ """Test wrapping COUNT without WHERE clause."""
+ class TestClass(WhereClauseMixin):
+ where = None
+
+ obj = TestClass()
+ result = obj.wrap_count_with_where()
+ assert result == "COUNT(*)"
+
+ def test_wrap_count_with_where_with_condition(self):
+ """Test wrapping COUNT with WHERE clause."""
+ class TestClass(WhereClauseMixin):
+ where = "status = 'active'"
+
+ obj = TestClass()
+ result = obj.wrap_count_with_where()
+ assert result == "SUM(CASE WHEN status = 'active' THEN 1 ELSE 0 END)"
+
+ def test_wrap_count_with_where_custom_condition(self):
+ """Test wrapping COUNT with custom condition and WHERE clause."""
+ class TestClass(WhereClauseMixin):
+ where = "status = 'active'"
+
+ obj = TestClass()
+ result = obj.wrap_count_with_where("price > 0")
+ assert "status = 'active'" in result
+ assert "price > 0" in result
+
+
+class TestSafeExtractMixin:
+ """Tests for SafeExtractMixin."""
+
+ def test_safe_float_valid(self):
+ """Test extracting valid float value."""
+ class TestClass(SafeExtractMixin):
+ pass
+
+ obj = TestClass()
+ df = pd.DataFrame({"value": [42.5]})
+ result = obj.safe_float(df, "value")
+ assert result == 42.5
+
+ def test_safe_float_none(self):
+ """Test extracting None value."""
+ class TestClass(SafeExtractMixin):
+ pass
+
+ obj = TestClass()
+ df = pd.DataFrame({"value": [None]})
+ result = obj.safe_float(df, "value")
+ assert result is None
+
+ def test_safe_float_missing_column(self):
+ """Test extracting from missing column."""
+ class TestClass(SafeExtractMixin):
+ pass
+
+ obj = TestClass()
+ df = pd.DataFrame({"other": [42.5]})
+ result = obj.safe_float(df, "value")
+ assert result is None
+
+ def test_safe_int(self):
+ """Test extracting integer value."""
+ class TestClass(SafeExtractMixin):
+ pass
+
+ obj = TestClass()
+ df = pd.DataFrame({"value": [42.7]})
+ result = obj.safe_int(df, "value")
+ assert result == 42
+
+
+class TestColumnAliasMixin:
+ """Tests for ColumnAliasMixin."""
+
+ def test_make_alias_single_part(self):
+ """Test alias with single part."""
+ class TestClass(ColumnAliasMixin):
+ pass
+
+ obj = TestClass()
+ result = obj.make_alias("mean", "price")
+ assert result == "mean_price"
+
+ def test_make_alias_multiple_parts(self):
+ """Test alias with multiple parts."""
+ class TestClass(ColumnAliasMixin):
+ pass
+
+ obj = TestClass()
+ result = obj.make_alias("corr", "price", "quantity")
+ assert result == "corr_price_quantity"
+
+ def test_make_alias_sanitization(self):
+ """Test alias sanitizes special characters."""
+ class TestClass(ColumnAliasMixin):
+ pass
+
+ obj = TestClass()
+ result = obj.make_alias("mean", "table.column")
+ assert result == "mean_table_column"
+
+
+class TestSizeOperator:
+ """Tests for SizeOperator."""
+
+ def test_get_aggregations_no_where(self):
+ """Test SQL generation without WHERE clause."""
+ op = SizeOperator()
+ aggs = op.get_aggregations()
+ assert len(aggs) == 1
+ assert "COUNT(*)" in aggs[0]
+ assert "size_value" in aggs[0]
+
+ def test_get_aggregations_with_where(self):
+ """Test SQL generation with WHERE clause."""
+ op = SizeOperator(where="status = 'active'")
+ aggs = op.get_aggregations()
+ assert len(aggs) == 1
+ assert "SUM(CASE WHEN" in aggs[0]
+ assert "status = 'active'" in aggs[0]
+
+ def test_extract_result(self):
+ """Test result extraction."""
+ op = SizeOperator()
+ df = pd.DataFrame({"size_value": [100]})
+ result = op.extract_result(df)
+ assert result.name == "Size"
+ assert result.instance == "*"
+ assert result.entity == "Dataset"
+ assert result.value == 100.0
+
+
+class TestCompletenessOperator:
+ """Tests for CompletenessOperator."""
+
+ def test_get_aggregations(self):
+ """Test SQL generation."""
+ op = CompletenessOperator("email")
+ aggs = op.get_aggregations()
+ assert len(aggs) == 2
+ assert any("count_email" in agg for agg in aggs)
+ assert any("null_count_email" in agg for agg in aggs)
+
+ def test_extract_result_complete(self):
+ """Test result extraction with complete data."""
+ op = CompletenessOperator("email")
+ df = pd.DataFrame({
+ "count_email": [100],
+ "null_count_email": [0],
+ })
+ result = op.extract_result(df)
+ assert result.value == 1.0
+
+ def test_extract_result_partial(self):
+ """Test result extraction with partial data."""
+ op = CompletenessOperator("email")
+ df = pd.DataFrame({
+ "count_email": [100],
+ "null_count_email": [20],
+ })
+ result = op.extract_result(df)
+ assert result.value == 0.8
+
+
+class TestMeanOperator:
+ """Tests for MeanOperator."""
+
+ def test_get_aggregations(self):
+ """Test SQL generation."""
+ op = MeanOperator("price")
+ aggs = op.get_aggregations()
+ assert len(aggs) == 1
+ assert "AVG(price)" in aggs[0]
+ assert "mean_price" in aggs[0]
+
+ def test_extract_result(self):
+ """Test result extraction."""
+ op = MeanOperator("price")
+ df = pd.DataFrame({"mean_price": [42.5]})
+ result = op.extract_result(df)
+ assert result.name == "Mean"
+ assert result.instance == "price"
+ assert result.value == 42.5
+
+
+class TestPatternMatchOperator:
+ """Tests for PatternMatchOperator."""
+
+ def test_get_aggregations(self):
+ """Test SQL generation."""
+ op = PatternMatchOperator("email", r"^.+@.+\..+$")
+ aggs = op.get_aggregations()
+ assert len(aggs) == 2
+ assert any("count_email" in agg for agg in aggs)
+ assert any("pattern_match_email" in agg for agg in aggs)
+ assert any("REGEXP_MATCHES" in agg for agg in aggs)
+
+ def test_extract_result(self):
+ """Test result extraction."""
+ op = PatternMatchOperator("email", r"^.+@.+\..+$")
+ df = pd.DataFrame({
+ "count_email": [100],
+ "pattern_match_email": [95],
+ })
+ result = op.extract_result(df)
+ assert result.name == "PatternMatch"
+ assert result.value == 0.95
+
+
+class TestDistinctnessOperator:
+ """Tests for DistinctnessOperator."""
+
+ def test_get_grouping_columns(self):
+ """Test grouping columns."""
+ op = DistinctnessOperator(["category"])
+ assert op.get_grouping_columns() == ["category"]
+
+ def test_build_query(self):
+ """Test query building."""
+ op = DistinctnessOperator(["category"])
+ query = op.build_query("products")
+ assert "SELECT category" in query
+ assert "GROUP BY category" in query
+ assert "distinct_count" in query
+ assert "total_count" in query
+
+ def test_extract_result(self):
+ """Test result extraction."""
+ op = DistinctnessOperator(["category"])
+ df = pd.DataFrame({
+ "distinct_count": [10],
+ "total_count": [100],
+ })
+ result = op.extract_result(df)
+ assert result.name == "Distinctness"
+ assert result.value == 0.1
+
+
+class TestUniquenessOperator:
+ """Tests for UniquenessOperator."""
+
+ def test_build_query(self):
+ """Test query building."""
+ op = UniquenessOperator(["id"])
+ query = op.build_query("users")
+ assert "GROUP BY id" in query
+ assert "HAVING" not in query # HAVING is used in the inner query
+ assert "unique_count" in query
+ assert "total_count" in query
+
+ def test_extract_result(self):
+ """Test result extraction."""
+ op = UniquenessOperator(["id"])
+ df = pd.DataFrame({
+ "unique_count": [90],
+ "total_count": [100],
+ })
+ result = op.extract_result(df)
+ assert result.name == "Uniqueness"
+ assert result.value == 0.9
+
+
+class TestEntropyOperator:
+ """Tests for EntropyOperator."""
+
+ def test_build_query(self):
+ """Test query building."""
+ op = EntropyOperator("category")
+ query = op.build_query("products")
+ assert "GROUP BY category" in query
+ assert "LN" in query
+ assert "entropy" in query
+
+ def test_extract_result(self):
+ """Test result extraction."""
+ op = EntropyOperator("category")
+ df = pd.DataFrame({"entropy": [2.5]})
+ result = op.extract_result(df)
+ assert result.name == "Entropy"
+ assert result.value == 2.5
+
+
+class TestOperatorFactory:
+ """Tests for OperatorFactory."""
+
+ def test_is_scan_operator(self):
+ """Test scan operator detection."""
+ from pydeequ.v2.analyzers import Mean, Sum, Completeness
+
+ assert OperatorFactory.is_scan_operator(Mean("price"))
+ assert OperatorFactory.is_scan_operator(Sum("amount"))
+ assert OperatorFactory.is_scan_operator(Completeness("email"))
+
+ def test_is_grouping_operator(self):
+ """Test grouping operator detection."""
+ from pydeequ.v2.analyzers import Distinctness, Uniqueness, Entropy
+
+ assert OperatorFactory.is_grouping_operator(Distinctness("category"))
+ assert OperatorFactory.is_grouping_operator(Uniqueness("id"))
+ assert OperatorFactory.is_grouping_operator(Entropy("status"))
+
+ def test_create_scan_operator(self):
+ """Test creating scan operator from analyzer."""
+ from pydeequ.v2.analyzers import Mean
+
+ analyzer = Mean("price", where="status = 'active'")
+ operator = OperatorFactory.create(analyzer)
+
+ assert operator is not None
+ assert isinstance(operator, MeanOperator)
+ assert operator.column == "price"
+ assert operator.where == "status = 'active'"
+
+ def test_create_grouping_operator(self):
+ """Test creating grouping operator from analyzer."""
+ from pydeequ.v2.analyzers import Distinctness
+
+ analyzer = Distinctness(["category", "brand"])
+ operator = OperatorFactory.create(analyzer)
+
+ assert operator is not None
+ assert isinstance(operator, DistinctnessOperator)
+ assert operator.columns == ["category", "brand"]
+
+ def test_is_supported(self):
+ """Test analyzer support checking."""
+ from pydeequ.v2.analyzers import Mean, Histogram, ApproxQuantile, DataType
+
+ assert OperatorFactory.is_supported(Mean("price"))
+ # Histogram and ApproxQuantile are now supported as operators
+ assert OperatorFactory.is_supported(Histogram("category"))
+ assert OperatorFactory.is_supported(ApproxQuantile("price", 0.5))
+ # DataType is now supported via the metadata registry
+ assert OperatorFactory.is_supported(DataType("category"))
+ assert OperatorFactory.is_metadata_operator(DataType("category"))
+
+
+class TestOperatorIntegration:
+ """Integration tests for operators with actual DuckDB."""
+
+ @pytest.fixture
+ def duckdb_conn(self):
+ """Create a DuckDB connection with test data."""
+ import duckdb
+
+ conn = duckdb.connect(":memory:")
+ conn.execute("""
+ CREATE TABLE test_data AS SELECT * FROM (
+ VALUES
+ (1, 'Alice', 100.0, 'A', 'active'),
+ (2, 'Bob', 200.0, 'B', 'active'),
+ (3, 'Carol', 150.0, 'A', 'inactive'),
+ (4, 'Dave', NULL, 'C', 'active'),
+ (5, 'Eve', 300.0, 'A', 'active')
+ ) AS t(id, name, amount, category, status)
+ """)
+ yield conn
+ conn.close()
+
+ def test_scan_operators_batch_execution(self, duckdb_conn):
+ """Test batch execution of multiple scan operators."""
+ operators = [
+ SizeOperator(),
+ MeanOperator("amount"),
+ MaximumOperator("amount"),
+ MinimumOperator("amount"),
+ ]
+
+ # Collect all aggregations
+ aggregations = []
+ for op in operators:
+ aggregations.extend(op.get_aggregations())
+
+ # Execute single query
+ query = f"SELECT {', '.join(aggregations)} FROM test_data"
+ result = duckdb_conn.execute(query).fetchdf()
+
+ # Extract results
+ results = [op.extract_result(result) for op in operators]
+
+ assert results[0].value == 5.0 # Size
+ assert results[1].value == 187.5 # Mean (750/4 non-null)
+ assert results[2].value == 300.0 # Maximum
+ assert results[3].value == 100.0 # Minimum
+
+ def test_grouping_operator_execution(self, duckdb_conn):
+ """Test execution of grouping operator."""
+ op = DistinctnessOperator(["category"])
+ query = op.build_query("test_data")
+ result = duckdb_conn.execute(query).fetchdf()
+ metric = op.extract_result(result)
+
+ # 3 distinct categories / 5 rows = 0.6
+ assert metric.name == "Distinctness"
+ assert metric.value == 0.6
+
+ def test_completeness_operator(self, duckdb_conn):
+ """Test completeness operator with NULL values."""
+ op = CompletenessOperator("amount")
+ aggs = op.get_aggregations()
+ query = f"SELECT {', '.join(aggs)} FROM test_data"
+ result = duckdb_conn.execute(query).fetchdf()
+ metric = op.extract_result(result)
+
+ # 4 non-null out of 5
+ assert metric.value == 0.8
+
+ def test_operator_with_where_clause(self, duckdb_conn):
+ """Test operator with WHERE clause filtering."""
+ op = MeanOperator("amount", where="status = 'active'")
+ aggs = op.get_aggregations()
+ query = f"SELECT {', '.join(aggs)} FROM test_data"
+ result = duckdb_conn.execute(query).fetchdf()
+ metric = op.extract_result(result)
+
+ # Active rows: 100, 200, NULL, 300 -> mean of non-null = 200
+ assert metric.value == 200.0
diff --git a/tests/engines/test_suggestion_rules.py b/tests/engines/test_suggestion_rules.py
new file mode 100644
index 0000000..248182f
--- /dev/null
+++ b/tests/engines/test_suggestion_rules.py
@@ -0,0 +1,462 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for suggestion rules.
+
+Tests the individual suggestion rules in isolation using mock column profiles.
+"""
+
+import json
+import pytest
+
+from pydeequ.engines import ColumnProfile
+from pydeequ.engines.suggestions import (
+ RuleRegistry,
+ SuggestionRunner,
+ CompleteIfCompleteRule,
+ RetainCompletenessRule,
+ NonNegativeNumbersRule,
+ CategoricalRangeRule,
+ HasMinRule,
+ HasMaxRule,
+ HasMeanRule,
+ HasMinLengthRule,
+ HasMaxLengthRule,
+ UniqueIfApproximatelyUniqueRule,
+)
+
+
+def make_profile(
+ column: str = "test_col",
+ completeness: float = 1.0,
+ approx_distinct_values: int = 10,
+ data_type: str = "INTEGER",
+ minimum: float = None,
+ maximum: float = None,
+ mean: float = None,
+ histogram: str = None,
+) -> ColumnProfile:
+ """Create a test column profile with specified attributes."""
+ return ColumnProfile(
+ column=column,
+ completeness=completeness,
+ approx_distinct_values=approx_distinct_values,
+ data_type=data_type,
+ minimum=minimum,
+ maximum=maximum,
+ mean=mean,
+ histogram=histogram,
+ )
+
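+ # Each rule below follows the same two-step protocol, sketched here with the
+ # hypothetical profile values used in these tests:
+ #
+ # profile = make_profile(column="amount", minimum=0.0)
+ # rule = NonNegativeNumbersRule()
+ # if rule.applies_to(profile): # cheap predicate on the profile
+ # suggestion = rule.generate(profile) # -> suggestion with code_for_constraint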
+
+class TestCompleteIfCompleteRule:
+ """Tests for CompleteIfComplete rule."""
+
+ def test_applies_when_fully_complete(self):
+ """Rule applies when completeness is 1.0."""
+ rule = CompleteIfCompleteRule()
+ profile = make_profile(completeness=1.0)
+ assert rule.applies_to(profile) is True
+
+ def test_does_not_apply_when_not_complete(self):
+ """Rule does not apply when completeness < 1.0."""
+ rule = CompleteIfCompleteRule()
+ profile = make_profile(completeness=0.95)
+ assert rule.applies_to(profile) is False
+
+ def test_generates_correct_suggestion(self):
+ """Rule generates isComplete suggestion."""
+ rule = CompleteIfCompleteRule()
+ profile = make_profile(column="my_column", completeness=1.0)
+ suggestion = rule.generate(profile)
+
+ assert suggestion.column_name == "my_column"
+ assert suggestion.constraint_name == "Completeness"
+ assert suggestion.suggesting_rule == "CompleteIfComplete"
+ assert ".isComplete" in suggestion.code_for_constraint
+
+ def test_rule_sets(self):
+ """Rule belongs to DEFAULT and EXTENDED sets."""
+ rule = CompleteIfCompleteRule()
+ assert "DEFAULT" in rule.rule_sets
+ assert "EXTENDED" in rule.rule_sets
+
+
+class TestRetainCompletenessRule:
+ """Tests for RetainCompleteness rule."""
+
+ def test_applies_when_high_completeness(self):
+ """Rule applies when completeness >= 0.9 and < 1.0."""
+ rule = RetainCompletenessRule()
+ profile = make_profile(completeness=0.95)
+ assert rule.applies_to(profile) is True
+
+ def test_does_not_apply_when_fully_complete(self):
+ """Rule does not apply when completeness is 1.0."""
+ rule = RetainCompletenessRule()
+ profile = make_profile(completeness=1.0)
+ assert rule.applies_to(profile) is False
+
+ def test_does_not_apply_when_low_completeness(self):
+ """Rule does not apply when completeness < threshold."""
+ rule = RetainCompletenessRule()
+ profile = make_profile(completeness=0.85)
+ assert rule.applies_to(profile) is False
+
+ def test_generates_correct_suggestion(self):
+ """Rule generates hasCompleteness suggestion."""
+ rule = RetainCompletenessRule()
+ profile = make_profile(column="my_column", completeness=0.95)
+ suggestion = rule.generate(profile)
+
+ assert suggestion.column_name == "my_column"
+ assert suggestion.suggesting_rule == "RetainCompleteness"
+ assert ".hasCompleteness" in suggestion.code_for_constraint
+
+
+class TestNonNegativeNumbersRule:
+ """Tests for NonNegativeNumbers rule."""
+
+ def test_applies_when_minimum_non_negative(self):
+ """Rule applies when minimum >= 0."""
+ rule = NonNegativeNumbersRule()
+ profile = make_profile(minimum=0.0)
+ assert rule.applies_to(profile) is True
+
+ def test_applies_when_minimum_positive(self):
+ """Rule applies when minimum > 0."""
+ rule = NonNegativeNumbersRule()
+ profile = make_profile(minimum=5.0)
+ assert rule.applies_to(profile) is True
+
+ def test_does_not_apply_when_minimum_negative(self):
+ """Rule does not apply when minimum < 0."""
+ rule = NonNegativeNumbersRule()
+ profile = make_profile(minimum=-1.0)
+ assert rule.applies_to(profile) is False
+
+ def test_does_not_apply_when_no_minimum(self):
+ """Rule does not apply when minimum is None."""
+ rule = NonNegativeNumbersRule()
+ profile = make_profile(minimum=None)
+ assert rule.applies_to(profile) is False
+
+ def test_generates_correct_suggestion(self):
+ """Rule generates isNonNegative suggestion."""
+ rule = NonNegativeNumbersRule()
+ profile = make_profile(column="amount", minimum=0.0)
+ suggestion = rule.generate(profile)
+
+ assert suggestion.column_name == "amount"
+ assert suggestion.suggesting_rule == "NonNegativeNumbers"
+ assert ".isNonNegative" in suggestion.code_for_constraint
+
+
+class TestCategoricalRangeRule:
+ """Tests for CategoricalRange rule."""
+
+ def test_applies_when_low_cardinality_histogram(self):
+ """Rule applies when histogram has <= 10 values."""
+ rule = CategoricalRangeRule()
+ histogram = json.dumps({"A": 10, "B": 20, "C": 30})
+ profile = make_profile(histogram=histogram)
+ assert rule.applies_to(profile) is True
+
+ def test_does_not_apply_when_no_histogram(self):
+ """Rule does not apply when no histogram."""
+ rule = CategoricalRangeRule()
+ profile = make_profile(histogram=None)
+ assert rule.applies_to(profile) is False
+
+ def test_does_not_apply_when_high_cardinality(self):
+ """Rule does not apply when histogram has > 10 values."""
+ rule = CategoricalRangeRule()
+ histogram = json.dumps({f"val_{i}": i for i in range(20)})
+ profile = make_profile(histogram=histogram)
+ assert rule.applies_to(profile) is False
+
+ def test_generates_correct_suggestion(self):
+ """Rule generates isContainedIn suggestion."""
+ rule = CategoricalRangeRule()
+ histogram = json.dumps({"A": 10, "B": 20})
+ profile = make_profile(column="status", histogram=histogram)
+ suggestion = rule.generate(profile)
+
+ assert suggestion.column_name == "status"
+ assert suggestion.suggesting_rule == "CategoricalRangeRule"
+ assert ".isContainedIn" in suggestion.code_for_constraint
+
+
+class TestHasMinRule:
+ """Tests for HasMin rule."""
+
+ def test_applies_when_numeric_with_stats(self):
+ """Rule applies when minimum and mean are present."""
+ rule = HasMinRule()
+ profile = make_profile(minimum=0.0, mean=5.0)
+ assert rule.applies_to(profile) is True
+
+ def test_does_not_apply_when_no_minimum(self):
+ """Rule does not apply when minimum is None."""
+ rule = HasMinRule()
+ profile = make_profile(minimum=None, mean=5.0)
+ assert rule.applies_to(profile) is False
+
+ def test_generates_correct_suggestion(self):
+ """Rule generates hasMin suggestion."""
+ rule = HasMinRule()
+ profile = make_profile(column="value", minimum=1.0, mean=5.0)
+ suggestion = rule.generate(profile)
+
+ assert suggestion.column_name == "value"
+ assert suggestion.suggesting_rule == "HasMin"
+ assert ".hasMin" in suggestion.code_for_constraint
+
+ def test_rule_sets(self):
+ """Rule belongs to NUMERICAL and EXTENDED sets."""
+ rule = HasMinRule()
+ assert "NUMERICAL" in rule.rule_sets
+ assert "EXTENDED" in rule.rule_sets
+
+
+class TestHasMaxRule:
+ """Tests for HasMax rule."""
+
+ def test_applies_when_numeric_with_stats(self):
+ """Rule applies when maximum and mean are present."""
+ rule = HasMaxRule()
+ profile = make_profile(maximum=10.0, mean=5.0)
+ assert rule.applies_to(profile) is True
+
+ def test_generates_correct_suggestion(self):
+ """Rule generates hasMax suggestion."""
+ rule = HasMaxRule()
+ profile = make_profile(column="value", maximum=10.0, mean=5.0)
+ suggestion = rule.generate(profile)
+
+ assert suggestion.column_name == "value"
+ assert suggestion.suggesting_rule == "HasMax"
+ assert ".hasMax" in suggestion.code_for_constraint
+
+
+class TestHasMeanRule:
+ """Tests for HasMean rule."""
+
+ def test_applies_when_mean_present(self):
+ """Rule applies when mean is present."""
+ rule = HasMeanRule()
+ profile = make_profile(mean=5.0)
+ assert rule.applies_to(profile) is True
+
+ def test_does_not_apply_when_no_mean(self):
+ """Rule does not apply when mean is None."""
+ rule = HasMeanRule()
+ profile = make_profile(mean=None)
+ assert rule.applies_to(profile) is False
+
+ def test_generates_correct_suggestion(self):
+ """Rule generates hasMean suggestion with range."""
+ rule = HasMeanRule()
+ profile = make_profile(column="value", mean=100.0)
+ suggestion = rule.generate(profile)
+
+ assert suggestion.column_name == "value"
+ assert suggestion.suggesting_rule == "HasMean"
+ assert ".hasMean" in suggestion.code_for_constraint
+ assert "between" in suggestion.code_for_constraint
+
+
+class TestHasMinLengthRule:
+ """Tests for HasMinLength rule."""
+
+ def test_applies_to_string_columns(self):
+ """Rule applies to string data types."""
+ rule = HasMinLengthRule()
+ profile = make_profile(data_type="VARCHAR")
+ assert rule.applies_to(profile) is True
+
+ def test_does_not_apply_to_numeric_columns(self):
+ """Rule does not apply to numeric data types."""
+ rule = HasMinLengthRule()
+ profile = make_profile(data_type="INTEGER")
+ assert rule.applies_to(profile) is False
+
+ def test_generates_correct_suggestion(self):
+ """Rule generates hasMinLength suggestion."""
+ rule = HasMinLengthRule()
+ profile = make_profile(column="name", data_type="VARCHAR")
+ suggestion = rule.generate(profile, min_length=3)
+
+ assert suggestion.column_name == "name"
+ assert suggestion.suggesting_rule == "HasMinLength"
+ assert ".hasMinLength" in suggestion.code_for_constraint
+
+ def test_returns_none_when_no_length(self):
+ """Rule returns None when no min_length provided."""
+ rule = HasMinLengthRule()
+ profile = make_profile(data_type="VARCHAR")
+ suggestion = rule.generate(profile, min_length=None)
+ assert suggestion is None
+
+ def test_rule_sets(self):
+ """Rule belongs to STRING and EXTENDED sets."""
+ rule = HasMinLengthRule()
+ assert "STRING" in rule.rule_sets
+ assert "EXTENDED" in rule.rule_sets
+
+
+class TestHasMaxLengthRule:
+ """Tests for HasMaxLength rule."""
+
+ def test_applies_to_string_columns(self):
+ """Rule applies to string data types."""
+ rule = HasMaxLengthRule()
+ profile = make_profile(data_type="TEXT")
+ assert rule.applies_to(profile) is True
+
+ def test_generates_correct_suggestion(self):
+ """Rule generates hasMaxLength suggestion."""
+ rule = HasMaxLengthRule()
+ profile = make_profile(column="name", data_type="VARCHAR")
+ suggestion = rule.generate(profile, max_length=50)
+
+ assert suggestion.column_name == "name"
+ assert suggestion.suggesting_rule == "HasMaxLength"
+ assert ".hasMaxLength" in suggestion.code_for_constraint
+
+
+class TestUniqueIfApproximatelyUniqueRule:
+ """Tests for UniqueIfApproximatelyUnique rule."""
+
+ def test_generates_suggestion_when_unique(self):
+ """Rule generates isUnique when distinct values >= 99% of rows."""
+ rule = UniqueIfApproximatelyUniqueRule()
+ profile = make_profile(column="id", approx_distinct_values=100)
+ suggestion = rule.generate(profile, row_count=100)
+
+ assert suggestion is not None
+ assert suggestion.column_name == "id"
+ assert suggestion.suggesting_rule == "UniqueIfApproximatelyUnique"
+ assert ".isUnique" in suggestion.code_for_constraint
+
+ def test_does_not_generate_when_not_unique(self):
+ """Rule returns None when distinct values < 99% of rows."""
+ rule = UniqueIfApproximatelyUniqueRule()
+ profile = make_profile(approx_distinct_values=50)
+ suggestion = rule.generate(profile, row_count=100)
+ assert suggestion is None
+
+ def test_returns_none_when_no_row_count(self):
+ """Rule returns None when row_count is not provided."""
+ rule = UniqueIfApproximatelyUniqueRule()
+ profile = make_profile(approx_distinct_values=100)
+ suggestion = rule.generate(profile, row_count=None)
+ assert suggestion is None
+
+ def test_rule_sets(self):
+ """Rule belongs to COMMON and EXTENDED sets."""
+ rule = UniqueIfApproximatelyUniqueRule()
+ assert "COMMON" in rule.rule_sets
+ assert "EXTENDED" in rule.rule_sets
+
+
+class TestRuleRegistry:
+ """Tests for RuleRegistry."""
+
+ def test_registry_has_default_rules(self):
+ """Registry has rules registered by default."""
+ rules = RuleRegistry.get_all_rules()
+ assert len(rules) > 0
+
+ def test_get_rules_for_sets_default(self):
+ """Can retrieve DEFAULT rules."""
+ rules = RuleRegistry.get_rules_for_sets(["DEFAULT"])
+ rule_names = [r.name for r in rules]
+ assert "CompleteIfComplete" in rule_names
+ assert "NonNegativeNumbers" in rule_names
+
+ def test_get_rules_for_sets_numerical(self):
+ """Can retrieve NUMERICAL rules."""
+ rules = RuleRegistry.get_rules_for_sets(["NUMERICAL"])
+ rule_names = [r.name for r in rules]
+ assert "HasMin" in rule_names
+ assert "HasMax" in rule_names
+ assert "HasMean" in rule_names
+
+ def test_get_rules_for_sets_string(self):
+ """Can retrieve STRING rules."""
+ rules = RuleRegistry.get_rules_for_sets(["STRING"])
+ rule_names = [r.name for r in rules]
+ assert "HasMinLength" in rule_names
+ assert "HasMaxLength" in rule_names
+
+ def test_get_rules_for_multiple_sets(self):
+ """Can retrieve rules from multiple sets."""
+ rules = RuleRegistry.get_rules_for_sets(["DEFAULT", "NUMERICAL"])
+ rule_names = [r.name for r in rules]
+ assert "CompleteIfComplete" in rule_names
+ assert "HasMin" in rule_names
+
+
+class TestSuggestionRunner:
+ """Tests for SuggestionRunner."""
+
+ def test_runner_default_rules(self):
+ """Runner uses DEFAULT rules by default."""
+ runner = SuggestionRunner()
+ assert runner.rule_sets == ["DEFAULT"]
+
+ def test_runner_custom_rules(self):
+ """Runner can use custom rule sets."""
+ runner = SuggestionRunner(rule_sets=["NUMERICAL", "STRING"])
+ assert "NUMERICAL" in runner.rule_sets
+ assert "STRING" in runner.rule_sets
+
+ def test_run_generates_suggestions(self):
+ """Runner generates suggestions from profiles."""
+ runner = SuggestionRunner(rule_sets=["DEFAULT"])
+ profiles = [
+ make_profile(column="complete_col", completeness=1.0),
+ make_profile(column="partial_col", completeness=0.95),
+ ]
+ suggestions = runner.run(profiles)
+
+ # Both columns should get suggestions: complete_col (1.0) triggers
+ # CompleteIfComplete, partial_col (0.95) triggers RetainCompleteness
+ column_names = [s.column_name for s in suggestions]
+ assert "complete_col" in column_names
+ assert "partial_col" in column_names
+
+ def test_run_with_numeric_profiles(self):
+ """Runner generates numeric suggestions."""
+ runner = SuggestionRunner(rule_sets=["NUMERICAL"])
+ profiles = [
+ make_profile(column="value", minimum=0.0, maximum=100.0, mean=50.0),
+ ]
+ suggestions = runner.run(profiles)
+
+ rule_names = [s.suggesting_rule for s in suggestions]
+ assert "HasMin" in rule_names
+ assert "HasMax" in rule_names
+ assert "HasMean" in rule_names
+
+ def test_run_with_row_count_for_uniqueness(self):
+ """Runner uses row_count for uniqueness checks."""
+ runner = SuggestionRunner(rule_sets=["COMMON"])
+ profiles = [
+ make_profile(column="id", approx_distinct_values=100),
+ ]
+ suggestions = runner.run(profiles, row_count=100)
+
+ rule_names = [s.suggesting_rule for s in suggestions]
+ assert "UniqueIfApproximatelyUnique" in rule_names
diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py
new file mode 100644
index 0000000..9c7f9e4
--- /dev/null
+++ b/tests/helpers/__init__.py
@@ -0,0 +1,19 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test helpers for PyDeequ tests."""
+
+from tests.helpers.spark_server import SparkConnectServer, SparkServerConfig
+
+__all__ = ["SparkConnectServer", "SparkServerConfig"]
diff --git a/tests/helpers/spark_server.py b/tests/helpers/spark_server.py
new file mode 100644
index 0000000..2679433
--- /dev/null
+++ b/tests/helpers/spark_server.py
@@ -0,0 +1,193 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Spark Connect server management for tests.
+
+This module provides utilities to start and manage a Spark Connect server
+for running integration tests that require Spark.
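+
+Typical usage (a sketch; the default paths in SparkServerConfig are
+machine-specific and normally overridden via environment variables):
+
+    server = SparkConnectServer()
+    server.start()  # no-op if a server is already listening on the port
+    ...             # run tests against sc://localhost:15002
+    server.stop()   # stops the server only if this process started it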
+"""
+
+import os
+import socket
+import subprocess
+import time
+from dataclasses import dataclass, field
+from typing import Optional
+
+
+@dataclass
+class SparkServerConfig:
+ """Configuration for Spark Connect server."""
+
+ java_home: str = field(
+ default_factory=lambda: os.environ.get(
+ "JAVA_HOME",
+ "/Library/Java/JavaVirtualMachines/amazon-corretto-17.jdk/Contents/Home",
+ )
+ )
+ spark_home: str = field(
+ default_factory=lambda: os.environ.get(
+ "SPARK_HOME", "/Volumes/workplace/deequ_rewrite/spark-3.5.0-bin-hadoop3"
+ )
+ )
+ port: int = 15002
+ startup_timeout: int = 60
+ poll_interval: float = 1.0
+ driver_memory: str = "4g"
+ executor_memory: str = "4g"
+ deequ_jar: str = field(
+ default_factory=lambda: os.environ.get(
+ "DEEQU_JAR",
+ "/Volumes/workplace/deequ_rewrite/deequ/target/deequ_2.12-2.1.0b-spark-3.5.jar"
+ )
+ )
+
+
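+# The path defaults above are machine-specific; in practice they are meant to
+# be overridden through the environment, e.g. (a sketch, with placeholder paths):
+#
+#   JAVA_HOME=/usr/lib/jvm/java-17 \
+#   SPARK_HOME=$HOME/spark-3.5.0-bin-hadoop3 \
+#   DEEQU_JAR=/path/to/deequ.jar \
+#   pytest tests/v2
+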
+class SparkConnectServer:
+ """Manages Spark Connect server lifecycle for tests."""
+
+ def __init__(self, config: Optional[SparkServerConfig] = None):
+ """
+ Initialize Spark Connect server manager.
+
+ Args:
+ config: Server configuration (uses defaults if not provided)
+ """
+ self.config = config or SparkServerConfig()
+ self._process: Optional[subprocess.Popen] = None
+ self._started_by_us = False
+
+ def is_running(self) -> bool:
+ """Check if Spark Connect server is running by attempting to connect."""
+ try:
+ sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ sock.settimeout(1)
+ result = sock.connect_ex(("localhost", self.config.port))
+ sock.close()
+ return result == 0
+ except (socket.error, OSError):
+ return False
+
+ def start(self) -> float:
+ """
+ Start Spark Connect server if not already running.
+
+ Returns:
+ Time taken to start the server (0 if already running)
+
+ Raises:
+ RuntimeError: If server fails to start within timeout
+ """
+ if self.is_running():
+ print(f"Spark Connect server already running on port {self.config.port}")
+ return 0.0
+
+ start_time = time.time()
+
+ # Build the startup command
+ start_script = os.path.join(self.config.spark_home, "sbin", "start-connect-server.sh")
+
+ if not os.path.exists(start_script):
+ raise RuntimeError(f"Spark Connect start script not found: {start_script}")
+
+ cmd = [
+ start_script,
+ "--conf", f"spark.driver.memory={self.config.driver_memory}",
+ "--conf", f"spark.executor.memory={self.config.executor_memory}",
+ "--packages", "org.apache.spark:spark-connect_2.12:3.5.0",
+ "--jars", self.config.deequ_jar,
+ "--conf", "spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin",
+ ]
+
+ # Set up environment
+ env = os.environ.copy()
+ env["JAVA_HOME"] = self.config.java_home
+ env["SPARK_HOME"] = self.config.spark_home
+
+ print(f"Starting Spark Connect server on port {self.config.port}...")
+ print(f" JAVA_HOME: {self.config.java_home}")
+ print(f" SPARK_HOME: {self.config.spark_home}")
+
+ # Start the server
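+ # (Spark's sbin start scripts normally daemonize, so this Popen handle
+ # tracks the launcher script rather than the server JVM itself; readiness
+ # is therefore detected by polling the port below.)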
+ self._process = subprocess.Popen(
+ cmd,
+ env=env,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ self._started_by_us = True
+
+ # Wait for server to be ready
+ deadline = time.time() + self.config.startup_timeout
+ while time.time() < deadline:
+ if self.is_running():
+ elapsed = time.time() - start_time
+ print(f"Spark Connect server started in {elapsed:.1f}s")
+ return elapsed
+ time.sleep(self.config.poll_interval)
+
+ # Timed out - collect any error output before raising
+ if self._process:
+ self._process.terminate()
+ try:
+ _, stderr = self._process.communicate(timeout=5)
+ error_msg = stderr.decode() if stderr else "Unknown error"
+ except subprocess.TimeoutExpired:
+ self._process.kill()
+ error_msg = "Unknown error (launcher did not exit after terminate)"
+ self._process = None
+ self._started_by_us = False
+ raise RuntimeError(
+ f"Spark Connect server failed to start within {self.config.startup_timeout}s: {error_msg[:500]}"
+ )
+
+ raise RuntimeError(
+ f"Spark Connect server failed to start within {self.config.startup_timeout}s"
+ )
+
+ def stop(self) -> None:
+ """Stop Spark Connect server if we started it."""
+ if not self._started_by_us:
+ print("Spark Connect server was not started by us, skipping stop")
+ return
+
+ stop_script = os.path.join(self.config.spark_home, "sbin", "stop-connect-server.sh")
+
+ if os.path.exists(stop_script):
+ print("Stopping Spark Connect server...")
+ env = os.environ.copy()
+ env["JAVA_HOME"] = self.config.java_home
+ env["SPARK_HOME"] = self.config.spark_home
+
+ try:
+ subprocess.run(
+ [stop_script],
+ env=env,
+ timeout=30,
+ capture_output=True,
+ )
+ print("Spark Connect server stopped")
+ except subprocess.TimeoutExpired:
+ print("Warning: stop script timed out")
+ except Exception as e:
+ print(f"Warning: Error stopping server: {e}")
+ else:
+ # Fall back to killing the process directly
+ if self._process:
+ print("Terminating Spark Connect server process...")
+ self._process.terminate()
+ try:
+ self._process.wait(timeout=10)
+ except subprocess.TimeoutExpired:
+ self._process.kill()
+ print("Spark Connect server process terminated")
+
+ self._started_by_us = False
+ self._process = None
diff --git a/tests/v2/conftest.py b/tests/v2/conftest.py
index 0474335..611d170 100644
--- a/tests/v2/conftest.py
+++ b/tests/v2/conftest.py
@@ -20,11 +20,13 @@
import pytest
from pyspark.sql import Row, SparkSession
-@pytest.fixture(scope="session")
-def spark():
+@pytest.fixture(scope="module")
+def spark(spark_connect_server):
"""
- Session-scoped Spark Connect session.
- Shared across all tests for efficiency.
+ Module-scoped Spark Connect session.
+
+ Depends on spark_connect_server fixture from tests/conftest.py
+ to ensure the server is running before creating the session.
"""
remote_url = os.environ.get("SPARK_REMOTE", "sc://localhost:15002")
session = SparkSession.builder.remote(remote_url).getOrCreate()
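+# For reference, a minimal sketch of the spark_connect_server fixture this
+# depends on (the real definition lives in tests/conftest.py and may differ):
+#
+#     @pytest.fixture(scope="session")
+#     def spark_connect_server():
+#         server = SparkConnectServer()
+#         server.start()   # no-op if a server is already listening
+#         yield server
+#         server.stop()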
diff --git a/tests/v2/test_e2e_spark_connect.py b/tests/v2/test_e2e_spark_connect.py
index 58c18fd..1dc019d 100644
--- a/tests/v2/test_e2e_spark_connect.py
+++ b/tests/v2/test_e2e_spark_connect.py
@@ -42,14 +42,8 @@
# Import the new Spark Connect API
from pydeequ.v2.verification import AnalysisRunner, VerificationSuite
-# Skip all tests if SPARK_REMOTE is not set
-pytestmark = pytest.mark.skipif(
- "SPARK_REMOTE" not in os.environ,
- reason="SPARK_REMOTE environment variable not set. Start Spark Connect server first.",
-)
-
-
-# Note: spark fixture is defined in conftest.py (session-scoped)
+# Note: spark fixture is defined in conftest.py and depends on spark_connect_server
+# which automatically starts the server if needed
@pytest.fixture(scope="module")
diff --git a/tutorials/data_quality_example_duckdb.py b/tutorials/data_quality_example_duckdb.py
new file mode 100644
index 0000000..29d4aff
--- /dev/null
+++ b/tutorials/data_quality_example_duckdb.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""
+Testing Data Quality at Scale with PyDeequ + DuckDB
+
+This example demonstrates using PyDeequ with DuckDB as the execution backend,
+enabling data quality checks without a Spark cluster.
+
+It covers:
+- Data analysis (AnalysisRunner)
+- Constraint verification (VerificationSuite)
+- Column profiling (ColumnProfilerRunner)
+- Constraint suggestions (ConstraintSuggestionRunner)
+
+Prerequisites:
+1. Install dependencies:
+ pip install duckdb pandas
+
+2. Run this script:
+ python data_quality_example_duckdb.py
+"""
+
+import duckdb
+import pydeequ
+from pydeequ.v2.analyzers import (
+ Size,
+ Completeness,
+ Distinctness,
+ Mean,
+ Minimum,
+ Maximum,
+ StandardDeviation,
+ Correlation,
+)
+from pydeequ.v2.checks import Check, CheckLevel
+from pydeequ.v2.verification import AnalysisRunner, VerificationSuite
+from pydeequ.v2.predicates import eq, gte, lte, between
+from pydeequ.v2.profiles import ColumnProfilerRunner
+from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules
+
+
+def create_sample_data(con: duckdb.DuckDBPyConnection):
+ """Create a sample product reviews dataset for demonstration."""
+ con.execute("""
+ CREATE TABLE reviews AS SELECT * FROM (VALUES
+ ('R001', 'C100', 'P001', 'US', 5, 10, 12, 2023, 'Great Product', 'Y'),
+ ('R002', 'C101', 'P002', 'US', 4, 8, 10, 2023, 'Good Value', 'Y'),
+ ('R003', 'C102', 'P001', 'UK', 5, 15, 18, 2022, 'Great Product', 'N'),
+ ('R004', 'C103', 'P003', 'DE', 3, 5, 8, 2022, 'Decent Item', 'Y'),
+ ('R005', 'C104', 'P002', 'FR', 4, 12, 15, 2021, 'Good Value', 'N'),
+ ('R006', 'C105', 'P004', 'JP', 5, 20, 22, 2023, 'Excellent!', 'Y'),
+ ('R007', 'C106', 'P001', 'US', 2, 3, 10, 2020, 'Great Product', 'N'),
+ ('R008', 'C107', 'P005', 'UK', 1, 25, 30, 2021, 'Disappointing', 'Y'),
+ ('R009', 'C108', 'P002', NULL, 4, 7, 9, 2023, 'Good Value', 'Y'),
+ ('R001', 'C109', 'P003', 'US', 3, 4, 6, 2022, 'Decent Item', 'N')
+ ) AS t(review_id, customer_id, product_id, marketplace, star_rating,
+ helpful_votes, total_votes, review_year, product_title, insight)
+ """)
+
+
+def run_data_analysis(engine):
+ """
+ Run data analysis using AnalysisRunner.
+
+ This demonstrates computing various metrics on the dataset:
+ - Size: Total row count
+ - Completeness: Ratio of non-null values
+ - Distinctness: Ratio of distinct values
+ - Mean, Min, Max: Statistical measures
+ - Correlation: Relationship between columns
+ """
+ print("\n" + "=" * 60)
+ print("DATA ANALYSIS")
+ print("=" * 60)
+
+ result = (AnalysisRunner()
+ .on_engine(engine)
+ .addAnalyzer(Size())
+ .addAnalyzer(Completeness("review_id"))
+ .addAnalyzer(Completeness("marketplace"))
+ .addAnalyzer(Distinctness(["review_id"]))
+ .addAnalyzer(Mean("star_rating"))
+ .addAnalyzer(Minimum("star_rating"))
+ .addAnalyzer(Maximum("star_rating"))
+ .addAnalyzer(StandardDeviation("star_rating"))
+ .addAnalyzer(Correlation("total_votes", "helpful_votes"))
+ .run())
+
+ print("\nAnalysis Results:")
+ print(result.to_string(index=False))
+
+ # Extract key insights
+ metrics = {(r["name"], r["instance"]): r["value"] for _, r in result.iterrows()}
+
+ print("\nKey Insights:")
+ print(f" - Dataset contains {int(metrics.get(('Size', '*'), 0))} reviews")
+ print(f" - review_id completeness: {metrics.get(('Completeness', 'review_id'), 0):.1%}")
+ print(f" - marketplace completeness: {metrics.get(('Completeness', 'marketplace'), 0):.1%}")
+ print(f" - review_id distinctness: {metrics.get(('Distinctness', 'review_id'), 0):.1%}")
+ print(f" - Average star rating: {metrics.get(('Mean', 'star_rating'), 0):.2f}")
+ print(f" - Star rating range: {metrics.get(('Minimum', 'star_rating'), 0):.0f} - {metrics.get(('Maximum', 'star_rating'), 0):.0f}")
+
+ return result
+
+
+def run_constraint_verification(engine):
+ """
+ Run constraint verification using VerificationSuite.
+
+ This demonstrates defining and verifying data quality rules:
+ - Size checks
+ - Completeness checks
+ - Uniqueness checks
+ - Range checks (min/max)
+ - Categorical value checks
+ """
+ print("\n" + "=" * 60)
+ print("CONSTRAINT VERIFICATION")
+ print("=" * 60)
+
+ # Define checks using the V2 predicate API
+ check = (Check(CheckLevel.Warning, "Product Reviews Quality Check")
+ # Size check: at least 5 reviews
+ .hasSize(gte(5))
+ # Completeness checks
+ .isComplete("review_id")
+ .isComplete("customer_id")
+ .hasCompleteness("marketplace", gte(0.8)) # Allow some missing
+ # Uniqueness check
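+ # (expected to fail: review_id 'R001' appears twice in the sample data)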
+ .isUnique("review_id")
+ # Star rating range check
+ .hasMin("star_rating", eq(1.0))
+ .hasMax("star_rating", eq(5.0))
+ .hasMean("star_rating", between(1.0, 5.0))
+ # Year range check
+ .hasMin("review_year", gte(2015))
+ .hasMax("review_year", lte(2025))
+ # Categorical check
+ .isContainedIn("marketplace", ["US", "UK", "DE", "JP", "FR"])
+ .isContainedIn("insight", ["Y", "N"])
+ )
+
+ result = (VerificationSuite()
+ .on_engine(engine)
+ .addCheck(check)
+ .run())
+
+ print("\nVerification Results:")
+ print(result.to_string(index=False))
+
+ # Summarize results
+ passed = (result["constraint_status"] == "Success").sum()
+ failed = (result["constraint_status"] == "Failure").sum()
+
+ print(f"\nSummary: {passed} passed, {failed} failed out of {len(result)} constraints")
+
+ if failed > 0:
+ print("\nFailed Constraints:")
+ for _, row in result[result["constraint_status"] == "Failure"].iterrows():
+ print(f" - {row['constraint']}")
+ if row["constraint_message"]:
+ print(f" Message: {row['constraint_message']}")
+
+ return result
+
+
+def run_column_profiling(engine):
+ """
+ Run column profiling using ColumnProfilerRunner.
+
+ This automatically computes statistics for each column:
+ - Completeness
+ - Approximate distinct values
+ - Data type detection
+ - Numeric statistics (mean, min, max, etc.)
+ """
+ print("\n" + "=" * 60)
+ print("COLUMN PROFILING")
+ print("=" * 60)
+
+ result = (ColumnProfilerRunner()
+ .on_engine(engine)
+ .withLowCardinalityHistogramThreshold(10) # Generate histograms for low-cardinality columns
+ .run())
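+
+ # With the sample data, columns such as marketplace, insight, and
+ # star_rating have at most 10 distinct values, so they fall under the
+ # threshold above and should get histograms.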
+
+ print("\nColumn Profiles:")
+ # Show selected columns for readability
+ cols_to_show = ["column", "completeness", "approx_distinct_values", "data_type", "mean", "minimum", "maximum"]
+ available_cols = [c for c in cols_to_show if c in result.columns]
+ print(result[available_cols].to_string(index=False))
+
+ return result
+
+
+def run_constraint_suggestions(engine):
+ """
+ Run automated constraint suggestion using ConstraintSuggestionRunner.
+
+ This analyzes the data and suggests appropriate constraints:
+ - Completeness constraints for complete columns
+ - Uniqueness constraints for unique columns
+ - Categorical range constraints for low-cardinality columns
+ - Non-negative constraints for numeric columns
+ """
+ print("\n" + "=" * 60)
+ print("CONSTRAINT SUGGESTIONS")
+ print("=" * 60)
+
+ result = (ConstraintSuggestionRunner()
+ .on_engine(engine)
+ .addConstraintRules(Rules.DEFAULT)
+ .run())
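+
+ # On this dataset the DEFAULT rules typically yield, among others, isComplete
+ # suggestions for fully complete columns (e.g. customer_id) and isNonNegative
+ # suggestions for the vote counts; exact output depends on the rule set.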
+
+ print("\nSuggested Constraints:")
+ cols_to_show = ["column_name", "constraint_name", "description", "code_for_constraint"]
+ available_cols = [c for c in cols_to_show if c in result.columns]
+ print(result[available_cols].to_string(index=False))
+
+ print(f"\nTotal suggestions: {len(result)}")
+
+ return result
+
+
+def main():
+ print("PyDeequ Data Quality Example with DuckDB")
+ print("No Spark cluster required!")
+
+ # Create in-memory DuckDB connection
+ con = duckdb.connect()
+
+ # Create sample data
+ print("\nCreating sample product reviews dataset...")
+ create_sample_data(con)
+
+ # Create engine using pydeequ.connect()
+ engine = pydeequ.connect(con, table="reviews")
+
+ print("\nDataset Schema:")
+ schema = engine.get_schema()
+ for col, dtype in schema.items():
+ print(f" {col}: {dtype}")
+
+ print("\nSample Data:")
+ print(con.execute("SELECT * FROM reviews LIMIT 5").fetchdf().to_string(index=False))
+
+ # Run all examples
+ run_data_analysis(engine)
+ run_constraint_verification(engine)
+ run_column_profiling(engine)
+ run_constraint_suggestions(engine)
+
+ print("\n" + "=" * 60)
+ print("EXAMPLE COMPLETE")
+ print("=" * 60)
+
+
+if __name__ == "__main__":
+ main()