diff --git a/.gitignore b/.gitignore index b4b68d0..1c912e4 100644 --- a/.gitignore +++ b/.gitignore @@ -148,5 +148,7 @@ dmypy.json # Cython debug symbols cython_debug/ -# DS_STORE +# DS_STORE .DS_Store + +benchmark_results diff --git a/BENCHMARK.md b/BENCHMARK.md new file mode 100644 index 0000000..025c159 --- /dev/null +++ b/BENCHMARK.md @@ -0,0 +1,349 @@ +# PyDeequ Benchmark + +Benchmark harness for comparing DuckDB and Spark engine performance. + +## Design Overview + +### Architecture + +``` +benchmark_cli.py # CLI entry point +benchmark/ +├── config.py # Configuration dataclasses +├── experiments.py # Experiment logic (data gen, checks, profiling) +├── worker.py # Subprocess worker for process isolation +├── spark_server.py # Auto Spark Connect server management +├── results.py # Results storage and merging +├── report.py # Markdown report generation +└── visualize.py # PNG chart generation +``` + +### Process Isolation + +Each engine runs in a separate subprocess to ensure: +- Clean JVM state for Spark +- Independent memory allocation +- No cross-contamination between engines + +### Data Pipeline + +1. **Generate** synthetic mixed-type data (strings, floats, ints) +2. **Cache** as Parquet files with optimized row groups +3. **Load** from same Parquet files for both engines (fair comparison) + +## Experiments + +### 1. Varying Rows +- Fixed: 10 columns, 16 data quality checks +- Variable: 100K to 130M rows +- Measures: Validation time scaling with data size + +### 2. Varying Columns +- Fixed: 1M rows +- Variable: 10 to 80 columns (16 to 226 checks) +- Measures: Validation time scaling with schema complexity + +### 3. Column Profiling +- Fixed: 10 columns +- Variable: 100K to 10M rows +- Measures: Full column profiling performance + +## Results + +Benchmark run on Apple M3 Max (14 cores), macOS Darwin 25.2.0. + +![Benchmark Results](imgs/benchmark_chart.png) + +### Experiment 1: Varying Rows + +| Rows | DuckDB (s) | Spark (s) | Speedup | +|------|------------|-----------|---------| +| 100K | 0.034 | 0.662 | **19.5x** | +| 1M | 0.071 | 1.648 | **23.2x** | +| 5M | 0.167 | 2.470 | **14.8x** | +| 10M | 0.268 | 3.239 | **12.1x** | +| 50M | 1.114 | 12.448 | **11.2x** | +| 130M | 2.752 | 28.404 | **10.3x** | + +### Experiment 2: Varying Columns + +| Cols | Checks | DuckDB (s) | Spark (s) | Speedup | +|------|--------|------------|-----------|---------| +| 10 | 16 | 0.076 | 1.619 | **21.3x** | +| 20 | 46 | 0.081 | 2.078 | **25.7x** | +| 40 | 106 | 0.121 | 2.781 | **23.0x** | +| 80 | 226 | 0.177 | 4.258 | **24.1x** | + +### Experiment 3: Column Profiling + +| Rows | DuckDB (s) | Spark (s) | Speedup | +|------|------------|-----------|---------| +| 100K | 0.045 | 0.585 | **13.0x** | +| 1M | 0.288 | 0.720 | **2.5x** | +| 5M | 1.524 | 2.351 | **1.5x** | +| 10M | 2.993 | 3.975 | **1.3x** | + +### Key Takeaways + +1. **DuckDB is 10-23x faster** for row-scaling validation workloads +2. **Consistent speedup across complexity** - 21-26x speedup regardless of column count +3. **Profiling converges** - at 10M rows, DuckDB is still 1.3x faster +4. **No JVM overhead** - DuckDB runs natively in Python, no startup cost + +## Performance Optimizations + +The DuckDB engine includes several optimizations to maintain performance as check complexity increases: + +### Optimization 1: Grouping Operator Batching + +Grouping operators (Distinctness, Uniqueness, UniqueValueRatio) that share the same columns and WHERE clause are fused into single queries. 
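The idea, in a minimal Python sketch (the function name and operator fields here are illustrative, not the engine's actual internals): operators are bucketed by their shared frequency-table key, and each bucket then emits one fused query, as the before/after below shows.

```python
from collections import defaultdict

def fuse_grouping_operators(operators):
    """Bucket grouping operators by (columns, WHERE clause) so that every
    bucket shares one frequency CTE and compiles to a single GROUP BY query."""
    buckets = defaultdict(list)
    for op in operators:
        # Same grouping columns + same filter => same frequency table.
        key = (tuple(op.columns), op.where)
        buckets[key].append(op)
    return buckets
```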
+ +**Before**: N queries for N grouping operators on same columns +```sql +-- Query 1: Distinctness +WITH freq AS (SELECT cols, COUNT(*) AS cnt FROM t GROUP BY cols) +SELECT COUNT(*) AS distinct_count, SUM(cnt) AS total_count FROM freq + +-- Query 2: Uniqueness +WITH freq AS (SELECT cols, COUNT(*) AS cnt FROM t GROUP BY cols) +SELECT SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count, SUM(cnt) AS total_count FROM freq +``` + +**After**: 1 query computing all metrics +```sql +WITH freq AS (SELECT cols, COUNT(*) AS cnt FROM t GROUP BY cols) +SELECT + COUNT(*) AS distinct_count, + SUM(cnt) AS total_count, + SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count +FROM freq +``` + +**Impact**: 20-40% improvement for checks with multiple grouping operators + +### Optimization 2: Multi-Column Profiling + +Profile statistics for all columns are batched into 2-3 queries instead of 2-3 queries per column. + +**Before**: 20-30 queries for 10 columns +```sql +-- Per-column queries for completeness, numeric stats, percentiles +SELECT COUNT(*), SUM(CASE WHEN col1 IS NULL...) FROM t +SELECT MIN(col1), MAX(col1), AVG(col1)... FROM t +SELECT QUANTILE_CONT(col1, 0.25)... FROM t +-- Repeated for each column +``` + +**After**: 3 queries total +```sql +-- Query 1: All completeness stats +SELECT COUNT(*), SUM(CASE WHEN col1 IS NULL...), SUM(CASE WHEN col2 IS NULL...)... FROM t + +-- Query 2: All numeric stats +SELECT MIN(col1), MAX(col1), MIN(col2), MAX(col2)... FROM t + +-- Query 3: All percentiles +SELECT QUANTILE_CONT(col1, 0.25), QUANTILE_CONT(col2, 0.25)... FROM t +``` + +**Impact**: 40-60% improvement for column profiling + +### Optimization 3: DuckDB Configuration + +Configurable engine settings optimize DuckDB for analytical workloads: + +```python +from pydeequ.engines.duckdb_config import DuckDBEngineConfig + +config = DuckDBEngineConfig( + threads=8, # Control parallelism + memory_limit="8GB", # Memory management + preserve_insertion_order=False, # Better parallel execution + parquet_metadata_cache=True, # Faster Parquet reads +) + +engine = DuckDBEngine(con, table="test", config=config) +``` + +**Impact**: 5-15% improvement for large parallel scans + +### Optimization 4: Constraint Batching + +Scan-based constraints (Size, Completeness, Mean, etc.) and ratio-check constraints (isPositive, isContainedIn, etc.) are batched into minimal queries. + +**Before**: 1 query per constraint +```sql +SELECT COUNT(*) FROM t -- Size +SELECT COUNT(*), SUM(CASE WHEN col IS NULL...) 
FROM t -- Completeness +SELECT AVG(col) FROM t -- Mean +``` + +**After**: 1 query for all scan-based constraints +```sql +SELECT + COUNT(*) AS size, + SUM(CASE WHEN col IS NULL THEN 1 ELSE 0 END) AS null_count, + AVG(col) AS mean +FROM t +``` + +**Impact**: 20-40% improvement for checks with many constraints + +### Optimization 5: Query Profiling Infrastructure + +Built-in profiling helps identify bottlenecks and verify optimizations: + +```python +engine = DuckDBEngine(con, table="test", enable_profiling=True) +engine.run_checks([check]) + +# Get query statistics +stats = engine.get_query_stats() +print(f"Query count: {engine.get_query_count()}") +print(stats) + +# Get query plan for analysis +plan = engine.explain_query("SELECT COUNT(*) FROM test") +``` + +### Measured Performance Improvements + +Benchmark comparison: Baseline (2026-01-20) vs After Optimization (2026-01-21, 5-run average) + +#### Experiment 2: Varying Columns (KEY METRIC - Speedup Degradation Fix) + +| Cols | Checks | Before DuckDB | After DuckDB | Spark | Before Speedup | After Speedup | +|------|--------|---------------|--------------|-------|----------------|---------------| +| 10 | 16 | 0.118s | 0.076s | 1.619s | 14.1x | **21.3x** | +| 20 | 46 | 0.286s | 0.081s | 2.078s | 7.5x | **25.7x** | +| 40 | 106 | 0.713s | 0.121s | 2.781s | 4.0x | **23.0x** | +| 80 | 226 | 2.214s | 0.177s | 4.258s | 2.0x | **24.1x** | + +**Key Achievement**: The speedup degradation problem is **SOLVED**. +- **Before**: Speedup degraded from 14x (10 cols) down to 2x (80 cols) +- **After**: Speedup is consistent **~21-26x** across ALL column counts + +#### DuckDB-Only Performance Gains + +| Cols | Before | After | Improvement | +|------|--------|-------|-------------| +| 10 | 0.118s | 0.076s | 36% faster | +| 20 | 0.286s | 0.081s | 72% faster | +| 40 | 0.713s | 0.121s | 83% faster | +| 80 | 2.214s | 0.177s | **92% faster (~12x)** | + +#### Experiment 1: Varying Rows (16 checks) + +| Rows | Before | After | Improvement | +|------|--------|-------|-------------| +| 100K | 0.052s | 0.034s | 35% faster | +| 1M | 0.090s | 0.071s | 21% faster | +| 5M | 0.221s | 0.167s | 24% faster | +| 10M | 0.335s | 0.268s | 20% faster | +| 50M | 1.177s | 1.114s | 5% faster | +| 130M | 2.897s | 2.752s | 5% faster | + +#### Experiment 3: Column Profiling (10 columns) + +| Rows | Before | After | Change | +|------|--------|-------|--------| +| 100K | 0.086s | 0.045s | 48% faster | +| 1M | 0.388s | 0.288s | 26% faster | +| 5M | 1.470s | 1.524s | ~same | +| 10M | 2.659s | 2.993s | 13% slower | + +Note: Profiling shows slight regression at very high row counts due to batched query overhead, which is a trade-off for the significant gains in column scaling. + +## Quick Start + +### Run DuckDB Only (No Spark Required) + +```bash +python benchmark_cli.py run --engine duckdb +``` + +### Run Both Engines + +```bash +python benchmark_cli.py run --engine all +``` + +Auto-spark is enabled by default. The harness will: +1. Start a Spark Connect server +2. Run DuckDB benchmarks +3. Run Spark benchmarks +4. Stop the server +5. 
Merge results + +### Run with External Spark Server + +```bash +# Start server manually first, then: +python benchmark_cli.py run --engine spark --no-auto-spark +``` + +## Output Structure + +Each run creates a timestamped folder: + +``` +benchmark_results/ +└── benchmark_2024-01-19T14-30-45/ + ├── results.json # Raw timing data + └── BENCHMARK_RESULTS.md # Markdown report +``` + +## Visualize Results + +Generate a PNG chart comparing engine performance: + +```bash +# From run folder +python benchmark_cli.py visualize benchmark_results/benchmark_2024-01-19T14-30-45/ + +# Custom output path +python benchmark_cli.py visualize benchmark_results/benchmark_2024-01-19T14-30-45/ -o comparison.png +``` + +The chart shows: +- **Top row**: Time comparisons (DuckDB vs Spark) for each experiment +- **Bottom row**: Speedup ratios (how many times faster DuckDB is) + +## Regenerate Report + +```bash +python benchmark_cli.py report benchmark_results/benchmark_2024-01-19T14-30-45/ +``` + +## Configuration + +Default experiment parameters (see `benchmark/config.py`): + +| Parameter | Default | +|-----------|---------| +| Row counts | 100K, 1M, 5M, 10M, 50M, 130M | +| Column counts | 10, 20, 40, 80 | +| Profiling rows | 100K, 1M, 5M, 10M | +| Validation runs | 3 (averaged) | +| Cache directory | `~/.deequ_benchmark_data` | + +## Requirements + +- **DuckDB**: No additional setup +- **Spark**: Requires `SPARK_HOME` and `JAVA_HOME` environment variables (or use `--spark-home`/`--java-home` flags) + +## Example Workflow + +```bash +# 1. Run full benchmark +python benchmark_cli.py run --engine all + +# 2. View results +cat benchmark_results/benchmark_*/BENCHMARK_RESULTS.md + +# 3. Generate chart +python benchmark_cli.py visualize benchmark_results/benchmark_*/ + +# 4. Open chart +open benchmark_results/benchmark_*/benchmark_chart.png +``` diff --git a/Engines.md b/Engines.md new file mode 100644 index 0000000..14d79a7 --- /dev/null +++ b/Engines.md @@ -0,0 +1,231 @@ +# Engine Parity Analysis Report + +## Executive Summary + +This report documents the parity testing results between the Spark and DuckDB engines in python-deequ. After implementing all fixes, **84 tests pass** and **5 tests fail** due to inherent algorithmic differences. + +### Current Test Results + +| Status | Count | Percentage | +|--------|-------|------------| +| **Passed** | 84 | 94.4% | +| **Failed** | 5 | 5.6% | + +### Summary of Remaining Failures + +| Category | Test Failures | Root Cause | Fixable? | +|----------|---------------|------------|----------| +| Approximate Quantile | 2 tests | Different quantile algorithms | Inherent difference | +| Approximate Count Distinct | 1 test | HyperLogLog implementation variance | Inherent variance | +| Profile Distinct Values | 2 tests | HyperLogLog variance | Inherent variance | + +--- + +## Fixes Applied + +### 1. STDDEV_SAMP → STDDEV_POP + +Changed DuckDB to use population standard deviation to match Spark. + +**Files Modified:** +- `pydeequ/engines/operators/scan_operators.py` +- `pydeequ/engines/operators/profiling_operators.py` + +**Tests Fixed:** ~8 tests + +### 2. Entropy: LOG2 → LN + +Changed DuckDB entropy calculation from log base 2 to natural log to match Spark. 
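For intuition, the two conventions differ only by a constant factor of ln 2, which a standalone NumPy check (not project code) confirms; the actual change follows below.

```python
import numpy as np

p = np.array([0.5, 0.25, 0.25])          # example probability distribution
entropy_bits = -np.sum(p * np.log2(p))   # old DuckDB behavior (LOG2)
entropy_nats = -np.sum(p * np.log(p))    # Spark's convention (LN)
assert np.isclose(entropy_nats, entropy_bits * np.log(2))  # nats = bits * ln 2
```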
+ +**File Modified:** `pydeequ/engines/operators/grouping_operators.py` + +```python +# Before (bits) +-SUM((cnt * 1.0 / total_cnt) * LOG2(cnt * 1.0 / total_cnt)) AS entropy + +# After (nats) - matches Spark +-SUM((cnt * 1.0 / total_cnt) * LN(cnt * 1.0 / total_cnt)) AS entropy +``` + +**Tests Fixed:** 3 tests +- `test_entropy_uniform` +- `test_mutual_information` +- `test_has_entropy` (constraint) + +### 3. Spark Connect Server Fixture + +Added automatic Spark Connect server startup for parity tests. + +**File Modified:** `tests/engines/comparison/conftest.py` + +```python +@pytest.fixture(scope="session") +def spark_connect_server(): + """Automatically starts Spark Connect server if not running.""" + from benchmark.spark_server import SparkConnectServer + from benchmark.config import SparkServerConfig + + config = SparkServerConfig() + server = SparkConnectServer(config) + + if not server.is_running(): + server.start() + + if not os.environ.get("SPARK_REMOTE"): + os.environ["SPARK_REMOTE"] = f"sc://localhost:{config.port}" + + yield server +``` + +### 4. Flatten Metrics in DeequRelationPlugin (Histogram/DataType Fix) + +Fixed the Scala Deequ Connect plugin to properly handle complex metrics like Histogram and DataType by flattening them before output. + +**File Modified:** `deequ/src/main/scala/com/amazon/deequ/connect/DeequRelationPlugin.scala` + +**Root Cause:** The plugin was only collecting `DoubleMetric` instances directly, but Histogram and DataType return complex metric types (`HistogramMetric`, etc.) that need to be flattened first. + +**Before:** +```scala +val metrics = context.metricMap.toSeq.collect { + case (analyzer, metric: DoubleMetric) => ... +} +``` + +**After:** +```scala +val metrics = context.metricMap.toSeq.flatMap { case (analyzer, metric) => + metric.flatten().map { doubleMetric => + val value: Double = doubleMetric.value.getOrElse(Double.NaN) + ( + analyzer.toString, + doubleMetric.entity.toString, + doubleMetric.instance, + doubleMetric.name, + value + ) + } +} +``` + +**Tests Fixed:** 2 tests +- `test_histogram` +- `test_data_type` + +--- + +## Detailed Analysis of Remaining Failures + +### 1. Approximate Quantile (2 tests) + +**Root Cause: Different Algorithms** + +| Engine | Algorithm | +|--------|-----------| +| Spark | T-Digest (approximate) | +| DuckDB | QUANTILE_CONT (exact interpolation) | + +The algorithms produce different results, especially for small datasets. + +**Resolution:** Accept as inherent difference or implement T-Digest in DuckDB. + +### 2. Approximate Count Distinct (3 tests) + +**Root Cause: HyperLogLog Variance** + +Both engines use HyperLogLog but with different implementations: +- Different hash functions +- Different precision parameters + +**Evidence:** +``` +Spark approx_distinct: 9 +DuckDB approx_distinct: 10 (or 6 vs 5) +``` + +~10% variance is expected for probabilistic data structures. + +**Resolution:** Accept as inherent variance. The 10% tolerance handles most cases but edge cases with small cardinalities still fail. 
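A sketch of the relative-tolerance comparison this implies (the helper name and structure are illustrative, not the actual `utils.py` code):

```python
def approx_equal(a: float, b: float, rel_tol: float = 0.10) -> bool:
    """True if two approximate-distinct counts agree within rel_tol.

    9 vs 10 passes right at the 10% boundary, but 5 vs 6 differs by
    roughly 17%, which is why small cardinalities still fail."""
    if a == b:
        return True
    return abs(a - b) / max(abs(a), abs(b)) <= rel_tol
```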
+ +--- + +## Test Results Summary + +### Passing Tests (84) + +All core analyzers and constraints: +- Size, Completeness, Mean, Sum, Min, Max +- StandardDeviation (after STDDEV_POP fix) +- Distinctness, Uniqueness, UniqueValueRatio, CountDistinct +- Correlation, PatternMatch, Compliance +- MinLength, MaxLength +- Entropy, MutualInformation (after LN fix) +- **Histogram** (after flatten fix) +- **DataType** (after flatten fix) +- All constraint tests (32 tests) +- All suggestion tests (13 tests) +- Most profile tests + +### Failing Tests (5) + +| Test | Category | Status | +|------|----------|--------| +| `test_approx_count_distinct` | Analyzer | Inherent HLL variance | +| `test_approx_quantile_median` | Analyzer | Algorithm difference | +| `test_approx_quantile_quartiles` | Analyzer | Algorithm difference | +| `test_completeness_partial` | Profile | Inherent HLL variance | +| `test_distinct_values` | Profile | Inherent HLL variance | + +--- + +## Files Modified + +### Python (python-deequ) + +| File | Changes | +|------|---------| +| `pydeequ/engines/operators/scan_operators.py` | STDDEV_SAMP → STDDEV_POP | +| `pydeequ/engines/operators/profiling_operators.py` | STDDEV_SAMP → STDDEV_POP | +| `pydeequ/engines/operators/grouping_operators.py` | LOG2 → LN for entropy | +| `tests/engines/comparison/conftest.py` | Added `spark_connect_server` fixture | +| `tests/engines/comparison/utils.py` | Tolerance adjustments, JSON parsing | + +### Scala (deequ) + +| File | Changes | +|------|---------| +| `deequ/src/main/scala/com/amazon/deequ/connect/DeequRelationPlugin.scala` | Flatten metrics in `analyzerContextToDataFrame` | + +--- + +## Recommendations + +### Mark as xfail (5 tests) + +These tests should be marked with `@pytest.mark.xfail` with documented reasons: + +```python +@pytest.mark.xfail(reason="HyperLogLog implementation variance") +def test_approx_count_distinct(self, ...): + ... + +@pytest.mark.xfail(reason="T-Digest vs QUANTILE_CONT algorithm difference") +def test_approx_quantile_median(self, ...): + ... +``` + +### Future Improvements + +1. **Exact Count for Small Data**: Use `COUNT(DISTINCT)` instead of HyperLogLog when dataset size < threshold +2. **Quantile Algorithm Alignment**: Consider implementing T-Digest in DuckDB for exact parity + +--- + +## Conclusion + +The parity testing initiative achieved **94.4% test pass rate** (84/89 tests). The remaining 5 failures represent inherent algorithmic differences: + +1. **Probabilistic algorithm variance** (3 tests) - Inherent to HyperLogLog +2. **Algorithm differences** (2 tests) - T-Digest vs QUANTILE_CONT + +All major analyzers (Size, Completeness, Mean, StandardDeviation, Entropy, Correlation, Histogram, DataType, etc.) now have full parity between engines. 
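As a closing sketch of the first future improvement above (the threshold value and helper name are hypothetical):

```python
EXACT_DISTINCT_THRESHOLD = 100_000  # hypothetical cutoff; tune empirically

def distinct_count_sql(table: str, column: str, row_count: int) -> str:
    """Exact COUNT(DISTINCT) on small inputs, where HyperLogLog variance
    is proportionally largest; DuckDB's approx_count_distinct otherwise."""
    if row_count < EXACT_DISTINCT_THRESHOLD:
        return f"SELECT COUNT(DISTINCT {column}) AS distinct_count FROM {table}"
    return f"SELECT approx_count_distinct({column}) AS distinct_count FROM {table}"
```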
diff --git a/README.md b/README.md index 2d19db5..85c1714 100644 --- a/README.md +++ b/README.md @@ -6,73 +6,191 @@ PyDeequ is a Python API for [Deequ](https://github.com/awslabs/deequ), a library ## What's New in PyDeequ 2.0 -PyDeequ 2.0 introduces a new architecture using **Spark Connect**, bringing significant improvements: +PyDeequ 2.0 introduces a new multi-engine architecture with **DuckDB** and **Spark Connect** backends: | Feature | PyDeequ 1.x | PyDeequ 2.0 | |---------|-------------|-------------| -| Communication | Py4J (JVM bridge) | Spark Connect (gRPC) | +| Backends | Spark only (Py4J) | DuckDB, Spark Connect | +| JVM Required | Yes | No (DuckDB) / Yes (Spark) | | Assertions | Python lambdas | Serializable predicates | -| Spark Session | Local only | Local or Remote | -| Architecture | Tight JVM coupling | Clean client-server | +| Remote Execution | No | Yes (Spark Connect) | **Key Benefits:** -- **No Py4J dependency** - Uses Spark Connect protocol for communication +- **DuckDB backend** - Lightweight, no JVM required, perfect for local development and CI/CD +- **Spark Connect backend** - Production-scale processing with remote cluster support - **Serializable predicates** - Replace Python lambdas with predicate objects (`eq`, `gte`, `between`, etc.) -- **Remote execution** - Connect to remote Spark clusters via Spark Connect -- **Cleaner API** - Simplified imports and more Pythonic interface +- **Unified API** - Same code works with both backends ### Architecture ```mermaid -flowchart LR +flowchart TB subgraph CLIENT["Python Client"] - A["Python Code"] --> B["Protobuf
Serialization"] + A["pydeequ.connect()"] --> B["Engine Auto-Detection"] end - B -- gRPC --> C["Spark Connect (gRPC)"] - subgraph SERVER["Spark Connect Server"] - D["DeequRelationPlugin"] --> E["Deequ Core"] --> F["Spark DataFrame API"] --> G["(Data)"] + + B --> C{Connection Type} + + C -->|DuckDB| D["DuckDBEngine"] + C -->|SparkSession| E["SparkEngine"] + + subgraph DUCKDB["DuckDB Backend (Local)"] + D --> F["SQL Operators"] --> G["DuckDB"] --> H["Local Files
Parquet/CSV"] + end + + subgraph SPARK["Spark Connect Backend (Distributed)"] + E --> I["Protobuf"] -- gRPC --> J["Spark Connect Server"] + J --> K["DeequRelationPlugin"] --> L["Deequ Core"] --> M["Data Lake"] end - G --> H["Results"] -- gRPC --> I["Python DataFrame"] - %% Styling for compactness and distinction - classDef code fill:#C8F2FB,stroke:#35a7c2,color:#13505B,font-weight:bold; - class A code; + + H --> N["Results"] + M --> N + N --> O["MetricResult / ConstraintResult / ColumnProfile"] + + classDef duckdb fill:#FFF4CC,stroke:#E6B800,color:#806600; + classDef spark fill:#CCE5FF,stroke:#0066CC,color:#003366; + class D,F,G,H duckdb; + class E,I,J,K,L,M spark; ``` **How it works:** -1. **Client Side**: PyDeequ 2.0 builds checks and analyzers as Protobuf messages -2. **Transport**: Messages are sent via gRPC to the Spark Connect server -3. **Server Side**: The `DeequRelationPlugin` deserializes messages and executes Deequ operations -4. **Results**: Verification results are returned as a Spark DataFrame +- **Auto-detection**: `pydeequ.connect()` inspects the connection type and creates the appropriate engine +- **DuckDB path**: Direct SQL execution in-process, no JVM required +- **Spark path**: Protobuf serialization over gRPC to Spark Connect server with Deequ plugin +- **Unified results**: Both engines return the same `MetricResult`, `ConstraintResult`, and `ColumnProfile` types ### Feature Support Matrix -| Feature | PyDeequ 1.x | PyDeequ 2.0 | -|---------|:-----------:|:-----------:| -| **Constraint Verification** | | | -| VerificationSuite | Yes | Yes | -| Check constraints | Yes | Yes | -| Custom SQL expressions | Yes | Yes | -| **Metrics & Analysis** | | | -| AnalysisRunner | Yes | Yes | -| All standard analyzers | Yes | Yes | -| **Column Profiling** | | | -| ColumnProfilerRunner | Yes | Yes | -| Numeric statistics | Yes | Yes | -| KLL sketch profiling | Yes | Yes | -| Low-cardinality histograms | Yes | Yes | -| **Constraint Suggestions** | | | -| ConstraintSuggestionRunner | Yes | Yes | -| Rule sets (DEFAULT, EXTENDED, etc.) | Yes | Yes | -| Train/test split evaluation | Yes | Yes | -| **Metrics Repository** | | | -| FileSystemMetricsRepository | Yes | Planned | -| **Execution Mode** | | | -| Local Spark | Yes | No | -| Spark Connect (remote) | No | Yes | +| Feature | PyDeequ 1.x | PyDeequ 2.0 (DuckDB) | PyDeequ 2.0 (Spark) | +|---------|:-----------:|:--------------------:|:-------------------:| +| **Constraint Verification** | | | | +| VerificationSuite | Yes | Yes | Yes | +| Check constraints | Yes | Yes | Yes | +| Custom SQL expressions | Yes | Yes | Yes | +| **Metrics & Analysis** | | | | +| AnalysisRunner | Yes | Yes | Yes | +| All standard analyzers | Yes | Yes | Yes | +| **Column Profiling** | | | | +| ColumnProfilerRunner | Yes | Yes | Yes | +| Numeric statistics | Yes | Yes | Yes | +| KLL sketch profiling | Yes | No | Yes | +| Low-cardinality histograms | Yes | Yes | Yes | +| **Constraint Suggestions** | | | | +| ConstraintSuggestionRunner | Yes | Yes | Yes | +| Rule sets (DEFAULT, EXTENDED, etc.) | Yes | Yes | Yes | +| Train/test split evaluation | Yes | No | Yes | +| **Metrics Repository** | | | | +| FileSystemMetricsRepository | Yes | Planned | Planned | +| **Execution Environment** | | | | +| JVM Required | Yes | No | Yes | +| Local execution | Yes | Yes | Yes | +| Remote execution | No | No | Yes | + +--- + +## Installation + +PyDeequ 2.0 supports multiple backends. 
Install only what you need: + +**From PyPI (when published):** +```bash +# DuckDB backend (lightweight, no JVM required) +pip install pydeequ[duckdb] + +# Spark Connect backend (for production-scale processing) +pip install pydeequ[spark] + +# Both backends +pip install pydeequ[all] + +# Development (includes all backends + test tools) +pip install pydeequ[dev] +``` + +**From GitHub Release (beta):** +```bash +# Install beta wheel + DuckDB +pip install https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/pydeequ-2.0.0b1-py3-none-any.whl +pip install duckdb + +# For Spark backend, also install: +pip install pyspark[connect]==3.5.0 +``` + +--- + +## Quick Start with DuckDB (Recommended for Getting Started) + +The DuckDB backend is the easiest way to get started - no JVM or Spark server required. + +### Requirements +- Python 3.9+ + +### Installation + +```bash +pip install pydeequ[duckdb] +``` + +### Run Your First Check + +```python +import duckdb +import pydeequ +from pydeequ.v2.analyzers import Size, Completeness, Mean +from pydeequ.v2.checks import Check, CheckLevel +from pydeequ.v2.predicates import eq, gte + +# Create a DuckDB connection and load data +con = duckdb.connect() +con.execute(""" + CREATE TABLE users AS SELECT * FROM (VALUES + (1, 'Alice', 25), + (2, 'Bob', 30), + (3, 'Charlie', NULL) + ) AS t(id, name, age) +""") + +# Create an engine from the connection +engine = pydeequ.connect(con, table="users") + +# Run analyzers +metrics = engine.compute_metrics([ + Size(), + Completeness("id"), + Completeness("age"), + Mean("age"), +]) +print("Metrics:") +for m in metrics: + print(f" {m.name}({m.instance}): {m.value}") + +# Run constraint checks +check = (Check(CheckLevel.Error, "Data quality checks") + .hasSize(eq(3)) + .isComplete("id") + .isComplete("name") + .hasCompleteness("age", gte(0.5))) + +results = engine.run_checks([check]) +print("\nConstraint Results:") +for r in results: + print(f" {r.constraint}: {r.constraint_status}") + +# Profile columns +profiles = engine.profile_columns() +print("\nColumn Profiles:") +for p in profiles: + print(f" {p.column}: completeness={p.completeness}, distinct={p.approx_distinct_values}") + +con.close() +``` --- -## PyDeequ 2.0 Beta - Quick Start +## Quick Start with Spark Connect (Production Scale) + +For production workloads and large-scale data processing, use the Spark Connect backend. ### Requirements @@ -142,6 +260,11 @@ pip install pyspark[connect]==3.5.0 pip install setuptools ``` +Or using the extras syntax (once published to PyPI): +```bash +pip install pydeequ[spark] +``` + ### Step 5: Run Your First Check ```python @@ -444,7 +567,8 @@ The legacy PyDeequ API uses Py4J for JVM communication. It is still available fo ### Installation ```bash -pip install pydeequ +# Install with Spark backend (required for 1.x API) +pip install pydeequ[spark] ``` **Note:** Set the `SPARK_VERSION` environment variable to match your Spark version. 
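If you prefer setting it from Python, export the variable before the first `pydeequ` import — a minimal sketch, assuming (as in PyDeequ 1.x) that `SPARK_VERSION` is consulted when the Deequ Maven coordinate is resolved:

```python
import os

# Must match your installed Spark version; set before importing pydeequ.
os.environ.setdefault("SPARK_VERSION", "3.5")

import pydeequ  # noqa: E402  (imported after the variable is set)
```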
@@ -638,7 +762,14 @@ sdk install spark 3.5.0 ### Poetry ```bash -poetry install +# Install all dependencies (including dev tools and both backends) +poetry install --with dev --all-extras + +# Or install specific extras +poetry install --extras duckdb # DuckDB only +poetry install --extras spark # Spark only +poetry install --extras all # Both backends + poetry update poetry show -o ``` @@ -646,7 +777,11 @@ poetry show -o ### Running Tests Locally ```bash +# Run all tests (requires Spark Connect server for comparison tests) poetry run pytest + +# Run DuckDB-only tests (no Spark required) +poetry run pytest tests/engines/test_duckdb*.py tests/engines/test_operators.py ``` ### Running Tests (Docker) diff --git a/benchmark/__init__.py b/benchmark/__init__.py new file mode 100644 index 0000000..7e9dfac --- /dev/null +++ b/benchmark/__init__.py @@ -0,0 +1,28 @@ +"""Benchmark package for PyDeequ engine comparison.""" + +from .config import ExperimentConfig, SparkServerConfig, BenchmarkConfig +from .results import ( + ExperimentResult, + EnvironmentInfo, + BenchmarkRun, + generate_run_id, + save_results, + load_results, + collect_environment_info, +) +from .spark_server import SparkConnectServer, managed_spark_server + +__all__ = [ + "ExperimentConfig", + "SparkServerConfig", + "BenchmarkConfig", + "ExperimentResult", + "EnvironmentInfo", + "BenchmarkRun", + "generate_run_id", + "save_results", + "load_results", + "collect_environment_info", + "SparkConnectServer", + "managed_spark_server", +] diff --git a/benchmark/config.py b/benchmark/config.py new file mode 100644 index 0000000..1829876 --- /dev/null +++ b/benchmark/config.py @@ -0,0 +1,107 @@ +"""Configuration dataclasses for benchmark.""" + +import os +from dataclasses import dataclass, field, asdict +from typing import List, Optional + + +# Default experiment configurations (from original script) +DEFAULT_ROW_COUNTS = [100_000, 1_000_000, 5_000_000, 10_000_000, 50_000_000, 130_000_000] +DEFAULT_COLUMN_COUNTS = [10, 20, 40, 80] +DEFAULT_PROFILING_ROW_COUNTS = [100_000, 1_000_000, 5_000_000, 10_000_000] +DEFAULT_FIXED_ROWS = 1_000_000 +DEFAULT_BASE_COLS = 10 +DEFAULT_N_RUNS = 3 + +@dataclass +class ExperimentConfig: + """Configuration for benchmark experiments.""" + + n_runs: int = DEFAULT_N_RUNS + row_counts: List[int] = field(default_factory=lambda: DEFAULT_ROW_COUNTS.copy()) + column_counts: List[int] = field(default_factory=lambda: DEFAULT_COLUMN_COUNTS.copy()) + profiling_row_counts: List[int] = field( + default_factory=lambda: DEFAULT_PROFILING_ROW_COUNTS.copy() + ) + fixed_rows: int = DEFAULT_FIXED_ROWS + base_cols: int = DEFAULT_BASE_COLS + cache_dir: str = field( + default_factory=lambda: os.path.expanduser("~/.deequ_benchmark_data") + ) + + def to_dict(self) -> dict: + """Convert to dictionary for JSON serialization.""" + return asdict(self) + + @classmethod + def from_dict(cls, data: dict) -> "ExperimentConfig": + """Create from dictionary.""" + return cls(**data) + + +@dataclass +class SparkServerConfig: + """Configuration for Spark Connect server.""" + + java_home: str = field( + default_factory=lambda: os.environ.get( + "JAVA_HOME", + "/Library/Java/JavaVirtualMachines/amazon-corretto-17.jdk/Contents/Home", + ) + ) + spark_home: str = field( + default_factory=lambda: os.environ.get( + "SPARK_HOME", "/Volumes/workplace/deequ_rewrite/spark-3.5.0-bin-hadoop3" + ) + ) + port: int = 15002 + startup_timeout: int = 60 + poll_interval: float = 1.0 + driver_memory: str = "16g" + executor_memory: str = "16g" + deequ_jar: str = field( + 
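        # Developer-machine default; point this at your own Deequ Connect JAR
        # build or override via SparkServerConfig(deequ_jar=...).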
default_factory=lambda: "/Volumes/workplace/deequ_rewrite/deequ/target/deequ_2.12-2.1.0b-spark-3.5.jar" + ) + + def to_dict(self) -> dict: + """Convert to dictionary for JSON serialization.""" + return asdict(self) + + @classmethod + def from_dict(cls, data: dict) -> "SparkServerConfig": + """Create from dictionary.""" + return cls(**data) + + +@dataclass +class BenchmarkConfig: + """Overall benchmark configuration.""" + + engine: str = "all" # "all", "duckdb", or "spark" + output_dir: str = "benchmark_results" + experiment: ExperimentConfig = field(default_factory=ExperimentConfig) + spark_server: SparkServerConfig = field(default_factory=SparkServerConfig) + spark_remote: str = field( + default_factory=lambda: os.environ.get("SPARK_REMOTE", "sc://localhost:15002") + ) + + def to_dict(self) -> dict: + """Convert to dictionary for JSON serialization.""" + return { + "engine": self.engine, + "output_dir": self.output_dir, + "experiment": self.experiment.to_dict(), + "spark_server": self.spark_server.to_dict(), + "spark_remote": self.spark_remote, + } + + @classmethod + def from_dict(cls, data: dict) -> "BenchmarkConfig": + """Create from dictionary.""" + return cls( + engine=data.get("engine", "all"), + output_dir=data.get("output_dir", "benchmark_results"), + experiment=ExperimentConfig.from_dict(data.get("experiment", {})), + spark_server=SparkServerConfig.from_dict(data.get("spark_server", {})), + spark_remote=data.get("spark_remote", "sc://localhost:15002"), + ) diff --git a/benchmark/experiments.py b/benchmark/experiments.py new file mode 100644 index 0000000..3424807 --- /dev/null +++ b/benchmark/experiments.py @@ -0,0 +1,547 @@ +"""Benchmark experiment logic extracted from original benchmark_cli.py.""" + +import os +import time +from typing import List, Dict, Any, Optional, Tuple + +import duckdb +import numpy as np +import pandas as pd +import pydeequ + +from pydeequ.v2.verification import VerificationSuite +from pydeequ.v2.checks import Check, CheckLevel +from pydeequ.v2.predicates import gte, lte, between +from pydeequ.v2.profiles import ColumnProfilerRunner + +from .config import ExperimentConfig + + +# ============================================================================= +# Data Generation +# ============================================================================= + + +def generate_rich_data(n_rows: int, n_extra_cols: int = 0) -> pd.DataFrame: + """ + Generate mixed-type data for benchmarking with optional extra numeric columns. 
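    Uses a fixed RNG seed (42), so repeated calls produce identical data and
    cached Parquet files can be reused safely across benchmark runs.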
+ + Base schema (10 columns): + - id: string (unique identifier) + - category: string (5 categorical values) + - status: string (3 categorical values) + - email: string (email-like pattern) + - amount: float [0, 10000] + - quantity: int [0, 1000] + - score: float [0, 100] (normal distribution) + - rating: int [1, 5] + - price: float [0.01, 9999.99] + - discount: float [0, 0.5] + + Args: + n_rows: Number of rows to generate + n_extra_cols: Number of additional numeric columns + + Returns: + DataFrame with mixed-type columns + optional extra numeric columns + """ + np.random.seed(42) + + data = { + "id": [f"ID{i:012d}" for i in range(n_rows)], + "category": np.random.choice( + ["electronics", "clothing", "food", "books", "toys"], n_rows + ), + "status": np.random.choice(["active", "inactive", "pending"], n_rows), + "email": [f"user{i}@example.com" for i in range(n_rows)], + "amount": np.random.uniform(0, 10000, n_rows), + "quantity": np.random.randint(0, 1001, n_rows), + "score": np.random.normal(50, 15, n_rows).clip(0, 100), + "rating": np.random.randint(1, 6, n_rows), + "price": np.random.uniform(0.01, 9999.99, n_rows), + "discount": np.random.uniform(0, 0.5, n_rows), + } + + for i in range(n_extra_cols): + data[f"extra_{i}"] = np.random.uniform(-10000, 10000, n_rows) + + return pd.DataFrame(data) + + +def save_to_parquet(df: pd.DataFrame, cache_dir: str, name: str, target_row_groups: int = 64) -> str: + """ + Save DataFrame to Parquet with dynamic row group size for Spark parallelism. + + Args: + df: DataFrame to save + cache_dir: Cache directory path + name: Cache file name + target_row_groups: Target number of row groups + + Returns: + Path to the saved Parquet file + """ + import pyarrow as pa + import pyarrow.parquet as pq + + os.makedirs(cache_dir, exist_ok=True) + path = os.path.join(cache_dir, f"{name}.parquet") + + if not os.path.exists(path): + n_rows = len(df) + row_group_size = max(10_000, n_rows // target_row_groups) + table = pa.Table.from_pandas(df) + pq.write_table(table, path, compression="snappy", row_group_size=row_group_size) + + return path + + +def get_cached_parquet(cache_dir: str, name: str) -> Optional[str]: + """Get path to cached Parquet file, or None if not cached.""" + path = os.path.join(cache_dir, f"{name}.parquet") + return path if os.path.exists(path) else None + + +# ============================================================================= +# Check Building +# ============================================================================= + + +def build_rich_check(n_extra_cols: int = 0) -> Check: + """ + Build a Check suite with rich validations on base columns + simple checks on extras. 
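    The base suite contains 16 constraints; each extra column contributes three
    more (isComplete, hasMin, hasMax), which count_checks() mirrors.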
+ + Args: + n_extra_cols: Number of extra numeric columns to add checks for + + Returns: + Check instance with all constraints configured + """ + check = ( + Check(CheckLevel.Warning, "Rich Benchmark Check") + # Completeness checks (3) + .isComplete("id") + .isComplete("category") + .hasCompleteness("email", gte(0.95)) + # Uniqueness checks (2) + .isUnique("id") + .hasDistinctness(["category"], gte(0.001)) + # Numeric range checks (6) + .hasMin("amount", gte(0)) + .hasMax("amount", lte(10000)) + .hasMean("score", between(0, 100)) + .hasStandardDeviation("score", lte(50)) + .isNonNegative("quantity") + .isPositive("price") + # String checks (5) + .hasMinLength("id", gte(8)) + .hasMaxLength("id", lte(20)) + .hasPattern("email", r".*@.*\..*", gte(0.9)) + .isContainedIn("status", ["active", "inactive", "pending"]) + .isContainedIn("rating", ["1", "2", "3", "4", "5"]) + ) + + for i in range(n_extra_cols): + col = f"extra_{i}" + check = check.isComplete(col).hasMin(col, gte(-10000)).hasMax(col, lte(10000)) + + return check + + +def count_checks(n_extra_cols: int = 0) -> int: + """Return total number of checks for given extra columns.""" + base_checks = 16 + extra_checks = n_extra_cols * 3 + return base_checks + extra_checks + + +# ============================================================================= +# DuckDB Setup and Benchmarking +# ============================================================================= + + +def setup_duckdb_from_parquet(parquet_path: str) -> Tuple[Any, duckdb.DuckDBPyConnection]: + """Setup DuckDB engine to read from Parquet file.""" + con = duckdb.connect() + engine = pydeequ.connect(con, table=f"read_parquet('{parquet_path}')") + return engine, con + + +def setup_duckdb_for_profiling(parquet_path: str) -> Tuple[Any, duckdb.DuckDBPyConnection]: + """ + Setup DuckDB engine for profiling by creating a view from parquet. + This is needed because PRAGMA table_info() doesn't work with read_parquet(). + """ + con = duckdb.connect() + con.execute(f"CREATE VIEW benchmark_data AS SELECT * FROM read_parquet('{parquet_path}')") + engine = pydeequ.connect(con, table="benchmark_data") + return engine, con + + +def benchmark_duckdb_validation(engine: Any, check: Check, n_runs: int) -> float: + """Time DuckDB VerificationSuite.run() over N runs, return average.""" + times = [] + for _ in range(n_runs): + start = time.perf_counter() + result = VerificationSuite().on_engine(engine).addCheck(check).run() + _ = len(result) + elapsed = time.perf_counter() - start + times.append(elapsed) + return sum(times) / len(times) + + +def benchmark_duckdb_profiling(engine: Any, n_runs: int) -> float: + """Time DuckDB ColumnProfilerRunner.run() over N runs, return average.""" + times = [] + for _ in range(n_runs): + start = time.perf_counter() + result = ColumnProfilerRunner().on_engine(engine).run() + _ = len(result) + elapsed = time.perf_counter() - start + times.append(elapsed) + return sum(times) / len(times) + + +# ============================================================================= +# Spark Setup and Benchmarking +# ============================================================================= + + +def setup_spark(spark_remote: str) -> Tuple[Any, float]: + """Create SparkSession for Spark Connect. 
Returns (spark, startup_time).""" + from pyspark.sql import SparkSession + + start = time.perf_counter() + spark = SparkSession.builder.remote(spark_remote).getOrCreate() + startup_time = time.perf_counter() - start + + return spark, startup_time + + +def load_spark_from_parquet(spark: Any, parquet_path: str) -> Tuple[Any, float]: + """Load Parquet file into Spark. Returns (spark_df, load_time).""" + start = time.perf_counter() + spark_df = spark.read.parquet(parquet_path) + spark_df.count() # Force materialization + load_time = time.perf_counter() - start + + return spark_df, load_time + + +def benchmark_spark_validation(spark: Any, spark_df: Any, check: Check, n_runs: int) -> float: + """Time Spark VerificationSuite.run() over N runs, return average.""" + times = [] + for _ in range(n_runs): + start = time.perf_counter() + result = VerificationSuite(spark).onData(spark_df).addCheck(check).run() + _ = result.collect() + elapsed = time.perf_counter() - start + times.append(elapsed) + return sum(times) / len(times) + + +def benchmark_spark_profiling(spark: Any, spark_df: Any, n_runs: int) -> float: + """Time Spark ColumnProfilerRunner.run() over N runs, return average.""" + times = [] + for _ in range(n_runs): + start = time.perf_counter() + result = ColumnProfilerRunner(spark).onData(spark_df).run() + _ = result.collect() + elapsed = time.perf_counter() - start + times.append(elapsed) + return sum(times) / len(times) + + +# ============================================================================= +# DuckDB Experiment Runners +# ============================================================================= + + +def run_varying_rows_experiment_duckdb(config: ExperimentConfig) -> List[Dict[str, Any]]: + """Run varying rows experiment for DuckDB engine.""" + print("\n" + "=" * 70) + print(f"EXPERIMENT 1 (DuckDB): VARYING ROWS (Fixed Columns = {config.base_cols})") + print("=" * 70) + + results = [] + + for n_rows in config.row_counts: + n_checks = count_checks(0) + print(f"\n--- {n_rows:,} rows x {config.base_cols} cols ({n_checks} checks) ---") + + cache_name = f"rich_rows_{n_rows}" + parquet_path = get_cached_parquet(config.cache_dir, cache_name) + + if parquet_path: + print(f"Using cached data: {parquet_path}") + else: + print("Generating rich mixed-type data and saving to Parquet...") + df = generate_rich_data(n_rows, n_extra_cols=0) + parquet_path = save_to_parquet(df, config.cache_dir, cache_name) + del df + + check = build_rich_check(n_extra_cols=0) + + print("Setting up DuckDB (from Parquet)...") + duck_engine, duck_con = setup_duckdb_from_parquet(parquet_path) + + print(f"Running DuckDB validation ({config.n_runs} runs)...") + duck_validation = benchmark_duckdb_validation(duck_engine, check, config.n_runs) + print(f" DuckDB Validation: {duck_validation:.3f}s (avg)") + + duck_con.close() + + results.append({ + "rows": n_rows, + "cols": config.base_cols, + "checks": n_checks, + "duckdb_validation": duck_validation, + }) + + return results + + +def run_varying_cols_experiment_duckdb(config: ExperimentConfig) -> List[Dict[str, Any]]: + """Run varying columns experiment for DuckDB engine.""" + print("\n" + "=" * 70) + print(f"EXPERIMENT 2 (DuckDB): VARYING COLUMNS (Fixed Rows = {config.fixed_rows:,})") + print("=" * 70) + + results = [] + + for n_cols in config.column_counts: + n_extra_cols = n_cols - config.base_cols + n_checks = count_checks(n_extra_cols) + print(f"\n--- {config.fixed_rows:,} rows x {n_cols} cols ({n_checks} checks) ---") + + cache_name = f"rich_cols_{n_cols}" + 
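        # Cache key depends only on the column count; the row count is fixed at config.fixed_rows.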
parquet_path = get_cached_parquet(config.cache_dir, cache_name) + + if parquet_path: + print(f"Using cached data: {parquet_path}") + else: + print("Generating rich mixed-type data and saving to Parquet...") + df = generate_rich_data(config.fixed_rows, n_extra_cols=n_extra_cols) + parquet_path = save_to_parquet(df, config.cache_dir, cache_name) + del df + + check = build_rich_check(n_extra_cols=n_extra_cols) + + print("Setting up DuckDB (from Parquet)...") + duck_engine, duck_con = setup_duckdb_from_parquet(parquet_path) + + print(f"Running DuckDB validation ({config.n_runs} runs)...") + duck_validation = benchmark_duckdb_validation(duck_engine, check, config.n_runs) + print(f" DuckDB Validation: {duck_validation:.3f}s (avg)") + + duck_con.close() + + results.append({ + "rows": config.fixed_rows, + "cols": n_cols, + "checks": n_checks, + "duckdb_validation": duck_validation, + }) + + return results + + +def run_profiling_experiment_duckdb(config: ExperimentConfig) -> List[Dict[str, Any]]: + """Run column profiling experiment for DuckDB engine.""" + print("\n" + "=" * 70) + print(f"EXPERIMENT 3 (DuckDB): COLUMN PROFILING (Fixed Columns = {config.base_cols})") + print("=" * 70) + + results = [] + + for n_rows in config.profiling_row_counts: + print(f"\n--- {n_rows:,} rows x {config.base_cols} cols (profiling) ---") + + cache_name = f"rich_rows_{n_rows}" + parquet_path = get_cached_parquet(config.cache_dir, cache_name) + + if parquet_path: + print(f"Using cached data: {parquet_path}") + else: + print("Generating rich mixed-type data and saving to Parquet...") + df = generate_rich_data(n_rows, n_extra_cols=0) + parquet_path = save_to_parquet(df, config.cache_dir, cache_name) + del df + + print("Setting up DuckDB for profiling...") + duck_engine, duck_con = setup_duckdb_for_profiling(parquet_path) + + print(f"Running DuckDB profiling ({config.n_runs} runs)...") + duck_profiling = benchmark_duckdb_profiling(duck_engine, config.n_runs) + print(f" DuckDB Profiling: {duck_profiling:.3f}s (avg)") + + duck_con.close() + + results.append({ + "rows": n_rows, + "cols": config.base_cols, + "duckdb_profiling": duck_profiling, + }) + + return results + + +# ============================================================================= +# Spark Experiment Runners +# ============================================================================= + + +def run_varying_rows_experiment_spark( + spark: Any, spark_startup_time: float, config: ExperimentConfig +) -> List[Dict[str, Any]]: + """Run varying rows experiment for Spark engine.""" + print("\n" + "=" * 70) + print(f"EXPERIMENT 1 (Spark): VARYING ROWS (Fixed Columns = {config.base_cols})") + print("=" * 70) + + results = [] + + for n_rows in config.row_counts: + n_checks = count_checks(0) + print(f"\n--- {n_rows:,} rows x {config.base_cols} cols ({n_checks} checks) ---") + + cache_name = f"rich_rows_{n_rows}" + parquet_path = get_cached_parquet(config.cache_dir, cache_name) + + if parquet_path: + print(f"Using cached data: {parquet_path}") + else: + print("Generating rich mixed-type data and saving to Parquet...") + df = generate_rich_data(n_rows, n_extra_cols=0) + parquet_path = save_to_parquet(df, config.cache_dir, cache_name) + del df + + check = build_rich_check(n_extra_cols=0) + + spark_load = None + spark_validation = None + + try: + print("Loading Parquet into Spark...") + spark_df, spark_load = load_spark_from_parquet(spark, parquet_path) + print(f" Spark Data Load: {spark_load:.3f}s") + + print(f"Running Spark validation ({config.n_runs} runs)...") + 
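            # Reported validation time is the n_runs average only; server
            # startup and data load are recorded separately.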
spark_validation = benchmark_spark_validation(spark, spark_df, check, config.n_runs) + print(f" Spark Validation: {spark_validation:.3f}s (avg)") + except Exception as e: + print(f" Spark error: {str(e)[:80]}") + + results.append({ + "rows": n_rows, + "cols": config.base_cols, + "checks": n_checks, + "spark_startup": spark_startup_time, + "spark_load": spark_load, + "spark_validation": spark_validation, + }) + + return results + + +def run_varying_cols_experiment_spark( + spark: Any, spark_startup_time: float, config: ExperimentConfig +) -> List[Dict[str, Any]]: + """Run varying columns experiment for Spark engine.""" + print("\n" + "=" * 70) + print(f"EXPERIMENT 2 (Spark): VARYING COLUMNS (Fixed Rows = {config.fixed_rows:,})") + print("=" * 70) + + results = [] + + for n_cols in config.column_counts: + n_extra_cols = n_cols - config.base_cols + n_checks = count_checks(n_extra_cols) + print(f"\n--- {config.fixed_rows:,} rows x {n_cols} cols ({n_checks} checks) ---") + + cache_name = f"rich_cols_{n_cols}" + parquet_path = get_cached_parquet(config.cache_dir, cache_name) + + if parquet_path: + print(f"Using cached data: {parquet_path}") + else: + print("Generating rich mixed-type data and saving to Parquet...") + df = generate_rich_data(config.fixed_rows, n_extra_cols=n_extra_cols) + parquet_path = save_to_parquet(df, config.cache_dir, cache_name) + del df + + check = build_rich_check(n_extra_cols=n_extra_cols) + + spark_load = None + spark_validation = None + + try: + print("Loading Parquet into Spark...") + spark_df, spark_load = load_spark_from_parquet(spark, parquet_path) + print(f" Spark Data Load: {spark_load:.3f}s") + + print(f"Running Spark validation ({config.n_runs} runs)...") + spark_validation = benchmark_spark_validation(spark, spark_df, check, config.n_runs) + print(f" Spark Validation: {spark_validation:.3f}s (avg)") + except Exception as e: + print(f" Spark error: {str(e)[:80]}") + + results.append({ + "rows": config.fixed_rows, + "cols": n_cols, + "checks": n_checks, + "spark_startup": spark_startup_time, + "spark_load": spark_load, + "spark_validation": spark_validation, + }) + + return results + + +def run_profiling_experiment_spark( + spark: Any, spark_startup_time: float, config: ExperimentConfig +) -> List[Dict[str, Any]]: + """Run column profiling experiment for Spark engine.""" + print("\n" + "=" * 70) + print(f"EXPERIMENT 3 (Spark): COLUMN PROFILING (Fixed Columns = {config.base_cols})") + print("=" * 70) + + results = [] + + for n_rows in config.profiling_row_counts: + print(f"\n--- {n_rows:,} rows x {config.base_cols} cols (profiling) ---") + + cache_name = f"rich_rows_{n_rows}" + parquet_path = get_cached_parquet(config.cache_dir, cache_name) + + if parquet_path: + print(f"Using cached data: {parquet_path}") + else: + print("Generating rich mixed-type data and saving to Parquet...") + df = generate_rich_data(n_rows, n_extra_cols=0) + parquet_path = save_to_parquet(df, config.cache_dir, cache_name) + del df + + spark_load = None + spark_profiling = None + + try: + print("Loading Parquet into Spark...") + spark_df, spark_load = load_spark_from_parquet(spark, parquet_path) + print(f" Spark Data Load: {spark_load:.3f}s") + + print(f"Running Spark profiling ({config.n_runs} runs)...") + spark_profiling = benchmark_spark_profiling(spark, spark_df, config.n_runs) + print(f" Spark Profiling: {spark_profiling:.3f}s (avg)") + except Exception as e: + print(f" Spark error: {str(e)[:80]}") + + results.append({ + "rows": n_rows, + "cols": config.base_cols, + "spark_startup": 
spark_startup_time, + "spark_load": spark_load, + "spark_profiling": spark_profiling, + }) + + return results diff --git a/benchmark/report.py b/benchmark/report.py new file mode 100644 index 0000000..48b4366 --- /dev/null +++ b/benchmark/report.py @@ -0,0 +1,210 @@ +"""Markdown report generation for benchmark results.""" + +import os +from typing import Optional + +from .results import BenchmarkRun +from .experiments import count_checks + + +def format_value(value: Optional[float], precision: int = 3) -> str: + """Format a numeric value or return 'N/A' if None.""" + if value is None: + return "N/A" + return f"{value:.{precision}f}" + + +def calculate_speedup(spark_time: Optional[float], duckdb_time: Optional[float]) -> str: + """Calculate speedup ratio (spark/duckdb) or return 'N/A'.""" + if spark_time is None or duckdb_time is None or duckdb_time <= 0: + return "N/A" + return f"{spark_time / duckdb_time:.1f}x" + + +def generate_markdown_report(run: BenchmarkRun) -> str: + """ + Generate a markdown report from benchmark results. + + Args: + run: BenchmarkRun instance with results + + Returns: + Markdown string + """ + # Extract config values + config = run.config + exp_config = config.get("experiment", {}) + n_runs = exp_config.get("n_runs", 3) + base_cols = exp_config.get("base_cols", 10) + fixed_rows = exp_config.get("fixed_rows", 1_000_000) + row_counts = exp_config.get("row_counts", []) + column_counts = exp_config.get("column_counts", []) + + spark_startup = run.spark_startup_time or 0.0 + + report = f"""# PyDeequ Engine Benchmark Results + +## Run Information + +| Field | Value | +|-------|-------| +| Run ID | `{run.run_id}` | +| Timestamp | {run.timestamp} | +| Engine | {run.engine} | +| Total Duration | {format_value(run.total_duration_seconds, 1)}s | + +## Environment + +| Component | Version | +|-----------|---------| +| Python | {run.environment.python_version} | +| Platform | {run.environment.platform_system} {run.environment.platform_release} ({run.environment.platform_machine}) | +| CPU Count | {run.environment.cpu_count} | +| DuckDB | {run.environment.duckdb_version or 'N/A'} | +| PySpark | {run.environment.pyspark_version or 'N/A'} | +| PyDeequ | {run.environment.pydeequ_version or 'N/A'} | +| Pandas | {run.environment.pandas_version or 'N/A'} | +| NumPy | {run.environment.numpy_version or 'N/A'} | +| PyArrow | {run.environment.pyarrow_version or 'N/A'} | + +## Methodology + +Based on duckdq-exp experiments: + +- **Data Source**: Both engines read from the same Parquet files +- **Rich Dataset**: Mixed-type columns (strings + numerics) with realistic data patterns +- **Validation Runs**: {n_runs} iterations, reporting average +- **Base Checks**: {count_checks(0)} rich checks on {base_cols} mixed-type columns + +### Rich Dataset Schema ({base_cols} base columns) + +| Column | Type | Description | +|--------|------|-------------| +| `id` | string | Unique identifier (ID000000000000) | +| `category` | string | Categorical (5 values) | +| `status` | string | Categorical (3 values) | +| `email` | string | Email pattern | +| `amount` | float | Numeric value [0, 10000] | +| `quantity` | int | Non-negative integer [0, 1000] | +| `score` | float | Normal distribution [0, 100] | +| `rating` | int | Star rating [1, 5] | +| `price` | float | Positive numeric [0.01, 9999.99] | +| `discount` | float | Percentage [0, 0.5] | + +## Experiment 1: Varying Rows (Fixed Columns = {base_cols}, {count_checks(0)} checks) + +| Rows | Cols | Checks | DuckDB (s) | Spark (s) | Speedup | 
+|------|------|--------|------------|-----------|---------| +""" + + for r in run.varying_rows_results: + duck_s = r.get("duckdb_validation") + spark_s = r.get("spark_validation") + checks = r.get("checks", count_checks(0)) + speedup = calculate_speedup(spark_s, duck_s) + report += f"| {r['rows']:,} | {r['cols']} | {checks} | {format_value(duck_s)} | {format_value(spark_s)} | {speedup} |\n" + + report += f""" +## Experiment 2: Varying Columns (Fixed Rows = {fixed_rows:,}) + +Column counts: {column_counts} (base {base_cols} mixed-type + extra numeric columns) + +| Rows | Cols | Checks | DuckDB (s) | Spark (s) | Speedup | +|------|------|--------|------------|-----------|---------| +""" + + for r in run.varying_cols_results: + duck_s = r.get("duckdb_validation") + spark_s = r.get("spark_validation") + checks = r.get("checks", "N/A") + speedup = calculate_speedup(spark_s, duck_s) + report += f"| {r['rows']:,} | {r['cols']} | {checks} | {format_value(duck_s)} | {format_value(spark_s)} | {speedup} |\n" + + report += f""" +## Experiment 3: Column Profiling (Fixed Columns = {base_cols}) + +Uses `ColumnProfilerRunner` to profile all columns. + +| Rows | Cols | DuckDB (s) | Spark (s) | Speedup | +|------|------|------------|-----------|---------| +""" + + for r in run.profiling_results: + duck_s = r.get("duckdb_profiling") + spark_s = r.get("spark_profiling") + speedup = calculate_speedup(spark_s, duck_s) + report += f"| {r['rows']:,} | {r['cols']} | {format_value(duck_s)} | {format_value(spark_s)} | {speedup} |\n" + + report += f""" +## Timing Details + +### Spark Overhead (Excluded from Validation Time) + +| Phase | Time (s) | +|-------|----------| +| Startup (SparkSession) | {format_value(spark_startup)} | + +**Note**: Data load time varies per experiment and is not included in validation/profiling time. + +## Key Findings + +1. **DuckDB is significantly faster** for single-node data quality validation +2. **No JVM overhead**: DuckDB runs natively in Python process +3. **Rich type support**: Both engines handle mixed string/numeric data effectively +4. **Parquet files**: Both engines read from the same files, eliminating gRPC serialization bottleneck +5. **Column profiling**: Full profiling available on both engines + +## Running the Benchmark + +```bash +# Run DuckDB only (no Spark server needed) +python benchmark_cli.py run --engine duckdb + +# Run Spark only (auto-spark is enabled by default) +python benchmark_cli.py run --engine spark + +# Run both engines +python benchmark_cli.py run --engine all + +# Generate report from saved results (folder or file path) +python benchmark_cli.py report benchmark_results/{run.run_id}/ +python benchmark_cli.py report benchmark_results/{run.run_id}/results.json + +# Generate PNG visualization +python benchmark_cli.py visualize benchmark_results/{run.run_id}/ +``` + +## Notes + +- Both engines read from the same Parquet files, ensuring fair comparison +- Memory configuration (16GB+) prevents OOM errors for large datasets +- For distributed workloads across multiple nodes, Spark scales horizontally +- DuckDB is optimized for single-node analytical workloads +""" + + if run.errors: + report += "\n## Errors\n\n" + for error in run.errors: + report += f"- {error}\n" + + return report + + +def save_report(run: BenchmarkRun, output_path: str) -> str: + """ + Generate and save markdown report. 
+ + Args: + run: BenchmarkRun instance + output_path: Path to save the report + + Returns: + Path to the saved report + """ + report = generate_markdown_report(run) + + os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True) + with open(output_path, "w") as f: + f.write(report) + + return output_path diff --git a/benchmark/results.py b/benchmark/results.py new file mode 100644 index 0000000..dc4e179 --- /dev/null +++ b/benchmark/results.py @@ -0,0 +1,286 @@ +"""Results dataclasses and JSON I/O for benchmark.""" + +import json +import os +import platform +from dataclasses import dataclass, field, asdict +from datetime import datetime +from typing import List, Optional, Dict, Any + + +@dataclass +class ExperimentResult: + """Result from a single experiment run.""" + + rows: int + cols: int + checks: Optional[int] = None + duckdb_validation: Optional[float] = None + duckdb_profiling: Optional[float] = None + spark_startup: Optional[float] = None + spark_load: Optional[float] = None + spark_validation: Optional[float] = None + spark_profiling: Optional[float] = None + error: Optional[str] = None + + def to_dict(self) -> dict: + """Convert to dictionary, excluding None values.""" + return {k: v for k, v in asdict(self).items() if v is not None} + + @classmethod + def from_dict(cls, data: dict) -> "ExperimentResult": + """Create from dictionary.""" + return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__}) + + +@dataclass +class EnvironmentInfo: + """Environment information for reproducibility.""" + + python_version: str = "" + platform_system: str = "" + platform_release: str = "" + platform_machine: str = "" + cpu_count: int = 0 + duckdb_version: str = "" + pyspark_version: str = "" + pydeequ_version: str = "" + pandas_version: str = "" + numpy_version: str = "" + pyarrow_version: str = "" + + def to_dict(self) -> dict: + """Convert to dictionary.""" + return asdict(self) + + @classmethod + def from_dict(cls, data: dict) -> "EnvironmentInfo": + """Create from dictionary.""" + return cls(**{k: v for k, v in data.items() if k in cls.__dataclass_fields__}) + + +@dataclass +class BenchmarkRun: + """Complete benchmark run with all results.""" + + run_id: str + timestamp: str + engine: str + config: Dict[str, Any] = field(default_factory=dict) + environment: EnvironmentInfo = field(default_factory=EnvironmentInfo) + varying_rows_results: List[Dict[str, Any]] = field(default_factory=list) + varying_cols_results: List[Dict[str, Any]] = field(default_factory=list) + profiling_results: List[Dict[str, Any]] = field(default_factory=list) + spark_startup_time: Optional[float] = None + total_duration_seconds: Optional[float] = None + errors: List[str] = field(default_factory=list) + + def to_dict(self) -> dict: + """Convert to dictionary for JSON serialization.""" + return { + "run_id": self.run_id, + "timestamp": self.timestamp, + "engine": self.engine, + "config": self.config, + "environment": self.environment.to_dict(), + "varying_rows_results": self.varying_rows_results, + "varying_cols_results": self.varying_cols_results, + "profiling_results": self.profiling_results, + "spark_startup_time": self.spark_startup_time, + "total_duration_seconds": self.total_duration_seconds, + "errors": self.errors, + } + + @classmethod + def from_dict(cls, data: dict) -> "BenchmarkRun": + """Create from dictionary.""" + return cls( + run_id=data.get("run_id", ""), + timestamp=data.get("timestamp", ""), + engine=data.get("engine", ""), + config=data.get("config", {}), + 
environment=EnvironmentInfo.from_dict(data.get("environment", {})), + varying_rows_results=data.get("varying_rows_results", []), + varying_cols_results=data.get("varying_cols_results", []), + profiling_results=data.get("profiling_results", []), + spark_startup_time=data.get("spark_startup_time"), + total_duration_seconds=data.get("total_duration_seconds"), + errors=data.get("errors", []), + ) + + +def generate_run_id() -> str: + """Generate a unique run ID with timestamp.""" + return f"benchmark_{datetime.now().strftime('%Y-%m-%dT%H-%M-%S')}" + + +def save_results(run: BenchmarkRun, run_dir: str) -> str: + """ + Save benchmark results to JSON file in run directory. + + Args: + run: BenchmarkRun instance + run_dir: Directory for this benchmark run (e.g., benchmark_results/benchmark_2024-01-19T14-30-45/) + + Returns: + Path to the saved JSON file + """ + os.makedirs(run_dir, exist_ok=True) + path = os.path.join(run_dir, "results.json") + + with open(path, "w") as f: + json.dump(run.to_dict(), f, indent=2) + + return path + + +def load_results(path: str) -> BenchmarkRun: + """ + Load benchmark results from JSON file. + + Args: + path: Path to JSON file or run directory containing results.json + + Returns: + BenchmarkRun instance + """ + # If path is a directory, look for results.json inside + if os.path.isdir(path): + path = os.path.join(path, "results.json") + + with open(path) as f: + data = json.load(f) + return BenchmarkRun.from_dict(data) + + +def collect_environment_info() -> EnvironmentInfo: + """Collect environment information for reproducibility.""" + info = EnvironmentInfo( + python_version=platform.python_version(), + platform_system=platform.system(), + platform_release=platform.release(), + platform_machine=platform.machine(), + cpu_count=os.cpu_count() or 0, + ) + + # Try to get package versions + try: + import duckdb + + info.duckdb_version = duckdb.__version__ + except (ImportError, AttributeError): + pass + + try: + import pyspark + + info.pyspark_version = pyspark.__version__ + except (ImportError, AttributeError): + pass + + try: + import pydeequ + + info.pydeequ_version = getattr(pydeequ, "__version__", "unknown") + except (ImportError, AttributeError): + pass + + try: + import pandas + + info.pandas_version = pandas.__version__ + except (ImportError, AttributeError): + pass + + try: + import numpy + + info.numpy_version = numpy.__version__ + except (ImportError, AttributeError): + pass + + try: + import pyarrow + + info.pyarrow_version = pyarrow.__version__ + except (ImportError, AttributeError): + pass + + return info + + +def merge_results(duckdb_run: Optional[BenchmarkRun], spark_run: Optional[BenchmarkRun]) -> BenchmarkRun: + """ + Merge results from separate DuckDB and Spark runs into a single combined result. 
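+
+    Experiment rows are keyed by (rows, cols); when both runs contain an
+    entry for the same key, non-None fields from the later run fill in the
+    earlier entry, so the DuckDB and Spark timings for one experiment end
+    up in a single merged row.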
+ + Args: + duckdb_run: Results from DuckDB worker (may be None) + spark_run: Results from Spark worker (may be None) + + Returns: + Combined BenchmarkRun + """ + # Use whichever run is available as the base + base = duckdb_run or spark_run + if base is None: + raise ValueError("At least one run must be provided") + + merged = BenchmarkRun( + run_id=base.run_id, + timestamp=base.timestamp, + engine="all" if duckdb_run and spark_run else base.engine, + config=base.config, + environment=base.environment, + errors=base.errors.copy(), + ) + + # Merge varying rows results + rows_by_key = {} + for run in [duckdb_run, spark_run]: + if run: + for r in run.varying_rows_results: + key = (r.get("rows"), r.get("cols")) + if key not in rows_by_key: + rows_by_key[key] = r.copy() + else: + rows_by_key[key].update({k: v for k, v in r.items() if v is not None}) + merged.errors.extend(e for e in run.errors if e not in merged.errors) + merged.varying_rows_results = list(rows_by_key.values()) + + # Merge varying cols results + cols_by_key = {} + for run in [duckdb_run, spark_run]: + if run: + for r in run.varying_cols_results: + key = (r.get("rows"), r.get("cols")) + if key not in cols_by_key: + cols_by_key[key] = r.copy() + else: + cols_by_key[key].update({k: v for k, v in r.items() if v is not None}) + merged.varying_cols_results = list(cols_by_key.values()) + + # Merge profiling results + prof_by_key = {} + for run in [duckdb_run, spark_run]: + if run: + for r in run.profiling_results: + key = (r.get("rows"), r.get("cols")) + if key not in prof_by_key: + prof_by_key[key] = r.copy() + else: + prof_by_key[key].update({k: v for k, v in r.items() if v is not None}) + merged.profiling_results = list(prof_by_key.values()) + + # Take Spark startup time from Spark run + if spark_run and spark_run.spark_startup_time: + merged.spark_startup_time = spark_run.spark_startup_time + + # Sum total durations + total = 0.0 + if duckdb_run and duckdb_run.total_duration_seconds: + total += duckdb_run.total_duration_seconds + if spark_run and spark_run.total_duration_seconds: + total += spark_run.total_duration_seconds + merged.total_duration_seconds = total if total > 0 else None + + return merged diff --git a/benchmark/spark_server.py b/benchmark/spark_server.py new file mode 100644 index 0000000..eac0020 --- /dev/null +++ b/benchmark/spark_server.py @@ -0,0 +1,197 @@ +"""Spark Connect server management for benchmarks.""" + +import os +import signal +import socket +import subprocess +import time +from contextlib import contextmanager +from typing import Optional + +from .config import SparkServerConfig + + +class SparkConnectServer: + """Manages Spark Connect server lifecycle.""" + + def __init__(self, config: Optional[SparkServerConfig] = None): + """ + Initialize Spark Connect server manager. + + Args: + config: Server configuration (uses defaults if not provided) + """ + self.config = config or SparkServerConfig() + self._process: Optional[subprocess.Popen] = None + self._started_by_us = False + + def is_running(self) -> bool: + """Check if Spark Connect server is running by attempting to connect.""" + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(1) + result = sock.connect_ex(("localhost", self.config.port)) + sock.close() + return result == 0 + except (socket.error, OSError): + return False + + def start(self) -> float: + """ + Start Spark Connect server if not already running. 
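+
+        The server is launched via sbin/start-connect-server.sh with the
+        Deequ jar and Connect plugin configured, then polled over TCP until
+        it accepts connections or startup_timeout elapses.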
+ + Returns: + Time taken to start the server (0 if already running) + + Raises: + RuntimeError: If server fails to start within timeout + """ + if self.is_running(): + print(f"Spark Connect server already running on port {self.config.port}") + return 0.0 + + start_time = time.time() + + # Build the startup command + start_script = os.path.join(self.config.spark_home, "sbin", "start-connect-server.sh") + + if not os.path.exists(start_script): + raise RuntimeError(f"Spark Connect start script not found: {start_script}") + + cmd = [ + start_script, + "--conf", f"spark.driver.memory={self.config.driver_memory}", + "--conf", f"spark.executor.memory={self.config.executor_memory}", + "--packages", "org.apache.spark:spark-connect_2.12:3.5.0", + "--jars", self.config.deequ_jar, + "--conf", "spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin", + ] + + # Set up environment + env = os.environ.copy() + env["JAVA_HOME"] = self.config.java_home + env["SPARK_HOME"] = self.config.spark_home + + print(f"Starting Spark Connect server on port {self.config.port}...") + print(f" JAVA_HOME: {self.config.java_home}") + print(f" SPARK_HOME: {self.config.spark_home}") + + # Start the server + self._process = subprocess.Popen( + cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + self._started_by_us = True + + # Wait for server to be ready + deadline = time.time() + self.config.startup_timeout + while time.time() < deadline: + if self.is_running(): + elapsed = time.time() - start_time + print(f"Spark Connect server started in {elapsed:.1f}s") + return elapsed + time.sleep(self.config.poll_interval) + + # Timeout - try to get error output + if self._process: + self._process.terminate() + _, stderr = self._process.communicate(timeout=5) + error_msg = stderr.decode() if stderr else "Unknown error" + self._process = None + self._started_by_us = False + raise RuntimeError( + f"Spark Connect server failed to start within {self.config.startup_timeout}s: {error_msg[:500]}" + ) + + raise RuntimeError( + f"Spark Connect server failed to start within {self.config.startup_timeout}s" + ) + + def stop(self) -> None: + """Stop Spark Connect server if we started it.""" + if not self._started_by_us: + print("Spark Connect server was not started by us, skipping stop") + return + + stop_script = os.path.join(self.config.spark_home, "sbin", "stop-connect-server.sh") + + if os.path.exists(stop_script): + print("Stopping Spark Connect server...") + env = os.environ.copy() + env["JAVA_HOME"] = self.config.java_home + env["SPARK_HOME"] = self.config.spark_home + + try: + subprocess.run( + [stop_script], + env=env, + timeout=30, + capture_output=True, + ) + print("Spark Connect server stopped") + except subprocess.TimeoutExpired: + print("Warning: stop script timed out") + except Exception as e: + print(f"Warning: Error stopping server: {e}") + else: + # Fall back to killing the process directly + if self._process: + print("Terminating Spark Connect server process...") + self._process.terminate() + try: + self._process.wait(timeout=10) + except subprocess.TimeoutExpired: + self._process.kill() + print("Spark Connect server process terminated") + + self._started_by_us = False + self._process = None + + +@contextmanager +def managed_spark_server(config: Optional[SparkServerConfig] = None): + """ + Context manager for Spark Connect server with signal handling. + + Ensures the server is stopped on exit, including on SIGINT/SIGTERM. 
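+
+    Example (a minimal sketch, mirroring how benchmark_cli.py uses it):
+
+        with managed_spark_server(SparkServerConfig()) as server:
+            server.start()
+            # ... run Spark benchmarks while the server is up ...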
+ + Args: + config: Server configuration + + Yields: + SparkConnectServer instance + """ + server = SparkConnectServer(config) + original_sigint = signal.getsignal(signal.SIGINT) + original_sigterm = signal.getsignal(signal.SIGTERM) + + def signal_handler(signum, frame): + """Handle interrupt signals by stopping the server.""" + print(f"\nReceived signal {signum}, stopping Spark server...") + server.stop() + # Re-raise the signal to trigger default behavior + if signum == signal.SIGINT: + signal.signal(signal.SIGINT, original_sigint) + if callable(original_sigint): + original_sigint(signum, frame) + elif signum == signal.SIGTERM: + signal.signal(signal.SIGTERM, original_sigterm) + if callable(original_sigterm): + original_sigterm(signum, frame) + + try: + # Install signal handlers + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + yield server + + finally: + # Restore original signal handlers + signal.signal(signal.SIGINT, original_sigint) + signal.signal(signal.SIGTERM, original_sigterm) + + # Stop the server + server.stop() diff --git a/benchmark/visualize.py b/benchmark/visualize.py new file mode 100644 index 0000000..f239fa9 --- /dev/null +++ b/benchmark/visualize.py @@ -0,0 +1,288 @@ +""" +Benchmark Visualization for PyDeequ Engine Comparison. + +Generates PNG charts comparing DuckDB vs Spark performance from benchmark results. +""" + +import os +from typing import List, Dict, Any, Optional + +import matplotlib.pyplot as plt +import numpy as np + +from .results import BenchmarkRun + + +def _format_row_count(n: int) -> str: + """Format row count for display (e.g., 1000000 -> '1M').""" + if n >= 1_000_000: + return f"{n // 1_000_000}M" + elif n >= 1_000: + return f"{n // 1_000}K" + return str(n) + + +def _extract_validation_data( + results: List[Dict[str, Any]], x_key: str +) -> Dict[str, List]: + """Extract validation timing data from results.""" + data = { + "x_values": [], + "x_labels": [], + "duckdb": [], + "spark": [], + "checks": [], + } + + for r in sorted(results, key=lambda x: x.get(x_key, 0)): + x_val = r.get(x_key) + if x_val is None: + continue + + data["x_values"].append(x_val) + if x_key == "rows": + data["x_labels"].append(_format_row_count(x_val)) + else: + checks = r.get("checks", "") + data["x_labels"].append(f"{x_val}\n({checks})" if checks else str(x_val)) + + data["duckdb"].append(r.get("duckdb_validation")) + data["spark"].append(r.get("spark_validation")) + data["checks"].append(r.get("checks")) + + return data + + +def _extract_profiling_data(results: List[Dict[str, Any]]) -> Dict[str, List]: + """Extract profiling timing data from results.""" + data = { + "x_values": [], + "x_labels": [], + "duckdb": [], + "spark": [], + } + + for r in sorted(results, key=lambda x: x.get("rows", 0)): + rows = r.get("rows") + if rows is None: + continue + + data["x_values"].append(rows) + data["x_labels"].append(_format_row_count(rows)) + data["duckdb"].append(r.get("duckdb_profiling")) + data["spark"].append(r.get("spark_profiling")) + + return data + + +def _calculate_speedup(spark_times: List, duckdb_times: List) -> List[Optional[float]]: + """Calculate speedup ratios (Spark time / DuckDB time).""" + speedups = [] + for s, d in zip(spark_times, duckdb_times): + if s is not None and d is not None and d > 0: + speedups.append(s / d) + else: + speedups.append(None) + return speedups + + +def _plot_comparison( + ax: plt.Axes, + x_labels: List[str], + duckdb_times: List, + spark_times: List, + xlabel: str, + ylabel: str, + title: str, 
+ duckdb_color: str, + spark_color: str, + use_log_scale: bool = False, +) -> None: + """Plot a side-by-side bar comparison chart.""" + # Filter out None values + valid_indices = [ + i for i in range(len(x_labels)) + if duckdb_times[i] is not None or spark_times[i] is not None + ] + + if not valid_indices: + ax.text(0.5, 0.5, "No data available", ha="center", va="center", transform=ax.transAxes) + ax.set_title(title, fontsize=12, fontweight="bold") + return + + labels = [x_labels[i] for i in valid_indices] + duckdb = [duckdb_times[i] if duckdb_times[i] is not None else 0 for i in valid_indices] + spark = [spark_times[i] if spark_times[i] is not None else 0 for i in valid_indices] + + x = np.arange(len(labels)) + width = 0.35 + + has_duckdb = any(d > 0 for d in duckdb) + has_spark = any(s > 0 for s in spark) + + if has_duckdb: + bars1 = ax.bar( + x - width / 2, duckdb, width, label="DuckDB", + color=duckdb_color, edgecolor="black", linewidth=0.5 + ) + for bar in bars1: + height = bar.get_height() + if height > 0: + ax.annotate( + f"{height:.2f}s", + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), textcoords="offset points", + ha="center", va="bottom", fontsize=7 + ) + + if has_spark: + bars2 = ax.bar( + x + width / 2, spark, width, label="Spark", + color=spark_color, edgecolor="black", linewidth=0.5 + ) + for bar in bars2: + height = bar.get_height() + if height > 0: + ax.annotate( + f"{height:.1f}s", + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), textcoords="offset points", + ha="center", va="bottom", fontsize=7 + ) + + ax.set_xlabel(xlabel, fontsize=11) + ax.set_ylabel(ylabel, fontsize=11) + ax.set_title(title, fontsize=12, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels(labels) + ax.legend(loc="upper left") + + if use_log_scale and has_duckdb and has_spark: + ax.set_yscale("log") + + +def _plot_speedup( + ax: plt.Axes, + x_labels: List[str], + speedups: List[Optional[float]], + xlabel: str, + title: str, + speedup_color: str, +) -> None: + """Plot a speedup bar chart.""" + valid_indices = [i for i in range(len(x_labels)) if speedups[i] is not None] + + if not valid_indices: + ax.text(0.5, 0.5, "No speedup data\n(need both engines)", ha="center", va="center", transform=ax.transAxes) + ax.set_title(title, fontsize=12, fontweight="bold") + return + + labels = [x_labels[i] for i in valid_indices] + values = [speedups[i] for i in valid_indices] + + x = np.arange(len(labels)) + bars = ax.bar(x, values, color=speedup_color, edgecolor="black", linewidth=0.5) + + ax.axhline(y=1, color="gray", linestyle="--", alpha=0.7) + ax.set_xlabel(xlabel, fontsize=11) + ax.set_ylabel("Speedup (x times faster)", fontsize=11) + ax.set_title(title, fontsize=12, fontweight="bold") + ax.set_xticks(x) + ax.set_xticklabels(labels) + + for bar in bars: + height = bar.get_height() + ax.annotate( + f"{height:.1f}x", + xy=(bar.get_x() + bar.get_width() / 2, height), + xytext=(0, 3), textcoords="offset points", + ha="center", va="bottom", fontsize=10, fontweight="bold" + ) + + +def generate_visualization(run: BenchmarkRun, output_path: str) -> str: + """ + Generate benchmark visualization PNG from results. 
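+
+    The figure is a 2x3 grid: the top row compares absolute DuckDB vs
+    Spark times for the three experiments, and the bottom row shows the
+    corresponding DuckDB speedup ratios.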
+
+    Args:
+        run: BenchmarkRun instance with results
+        output_path: Path to save the PNG file
+
+    Returns:
+        Path to the saved PNG file
+    """
+    # Extract data from results
+    rows_data = _extract_validation_data(run.varying_rows_results, "rows")
+    cols_data = _extract_validation_data(run.varying_cols_results, "cols")
+    profiling_data = _extract_profiling_data(run.profiling_results)
+
+    # Calculate speedups
+    rows_speedup = _calculate_speedup(rows_data["spark"], rows_data["duckdb"])
+    cols_speedup = _calculate_speedup(cols_data["spark"], cols_data["duckdb"])
+    profiling_speedup = _calculate_speedup(profiling_data["spark"], profiling_data["duckdb"])
+
+    # Color scheme
+    duckdb_color = "#FFA500"  # Orange
+    spark_color = "#E25A1C"  # Spark orange/red
+    speedup_color = "#2E86AB"  # Blue
+
+    # Set style and create figure
+    plt.style.use("seaborn-v0_8-whitegrid")
+    fig, axes = plt.subplots(2, 3, figsize=(16, 10))
+
+    # Row 1: Time comparisons
+    _plot_comparison(
+        axes[0, 0], rows_data["x_labels"], rows_data["duckdb"], rows_data["spark"],
+        "Dataset Size (rows)", "Validation Time (seconds)",
+        "Exp 1: Varying Rows (10 cols)",
+        duckdb_color, spark_color, use_log_scale=True
+    )
+
+    _plot_comparison(
+        axes[0, 1], cols_data["x_labels"], cols_data["duckdb"], cols_data["spark"],
+        "Columns (Checks)", "Validation Time (seconds)",
+        "Exp 2: Varying Columns (1M rows)",
+        duckdb_color, spark_color
+    )
+
+    _plot_comparison(
+        axes[0, 2], profiling_data["x_labels"], profiling_data["duckdb"], profiling_data["spark"],
+        "Dataset Size (rows)", "Profiling Time (seconds)",
+        "Exp 3: Column Profiling (10 cols)",
+        duckdb_color, spark_color
+    )
+
+    # Row 2: Speedup charts
+    _plot_speedup(
+        axes[1, 0], rows_data["x_labels"], rows_speedup,
+        "Dataset Size (rows)", "DuckDB Speedup: Varying Rows",
+        speedup_color
+    )
+
+    _plot_speedup(
+        axes[1, 1], cols_data["x_labels"], cols_speedup,
+        "Columns (Checks)", "DuckDB Speedup: Varying Columns",
+        speedup_color
+    )
+
+    _plot_speedup(
+        axes[1, 2], profiling_data["x_labels"], profiling_speedup,
+        "Dataset Size (rows)", "DuckDB Speedup: Column Profiling",
+        speedup_color
+    )
+
+    # Title
+    engine_label = run.engine.upper() if run.engine != "all" else "DuckDB vs Spark"
+    fig.suptitle(
+        f"PyDeequ Benchmark: {engine_label}\n{run.run_id}",
+        fontsize=14, fontweight="bold", y=1.02
+    )
+
+    plt.tight_layout()
+
+    # Save the figure
+    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
+    plt.savefig(output_path, dpi=150, bbox_inches="tight", facecolor="white", edgecolor="none")
+    plt.close(fig)
+
+    return output_path
diff --git a/benchmark/worker.py b/benchmark/worker.py
new file mode 100644
index 0000000..3724731
--- /dev/null
+++ b/benchmark/worker.py
@@ -0,0 +1,189 @@
+#!/usr/bin/env python3
+"""
+Subprocess worker for running benchmarks in isolated process.
+
+This module is designed to be run as:
+    python -m benchmark.worker --config <config.json> --engine {duckdb,spark} --output <results.json>
+
+Each engine runs in a fresh subprocess to ensure clean JVM/Python state.
+""" + +import argparse +import json +import os +import sys +import time +from datetime import datetime +from typing import Optional + +from .config import BenchmarkConfig, ExperimentConfig +from .results import ( + BenchmarkRun, + collect_environment_info, + generate_run_id, +) +from .experiments import ( + run_varying_rows_experiment_duckdb, + run_varying_cols_experiment_duckdb, + run_profiling_experiment_duckdb, + run_varying_rows_experiment_spark, + run_varying_cols_experiment_spark, + run_profiling_experiment_spark, + setup_spark, +) + + +def run_duckdb_worker(config: BenchmarkConfig, run_id: str) -> BenchmarkRun: + """Run all DuckDB experiments.""" + start_time = time.time() + + run = BenchmarkRun( + run_id=run_id, + timestamp=datetime.now().isoformat(), + engine="duckdb", + config=config.to_dict(), + environment=collect_environment_info(), + ) + + exp_config = config.experiment + + try: + run.varying_rows_results = run_varying_rows_experiment_duckdb(exp_config) + except Exception as e: + run.errors.append(f"DuckDB varying rows experiment failed: {e}") + print(f"Error in varying rows experiment: {e}") + + try: + run.varying_cols_results = run_varying_cols_experiment_duckdb(exp_config) + except Exception as e: + run.errors.append(f"DuckDB varying cols experiment failed: {e}") + print(f"Error in varying cols experiment: {e}") + + try: + run.profiling_results = run_profiling_experiment_duckdb(exp_config) + except Exception as e: + run.errors.append(f"DuckDB profiling experiment failed: {e}") + print(f"Error in profiling experiment: {e}") + + run.total_duration_seconds = time.time() - start_time + return run + + +def run_spark_worker(config: BenchmarkConfig, run_id: str) -> BenchmarkRun: + """Run all Spark experiments.""" + start_time = time.time() + + run = BenchmarkRun( + run_id=run_id, + timestamp=datetime.now().isoformat(), + engine="spark", + config=config.to_dict(), + environment=collect_environment_info(), + ) + + exp_config = config.experiment + + # Setup Spark + spark = None + spark_startup_time = 0.0 + + try: + print("\nSetting up Spark Connect...") + spark, spark_startup_time = setup_spark(config.spark_remote) + print(f" Spark Startup: {spark_startup_time:.3f}s") + run.spark_startup_time = spark_startup_time + except Exception as e: + error_msg = f"Spark setup failed: {e}" + run.errors.append(error_msg) + print(f"Error: {error_msg}") + run.total_duration_seconds = time.time() - start_time + return run + + try: + run.varying_rows_results = run_varying_rows_experiment_spark( + spark, spark_startup_time, exp_config + ) + except Exception as e: + run.errors.append(f"Spark varying rows experiment failed: {e}") + print(f"Error in varying rows experiment: {e}") + + try: + run.varying_cols_results = run_varying_cols_experiment_spark( + spark, spark_startup_time, exp_config + ) + except Exception as e: + run.errors.append(f"Spark varying cols experiment failed: {e}") + print(f"Error in varying cols experiment: {e}") + + try: + run.profiling_results = run_profiling_experiment_spark( + spark, spark_startup_time, exp_config + ) + except Exception as e: + run.errors.append(f"Spark profiling experiment failed: {e}") + print(f"Error in profiling experiment: {e}") + + run.total_duration_seconds = time.time() - start_time + return run + + +def main(): + parser = argparse.ArgumentParser(description="Benchmark worker subprocess") + parser.add_argument( + "--config", + required=True, + help="Path to JSON config file", + ) + parser.add_argument( + "--engine", + choices=["duckdb", "spark"], + 
required=True, + help="Engine to benchmark", + ) + parser.add_argument( + "--output", + required=True, + help="Path to write JSON results", + ) + parser.add_argument( + "--run-id", + help="Run ID to use (optional, generated if not provided)", + ) + + args = parser.parse_args() + + # Load config from JSON file + with open(args.config) as f: + config_data = json.load(f) + config = BenchmarkConfig.from_dict(config_data) + + # Use provided run ID or generate new one + run_id = args.run_id or generate_run_id() + + print(f"Benchmark Worker: {args.engine}") + print(f"Run ID: {run_id}") + print("=" * 70) + + # Run the appropriate engine + if args.engine == "duckdb": + result = run_duckdb_worker(config, run_id) + else: + result = run_spark_worker(config, run_id) + + # Write results to output file + os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) + with open(args.output, "w") as f: + json.dump(result.to_dict(), f, indent=2) + + print(f"\nResults written to: {args.output}") + print(f"Total duration: {result.total_duration_seconds:.1f}s") + + if result.errors: + print(f"Errors encountered: {len(result.errors)}") + for error in result.errors: + print(f" - {error}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/benchmark_cli.py b/benchmark_cli.py new file mode 100644 index 0000000..418e401 --- /dev/null +++ b/benchmark_cli.py @@ -0,0 +1,354 @@ +#!/usr/bin/env python3 +""" +PyDeequ Engine Benchmark CLI + +Orchestrates benchmark runs with process isolation and auto Spark server management. + +Usage: + # Run DuckDB only (no Spark server needed) + python benchmark_cli.py run --engine duckdb + + # Run Spark only (auto-spark is enabled by default) + python benchmark_cli.py run --engine spark + + # Run both engines + python benchmark_cli.py run --engine all + + # Run without auto Spark server management (assumes server is running) + python benchmark_cli.py run --engine spark --no-auto-spark + + # Generate report from saved results (folder or file path) + python benchmark_cli.py report benchmark_results/benchmark_2024-01-19T14-30-45/ + python benchmark_cli.py report benchmark_results/benchmark_2024-01-19T14-30-45/results.json + + # Generate report to custom location + python benchmark_cli.py report benchmark_results/benchmark_2024-01-19T14-30-45/ -o MY_RESULTS.md + + # Generate visualization PNG from results + python benchmark_cli.py visualize benchmark_results/benchmark_2024-01-19T14-30-45/ + python benchmark_cli.py visualize benchmark_results/benchmark_2024-01-19T14-30-45/ -o charts.png +""" + +import argparse +import json +import os +import subprocess +import sys +import tempfile +import time +from typing import Optional + +from benchmark.config import BenchmarkConfig, ExperimentConfig, SparkServerConfig +from benchmark.results import ( + BenchmarkRun, + generate_run_id, + save_results, + load_results, + merge_results, + collect_environment_info, +) +from benchmark.spark_server import managed_spark_server +from benchmark.report import save_report +from benchmark.visualize import generate_visualization + + +def run_engine_in_subprocess( + engine: str, + config: BenchmarkConfig, + run_id: str, +) -> Optional[BenchmarkRun]: + """ + Run benchmark for a single engine in an isolated subprocess. 
+ + Args: + engine: Engine to run ("duckdb" or "spark") + config: Benchmark configuration + run_id: Run ID to use + + Returns: + BenchmarkRun result, or None on failure + """ + print(f"\n{'=' * 70}") + print(f"Running {engine.upper()} benchmarks in subprocess...") + print("=" * 70) + + # Write config to temp file + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + json.dump(config.to_dict(), f, indent=2) + config_path = f.name + + # Temp output file for results (will be cleaned up after loading) + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f: + output_path = f.name + + try: + # Run worker subprocess + cmd = [ + sys.executable, + "-m", + "benchmark.worker", + "--config", config_path, + "--engine", engine, + "--output", output_path, + "--run-id", run_id, + ] + + print(f"Command: {' '.join(cmd)}") + print() + + result = subprocess.run( + cmd, + cwd=os.path.dirname(os.path.abspath(__file__)), + ) + + if result.returncode != 0: + print(f"\n{engine.upper()} worker exited with code {result.returncode}") + # Try to load partial results if they exist + if os.path.exists(output_path): + return load_results(output_path) + return None + + # Load and return results + return load_results(output_path) + + except Exception as e: + print(f"Error running {engine} worker: {e}") + return None + + finally: + # Clean up temp files + if os.path.exists(config_path): + os.unlink(config_path) + if os.path.exists(output_path): + os.unlink(output_path) + + +def cmd_run(args: argparse.Namespace) -> int: + """Execute the 'run' subcommand.""" + # Build configuration + exp_config = ExperimentConfig() + if args.n_runs: + exp_config.n_runs = args.n_runs + + spark_config = SparkServerConfig() + if args.spark_home: + spark_config.spark_home = args.spark_home + if args.java_home: + spark_config.java_home = args.java_home + + config = BenchmarkConfig( + engine=args.engine, + output_dir=args.output_dir, + experiment=exp_config, + spark_server=spark_config, + ) + + # Generate run ID and create run directory + run_id = generate_run_id() + run_dir = os.path.join(args.output_dir, run_id) + os.makedirs(run_dir, exist_ok=True) + + auto_spark = not args.no_auto_spark + + print("PyDeequ Engine Benchmark") + print("=" * 70) + print(f"Run ID: {run_id}") + print(f"Engine: {args.engine}") + print(f"Output directory: {run_dir}") + print(f"Auto Spark: {auto_spark}") + print(f"Validation runs: {exp_config.n_runs}") + print(f"Row counts: {exp_config.row_counts}") + print(f"Column counts: {exp_config.column_counts}") + print(f"Cache directory: {exp_config.cache_dir}") + + start_time = time.time() + + run_duckdb = args.engine in ("all", "duckdb") + run_spark = args.engine in ("all", "spark") + + duckdb_result: Optional[BenchmarkRun] = None + spark_result: Optional[BenchmarkRun] = None + + # Run DuckDB (doesn't need Spark server) + if run_duckdb: + duckdb_result = run_engine_in_subprocess("duckdb", config, run_id) + + # Run Spark (may need server management) + if run_spark: + if auto_spark: + # Use managed server context + with managed_spark_server(spark_config) as server: + startup_time = server.start() + if server.is_running(): + spark_result = run_engine_in_subprocess("spark", config, run_id) + else: + print("Spark server failed to start, skipping Spark benchmarks") + else: + # Assume server is already running + spark_result = run_engine_in_subprocess("spark", config, run_id) + + # Merge results if both engines ran + if duckdb_result and spark_result: + final_result = 
merge_results(duckdb_result, spark_result) + elif duckdb_result: + final_result = duckdb_result + elif spark_result: + final_result = spark_result + else: + print("\nNo benchmark results produced!") + return 1 + + # Update total duration + final_result.total_duration_seconds = time.time() - start_time + + # Save combined results to run directory + results_path = save_results(final_result, run_dir) + print(f"\n{'=' * 70}") + print(f"Results saved to: {results_path}") + + # Generate markdown report in run directory + report_path = os.path.join(run_dir, "BENCHMARK_RESULTS.md") + save_report(final_result, report_path) + print(f"Report saved to: {report_path}") + + print(f"Total duration: {final_result.total_duration_seconds:.1f}s") + + if final_result.errors: + print(f"\nErrors encountered: {len(final_result.errors)}") + for error in final_result.errors: + print(f" - {error}") + return 1 + + return 0 + + +def cmd_report(args: argparse.Namespace) -> int: + """Execute the 'report' subcommand.""" + if not os.path.exists(args.json_file): + print(f"Error: File not found: {args.json_file}") + return 1 + + try: + run = load_results(args.json_file) + except Exception as e: + print(f"Error loading results: {e}") + return 1 + + output_path = args.output or "BENCHMARK_RESULTS.md" + save_report(run, output_path) + print(f"Report generated: {output_path}") + + return 0 + + +def cmd_visualize(args: argparse.Namespace) -> int: + """Execute the 'visualize' subcommand.""" + if not os.path.exists(args.results_path): + print(f"Error: Path not found: {args.results_path}") + return 1 + + try: + run = load_results(args.results_path) + except Exception as e: + print(f"Error loading results: {e}") + return 1 + + # Determine output path + if args.output: + output_path = args.output + else: + # Default: save in the same directory as results + if os.path.isdir(args.results_path): + output_path = os.path.join(args.results_path, "benchmark_chart.png") + else: + output_path = os.path.join(os.path.dirname(args.results_path), "benchmark_chart.png") + + try: + generate_visualization(run, output_path) + print(f"Visualization saved to: {output_path}") + except Exception as e: + print(f"Error generating visualization: {e}") + return 1 + + return 0 + + +def main(): + parser = argparse.ArgumentParser( + description="PyDeequ Engine Benchmark CLI", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # 'run' subcommand + run_parser = subparsers.add_parser("run", help="Run benchmark experiments") + run_parser.add_argument( + "--engine", + choices=["all", "duckdb", "spark"], + default="all", + help="Engine to benchmark (default: all)", + ) + run_parser.add_argument( + "--output-dir", + default="benchmark_results", + help="Output directory for results (default: benchmark_results/)", + ) + run_parser.add_argument( + "--no-auto-spark", + action="store_true", + dest="no_auto_spark", + help="Disable automatic Spark Connect server management (assumes server is already running)", + ) + run_parser.add_argument( + "--spark-home", + help="Path to Spark installation", + ) + run_parser.add_argument( + "--java-home", + help="Path to Java installation", + ) + run_parser.add_argument( + "--n-runs", + type=int, + help="Number of validation runs for averaging", + ) + + # 'report' subcommand + report_parser = subparsers.add_parser("report", help="Generate markdown report from JSON results") + report_parser.add_argument( + "json_file", + 
help="Path to JSON results file or run directory containing results.json", + ) + report_parser.add_argument( + "-o", "--output", + help="Output path for markdown report (default: BENCHMARK_RESULTS.md)", + ) + + # 'visualize' subcommand + visualize_parser = subparsers.add_parser("visualize", help="Generate PNG visualization from results") + visualize_parser.add_argument( + "results_path", + help="Path to JSON results file or run directory containing results.json", + ) + visualize_parser.add_argument( + "-o", "--output", + help="Output path for PNG file (default: benchmark_chart.png in results directory)", + ) + + args = parser.parse_args() + + if args.command == "run": + sys.exit(cmd_run(args)) + elif args.command == "report": + sys.exit(cmd_report(args)) + elif args.command == "visualize": + sys.exit(cmd_visualize(args)) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..6304e20 --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,214 @@ +# PyDeequ v2 Architecture + +## Overview + +PyDeequ v2 introduces a multi-engine architecture enabling data quality checks on different backends. The code is the source of truth - this document provides a high-level map to help you navigate the codebase. + +**Supported backends:** +- **DuckDB**: Local development, small-medium datasets (`pip install duckdb`) +- **Spark Connect**: Large-scale distributed processing (requires Spark cluster) + +## Design Philosophy + +The architecture is inspired by [DuckDQ](https://github.com/tdoehmen/duckdq), which demonstrated a key insight: + +> **Decouple state computation (engine-dependent) from state merging (engine-independent)** + +- **State computation** = expensive, engine-dependent (SQL queries, Spark jobs) +- **State merging** = cheap, pure Python (addition, max/min, Welford's algorithm) + +This separation enables multiple backends, incremental validation, and distributed processing. + +## Architecture Diagram + +``` + ┌──────────────────────────────────────┐ + │ User API │ + │ VerificationSuite, AnalysisRunner │ + │ ColumnProfilerRunner, Suggestions │ + └─────────────────┬────────────────────┘ + │ + ┌─────────────────▼────────────────────┐ + │ Engine Abstraction │ + │ BaseEngine ABC │ + │ compute_metrics(), run_checks() │ + │ profile_columns(), suggest_...() │ + └─────────────────┬────────────────────┘ + │ + ┌───────────────────────┼───────────────────────┐ + │ │ │ + ┌─────────▼─────────┐ ┌─────────▼─────────┐ ┌─────────▼─────────┐ + │ DuckDBEngine │ │ SparkEngine │ │ Future Engines │ + │ (Direct SQL) │ │ (Spark Connect) │ │ (Polars, etc.) 
│ + └───────────────────┘ └───────────────────┘ └───────────────────┘ +``` + +## Module Structure + +``` +pydeequ/ +├── __init__.py # connect() with auto-detection +├── engines/ +│ ├── __init__.py # BaseEngine ABC, result types +│ ├── duckdb.py # DuckDBEngine implementation +│ ├── spark.py # SparkEngine wrapper +│ ├── operators/ +│ │ ├── base.py # ScanOperator, GroupingOperator ABCs +│ │ ├── factory.py # OperatorFactory registry +│ │ ├── mixins.py # WhereClauseMixin, SafeExtractMixin +│ │ ├── scan_operators.py # 15 single-pass operators +│ │ ├── grouping_operators.py # 6 GROUP BY operators +│ │ ├── metadata_operators.py # Schema-based operators +│ │ └── profiling_operators.py # Column profiling operators +│ ├── constraints/ +│ │ ├── base.py # BaseEvaluator hierarchy +│ │ ├── factory.py # ConstraintEvaluatorFactory (27 types) +│ │ └── evaluators.py # 23 concrete evaluators +│ └── suggestions/ +│ ├── runner.py # Suggestion generation +│ ├── rules.py # Rule implementations +│ └── registry.py # Rule registry +└── v2/ # User-facing API + ├── analyzers.py # Analyzer definitions + ├── checks.py # Check/Constraint definitions + ├── predicates.py # Predicate classes + ├── verification.py # VerificationSuite, AnalysisRunner + ├── profiles.py # ColumnProfilerRunner + └── suggestions.py # ConstraintSuggestionRunner +``` + +## Key Abstractions + +### BaseEngine (`pydeequ/engines/__init__.py`) + +Abstract base class defining the engine interface. All engines implement: +- `compute_metrics(analyzers)` - Run analyzers and return `MetricResult` list +- `run_checks(checks)` - Evaluate constraints and return `ConstraintResult` list +- `profile_columns(columns)` - Return `ColumnProfile` for each column +- `suggest_constraints(rules)` - Generate `ConstraintSuggestion` list + +### Operators (`pydeequ/engines/operators/`) + +Operators translate analyzers into engine-specific queries: + +| Type | Description | Examples | +|------|-------------|----------| +| **ScanOperator** | Single-pass SQL aggregations, batched together | Size, Completeness, Mean, Sum, Min, Max | +| **GroupingOperator** | Requires GROUP BY, runs individually | Distinctness, Uniqueness, Entropy | +| **MetadataOperator** | Schema-based, no query needed | DataType | + +See `base.py` for ABCs, `factory.py` for the registry pattern. + +### OperatorFactory (`pydeequ/engines/operators/factory.py`) + +Registry mapping analyzer names to operator classes. Use `OperatorFactory.create(analyzer)` to instantiate operators. The factory determines query batching strategy. + +### Constraint Evaluators (`pydeequ/engines/constraints/`) + +Evaluators check if computed metrics satisfy constraints: +- **AnalyzerBasedEvaluator**: Delegates to an analyzer operator (hasMean, hasMin) +- **RatioCheckEvaluator**: Computes matches/total ratio (isPositive, isContainedIn) + +The `ConstraintEvaluatorFactory` maps 27 constraint types to evaluator classes. + +### Result Types (`pydeequ/engines/__init__.py`) + +Standardized dataclasses returned by all engines: +- `MetricResult`: Analyzer output (name, column, value, success) +- `ConstraintResult`: Check output (constraint, status, message) +- `ColumnProfile`: Profiling output (column, stats, histogram) + +All convert to pandas DataFrames via `results_to_dataframe()`. 
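+
+To make the state computation / state merging split from the Design Philosophy concrete, here is a minimal, hypothetical sketch of a mergeable state (the real state classes live in the engine implementations and differ in detail): computing a state requires scanning data, while merging two states is cheap, pure Python.
+
+```python
+from dataclasses import dataclass
+
+
+@dataclass
+class MeanState:
+    """Hypothetical mergeable state for a Mean metric: a running (sum, count)."""
+
+    total: float
+    count: int
+
+    def merge(self, other: "MeanState") -> "MeanState":
+        # Engine-independent: merging is plain addition, no data access.
+        return MeanState(self.total + other.total, self.count + other.count)
+
+    @property
+    def value(self) -> float:
+        return self.total / self.count if self.count else float("nan")
+
+
+# States computed per engine/partition (the expensive part) merge cheaply:
+merged = MeanState(total=10.0, count=4).merge(MeanState(total=26.0, count=8))
+assert merged.value == 3.0
+```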
+ +## Quick Start Examples + +### Analysis + +```python +import duckdb +import pydeequ +from pydeequ.v2.analyzers import Size, Completeness, Mean +from pydeequ.v2.verification import AnalysisRunner + +con = duckdb.connect() +con.execute("CREATE TABLE sales AS SELECT * FROM 'sales.parquet'") +engine = pydeequ.connect(con, table="sales") + +result = (AnalysisRunner() + .on_engine(engine) + .addAnalyzer(Size()) + .addAnalyzer(Completeness("customer_id")) + .addAnalyzer(Mean("amount")) + .run()) +``` + +### Verification + +```python +from pydeequ.v2.checks import Check, CheckLevel +from pydeequ.v2.verification import VerificationSuite +from pydeequ.v2.predicates import gte + +result = (VerificationSuite() + .on_engine(engine) + .addCheck( + Check(CheckLevel.Error, "Data Quality") + .isComplete("id") + .hasCompleteness("email", gte(0.95)) + .isUnique("transaction_id") + ) + .run()) +``` + +### Profiling + +```python +from pydeequ.v2.profiles import ColumnProfilerRunner + +profiles = (ColumnProfilerRunner() + .on_engine(engine) + .run()) +``` + +### Suggestions + +```python +from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules + +suggestions = (ConstraintSuggestionRunner() + .on_engine(engine) + .addConstraintRules(Rules.DEFAULT) + .run()) +``` + +## Engine Comparison + +| Aspect | DuckDB | Spark | +|--------|--------|-------| +| **Use case** | Local dev, CI/CD, files < 10GB | Distributed data, data lakes | +| **Setup** | `pip install duckdb` | Spark cluster + Deequ plugin | +| **Latency** | Low (in-process) | Higher (network overhead) | +| **Scaling** | Single-node, memory-bound | Distributed, scales horizontally | +| **Approximate metrics** | HyperLogLog, exact quantiles | HLL++, KLL sketches | + +Both engines aim for functional parity. Minor differences exist in approximate algorithms and histogram formats - see test suite for tolerances. + +## Benchmarks + +Performance comparisons between DuckDB and Spark engines are documented in [BENCHMARK.md](../BENCHMARK.md), including: +- Varying row counts (100K to 130M rows) +- Varying column counts (10 to 80 columns) +- Column profiling performance + +## Future Enhancements + +- State persistence for incremental validation +- Additional backends (Polars, SQLAlchemy, BigQuery) +- Anomaly detection on metrics +- Data lineage for constraint violations + +## References + +- [DuckDQ](https://github.com/tdoehmen/duckdq) - Inspiration for engine abstraction +- [AWS Deequ](https://github.com/awslabs/deequ) - Original Scala implementation +- [Ibis](https://ibis-project.org/) - Multi-backend design patterns diff --git a/imgs/benchmark_chart.png b/imgs/benchmark_chart.png new file mode 100644 index 0000000..a6dc1ff Binary files /dev/null and b/imgs/benchmark_chart.png differ diff --git a/poetry.lock b/poetry.lock index 5b439ef..a9272e1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "black" @@ -6,6 +6,7 @@ version = "24.10.0" description = "The uncompromising code formatter." 
optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "black-24.10.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e6668650ea4b685440857138e5fe40cde4d652633b1bdffc62933d0db4ed9812"}, {file = "black-24.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1c536fcf674217e87b8cc3657b81809d3c085d7bf3ef262ead700da345bfa6ea"}, @@ -52,6 +53,7 @@ version = "3.4.0" description = "Validate configuration and produce human readable error messages." optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, @@ -63,6 +65,7 @@ version = "8.1.8" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2"}, {file = "click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a"}, @@ -77,17 +80,277 @@ version = "0.4.6" description = "Cross-platform colored terminal text." optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev"] +markers = "sys_platform == \"win32\" or platform_system == \"Windows\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "contourpy" +version = "1.3.0" +description = "Python library for calculating contours of 2D quadrilateral grids" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +markers = "python_version == \"3.9\"" +files = [ + {file = "contourpy-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:880ea32e5c774634f9fcd46504bf9f080a41ad855f4fef54f5380f5133d343c7"}, + {file = "contourpy-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:76c905ef940a4474a6289c71d53122a4f77766eef23c03cd57016ce19d0f7b42"}, + {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:92f8557cbb07415a4d6fa191f20fd9d2d9eb9c0b61d1b2f52a8926e43c6e9af7"}, + {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36f965570cff02b874773c49bfe85562b47030805d7d8360748f3eca570f4cab"}, + {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cacd81e2d4b6f89c9f8a5b69b86490152ff39afc58a95af002a398273e5ce589"}, + {file = "contourpy-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:69375194457ad0fad3a839b9e29aa0b0ed53bb54db1bfb6c3ae43d111c31ce41"}, + {file = "contourpy-1.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a52040312b1a858b5e31ef28c2e865376a386c60c0e248370bbea2d3f3b760d"}, + {file = "contourpy-1.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3faeb2998e4fcb256542e8a926d08da08977f7f5e62cf733f3c211c2a5586223"}, + {file = "contourpy-1.3.0-cp310-cp310-win32.whl", hash = "sha256:36e0cff201bcb17a0a8ecc7f454fe078437fa6bda730e695a92f2d9932bd507f"}, + {file = "contourpy-1.3.0-cp310-cp310-win_amd64.whl", hash = 
"sha256:87ddffef1dbe5e669b5c2440b643d3fdd8622a348fe1983fad7a0f0ccb1cd67b"}, + {file = "contourpy-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0fa4c02abe6c446ba70d96ece336e621efa4aecae43eaa9b030ae5fb92b309ad"}, + {file = "contourpy-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:834e0cfe17ba12f79963861e0f908556b2cedd52e1f75e6578801febcc6a9f49"}, + {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dbc4c3217eee163fa3984fd1567632b48d6dfd29216da3ded3d7b844a8014a66"}, + {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4865cd1d419e0c7a7bf6de1777b185eebdc51470800a9f42b9e9decf17762081"}, + {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:303c252947ab4b14c08afeb52375b26781ccd6a5ccd81abcdfc1fafd14cf93c1"}, + {file = "contourpy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:637f674226be46f6ba372fd29d9523dd977a291f66ab2a74fbeb5530bb3f445d"}, + {file = "contourpy-1.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:76a896b2f195b57db25d6b44e7e03f221d32fe318d03ede41f8b4d9ba1bff53c"}, + {file = "contourpy-1.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e1fd23e9d01591bab45546c089ae89d926917a66dceb3abcf01f6105d927e2cb"}, + {file = "contourpy-1.3.0-cp311-cp311-win32.whl", hash = "sha256:d402880b84df3bec6eab53cd0cf802cae6a2ef9537e70cf75e91618a3801c20c"}, + {file = "contourpy-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:6cb6cc968059db9c62cb35fbf70248f40994dfcd7aa10444bbf8b3faeb7c2d67"}, + {file = "contourpy-1.3.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:570ef7cf892f0afbe5b2ee410c507ce12e15a5fa91017a0009f79f7d93a1268f"}, + {file = "contourpy-1.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:da84c537cb8b97d153e9fb208c221c45605f73147bd4cadd23bdae915042aad6"}, + {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0be4d8425bfa755e0fd76ee1e019636ccc7c29f77a7c86b4328a9eb6a26d0639"}, + {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c0da700bf58f6e0b65312d0a5e695179a71d0163957fa381bb3c1f72972537c"}, + {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eb8b141bb00fa977d9122636b16aa67d37fd40a3d8b52dd837e536d64b9a4d06"}, + {file = "contourpy-1.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3634b5385c6716c258d0419c46d05c8aa7dc8cb70326c9a4fb66b69ad2b52e09"}, + {file = "contourpy-1.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:0dce35502151b6bd35027ac39ba6e5a44be13a68f55735c3612c568cac3805fd"}, + {file = "contourpy-1.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aea348f053c645100612b333adc5983d87be69acdc6d77d3169c090d3b01dc35"}, + {file = "contourpy-1.3.0-cp312-cp312-win32.whl", hash = "sha256:90f73a5116ad1ba7174341ef3ea5c3150ddf20b024b98fb0c3b29034752c8aeb"}, + {file = "contourpy-1.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:b11b39aea6be6764f84360fce6c82211a9db32a7c7de8fa6dd5397cf1d079c3b"}, + {file = "contourpy-1.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3e1c7fa44aaae40a2247e2e8e0627f4bea3dd257014764aa644f319a5f8600e3"}, + {file = "contourpy-1.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:364174c2a76057feef647c802652f00953b575723062560498dc7930fc9b1cb7"}, + {file = 
"contourpy-1.3.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32b238b3b3b649e09ce9aaf51f0c261d38644bdfa35cbaf7b263457850957a84"}, + {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d51fca85f9f7ad0b65b4b9fe800406d0d77017d7270d31ec3fb1cc07358fdea0"}, + {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:732896af21716b29ab3e988d4ce14bc5133733b85956316fb0c56355f398099b"}, + {file = "contourpy-1.3.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d73f659398a0904e125280836ae6f88ba9b178b2fed6884f3b1f95b989d2c8da"}, + {file = "contourpy-1.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c6c7c2408b7048082932cf4e641fa3b8ca848259212f51c8c59c45aa7ac18f14"}, + {file = "contourpy-1.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f317576606de89da6b7e0861cf6061f6146ead3528acabff9236458a6ba467f8"}, + {file = "contourpy-1.3.0-cp313-cp313-win32.whl", hash = "sha256:31cd3a85dbdf1fc002280c65caa7e2b5f65e4a973fcdf70dd2fdcb9868069294"}, + {file = "contourpy-1.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:4553c421929ec95fb07b3aaca0fae668b2eb5a5203d1217ca7c34c063c53d087"}, + {file = "contourpy-1.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:345af746d7766821d05d72cb8f3845dfd08dd137101a2cb9b24de277d716def8"}, + {file = "contourpy-1.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3bb3808858a9dc68f6f03d319acd5f1b8a337e6cdda197f02f4b8ff67ad2057b"}, + {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:420d39daa61aab1221567b42eecb01112908b2cab7f1b4106a52caaec8d36973"}, + {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4d63ee447261e963af02642ffcb864e5a2ee4cbfd78080657a9880b8b1868e18"}, + {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:167d6c890815e1dac9536dca00828b445d5d0df4d6a8c6adb4a7ec3166812fa8"}, + {file = "contourpy-1.3.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:710a26b3dc80c0e4febf04555de66f5fd17e9cf7170a7b08000601a10570bda6"}, + {file = "contourpy-1.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:75ee7cb1a14c617f34a51d11fa7524173e56551646828353c4af859c56b766e2"}, + {file = "contourpy-1.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:33c92cdae89ec5135d036e7218e69b0bb2851206077251f04a6c4e0e21f03927"}, + {file = "contourpy-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a11077e395f67ffc2c44ec2418cfebed032cd6da3022a94fc227b6faf8e2acb8"}, + {file = "contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e8134301d7e204c88ed7ab50028ba06c683000040ede1d617298611f9dc6240c"}, + {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e12968fdfd5bb45ffdf6192a590bd8ddd3ba9e58360b29683c6bb71a7b41edca"}, + {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fd2a0fc506eccaaa7595b7e1418951f213cf8255be2600f1ea1b61e46a60c55f"}, + {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4cfb5c62ce023dfc410d6059c936dcf96442ba40814aefbfa575425a3a7f19dc"}, + {file = "contourpy-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:68a32389b06b82c2fdd68276148d7b9275b5f5cf13e5417e4252f6d1a34f72a2"}, + {file = 
"contourpy-1.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:94e848a6b83da10898cbf1311a815f770acc9b6a3f2d646f330d57eb4e87592e"}, + {file = "contourpy-1.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:d78ab28a03c854a873787a0a42254a0ccb3cb133c672f645c9f9c8f3ae9d0800"}, + {file = "contourpy-1.3.0-cp39-cp39-win32.whl", hash = "sha256:81cb5ed4952aae6014bc9d0421dec7c5835c9c8c31cdf51910b708f548cf58e5"}, + {file = "contourpy-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:14e262f67bd7e6eb6880bc564dcda30b15e351a594657e55b7eec94b6ef72843"}, + {file = "contourpy-1.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fe41b41505a5a33aeaed2a613dccaeaa74e0e3ead6dd6fd3a118fb471644fd6c"}, + {file = "contourpy-1.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eca7e17a65f72a5133bdbec9ecf22401c62bcf4821361ef7811faee695799779"}, + {file = "contourpy-1.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:1ec4dc6bf570f5b22ed0d7efba0dfa9c5b9e0431aeea7581aa217542d9e809a4"}, + {file = "contourpy-1.3.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:00ccd0dbaad6d804ab259820fa7cb0b8036bda0686ef844d24125d8287178ce0"}, + {file = "contourpy-1.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ca947601224119117f7c19c9cdf6b3ab54c5726ef1d906aa4a69dfb6dd58102"}, + {file = "contourpy-1.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:c6ec93afeb848a0845a18989da3beca3eec2c0f852322efe21af1931147d12cb"}, + {file = "contourpy-1.3.0.tar.gz", hash = "sha256:7ffa0db17717a8ffb127efd0c95a4362d996b892c2904db72428d5b52e1938a4"}, +] + +[package.dependencies] +numpy = ">=1.23" + +[package.extras] +bokeh = ["bokeh", "selenium"] +docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"] +mypy = ["contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.11.1)", "types-Pillow"] +test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] +test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] + +[[package]] +name = "contourpy" +version = "1.3.2" +description = "Python library for calculating contours of 2D quadrilateral grids" +optional = false +python-versions = ">=3.10" +groups = ["main", "dev"] +markers = "python_version == \"3.10\"" +files = [ + {file = "contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934"}, + {file = "contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9be002b31c558d1ddf1b9b415b162c603405414bacd6932d031c5b5a8b757f0d"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d2e74acbcba3bfdb6d9d8384cdc4f9260cae86ed9beee8bd5f54fee49a430b9"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e259bced5549ac64410162adc973c5e2fb77f04df4a439d00b478e57a0e65512"}, + {file = "contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad687a04bc802cbe8b9c399c07162a3c35e227e2daccf1668eb1f278cb698631"}, + {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cdd22595308f53ef2f891040ab2b93d79192513ffccbd7fe19be7aa773a5e09f"}, + {file = "contourpy-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = 
"sha256:b4f54d6a2defe9f257327b0f243612dd051cc43825587520b1bf74a31e2f6ef2"}, + {file = "contourpy-1.3.2-cp310-cp310-win32.whl", hash = "sha256:f939a054192ddc596e031e50bb13b657ce318cf13d264f095ce9db7dc6ae81c0"}, + {file = "contourpy-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c440093bbc8fc21c637c03bafcbef95ccd963bc6e0514ad887932c18ca2a759a"}, + {file = "contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445"}, + {file = "contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab"}, + {file = "contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7"}, + {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83"}, + {file = "contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd"}, + {file = "contourpy-1.3.2-cp311-cp311-win32.whl", hash = "sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f"}, + {file = "contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878"}, + {file = "contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2"}, + {file = "contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d14f12932a8d620e307f715857107b1d1845cc44fdb5da2bc8e850f5ceba9f87"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:532fd26e715560721bb0d5fc7610fce279b3699b018600ab999d1be895b09415"}, + {file = "contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b383144cf2d2c29f01a1e8170f50dacf0eac02d64139dcd709a8ac4eb3cfe"}, + {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c49f73e61f1f774650a55d221803b101d966ca0c5a2d6d5e4320ec3997489441"}, + {file = "contourpy-1.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3d80b2c0300583228ac98d0a927a1ba6a2ba6b8a742463c564f1d419ee5b211e"}, + {file = "contourpy-1.3.2-cp312-cp312-win32.whl", hash = "sha256:90df94c89a91b7362e1142cbee7568f86514412ab8a2c0d0fca72d7e91b62912"}, + {file = "contourpy-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:8c942a01d9163e2e5cfb05cb66110121b8d07ad438a17f9e766317bcb62abf73"}, + {file = "contourpy-1.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:de39db2604ae755316cb5967728f4bea92685884b1e767b7c24e983ef5f771cb"}, + {file = "contourpy-1.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f9e896f447c5c8618f1edb2bafa9a4030f22a575ec418ad70611450720b5b08"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71e2bd4a1c4188f5c2b8d274da78faab884b59df20df63c34f74aa1813c4427c"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de425af81b6cea33101ae95ece1f696af39446db9682a0b56daaa48cfc29f38f"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:977e98a0e0480d3fe292246417239d2d45435904afd6d7332d8455981c408b85"}, + {file = "contourpy-1.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:434f0adf84911c924519d2b08fc10491dd282b20bdd3fa8f60fd816ea0b48841"}, + {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c66c4906cdbc50e9cba65978823e6e00b45682eb09adbb78c9775b74eb222422"}, + {file = "contourpy-1.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b7fc0cd78ba2f4695fd0a6ad81a19e7e3ab825c31b577f384aa9d7817dc3bef"}, + {file = "contourpy-1.3.2-cp313-cp313-win32.whl", hash = "sha256:15ce6ab60957ca74cff444fe66d9045c1fd3e92c8936894ebd1f3eef2fff075f"}, + {file = "contourpy-1.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e1578f7eafce927b168752ed7e22646dad6cd9bca673c60bff55889fa236ebf9"}, + {file = "contourpy-1.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0475b1f6604896bc7c53bb070e355e9321e1bc0d381735421a2d2068ec56531f"}, + {file = "contourpy-1.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c85bb486e9be652314bb5b9e2e3b0d1b2e643d5eec4992c0fbe8ac71775da739"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:745b57db7758f3ffc05a10254edd3182a2a83402a89c00957a8e8a22f5582823"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:970e9173dbd7eba9b4e01aab19215a48ee5dd3f43cef736eebde064a171f89a5"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6c4639a9c22230276b7bffb6a850dfc8258a2521305e1faefe804d006b2e532"}, + {file = "contourpy-1.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc829960f34ba36aad4302e78eabf3ef16a3a100863f0d4eeddf30e8a485a03b"}, + {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d32530b534e986374fc19eaa77fcb87e8a99e5431499949b828312bdcd20ac52"}, + {file = "contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd"}, + {file = "contourpy-1.3.2-cp313-cp313t-win32.whl", hash = "sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1"}, + {file = "contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fd93cc7f3139b6dd7aab2f26a90dde0aa9fc264dbf70f6740d498a70b860b82c"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:107ba8a6a7eec58bb475329e6d3b95deba9440667c4d62b9b6063942b61d7f16"}, + {file = "contourpy-1.3.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ded1706ed0c1049224531b81128efbd5084598f18d8a2d9efae833edbd2b40ad"}, + {file = 
"contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5"}, + {file = "contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5"}, + {file = "contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54"}, +] + +[package.dependencies] +numpy = ">=1.23" + +[package.extras] +bokeh = ["bokeh", "selenium"] +docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"] +mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.15.0)", "types-Pillow"] +test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] +test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] + +[[package]] +name = "contourpy" +version = "1.3.3" +description = "Python library for calculating contours of 2D quadrilateral grids" +optional = false +python-versions = ">=3.11" +groups = ["main", "dev"] +markers = "python_version >= \"3.11\"" +files = [ + {file = "contourpy-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:709a48ef9a690e1343202916450bc48b9e51c049b089c7f79a267b46cffcdaa1"}, + {file = "contourpy-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:23416f38bfd74d5d28ab8429cc4d63fa67d5068bd711a85edb1c3fb0c3e2f381"}, + {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:929ddf8c4c7f348e4c0a5a3a714b5c8542ffaa8c22954862a46ca1813b667ee7"}, + {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9e999574eddae35f1312c2b4b717b7885d4edd6cb46700e04f7f02db454e67c1"}, + {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf67e0e3f482cb69779dd3061b534eb35ac9b17f163d851e2a547d56dba0a3a"}, + {file = "contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51e79c1f7470158e838808d4a996fa9bac72c498e93d8ebe5119bc1e6becb0db"}, + {file = "contourpy-1.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:598c3aaece21c503615fd59c92a3598b428b2f01bfb4b8ca9c4edeecc2438620"}, + {file = "contourpy-1.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:322ab1c99b008dad206d406bb61d014cf0174df491ae9d9d0fac6a6fda4f977f"}, + {file = "contourpy-1.3.3-cp311-cp311-win32.whl", hash = "sha256:fd907ae12cd483cd83e414b12941c632a969171bf90fc937d0c9f268a31cafff"}, + {file = "contourpy-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:3519428f6be58431c56581f1694ba8e50626f2dd550af225f82fb5f5814d2a42"}, + {file = "contourpy-1.3.3-cp311-cp311-win_arm64.whl", hash = "sha256:15ff10bfada4bf92ec8b31c62bf7c1834c244019b4a33095a68000d7075df470"}, + {file = "contourpy-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b08a32ea2f8e42cf1d4be3169a98dd4be32bafe4f22b6c4cb4ba810fa9e5d2cb"}, + {file = "contourpy-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:556dba8fb6f5d8742f2923fe9457dbdd51e1049c4a43fd3986a0b14a1d815fc6"}, + {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92d9abc807cf7d0e047b95ca5d957cf4792fcd04e920ca70d48add15c1a90ea7"}, + {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:b2e8faa0ed68cb29af51edd8e24798bb661eac3bd9f65420c1887b6ca89987c8"}, + {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:626d60935cf668e70a5ce6ff184fd713e9683fb458898e4249b63be9e28286ea"}, + {file = "contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d00e655fcef08aba35ec9610536bfe90267d7ab5ba944f7032549c55a146da1"}, + {file = "contourpy-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:451e71b5a7d597379ef572de31eeb909a87246974d960049a9848c3bc6c41bf7"}, + {file = "contourpy-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:459c1f020cd59fcfe6650180678a9993932d80d44ccde1fa1868977438f0b411"}, + {file = "contourpy-1.3.3-cp312-cp312-win32.whl", hash = "sha256:023b44101dfe49d7d53932be418477dba359649246075c996866106da069af69"}, + {file = "contourpy-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:8153b8bfc11e1e4d75bcb0bff1db232f9e10b274e0929de9d608027e0d34ff8b"}, + {file = "contourpy-1.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:07ce5ed73ecdc4a03ffe3e1b3e3c1166db35ae7584be76f65dbbe28a7791b0cc"}, + {file = "contourpy-1.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:177fb367556747a686509d6fef71d221a4b198a3905fe824430e5ea0fda54eb5"}, + {file = "contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d002b6f00d73d69333dac9d0b8d5e84d9724ff9ef044fd63c5986e62b7c9e1b1"}, + {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:348ac1f5d4f1d66d3322420f01d42e43122f43616e0f194fc1c9f5d830c5b286"}, + {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:655456777ff65c2c548b7c454af9c6f33f16c8884f11083244b5819cc214f1b5"}, + {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:644a6853d15b2512d67881586bd03f462c7ab755db95f16f14d7e238f2852c67"}, + {file = "contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4debd64f124ca62069f313a9cb86656ff087786016d76927ae2cf37846b006c9"}, + {file = "contourpy-1.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a15459b0f4615b00bbd1e91f1b9e19b7e63aea7483d03d804186f278c0af2659"}, + {file = "contourpy-1.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca0fdcd73925568ca027e0b17ab07aad764be4706d0a925b89227e447d9737b7"}, + {file = "contourpy-1.3.3-cp313-cp313-win32.whl", hash = "sha256:b20c7c9a3bf701366556e1b1984ed2d0cedf999903c51311417cf5f591d8c78d"}, + {file = "contourpy-1.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:1cadd8b8969f060ba45ed7c1b714fe69185812ab43bd6b86a9123fe8f99c3263"}, + {file = "contourpy-1.3.3-cp313-cp313-win_arm64.whl", hash = "sha256:fd914713266421b7536de2bfa8181aa8c699432b6763a0ea64195ebe28bff6a9"}, + {file = "contourpy-1.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:88df9880d507169449d434c293467418b9f6cbe82edd19284aa0409e7fdb933d"}, + {file = "contourpy-1.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d06bb1f751ba5d417047db62bca3c8fde202b8c11fb50742ab3ab962c81e8216"}, + {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e4e6b05a45525357e382909a4c1600444e2a45b4795163d3b22669285591c1ae"}, + {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ab3074b48c4e2cf1a960e6bbeb7f04566bf36b1861d5c9d4d8ac04b82e38ba20"}, + {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", 
hash = "sha256:6c3d53c796f8647d6deb1abe867daeb66dcc8a97e8455efa729516b997b8ed99"}, + {file = "contourpy-1.3.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50ed930df7289ff2a8d7afeb9603f8289e5704755c7e5c3bbd929c90c817164b"}, + {file = "contourpy-1.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4feffb6537d64b84877da813a5c30f1422ea5739566abf0bd18065ac040e120a"}, + {file = "contourpy-1.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2b7e9480ffe2b0cd2e787e4df64270e3a0440d9db8dc823312e2c940c167df7e"}, + {file = "contourpy-1.3.3-cp313-cp313t-win32.whl", hash = "sha256:283edd842a01e3dcd435b1c5116798d661378d83d36d337b8dde1d16a5fc9ba3"}, + {file = "contourpy-1.3.3-cp313-cp313t-win_amd64.whl", hash = "sha256:87acf5963fc2b34825e5b6b048f40e3635dd547f590b04d2ab317c2619ef7ae8"}, + {file = "contourpy-1.3.3-cp313-cp313t-win_arm64.whl", hash = "sha256:3c30273eb2a55024ff31ba7d052dde990d7d8e5450f4bbb6e913558b3d6c2301"}, + {file = "contourpy-1.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fde6c716d51c04b1c25d0b90364d0be954624a0ee9d60e23e850e8d48353d07a"}, + {file = "contourpy-1.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cbedb772ed74ff5be440fa8eee9bd49f64f6e3fc09436d9c7d8f1c287b121d77"}, + {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22e9b1bd7a9b1d652cd77388465dc358dafcd2e217d35552424aa4f996f524f5"}, + {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a22738912262aa3e254e4f3cb079a95a67132fc5a063890e224393596902f5a4"}, + {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:afe5a512f31ee6bd7d0dda52ec9864c984ca3d66664444f2d72e0dc4eb832e36"}, + {file = "contourpy-1.3.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f64836de09927cba6f79dcd00fdd7d5329f3fccc633468507079c829ca4db4e3"}, + {file = "contourpy-1.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1fd43c3be4c8e5fd6e4f2baeae35ae18176cf2e5cced681cca908addf1cdd53b"}, + {file = "contourpy-1.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6afc576f7b33cf00996e5c1102dc2a8f7cc89e39c0b55df93a0b78c1bd992b36"}, + {file = "contourpy-1.3.3-cp314-cp314-win32.whl", hash = "sha256:66c8a43a4f7b8df8b71ee1840e4211a3c8d93b214b213f590e18a1beca458f7d"}, + {file = "contourpy-1.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:cf9022ef053f2694e31d630feaacb21ea24224be1c3ad0520b13d844274614fd"}, + {file = "contourpy-1.3.3-cp314-cp314-win_arm64.whl", hash = "sha256:95b181891b4c71de4bb404c6621e7e2390745f887f2a026b2d99e92c17892339"}, + {file = "contourpy-1.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:33c82d0138c0a062380332c861387650c82e4cf1747aaa6938b9b6516762e772"}, + {file = "contourpy-1.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ea37e7b45949df430fe649e5de8351c423430046a2af20b1c1961cae3afcda77"}, + {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d304906ecc71672e9c89e87c4675dc5c2645e1f4269a5063b99b0bb29f232d13"}, + {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca658cd1a680a5c9ea96dc61cdbae1e85c8f25849843aa799dfd3cb370ad4fbe"}, + {file = "contourpy-1.3.3-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ab2fd90904c503739a75b7c8c5c01160130ba67944a7b77bbf36ef8054576e7f"}, + {file = 
"contourpy-1.3.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7301b89040075c30e5768810bc96a8e8d78085b47d8be6e4c3f5a0b4ed478a0"}, + {file = "contourpy-1.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2a2a8b627d5cc6b7c41a4beff6c5ad5eb848c88255fda4a8745f7e901b32d8e4"}, + {file = "contourpy-1.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fd6ec6be509c787f1caf6b247f0b1ca598bef13f4ddeaa126b7658215529ba0f"}, + {file = "contourpy-1.3.3-cp314-cp314t-win32.whl", hash = "sha256:e74a9a0f5e3fff48fb5a7f2fd2b9b70a3fe014a67522f79b7cca4c0c7e43c9ae"}, + {file = "contourpy-1.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:13b68d6a62db8eafaebb8039218921399baf6e47bf85006fd8529f2a08ef33fc"}, + {file = "contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b"}, + {file = "contourpy-1.3.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cd5dfcaeb10f7b7f9dc8941717c6c2ade08f587be2226222c12b25f0483ed497"}, + {file = "contourpy-1.3.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0c1fc238306b35f246d61a1d416a627348b5cf0648648a031e14bb8705fcdfe8"}, + {file = "contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70f9aad7de812d6541d29d2bbf8feb22ff7e1c299523db288004e3157ff4674e"}, + {file = "contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ed3657edf08512fc3fe81b510e35c2012fbd3081d2e26160f27ca28affec989"}, + {file = "contourpy-1.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:3d1a3799d62d45c18bafd41c5fa05120b96a28079f2393af559b843d1a966a77"}, + {file = "contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880"}, +] + +[package.dependencies] +numpy = ">=1.25" + +[package.extras] +bokeh = ["bokeh", "selenium"] +docs = ["furo", "sphinx (>=7.2)", "sphinx-copybutton"] +mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.17.0)", "types-Pillow"] +test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] +test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] + [[package]] name = "coverage" version = "7.10.7" description = "Code coverage measurement for Python" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "coverage-7.10.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fc04cc7a3db33664e0c2d10eb8990ff6b3536f6842c9590ae8da4c614b9ed05a"}, {file = "coverage-7.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e201e015644e207139f7e2351980feb7040e6f4b2c2978892f3e3789d1c125e5"}, @@ -199,7 +462,23 @@ files = [ tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} [package.extras] -toml = ["tomli"] +toml = ["tomli ; python_full_version <= \"3.11.0a6\""] + +[[package]] +name = "cycler" +version = "0.12.1" +description = "Composable style cycles" +optional = false +python-versions = ">=3.8" +groups = ["main", "dev"] +files = [ + {file = "cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30"}, + {file = "cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c"}, +] + +[package.extras] +docs = ["ipython", "matplotlib", "numpydoc", "sphinx"] +tests = ["pytest", "pytest-cov", "pytest-xdist"] [[package]] name = "distlib" @@ -207,17 +486,74 @@ version = "0.4.0" description = 
"Distribution utilities" optional = false python-versions = "*" +groups = ["main", "dev"] files = [ {file = "distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16"}, {file = "distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d"}, ] +[[package]] +name = "duckdb" +version = "1.4.3" +description = "DuckDB in-process database" +optional = false +python-versions = ">=3.9.0" +groups = ["main", "dev"] +files = [ + {file = "duckdb-1.4.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:efa7f1191c59e34b688fcd4e588c1b903a4e4e1f4804945902cf0b20e08a9001"}, + {file = "duckdb-1.4.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4fef6a053a1c485292000bf0c338bba60f89d334f6a06fc76ba4085a5a322b76"}, + {file = "duckdb-1.4.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:702dabbc22b27dc5b73e7599c60deef3d8c59968527c36b391773efddd8f4cf1"}, + {file = "duckdb-1.4.3-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:854b79375fa618f6ffa8d84fb45cbc9db887f6c4834076ea10d20bc106f1fd90"}, + {file = "duckdb-1.4.3-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1bb8bd5a3dd205983726185b280a211eacc9f5bc0c4d4505bec8c87ac33a8ccb"}, + {file = "duckdb-1.4.3-cp310-cp310-win_amd64.whl", hash = "sha256:d0ff08388ef8b1d1a4c95c321d6c5fa11201b241036b1ee740f9d841df3d6ba2"}, + {file = "duckdb-1.4.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:366bf607088053dce845c9d24c202c04d78022436cc5d8e4c9f0492de04afbe7"}, + {file = "duckdb-1.4.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8d080e8d1bf2d226423ec781f539c8f6b6ef3fd42a9a58a7160de0a00877a21f"}, + {file = "duckdb-1.4.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9dc049ba7e906cb49ca2b6d4fbf7b6615ec3883193e8abb93f0bef2652e42dda"}, + {file = "duckdb-1.4.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b30245375ea94ab528c87c61fc3ab3e36331180b16af92ee3a37b810a745d24"}, + {file = "duckdb-1.4.3-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a7c864df027da1ee95f0c32def67e15d02cd4a906c9c1cbae82c09c5112f526b"}, + {file = "duckdb-1.4.3-cp311-cp311-win_amd64.whl", hash = "sha256:813f189039b46877b5517f1909c7b94a8fe01b4bde2640ab217537ea0fe9b59b"}, + {file = "duckdb-1.4.3-cp311-cp311-win_arm64.whl", hash = "sha256:fbc63ffdd03835f660155b37a1b6db2005bcd46e5ad398b8cac141eb305d2a3d"}, + {file = "duckdb-1.4.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:6302452e57aef29aae3977063810ed7b2927967b97912947b9cca45c1c21955f"}, + {file = "duckdb-1.4.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:deab351ac43b6282a3270e3d40e3d57b3b50f472d9fd8c30975d88a31be41231"}, + {file = "duckdb-1.4.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5634e40e1e2d972e4f75bced1fbdd9e9e90faa26445c1052b27de97ee546944a"}, + {file = "duckdb-1.4.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:274d4a31aba63115f23e7e7b401e3e3a937f3626dc9dea820a9c7d3073f450d2"}, + {file = "duckdb-1.4.3-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f868a7e6d9b37274a1aa34849ea92aa964e9bd59a5237d6c17e8540533a1e4f"}, + {file = "duckdb-1.4.3-cp312-cp312-win_amd64.whl", hash = "sha256:ef7ef15347ce97201b1b5182a5697682679b04c3374d5a01ac10ba31cf791b95"}, + {file = "duckdb-1.4.3-cp312-cp312-win_arm64.whl", hash = "sha256:1b9b445970fd18274d5ac07a0b24c032e228f967332fb5ebab3d7db27738c0e4"}, + {file = 
"duckdb-1.4.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:16952ac05bd7e7b39946695452bf450db1ebbe387e1e7178e10f593f2ea7b9a8"}, + {file = "duckdb-1.4.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de984cd24a6cbefdd6d4a349f7b9a46e583ca3e58ce10d8def0b20a6e5fcbe78"}, + {file = "duckdb-1.4.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1e5457dda91b67258aae30fb1a0df84183a9f6cd27abac1d5536c0d876c6dfa1"}, + {file = "duckdb-1.4.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:006aca6a6d6736c441b02ff5c7600b099bb8b7f4de094b8b062137efddce42df"}, + {file = "duckdb-1.4.3-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a2813f4635f4d6681cc3304020374c46aca82758c6740d7edbc237fe3aae2744"}, + {file = "duckdb-1.4.3-cp313-cp313-win_amd64.whl", hash = "sha256:6db124f53a3edcb32b0a896ad3519e37477f7e67bf4811cb41ab60c1ef74e4c8"}, + {file = "duckdb-1.4.3-cp313-cp313-win_arm64.whl", hash = "sha256:a8b0a8764e1b5dd043d168c8f749314f7a1252b5a260fa415adaa26fa3b958fd"}, + {file = "duckdb-1.4.3-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:316711a9e852bcfe1ed6241a5f654983f67e909e290495f3562cccdf43be8180"}, + {file = "duckdb-1.4.3-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9e625b2b4d52bafa1fd0ebdb0990c3961dac8bb00e30d327185de95b68202131"}, + {file = "duckdb-1.4.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:130c6760f6c573f9c9fe9aba56adba0fab48811a4871b7b8fd667318b4a3e8da"}, + {file = "duckdb-1.4.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:20c88effaa557a11267706b01419c542fe42f893dee66e5a6daa5974ea2d4a46"}, + {file = "duckdb-1.4.3-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1b35491db98ccd11d151165497c084a9d29d3dc42fc80abea2715a6c861ca43d"}, + {file = "duckdb-1.4.3-cp314-cp314-win_amd64.whl", hash = "sha256:23b12854032c1a58d0452e2b212afa908d4ce64171862f3792ba9a596ba7c765"}, + {file = "duckdb-1.4.3-cp314-cp314-win_arm64.whl", hash = "sha256:90f241f25cffe7241bf9f376754a5845c74775e00e1c5731119dc88cd71e0cb2"}, + {file = "duckdb-1.4.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:aa26a7406205bc1426cee28bdfdf084f669a5686977dafa4c3ec65873989593c"}, + {file = "duckdb-1.4.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:caa2164c91f7e91befb1ffb081b3cd97a137117533aef7abe1538b03ad72e3a9"}, + {file = "duckdb-1.4.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8d53b217698a76c4957e2c807dd9295d409146f9d3d7932f372883201ba9d25a"}, + {file = "duckdb-1.4.3-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8afba22c370f06b7314aa46bfed052509269e482bcfb3f7b1ea0fa17ae49ce42"}, + {file = "duckdb-1.4.3-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b195270ff1a661f22cbd547a215baff265b7d4469a76a215c8992b5994107c3"}, + {file = "duckdb-1.4.3-cp39-cp39-win_amd64.whl", hash = "sha256:23a3a077821bed1768a84ac9cbf6b6487ead33e28e62cb118bda5fb8f9e53dea"}, + {file = "duckdb-1.4.3.tar.gz", hash = "sha256:fea43e03604c713e25a25211ada87d30cd2a044d8f27afab5deba26ac49e5268"}, +] + +[package.extras] +all = ["adbc-driver-manager", "fsspec", "ipython", "numpy", "pandas", "pyarrow"] + [[package]] name = "exceptiongroup" version = "1.3.1" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] +markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.3.1-py3-none-any.whl", hash = 
"sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598"}, {file = "exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219"}, @@ -235,6 +571,8 @@ version = "3.19.1" description = "A platform independent file lock." optional = false python-versions = ">=3.9" +groups = ["main", "dev"] +markers = "python_version == \"3.9\"" files = [ {file = "filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d"}, {file = "filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58"}, @@ -246,17 +584,176 @@ version = "3.20.3" description = "A platform independent file lock." optional = false python-versions = ">=3.10" +groups = ["main", "dev"] +markers = "python_version >= \"3.10\"" files = [ {file = "filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1"}, {file = "filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1"}, ] +[[package]] +name = "fonttools" +version = "4.60.2" +description = "Tools to manipulate font files" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +markers = "python_version == \"3.9\"" +files = [ + {file = "fonttools-4.60.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4e36fadcf7e8ca6e34d490eef86ed638d6fd9c55d2f514b05687622cfc4a7050"}, + {file = "fonttools-4.60.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6e500fc9c04bee749ceabfc20cb4903f6981c2139050d85720ea7ada61b75d5c"}, + {file = "fonttools-4.60.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22efea5e784e1d1cd8d7b856c198e360a979383ebc6dea4604743b56da1cbc34"}, + {file = "fonttools-4.60.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:677aa92d84d335e4d301d8ba04afca6f575316bc647b6782cb0921943fcb6343"}, + {file = "fonttools-4.60.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:edd49d3defbf35476e78b61ff737ff5efea811acff68d44233a95a5a48252334"}, + {file = "fonttools-4.60.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:126839492b69cecc5baf2bddcde60caab2ffafd867bbae2a88463fce6078ca3a"}, + {file = "fonttools-4.60.2-cp310-cp310-win32.whl", hash = "sha256:ffcab6f5537136046ca902ed2491ab081ba271b07591b916289b7c27ff845f96"}, + {file = "fonttools-4.60.2-cp310-cp310-win_amd64.whl", hash = "sha256:9c68b287c7ffcd29dd83b5f961004b2a54a862a88825d52ea219c6220309ba45"}, + {file = "fonttools-4.60.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a2aed0a7931401b3875265717a24c726f87ecfedbb7b3426c2ca4d2812e281ae"}, + {file = "fonttools-4.60.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:dea6868e9d2b816c9076cfea77754686f3c19149873bdbc5acde437631c15df1"}, + {file = "fonttools-4.60.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2fa27f34950aa1fe0f0b1abe25eed04770a3b3b34ad94e5ace82cc341589678a"}, + {file = "fonttools-4.60.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:13a53d479d187b09bfaa4a35ffcbc334fc494ff355f0a587386099cb66674f1e"}, + {file = "fonttools-4.60.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fac5e921d3bd0ca3bb8517dced2784f0742bc8ca28579a68b139f04ea323a779"}, + {file = "fonttools-4.60.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:648f4f9186fd7f1f3cd57dbf00d67a583720d5011feca67a5e88b3a491952cfb"}, + {file = 
"fonttools-4.60.2-cp311-cp311-win32.whl", hash = "sha256:3274e15fad871bead5453d5ce02658f6d0c7bc7e7021e2a5b8b04e2f9e40da1a"}, + {file = "fonttools-4.60.2-cp311-cp311-win_amd64.whl", hash = "sha256:91d058d5a483a1525b367803abb69de0923fbd45e1f82ebd000f5c8aa65bc78e"}, + {file = "fonttools-4.60.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e0164b7609d2b5c5dd4e044b8085b7bd7ca7363ef8c269a4ab5b5d4885a426b2"}, + {file = "fonttools-4.60.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:1dd3d9574fc595c1e97faccae0f264dc88784ddf7fbf54c939528378bacc0033"}, + {file = "fonttools-4.60.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:98d0719f1b11c2817307d2da2e94296a3b2a3503f8d6252a101dca3ee663b917"}, + {file = "fonttools-4.60.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9d3ea26957dd07209f207b4fff64c702efe5496de153a54d3b91007ec28904dd"}, + {file = "fonttools-4.60.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1ee301273b0850f3a515299f212898f37421f42ff9adfc341702582ca5073c13"}, + {file = "fonttools-4.60.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c6eb4694cc3b9c03b7c01d65a9cf35b577f21aa6abdbeeb08d3114b842a58153"}, + {file = "fonttools-4.60.2-cp312-cp312-win32.whl", hash = "sha256:57f07b616c69c244cc1a5a51072eeef07dddda5ebef9ca5c6e9cf6d59ae65b70"}, + {file = "fonttools-4.60.2-cp312-cp312-win_amd64.whl", hash = "sha256:310035802392f1fe5a7cf43d76f6ff4a24c919e4c72c0352e7b8176e2584b8a0"}, + {file = "fonttools-4.60.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2bb5fd231e56ccd7403212636dcccffc96c5ae0d6f9e4721fa0a32cb2e3ca432"}, + {file = "fonttools-4.60.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:536b5fab7b6fec78ccf59b5c59489189d9d0a8b0d3a77ed1858be59afb096696"}, + {file = "fonttools-4.60.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:6b9288fc38252ac86a9570f19313ecbc9ff678982e0f27c757a85f1f284d3400"}, + {file = "fonttools-4.60.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93fcb420791d839ef592eada2b69997c445d0ce9c969b5190f2e16828ec10607"}, + {file = "fonttools-4.60.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7916a381b094db4052ac284255186aebf74c5440248b78860cb41e300036f598"}, + {file = "fonttools-4.60.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:58c8c393d5e16b15662cfc2d988491940458aa87894c662154f50c7b49440bef"}, + {file = "fonttools-4.60.2-cp313-cp313-win32.whl", hash = "sha256:19c6e0afd8b02008caa0aa08ab896dfce5d0bcb510c49b2c499541d5cb95a963"}, + {file = "fonttools-4.60.2-cp313-cp313-win_amd64.whl", hash = "sha256:6a500dc59e11b2338c2dba1f8cf11a4ae8be35ec24af8b2628b8759a61457b76"}, + {file = "fonttools-4.60.2-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:9387c532acbe323bbf2a920f132bce3c408a609d5f9dcfc6532fbc7e37f8ccbb"}, + {file = "fonttools-4.60.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e6f1c824185b5b8fb681297f315f26ae55abb0d560c2579242feea8236b1cfef"}, + {file = "fonttools-4.60.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:55a3129d1e4030b1a30260f1b32fe76781b585fb2111d04a988e141c09eb6403"}, + {file = "fonttools-4.60.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b196e63753abc33b3b97a6fd6de4b7c4fef5552c0a5ba5e562be214d1e9668e0"}, + {file = 
"fonttools-4.60.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:de76c8d740fb55745f3b154f0470c56db92ae3be27af8ad6c2e88f1458260c9a"}, + {file = "fonttools-4.60.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6ba6303225c95998c9fda2d410aa792c3d2c1390a09df58d194b03e17583fa25"}, + {file = "fonttools-4.60.2-cp314-cp314-win32.whl", hash = "sha256:0a89728ce10d7c816fedaa5380c06d2793e7a8a634d7ce16810e536c22047384"}, + {file = "fonttools-4.60.2-cp314-cp314-win_amd64.whl", hash = "sha256:fa8446e6ab8bd778b82cb1077058a2addba86f30de27ab9cc18ed32b34bc8667"}, + {file = "fonttools-4.60.2-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:4063bc81ac5a4137642865cb63dd270e37b3cd1f55a07c0d6e41d072699ccca2"}, + {file = "fonttools-4.60.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:ebfdb66fa69732ed604ab8e2a0431e6deff35e933a11d73418cbc7823d03b8e1"}, + {file = "fonttools-4.60.2-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:50b10b3b1a72d1d54c61b0e59239e1a94c0958f4a06a1febf97ce75388dd91a4"}, + {file = "fonttools-4.60.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:beae16891a13b4a2ddec9b39b4de76092a3025e4d1c82362e3042b62295d5e4d"}, + {file = "fonttools-4.60.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:522f017fdb3766fd5d2d321774ef351cc6ce88ad4e6ac9efe643e4a2b9d528db"}, + {file = "fonttools-4.60.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:82cceceaf9c09a965a75b84a4b240dd3768e596ffb65ef53852681606fe7c9ba"}, + {file = "fonttools-4.60.2-cp314-cp314t-win32.whl", hash = "sha256:bbfbc918a75437fe7e6d64d1b1e1f713237df1cf00f3a36dedae910b2ba01cee"}, + {file = "fonttools-4.60.2-cp314-cp314t-win_amd64.whl", hash = "sha256:0e5cd9b0830f6550d58c84f3ab151a9892b50c4f9d538c5603c0ce6fff2eb3f1"}, + {file = "fonttools-4.60.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:a3c75b8b42f7f93906bdba9eb1197bb76aecbe9a0a7cf6feec75f7605b5e8008"}, + {file = "fonttools-4.60.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0f86c8c37bc0ec0b9c141d5e90c717ff614e93c187f06d80f18c7057097f71bc"}, + {file = "fonttools-4.60.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fe905403fe59683b0e9a45f234af2866834376b8821f34633b1c76fb731b6311"}, + {file = "fonttools-4.60.2-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:38ce703b60a906e421e12d9e3a7f064883f5e61bb23e8961f4be33cfe578500b"}, + {file = "fonttools-4.60.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:9e810c06f3e79185cecf120e58b343ea5a89b54dd695fd644446bcf8c026da5e"}, + {file = "fonttools-4.60.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:38faec8cc1d12122599814d15a402183f5123fb7608dac956121e7c6742aebc5"}, + {file = "fonttools-4.60.2-cp39-cp39-win32.whl", hash = "sha256:80a45cf7bf659acb7b36578f300231873daba67bd3ca8cce181c73f861f14a37"}, + {file = "fonttools-4.60.2-cp39-cp39-win_amd64.whl", hash = "sha256:c355d5972071938e1b1e0f5a1df001f68ecf1a62f34a3407dc8e0beccf052501"}, + {file = "fonttools-4.60.2-py3-none-any.whl", hash = "sha256:73cf92eeda67cf6ff10c8af56fc8f4f07c1647d989a979be9e388a49be26552a"}, + {file = "fonttools-4.60.2.tar.gz", hash = "sha256:d29552e6b155ebfc685b0aecf8d429cb76c14ab734c22ef5d3dea6fdf800c92c"}, +] + +[package.extras] +all = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", 
"matplotlib", "munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\"", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.45.0)", "unicodedata2 (>=17.0.0) ; python_version <= \"3.14\"", "xattr ; sys_platform == \"darwin\"", "zopfli (>=0.1.4)"] +graphite = ["lz4 (>=1.7.4.2)"] +interpolatable = ["munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\""] +lxml = ["lxml (>=4.0)"] +pathops = ["skia-pathops (>=0.5.0)"] +plot = ["matplotlib"] +repacker = ["uharfbuzz (>=0.45.0)"] +symfont = ["sympy"] +type1 = ["xattr ; sys_platform == \"darwin\""] +unicode = ["unicodedata2 (>=17.0.0) ; python_version <= \"3.14\""] +woff = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "zopfli (>=0.1.4)"] + +[[package]] +name = "fonttools" +version = "4.61.1" +description = "Tools to manipulate font files" +optional = false +python-versions = ">=3.10" +groups = ["main", "dev"] +markers = "python_version >= \"3.10\"" +files = [ + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24"}, + {file = "fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8db08051fc9e7d8bc622f2112511b8107d8f27cd89e2f64ec45e9825e8288da"}, + {file = "fonttools-4.61.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a76d4cb80f41ba94a6691264be76435e5f72f2cb3cab0b092a6212855f71c2f6"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a13fc8aeb24bad755eea8f7f9d409438eb94e82cf86b08fe77a03fbc8f6a96b1"}, + {file = "fonttools-4.61.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b846a1fcf8beadeb9ea4f44ec5bdde393e2f1569e17d700bfc49cd69bde75881"}, + {file = "fonttools-4.61.1-cp310-cp310-win32.whl", hash = "sha256:78a7d3ab09dc47ac1a363a493e6112d8cabed7ba7caad5f54dbe2f08676d1b47"}, + {file = "fonttools-4.61.1-cp310-cp310-win_amd64.whl", hash = "sha256:eff1ac3cc66c2ac7cda1e64b4e2f3ffef474b7335f92fc3833fc632d595fcee6"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09"}, + {file = "fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb"}, + {file = "fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87"}, + {file = "fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56"}, + {file = "fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a"}, + {file = "fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = 
"sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e"}, + {file = "fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796"}, + {file = "fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8"}, + {file = "fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0"}, + {file = "fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261"}, + {file = "fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c"}, + {file = "fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5"}, + {file = "fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3"}, + {file = "fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d"}, + {file = "fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c"}, + {file = "fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd"}, + {file = "fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c"}, + {file = "fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75"}, + {file = "fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063"}, + {file = 
"fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2"}, + {file = "fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c"}, + {file = "fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa"}, + {file = "fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19"}, + {file = "fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7"}, + {file = "fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118"}, + {file = "fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5"}, + {file = "fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b"}, + {file = "fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371"}, + {file = "fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69"}, +] + +[package.extras] +all = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "lxml (>=4.0)", "lz4 (>=1.7.4.2)", "matplotlib", "munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\"", "skia-pathops (>=0.5.0)", "sympy", "uharfbuzz (>=0.45.0)", "unicodedata2 (>=17.0.0) ; python_version <= \"3.14\"", "xattr ; sys_platform == \"darwin\"", "zopfli (>=0.1.4)"] +graphite = ["lz4 (>=1.7.4.2)"] +interpolatable = ["munkres ; platform_python_implementation == \"PyPy\"", "pycairo", "scipy ; platform_python_implementation != \"PyPy\""] +lxml = ["lxml (>=4.0)"] +pathops = ["skia-pathops (>=0.5.0)"] +plot = ["matplotlib"] +repacker = ["uharfbuzz (>=0.45.0)"] +symfont = ["sympy"] +type1 = ["xattr ; sys_platform == \"darwin\""] +unicode = ["unicodedata2 (>=17.0.0) ; python_version <= \"3.14\""] +woff = ["brotli (>=1.0.1) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\"", "zopfli (>=0.1.4)"] + [[package]] name = "googleapis-common-protos" version = "1.72.0" description = "Common protobufs used in Google APIs" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "googleapis_common_protos-1.72.0-py3-none-any.whl", hash = "sha256:4299c5a82d5ae1a9702ada957347726b167f9f8d1fc352477702a1e851ff4038"}, {file = "googleapis_common_protos-1.72.0.tar.gz", hash = "sha256:e55a601c1b32b52d7a3e65f43563e2aa61bcd737998ee672ac9b951cd49319f5"}, @@ 
-274,6 +771,7 @@ version = "1.76.0" description = "HTTP/2-based RPC framework" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "grpcio-1.76.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:65a20de41e85648e00305c1bb09a3598f840422e522277641145a32d42dcefcc"}, {file = "grpcio-1.76.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:40ad3afe81676fd9ec6d9d406eda00933f218038433980aa19d401490e46ecde"}, @@ -350,6 +848,7 @@ version = "1.76.0" description = "Status proto mapping for gRPC" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "grpcio_status-1.76.0-py3-none-any.whl", hash = "sha256:380568794055a8efbbd8871162df92012e0228a5f6dffaf57f2a00c534103b18"}, {file = "grpcio_status-1.76.0.tar.gz", hash = "sha256:25fcbfec74c15d1a1cb5da3fab8ee9672852dc16a5a9eeb5baf7d7a9952943cd"}, @@ -366,6 +865,7 @@ version = "2.6.15" description = "File identification library for Python" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "identify-2.6.15-py2.py3-none-any.whl", hash = "sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757"}, {file = "identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf"}, @@ -374,23 +874,433 @@ files = [ [package.extras] license = ["ukkonen"] +[[package]] +name = "importlib-resources" +version = "6.5.2" +description = "Read resources from Python packages" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +markers = "python_version == \"3.9\"" +files = [ + {file = "importlib_resources-6.5.2-py3-none-any.whl", hash = "sha256:789cfdc3ed28c78b67a06acb8126751ced69a3d5f79c095a98298cd8a760ccec"}, + {file = "importlib_resources-6.5.2.tar.gz", hash = "sha256:185f87adef5bcc288449d98fb4fba07cea78bc036455dd44c5fc4a2fe78fed2c"}, +] + +[package.dependencies] +zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"] +type = ["pytest-mypy"] + [[package]] name = "iniconfig" version = "2.1.0" description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760"}, {file = "iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7"}, ] +[[package]] +name = "kiwisolver" +version = "1.4.7" +description = "A fast implementation of the Cassowary constraint solver" +optional = false +python-versions = ">=3.8" +groups = ["main", "dev"] +markers = "python_version == \"3.9\"" +files = [ + {file = "kiwisolver-1.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:8a9c83f75223d5e48b0bc9cb1bf2776cf01563e00ade8775ffe13b0b6e1af3a6"}, + {file = "kiwisolver-1.4.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:58370b1ffbd35407444d57057b57da5d6549d2d854fa30249771775c63b5fe17"}, + {file = "kiwisolver-1.4.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aa0abdf853e09aff551db11fce173e2177d00786c688203f52c87ad7fcd91ef9"}, + {file = 
"kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8d53103597a252fb3ab8b5845af04c7a26d5e7ea8122303dd7a021176a87e8b9"}, + {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:88f17c5ffa8e9462fb79f62746428dd57b46eb931698e42e990ad63103f35e6c"}, + {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:88a9ca9c710d598fd75ee5de59d5bda2684d9db36a9f50b6125eaea3969c2599"}, + {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f4d742cb7af1c28303a51b7a27aaee540e71bb8e24f68c736f6f2ffc82f2bf05"}, + {file = "kiwisolver-1.4.7-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e28c7fea2196bf4c2f8d46a0415c77a1c480cc0724722f23d7410ffe9842c407"}, + {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e968b84db54f9d42046cf154e02911e39c0435c9801681e3fc9ce8a3c4130278"}, + {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0c18ec74c0472de033e1bebb2911c3c310eef5649133dd0bedf2a169a1b269e5"}, + {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8f0ea6da6d393d8b2e187e6a5e3fb81f5862010a40c3945e2c6d12ae45cfb2ad"}, + {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:f106407dda69ae456dd1227966bf445b157ccc80ba0dff3802bb63f30b74e895"}, + {file = "kiwisolver-1.4.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:84ec80df401cfee1457063732d90022f93951944b5b58975d34ab56bb150dfb3"}, + {file = "kiwisolver-1.4.7-cp310-cp310-win32.whl", hash = "sha256:71bb308552200fb2c195e35ef05de12f0c878c07fc91c270eb3d6e41698c3bcc"}, + {file = "kiwisolver-1.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:44756f9fd339de0fb6ee4f8c1696cfd19b2422e0d70b4cefc1cc7f1f64045a8c"}, + {file = "kiwisolver-1.4.7-cp310-cp310-win_arm64.whl", hash = "sha256:78a42513018c41c2ffd262eb676442315cbfe3c44eed82385c2ed043bc63210a"}, + {file = "kiwisolver-1.4.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:d2b0e12a42fb4e72d509fc994713d099cbb15ebf1103545e8a45f14da2dfca54"}, + {file = "kiwisolver-1.4.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2a8781ac3edc42ea4b90bc23e7d37b665d89423818e26eb6df90698aa2287c95"}, + {file = "kiwisolver-1.4.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:46707a10836894b559e04b0fd143e343945c97fd170d69a2d26d640b4e297935"}, + {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ef97b8df011141c9b0f6caf23b29379f87dd13183c978a30a3c546d2c47314cb"}, + {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ab58c12a2cd0fc769089e6d38466c46d7f76aced0a1f54c77652446733d2d02"}, + {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:803b8e1459341c1bb56d1c5c010406d5edec8a0713a0945851290a7930679b51"}, + {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9a9e8a507420fe35992ee9ecb302dab68550dedc0da9e2880dd88071c5fb052"}, + {file = "kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18077b53dc3bb490e330669a99920c5e6a496889ae8c63b58fbc57c3d7f33a18"}, + {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6af936f79086a89b3680a280c47ea90b4df7047b5bdf3aa5c524bbedddb9e545"}, + {file = 
"kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:3abc5b19d24af4b77d1598a585b8a719beb8569a71568b66f4ebe1fb0449460b"}, + {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:933d4de052939d90afbe6e9d5273ae05fb836cc86c15b686edd4b3560cc0ee36"}, + {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:65e720d2ab2b53f1f72fb5da5fb477455905ce2c88aaa671ff0a447c2c80e8e3"}, + {file = "kiwisolver-1.4.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3bf1ed55088f214ba6427484c59553123fdd9b218a42bbc8c6496d6754b1e523"}, + {file = "kiwisolver-1.4.7-cp311-cp311-win32.whl", hash = "sha256:4c00336b9dd5ad96d0a558fd18a8b6f711b7449acce4c157e7343ba92dd0cf3d"}, + {file = "kiwisolver-1.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:929e294c1ac1e9f615c62a4e4313ca1823ba37326c164ec720a803287c4c499b"}, + {file = "kiwisolver-1.4.7-cp311-cp311-win_arm64.whl", hash = "sha256:e33e8fbd440c917106b237ef1a2f1449dfbb9b6f6e1ce17c94cd6a1e0d438376"}, + {file = "kiwisolver-1.4.7-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:5360cc32706dab3931f738d3079652d20982511f7c0ac5711483e6eab08efff2"}, + {file = "kiwisolver-1.4.7-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:942216596dc64ddb25adb215c3c783215b23626f8d84e8eff8d6d45c3f29f75a"}, + {file = "kiwisolver-1.4.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:48b571ecd8bae15702e4f22d3ff6a0f13e54d3d00cd25216d5e7f658242065ee"}, + {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ad42ba922c67c5f219097b28fae965e10045ddf145d2928bfac2eb2e17673640"}, + {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:612a10bdae23404a72941a0fc8fa2660c6ea1217c4ce0dbcab8a8f6543ea9e7f"}, + {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9e838bba3a3bac0fe06d849d29772eb1afb9745a59710762e4ba3f4cb8424483"}, + {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:22f499f6157236c19f4bbbd472fa55b063db77a16cd74d49afe28992dff8c258"}, + {file = "kiwisolver-1.4.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:693902d433cf585133699972b6d7c42a8b9f8f826ebcaf0132ff55200afc599e"}, + {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4e77f2126c3e0b0d055f44513ed349038ac180371ed9b52fe96a32aa071a5107"}, + {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:657a05857bda581c3656bfc3b20e353c232e9193eb167766ad2dc58b56504948"}, + {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4bfa75a048c056a411f9705856abfc872558e33c055d80af6a380e3658766038"}, + {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:34ea1de54beef1c104422d210c47c7d2a4999bdecf42c7b5718fbe59a4cac383"}, + {file = "kiwisolver-1.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:90da3b5f694b85231cf93586dad5e90e2d71b9428f9aad96952c99055582f520"}, + {file = "kiwisolver-1.4.7-cp312-cp312-win32.whl", hash = "sha256:18e0cca3e008e17fe9b164b55735a325140a5a35faad8de92dd80265cd5eb80b"}, + {file = "kiwisolver-1.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:58cb20602b18f86f83a5c87d3ee1c766a79c0d452f8def86d925e6c60fbf7bfb"}, + {file = "kiwisolver-1.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:f5a8b53bdc0b3961f8b6125e198617c40aeed638b387913bf1ce78afb1b0be2a"}, + {file = 
"kiwisolver-1.4.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2e6039dcbe79a8e0f044f1c39db1986a1b8071051efba3ee4d74f5b365f5226e"}, + {file = "kiwisolver-1.4.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a1ecf0ac1c518487d9d23b1cd7139a6a65bc460cd101ab01f1be82ecf09794b6"}, + {file = "kiwisolver-1.4.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7ab9ccab2b5bd5702ab0803676a580fffa2aa178c2badc5557a84cc943fcf750"}, + {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f816dd2277f8d63d79f9c8473a79fe54047bc0467754962840782c575522224d"}, + {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf8bcc23ceb5a1b624572a1623b9f79d2c3b337c8c455405ef231933a10da379"}, + {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dea0bf229319828467d7fca8c7c189780aa9ff679c94539eed7532ebe33ed37c"}, + {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c06a4c7cf15ec739ce0e5971b26c93638730090add60e183530d70848ebdd34"}, + {file = "kiwisolver-1.4.7-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:913983ad2deb14e66d83c28b632fd35ba2b825031f2fa4ca29675e665dfecbe1"}, + {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5337ec7809bcd0f424c6b705ecf97941c46279cf5ed92311782c7c9c2026f07f"}, + {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4c26ed10c4f6fa6ddb329a5120ba3b6db349ca192ae211e882970bfc9d91420b"}, + {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:c619b101e6de2222c1fcb0531e1b17bbffbe54294bfba43ea0d411d428618c27"}, + {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:073a36c8273647592ea332e816e75ef8da5c303236ec0167196793eb1e34657a"}, + {file = "kiwisolver-1.4.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:3ce6b2b0231bda412463e152fc18335ba32faf4e8c23a754ad50ffa70e4091ee"}, + {file = "kiwisolver-1.4.7-cp313-cp313-win32.whl", hash = "sha256:f4c9aee212bc89d4e13f58be11a56cc8036cabad119259d12ace14b34476fd07"}, + {file = "kiwisolver-1.4.7-cp313-cp313-win_amd64.whl", hash = "sha256:8a3ec5aa8e38fc4c8af308917ce12c536f1c88452ce554027e55b22cbbfbff76"}, + {file = "kiwisolver-1.4.7-cp313-cp313-win_arm64.whl", hash = "sha256:76c8094ac20ec259471ac53e774623eb62e6e1f56cd8690c67ce6ce4fcb05650"}, + {file = "kiwisolver-1.4.7-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5d5abf8f8ec1f4e22882273c423e16cae834c36856cac348cfbfa68e01c40f3a"}, + {file = "kiwisolver-1.4.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:aeb3531b196ef6f11776c21674dba836aeea9d5bd1cf630f869e3d90b16cfade"}, + {file = "kiwisolver-1.4.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b7d755065e4e866a8086c9bdada157133ff466476a2ad7861828e17b6026e22c"}, + {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08471d4d86cbaec61f86b217dd938a83d85e03785f51121e791a6e6689a3be95"}, + {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7bbfcb7165ce3d54a3dfbe731e470f65739c4c1f85bb1018ee912bae139e263b"}, + {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5d34eb8494bea691a1a450141ebb5385e4b69d38bb8403b5146ad279f4b30fa3"}, + {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:9242795d174daa40105c1d86aba618e8eab7bf96ba8c3ee614da8302a9f95503"}, + {file = "kiwisolver-1.4.7-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a0f64a48bb81af7450e641e3fe0b0394d7381e342805479178b3d335d60ca7cf"}, + {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8e045731a5416357638d1700927529e2b8ab304811671f665b225f8bf8d8f933"}, + {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:4322872d5772cae7369f8351da1edf255a604ea7087fe295411397d0cfd9655e"}, + {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:e1631290ee9271dffe3062d2634c3ecac02c83890ada077d225e081aca8aab89"}, + {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:edcfc407e4eb17e037bca59be0e85a2031a2ac87e4fed26d3e9df88b4165f92d"}, + {file = "kiwisolver-1.4.7-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:4d05d81ecb47d11e7f8932bd8b61b720bf0b41199358f3f5e36d38e28f0532c5"}, + {file = "kiwisolver-1.4.7-cp38-cp38-win32.whl", hash = "sha256:b38ac83d5f04b15e515fd86f312479d950d05ce2368d5413d46c088dda7de90a"}, + {file = "kiwisolver-1.4.7-cp38-cp38-win_amd64.whl", hash = "sha256:d83db7cde68459fc803052a55ace60bea2bae361fc3b7a6d5da07e11954e4b09"}, + {file = "kiwisolver-1.4.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3f9362ecfca44c863569d3d3c033dbe8ba452ff8eed6f6b5806382741a1334bd"}, + {file = "kiwisolver-1.4.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e8df2eb9b2bac43ef8b082e06f750350fbbaf2887534a5be97f6cf07b19d9583"}, + {file = "kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f32d6edbc638cde7652bd690c3e728b25332acbadd7cad670cc4a02558d9c417"}, + {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:e2e6c39bd7b9372b0be21456caab138e8e69cc0fc1190a9dfa92bd45a1e6e904"}, + {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:dda56c24d869b1193fcc763f1284b9126550eaf84b88bbc7256e15028f19188a"}, + {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79849239c39b5e1fd906556c474d9b0439ea6792b637511f3fe3a41158d89ca8"}, + {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5e3bc157fed2a4c02ec468de4ecd12a6e22818d4f09cde2c31ee3226ffbefab2"}, + {file = "kiwisolver-1.4.7-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3da53da805b71e41053dc670f9a820d1157aae77b6b944e08024d17bcd51ef88"}, + {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8705f17dfeb43139a692298cb6637ee2e59c0194538153e83e9ee0c75c2eddde"}, + {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:82a5c2f4b87c26bb1a0ef3d16b5c4753434633b83d365cc0ddf2770c93829e3c"}, + {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:ce8be0466f4c0d585cdb6c1e2ed07232221df101a4c6f28821d2aa754ca2d9e2"}, + {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:409afdfe1e2e90e6ee7fc896f3df9a7fec8e793e58bfa0d052c8a82f99c37abb"}, + {file = "kiwisolver-1.4.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5b9c3f4ee0b9a439d2415012bd1b1cc2df59e4d6a9939f4d669241d30b414327"}, + {file = "kiwisolver-1.4.7-cp39-cp39-win32.whl", hash = "sha256:a79ae34384df2b615eefca647a2873842ac3b596418032bef9a7283675962644"}, + {file = "kiwisolver-1.4.7-cp39-cp39-win_amd64.whl", hash = "sha256:cf0438b42121a66a3a667de17e779330fc0f20b0d97d59d2f2121e182b0505e4"}, + 
{file = "kiwisolver-1.4.7-cp39-cp39-win_arm64.whl", hash = "sha256:764202cc7e70f767dab49e8df52c7455e8de0df5d858fa801a11aa0d882ccf3f"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:94252291e3fe68001b1dd747b4c0b3be12582839b95ad4d1b641924d68fd4643"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5b7dfa3b546da08a9f622bb6becdb14b3e24aaa30adba66749d38f3cc7ea9706"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bd3de6481f4ed8b734da5df134cd5a6a64fe32124fe83dde1e5b5f29fe30b1e6"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a91b5f9f1205845d488c928e8570dcb62b893372f63b8b6e98b863ebd2368ff2"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40fa14dbd66b8b8f470d5fc79c089a66185619d31645f9b0773b88b19f7223c4"}, + {file = "kiwisolver-1.4.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:eb542fe7933aa09d8d8f9d9097ef37532a7df6497819d16efe4359890a2f417a"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:bfa1acfa0c54932d5607e19a2c24646fb4c1ae2694437789129cf099789a3b00"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:eee3ea935c3d227d49b4eb85660ff631556841f6e567f0f7bda972df6c2c9935"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:f3160309af4396e0ed04db259c3ccbfdc3621b5559b5453075e5de555e1f3a1b"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a17f6a29cf8935e587cc8a4dbfc8368c55edc645283db0ce9801016f83526c2d"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:10849fb2c1ecbfae45a693c070e0320a91b35dd4bcf58172c023b994283a124d"}, + {file = "kiwisolver-1.4.7-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:ac542bf38a8a4be2dc6b15248d36315ccc65f0743f7b1a76688ffb6b5129a5c2"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:8b01aac285f91ca889c800042c35ad3b239e704b150cfd3382adfc9dcc780e39"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:48be928f59a1f5c8207154f935334d374e79f2b5d212826307d072595ad76a2e"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f37cfe618a117e50d8c240555331160d73d0411422b59b5ee217843d7b693608"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:599b5c873c63a1f6ed7eead644a8a380cfbdf5db91dcb6f85707aaab213b1674"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:801fa7802e5cfabe3ab0c81a34c323a319b097dfb5004be950482d882f3d7225"}, + {file = "kiwisolver-1.4.7-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:0c6c43471bc764fad4bc99c5c2d6d16a676b1abf844ca7c8702bdae92df01ee0"}, + {file = "kiwisolver-1.4.7.tar.gz", hash = "sha256:9893ff81bd7107f7b685d3017cc6583daadb4fc26e4a888350df530e41980a60"}, +] + +[[package]] +name = "kiwisolver" +version = "1.4.9" +description = "A fast implementation of the Cassowary constraint solver" +optional = false +python-versions = ">=3.10" +groups = ["main", "dev"] +markers = "python_version >= \"3.10\"" +files = [ + {file = 
"kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b"}, + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fb3b8132019ea572f4611d770991000d7f58127560c4889729248eb5852a102f"}, + {file = "kiwisolver-1.4.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84fd60810829c27ae375114cd379da1fa65e6918e1da405f356a775d49a62bcf"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b78efa4c6e804ecdf727e580dbb9cba85624d2e1c6b5cb059c66290063bd99a9"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4efec7bcf21671db6a3294ff301d2fc861c31faa3c8740d1a94689234d1b415"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90f47e70293fc3688b71271100a1a5453aa9944a81d27ff779c108372cf5567b"}, + {file = "kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8fdca1def57a2e88ef339de1737a1449d6dbf5fab184c54a1fca01d541317154"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9cf554f21be770f5111a1690d42313e140355e687e05cf82cb23d0a721a64a48"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fc1795ac5cd0510207482c3d1d3ed781143383b8cfd36f5c645f3897ce066220"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ccd09f20ccdbbd341b21a67ab50a119b64a403b09288c27481575105283c1586"}, + {file = "kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:540c7c72324d864406a009d72f5d6856f49693db95d1fbb46cf86febef873634"}, + {file = "kiwisolver-1.4.9-cp310-cp310-win_amd64.whl", hash = "sha256:ede8c6d533bc6601a47ad4046080d36b8fc99f81e6f1c17b0ac3c2dc91ac7611"}, + {file = "kiwisolver-1.4.9-cp310-cp310-win_arm64.whl", hash = "sha256:7b4da0d01ac866a57dd61ac258c5607b4cd677f63abaec7b148354d2b2cdd536"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eb14a5da6dc7642b0f3a18f13654847cd8b7a2550e2645a5bda677862b03ba16"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39a219e1c81ae3b103643d2aedb90f1ef22650deb266ff12a19e7773f3e5f089"}, + {file = "kiwisolver-1.4.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2405a7d98604b87f3fc28b1716783534b1b4b8510d8142adca34ee0bc3c87543"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dc1ae486f9abcef254b5618dfb4113dd49f94c68e3e027d03cf0143f3f772b61"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a1f570ce4d62d718dce3f179ee78dac3b545ac16c0c04bb363b7607a949c0d1"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb27e7b78d716c591e88e0a09a2139c6577865d7f2e152488c2cc6257f460872"}, + {file = "kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:15163165efc2f627eb9687ea5f3a28137217d217ac4024893d753f46bce9de26"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bdee92c56a71d2b24c33a7d4c2856bd6419d017e08caa7802d2963870e315028"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:412f287c55a6f54b0650bd9b6dce5aceddb95864a1a90c87af16979d37c89771"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_s390x.whl", hash = 
"sha256:2c93f00dcba2eea70af2be5f11a830a742fe6b579a1d4e00f47760ef13be247a"}, + {file = "kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f117e1a089d9411663a3207ba874f31be9ac8eaa5b533787024dc07aeb74f464"}, + {file = "kiwisolver-1.4.9-cp311-cp311-win_amd64.whl", hash = "sha256:be6a04e6c79819c9a8c2373317d19a96048e5a3f90bec587787e86a1153883c2"}, + {file = "kiwisolver-1.4.9-cp311-cp311-win_arm64.whl", hash = "sha256:0ae37737256ba2de764ddc12aed4956460277f00c4996d51a197e72f62f5eec7"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ac5a486ac389dddcc5bef4f365b6ae3ffff2c433324fb38dd35e3fab7c957999"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2ba92255faa7309d06fe44c3a4a97efe1c8d640c2a79a5ef728b685762a6fd2"}, + {file = "kiwisolver-1.4.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a2899935e724dd1074cb568ce7ac0dce28b2cd6ab539c8e001a8578eb106d14"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f6008a4919fdbc0b0097089f67a1eb55d950ed7e90ce2cc3e640abadd2757a04"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bb8b474b4181770f926f7b7d2f8c0248cbcb78b660fdd41a47054b28d2a752"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2327a4a30d3ee07d2fbe2e7933e8a37c591663b96ce42a00bc67461a87d7df77"}, + {file = "kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a08b491ec91b1d5053ac177afe5290adacf1f0f6307d771ccac5de30592d198"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8fc5c867c22b828001b6a38d2eaeb88160bf5783c6cb4a5e440efc981ce286d"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3b3115b2581ea35bb6d1f24a4c90af37e5d9b49dcff267eeed14c3893c5b86ab"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858e4c22fb075920b96a291928cb7dea5644e94c0ee4fcd5af7e865655e4ccf2"}, + {file = "kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ed0fecd28cc62c54b262e3736f8bb2512d8dcfdc2bcf08be5f47f96bf405b145"}, + {file = "kiwisolver-1.4.9-cp312-cp312-win_amd64.whl", hash = "sha256:f68208a520c3d86ea51acf688a3e3002615a7f0238002cccc17affecc86a8a54"}, + {file = "kiwisolver-1.4.9-cp312-cp312-win_arm64.whl", hash = "sha256:2c1a4f57df73965f3f14df20b80ee29e6a7930a57d2d9e8491a25f676e197c60"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5d0432ccf1c7ab14f9949eec60c5d1f924f17c037e9f8b33352fa05799359b8"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efb3a45b35622bb6c16dbfab491a8f5a391fe0e9d45ef32f4df85658232ca0e2"}, + {file = "kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a12cf6398e8a0a001a059747a1cbf24705e18fe413bc22de7b3d15c67cffe3f"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5656aa670507437af0207645273ccdfee4f14bacd7f7c67a4306d0dcaeaf6eed"}, + {file = "kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bfc08add558155345129c7803b3671cf195e6a56e7a12f3dde7c57d9b417f525"}, + {file = 
"kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:40092754720b174e6ccf9e845d0d8c7d8e12c3d71e7fc35f55f3813e96376f78"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:497d05f29a1300d14e02e6441cf0f5ee81c1ff5a304b0d9fb77423974684e08b"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdd1a81a1860476eb41ac4bc1e07b3f07259e6d55bbf739b79c8aaedcf512799"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e6b93f13371d341afee3be9f7c5964e3fe61d5fa30f6a30eb49856935dfe4fc3"}, + {file = "kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d75aa530ccfaa593da12834b86a0724f58bff12706659baa9227c2ccaa06264c"}, + {file = "kiwisolver-1.4.9-cp313-cp313-win_amd64.whl", hash = "sha256:dd0a578400839256df88c16abddf9ba14813ec5f21362e1fe65022e00c883d4d"}, + {file = "kiwisolver-1.4.9-cp313-cp313-win_arm64.whl", hash = "sha256:d4188e73af84ca82468f09cadc5ac4db578109e52acb4518d8154698d3a87ca2"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5a0f2724dfd4e3b3ac5a82436a8e6fd16baa7d507117e4279b660fe8ca38a3a1"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b11d6a633e4ed84fc0ddafd4ebfd8ea49b3f25082c04ad12b8315c11d504dc1"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61874cdb0a36016354853593cffc38e56fc9ca5aa97d2c05d3dcf6922cd55a11"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:60c439763a969a6af93b4881db0eed8fadf93ee98e18cbc35bc8da868d0c4f0c"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92a2f997387a1b79a75e7803aa7ded2cfbe2823852ccf1ba3bcf613b62ae3197"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31d512c812daea6d8b3be3b2bfcbeb091dbb09177706569bcfc6240dcf8b41c"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:52a15b0f35dad39862d376df10c5230155243a2c1a436e39eb55623ccbd68185"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a30fd6fdef1430fd9e1ba7b3398b5ee4e2887783917a687d86ba69985fb08748"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cc9617b46837c6468197b5945e196ee9ca43057bb7d9d1ae688101e4e1dddf64"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:0ab74e19f6a2b027ea4f845a78827969af45ce790e6cb3e1ebab71bdf9f215ff"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dba5ee5d3981160c28d5490f0d1b7ed730c22470ff7f6cc26cfcfaacb9896a07"}, + {file = "kiwisolver-1.4.9-cp313-cp313t-win_arm64.whl", hash = "sha256:0749fd8f4218ad2e851e11cc4dc05c7cbc0cbc4267bdfdb31782e65aace4ee9c"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9928fe1eb816d11ae170885a74d074f57af3a0d65777ca47e9aeb854a1fba386"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d0005b053977e7b43388ddec89fa567f43d4f6d5c2c0affe57de5ebf290dc552"}, + {file = "kiwisolver-1.4.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2635d352d67458b66fd0667c14cb1d4145e9560d503219034a18a87e971ce4f3"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:767c23ad1c58c9e827b649a9ab7809fd5fd9db266a9cf02b0e926ddc2c680d58"}, 
+ {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72d0eb9fba308b8311685c2268cf7d0a0639a6cd027d8128659f72bdd8a024b4"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f68e4f3eeca8fb22cc3d731f9715a13b652795ef657a13df1ad0c7dc0e9731df"}, + {file = "kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d84cd4061ae292d8ac367b2c3fa3aad11cb8625a95d135fe93f286f914f3f5a6"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a60ea74330b91bd22a29638940d115df9dc00af5035a9a2a6ad9399ffb4ceca5"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ce6a3a4e106cf35c2d9c4fa17c05ce0b180db622736845d4315519397a77beaf"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:77937e5e2a38a7b48eef0585114fe7930346993a88060d0bf886086d2aa49ef5"}, + {file = "kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:24c175051354f4a28c5d6a31c93906dc653e2bf234e8a4bbfb964892078898ce"}, + {file = "kiwisolver-1.4.9-cp314-cp314-win_amd64.whl", hash = "sha256:0763515d4df10edf6d06a3c19734e2566368980d21ebec439f33f9eb936c07b7"}, + {file = "kiwisolver-1.4.9-cp314-cp314-win_arm64.whl", hash = "sha256:0e4e2bf29574a6a7b7f6cb5fa69293b9f96c928949ac4a53ba3f525dffb87f9c"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d976bbb382b202f71c67f77b0ac11244021cfa3f7dfd9e562eefcea2df711548"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2489e4e5d7ef9a1c300a5e0196e43d9c739f066ef23270607d45aba368b91f2d"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e2ea9f7ab7fbf18fffb1b5434ce7c69a07582f7acc7717720f1d69f3e806f90c"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b34e51affded8faee0dfdb705416153819d8ea9250bbbf7ea1b249bdeb5f1122"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8aacd3d4b33b772542b2e01beb50187536967b514b00003bdda7589722d2a64"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7cf974dd4e35fa315563ac99d6287a1024e4dc2077b8a7d7cd3d2fb65d283134"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85bd218b5ecfbee8c8a82e121802dcb519a86044c9c3b2e4aef02fa05c6da370"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0856e241c2d3df4efef7c04a1e46b1936b6120c9bcf36dd216e3acd84bc4fb21"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9af39d6551f97d31a4deebeac6f45b156f9755ddc59c07b402c148f5dbb6482a"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:bb4ae2b57fc1d8cbd1cf7b1d9913803681ffa903e7488012be5b76dedf49297f"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aedff62918805fb62d43a4aa2ecd4482c380dc76cd31bd7c8878588a61bd0369"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-win_amd64.whl", hash = "sha256:1fa333e8b2ce4d9660f2cda9c0e1b6bafcfb2457a9d259faa82289e73ec24891"}, + {file = "kiwisolver-1.4.9-cp314-cp314t-win_arm64.whl", hash = "sha256:4a48a2ce79d65d363597ef7b567ce3d14d68783d2b2263d98db3d9477805ba32"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = 
"sha256:4d1d9e582ad4d63062d34077a9a1e9f3c34088a2ec5135b1f7190c07cf366527"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:deed0c7258ceb4c44ad5ec7d9918f9f14fd05b2be86378d86cf50e63d1e7b771"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a590506f303f512dff6b7f75fd2fd18e16943efee932008fe7140e5fa91d80e"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e09c2279a4d01f099f52d5c4b3d9e208e91edcbd1a175c9662a8b16e000fece9"}, + {file = "kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:720e05574713db64c356e86732c0f3c5252818d05f9df320f0ad8380641acea5"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:17680d737d5335b552994a2008fab4c851bcd7de33094a82067ef3a576ff02fa"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:85b5352f94e490c028926ea567fc569c52ec79ce131dadb968d3853e809518c2"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:464415881e4801295659462c49461a24fb107c140de781d55518c4b80cb6790f"}, + {file = "kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1"}, + {file = "kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d"}, +] + +[[package]] +name = "matplotlib" +version = "3.9.4" +description = "Python plotting package" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +markers = "python_version == \"3.9\"" +files = [ + {file = "matplotlib-3.9.4-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:c5fdd7abfb706dfa8d307af64a87f1a862879ec3cd8d0ec8637458f0885b9c50"}, + {file = "matplotlib-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d89bc4e85e40a71d1477780366c27fb7c6494d293e1617788986f74e2a03d7ff"}, + {file = "matplotlib-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ddf9f3c26aae695c5daafbf6b94e4c1a30d6cd617ba594bbbded3b33a1fcfa26"}, + {file = "matplotlib-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:18ebcf248030173b59a868fda1fe42397253f6698995b55e81e1f57431d85e50"}, + {file = "matplotlib-3.9.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:974896ec43c672ec23f3f8c648981e8bc880ee163146e0312a9b8def2fac66f5"}, + {file = "matplotlib-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:4598c394ae9711cec135639374e70871fa36b56afae17bdf032a345be552a88d"}, + {file = "matplotlib-3.9.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:d4dd29641d9fb8bc4492420c5480398dd40a09afd73aebe4eb9d0071a05fbe0c"}, + {file = "matplotlib-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30e5b22e8bcfb95442bf7d48b0d7f3bdf4a450cbf68986ea45fca3d11ae9d099"}, + {file = "matplotlib-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bb0030d1d447fd56dcc23b4c64a26e44e898f0416276cac1ebc25522e0ac249"}, + {file = "matplotlib-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aca90ed222ac3565d2752b83dbb27627480d27662671e4d39da72e97f657a423"}, + {file = "matplotlib-3.9.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:a181b2aa2906c608fcae72f977a4a2d76e385578939891b91c2550c39ecf361e"}, + {file = "matplotlib-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:1f6882828231eca17f501c4dcd98a05abb3f03d157fbc0769c6911fe08b6cfd3"}, + {file = "matplotlib-3.9.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:dfc48d67e6661378a21c2983200a654b72b5c5cdbd5d2cf6e5e1ece860f0cc70"}, + {file = "matplotlib-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:47aef0fab8332d02d68e786eba8113ffd6f862182ea2999379dec9e237b7e483"}, + {file = "matplotlib-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fba1f52c6b7dc764097f52fd9ab627b90db452c9feb653a59945de16752e965f"}, + {file = "matplotlib-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:173ac3748acaac21afcc3fa1633924609ba1b87749006bc25051c52c422a5d00"}, + {file = "matplotlib-3.9.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:320edea0cadc07007765e33f878b13b3738ffa9745c5f707705692df70ffe0e0"}, + {file = "matplotlib-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:a4a4cfc82330b27042a7169533da7991e8789d180dd5b3daeaee57d75cd5a03b"}, + {file = "matplotlib-3.9.4-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:37eeffeeca3c940985b80f5b9a7b95ea35671e0e7405001f249848d2b62351b6"}, + {file = "matplotlib-3.9.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3e7465ac859ee4abcb0d836137cd8414e7bb7ad330d905abced457217d4f0f45"}, + {file = "matplotlib-3.9.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4c12302c34afa0cf061bea23b331e747e5e554b0fa595c96e01c7b75bc3b858"}, + {file = "matplotlib-3.9.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2b8c97917f21b75e72108b97707ba3d48f171541a74aa2a56df7a40626bafc64"}, + {file = "matplotlib-3.9.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:0229803bd7e19271b03cb09f27db76c918c467aa4ce2ae168171bc67c3f508df"}, + {file = "matplotlib-3.9.4-cp313-cp313-win_amd64.whl", hash = "sha256:7c0d8ef442ebf56ff5e206f8083d08252ee738e04f3dc88ea882853a05488799"}, + {file = "matplotlib-3.9.4-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:a04c3b00066a688834356d196136349cb32f5e1003c55ac419e91585168b88fb"}, + {file = "matplotlib-3.9.4-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:04c519587f6c210626741a1e9a68eefc05966ede24205db8982841826af5871a"}, + {file = "matplotlib-3.9.4-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:308afbf1a228b8b525fcd5cec17f246bbbb63b175a3ef6eb7b4d33287ca0cf0c"}, + {file = "matplotlib-3.9.4-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ddb3b02246ddcffd3ce98e88fed5b238bc5faff10dbbaa42090ea13241d15764"}, + {file = "matplotlib-3.9.4-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:8a75287e9cb9eee48cb79ec1d806f75b29c0fde978cb7223a1f4c5848d696041"}, + {file = "matplotlib-3.9.4-cp313-cp313t-win_amd64.whl", hash = "sha256:488deb7af140f0ba86da003e66e10d55ff915e152c78b4b66d231638400b1965"}, + {file = "matplotlib-3.9.4-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:3c3724d89a387ddf78ff88d2a30ca78ac2b4c89cf37f2db4bd453c34799e933c"}, + {file = "matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d5f0a8430ffe23d7e32cfd86445864ccad141797f7d25b7c41759a5b5d17cfd7"}, + {file = "matplotlib-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bb0141a21aef3b64b633dc4d16cbd5fc538b727e4958be82a0e1c92a234160e"}, + {file = 
"matplotlib-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57aa235109e9eed52e2c2949db17da185383fa71083c00c6c143a60e07e0888c"}, + {file = "matplotlib-3.9.4-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:b18c600061477ccfdd1e6fd050c33d8be82431700f3452b297a56d9ed7037abb"}, + {file = "matplotlib-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:ef5f2d1b67d2d2145ff75e10f8c008bfbf71d45137c4b648c87193e7dd053eac"}, + {file = "matplotlib-3.9.4-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:44e0ed786d769d85bc787b0606a53f2d8d2d1d3c8a2608237365e9121c1a338c"}, + {file = "matplotlib-3.9.4-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:09debb9ce941eb23ecdbe7eab972b1c3e0276dcf01688073faff7b0f61d6c6ca"}, + {file = "matplotlib-3.9.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bcc53cf157a657bfd03afab14774d54ba73aa84d42cfe2480c91bd94873952db"}, + {file = "matplotlib-3.9.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ad45da51be7ad02387801fd154ef74d942f49fe3fcd26a64c94842ba7ec0d865"}, + {file = "matplotlib-3.9.4.tar.gz", hash = "sha256:1e00e8be7393cbdc6fedfa8a6fba02cf3e83814b285db1c60b906a023ba41bc3"}, +] + +[package.dependencies] +contourpy = ">=1.0.1" +cycler = ">=0.10" +fonttools = ">=4.22.0" +importlib-resources = {version = ">=3.2.0", markers = "python_version < \"3.10\""} +kiwisolver = ">=1.3.1" +numpy = ">=1.23" +packaging = ">=20.0" +pillow = ">=8" +pyparsing = ">=2.3.1" +python-dateutil = ">=2.7" + +[package.extras] +dev = ["meson-python (>=0.13.1,<0.17.0)", "numpy (>=1.25)", "pybind11 (>=2.6,!=2.13.3)", "setuptools (>=64)", "setuptools_scm (>=7)"] + +[[package]] +name = "matplotlib" +version = "3.10.8" +description = "Python plotting package" +optional = false +python-versions = ">=3.10" +groups = ["main", "dev"] +markers = "python_version >= \"3.10\"" +files = [ + {file = "matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7"}, + {file = "matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee40c27c795bda6a5292e9cff9890189d32f7e3a0bf04e0e3c9430c4a00c37df"}, + {file = "matplotlib-3.10.8-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a48f2b74020919552ea25d222d5cc6af9ca3f4eb43a93e14d068457f545c2a17"}, + {file = "matplotlib-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f254d118d14a7f99d616271d6c3c27922c092dac11112670b157798b89bf4933"}, + {file = "matplotlib-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:f9b587c9c7274c1613a30afabf65a272114cd6cdbe67b3406f818c79d7ab2e2a"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160"}, + {file = "matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4"}, + {file = "matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2"}, + {file = "matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9"}, + {file = "matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a"}, + {file = "matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04"}, + {file = "matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f"}, + {file = "matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf"}, + {file = "matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6"}, + {file = "matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486"}, + {file = "matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce"}, + {file = "matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149"}, + {file = "matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077"}, + {file = "matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39"}, + {file = "matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565"}, + {file = "matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958"}, + {file = "matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = 
"sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f"}, + {file = "matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d"}, + {file = "matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008"}, + {file = "matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11"}, + {file = "matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50"}, + {file = "matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a"}, + {file = "matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1"}, + {file = "matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b"}, + {file = "matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f97aeb209c3d2511443f8797e3e5a569aebb040d4f8bc79aa3ee78a8fb9e3dd8"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fb061f596dad3a0f52b60dc6a5dec4a0c300dec41e058a7efe09256188d170b7"}, + {file = "matplotlib-3.10.8-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:12d90df9183093fcd479f4172ac26b322b1248b15729cb57f42f71f24c7e37a3"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a"}, + {file = "matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2"}, + {file = "matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3"}, +] + +[package.dependencies] +contourpy = ">=1.0.1" +cycler = ">=0.10" +fonttools = ">=4.22.0" +kiwisolver = ">=1.3.1" +numpy = ">=1.23" +packaging = ">=20.0" +pillow = ">=8" +pyparsing = ">=3" 
+python-dateutil = ">=2.7" + +[package.extras] +dev = ["meson-python (>=0.13.1,<0.17.0)", "pybind11 (>=2.13.2,!=2.13.3)", "setuptools (>=64)", "setuptools_scm (>=7)"] + [[package]] name = "mypy-extensions" version = "1.1.0" description = "Type system extensions for programs checked with the mypy type checker." optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505"}, {file = "mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558"}, @@ -402,6 +1312,7 @@ version = "1.10.0" description = "Node.js virtual environment builder" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827"}, {file = "nodeenv-1.10.0.tar.gz", hash = "sha256:996c191ad80897d076bdfba80a41994c2b47c68e224c542b48feba42ba00f8bb"}, @@ -413,6 +1324,7 @@ version = "2.0.2" description = "Fundamental package for array computing in Python" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece"}, {file = "numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04"}, @@ -467,6 +1379,7 @@ version = "25.0" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484"}, {file = "packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f"}, @@ -478,6 +1391,7 @@ version = "2.3.3" description = "Powerful data structures for data analysis, time series, and statistics" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c"}, {file = "pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a"}, @@ -577,6 +1491,7 @@ version = "1.0.3" description = "Utility library for gitignore style pattern matching of file paths." 
optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "pathspec-1.0.3-py3-none-any.whl", hash = "sha256:e80767021c1cc524aa3fb14bedda9c34406591343cc42797b386ce7b9354fb6c"}, {file = "pathspec-1.0.3.tar.gz", hash = "sha256:bac5cf97ae2c2876e2d25ebb15078eb04d76e4b98921ee31c6f85ade8b59444d"}, @@ -588,12 +1503,249 @@ optional = ["typing-extensions (>=4)"] re2 = ["google-re2 (>=1.1)"] tests = ["pytest (>=9)", "typing-extensions (>=4.15)"] +[[package]] +name = "pillow" +version = "11.3.0" +description = "Python Imaging Library (Fork)" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +markers = "python_version == \"3.9\"" +files = [ + {file = "pillow-11.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1b9c17fd4ace828b3003dfd1e30bff24863e0eb59b535e8f80194d9cc7ecf860"}, + {file = "pillow-11.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:65dc69160114cdd0ca0f35cb434633c75e8e7fad4cf855177a05bf38678f73ad"}, + {file = "pillow-11.3.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7107195ddc914f656c7fc8e4a5e1c25f32e9236ea3ea860f257b0436011fddd0"}, + {file = "pillow-11.3.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc3e831b563b3114baac7ec2ee86819eb03caa1a2cef0b481a5675b59c4fe23b"}, + {file = "pillow-11.3.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f182ebd2303acf8c380a54f615ec883322593320a9b00438eb842c1f37ae50"}, + {file = "pillow-11.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4445fa62e15936a028672fd48c4c11a66d641d2c05726c7ec1f8ba6a572036ae"}, + {file = "pillow-11.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:71f511f6b3b91dd543282477be45a033e4845a40278fa8dcdbfdb07109bf18f9"}, + {file = "pillow-11.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:040a5b691b0713e1f6cbe222e0f4f74cd233421e105850ae3b3c0ceda520f42e"}, + {file = "pillow-11.3.0-cp310-cp310-win32.whl", hash = "sha256:89bd777bc6624fe4115e9fac3352c79ed60f3bb18651420635f26e643e3dd1f6"}, + {file = "pillow-11.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:19d2ff547c75b8e3ff46f4d9ef969a06c30ab2d4263a9e287733aa8b2429ce8f"}, + {file = "pillow-11.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:819931d25e57b513242859ce1876c58c59dc31587847bf74cfe06b2e0cb22d2f"}, + {file = "pillow-11.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:1cd110edf822773368b396281a2293aeb91c90a2db00d78ea43e7e861631b722"}, + {file = "pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9c412fddd1b77a75aa904615ebaa6001f169b26fd467b4be93aded278266b288"}, + {file = "pillow-11.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d1aa4de119a0ecac0a34a9c8bde33f34022e2e8f99104e47a3ca392fd60e37d"}, + {file = "pillow-11.3.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:91da1d88226663594e3f6b4b8c3c8d85bd504117d043740a8e0ec449087cc494"}, + {file = "pillow-11.3.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:643f189248837533073c405ec2f0bb250ba54598cf80e8c1e043381a60632f58"}, + {file = "pillow-11.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:106064daa23a745510dabce1d84f29137a37224831d88eb4ce94bb187b1d7e5f"}, + {file = "pillow-11.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd8ff254faf15591e724dc7c4ddb6bf4793efcbe13802a4ae3e863cd300b493e"}, + {file = "pillow-11.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = 
"sha256:932c754c2d51ad2b2271fd01c3d121daaa35e27efae2a616f77bf164bc0b3e94"}, + {file = "pillow-11.3.0-cp311-cp311-win32.whl", hash = "sha256:b4b8f3efc8d530a1544e5962bd6b403d5f7fe8b9e08227c6b255f98ad82b4ba0"}, + {file = "pillow-11.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:1a992e86b0dd7aeb1f053cd506508c0999d710a8f07b4c791c63843fc6a807ac"}, + {file = "pillow-11.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:30807c931ff7c095620fe04448e2c2fc673fcbb1ffe2a7da3fb39613489b1ddd"}, + {file = "pillow-11.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:fdae223722da47b024b867c1ea0be64e0df702c5e0a60e27daad39bf960dd1e4"}, + {file = "pillow-11.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:921bd305b10e82b4d1f5e802b6850677f965d8394203d182f078873851dada69"}, + {file = "pillow-11.3.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb76541cba2f958032d79d143b98a3a6b3ea87f0959bbe256c0b5e416599fd5d"}, + {file = "pillow-11.3.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:67172f2944ebba3d4a7b54f2e95c786a3a50c21b88456329314caaa28cda70f6"}, + {file = "pillow-11.3.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:97f07ed9f56a3b9b5f49d3661dc9607484e85c67e27f3e8be2c7d28ca032fec7"}, + {file = "pillow-11.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:676b2815362456b5b3216b4fd5bd89d362100dc6f4945154ff172e206a22c024"}, + {file = "pillow-11.3.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3e184b2f26ff146363dd07bde8b711833d7b0202e27d13540bfe2e35a323a809"}, + {file = "pillow-11.3.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6be31e3fc9a621e071bc17bb7de63b85cbe0bfae91bb0363c893cbe67247780d"}, + {file = "pillow-11.3.0-cp312-cp312-win32.whl", hash = "sha256:7b161756381f0918e05e7cb8a371fff367e807770f8fe92ecb20d905d0e1c149"}, + {file = "pillow-11.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:a6444696fce635783440b7f7a9fc24b3ad10a9ea3f0ab66c5905be1c19ccf17d"}, + {file = "pillow-11.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:2aceea54f957dd4448264f9bf40875da0415c83eb85f55069d89c0ed436e3542"}, + {file = "pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:1c627742b539bba4309df89171356fcb3cc5a9178355b2727d1b74a6cf155fbd"}, + {file = "pillow-11.3.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:30b7c02f3899d10f13d7a48163c8969e4e653f8b43416d23d13d1bbfdc93b9f8"}, + {file = "pillow-11.3.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:7859a4cc7c9295f5838015d8cc0a9c215b77e43d07a25e460f35cf516df8626f"}, + {file = "pillow-11.3.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec1ee50470b0d050984394423d96325b744d55c701a439d2bd66089bff963d3c"}, + {file = "pillow-11.3.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:7db51d222548ccfd274e4572fdbf3e810a5e66b00608862f947b163e613b67dd"}, + {file = "pillow-11.3.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2d6fcc902a24ac74495df63faad1884282239265c6839a0a6416d33faedfae7e"}, + {file = "pillow-11.3.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f0f5d8f4a08090c6d6d578351a2b91acf519a54986c055af27e7a93feae6d3f1"}, + {file = "pillow-11.3.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c37d8ba9411d6003bba9e518db0db0c58a680ab9fe5179f040b0463644bc9805"}, + {file = "pillow-11.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:13f87d581e71d9189ab21fe0efb5a23e9f28552d5be6979e84001d3b8505abe8"}, + {file = "pillow-11.3.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:023f6d2d11784a465f09fd09a34b150ea4672e85fb3d05931d89f373ab14abb2"}, + {file = "pillow-11.3.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:45dfc51ac5975b938e9809451c51734124e73b04d0f0ac621649821a63852e7b"}, + {file = "pillow-11.3.0-cp313-cp313-win32.whl", hash = "sha256:a4d336baed65d50d37b88ca5b60c0fa9d81e3a87d4a7930d3880d1624d5b31f3"}, + {file = "pillow-11.3.0-cp313-cp313-win_amd64.whl", hash = "sha256:0bce5c4fd0921f99d2e858dc4d4d64193407e1b99478bc5cacecba2311abde51"}, + {file = "pillow-11.3.0-cp313-cp313-win_arm64.whl", hash = "sha256:1904e1264881f682f02b7f8167935cce37bc97db457f8e7849dc3a6a52b99580"}, + {file = "pillow-11.3.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4c834a3921375c48ee6b9624061076bc0a32a60b5532b322cc0ea64e639dd50e"}, + {file = "pillow-11.3.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5e05688ccef30ea69b9317a9ead994b93975104a677a36a8ed8106be9260aa6d"}, + {file = "pillow-11.3.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1019b04af07fc0163e2810167918cb5add8d74674b6267616021ab558dc98ced"}, + {file = "pillow-11.3.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f944255db153ebb2b19c51fe85dd99ef0ce494123f21b9db4877ffdfc5590c7c"}, + {file = "pillow-11.3.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1f85acb69adf2aaee8b7da124efebbdb959a104db34d3a2cb0f3793dbae422a8"}, + {file = "pillow-11.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:05f6ecbeff5005399bb48d198f098a9b4b6bdf27b8487c7f38ca16eeb070cd59"}, + {file = "pillow-11.3.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a7bc6e6fd0395bc052f16b1a8670859964dbd7003bd0af2ff08342eb6e442cfe"}, + {file = "pillow-11.3.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:83e1b0161c9d148125083a35c1c5a89db5b7054834fd4387499e06552035236c"}, + {file = "pillow-11.3.0-cp313-cp313t-win32.whl", hash = "sha256:2a3117c06b8fb646639dce83694f2f9eac405472713fcb1ae887469c0d4f6788"}, + {file = "pillow-11.3.0-cp313-cp313t-win_amd64.whl", hash = "sha256:857844335c95bea93fb39e0fa2726b4d9d758850b34075a7e3ff4f4fa3aa3b31"}, + {file = "pillow-11.3.0-cp313-cp313t-win_arm64.whl", hash = "sha256:8797edc41f3e8536ae4b10897ee2f637235c94f27404cac7297f7b607dd0716e"}, + {file = "pillow-11.3.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d9da3df5f9ea2a89b81bb6087177fb1f4d1c7146d583a3fe5c672c0d94e55e12"}, + {file = "pillow-11.3.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:0b275ff9b04df7b640c59ec5a3cb113eefd3795a8df80bac69646ef699c6981a"}, + {file = "pillow-11.3.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0743841cabd3dba6a83f38a92672cccbd69af56e3e91777b0ee7f4dba4385632"}, + {file = "pillow-11.3.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2465a69cf967b8b49ee1b96d76718cd98c4e925414ead59fdf75cf0fd07df673"}, + {file = "pillow-11.3.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:41742638139424703b4d01665b807c6468e23e699e8e90cffefe291c5832b027"}, + {file = "pillow-11.3.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:93efb0b4de7e340d99057415c749175e24c8864302369e05914682ba642e5d77"}, + {file = "pillow-11.3.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = 
"sha256:7966e38dcd0fa11ca390aed7c6f20454443581d758242023cf36fcb319b1a874"}, + {file = "pillow-11.3.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:98a9afa7b9007c67ed84c57c9e0ad86a6000da96eaa638e4f8abe5b65ff83f0a"}, + {file = "pillow-11.3.0-cp314-cp314-win32.whl", hash = "sha256:02a723e6bf909e7cea0dac1b0e0310be9d7650cd66222a5f1c571455c0a45214"}, + {file = "pillow-11.3.0-cp314-cp314-win_amd64.whl", hash = "sha256:a418486160228f64dd9e9efcd132679b7a02a5f22c982c78b6fc7dab3fefb635"}, + {file = "pillow-11.3.0-cp314-cp314-win_arm64.whl", hash = "sha256:155658efb5e044669c08896c0c44231c5e9abcaadbc5cd3648df2f7c0b96b9a6"}, + {file = "pillow-11.3.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:59a03cdf019efbfeeed910bf79c7c93255c3d54bc45898ac2a4140071b02b4ae"}, + {file = "pillow-11.3.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f8a5827f84d973d8636e9dc5764af4f0cf2318d26744b3d902931701b0d46653"}, + {file = "pillow-11.3.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ee92f2fd10f4adc4b43d07ec5e779932b4eb3dbfbc34790ada5a6669bc095aa6"}, + {file = "pillow-11.3.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c96d333dcf42d01f47b37e0979b6bd73ec91eae18614864622d9b87bbd5bbf36"}, + {file = "pillow-11.3.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c96f993ab8c98460cd0c001447bff6194403e8b1d7e149ade5f00594918128b"}, + {file = "pillow-11.3.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:41342b64afeba938edb034d122b2dda5db2139b9a4af999729ba8818e0056477"}, + {file = "pillow-11.3.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:068d9c39a2d1b358eb9f245ce7ab1b5c3246c7c8c7d9ba58cfa5b43146c06e50"}, + {file = "pillow-11.3.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:a1bc6ba083b145187f648b667e05a2534ecc4b9f2784c2cbe3089e44868f2b9b"}, + {file = "pillow-11.3.0-cp314-cp314t-win32.whl", hash = "sha256:118ca10c0d60b06d006be10a501fd6bbdfef559251ed31b794668ed569c87e12"}, + {file = "pillow-11.3.0-cp314-cp314t-win_amd64.whl", hash = "sha256:8924748b688aa210d79883357d102cd64690e56b923a186f35a82cbc10f997db"}, + {file = "pillow-11.3.0-cp314-cp314t-win_arm64.whl", hash = "sha256:79ea0d14d3ebad43ec77ad5272e6ff9bba5b679ef73375ea760261207fa8e0aa"}, + {file = "pillow-11.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:48d254f8a4c776de343051023eb61ffe818299eeac478da55227d96e241de53f"}, + {file = "pillow-11.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7aee118e30a4cf54fdd873bd3a29de51e29105ab11f9aad8c32123f58c8f8081"}, + {file = "pillow-11.3.0-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:23cff760a9049c502721bdb743a7cb3e03365fafcdfc2ef9784610714166e5a4"}, + {file = "pillow-11.3.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6359a3bc43f57d5b375d1ad54a0074318a0844d11b76abccf478c37c986d3cfc"}, + {file = "pillow-11.3.0-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:092c80c76635f5ecb10f3f83d76716165c96f5229addbd1ec2bdbbda7d496e06"}, + {file = "pillow-11.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cadc9e0ea0a2431124cde7e1697106471fc4c1da01530e679b2391c37d3fbb3a"}, + {file = "pillow-11.3.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6a418691000f2a418c9135a7cf0d797c1bb7d9a485e61fe8e7722845b95ef978"}, + {file = "pillow-11.3.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = 
"sha256:97afb3a00b65cc0804d1c7abddbf090a81eaac02768af58cbdcaaa0a931e0b6d"}, + {file = "pillow-11.3.0-cp39-cp39-win32.whl", hash = "sha256:ea944117a7974ae78059fcc1800e5d3295172bb97035c0c1d9345fca1419da71"}, + {file = "pillow-11.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:e5c5858ad8ec655450a7c7df532e9842cf8df7cc349df7225c60d5d348c8aada"}, + {file = "pillow-11.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:6abdbfd3aea42be05702a8dd98832329c167ee84400a1d1f61ab11437f1717eb"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:3cee80663f29e3843b68199b9d6f4f54bd1d4a6b59bdd91bceefc51238bcb967"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:b5f56c3f344f2ccaf0dd875d3e180f631dc60a51b314295a3e681fe8cf851fbe"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e67d793d180c9df62f1f40aee3accca4829d3794c95098887edc18af4b8b780c"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d000f46e2917c705e9fb93a3606ee4a819d1e3aa7a9b442f6444f07e77cf5e25"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:527b37216b6ac3a12d7838dc3bd75208ec57c1c6d11ef01902266a5a0c14fc27"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:be5463ac478b623b9dd3937afd7fb7ab3d79dd290a28e2b6df292dc75063eb8a"}, + {file = "pillow-11.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:8dc70ca24c110503e16918a658b869019126ecfe03109b754c402daff12b3d9f"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:7c8ec7a017ad1bd562f93dbd8505763e688d388cde6e4a010ae1486916e713e6"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9ab6ae226de48019caa8074894544af5b53a117ccb9d3b3dcb2871464c829438"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:fe27fb049cdcca11f11a7bfda64043c37b30e6b91f10cb5bab275806c32f6ab3"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:465b9e8844e3c3519a983d58b80be3f668e2a7a5db97f2784e7079fbc9f9822c"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5418b53c0d59b3824d05e029669efa023bbef0f3e92e75ec8428f3799487f361"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:504b6f59505f08ae014f724b6207ff6222662aab5cc9542577fb084ed0676ac7"}, + {file = "pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8"}, + {file = "pillow-11.3.0.tar.gz", hash = "sha256:3828ee7586cd0b2091b6209e5ad53e20d0649bbe87164a459d0676e035e8f523"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=8.2)", "sphinx-autobuild", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] +fpx = ["olefile"] +mic = ["olefile"] +test-arrow = ["pyarrow"] +tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "trove-classifiers (>=2024.10.12)"] +typing = ["typing-extensions ; python_version < \"3.10\""] +xmp = ["defusedxml"] + +[[package]] +name = "pillow" +version = "12.1.0" +description = "Python Imaging Library (fork)" +optional = false +python-versions = 
">=3.10" +groups = ["main", "dev"] +markers = "python_version >= \"3.10\"" +files = [ + {file = "pillow-12.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:fb125d860738a09d363a88daa0f59c4533529a90e564785e20fe875b200b6dbd"}, + {file = "pillow-12.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cad302dc10fac357d3467a74a9561c90609768a6f73a1923b0fd851b6486f8b0"}, + {file = "pillow-12.1.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a40905599d8079e09f25027423aed94f2823adaf2868940de991e53a449e14a8"}, + {file = "pillow-12.1.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:92a7fe4225365c5e3a8e598982269c6d6698d3e783b3b1ae979e7819f9cd55c1"}, + {file = "pillow-12.1.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f10c98f49227ed8383d28174ee95155a675c4ed7f85e2e573b04414f7e371bda"}, + {file = "pillow-12.1.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8637e29d13f478bc4f153d8daa9ffb16455f0a6cb287da1b432fdad2bfbd66c7"}, + {file = "pillow-12.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:21e686a21078b0f9cb8c8a961d99e6a4ddb88e0fc5ea6e130172ddddc2e5221a"}, + {file = "pillow-12.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2415373395a831f53933c23ce051021e79c8cd7979822d8cc478547a3f4da8ef"}, + {file = "pillow-12.1.0-cp310-cp310-win32.whl", hash = "sha256:e75d3dba8fc1ddfec0cd752108f93b83b4f8d6ab40e524a95d35f016b9683b09"}, + {file = "pillow-12.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:64efdf00c09e31efd754448a383ea241f55a994fd079866b92d2bbff598aad91"}, + {file = "pillow-12.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:f188028b5af6b8fb2e9a76ac0f841a575bd1bd396e46ef0840d9b88a48fdbcea"}, + {file = "pillow-12.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:a83e0850cb8f5ac975291ebfc4170ba481f41a28065277f7f735c202cd8e0af3"}, + {file = "pillow-12.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b6e53e82ec2db0717eabb276aa56cf4e500c9a7cec2c2e189b55c24f65a3e8c0"}, + {file = "pillow-12.1.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:40a8e3b9e8773876d6e30daed22f016509e3987bab61b3b7fe309d7019a87451"}, + {file = "pillow-12.1.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:800429ac32c9b72909c671aaf17ecd13110f823ddb7db4dfef412a5587c2c24e"}, + {file = "pillow-12.1.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b022eaaf709541b391ee069f0022ee5b36c709df71986e3f7be312e46f42c84"}, + {file = "pillow-12.1.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f345e7bc9d7f368887c712aa5054558bad44d2a301ddf9248599f4161abc7c0"}, + {file = "pillow-12.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d70347c8a5b7ccd803ec0c85c8709f036e6348f1e6a5bf048ecd9c64d3550b8b"}, + {file = "pillow-12.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1fcc52d86ce7a34fd17cb04e87cfdb164648a3662a6f20565910a99653d66c18"}, + {file = "pillow-12.1.0-cp311-cp311-win32.whl", hash = "sha256:3ffaa2f0659e2f740473bcf03c702c39a8d4b2b7ffc629052028764324842c64"}, + {file = "pillow-12.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:806f3987ffe10e867bab0ddad45df1148a2b98221798457fa097ad85d6e8bc75"}, + {file = "pillow-12.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:9f5fefaca968e700ad1a4a9de98bf0869a94e397fe3524c4c9450c1445252304"}, + {file = "pillow-12.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:a332ac4ccb84b6dde65dbace8431f3af08874bf9770719d32a635c4ef411b18b"}, + {file = "pillow-12.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:907bfa8a9cb790748a9aa4513e37c88c59660da3bcfffbd24a7d9e6abf224551"}, + {file = "pillow-12.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:efdc140e7b63b8f739d09a99033aa430accce485ff78e6d311973a67b6bf3208"}, + {file = "pillow-12.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bef9768cab184e7ae6e559c032e95ba8d07b3023c289f79a2bd36e8bf85605a5"}, + {file = "pillow-12.1.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:742aea052cf5ab5034a53c3846165bc3ce88d7c38e954120db0ab867ca242661"}, + {file = "pillow-12.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6dfc2af5b082b635af6e08e0d1f9f1c4e04d17d4e2ca0ef96131e85eda6eb17"}, + {file = "pillow-12.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:609e89d9f90b581c8d16358c9087df76024cf058fa693dd3e1e1620823f39670"}, + {file = "pillow-12.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:43b4899cfd091a9693a1278c4982f3e50f7fb7cff5153b05174b4afc9593b616"}, + {file = "pillow-12.1.0-cp312-cp312-win32.whl", hash = "sha256:aa0c9cc0b82b14766a99fbe6084409972266e82f459821cd26997a488a7261a7"}, + {file = "pillow-12.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:d70534cea9e7966169ad29a903b99fc507e932069a881d0965a1a84bb57f6c6d"}, + {file = "pillow-12.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:65b80c1ee7e14a87d6a068dd3b0aea268ffcabfe0498d38661b00c5b4b22e74c"}, + {file = "pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:7b5dd7cbae20285cdb597b10eb5a2c13aa9de6cde9bb64a3c1317427b1db1ae1"}, + {file = "pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:29a4cef9cb672363926f0470afc516dbf7305a14d8c54f7abbb5c199cd8f8179"}, + {file = "pillow-12.1.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:681088909d7e8fa9e31b9799aaa59ba5234c58e5e4f1951b4c4d1082a2e980e0"}, + {file = "pillow-12.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:983976c2ab753166dc66d36af6e8ec15bb511e4a25856e2227e5f7e00a160587"}, + {file = "pillow-12.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:db44d5c160a90df2d24a24760bbd37607d53da0b34fb546c4c232af7192298ac"}, + {file = "pillow-12.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b7a9d1db5dad90e2991645874f708e87d9a3c370c243c2d7684d28f7e133e6b"}, + {file = "pillow-12.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6258f3260986990ba2fa8a874f8b6e808cf5abb51a94015ca3dc3c68aa4f30ea"}, + {file = "pillow-12.1.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e115c15e3bc727b1ca3e641a909f77f8ca72a64fff150f666fcc85e57701c26c"}, + {file = "pillow-12.1.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6741e6f3074a35e47c77b23a4e4f2d90db3ed905cb1c5e6e0d49bff2045632bc"}, + {file = "pillow-12.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:935b9d1aed48fcfb3f838caac506f38e29621b44ccc4f8a64d575cb1b2a88644"}, + {file = "pillow-12.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5fee4c04aad8932da9f8f710af2c1a15a83582cfb884152a9caa79d4efcdbf9c"}, + {file = "pillow-12.1.0-cp313-cp313-win32.whl", hash = "sha256:a786bf667724d84aa29b5db1c61b7bfdde380202aaca12c3461afd6b71743171"}, + {file = "pillow-12.1.0-cp313-cp313-win_amd64.whl", hash = 
"sha256:461f9dfdafa394c59cd6d818bdfdbab4028b83b02caadaff0ffd433faf4c9a7a"}, + {file = "pillow-12.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:9212d6b86917a2300669511ed094a9406888362e085f2431a7da985a6b124f45"}, + {file = "pillow-12.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:00162e9ca6d22b7c3ee8e61faa3c3253cd19b6a37f126cad04f2f88b306f557d"}, + {file = "pillow-12.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7d6daa89a00b58c37cb1747ec9fb7ac3bc5ffd5949f5888657dfddde6d1312e0"}, + {file = "pillow-12.1.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2479c7f02f9d505682dc47df8c0ea1fc5e264c4d1629a5d63fe3e2334b89554"}, + {file = "pillow-12.1.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f188d580bd870cda1e15183790d1cc2fa78f666e76077d103edf048eed9c356e"}, + {file = "pillow-12.1.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0fde7ec5538ab5095cc02df38ee99b0443ff0e1c847a045554cf5f9af1f4aa82"}, + {file = "pillow-12.1.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0ed07dca4a8464bada6139ab38f5382f83e5f111698caf3191cb8dbf27d908b4"}, + {file = "pillow-12.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f45bd71d1fa5e5749587613037b172e0b3b23159d1c00ef2fc920da6f470e6f0"}, + {file = "pillow-12.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:277518bf4fe74aa91489e1b20577473b19ee70fb97c374aa50830b279f25841b"}, + {file = "pillow-12.1.0-cp313-cp313t-win32.whl", hash = "sha256:7315f9137087c4e0ee73a761b163fc9aa3b19f5f606a7fc08d83fd3e4379af65"}, + {file = "pillow-12.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:0ddedfaa8b5f0b4ffbc2fa87b556dc59f6bb4ecb14a53b33f9189713ae8053c0"}, + {file = "pillow-12.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:80941e6d573197a0c28f394753de529bb436b1ca990ed6e765cf42426abc39f8"}, + {file = "pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:5cb7bc1966d031aec37ddb9dcf15c2da5b2e9f7cc3ca7c54473a20a927e1eb91"}, + {file = "pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:97e9993d5ed946aba26baf9c1e8cf18adbab584b99f452ee72f7ee8acb882796"}, + {file = "pillow-12.1.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:414b9a78e14ffeb98128863314e62c3f24b8a86081066625700b7985b3f529bd"}, + {file = "pillow-12.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e6bdb408f7c9dd2a5ff2b14a3b0bb6d4deb29fb9961e6eb3ae2031ae9a5cec13"}, + {file = "pillow-12.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3413c2ae377550f5487991d444428f1a8ae92784aac79caa8b1e3b89b175f77e"}, + {file = "pillow-12.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e5dcbe95016e88437ecf33544ba5db21ef1b8dd6e1b434a2cb2a3d605299e643"}, + {file = "pillow-12.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d0a7735df32ccbcc98b98a1ac785cc4b19b580be1bdf0aeb5c03223220ea09d5"}, + {file = "pillow-12.1.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c27407a2d1b96774cbc4a7594129cc027339fd800cd081e44497722ea1179de"}, + {file = "pillow-12.1.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15c794d74303828eaa957ff8070846d0efe8c630901a1c753fdc63850e19ecd9"}, + {file = "pillow-12.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c990547452ee2800d8506c4150280757f88532f3de2a58e3022e9b179107862a"}, + {file = "pillow-12.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = 
"sha256:b63e13dd27da389ed9475b3d28510f0f954bca0041e8e551b2a4eb1eab56a39a"}, + {file = "pillow-12.1.0-cp314-cp314-win32.whl", hash = "sha256:1a949604f73eb07a8adab38c4fe50791f9919344398bdc8ac6b307f755fc7030"}, + {file = "pillow-12.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:4f9f6a650743f0ddee5593ac9e954ba1bdbc5e150bc066586d4f26127853ab94"}, + {file = "pillow-12.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:808b99604f7873c800c4840f55ff389936ef1948e4e87645eaf3fccbc8477ac4"}, + {file = "pillow-12.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bc11908616c8a283cf7d664f77411a5ed2a02009b0097ff8abbba5e79128ccf2"}, + {file = "pillow-12.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:896866d2d436563fa2a43a9d72f417874f16b5545955c54a64941e87c1376c61"}, + {file = "pillow-12.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8e178e3e99d3c0ea8fc64b88447f7cac8ccf058af422a6cedc690d0eadd98c51"}, + {file = "pillow-12.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:079af2fb0c599c2ec144ba2c02766d1b55498e373b3ac64687e43849fbbef5bc"}, + {file = "pillow-12.1.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdec5e43377761c5dbca620efb69a77f6855c5a379e32ac5b158f54c84212b14"}, + {file = "pillow-12.1.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:565c986f4b45c020f5421a4cea13ef294dde9509a8577f29b2fc5edc7587fff8"}, + {file = "pillow-12.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:43aca0a55ce1eefc0aefa6253661cb54571857b1a7b2964bd8a1e3ef4b729924"}, + {file = "pillow-12.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0deedf2ea233722476b3a81e8cdfbad786f7adbed5d848469fa59fe52396e4ef"}, + {file = "pillow-12.1.0-cp314-cp314t-win32.whl", hash = "sha256:b17fbdbe01c196e7e159aacb889e091f28e61020a8abeac07b68079b6e626988"}, + {file = "pillow-12.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27b9baecb428899db6c0de572d6d305cfaf38ca1596b5c0542a5182e3e74e8c6"}, + {file = "pillow-12.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f61333d817698bdcdd0f9d7793e365ac3d2a21c1f1eb02b32ad6aefb8d8ea831"}, + {file = "pillow-12.1.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ca94b6aac0d7af2a10ba08c0f888b3d5114439b6b3ef39968378723622fed377"}, + {file = "pillow-12.1.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:351889afef0f485b84078ea40fe33727a0492b9af3904661b0abbafee0355b72"}, + {file = "pillow-12.1.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb0984b30e973f7e2884362b7d23d0a348c7143ee559f38ef3eaab640144204c"}, + {file = "pillow-12.1.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:84cabc7095dd535ca934d57e9ce2a72ffd216e435a84acb06b2277b1de2689bd"}, + {file = "pillow-12.1.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53d8b764726d3af1a138dd353116f774e3862ec7e3794e0c8781e30db0f35dfc"}, + {file = "pillow-12.1.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5da841d81b1a05ef940a8567da92decaa15bc4d7dedb540a8c219ad83d91808a"}, + {file = "pillow-12.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:75af0b4c229ac519b155028fa1be632d812a519abba9b46b20e50c6caa184f19"}, + {file = "pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=8.2)", "sphinx-autobuild", "sphinx-copybutton", 
"sphinx-inline-tabs", "sphinxext-opengraph"] +fpx = ["olefile"] +mic = ["olefile"] +test-arrow = ["arro3-compute", "arro3-core", "nanoarrow", "pyarrow"] +tests = ["check-manifest", "coverage (>=7.4.2)", "defusedxml", "markdown2", "olefile", "packaging", "pyroma (>=5)", "pytest", "pytest-cov", "pytest-timeout", "pytest-xdist", "trove-classifiers (>=2024.10.12)"] +xmp = ["defusedxml"] + [[package]] name = "platformdirs" version = "4.4.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "platformdirs-4.4.0-py3-none-any.whl", hash = "sha256:abd01743f24e5287cd7a5db3752faf1a2d65353f38ec26d98e25a6db65958c85"}, {file = "platformdirs-4.4.0.tar.gz", hash = "sha256:ca753cf4d81dc309bc67b0ea38fd15dc97bc30ce419a7f58d13eb3bf14c4febf"}, @@ -610,6 +1762,7 @@ version = "1.6.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, @@ -625,6 +1778,7 @@ version = "3.8.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "pre_commit-3.8.0-py2.py3-none-any.whl", hash = "sha256:9a90a53bf82fdd8778d58085faf8d83df56e40dfe18f45b19446e26bf1b3a63f"}, {file = "pre_commit-3.8.0.tar.gz", hash = "sha256:8bb6494d4a20423842e198980c9ecf9f96607a07ea29549e180eef9ae80fe7af"}, @@ -643,6 +1797,7 @@ version = "6.33.4" description = "" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "protobuf-6.33.4-cp310-abi3-win32.whl", hash = "sha256:918966612c8232fc6c24c78e1cd89784307f5814ad7506c308ee3cf86662850d"}, {file = "protobuf-6.33.4-cp310-abi3-win_amd64.whl", hash = "sha256:8f11ffae31ec67fc2554c2ef891dcb561dae9a2a3ed941f9e134c2db06657dbc"}, @@ -662,6 +1817,7 @@ version = "0.10.9.7" description = "Enables Python programs to dynamically access arbitrary Java objects" optional = false python-versions = "*" +groups = ["main", "dev"] files = [ {file = "py4j-0.10.9.7-py2.py3-none-any.whl", hash = "sha256:85defdfd2b2376eb3abf5ca6474b51ab7e0de341c75a02f46dc9b5976f5a5c1b"}, {file = "py4j-0.10.9.7.tar.gz", hash = "sha256:0b6e5315bb3ada5cf62ac651d107bb2ebc02def3dee9d9548e3baac644ea8dbb"}, @@ -673,6 +1829,7 @@ version = "21.0.0" description = "Python library for Apache Arrow" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "pyarrow-21.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:e563271e2c5ff4d4a4cbeb2c83d5cf0d4938b891518e676025f7268c6fe5fe26"}, {file = "pyarrow-21.0.0-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:fee33b0ca46f4c85443d6c450357101e47d53e6c3f008d658c27a2d020d44c79"}, @@ -728,6 +1885,7 @@ version = "2.19.2" description = "Pygments is a syntax highlighting package written in Python." 
optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, @@ -736,12 +1894,28 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pyparsing" +version = "3.3.1" +description = "pyparsing - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +files = [ + {file = "pyparsing-3.3.1-py3-none-any.whl", hash = "sha256:023b5e7e5520ad96642e2c6db4cb683d3970bd640cdf7115049a6e9c3682df82"}, + {file = "pyparsing-3.3.1.tar.gz", hash = "sha256:47fad0f17ac1e2cad3de3b458570fbc9b03560aa029ed5e16ee5554da9a2251c"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + [[package]] name = "pyspark" version = "3.5.0" description = "Apache Spark Python API" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "pyspark-3.5.0.tar.gz", hash = "sha256:d41a9b76bd2aca370a6100d075c029e22ba44c5940927877e9435a3a9c566558"}, ] @@ -768,6 +1942,7 @@ version = "8.4.2" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79"}, {file = "pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01"}, @@ -791,6 +1966,7 @@ version = "4.1.0" description = "Pytest plugin for measuring coverage." optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "pytest-cov-4.1.0.tar.gz", hash = "sha256:3904b13dfbfec47f003b8e77fd5b589cd11904a21ddf1ab38a64f204d6a10ef6"}, {file = "pytest_cov-4.1.0-py3-none-any.whl", hash = "sha256:6ba70b9e97e69fcc3fb45bfeab2d0a138fb65c4d0d6a41ef33983ad114be8c3a"}, @@ -809,6 +1985,7 @@ version = "14.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "pytest-rerunfailures-14.0.tar.gz", hash = "sha256:4a400bcbcd3c7a4ad151ab8afac123d90eca3abe27f98725dc4d9702887d2e92"}, {file = "pytest_rerunfailures-14.0-py3-none-any.whl", hash = "sha256:4197bdd2eaeffdbf50b5ea6e7236f47ff0e44d1def8dae08e409f536d84e7b32"}, @@ -824,6 +2001,7 @@ version = "2.9.0.post0" description = "Extensions to the standard Python datetime module" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, @@ -838,6 +2016,7 @@ version = "2025.2" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" +groups = ["main", "dev"] files = [ {file = "pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00"}, {file = "pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3"}, @@ -849,6 +2028,7 @@ version = "6.0.3" description = "YAML parser and emitter for Python" optional = false 
python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "PyYAML-6.0.3-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:c2514fceb77bc5e7a2f7adfaa1feb2fb311607c9cb518dbc378688ec73d8292f"}, {file = "PyYAML-6.0.3-cp38-cp38-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c57bb8c96f6d1808c030b1687b9b5fb476abaa47f0db9c0101f5e9f394e97f4"}, @@ -931,19 +2111,20 @@ version = "80.9.0" description = "Easily download, build, install, upgrade, and uninstall Python packages" optional = false python-versions = ">=3.9" +groups = ["main"] files = [ {file = "setuptools-80.9.0-py3-none-any.whl", hash = "sha256:062d34222ad13e0cc312a4c02d73f059e86a4acbfbdea8f8f76b28c99f306922"}, {file = "setuptools-80.9.0.tar.gz", hash = "sha256:f36b47402ecde768dbfafc46e8e4207b4360c654f1f3bb84475f0a28628fb19c"}, ] [package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.8.0)"] -core = ["importlib_metadata (>=6)", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\"", "ruff (>=0.8.0) ; sys_platform != \"cygwin\""] +core = ["importlib_metadata (>=6) ; python_version < \"3.10\"", "jaraco.functools (>=4)", "jaraco.text (>=3.7)", "more_itertools", "more_itertools (>=8.8)", "packaging (>=24.2)", "platformdirs (>=4.2.2)", "tomli (>=2.0.1) ; python_version < \"3.11\"", "wheel (>=0.43.0)"] cover = ["pytest-cov"] doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] enabler = ["pytest-enabler (>=2.2)"] -test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] -type = ["importlib_metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.14.*)", "pytest-mypy"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21) ; python_version >= \"3.9\" and sys_platform != \"cygwin\"", "jaraco.envs (>=2.2)", "jaraco.path (>=3.7.2)", "jaraco.test (>=5.5)", "packaging (>=24.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf ; sys_platform != \"cygwin\"", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib_metadata (>=7.0.2) ; python_version < \"3.10\"", "jaraco.develop (>=7.21) ; sys_platform != \"cygwin\"", "mypy (==1.14.*)", "pytest-mypy"] [[package]] name = "six" @@ -951,6 +2132,7 @@ version = "1.17.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, {file = 
"six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, @@ -962,6 +2144,8 @@ version = "2.4.0" description = "A lil' TOML parser" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] +markers = "python_version < \"3.11\"" files = [ {file = "tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867"}, {file = "tomli-2.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9"}, @@ -1018,6 +2202,7 @@ version = "4.15.0" description = "Backported and Experimental Type Hints for Python 3.9+" optional = false python-versions = ">=3.9" +groups = ["main", "dev"] files = [ {file = "typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548"}, {file = "typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466"}, @@ -1029,6 +2214,7 @@ version = "2025.3" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" +groups = ["main", "dev"] files = [ {file = "tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1"}, {file = "tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"}, @@ -1040,6 +2226,7 @@ version = "20.36.1" description = "Virtual Python Environment builder" optional = false python-versions = ">=3.8" +groups = ["main", "dev"] files = [ {file = "virtualenv-20.36.1-py3-none-any.whl", hash = "sha256:575a8d6b124ef88f6f51d56d656132389f961062a9177016a50e4f507bbcc19f"}, {file = "virtualenv-20.36.1.tar.gz", hash = "sha256:8befb5c81842c641f8ee658481e42641c68b5eab3521d8e092d18320902466ba"}, @@ -1056,9 +2243,36 @@ typing-extensions = {version = ">=4.13.2", markers = "python_version < \"3.11\"" [package.extras] docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] -test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8) ; platform_python_implementation == \"PyPy\" or platform_python_implementation == \"GraalVM\" or platform_python_implementation == \"CPython\" and sys_platform == \"win32\" and python_version >= \"3.13\"", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10) ; platform_python_implementation == \"CPython\""] + +[[package]] +name = "zipp" +version = "3.23.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.9" +groups = ["main", "dev"] +markers = "python_version == \"3.9\"" +files = [ + {file = "zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e"}, + {file = "zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166"}, +] + 
+[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] +type = ["pytest-mypy"] + +[extras] +all = ["duckdb", "pyspark"] +dev = ["black", "coverage", "duckdb", "matplotlib", "pre-commit", "pyspark", "pytest", "pytest-cov", "pytest-rerunfailures"] +duckdb = ["duckdb"] +spark = ["pyspark"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = ">=3.9,<4" -content-hash = "18db29f1829ab8baebdd68c486c74b5e7e4304a6d344a26773685b07b85fe7c3" +content-hash = "7097bf5f307c1956c17cb9b53c57ed2adfa5a18836e3ab01a9beec01589fb20b" diff --git a/pydeequ/__init__.py b/pydeequ/__init__.py index 6d2202f..f31b47f 100644 --- a/pydeequ/__init__.py +++ b/pydeequ/__init__.py @@ -14,6 +14,15 @@ """ PyDeequ - Python API for Deequ data quality library. +For PyDeequ 2.0 with DuckDB (no Spark required): + import duckdb + import pydeequ + from pydeequ.v2.analyzers import Size, Completeness + + con = duckdb.connect() + con.execute("CREATE TABLE test AS SELECT 1 as id") + engine = pydeequ.connect(con, table="test") + For PyDeequ 2.0 (Spark Connect), use: from pydeequ.v2 import VerificationSuite, Check, CheckLevel from pydeequ.v2.predicates import eq, gte @@ -22,8 +31,52 @@ from pydeequ import deequ_maven_coord from pydeequ.checks import Check, CheckLevel """ +from typing import Any, Optional + __version__ = "2.0.0b1" + +def connect( + connection: Any, + table: Optional[str] = None, + dataframe: Optional[Any] = None, +): + """ + Create an engine from a connection object with auto-detection. + + This function inspects the connection type and creates the appropriate + engine backend. It supports: + - DuckDB connections (duckdb.DuckDBPyConnection) - runs locally + - Spark sessions (pyspark.sql.SparkSession) - uses Spark Connect + + Args: + connection: A database connection or Spark session + table: Table name for SQL-based backends (required for DuckDB) + dataframe: DataFrame for Spark backend (alternative to table) + + Returns: + An engine instance appropriate for the connection type + + Raises: + ValueError: If connection type is not supported + + Example: + # DuckDB (local, no Spark required) + import duckdb + import pydeequ + + con = duckdb.connect() + con.execute("CREATE TABLE reviews AS SELECT * FROM 'reviews.csv'") + engine = pydeequ.connect(con, table="reviews") + + # Spark Connect + from pyspark.sql import SparkSession + spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate() + engine = pydeequ.connect(spark, dataframe=df) + """ + from pydeequ.engines import connect as engines_connect + return engines_connect(connection, table=table, dataframe=dataframe) + # Legacy imports are deferred to avoid requiring SPARK_VERSION for V2 users. # V2 users should import from pydeequ.v2 directly. 
diff --git a/pydeequ/configs.py b/pydeequ/configs.py index e56c97d..ba5e378 100644 --- a/pydeequ/configs.py +++ b/pydeequ/configs.py @@ -41,4 +41,4 @@ def _get_deequ_maven_config(): SPARK_VERSION = _get_spark_version() DEEQU_MAVEN_COORD = _get_deequ_maven_config() -IS_DEEQU_V1 = re.search("com\.amazon\.deequ\:deequ\:1.*", DEEQU_MAVEN_COORD) is not None +IS_DEEQU_V1 = re.search(r"com\.amazon\.deequ\:deequ\:1.*", DEEQU_MAVEN_COORD) is not None diff --git a/pydeequ/engines/__init__.py b/pydeequ/engines/__init__.py new file mode 100644 index 0000000..63a8327 --- /dev/null +++ b/pydeequ/engines/__init__.py @@ -0,0 +1,410 @@ +# -*- coding: utf-8 -*- +""" +Engine abstraction for PyDeequ. + +This module provides the engine abstraction layer that enables PyDeequ +to work with different execution backends (Spark, DuckDB, etc.). + +Key design principles (inspired by DuckDQ): +1. State computation is engine-dependent (SQL queries, Spark jobs) +2. State merging is engine-independent (pure Python) +3. This separation enables incremental validation and easy backend additions + +Example usage: + import duckdb + import pydeequ + + # Auto-detection from connection type + con = duckdb.connect() + con.execute("CREATE TABLE test AS SELECT 1 as id, 2 as value") + engine = pydeequ.connect(con, table="test") + + # Direct import + from pydeequ.engines.duckdb import DuckDBEngine + engine = DuckDBEngine(con, table="test") +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from enum import Enum +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, +) + +import pandas as pd + +if TYPE_CHECKING: + from pydeequ.v2.analyzers import _ConnectAnalyzer + from pydeequ.v2.checks import Check + + +class ConstraintStatus(Enum): + """Status of a constraint evaluation.""" + SUCCESS = "Success" + FAILURE = "Failure" + + # Aliases for backwards compatibility + Success = "Success" + Failure = "Failure" + + +class CheckStatus(Enum): + """Status of a check evaluation.""" + SUCCESS = "Success" + WARNING = "Warning" + ERROR = "Error" + + # Aliases for backwards compatibility + Success = "Success" + Warning = "Warning" + Error = "Error" + + +@dataclass +class MetricResult: + """Result of computing a metric.""" + name: str + instance: str + entity: str + value: Optional[float] + success: bool = True + message: Optional[str] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for DataFrame creation.""" + return { + "name": self.name, + "instance": self.instance, + "entity": self.entity, + "value": self.value, + } + + +@dataclass +class ConstraintResult: + """Result of evaluating a constraint.""" + check_description: str + check_level: str + check_status: Union[str, "CheckStatus"] + constraint: str + constraint_status: Union[str, "ConstraintStatus"] + constraint_message: Optional[str] = None + + def __post_init__(self): + """Convert string status values to enum values.""" + # Handle check_status + if isinstance(self.check_status, str): + for status in CheckStatus: + if status.value == self.check_status: + self.check_status = status + break + # Handle constraint_status + if isinstance(self.constraint_status, str): + for status in ConstraintStatus: + if status.value == self.constraint_status: + self.constraint_status = status + break + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for DataFrame creation.""" + check_status_val = self.check_status.value if 
isinstance(self.check_status, CheckStatus) else self.check_status + constraint_status_val = self.constraint_status.value if isinstance(self.constraint_status, ConstraintStatus) else self.constraint_status + return { + "check": self.check_description, + "check_level": self.check_level, + "check_status": check_status_val, + "constraint": self.constraint, + "constraint_status": constraint_status_val, + "constraint_message": self.constraint_message or "", + } + + +@dataclass +class ColumnProfile: + """Profile of a single column.""" + column: str + completeness: float + approx_distinct_values: int + data_type: str + is_data_type_inferred: bool = True + type_counts: Optional[str] = None + histogram: Optional[str] = None + mean: Optional[float] = None + minimum: Optional[float] = None + maximum: Optional[float] = None + sum: Optional[float] = None + std_dev: Optional[float] = None + approx_percentiles: Optional[str] = None + kll_buckets: Optional[str] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for DataFrame creation.""" + return { + "column": self.column, + "completeness": self.completeness, + "approx_distinct_values": self.approx_distinct_values, + "data_type": self.data_type, + "is_data_type_inferred": self.is_data_type_inferred, + "type_counts": self.type_counts, + "histogram": self.histogram, + "mean": self.mean, + "minimum": self.minimum, + "maximum": self.maximum, + "sum": self.sum, + "std_dev": self.std_dev, + "approx_percentiles": self.approx_percentiles, + "kll_buckets": self.kll_buckets, + } + + +@dataclass +class ConstraintSuggestion: + """A suggested constraint.""" + column_name: str + constraint_name: str + current_value: Optional[str] + description: str + suggesting_rule: str + code_for_constraint: str + + def to_dict(self) -> Dict[str, Any]: + """Convert to dictionary for DataFrame creation.""" + return { + "column_name": self.column_name, + "constraint_name": self.constraint_name, + "current_value": self.current_value, + "description": self.description, + "suggesting_rule": self.suggesting_rule, + "code_for_constraint": self.code_for_constraint, + } + + +class BaseEngine(ABC): + """ + Abstract base class for execution engines. + + Engines are responsible for: + 1. Computing metrics from data (engine-dependent) + 2. Evaluating constraints against computed metrics + 3. Profiling columns + 4. Suggesting constraints + + Subclasses must implement the core computation methods for their + specific backend (DuckDB, Spark, etc.). + """ + + @abstractmethod + def compute_metrics( + self, analyzers: Sequence["_ConnectAnalyzer"] + ) -> List[MetricResult]: + """ + Compute metrics for the given analyzers. + + Args: + analyzers: Sequence of analyzers to compute metrics for + + Returns: + List of MetricResult objects + """ + pass + + @abstractmethod + def run_checks(self, checks: Sequence["Check"]) -> List[ConstraintResult]: + """ + Run verification checks and return constraint results. + + Args: + checks: Sequence of Check objects to evaluate + + Returns: + List of ConstraintResult objects + """ + pass + + @abstractmethod + def profile_columns( + self, + columns: Optional[Sequence[str]] = None, + low_cardinality_threshold: int = 0, + ) -> List[ColumnProfile]: + """ + Profile columns in the data source. + + Args: + columns: Optional list of columns to profile. If None, profile all. 
+ low_cardinality_threshold: Threshold for histogram computation + + Returns: + List of ColumnProfile objects + """ + pass + + @abstractmethod + def suggest_constraints( + self, + columns: Optional[Sequence[str]] = None, + rules: Optional[Sequence[str]] = None, + ) -> List[ConstraintSuggestion]: + """ + Suggest constraints based on data characteristics. + + Args: + columns: Optional list of columns to analyze + rules: Optional list of rule sets to apply + + Returns: + List of ConstraintSuggestion objects + """ + pass + + @abstractmethod + def get_schema(self) -> Dict[str, str]: + """ + Get the schema of the data source. + + Returns: + Dictionary mapping column names to data types + """ + pass + + def metrics_to_dataframe(self, metrics: List[MetricResult]) -> pd.DataFrame: + """Convert metrics to a pandas DataFrame.""" + if not metrics: + return pd.DataFrame(columns=["name", "instance", "entity", "value"]) + return pd.DataFrame([m.to_dict() for m in metrics]) + + def constraints_to_dataframe( + self, results: List[ConstraintResult] + ) -> pd.DataFrame: + """Convert constraint results to a pandas DataFrame.""" + if not results: + return pd.DataFrame( + columns=[ + "check", "check_level", "check_status", + "constraint", "constraint_status", "constraint_message" + ] + ) + return pd.DataFrame([r.to_dict() for r in results]) + + def profiles_to_dataframe(self, profiles: List[ColumnProfile]) -> pd.DataFrame: + """Convert column profiles to a pandas DataFrame.""" + if not profiles: + return pd.DataFrame(columns=["column", "completeness", "data_type"]) + return pd.DataFrame([p.to_dict() for p in profiles]) + + def suggestions_to_dataframe( + self, suggestions: List[ConstraintSuggestion] + ) -> pd.DataFrame: + """Convert suggestions to a pandas DataFrame.""" + if not suggestions: + return pd.DataFrame( + columns=[ + "column_name", "constraint_name", "current_value", + "description", "suggesting_rule", "code_for_constraint" + ] + ) + return pd.DataFrame([s.to_dict() for s in suggestions]) + + +def connect( + connection: Any, + table: Optional[str] = None, + dataframe: Optional[Any] = None, +) -> BaseEngine: + """ + Create an engine from a connection object with auto-detection. + + This function inspects the connection type and creates the appropriate + engine backend. It supports: + - DuckDB connections (duckdb.DuckDBPyConnection) + - Spark sessions (pyspark.sql.SparkSession) - wraps existing v2 API + + Args: + connection: A database connection or Spark session + table: Table name for SQL-based backends + dataframe: DataFrame for Spark backend (alternative to table) + + Returns: + An engine instance appropriate for the connection type + + Raises: + ValueError: If connection type is not supported + + Example: + import duckdb + import pydeequ + + con = duckdb.connect() + con.execute("CREATE TABLE reviews AS SELECT * FROM 'reviews.csv'") + engine = pydeequ.connect(con, table="reviews") + """ + connection_type = type(connection).__name__ + connection_module = type(connection).__module__ + + # Try DuckDB + if "duckdb" in connection_module.lower(): + try: + import duckdb + if isinstance(connection, duckdb.DuckDBPyConnection): + if table is None: + raise ValueError("table parameter is required for DuckDB connections") + from pydeequ.engines.duckdb import DuckDBEngine + return DuckDBEngine(connection, table) + except ImportError: + raise ImportError( + "DuckDB backend requires the 'duckdb' package. 
" + "Install it with: pip install pydeequ[duckdb]" + ) from None + + # Try Spark + if "pyspark" in connection_module.lower() or "spark" in connection_type.lower(): + try: + from pyspark.sql import SparkSession + if isinstance(connection, SparkSession): + from pydeequ.engines.spark import SparkEngine + return SparkEngine(connection, table=table, dataframe=dataframe) + except ImportError: + raise ImportError( + "Spark backend requires the 'pyspark' package. " + "Install it with: pip install pydeequ[spark]" + ) from None + + raise ValueError( + f"Unsupported connection type: {connection_type}. " + "Supported types:\n" + " - duckdb.DuckDBPyConnection (pip install pydeequ[duckdb])\n" + " - pyspark.sql.SparkSession (pip install pydeequ[spark])" + ) + + +# Export public API +__all__ = [ + # Base classes + "BaseEngine", + # Result types + "MetricResult", + "ConstraintResult", + "ConstraintStatus", + "CheckStatus", + "ColumnProfile", + "ConstraintSuggestion", + # Factory function + "connect", +] + + +# Lazy import for DuckDB config to avoid import errors when duckdb is not installed +def __getattr__(name: str) -> Any: + if name == "DuckDBEngineConfig": + from pydeequ.engines.duckdb_config import DuckDBEngineConfig + return DuckDBEngineConfig + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/pydeequ/engines/constraints/__init__.py b/pydeequ/engines/constraints/__init__.py new file mode 100644 index 0000000..a93b3bf --- /dev/null +++ b/pydeequ/engines/constraints/__init__.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- +""" +Constraint evaluator abstractions for data quality checks. + +This module provides a constraint evaluator pattern that: +1. Encapsulates constraint evaluation logic in self-contained classes +2. Separates value computation from assertion evaluation +3. Provides consistent WHERE clause handling +4. Enables easy addition of new constraint types + +Architecture: + Protocols (Contracts) + └── ConstraintEvaluatorProtocol - Defines evaluator contract + + Base Classes (Hierarchy) + ├── BaseEvaluator - Base with WHERE clause and assertion handling + ├── RatioCheckEvaluator - For match/total ratio constraints + └── AnalyzerBasedEvaluator - Delegates to analyzer operators + + Evaluator Implementations + ├── Analyzer-based (SizeEvaluator, CompletenessEvaluator, etc.) + ├── Ratio-check (IsPositiveEvaluator, IsContainedInEvaluator, etc.) 
+ ├── Comparison (ColumnComparisonEvaluator) + └── Multi-column (MultiColumnCompletenessEvaluator) + + Factory + └── ConstraintEvaluatorFactory - Creates evaluators from protobufs + +Example usage: + from pydeequ.engines.constraints import ConstraintEvaluatorFactory + + # Create evaluator from constraint protobuf + evaluator = ConstraintEvaluatorFactory.create(constraint_proto) + + if evaluator: + # Compute the metric value + value = evaluator.compute_value(table, execute_fn) + + # Evaluate the assertion + passed = evaluator.evaluate(value) + + # Get human-readable description + description = evaluator.to_string() +""" + +from pydeequ.engines.constraints.base import ( + AnalyzerBasedEvaluator, + BaseEvaluator, + RatioCheckEvaluator, +) +from pydeequ.engines.constraints.batch_evaluator import ( + ConstraintBatchEvaluator, + SCAN_BASED_EVALUATORS, +) +from pydeequ.engines.constraints.evaluators import ( + ApproxCountDistinctEvaluator, + ApproxQuantileEvaluator, + ColumnComparisonEvaluator, + CompletenessEvaluator, + ComplianceEvaluator, + ContainsCreditCardEvaluator, + ContainsEmailEvaluator, + ContainsSSNEvaluator, + ContainsURLEvaluator, + CorrelationEvaluator, + DistinctnessEvaluator, + EntropyEvaluator, + IsContainedInEvaluator, + IsNonNegativeEvaluator, + IsPositiveEvaluator, + MaximumEvaluator, + MaxLengthEvaluator, + MeanEvaluator, + MinimumEvaluator, + MinLengthEvaluator, + MultiColumnCompletenessEvaluator, + MutualInformationEvaluator, + PatternMatchEvaluator, + SizeEvaluator, + StandardDeviationEvaluator, + SumEvaluator, + UniquenessEvaluator, + UniqueValueRatioEvaluator, +) +from pydeequ.engines.constraints.factory import ConstraintEvaluatorFactory +from pydeequ.engines.constraints.protocols import ConstraintEvaluatorProtocol + +__all__ = [ + # Protocols + "ConstraintEvaluatorProtocol", + # Base classes + "BaseEvaluator", + "RatioCheckEvaluator", + "AnalyzerBasedEvaluator", + # Batch evaluator + "ConstraintBatchEvaluator", + "SCAN_BASED_EVALUATORS", + # Analyzer-based evaluators + "SizeEvaluator", + "CompletenessEvaluator", + "MeanEvaluator", + "MinimumEvaluator", + "MaximumEvaluator", + "SumEvaluator", + "StandardDeviationEvaluator", + "UniquenessEvaluator", + "DistinctnessEvaluator", + "UniqueValueRatioEvaluator", + "CorrelationEvaluator", + "EntropyEvaluator", + "MutualInformationEvaluator", + "PatternMatchEvaluator", + "MinLengthEvaluator", + "MaxLengthEvaluator", + "ApproxCountDistinctEvaluator", + "ApproxQuantileEvaluator", + "ComplianceEvaluator", + # Ratio-check evaluators + "IsPositiveEvaluator", + "IsNonNegativeEvaluator", + "IsContainedInEvaluator", + "ContainsEmailEvaluator", + "ContainsURLEvaluator", + "ContainsCreditCardEvaluator", + "ContainsSSNEvaluator", + # Comparison evaluators + "ColumnComparisonEvaluator", + # Multi-column evaluators + "MultiColumnCompletenessEvaluator", + # Factory + "ConstraintEvaluatorFactory", +] diff --git a/pydeequ/engines/constraints/base.py b/pydeequ/engines/constraints/base.py new file mode 100644 index 0000000..abcd32e --- /dev/null +++ b/pydeequ/engines/constraints/base.py @@ -0,0 +1,271 @@ +# -*- coding: utf-8 -*- +""" +Base classes for constraint evaluators. + +This module provides the abstract base classes that combine mixins +to create the foundation for concrete evaluator implementations. 
+""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, Callable, List, Optional + +from pydeequ.engines.operators.mixins import SafeExtractMixin, WhereClauseMixin + +if TYPE_CHECKING: + import pandas as pd + from pydeequ.v2.predicates import Predicate + + +class BaseEvaluator(WhereClauseMixin, SafeExtractMixin, ABC): + """ + Base class for all constraint evaluators. + + Provides shared functionality for WHERE clause handling, + assertion parsing, and predicate evaluation. + + Attributes: + column: Optional column name for single-column constraints + columns: List of column names for multi-column constraints + where: Optional SQL WHERE clause for filtering + assertion: Parsed predicate for evaluation + """ + + def __init__(self, constraint_proto): + """ + Initialize evaluator from constraint protobuf. + + Args: + constraint_proto: Protobuf message containing constraint definition + """ + self.column = constraint_proto.column if constraint_proto.column else None + self.columns = list(constraint_proto.columns) if constraint_proto.columns else [] + self.where = constraint_proto.where if constraint_proto.where else None + self.assertion = self._parse_assertion(constraint_proto) + self._constraint_type = constraint_proto.type + + @property + def constraint_type(self) -> str: + """Return the constraint type identifier.""" + return self._constraint_type + + def _parse_assertion(self, constraint_proto) -> Optional["Predicate"]: + """ + Parse assertion predicate from constraint protobuf. + + Args: + constraint_proto: Protobuf message containing constraint definition + + Returns: + Parsed predicate or None if no assertion specified + """ + from pydeequ.v2.proto import deequ_connect_pb2 as proto + + if not constraint_proto.HasField("assertion"): + return None + + pred_msg = constraint_proto.assertion + + if pred_msg.operator == proto.PredicateMessage.Operator.BETWEEN: + from pydeequ.v2.predicates import Between + return Between(pred_msg.lower_bound, pred_msg.upper_bound) + else: + from pydeequ.v2.predicates import Comparison + return Comparison(pred_msg.operator, pred_msg.value) + + def _evaluate_predicate(self, value: float, assertion: "Predicate") -> bool: + """ + Evaluate a predicate against a value. + + Args: + value: The value to check + assertion: The predicate to evaluate + + Returns: + True if the value satisfies the predicate + """ + from pydeequ.v2.predicates import Between, Comparison + from pydeequ.v2.proto import deequ_connect_pb2 as proto + + if isinstance(assertion, Comparison): + op = assertion.operator + target = assertion.value + + if op == proto.PredicateMessage.Operator.EQ: + return abs(value - target) < 1e-9 + elif op == proto.PredicateMessage.Operator.NE: + return abs(value - target) >= 1e-9 + elif op == proto.PredicateMessage.Operator.GT: + return value > target + elif op == proto.PredicateMessage.Operator.GE: + return value >= target + elif op == proto.PredicateMessage.Operator.LT: + return value < target + elif op == proto.PredicateMessage.Operator.LE: + return value <= target + + elif isinstance(assertion, Between): + return assertion.lower <= value <= assertion.upper + + return False + + def evaluate(self, value: Optional[float]) -> bool: + """ + Evaluate whether the computed value satisfies the constraint. 
+ + Args: + value: The computed metric value + + Returns: + True if the constraint is satisfied, False otherwise + """ + if value is None: + return False + + if self.assertion: + return self._evaluate_predicate(value, self.assertion) + + # Default: value must equal 1.0 (for completeness-like constraints) + return value == 1.0 + + @abstractmethod + def compute_value( + self, table: str, execute_fn: Callable[[str], "pd.DataFrame"] + ) -> Optional[float]: + """ + Compute the metric value for this constraint. + + Args: + table: Name of the table to query + execute_fn: Function to execute SQL and return DataFrame + + Returns: + Computed metric value, or None if computation fails + """ + raise NotImplementedError + + @abstractmethod + def to_string(self) -> str: + """ + Return a human-readable string representation of the constraint. + + Returns: + Description of what the constraint checks + """ + raise NotImplementedError + + +class RatioCheckEvaluator(BaseEvaluator): + """ + Base class for constraints that compute matches/total ratio. + + These constraints check what fraction of rows satisfy some condition, + such as isPositive, isNonNegative, isContainedIn, etc. + """ + + @abstractmethod + def get_condition(self) -> str: + """ + Get the SQL condition that defines a 'match'. + + Returns: + SQL boolean expression for the match condition + """ + raise NotImplementedError + + def compute_value( + self, table: str, execute_fn: Callable[[str], "pd.DataFrame"] + ) -> Optional[float]: + """ + Compute the fraction of rows matching the condition. + + Args: + table: Name of the table to query + execute_fn: Function to execute SQL and return DataFrame + + Returns: + Ratio of matching rows to total rows + """ + condition = self.get_condition() + + if self.where: + query = f""" + SELECT + SUM(CASE WHEN {self.where} THEN 1 ELSE 0 END) as total, + SUM(CASE WHEN ({self.where}) AND ({condition}) THEN 1 ELSE 0 END) as matches + FROM {table} + """ + else: + query = f""" + SELECT + COUNT(*) as total, + SUM(CASE WHEN {condition} THEN 1 ELSE 0 END) as matches + FROM {table} + """ + + result = execute_fn(query) + total = self.safe_float(result, "total") or 0 + matches = self.safe_float(result, "matches") or 0 + + if total == 0: + return 1.0 + return matches / total + + +class AnalyzerBasedEvaluator(BaseEvaluator): + """ + Base class for constraints that delegate to an analyzer operator. + + These constraints compute their value by creating and running + the corresponding analyzer operator. + """ + + @abstractmethod + def get_operator(self): + """ + Get the operator instance to compute the metric. + + Returns: + Operator instance (ScanOperator or GroupingOperator) + """ + raise NotImplementedError + + def compute_value( + self, table: str, execute_fn: Callable[[str], "pd.DataFrame"] + ) -> Optional[float]: + """ + Compute the metric value using the analyzer operator. 
+ + Args: + table: Name of the table to query + execute_fn: Function to execute SQL and return DataFrame + + Returns: + Computed metric value + """ + operator = self.get_operator() + + # Check if it's a scan or grouping operator + if hasattr(operator, "get_aggregations"): + # Scan operator + aggregations = operator.get_aggregations() + query = f"SELECT {', '.join(aggregations)} FROM {table}" + result = execute_fn(query) + metric_result = operator.extract_result(result) + return metric_result.value + elif hasattr(operator, "build_query"): + # Grouping operator + query = operator.build_query(table) + result = execute_fn(query) + metric_result = operator.extract_result(result) + return metric_result.value + + return None + + +__all__ = [ + "BaseEvaluator", + "RatioCheckEvaluator", + "AnalyzerBasedEvaluator", +] diff --git a/pydeequ/engines/constraints/batch_evaluator.py b/pydeequ/engines/constraints/batch_evaluator.py new file mode 100644 index 0000000..14eea45 --- /dev/null +++ b/pydeequ/engines/constraints/batch_evaluator.py @@ -0,0 +1,298 @@ +# -*- coding: utf-8 -*- +""" +Constraint batch evaluation for DuckDB performance optimization. + +This module provides functionality to batch constraint evaluations that can +share SQL queries, reducing the number of queries executed. + +Key optimizations: +1. Scan-based constraints (Size, Mean, Completeness, etc.) can be batched + when they use scan operators with compatible aggregations. +2. Ratio-check constraints (isPositive, isNonNegative, isContainedIn, etc.) + can be batched into a single query when they operate on the same table. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Type + +from pydeequ.engines.constraints.base import ( + AnalyzerBasedEvaluator, + BaseEvaluator, + RatioCheckEvaluator, +) +from pydeequ.engines.constraints.evaluators import ( + CompletenessEvaluator, + MaximumEvaluator, + MeanEvaluator, + MinimumEvaluator, + SizeEvaluator, + StandardDeviationEvaluator, + SumEvaluator, +) + +if TYPE_CHECKING: + import pandas as pd + + +# Evaluators that use scan operators (can be batched via aggregations) +SCAN_BASED_EVALUATORS: Tuple[Type[AnalyzerBasedEvaluator], ...] = ( + SizeEvaluator, + CompletenessEvaluator, + MeanEvaluator, + MinimumEvaluator, + MaximumEvaluator, + SumEvaluator, + StandardDeviationEvaluator, +) + + +class ConstraintBatchEvaluator: + """ + Batches constraint evaluations to minimize SQL queries. + + This class groups constraints by their evaluation pattern and executes + them in batches where possible: + - Scan-based evaluators: batched into single aggregation queries + - Ratio-check evaluators: batched into single ratio queries + - Other evaluators: executed individually + """ + + def __init__(self, evaluators: List[BaseEvaluator]): + """ + Initialize the batch evaluator. 
+ + Args: + evaluators: List of constraint evaluators + """ + self.evaluators = evaluators + self._scan_based: List[AnalyzerBasedEvaluator] = [] + self._ratio_checks: List[RatioCheckEvaluator] = [] + self._other: List[BaseEvaluator] = [] + self._analyze() + + def _analyze(self) -> None: + """Categorize evaluators by type for batching.""" + for evaluator in self.evaluators: + if isinstance(evaluator, SCAN_BASED_EVALUATORS): + self._scan_based.append(evaluator) + elif isinstance(evaluator, RatioCheckEvaluator): + self._ratio_checks.append(evaluator) + else: + self._other.append(evaluator) + + def get_batch_info(self) -> Dict[str, int]: + """Return batch grouping information for debugging.""" + return { + "scan_based": len(self._scan_based), + "ratio_checks": len(self._ratio_checks), + "other": len(self._other), + } + + def execute( + self, + table: str, + execute_fn: Callable[[str], "pd.DataFrame"], + ) -> Dict[BaseEvaluator, Optional[float]]: + """ + Execute all evaluators with batching optimization. + + Args: + table: Name of the table to query + execute_fn: Function to execute SQL and return DataFrame + + Returns: + Dictionary mapping evaluators to their computed values + """ + results: Dict[BaseEvaluator, Optional[float]] = {} + + # Batch scan-based evaluators + if self._scan_based: + scan_results = self._execute_scan_batch(table, execute_fn) + results.update(scan_results) + + # Batch ratio-check evaluators + if self._ratio_checks: + ratio_results = self._execute_ratio_batch(table, execute_fn) + results.update(ratio_results) + + # Execute other evaluators individually + for evaluator in self._other: + try: + value = evaluator.compute_value(table, execute_fn) + results[evaluator] = value + except Exception: + results[evaluator] = None + + return results + + def _execute_scan_batch( + self, + table: str, + execute_fn: Callable[[str], "pd.DataFrame"], + ) -> Dict[BaseEvaluator, Optional[float]]: + """ + Execute scan-based evaluators in a single batched query. 
+ + Args: + table: Name of the table to query + execute_fn: Function to execute SQL and return DataFrame + + Returns: + Dictionary mapping evaluators to their computed values + """ + results: Dict[BaseEvaluator, Optional[float]] = {} + + # Collect all aggregations from scan operators + operators = [] + operator_to_evaluator = {} + + for evaluator in self._scan_based: + operator = evaluator.get_operator() + if operator and hasattr(operator, "get_aggregations"): + operators.append(operator) + operator_to_evaluator[id(operator)] = evaluator + + if not operators: + # Fall back to individual execution + for evaluator in self._scan_based: + try: + value = evaluator.compute_value(table, execute_fn) + results[evaluator] = value + except Exception: + results[evaluator] = None + return results + + # Build batched query + aggregations = [] + for operator in operators: + aggregations.extend(operator.get_aggregations()) + + query = f"SELECT {', '.join(aggregations)} FROM {table}" + + try: + df = execute_fn(query) + + # Extract results for each operator + for operator in operators: + evaluator = operator_to_evaluator[id(operator)] + try: + metric_result = operator.extract_result(df) + results[evaluator] = metric_result.value + except Exception: + results[evaluator] = None + + except Exception: + # Fall back to individual execution on batch failure + for evaluator in self._scan_based: + try: + value = evaluator.compute_value(table, execute_fn) + results[evaluator] = value + except Exception: + results[evaluator] = None + + return results + + def _execute_ratio_batch( + self, + table: str, + execute_fn: Callable[[str], "pd.DataFrame"], + ) -> Dict[BaseEvaluator, Optional[float]]: + """ + Execute ratio-check evaluators in a single batched query. + + Args: + table: Name of the table to query + execute_fn: Function to execute SQL and return DataFrame + + Returns: + Dictionary mapping evaluators to their computed values + """ + results: Dict[BaseEvaluator, Optional[float]] = {} + + # Group evaluators by WHERE clause for proper batching + where_groups: Dict[Optional[str], List[RatioCheckEvaluator]] = {} + for evaluator in self._ratio_checks: + where = getattr(evaluator, "where", None) + if where not in where_groups: + where_groups[where] = [] + where_groups[where].append(evaluator) + + # Execute each where-group as a batch + for where, group_evaluators in where_groups.items(): + try: + group_results = self._execute_ratio_group( + table, execute_fn, group_evaluators, where + ) + results.update(group_results) + except Exception: + # Fall back to individual execution + for evaluator in group_evaluators: + try: + value = evaluator.compute_value(table, execute_fn) + results[evaluator] = value + except Exception: + results[evaluator] = None + + return results + + def _execute_ratio_group( + self, + table: str, + execute_fn: Callable[[str], "pd.DataFrame"], + evaluators: List[RatioCheckEvaluator], + where: Optional[str], + ) -> Dict[BaseEvaluator, Optional[float]]: + """ + Execute a group of ratio-check evaluators with the same WHERE clause. 
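+
+        Illustrative shape of the batched query (the column conditions are
+        hypothetical and come from each evaluator's get_condition()):
+
+            SELECT
+                COUNT(*) as total,
+                SUM(CASE WHEN price > 0 THEN 1 ELSE 0 END) as matches_0,
+                SUM(CASE WHEN qty >= 0 THEN 1 ELSE 0 END) as matches_1
+            FROM my_table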
+
+        Args:
+            table: Name of the table to query
+            execute_fn: Function to execute SQL and return DataFrame
+            evaluators: List of ratio-check evaluators
+            where: WHERE clause (None if no filter)
+
+        Returns:
+            Dictionary mapping evaluators to their computed values
+        """
+        results: Dict[BaseEvaluator, Optional[float]] = {}
+
+        # Build batched ratio query; compute each condition exactly once
+        conditions = [evaluator.get_condition() for evaluator in evaluators]
+
+        if where:
+            total_expr = f"SUM(CASE WHEN {where} THEN 1 ELSE 0 END) as total"
+            cases = [
+                f"SUM(CASE WHEN ({where}) AND ({cond}) THEN 1 ELSE 0 END) as matches_{i}"
+                for i, cond in enumerate(conditions)
+            ]
+        else:
+            total_expr = "COUNT(*) as total"
+            cases = [
+                f"SUM(CASE WHEN {cond} THEN 1 ELSE 0 END) as matches_{i}"
+                for i, cond in enumerate(conditions)
+            ]
+
+        query = f"""
+            SELECT
+                {total_expr},
+                {', '.join(cases)}
+            FROM {table}
+        """
+
+        df = execute_fn(query)
+
+        def _to_float(value) -> float:
+            # SUM over zero matching rows yields NULL/NaN (and NaN != NaN),
+            # which must be treated as 0 rather than propagated
+            if value is None or value != value:
+                return 0.0
+            return float(value)
+
+        total = _to_float(df["total"].iloc[0])
+
+        for i, evaluator in enumerate(evaluators):
+            matches = _to_float(df[f"matches_{i}"].iloc[0])
+            # An empty (or fully filtered-out) table vacuously passes
+            results[evaluator] = 1.0 if total == 0 else matches / total
+
+        return results
+
+
+__all__ = [
+    "ConstraintBatchEvaluator",
+    "SCAN_BASED_EVALUATORS",
+]
diff --git a/pydeequ/engines/constraints/evaluators.py b/pydeequ/engines/constraints/evaluators.py
new file mode 100644
index 0000000..2ac5650
--- /dev/null
+++ b/pydeequ/engines/constraints/evaluators.py
@@ -0,0 +1,494 @@
+# -*- coding: utf-8 -*-
+"""
+Constraint evaluator implementations.
+
+This module contains all concrete evaluator classes that implement
+specific constraint types.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable, Optional
+
+from pydeequ.engines.constraints.base import (
+    AnalyzerBasedEvaluator,
+    BaseEvaluator,
+    RatioCheckEvaluator,
+)
+from pydeequ.engines.operators import (
+    ApproxCountDistinctOperator,
+    ApproxQuantileOperator,
+    CompletenessOperator,
+    ComplianceOperator,
+    CorrelationOperator,
+    DistinctnessOperator,
+    EntropyOperator,
+    MaximumOperator,
+    MaxLengthOperator,
+    MeanOperator,
+    MinimumOperator,
+    MinLengthOperator,
+    MutualInformationOperator,
+    PatternMatchOperator,
+    SizeOperator,
+    StandardDeviationOperator,
+    SumOperator,
+    UniqueValueRatioOperator,
+    UniquenessOperator,
+)
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+
+# =============================================================================
+# Analyzer-based evaluators
+# =============================================================================
+
+
+class SizeEvaluator(AnalyzerBasedEvaluator):
+    """Evaluator for hasSize constraint."""
+
+    def get_operator(self):
+        return SizeOperator(where=self.where)
+
+    def to_string(self) -> str:
+        if self.assertion:
+            return "hasSize(assertion)"
+        return "hasSize()"
+
+
+class CompletenessEvaluator(AnalyzerBasedEvaluator):
+    """Evaluator for isComplete and hasCompleteness constraints."""
+
+    def get_operator(self):
+        return CompletenessOperator(self.column, where=self.where)
+
+    def to_string(self) -> str:
+        if self.assertion:
+            return f"hasCompleteness({self.column}, assertion)"
+        return f"isComplete({self.column})"
+
+
+class MeanEvaluator(AnalyzerBasedEvaluator):
+    """Evaluator for hasMean constraint."""
+
+    def get_operator(self):
+        return MeanOperator(self.column, where=self.where)
+
+    def to_string(self) -> str:
+        return f"hasMean({self.column}, assertion)"
+
+
+class 
MinimumEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasMin constraint.""" + + def get_operator(self): + return MinimumOperator(self.column, where=self.where) + + def to_string(self) -> str: + return f"hasMin({self.column}, assertion)" + + +class MaximumEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasMax constraint.""" + + def get_operator(self): + return MaximumOperator(self.column, where=self.where) + + def to_string(self) -> str: + return f"hasMax({self.column}, assertion)" + + +class SumEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasSum constraint.""" + + def get_operator(self): + return SumOperator(self.column, where=self.where) + + def to_string(self) -> str: + return f"hasSum({self.column}, assertion)" + + +class StandardDeviationEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasStandardDeviation constraint.""" + + def get_operator(self): + return StandardDeviationOperator(self.column, where=self.where) + + def to_string(self) -> str: + return f"hasStandardDeviation({self.column}, assertion)" + + +class UniquenessEvaluator(AnalyzerBasedEvaluator): + """Evaluator for isUnique and hasUniqueness constraints.""" + + def get_operator(self): + cols = self.columns if self.columns else [self.column] + return UniquenessOperator(cols, where=self.where) + + def to_string(self) -> str: + cols = self.columns if self.columns else [self.column] + col_str = ", ".join(cols) + if self.assertion: + return f"hasUniqueness({col_str}, assertion)" + return f"isUnique({col_str})" + + +class DistinctnessEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasDistinctness constraint.""" + + def get_operator(self): + cols = self.columns if self.columns else [self.column] + return DistinctnessOperator(cols, where=self.where) + + def to_string(self) -> str: + cols = self.columns if self.columns else [self.column] + col_str = ", ".join(cols) + return f"hasDistinctness({col_str}, assertion)" + + +class UniqueValueRatioEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasUniqueValueRatio constraint.""" + + def get_operator(self): + cols = self.columns if self.columns else [self.column] + return UniqueValueRatioOperator(cols, where=self.where) + + def to_string(self) -> str: + cols = self.columns if self.columns else [self.column] + col_str = ", ".join(cols) + return f"hasUniqueValueRatio({col_str}, assertion)" + + +class CorrelationEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasCorrelation constraint.""" + + def get_operator(self): + if len(self.columns) >= 2: + return CorrelationOperator(self.columns[0], self.columns[1], where=self.where) + return None + + def compute_value( + self, table: str, execute_fn: Callable[[str], "pd.DataFrame"] + ) -> Optional[float]: + if len(self.columns) < 2: + return None + return super().compute_value(table, execute_fn) + + def to_string(self) -> str: + if len(self.columns) >= 2: + return f"hasCorrelation({self.columns[0]}, {self.columns[1]}, assertion)" + return "hasCorrelation()" + + +class EntropyEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasEntropy constraint.""" + + def get_operator(self): + return EntropyOperator(self.column, where=self.where) + + def to_string(self) -> str: + return f"hasEntropy({self.column}, assertion)" + + +class MutualInformationEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasMutualInformation constraint.""" + + def get_operator(self): + if len(self.columns) >= 2: + return MutualInformationOperator(self.columns, where=self.where) + return None + + def compute_value( + self, table: str, 
execute_fn: Callable[[str], "pd.DataFrame"] + ) -> Optional[float]: + if len(self.columns) < 2: + return None + return super().compute_value(table, execute_fn) + + def to_string(self) -> str: + col_str = ", ".join(self.columns) + return f"hasMutualInformation({col_str}, assertion)" + + +class PatternMatchEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasPattern constraint.""" + + def __init__(self, constraint_proto): + super().__init__(constraint_proto) + self.pattern = constraint_proto.pattern if constraint_proto.pattern else "" + + def get_operator(self): + return PatternMatchOperator(self.column, self.pattern, where=self.where) + + def to_string(self) -> str: + return f"hasPattern({self.column}, '{self.pattern}')" + + +class MinLengthEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasMinLength constraint.""" + + def get_operator(self): + return MinLengthOperator(self.column, where=self.where) + + def to_string(self) -> str: + return f"hasMinLength({self.column}, assertion)" + + +class MaxLengthEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasMaxLength constraint.""" + + def get_operator(self): + return MaxLengthOperator(self.column, where=self.where) + + def to_string(self) -> str: + return f"hasMaxLength({self.column}, assertion)" + + +class ApproxCountDistinctEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasApproxCountDistinct constraint.""" + + def get_operator(self): + return ApproxCountDistinctOperator(self.column, where=self.where) + + def to_string(self) -> str: + return f"hasApproxCountDistinct({self.column}, assertion)" + + +class ApproxQuantileEvaluator(AnalyzerBasedEvaluator): + """Evaluator for hasApproxQuantile constraint.""" + + def __init__(self, constraint_proto): + super().__init__(constraint_proto) + self.quantile = constraint_proto.quantile if constraint_proto.quantile else 0.5 + + def get_operator(self): + return ApproxQuantileOperator(self.column, self.quantile, where=self.where) + + def to_string(self) -> str: + return f"hasApproxQuantile({self.column}, {self.quantile}, assertion)" + + +class ComplianceEvaluator(AnalyzerBasedEvaluator): + """Evaluator for satisfies constraint.""" + + def __init__(self, constraint_proto): + super().__init__(constraint_proto) + self.predicate = constraint_proto.column_condition if constraint_proto.column_condition else "" + self.name = constraint_proto.constraint_name if constraint_proto.constraint_name else "satisfies" + + def get_operator(self): + return ComplianceOperator(self.name, self.predicate, where=self.where) + + def to_string(self) -> str: + return f"satisfies({self.name}, '{self.predicate}')" + + +# ============================================================================= +# Ratio-check evaluators +# ============================================================================= + + +class IsPositiveEvaluator(RatioCheckEvaluator): + """Evaluator for isPositive constraint.""" + + def get_condition(self) -> str: + return f"{self.column} > 0" + + def to_string(self) -> str: + return f"isPositive({self.column})" + + +class IsNonNegativeEvaluator(RatioCheckEvaluator): + """Evaluator for isNonNegative constraint.""" + + def get_condition(self) -> str: + return f"{self.column} >= 0" + + def to_string(self) -> str: + return f"isNonNegative({self.column})" + + +class IsContainedInEvaluator(RatioCheckEvaluator): + """Evaluator for isContainedIn constraint.""" + + def __init__(self, constraint_proto): + super().__init__(constraint_proto) + self.allowed_values = list(constraint_proto.allowed_values) if 
constraint_proto.allowed_values else [] + + def get_condition(self) -> str: + # Escape single quotes in values + escaped_values = [v.replace("'", "''") for v in self.allowed_values] + values_str = ", ".join([f"'{v}'" for v in escaped_values]) + return f"{self.column} IN ({values_str})" + + def to_string(self) -> str: + values_str = ", ".join([f"'{v}'" for v in self.allowed_values]) + return f"isContainedIn({self.column}, [{values_str}])" + + +class ContainsEmailEvaluator(RatioCheckEvaluator): + """Evaluator for containsEmail constraint.""" + + EMAIL_PATTERN = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$" + + def get_condition(self) -> str: + return f"REGEXP_MATCHES({self.column}, '{self.EMAIL_PATTERN}')" + + def to_string(self) -> str: + return f"containsEmail({self.column})" + + +class ContainsURLEvaluator(RatioCheckEvaluator): + """Evaluator for containsURL constraint.""" + + URL_PATTERN = r"^https?://[^\s]+$" + + def get_condition(self) -> str: + return f"REGEXP_MATCHES({self.column}, '{self.URL_PATTERN}')" + + def to_string(self) -> str: + return f"containsURL({self.column})" + + +class ContainsCreditCardEvaluator(RatioCheckEvaluator): + """Evaluator for containsCreditCardNumber constraint.""" + + CC_PATTERN = r"^\d{13,19}$" + + def get_condition(self) -> str: + return f"REGEXP_MATCHES({self.column}, '{self.CC_PATTERN}')" + + def to_string(self) -> str: + return f"containsCreditCardNumber({self.column})" + + +class ContainsSSNEvaluator(RatioCheckEvaluator): + """Evaluator for containsSocialSecurityNumber constraint.""" + + SSN_PATTERN = r"^\d{3}-\d{2}-\d{4}$" + + def get_condition(self) -> str: + return f"REGEXP_MATCHES({self.column}, '{self.SSN_PATTERN}')" + + def to_string(self) -> str: + return f"containsSocialSecurityNumber({self.column})" + + +# ============================================================================= +# Comparison evaluators +# ============================================================================= + + +class ColumnComparisonEvaluator(RatioCheckEvaluator): + """Evaluator for column comparison constraints.""" + + def __init__(self, constraint_proto): + super().__init__(constraint_proto) + self._comparison_type = constraint_proto.type + + def get_condition(self) -> str: + if len(self.columns) < 2: + return "1=0" # Always false if not enough columns + + col_a, col_b = self.columns[0], self.columns[1] + + if self._comparison_type == "isLessThan": + return f"{col_a} < {col_b}" + elif self._comparison_type == "isLessThanOrEqualTo": + return f"{col_a} <= {col_b}" + elif self._comparison_type == "isGreaterThan": + return f"{col_a} > {col_b}" + elif self._comparison_type == "isGreaterThanOrEqualTo": + return f"{col_a} >= {col_b}" + + return "1=0" + + def to_string(self) -> str: + if len(self.columns) >= 2: + return f"{self._comparison_type}({self.columns[0]}, {self.columns[1]})" + return f"{self._comparison_type}()" + + +# ============================================================================= +# Multi-column evaluators +# ============================================================================= + + +class MultiColumnCompletenessEvaluator(BaseEvaluator): + """Evaluator for areComplete and haveCompleteness constraints.""" + + def compute_value( + self, table: str, execute_fn: Callable[[str], "pd.DataFrame"] + ) -> Optional[float]: + if not self.columns: + return 1.0 + + # All columns must be non-null for a row to be "complete" + null_conditions = " OR ".join([f"{col} IS NULL" for col in self.columns]) + + if self.where: + query = f""" + SELECT + 
SUM(CASE WHEN {self.where} THEN 1 ELSE 0 END) as total, + SUM(CASE WHEN ({self.where}) AND ({null_conditions}) THEN 1 ELSE 0 END) as any_null + FROM {table} + """ + else: + query = f""" + SELECT + COUNT(*) as total, + SUM(CASE WHEN {null_conditions} THEN 1 ELSE 0 END) as any_null + FROM {table} + """ + + result = execute_fn(query) + total = self.safe_float(result, "total") or 0 + any_null = self.safe_float(result, "any_null") or 0 + + if total == 0: + return 1.0 + return (total - any_null) / total + + def to_string(self) -> str: + col_str = ", ".join(self.columns) + if self.assertion: + return f"haveCompleteness({col_str}, assertion)" + return f"areComplete({col_str})" + + +__all__ = [ + # Analyzer-based evaluators + "SizeEvaluator", + "CompletenessEvaluator", + "MeanEvaluator", + "MinimumEvaluator", + "MaximumEvaluator", + "SumEvaluator", + "StandardDeviationEvaluator", + "UniquenessEvaluator", + "DistinctnessEvaluator", + "UniqueValueRatioEvaluator", + "CorrelationEvaluator", + "EntropyEvaluator", + "MutualInformationEvaluator", + "PatternMatchEvaluator", + "MinLengthEvaluator", + "MaxLengthEvaluator", + "ApproxCountDistinctEvaluator", + "ApproxQuantileEvaluator", + "ComplianceEvaluator", + # Ratio-check evaluators + "IsPositiveEvaluator", + "IsNonNegativeEvaluator", + "IsContainedInEvaluator", + "ContainsEmailEvaluator", + "ContainsURLEvaluator", + "ContainsCreditCardEvaluator", + "ContainsSSNEvaluator", + # Comparison evaluators + "ColumnComparisonEvaluator", + # Multi-column evaluators + "MultiColumnCompletenessEvaluator", +] diff --git a/pydeequ/engines/constraints/factory.py b/pydeequ/engines/constraints/factory.py new file mode 100644 index 0000000..c04e60c --- /dev/null +++ b/pydeequ/engines/constraints/factory.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- +""" +Factory for creating constraint evaluators. + +This module provides a registry-based factory pattern for creating +evaluator instances from constraint protobufs. +""" + +from __future__ import annotations + +from typing import Dict, Optional, Type + +from pydeequ.engines.constraints.base import BaseEvaluator +from pydeequ.engines.constraints.evaluators import ( + ApproxCountDistinctEvaluator, + ApproxQuantileEvaluator, + ColumnComparisonEvaluator, + CompletenessEvaluator, + ComplianceEvaluator, + ContainsCreditCardEvaluator, + ContainsEmailEvaluator, + ContainsSSNEvaluator, + ContainsURLEvaluator, + CorrelationEvaluator, + DistinctnessEvaluator, + EntropyEvaluator, + IsContainedInEvaluator, + IsNonNegativeEvaluator, + IsPositiveEvaluator, + MaximumEvaluator, + MaxLengthEvaluator, + MeanEvaluator, + MinimumEvaluator, + MinLengthEvaluator, + MultiColumnCompletenessEvaluator, + MutualInformationEvaluator, + PatternMatchEvaluator, + SizeEvaluator, + StandardDeviationEvaluator, + SumEvaluator, + UniquenessEvaluator, + UniqueValueRatioEvaluator, +) + +class ConstraintEvaluatorFactory: + """ + Factory for creating constraint evaluators from protobufs. + + This factory uses a registry pattern to map constraint type strings + to their corresponding evaluator classes. 
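+
+    Example (hypothetical constraint protobuf and execute function):
+
+        evaluator = ConstraintEvaluatorFactory.create(constraint_proto)
+        if evaluator is not None:
+            value = evaluator.compute_value("my_table", execute_fn)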
+
+    """

+    _registry: Dict[str, Type[BaseEvaluator]] = {
+        # Analyzer-based evaluators
+        "hasSize": SizeEvaluator,
+        "isComplete": CompletenessEvaluator,
+        "hasCompleteness": CompletenessEvaluator,
+        "hasMean": MeanEvaluator,
+        "hasMin": MinimumEvaluator,
+        "hasMax": MaximumEvaluator,
+        "hasSum": SumEvaluator,
+        "hasStandardDeviation": StandardDeviationEvaluator,
+        "isUnique": UniquenessEvaluator,
+        "hasUniqueness": UniquenessEvaluator,
+        "hasDistinctness": DistinctnessEvaluator,
+        "hasUniqueValueRatio": UniqueValueRatioEvaluator,
+        "hasCorrelation": CorrelationEvaluator,
+        "hasEntropy": EntropyEvaluator,
+        "hasMutualInformation": MutualInformationEvaluator,
+        "hasPattern": PatternMatchEvaluator,
+        "hasMinLength": MinLengthEvaluator,
+        "hasMaxLength": MaxLengthEvaluator,
+        "hasApproxCountDistinct": ApproxCountDistinctEvaluator,
+        "hasApproxQuantile": ApproxQuantileEvaluator,
+        "satisfies": ComplianceEvaluator,
+        # Ratio-check evaluators
+        "isPositive": IsPositiveEvaluator,
+        "isNonNegative": IsNonNegativeEvaluator,
+        "isContainedIn": IsContainedInEvaluator,
+        "containsEmail": ContainsEmailEvaluator,
+        "containsURL": ContainsURLEvaluator,
+        "containsCreditCardNumber": ContainsCreditCardEvaluator,
+        "containsSocialSecurityNumber": ContainsSSNEvaluator,
+        # Comparison evaluators
+        "isLessThan": ColumnComparisonEvaluator,
+        "isLessThanOrEqualTo": ColumnComparisonEvaluator,
+        "isGreaterThan": ColumnComparisonEvaluator,
+        "isGreaterThanOrEqualTo": ColumnComparisonEvaluator,
+        # Multi-column evaluators
+        "areComplete": MultiColumnCompletenessEvaluator,
+        "haveCompleteness": MultiColumnCompletenessEvaluator,
+    }
+
+    @classmethod
+    def create(cls, constraint_proto) -> Optional[BaseEvaluator]:
+        """
+        Create an evaluator instance from a constraint protobuf.
+
+        Args:
+            constraint_proto: Protobuf message containing constraint definition
+
+        Returns:
+            Evaluator instance, or None if the constraint type is not supported
+        """
+        evaluator_class = cls._registry.get(constraint_proto.type)
+        if evaluator_class:
+            return evaluator_class(constraint_proto)
+        return None
+
+    @classmethod
+    def is_supported(cls, constraint_type: str) -> bool:
+        """
+        Check if a constraint type is supported by the factory.
+
+        Args:
+            constraint_type: The constraint type string to check
+
+        Returns:
+            True if the constraint type is supported
+        """
+        return constraint_type in cls._registry
+
+    @classmethod
+    def supported_types(cls) -> list:
+        """
+        Get a list of all supported constraint types.
+
+        Returns:
+            List of supported constraint type strings
+        """
+        return list(cls._registry.keys())
+
+
+__all__ = [
+    "ConstraintEvaluatorFactory",
+]
diff --git a/pydeequ/engines/constraints/protocols.py b/pydeequ/engines/constraints/protocols.py
new file mode 100644
index 0000000..87a7e67
--- /dev/null
+++ b/pydeequ/engines/constraints/protocols.py
@@ -0,0 +1,73 @@
+# -*- coding: utf-8 -*-
+"""
+Protocol definitions for constraint evaluators.
+
+This module defines the structural typing contracts that all constraint
+evaluators must satisfy.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Callable, Optional, Protocol, runtime_checkable
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+    from pydeequ.v2.predicates import Predicate
+
+
+@runtime_checkable
+class ConstraintEvaluatorProtocol(Protocol):
+    """
+    Contract for constraint evaluators.
+
+    Constraint evaluators compute values from data and evaluate
+    assertions against those values to determine pass/fail status.
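+
+    Because the protocol is runtime_checkable, isinstance() can be used to
+    test for the presence of these members (a sketch; method signatures are
+    not verified at runtime):
+
+        if isinstance(obj, ConstraintEvaluatorProtocol):
+            value = obj.compute_value("my_table", execute_fn)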
+ """ + + @property + def constraint_type(self) -> str: + """Return the constraint type identifier.""" + ... + + def compute_value( + self, table: str, execute_fn: Callable[[str], "pd.DataFrame"] + ) -> Optional[float]: + """ + Compute the metric value for this constraint. + + Args: + table: Name of the table to query + execute_fn: Function to execute SQL and return DataFrame + + Returns: + Computed metric value, or None if computation fails + """ + ... + + def evaluate( + self, value: Optional[float], assertion: Optional["Predicate"] = None + ) -> bool: + """ + Evaluate whether the computed value satisfies the constraint. + + Args: + value: The computed metric value + assertion: Optional predicate to evaluate against + + Returns: + True if the constraint is satisfied, False otherwise + """ + ... + + def to_string(self) -> str: + """ + Return a human-readable string representation of the constraint. + + Returns: + Description of what the constraint checks + """ + ... + + +__all__ = [ + "ConstraintEvaluatorProtocol", +] diff --git a/pydeequ/engines/duckdb.py b/pydeequ/engines/duckdb.py new file mode 100644 index 0000000..e4b0200 --- /dev/null +++ b/pydeequ/engines/duckdb.py @@ -0,0 +1,533 @@ +# -*- coding: utf-8 -*- +""" +DuckDB execution engine for PyDeequ. + +This module provides a DuckDB-based execution engine that runs data quality +checks directly via SQL queries, without requiring a Spark cluster. + +Example usage: + import duckdb + from pydeequ.engines.duckdb import DuckDBEngine + from pydeequ.v2.analyzers import Size, Completeness, Mean + + con = duckdb.connect() + con.execute("CREATE TABLE test AS SELECT 1 as id, 2 as value") + + engine = DuckDBEngine(con, table="test") + metrics = engine.compute_metrics([Size(), Completeness("id"), Mean("value")]) + + # With profiling enabled + engine = DuckDBEngine(con, table="test", enable_profiling=True) + engine.compute_metrics([Size(), Completeness("id")]) + stats = engine.get_query_stats() + print(f"Total queries: {engine.get_query_count()}") +""" + +from __future__ import annotations + +import time +from typing import TYPE_CHECKING, Dict, List, Optional, Sequence + +import pandas as pd + +from pydeequ.engines import ( + BaseEngine, + ColumnProfile, + ConstraintResult, + ConstraintSuggestion, + ConstraintStatus, + CheckStatus, + MetricResult, +) +from pydeequ.engines.operators import GroupingOperatorBatcher, OperatorFactory + +if TYPE_CHECKING: + import duckdb + from pydeequ.engines.duckdb_config import DuckDBEngineConfig + from pydeequ.v2.analyzers import _ConnectAnalyzer + from pydeequ.v2.checks import Check + from pydeequ.v2.predicates import Predicate + + +class DuckDBEngine(BaseEngine): + """ + DuckDB-based execution engine. + + This engine executes data quality checks using DuckDB SQL queries. + It supports most analyzers through standard SQL aggregations. + + Attributes: + con: DuckDB connection + table: Name of the table to analyze + enable_profiling: Whether to collect query timing statistics + config: Optional configuration for DuckDB optimization + """ + + def __init__( + self, + con: "duckdb.DuckDBPyConnection", + table: str, + enable_profiling: bool = False, + config: Optional["DuckDBEngineConfig"] = None, + ): + """ + Create a new DuckDBEngine. 
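+
+        Note: when config is provided it is applied to the connection
+        immediately, so its SET commands also affect any queries issued
+        on that connection outside this engine.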
+ + Args: + con: DuckDB connection object + table: Name of the table to analyze + enable_profiling: Whether to collect query timing statistics + config: Optional DuckDB configuration for optimization + """ + self.con = con + self.table = table + self._schema: Optional[Dict[str, str]] = None + self._enable_profiling = enable_profiling + self._query_stats: List[Dict] = [] + + # Apply configuration if provided + if config is not None: + config.apply(con) + + def get_schema(self) -> Dict[str, str]: + """Get the schema of the table.""" + if self._schema is None: + df = self.con.execute(f"PRAGMA table_info('{self.table}')").fetchdf() + self._schema = {} + for _, row in df.iterrows(): + # Normalize type names to uppercase for consistency + col_type = str(row["type"]).upper() + # Extract base type (e.g., "DECIMAL(10,2)" -> "DECIMAL") + base_type = col_type.split("(")[0] + self._schema[row["name"]] = base_type + return self._schema + + def _execute_query(self, query: str) -> pd.DataFrame: + """Execute a SQL query and return results as DataFrame.""" + if self._enable_profiling: + start = time.perf_counter() + result = self.con.execute(query).fetchdf() + elapsed = time.perf_counter() - start + self._query_stats.append({ + 'query': query[:200] + ('...' if len(query) > 200 else ''), + 'time_ms': elapsed * 1000, + 'rows': len(result), + }) + return result + return self.con.execute(query).fetchdf() + + def get_query_stats(self) -> pd.DataFrame: + """Return profiling statistics as DataFrame.""" + return pd.DataFrame(self._query_stats) + + def get_query_count(self) -> int: + """Return number of queries executed.""" + return len(self._query_stats) + + def explain_query(self, query: str) -> str: + """Get DuckDB query plan with EXPLAIN ANALYZE.""" + return self.con.execute(f"EXPLAIN ANALYZE {query}").fetchdf().to_string() + + def reset_profiling(self) -> None: + """Reset profiling statistics.""" + self._query_stats = [] + + def _get_row_count(self, where: Optional[str] = None) -> int: + """Get the row count, optionally filtered.""" + if where: + query = f"SELECT COUNT(*) as cnt FROM {self.table} WHERE {where}" + else: + query = f"SELECT COUNT(*) as cnt FROM {self.table}" + result = self._execute_query(query) + return int(result["cnt"].iloc[0]) + + # ========================================================================= + # Main compute_metrics implementation using operators + # ========================================================================= + + def compute_metrics( + self, analyzers: Sequence["_ConnectAnalyzer"] + ) -> List[MetricResult]: + """ + Compute metrics for the given analyzers. + + This method uses the operator abstraction to: + 1. Create operators from analyzers via OperatorFactory + 2. Batch scan operators into a single SQL query + 3. Execute grouping operators individually + 4. Handle metadata operators using schema access + 5. 
Extract results using operator-specific logic + """ + results: List[MetricResult] = [] + + # Separate analyzers by operator type + scan_operators = [] + grouping_operators = [] + metadata_operators = [] + + for analyzer in analyzers: + if OperatorFactory.is_scan_operator(analyzer): + operator = OperatorFactory.create(analyzer) + if operator: + scan_operators.append(operator) + elif OperatorFactory.is_grouping_operator(analyzer): + operator = OperatorFactory.create(analyzer) + if operator: + grouping_operators.append(operator) + elif OperatorFactory.is_metadata_operator(analyzer): + operator = OperatorFactory.create(analyzer) + if operator: + metadata_operators.append(operator) + else: + # Unsupported analyzer + results.append(MetricResult( + name=type(analyzer).__name__, + instance=getattr(analyzer, 'column', '*'), + entity="Column" if hasattr(analyzer, 'column') else "Dataset", + value=None, + success=False, + message=f"Analyzer {type(analyzer).__name__} not implemented" + )) + + # Execute batched scan query + if scan_operators: + try: + # Collect all aggregations + aggregations = [] + for operator in scan_operators: + aggregations.extend(operator.get_aggregations()) + + # Build and execute single query + query = f"SELECT {', '.join(aggregations)} FROM {self.table}" + scan_result = self._execute_query(query) + + # Extract results from each operator + for operator in scan_operators: + try: + result = operator.extract_result(scan_result) + results.append(result) + except Exception as e: + results.append(MetricResult( + name=operator.metric_name, + instance=operator.instance, + entity=operator.entity, + value=None, + success=False, + message=str(e) + )) + + except Exception as e: + # If batch query fails, report error for all scan operators + for operator in scan_operators: + results.append(MetricResult( + name=operator.metric_name, + instance=operator.instance, + entity=operator.entity, + value=None, + success=False, + message=f"Batch query failed: {str(e)}" + )) + + # Execute grouping operators with batching optimization + if grouping_operators: + batcher = GroupingOperatorBatcher(grouping_operators) + + # Execute batched queries (fused operators with same columns/where) + try: + batched_results = batcher.execute_batched( + self.table, self._execute_query + ) + results.extend(batched_results) + except Exception as e: + # If batched execution fails, fall back to individual execution + for operator in grouping_operators: + if operator not in batcher.get_unbatchable_operators(): + results.append(MetricResult( + name=operator.metric_name, + instance=operator.instance, + entity=operator.entity, + value=None, + success=False, + message=f"Batched query failed: {str(e)}" + )) + + # Execute unbatchable operators individually + for operator in batcher.get_unbatchable_operators(): + try: + query = operator.build_query(self.table) + df = self._execute_query(query) + result = operator.extract_result(df) + results.append(result) + except Exception as e: + results.append(MetricResult( + name=operator.metric_name, + instance=operator.instance, + entity=operator.entity, + value=None, + success=False, + message=str(e) + )) + + # Execute metadata operators using schema + schema = self.get_schema() + for operator in metadata_operators: + try: + result = operator.compute_from_schema(schema) + results.append(result) + except Exception as e: + results.append(MetricResult( + name=operator.metric_name, + instance=operator.instance, + entity=operator.entity, + value=None, + success=False, + message=str(e) + )) + + 
return results + + # ========================================================================= + # Constraint checking + # ========================================================================= + + def run_checks(self, checks: Sequence["Check"]) -> List[ConstraintResult]: + """Run verification checks and return constraint results. + + Uses ConstraintBatchEvaluator to batch compatible constraints, + reducing the number of SQL queries executed. + """ + from pydeequ.v2.checks import CheckLevel + from pydeequ.engines.constraints import ( + ConstraintBatchEvaluator, + ConstraintEvaluatorFactory, + ) + + results: List[ConstraintResult] = [] + + # Phase 1: Create all evaluators and collect metadata + all_evaluators = [] + constraint_info = [] # (check, constraint, evaluator) tuples + + for check in checks: + for constraint in check._constraints: + evaluator = ConstraintEvaluatorFactory.create(constraint) + if evaluator: + all_evaluators.append(evaluator) + constraint_info.append((check, constraint, evaluator)) + else: + constraint_info.append((check, constraint, None)) + + # Phase 2: Batch execute all evaluators + computed_values: Dict = {} + if all_evaluators: + batcher = ConstraintBatchEvaluator(all_evaluators) + computed_values = batcher.execute(self.table, self._execute_query) + + # Phase 3: Process results by check + info_idx = 0 + for check in checks: + check_description = check.description + check_level = check.level.value + check_has_failure = False + + for constraint in check._constraints: + _, _, evaluator = constraint_info[info_idx] + info_idx += 1 + + constraint_message = None + constraint_passed = False + + try: + if evaluator: + # Get pre-computed value from batch execution + value = computed_values.get(evaluator) + + # Evaluate the constraint + constraint_passed = evaluator.evaluate(value) + + # Get constraint description + constraint_str = evaluator.to_string() + + if not constraint_passed: + if value is not None: + constraint_message = f"Value: {value:.6g}" + else: + constraint_message = "Could not compute metric" + else: + constraint_str = constraint.type + constraint_message = f"Unknown constraint type: {constraint.type}" + + except Exception as e: + constraint_str = constraint.type + constraint_message = f"Error: {str(e)}" + constraint_passed = False + + if not constraint_passed: + check_has_failure = True + + results.append(ConstraintResult( + check_description=check_description, + check_level=check_level, + check_status=CheckStatus.ERROR.value if check_has_failure else CheckStatus.SUCCESS.value, + constraint=constraint_str, + constraint_status=ConstraintStatus.SUCCESS.value if constraint_passed else ConstraintStatus.FAILURE.value, + constraint_message=constraint_message, + )) + + # Update check status for all constraints in this check + final_status = CheckStatus.ERROR.value if check_has_failure else CheckStatus.SUCCESS.value + if check.level == CheckLevel.Warning and check_has_failure: + final_status = CheckStatus.WARNING.value + + for i in range(len(results) - len(check._constraints), len(results)): + results[i] = ConstraintResult( + check_description=results[i].check_description, + check_level=results[i].check_level, + check_status=final_status, + constraint=results[i].constraint, + constraint_status=results[i].constraint_status, + constraint_message=results[i].constraint_message, + ) + + return results + + # ========================================================================= + # Column profiling + # 
========================================================================= + + def profile_columns( + self, + columns: Optional[Sequence[str]] = None, + low_cardinality_threshold: int = 0, + ) -> List[ColumnProfile]: + """ + Profile columns in the table. + + Uses MultiColumnProfileOperator to batch profile statistics across + multiple columns, significantly reducing the number of SQL queries + from 2-3 per column to 2-3 total. + + Args: + columns: Optional list of columns to profile. If None, profile all. + low_cardinality_threshold: Threshold for histogram computation. + If > 0 and distinct values <= threshold, compute histogram. + + Returns: + List of ColumnProfile objects + """ + from pydeequ.engines.operators.profiling_operators import ( + ColumnProfileOperator, + MultiColumnProfileOperator, + ) + + schema = self.get_schema() + + # Determine which columns to profile + if columns: + cols_to_profile = [c for c in columns if c in schema] + else: + cols_to_profile = list(schema.keys()) + + if not cols_to_profile: + return [] + + # Use MultiColumnProfileOperator for batched profiling + operator = MultiColumnProfileOperator(cols_to_profile, schema) + + # Query 1: Completeness and distinct counts for all columns + completeness_query = operator.build_completeness_query(self.table) + completeness_df = self._execute_query(completeness_query) + + # Query 2: Numeric stats for all numeric columns (if any) + numeric_df = None + numeric_query = operator.build_numeric_stats_query(self.table) + if numeric_query: + numeric_df = self._execute_query(numeric_query) + + # Query 3: Percentiles for all numeric columns (if any) + percentile_df = None + percentile_query = operator.build_percentile_query(self.table) + if percentile_query: + try: + percentile_df = self._execute_query(percentile_query) + except Exception: + # Percentile computation may fail for some types + pass + + # Extract profiles from batched results + profiles = operator.extract_profiles(completeness_df, numeric_df, percentile_df) + + # Add histograms for low cardinality columns (requires per-column queries) + if low_cardinality_threshold > 0: + for profile in profiles: + if profile.approx_distinct_values <= low_cardinality_threshold: + col_type = schema.get(profile.column, "VARCHAR") + col_operator = ColumnProfileOperator( + column=profile.column, + column_type=col_type, + compute_percentiles=False, + compute_histogram=True, + histogram_limit=low_cardinality_threshold, + ) + hist_query = col_operator.build_histogram_query(self.table) + hist_result = self._execute_query(hist_query) + profile.histogram = col_operator.extract_histogram_result(hist_result) + + return profiles + + # ========================================================================= + # Constraint suggestions + # ========================================================================= + + def suggest_constraints( + self, + columns: Optional[Sequence[str]] = None, + rules: Optional[Sequence[str]] = None, + ) -> List[ConstraintSuggestion]: + """ + Suggest constraints based on data characteristics. + + Uses the SuggestionRunner to apply modular suggestion rules against + column profiles. Rules are organized into sets: + - DEFAULT: completeness, non-negative, categorical + - NUMERICAL: min, max, mean + - STRING: min/max length + - COMMON: uniqueness + - EXTENDED: all rules + + Args: + columns: Optional list of columns to analyze. If None, analyze all. + rules: Optional list of rule sets to apply. Defaults to ["DEFAULT"]. 
+ + Returns: + List of ConstraintSuggestion objects + """ + from pydeequ.engines.suggestions import SuggestionRunner + from pydeequ.v2.suggestions import Rules + + # Default rules - normalize to strings for SuggestionRunner + if rules is None: + rule_strings = ["DEFAULT"] + else: + # Accept both Rules enum and string values + rule_strings = [] + for rule in rules: + if isinstance(rule, Rules): + rule_strings.append(rule.value) + else: + rule_strings.append(rule) + + # Profile columns with histograms for categorical detection + profiles = self.profile_columns(columns, low_cardinality_threshold=100) + + # Get row count for uniqueness checks + row_count = self._get_row_count() + + # Run suggestion rules + runner = SuggestionRunner(rule_sets=rule_strings) + return runner.run( + profiles, + execute_fn=self._execute_query, + table=self.table, + row_count=row_count, + ) diff --git a/pydeequ/engines/duckdb_config.py b/pydeequ/engines/duckdb_config.py new file mode 100644 index 0000000..b470ae0 --- /dev/null +++ b/pydeequ/engines/duckdb_config.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- +""" +DuckDB engine configuration for PyDeequ. + +This module provides configuration options to optimize DuckDB performance +for analytical workloads like data quality checks. + +Example usage: + import duckdb + from pydeequ.engines.duckdb import DuckDBEngine + from pydeequ.engines.duckdb_config import DuckDBEngineConfig + + # Create config with optimizations + config = DuckDBEngineConfig( + threads=8, + memory_limit="8GB", + preserve_insertion_order=False, # Better parallelism + ) + + con = duckdb.connect() + config.apply(con) + + engine = DuckDBEngine(con, table="test") +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import TYPE_CHECKING, Dict, Optional + +if TYPE_CHECKING: + import duckdb + + +@dataclass +class DuckDBEngineConfig: + """ + Configuration for DuckDB engine optimization. + + Attributes: + threads: Number of threads to use. None = auto (all cores). + memory_limit: Memory limit string (e.g., "8GB"). None = auto. + preserve_insertion_order: If False, allows better parallelism. + Set to False for read-only analytical workloads. + parquet_metadata_cache: Cache Parquet metadata for faster repeated reads. + enable_object_cache: Enable object caching for repeated queries. + enable_progress_bar: Show progress bar for long-running queries. + default_null_order: How to order NULLs (NULLS_FIRST, NULLS_LAST). + custom_settings: Additional DuckDB settings as key-value pairs. + """ + + threads: Optional[int] = None + memory_limit: Optional[str] = None + preserve_insertion_order: bool = False + parquet_metadata_cache: bool = True + enable_object_cache: bool = True + enable_progress_bar: bool = False + default_null_order: str = "NULLS_LAST" + custom_settings: Dict[str, str] = field(default_factory=dict) + + def apply(self, con: "duckdb.DuckDBPyConnection") -> None: + """ + Apply configuration settings to a DuckDB connection. 
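+
+        Example (a minimal sketch using an in-memory connection):
+
+            con = duckdb.connect()
+            DuckDBEngineConfig(threads=4, memory_limit="2GB").apply(con)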
+ + Args: + con: DuckDB connection to configure + """ + # Thread configuration + if self.threads is not None: + con.execute(f"SET threads = {self.threads}") + + # Memory configuration + if self.memory_limit is not None: + con.execute(f"SET memory_limit = '{self.memory_limit}'") + + # Parallelism optimization + con.execute( + f"SET preserve_insertion_order = {str(self.preserve_insertion_order).lower()}" + ) + + # Caching optimization + if self.parquet_metadata_cache: + con.execute("SET parquet_metadata_cache = true") + + if self.enable_object_cache: + con.execute("SET enable_object_cache = true") + + # Progress bar (useful for debugging long queries) + con.execute( + f"SET enable_progress_bar = {str(self.enable_progress_bar).lower()}" + ) + + # NULL ordering + con.execute(f"SET default_null_order = '{self.default_null_order}'") + + # Apply custom settings + for key, value in self.custom_settings.items(): + # Determine if value needs quoting (strings vs numbers/booleans) + if value.lower() in ("true", "false") or value.isdigit(): + con.execute(f"SET {key} = {value}") + else: + con.execute(f"SET {key} = '{value}'") + + @classmethod + def default(cls) -> "DuckDBEngineConfig": + """Create a default configuration.""" + return cls() + + @classmethod + def high_performance(cls) -> "DuckDBEngineConfig": + """ + Create a high-performance configuration for analytical workloads. + + This configuration prioritizes read performance over write safety: + - Disables insertion order preservation for better parallelism + - Enables all caching options + - Uses all available cores + """ + return cls( + threads=None, # Use all cores + preserve_insertion_order=False, + parquet_metadata_cache=True, + enable_object_cache=True, + ) + + @classmethod + def memory_constrained(cls, memory_limit: str = "4GB") -> "DuckDBEngineConfig": + """ + Create a configuration for memory-constrained environments. + + Args: + memory_limit: Memory limit string (e.g., "4GB") + """ + return cls( + memory_limit=memory_limit, + enable_object_cache=False, # Reduce memory usage + ) + + +__all__ = ["DuckDBEngineConfig"] diff --git a/pydeequ/engines/operators/__init__.py b/pydeequ/engines/operators/__init__.py new file mode 100644 index 0000000..a76f8b9 --- /dev/null +++ b/pydeequ/engines/operators/__init__.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- +""" +SQL Operator abstractions for data quality metrics. + +This module provides a hierarchical operator abstraction pattern that: +1. Eliminates code duplication across analyzer implementations +2. Separates SQL generation from result extraction +3. Enables efficient batch execution of scan operators +4. 
Provides consistent WHERE clause handling + +Architecture: + Protocols (Contracts) + ├── ScanOperatorProtocol - Single-pass aggregation operators + └── GroupingOperatorProtocol - GROUP BY-based operators + + Mixins (Shared Behaviors) + ├── WhereClauseMixin - Conditional aggregation wrapping + ├── SafeExtractMixin - Safe value extraction from DataFrames + └── ColumnAliasMixin - Consistent alias generation + + Base Classes (Hierarchy) + ├── ScanOperator - Base for single-pass operators + └── GroupingOperator - Base for GROUP BY operators + + Factory + └── OperatorFactory - Creates operators from analyzers + +Example usage: + from pydeequ.engines.operators import OperatorFactory + + # Create operator from analyzer + operator = OperatorFactory.create(Mean("price")) + + # Get SQL aggregations for scan operators + aggregations = operator.get_aggregations() + # ["AVG(price) AS mean_price"] + + # Execute query and extract result + df = engine._execute_query(f"SELECT {', '.join(aggregations)} FROM table") + result = operator.extract_result(df) +""" + +from pydeequ.engines.operators.base import GroupingOperator, ScanOperator +from pydeequ.engines.operators.factory import OperatorFactory +from pydeequ.engines.operators.grouping_batcher import ( + GroupingOperatorBatcher, + BATCHABLE_OPERATORS, +) +from pydeequ.engines.operators.grouping_operators import ( + DistinctnessOperator, + EntropyOperator, + HistogramOperator, + MutualInformationOperator, + UniqueValueRatioOperator, + UniquenessOperator, +) +from pydeequ.engines.operators.metadata_operators import ( + DataTypeOperator, +) +from pydeequ.engines.operators.profiling_operators import ( + ColumnProfileOperator, + MultiColumnProfileOperator, + NUMERIC_TYPES, + STRING_TYPES, +) +from pydeequ.engines.operators.mixins import ( + ColumnAliasMixin, + SafeExtractMixin, + WhereClauseMixin, +) +from pydeequ.engines.operators.protocols import ( + GroupingOperatorProtocol, + ScanOperatorProtocol, +) +from pydeequ.engines.operators.scan_operators import ( + ApproxCountDistinctOperator, + ApproxQuantileOperator, + ComplianceOperator, + CompletenessOperator, + CorrelationOperator, + CountDistinctOperator, + MaximumOperator, + MaxLengthOperator, + MeanOperator, + MinimumOperator, + MinLengthOperator, + PatternMatchOperator, + SizeOperator, + StandardDeviationOperator, + SumOperator, +) + +__all__ = [ + # Protocols + "ScanOperatorProtocol", + "GroupingOperatorProtocol", + # Mixins + "WhereClauseMixin", + "SafeExtractMixin", + "ColumnAliasMixin", + # Base classes + "ScanOperator", + "GroupingOperator", + # Scan operators + "SizeOperator", + "CompletenessOperator", + "MeanOperator", + "SumOperator", + "MinimumOperator", + "MaximumOperator", + "StandardDeviationOperator", + "MaxLengthOperator", + "MinLengthOperator", + "PatternMatchOperator", + "ComplianceOperator", + "CorrelationOperator", + "CountDistinctOperator", + "ApproxCountDistinctOperator", + "ApproxQuantileOperator", + # Grouping operators + "DistinctnessOperator", + "UniquenessOperator", + "UniqueValueRatioOperator", + "EntropyOperator", + "MutualInformationOperator", + "HistogramOperator", + # Grouping operator batching + "GroupingOperatorBatcher", + "BATCHABLE_OPERATORS", + # Metadata operators + "DataTypeOperator", + # Profiling operators + "ColumnProfileOperator", + "MultiColumnProfileOperator", + "NUMERIC_TYPES", + "STRING_TYPES", + # Factory + "OperatorFactory", +] diff --git a/pydeequ/engines/operators/base.py b/pydeequ/engines/operators/base.py new file mode 100644 index 0000000..88ef664 --- 
/dev/null +++ b/pydeequ/engines/operators/base.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- +""" +Base classes for SQL operators. + +This module provides the abstract base classes that combine protocols +and mixins to create the foundation for concrete operator implementations. +""" + +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, List, Optional + +from pydeequ.engines.operators.mixins import ( + ColumnAliasMixin, + SafeExtractMixin, + WhereClauseMixin, +) + +if TYPE_CHECKING: + import pandas as pd + from pydeequ.engines import MetricResult + + +class ScanOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin, ABC): + """ + Base class for single-pass aggregation operators. + + Scan operators compute metrics via SQL aggregations that can be + combined into a single SELECT statement. This enables efficient + batch execution where multiple metrics are computed in one query. + + Subclasses must implement: + - get_aggregations(): Return SQL aggregation expressions + - extract_result(): Parse query results into MetricResult + + Attributes: + column: Column name to analyze + where: Optional SQL WHERE clause for filtering + """ + + def __init__(self, column: str, where: Optional[str] = None): + """ + Initialize scan operator. + + Args: + column: Column name to analyze + where: Optional SQL WHERE clause for filtering + """ + self.column = column + self.where = where + + @abstractmethod + def get_aggregations(self) -> List[str]: + """ + Return SQL aggregation expressions. + + Returns: + List of SQL aggregation expressions with AS alias clauses + """ + raise NotImplementedError + + @abstractmethod + def extract_result(self, df: "pd.DataFrame") -> "MetricResult": + """ + Extract metric from query result DataFrame. + + Args: + df: DataFrame containing query results + + Returns: + MetricResult with extracted value + """ + raise NotImplementedError + + @property + def instance(self) -> str: + """Return the instance identifier for this operator.""" + return self.column + + @property + def entity(self) -> str: + """Return the entity type for this operator.""" + return "Column" + + @property + @abstractmethod + def metric_name(self) -> str: + """Return the metric name for this operator.""" + raise NotImplementedError + + +class GroupingOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin, ABC): + """ + Base class for operators requiring GROUP BY queries. + + Grouping operators need to compute intermediate aggregations + via GROUP BY before computing the final metric. They cannot + be batched with scan operators and require separate queries. + + Subclasses must implement: + - get_grouping_columns(): Return columns to GROUP BY + - build_query(): Build complete CTE-based query + - extract_result(): Parse query results into MetricResult + + Attributes: + columns: Column name(s) to analyze + where: Optional SQL WHERE clause for filtering + """ + + def __init__(self, columns: List[str], where: Optional[str] = None): + """ + Initialize grouping operator. + + Args: + columns: Column name(s) to analyze + where: Optional SQL WHERE clause for filtering + """ + self.columns = columns + self.where = where + + @abstractmethod + def get_grouping_columns(self) -> List[str]: + """ + Return columns to GROUP BY. + + Returns: + List of column names for the GROUP BY clause + """ + raise NotImplementedError + + @abstractmethod + def build_query(self, table: str) -> str: + """ + Build complete CTE-based query. 
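+
+        Illustrative only: concrete operators typically emit a frequency
+        CTE of the form
+
+            WITH freq AS (SELECT col, COUNT(*) AS cnt FROM t GROUP BY col)
+            SELECT COUNT(*) AS distinct_count, SUM(cnt) AS total_count
+            FROM freq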
+ + Args: + table: Name of the table to query + + Returns: + Complete SQL query string + """ + raise NotImplementedError + + @abstractmethod + def extract_result(self, df: "pd.DataFrame") -> "MetricResult": + """ + Extract metric from query result DataFrame. + + Args: + df: DataFrame containing query results + + Returns: + MetricResult with extracted value + """ + raise NotImplementedError + + @property + def instance(self) -> str: + """Return the instance identifier for this operator.""" + return ",".join(self.columns) + + @property + def entity(self) -> str: + """Return the entity type for this operator.""" + return "Multicolumn" if len(self.columns) > 1 else "Column" + + @property + @abstractmethod + def metric_name(self) -> str: + """Return the metric name for this operator.""" + raise NotImplementedError + + +__all__ = [ + "ScanOperator", + "GroupingOperator", +] diff --git a/pydeequ/engines/operators/factory.py b/pydeequ/engines/operators/factory.py new file mode 100644 index 0000000..ed3c2ce --- /dev/null +++ b/pydeequ/engines/operators/factory.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- +""" +Operator factory for creating operators from analyzers. + +This module provides a registry-based factory pattern that eliminates +isinstance() chains when creating operators from analyzers. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Dict, Optional, Type, Union + +from pydeequ.engines.operators.grouping_operators import ( + DistinctnessOperator, + EntropyOperator, + HistogramOperator, + MutualInformationOperator, + UniqueValueRatioOperator, + UniquenessOperator, +) +from pydeequ.engines.operators.metadata_operators import ( + DataTypeOperator, +) +from pydeequ.engines.operators.scan_operators import ( + ApproxCountDistinctOperator, + ApproxQuantileOperator, + ComplianceOperator, + CompletenessOperator, + CorrelationOperator, + CountDistinctOperator, + MaximumOperator, + MaxLengthOperator, + MeanOperator, + MinimumOperator, + MinLengthOperator, + PatternMatchOperator, + SizeOperator, + StandardDeviationOperator, + SumOperator, +) + +if TYPE_CHECKING: + from pydeequ.engines.operators.base import GroupingOperator, ScanOperator + from pydeequ.v2.analyzers import _ConnectAnalyzer + +# Type alias for operator types +OperatorType = Union["ScanOperator", "GroupingOperator", "DataTypeOperator"] + + +class OperatorFactory: + """ + Creates operators from analyzers using registry pattern. + + This factory eliminates isinstance() chains by mapping analyzer + types to their corresponding operator classes. + """ + + # Registry mapping analyzer type names to operator classes + _scan_registry: Dict[str, Type] = {} + _grouping_registry: Dict[str, Type] = {} + _metadata_registry: Dict[str, Type] = {} + + @classmethod + def register_scan(cls, analyzer_name: str): + """ + Decorator to register a scan operator for an analyzer type. + + Args: + analyzer_name: Name of the analyzer class (e.g., "Mean", "Sum") + + Returns: + Decorator function + """ + def decorator(operator_class: Type): + cls._scan_registry[analyzer_name] = operator_class + return operator_class + return decorator + + @classmethod + def register_grouping(cls, analyzer_name: str): + """ + Decorator to register a grouping operator for an analyzer type. 
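+
+        Hypothetical registration (this change actually populates the
+        registries via the module-level dictionary assignments below):
+
+            @OperatorFactory.register_grouping("Distinctness")
+            class DistinctnessOperator(GroupingOperator):
+                ...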
+ + Args: + analyzer_name: Name of the analyzer class + + Returns: + Decorator function + """ + def decorator(operator_class: Type): + cls._grouping_registry[analyzer_name] = operator_class + return operator_class + return decorator + + @classmethod + def register_metadata(cls, analyzer_name: str): + """ + Decorator to register a metadata operator for an analyzer type. + + Metadata operators compute metrics from schema information rather + than SQL queries. They are used for type inference and similar + schema-based analysis. + + Args: + analyzer_name: Name of the analyzer class + + Returns: + Decorator function + """ + def decorator(operator_class: Type): + cls._metadata_registry[analyzer_name] = operator_class + return operator_class + return decorator + + @classmethod + def create(cls, analyzer: "_ConnectAnalyzer") -> Optional[OperatorType]: + """ + Create operator instance for given analyzer. + + Args: + analyzer: Analyzer instance to create operator for + + Returns: + Operator instance or None if analyzer type not supported + """ + analyzer_name = type(analyzer).__name__ + + # Try scan registry first + if analyzer_name in cls._scan_registry: + return cls._create_scan_operator(analyzer_name, analyzer) + + # Try grouping registry + if analyzer_name in cls._grouping_registry: + return cls._create_grouping_operator(analyzer_name, analyzer) + + # Try metadata registry + if analyzer_name in cls._metadata_registry: + return cls._create_metadata_operator(analyzer_name, analyzer) + + return None + + @classmethod + def _create_scan_operator( + cls, analyzer_name: str, analyzer: "_ConnectAnalyzer" + ) -> "ScanOperator": + """Create a scan operator from an analyzer.""" + operator_class = cls._scan_registry[analyzer_name] + + # Extract common attributes + column = getattr(analyzer, "column", None) + where = getattr(analyzer, "where", None) + + # Handle special cases + if analyzer_name == "Size": + return operator_class(where=where) + elif analyzer_name == "Compliance": + instance = getattr(analyzer, "instance", "compliance") + predicate = getattr(analyzer, "predicate", "") + return operator_class(instance, predicate, where=where) + elif analyzer_name == "PatternMatch": + pattern = getattr(analyzer, "pattern", "") + return operator_class(column, pattern, where=where) + elif analyzer_name == "Correlation": + column1 = getattr(analyzer, "column1", "") + column2 = getattr(analyzer, "column2", "") + return operator_class(column1, column2, where=where) + elif analyzer_name == "CountDistinct": + columns = list(getattr(analyzer, "columns", [])) + return operator_class(columns, where=where) + elif analyzer_name == "ApproxQuantile": + quantile = getattr(analyzer, "quantile", 0.5) + return operator_class(column, quantile, where=where) + else: + # Standard single-column operators + return operator_class(column, where=where) + + @classmethod + def _create_grouping_operator( + cls, analyzer_name: str, analyzer: "_ConnectAnalyzer" + ) -> "GroupingOperator": + """Create a grouping operator from an analyzer.""" + operator_class = cls._grouping_registry[analyzer_name] + + where = getattr(analyzer, "where", None) + + if analyzer_name == "Entropy": + column = getattr(analyzer, "column", "") + return operator_class(column, where=where) + elif analyzer_name == "Histogram": + column = getattr(analyzer, "column", "") + max_bins = getattr(analyzer, "max_detail_bins", 100) or 100 + return operator_class(column, max_bins, where=where) + else: + # Multi-column operators (Distinctness, Uniqueness, etc.) 
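+            # Normalize: the analyzer may expose a single column name or a
+            # list of columns; the operator always receives a list.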
+ columns = getattr(analyzer, "columns", []) + if isinstance(columns, str): + columns = [columns] + return operator_class(list(columns), where=where) + + @classmethod + def _create_metadata_operator( + cls, analyzer_name: str, analyzer: "_ConnectAnalyzer" + ) -> "DataTypeOperator": + """Create a metadata operator from an analyzer.""" + operator_class = cls._metadata_registry[analyzer_name] + + # Extract common attributes + column = getattr(analyzer, "column", None) + where = getattr(analyzer, "where", None) + + # Standard single-column metadata operators + return operator_class(column, where=where) + + @classmethod + def is_scan_operator(cls, analyzer: "_ConnectAnalyzer") -> bool: + """Check if analyzer maps to a scan operator.""" + return type(analyzer).__name__ in cls._scan_registry + + @classmethod + def is_grouping_operator(cls, analyzer: "_ConnectAnalyzer") -> bool: + """Check if analyzer maps to a grouping operator.""" + return type(analyzer).__name__ in cls._grouping_registry + + @classmethod + def is_metadata_operator(cls, analyzer: "_ConnectAnalyzer") -> bool: + """Check if analyzer maps to a metadata operator.""" + return type(analyzer).__name__ in cls._metadata_registry + + @classmethod + def is_supported(cls, analyzer: "_ConnectAnalyzer") -> bool: + """Check if analyzer type is supported by the factory.""" + analyzer_name = type(analyzer).__name__ + return ( + analyzer_name in cls._scan_registry + or analyzer_name in cls._grouping_registry + or analyzer_name in cls._metadata_registry + ) + + +# Register all scan operators +OperatorFactory._scan_registry = { + "Size": SizeOperator, + "Completeness": CompletenessOperator, + "Mean": MeanOperator, + "Sum": SumOperator, + "Minimum": MinimumOperator, + "Maximum": MaximumOperator, + "StandardDeviation": StandardDeviationOperator, + "MaxLength": MaxLengthOperator, + "MinLength": MinLengthOperator, + "PatternMatch": PatternMatchOperator, + "Compliance": ComplianceOperator, + "Correlation": CorrelationOperator, + "CountDistinct": CountDistinctOperator, + "ApproxCountDistinct": ApproxCountDistinctOperator, + "ApproxQuantile": ApproxQuantileOperator, +} + +# Register all grouping operators +OperatorFactory._grouping_registry = { + "Distinctness": DistinctnessOperator, + "Uniqueness": UniquenessOperator, + "UniqueValueRatio": UniqueValueRatioOperator, + "Entropy": EntropyOperator, + "MutualInformation": MutualInformationOperator, + "Histogram": HistogramOperator, +} + +# Register all metadata operators +OperatorFactory._metadata_registry = { + "DataType": DataTypeOperator, +} + + +__all__ = [ + "OperatorFactory", +] diff --git a/pydeequ/engines/operators/grouping_batcher.py b/pydeequ/engines/operators/grouping_batcher.py new file mode 100644 index 0000000..db75641 --- /dev/null +++ b/pydeequ/engines/operators/grouping_batcher.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- +""" +Grouping operator batching for DuckDB performance optimization. + +This module provides functionality to batch grouping operators that share +identical CTEs (same columns and where clause) into single queries. 
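+
+Typical usage (illustrative; ``execute_fn`` is any callable that runs SQL and
+returns a pandas DataFrame, e.g. ``lambda q: con.execute(q).df()`` on DuckDB):
+
+    batcher = GroupingOperatorBatcher(operators)
+    results = batcher.execute_batched("my_table", execute_fn)
+    # Operators that cannot be fused (e.g. Entropy, Histogram) still run
+    # one query each via their own build_query()/extract_result():
+    leftovers = batcher.get_unbatchable_operators()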
+ +Key insight: DistinctnessOperator, UniquenessOperator, and UniqueValueRatioOperator +all use the same frequency CTE: + WITH freq AS (SELECT cols, COUNT(*) AS cnt FROM table GROUP BY cols) + +By fusing operators with matching (columns, where_clause), we can: +- Compute all metrics in a single query +- Reduce the number of table scans +- Improve performance by 20-40% for checks with multiple grouping operators +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple + +from pydeequ.engines import MetricResult +from pydeequ.engines.operators.grouping_operators import ( + DistinctnessOperator, + UniquenessOperator, + UniqueValueRatioOperator, +) + +if TYPE_CHECKING: + import pandas as pd + from pydeequ.engines.operators.base import GroupingOperator + + +# Operators that can be batched together (share same freq CTE structure) +BATCHABLE_OPERATORS = (DistinctnessOperator, UniquenessOperator, UniqueValueRatioOperator) + + +class GroupingOperatorBatcher: + """ + Batches grouping operators with matching (columns, where) into single queries. + + This class analyzes grouping operators and fuses compatible ones to reduce + the number of SQL queries executed. + """ + + def __init__(self, operators: List["GroupingOperator"]): + """ + Initialize the batcher with operators to analyze. + + Args: + operators: List of grouping operators + """ + self.operators = operators + self._batched_groups: Dict[Tuple, List["GroupingOperator"]] = {} + self._unbatchable: List["GroupingOperator"] = [] + self._analyze() + + def _get_batch_key(self, operator: "GroupingOperator") -> Optional[Tuple]: + """ + Get the batch key for an operator (columns tuple + where clause). + + Returns None if the operator cannot be batched. + """ + if not isinstance(operator, BATCHABLE_OPERATORS): + return None + + # Create key from (columns tuple, where clause) + cols = tuple(operator.columns) + where = operator.where or "" + return (cols, where) + + def _analyze(self) -> None: + """Analyze operators and group batchable ones by key.""" + for operator in self.operators: + key = self._get_batch_key(operator) + if key is None: + self._unbatchable.append(operator) + else: + if key not in self._batched_groups: + self._batched_groups[key] = [] + self._batched_groups[key].append(operator) + + def get_unbatchable_operators(self) -> List["GroupingOperator"]: + """Return operators that cannot be batched.""" + return self._unbatchable + + def get_batch_count(self) -> int: + """Return the number of batched query groups.""" + return len(self._batched_groups) + + def execute_batched( + self, + table: str, + execute_fn, + ) -> List[MetricResult]: + """ + Execute batched queries and return results. + + Args: + table: Name of the table to query + execute_fn: Function to execute SQL and return DataFrame + + Returns: + List of MetricResult objects for all batched operators + """ + results: List[MetricResult] = [] + + for (cols, where), operators in self._batched_groups.items(): + # Build fused query + query = self._build_fused_query(table, cols, where, operators) + + # Execute query + df = execute_fn(query) + + # Extract results for each operator + for operator in operators: + result = self._extract_result(df, operator) + results.append(result) + + return results + + def _build_fused_query( + self, + table: str, + cols: Tuple[str, ...], + where: str, + operators: List["GroupingOperator"], + ) -> str: + """ + Build a fused query that computes metrics for all operators in a batch. 
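+
+        For example, a batch needing all three metrics on column ``c`` of
+        table ``t`` yields SQL of the shape:
+
+            WITH freq AS (SELECT c, COUNT(*) AS cnt FROM t GROUP BY c)
+            SELECT
+                SUM(cnt) AS total_count,
+                COUNT(*) AS distinct_count,
+                SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count
+            FROM freq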
+ + Args: + table: Name of the table to query + cols: Tuple of column names + where: WHERE clause (empty string if none) + operators: List of operators to fuse + + Returns: + SQL query string + """ + cols_str = ", ".join(cols) + where_clause = f"WHERE {where}" if where else "" + + # Determine which metrics we need to compute + needs_distinct = any(isinstance(op, DistinctnessOperator) for op in operators) + needs_unique = any(isinstance(op, UniquenessOperator) for op in operators) + needs_unique_ratio = any(isinstance(op, UniqueValueRatioOperator) for op in operators) + + # Build SELECT clause + select_parts = [] + + # Always need total_count for Distinctness and Uniqueness + if needs_distinct or needs_unique: + select_parts.append("SUM(cnt) AS total_count") + + # distinct_count needed for Distinctness and UniqueValueRatio + if needs_distinct or needs_unique_ratio: + select_parts.append("COUNT(*) AS distinct_count") + + # unique_count needed for Uniqueness and UniqueValueRatio + if needs_unique or needs_unique_ratio: + select_parts.append("SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count") + + return f""" + WITH freq AS ( + SELECT {cols_str}, COUNT(*) AS cnt + FROM {table} + {where_clause} + GROUP BY {cols_str} + ) + SELECT {', '.join(select_parts)} + FROM freq + """ + + def _extract_result( + self, + df: "pd.DataFrame", + operator: "GroupingOperator", + ) -> MetricResult: + """ + Extract the metric result for a specific operator from the fused query result. + + Args: + df: DataFrame containing fused query results + operator: The operator to extract result for + + Returns: + MetricResult for the operator + """ + if isinstance(operator, DistinctnessOperator): + distinct = operator.safe_float(df, "distinct_count") or 0 + total = operator.safe_float(df, "total_count") or 0 + value = distinct / total if total > 0 else 0.0 + + elif isinstance(operator, UniquenessOperator): + unique = operator.safe_float(df, "unique_count") or 0 + total = operator.safe_float(df, "total_count") or 0 + value = unique / total if total > 0 else 0.0 + + elif isinstance(operator, UniqueValueRatioOperator): + distinct = operator.safe_float(df, "distinct_count") or 0 + unique = operator.safe_float(df, "unique_count") or 0 + value = unique / distinct if distinct > 0 else 0.0 + + else: + # Fallback (shouldn't happen for batchable operators) + value = 0.0 + + return MetricResult( + name=operator.metric_name, + instance=operator.instance, + entity=operator.entity, + value=value, + ) + + +__all__ = [ + "GroupingOperatorBatcher", + "BATCHABLE_OPERATORS", +] diff --git a/pydeequ/engines/operators/grouping_operators.py b/pydeequ/engines/operators/grouping_operators.py new file mode 100644 index 0000000..702cfac --- /dev/null +++ b/pydeequ/engines/operators/grouping_operators.py @@ -0,0 +1,334 @@ +# -*- coding: utf-8 -*- +""" +Grouping operator implementations. + +Grouping operators require GROUP BY queries and cannot be batched with +scan operators. They require separate query execution. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Optional + +from pydeequ.engines import MetricResult +from pydeequ.engines.operators.base import GroupingOperator + +if TYPE_CHECKING: + import pandas as pd + + +class DistinctnessOperator(GroupingOperator): + """ + Computes distinctness = count_distinct / total_count. + + Distinctness measures what fraction of the total rows have + unique value combinations in the specified columns. 
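+
+    For example, a column with values ['a', 'a', 'b'] has 2 distinct
+    values across 3 rows, giving a distinctness of 2/3.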
+ """ + + def __init__(self, columns: List[str], where: Optional[str] = None): + super().__init__(columns, where) + + @property + def metric_name(self) -> str: + return "Distinctness" + + def get_grouping_columns(self) -> List[str]: + return self.columns + + def build_query(self, table: str) -> str: + cols_str = ", ".join(self.columns) + where_clause = self.get_where_clause() + + return f""" + WITH freq AS ( + SELECT {cols_str}, COUNT(*) AS cnt + FROM {table} + {where_clause} + GROUP BY {cols_str} + ) + SELECT + COUNT(*) AS distinct_count, + SUM(cnt) AS total_count + FROM freq + """ + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + distinct = self.safe_float(df, "distinct_count") or 0 + total = self.safe_float(df, "total_count") or 0 + value = distinct / total if total > 0 else 0.0 + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class UniquenessOperator(GroupingOperator): + """ + Computes uniqueness = count_unique (count=1) / total_count. + + Uniqueness measures what fraction of the total rows have + value combinations that appear exactly once. + """ + + def __init__(self, columns: List[str], where: Optional[str] = None): + super().__init__(columns, where) + + @property + def metric_name(self) -> str: + return "Uniqueness" + + def get_grouping_columns(self) -> List[str]: + return self.columns + + def build_query(self, table: str) -> str: + cols_str = ", ".join(self.columns) + where_clause = self.get_where_clause() + + return f""" + WITH freq AS ( + SELECT {cols_str}, COUNT(*) AS cnt + FROM {table} + {where_clause} + GROUP BY {cols_str} + ) + SELECT + SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count, + SUM(cnt) AS total_count + FROM freq + """ + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + unique = self.safe_float(df, "unique_count") or 0 + total = self.safe_float(df, "total_count") or 0 + value = unique / total if total > 0 else 0.0 + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class UniqueValueRatioOperator(GroupingOperator): + """ + Computes unique value ratio = count_unique / count_distinct. + + This measures what fraction of distinct value combinations + appear exactly once. + """ + + def __init__(self, columns: List[str], where: Optional[str] = None): + super().__init__(columns, where) + + @property + def metric_name(self) -> str: + return "UniqueValueRatio" + + def get_grouping_columns(self) -> List[str]: + return self.columns + + def build_query(self, table: str) -> str: + cols_str = ", ".join(self.columns) + where_clause = self.get_where_clause() + + return f""" + WITH freq AS ( + SELECT {cols_str}, COUNT(*) AS cnt + FROM {table} + {where_clause} + GROUP BY {cols_str} + ) + SELECT + COUNT(*) AS distinct_count, + SUM(CASE WHEN cnt = 1 THEN 1 ELSE 0 END) AS unique_count + FROM freq + """ + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + distinct = self.safe_float(df, "distinct_count") or 0 + unique = self.safe_float(df, "unique_count") or 0 + value = unique / distinct if distinct > 0 else 0.0 + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class EntropyOperator(GroupingOperator): + """ + Computes entropy = -SUM(p * ln(p)). + + Entropy measures the information content of a column's + value distribution. Uses natural log (nats) for Spark parity. 
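+
+    For example, a column evenly split between two values has entropy
+    -2 * (0.5 * ln(0.5)) = ln(2) ≈ 0.693.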
+ """ + + def __init__(self, column: str, where: Optional[str] = None): + super().__init__([column], where) + self.column = column + + @property + def metric_name(self) -> str: + return "Entropy" + + @property + def instance(self) -> str: + return self.column + + @property + def entity(self) -> str: + return "Column" + + def get_grouping_columns(self) -> List[str]: + return [self.column] + + def build_query(self, table: str) -> str: + where_clause = self.get_where_clause() + + return f""" + WITH freq AS ( + SELECT {self.column}, COUNT(*) AS cnt + FROM {table} + {where_clause} + GROUP BY {self.column} + ), + total AS ( + SELECT SUM(cnt) AS total_cnt FROM freq + ) + SELECT + -SUM((cnt * 1.0 / total_cnt) * LN(cnt * 1.0 / total_cnt)) AS entropy + FROM freq, total + WHERE cnt > 0 + """ + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, "entropy") + if value is None: + value = 0.0 + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class MutualInformationOperator(GroupingOperator): + """Computes mutual information between two columns.""" + + def __init__(self, columns: List[str], where: Optional[str] = None): + if len(columns) != 2: + raise ValueError("MutualInformation requires exactly 2 columns") + super().__init__(columns, where) + + @property + def metric_name(self) -> str: + return "MutualInformation" + + def get_grouping_columns(self) -> List[str]: + return self.columns + + def build_query(self, table: str) -> str: + col1, col2 = self.columns + where_clause = self.get_where_clause() + + return f""" + WITH + joint AS ( + SELECT {col1}, {col2}, COUNT(*) AS cnt + FROM {table} + {where_clause} + GROUP BY {col1}, {col2} + ), + total AS (SELECT SUM(cnt) AS n FROM joint), + marginal1 AS ( + SELECT {col1}, SUM(cnt) AS cnt1 FROM joint GROUP BY {col1} + ), + marginal2 AS ( + SELECT {col2}, SUM(cnt) AS cnt2 FROM joint GROUP BY {col2} + ) + SELECT SUM( + (j.cnt * 1.0 / t.n) * + LN((j.cnt * 1.0 / t.n) / ((m1.cnt1 * 1.0 / t.n) * (m2.cnt2 * 1.0 / t.n))) + ) AS mi + FROM joint j, total t, marginal1 m1, marginal2 m2 + WHERE j.{col1} = m1.{col1} AND j.{col2} = m2.{col2} AND j.cnt > 0 + """ + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, "mi") + if value is None: + value = 0.0 + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class HistogramOperator(GroupingOperator): + """ + Computes histogram of value distribution in a column. + + Returns a JSON-serialized dict mapping values to their counts. 
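+
+    For example, a column with rows ['a', 'a', 'b'] produces the value
+    '{"a": 2, "b": 1}'.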
+ """ + + def __init__(self, column: str, max_bins: int = 100, where: Optional[str] = None): + super().__init__([column], where) + self.column = column + self.max_bins = max_bins + + @property + def metric_name(self) -> str: + return "Histogram" + + @property + def instance(self) -> str: + return self.column + + @property + def entity(self) -> str: + return "Column" + + def get_grouping_columns(self) -> List[str]: + return [self.column] + + def build_query(self, table: str) -> str: + where_clause = self.get_where_clause() + if where_clause: + where_clause += f" AND {self.column} IS NOT NULL" + else: + where_clause = f"WHERE {self.column} IS NOT NULL" + + return f""" + SELECT {self.column} as value, COUNT(*) as count + FROM {table} + {where_clause} + GROUP BY {self.column} + ORDER BY count DESC + LIMIT {self.max_bins} + """ + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + import json + histogram = {str(row["value"]): int(row["count"]) for _, row in df.iterrows()} + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=json.dumps(histogram), + ) + + +__all__ = [ + "DistinctnessOperator", + "UniquenessOperator", + "UniqueValueRatioOperator", + "EntropyOperator", + "MutualInformationOperator", + "HistogramOperator", +] diff --git a/pydeequ/engines/operators/metadata_operators.py b/pydeequ/engines/operators/metadata_operators.py new file mode 100644 index 0000000..963a2a1 --- /dev/null +++ b/pydeequ/engines/operators/metadata_operators.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- +""" +Metadata operator implementations. + +Metadata operators compute metrics using schema information rather than +SQL aggregations. They are useful for type inference and schema-based +analysis that don't require scanning data. +""" + +from __future__ import annotations + +import json +from typing import Dict, Optional + +from pydeequ.engines import MetricResult +from pydeequ.engines.operators.mixins import ( + ColumnAliasMixin, + SafeExtractMixin, +) + +class DataTypeOperator(SafeExtractMixin, ColumnAliasMixin): + """ + Computes data type information from schema metadata. + + Unlike scan operators that require SQL queries, DataTypeOperator + infers type information directly from the table schema, making it + more efficient for type analysis. + + Type Mapping: + DuckDB types are mapped to Deequ-compatible type categories: + - Integral: TINYINT, SMALLINT, INTEGER, BIGINT, HUGEINT, etc. + - Fractional: FLOAT, DOUBLE, REAL, DECIMAL, NUMERIC + - String: VARCHAR, CHAR, TEXT, etc. 
+ - Boolean: BOOLEAN, BOOL + + Attributes: + column: Column name to analyze + where: Optional WHERE clause (ignored for schema-based inference) + """ + + # Mapping from DuckDB SQL types to Deequ type categories + TYPE_MAPPING: Dict[str, str] = { + # Integral types + "TINYINT": "Integral", + "SMALLINT": "Integral", + "INTEGER": "Integral", + "BIGINT": "Integral", + "HUGEINT": "Integral", + "UTINYINT": "Integral", + "USMALLINT": "Integral", + "UINTEGER": "Integral", + "UBIGINT": "Integral", + "INT": "Integral", + "INT1": "Integral", + "INT2": "Integral", + "INT4": "Integral", + "INT8": "Integral", + # Fractional types + "FLOAT": "Fractional", + "DOUBLE": "Fractional", + "REAL": "Fractional", + "DECIMAL": "Fractional", + "NUMERIC": "Fractional", + "FLOAT4": "Fractional", + "FLOAT8": "Fractional", + # String types + "VARCHAR": "String", + "CHAR": "String", + "BPCHAR": "String", + "TEXT": "String", + "STRING": "String", + # Boolean types + "BOOLEAN": "Boolean", + "BOOL": "Boolean", + # Date/Time types (mapped to String for Deequ compatibility) + "DATE": "String", + "TIMESTAMP": "String", + "TIME": "String", + "TIMESTAMPTZ": "String", + "TIMETZ": "String", + "INTERVAL": "String", + # Binary types + "BLOB": "Unknown", + "BYTEA": "Unknown", + # UUID + "UUID": "String", + } + + def __init__(self, column: str, where: Optional[str] = None): + """ + Initialize DataTypeOperator. + + Args: + column: Column name to analyze + where: Optional WHERE clause (ignored for schema-based inference) + """ + self.column = column + self.where = where # Stored but ignored for schema-based type inference + + @property + def metric_name(self) -> str: + """Return the metric name for this operator.""" + return "DataType" + + @property + def instance(self) -> str: + """Return the instance identifier for this operator.""" + return self.column + + @property + def entity(self) -> str: + """Return the entity type for this operator.""" + return "Column" + + def compute_from_schema(self, schema: Dict[str, str]) -> MetricResult: + """ + Compute data type information from schema. + + Args: + schema: Dictionary mapping column names to SQL type names + + Returns: + MetricResult with JSON-encoded type information + """ + sql_type = schema.get(self.column, "Unknown") + mapped_type = self.TYPE_MAPPING.get(sql_type, "Unknown") + + # Build result compatible with Spark Deequ format + result = { + "dtype": sql_type, + "mapped_type": mapped_type, + "type_counts": {mapped_type: 1.0} # DuckDB has strict typing + } + + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=json.dumps(result), + ) + + +__all__ = [ + "DataTypeOperator", +] diff --git a/pydeequ/engines/operators/mixins.py b/pydeequ/engines/operators/mixins.py new file mode 100644 index 0000000..c68dfad --- /dev/null +++ b/pydeequ/engines/operators/mixins.py @@ -0,0 +1,188 @@ +# -*- coding: utf-8 -*- +""" +Mixin classes providing shared behaviors for SQL operators. + +These mixins provide reusable functionality that eliminates code duplication +across operator implementations. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + import pandas as pd + + +class WhereClauseMixin: + """ + Provides WHERE clause wrapping for conditional aggregations. + + This mixin eliminates the repeated if/else pattern for handling + optional WHERE clauses in aggregations. Expects the class to + have a `where` attribute. 
+    """
+
+    where: Optional[str]
+
+    def wrap_agg_with_where(self, agg_func: str, column: str) -> str:
+        """
+        Wrap an aggregation with optional WHERE filter using CASE WHEN.
+
+        Args:
+            agg_func: SQL aggregation function name (e.g., "AVG", "SUM", "MIN")
+            column: Column name to aggregate
+
+        Returns:
+            SQL expression with conditional aggregation if where is set,
+            otherwise standard aggregation
+
+        Example:
+            >>> op = SomeOperator(column="price", where="status='active'")
+            >>> op.wrap_agg_with_where("AVG", "price")
+            "AVG(CASE WHEN status='active' THEN price ELSE NULL END)"
+        """
+        if self.where:
+            return f"{agg_func}(CASE WHEN {self.where} THEN {column} ELSE NULL END)"
+        return f"{agg_func}({column})"
+
+    def wrap_count_with_where(self, condition: str = "1") -> str:
+        """
+        Wrap COUNT with optional WHERE filter.
+
+        Args:
+            condition: SQL condition to count (default "1" counts all rows)
+
+        Returns:
+            SQL expression for conditional count
+
+        Example:
+            >>> op = SomeOperator(where="status='active'")
+            >>> op.wrap_count_with_where()
+            "SUM(CASE WHEN status='active' THEN 1 ELSE 0 END)"
+            >>> op.wrap_count_with_where("price > 0")
+            "SUM(CASE WHEN (status='active') AND (price > 0) THEN 1 ELSE 0 END)"
+        """
+        if self.where:
+            if condition == "1":
+                return f"SUM(CASE WHEN {self.where} THEN 1 ELSE 0 END)"
+            return f"SUM(CASE WHEN ({self.where}) AND ({condition}) THEN 1 ELSE 0 END)"
+        if condition == "1":
+            return "COUNT(*)"
+        return f"SUM(CASE WHEN {condition} THEN 1 ELSE 0 END)"
+
+    def get_where_clause(self) -> str:
+        """
+        Get WHERE clause for standalone queries.
+
+        Returns:
+            "WHERE {condition}" if where is set, otherwise empty string
+        """
+        if self.where:
+            return f"WHERE {self.where}"
+        return ""
+
+
+class SafeExtractMixin:
+    """
+    Provides safe value extraction from DataFrames.
+
+    This mixin standardizes the pattern of safely extracting values
+    from query result DataFrames, handling NULL and NaN values.
+    """
+
+    def safe_float(self, df: "pd.DataFrame", column: str) -> Optional[float]:
+        """
+        Extract float value from DataFrame, handling NULL/NaN.
+
+        Args:
+            df: DataFrame containing query results
+            column: Column name to extract
+
+        Returns:
+            Float value or None if not present/invalid
+        """
+        import pandas as pd
+
+        if column not in df.columns:
+            return None
+        val = df[column].iloc[0]
+        if val is not None and not pd.isna(val):
+            return float(val)
+        return None
+
+    def safe_int(self, df: "pd.DataFrame", column: str) -> Optional[int]:
+        """
+        Extract int value from DataFrame, handling NULL/NaN.
+
+        Args:
+            df: DataFrame containing query results
+            column: Column name to extract
+
+        Returns:
+            Integer value or None if not present/invalid
+        """
+        val = self.safe_float(df, column)
+        return int(val) if val is not None else None
+
+    def safe_string(self, df: "pd.DataFrame", column: str) -> Optional[str]:
+        """
+        Extract string value from DataFrame, handling NULL/NaN.
+
+        Args:
+            df: DataFrame containing query results
+            column: Column name to extract
+
+        Returns:
+            String value or None if not present/invalid
+        """
+        import pandas as pd
+
+        if column not in df.columns:
+            return None
+        val = df[column].iloc[0]
+        if val is not None and not pd.isna(val):
+            return str(val)
+        return None
+
+
+class ColumnAliasMixin:
+    """
+    Provides consistent column alias generation.
+
+    This mixin ensures all operators generate unique and predictable
+    column aliases for their SQL expressions.
+ """ + + def make_alias(self, prefix: str, *parts: str) -> str: + """ + Generate unique column alias from prefix and parts. + + Args: + prefix: Alias prefix (e.g., "mean", "count", "sum") + *parts: Additional parts to include (e.g., column names) + + Returns: + Underscore-separated alias with sanitized column names + + Example: + >>> op = SomeOperator() + >>> op.make_alias("mean", "price") + "mean_price" + >>> op.make_alias("corr", "price", "quantity") + "corr_price_quantity" + """ + # Sanitize parts: replace dots and other special chars + sanitized = [] + for p in parts: + if p: + sanitized.append(p.replace(".", "_").replace(" ", "_")) + suffix = "_".join(sanitized) + return f"{prefix}_{suffix}" if suffix else prefix + + +__all__ = [ + "WhereClauseMixin", + "SafeExtractMixin", + "ColumnAliasMixin", +] diff --git a/pydeequ/engines/operators/profiling_operators.py b/pydeequ/engines/operators/profiling_operators.py new file mode 100644 index 0000000..9e4a892 --- /dev/null +++ b/pydeequ/engines/operators/profiling_operators.py @@ -0,0 +1,449 @@ +# -*- coding: utf-8 -*- +""" +Profiling operator implementations. + +Profiling operators compute column profile statistics including completeness, +distinct values, min, max, mean, sum, stddev, percentiles, and histograms. +""" + +from __future__ import annotations + +import json +from typing import TYPE_CHECKING, Dict, List, Optional, Set + +from pydeequ.engines import ColumnProfile +from pydeequ.engines.operators.mixins import ( + ColumnAliasMixin, + SafeExtractMixin, + WhereClauseMixin, +) + +if TYPE_CHECKING: + import pandas as pd + + +# SQL types that are considered numeric +NUMERIC_TYPES: Set[str] = { + "TINYINT", "SMALLINT", "INTEGER", "BIGINT", "HUGEINT", + "UTINYINT", "USMALLINT", "UINTEGER", "UBIGINT", + "FLOAT", "DOUBLE", "REAL", "DECIMAL", "NUMERIC", + "INT", "INT1", "INT2", "INT4", "INT8", + "FLOAT4", "FLOAT8", +} + +# SQL types that are considered string +STRING_TYPES: Set[str] = {"VARCHAR", "CHAR", "BPCHAR", "TEXT", "STRING"} + + +class ColumnProfileOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin): + """ + Computes all profile statistics for a column. + + This operator generates SQL queries to compute completeness, distinct values, + and (for numeric columns) min, max, mean, sum, stddev, and percentiles. + + Attributes: + column: Column name to profile + column_type: SQL type of the column (e.g., "INTEGER", "VARCHAR") + is_numeric: Whether the column is a numeric type + compute_percentiles: Whether to compute percentile statistics + compute_histogram: Whether to compute value histogram + histogram_limit: Maximum number of histogram buckets + where: Optional WHERE clause for filtering + """ + + def __init__( + self, + column: str, + column_type: str, + compute_percentiles: bool = True, + compute_histogram: bool = False, + histogram_limit: int = 100, + where: Optional[str] = None, + ): + """ + Initialize ColumnProfileOperator. 
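+
+        A typical flow (illustrative; assumes a DuckDB connection ``con``
+        and a table ``t`` with a numeric ``price`` column):
+            >>> op = ColumnProfileOperator("price", "DOUBLE")  # doctest: +SKIP
+            >>> base = op.extract_base_result(con.execute(op.build_base_query("t")).df())  # doctest: +SKIP
+            >>> profile = op.build_profile(base)  # doctest: +SKIP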
+ + Args: + column: Column name to profile + column_type: SQL type of the column + compute_percentiles: Whether to compute percentile statistics + compute_histogram: Whether to compute value histogram + histogram_limit: Maximum number of histogram buckets + where: Optional WHERE clause for filtering + """ + self.column = column + self.column_type = column_type + self.is_numeric = column_type in NUMERIC_TYPES + self.compute_percentiles = compute_percentiles and self.is_numeric + self.compute_histogram = compute_histogram + self.histogram_limit = histogram_limit + self.where = where + + def build_base_query(self, table: str) -> str: + """ + Build query for basic statistics. + + Args: + table: Table name to query + + Returns: + SQL query string for base statistics + """ + col = self.column + if self.is_numeric: + query = f""" + SELECT + COUNT(*) as total, + SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) as null_count, + APPROX_COUNT_DISTINCT({col}) as distinct_count, + MIN({col}) as min_val, + MAX({col}) as max_val, + AVG({col}) as mean_val, + SUM({col}) as sum_val, + STDDEV_POP({col}) as stddev_val + FROM {table} + """ + else: + query = f""" + SELECT + COUNT(*) as total, + SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) as null_count, + APPROX_COUNT_DISTINCT({col}) as distinct_count + FROM {table} + """ + return query.strip() + + def build_percentile_query(self, table: str) -> str: + """ + Build query for percentiles (separate query). + + Args: + table: Table name to query + + Returns: + SQL query string for percentile statistics + """ + col = self.column + return f""" + SELECT + QUANTILE_CONT({col}, 0.25) as p25, + QUANTILE_CONT({col}, 0.50) as p50, + QUANTILE_CONT({col}, 0.75) as p75 + FROM {table} + """.strip() + + def build_histogram_query(self, table: str) -> str: + """ + Build query for histogram (separate query). + + Args: + table: Table name to query + + Returns: + SQL query string for histogram + """ + col = self.column + return f""" + SELECT {col} as value, COUNT(*) as count + FROM {table} + WHERE {col} IS NOT NULL + GROUP BY {col} + ORDER BY count DESC + LIMIT {self.histogram_limit} + """.strip() + + def extract_base_result(self, df: "pd.DataFrame") -> Dict: + """ + Extract base statistics from query result. + + Args: + df: DataFrame containing query results + + Returns: + Dictionary of extracted statistics + """ + import pandas as pd + + total = int(df["total"].iloc[0]) + + # Handle NaN for empty datasets + null_count_raw = df["null_count"].iloc[0] + null_count = int(null_count_raw) if not pd.isna(null_count_raw) else 0 + + distinct_count_raw = df["distinct_count"].iloc[0] + distinct_count = int(distinct_count_raw) if not pd.isna(distinct_count_raw) else 0 + + completeness = (total - null_count) / total if total > 0 else 1.0 + + result = { + "total": total, + "null_count": null_count, + "distinct_count": distinct_count, + "completeness": completeness, + } + + if self.is_numeric: + result["minimum"] = self.safe_float(df, "min_val") + result["maximum"] = self.safe_float(df, "max_val") + result["mean"] = self.safe_float(df, "mean_val") + result["sum"] = self.safe_float(df, "sum_val") + result["std_dev"] = self.safe_float(df, "stddev_val") + + return result + + def extract_percentile_result(self, df: "pd.DataFrame") -> Optional[str]: + """ + Extract percentile statistics from query result. 
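+
+        The value is a JSON object keyed by quantile, e.g.
+        '{"0.25": 1.0, "0.50": 2.0, "0.75": 3.0}'.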
+ + Args: + df: DataFrame containing percentile query results + + Returns: + JSON string of percentile values or None + """ + p25 = self.safe_float(df, "p25") + p50 = self.safe_float(df, "p50") + p75 = self.safe_float(df, "p75") + + return json.dumps({ + "0.25": p25, + "0.50": p50, + "0.75": p75, + }) + + def extract_histogram_result(self, df: "pd.DataFrame") -> Optional[str]: + """ + Extract histogram from query result. + + Args: + df: DataFrame containing histogram query results + + Returns: + JSON string of histogram or None + """ + histogram = { + str(row["value"]): int(row["count"]) + for _, row in df.iterrows() + } + return json.dumps(histogram) + + def build_profile( + self, + base_stats: Dict, + percentiles: Optional[str] = None, + histogram: Optional[str] = None, + ) -> ColumnProfile: + """ + Build ColumnProfile from extracted statistics. + + Args: + base_stats: Dictionary of base statistics + percentiles: JSON string of percentile values + histogram: JSON string of histogram + + Returns: + ColumnProfile object + """ + profile = ColumnProfile( + column=self.column, + completeness=base_stats["completeness"], + approx_distinct_values=base_stats["distinct_count"], + data_type=self.column_type, + is_data_type_inferred=True, + ) + + if self.is_numeric: + profile.minimum = base_stats.get("minimum") + profile.maximum = base_stats.get("maximum") + profile.mean = base_stats.get("mean") + profile.sum = base_stats.get("sum") + profile.std_dev = base_stats.get("std_dev") + + if percentiles: + profile.approx_percentiles = percentiles + + if histogram: + profile.histogram = histogram + + return profile + + +class MultiColumnProfileOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin): + """ + Profiles multiple columns in minimal queries. + + This operator batches profile statistics for multiple columns to reduce + the number of SQL queries needed for profiling. + + Attributes: + columns: List of column names to profile + schema: Dictionary mapping column names to SQL types + numeric_columns: List of numeric column names + string_columns: List of string column names + where: Optional WHERE clause for filtering + """ + + def __init__( + self, + columns: List[str], + schema: Dict[str, str], + where: Optional[str] = None, + ): + """ + Initialize MultiColumnProfileOperator. + + Args: + columns: List of column names to profile + schema: Dictionary mapping column names to SQL types + where: Optional WHERE clause for filtering + """ + self.columns = columns + self.schema = schema + self.where = where + + # Categorize columns by type + self.numeric_columns = [c for c in columns if schema.get(c) in NUMERIC_TYPES] + self.string_columns = [c for c in columns if schema.get(c) in STRING_TYPES] + self.other_columns = [ + c for c in columns + if c not in self.numeric_columns and c not in self.string_columns + ] + + def build_completeness_query(self, table: str) -> str: + """ + Build query for completeness of all columns. + + Args: + table: Table name to query + + Returns: + SQL query string + """ + aggregations = ["COUNT(*) as total"] + for col in self.columns: + aggregations.append( + f"SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) as null_{col}" + ) + aggregations.append(f"APPROX_COUNT_DISTINCT({col}) as distinct_{col}") + + return f"SELECT {', '.join(aggregations)} FROM {table}" + + def build_numeric_stats_query(self, table: str) -> str: + """ + Build query for numeric column statistics. 
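+
+        All numeric columns are covered by a single SELECT of the shape:
+
+            SELECT MIN(a) as min_a, MAX(a) as max_a, AVG(a) as mean_a,
+                   SUM(a) as sum_a, STDDEV_POP(a) as stddev_a, ... FROM t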
+ + Args: + table: Table name to query + + Returns: + SQL query string + """ + if not self.numeric_columns: + return "" + + aggregations = [] + for col in self.numeric_columns: + aggregations.extend([ + f"MIN({col}) as min_{col}", + f"MAX({col}) as max_{col}", + f"AVG({col}) as mean_{col}", + f"SUM({col}) as sum_{col}", + f"STDDEV_POP({col}) as stddev_{col}", + ]) + + return f"SELECT {', '.join(aggregations)} FROM {table}" + + def build_percentile_query(self, table: str) -> str: + """ + Build query for percentiles of all numeric columns. + + Args: + table: Table name to query + + Returns: + SQL query string (empty if no numeric columns) + """ + if not self.numeric_columns: + return "" + + aggregations = [] + for col in self.numeric_columns: + aggregations.extend([ + f"QUANTILE_CONT({col}, 0.25) as p25_{col}", + f"QUANTILE_CONT({col}, 0.50) as p50_{col}", + f"QUANTILE_CONT({col}, 0.75) as p75_{col}", + ]) + + return f"SELECT {', '.join(aggregations)} FROM {table}" + + def extract_profiles( + self, + completeness_df: "pd.DataFrame", + numeric_df: Optional["pd.DataFrame"] = None, + percentile_df: Optional["pd.DataFrame"] = None, + ) -> List[ColumnProfile]: + """ + Extract column profiles from query results. + + Args: + completeness_df: DataFrame with completeness statistics + numeric_df: DataFrame with numeric statistics (optional) + percentile_df: DataFrame with percentile statistics (optional) + + Returns: + List of ColumnProfile objects + """ + import pandas as pd + + profiles = [] + total = int(completeness_df["total"].iloc[0]) + + for col in self.columns: + # Extract completeness stats + null_count_raw = completeness_df[f"null_{col}"].iloc[0] + null_count = int(null_count_raw) if not pd.isna(null_count_raw) else 0 + + distinct_count_raw = completeness_df[f"distinct_{col}"].iloc[0] + distinct_count = int(distinct_count_raw) if not pd.isna(distinct_count_raw) else 0 + + completeness = (total - null_count) / total if total > 0 else 1.0 + + profile = ColumnProfile( + column=col, + completeness=completeness, + approx_distinct_values=distinct_count, + data_type=self.schema.get(col, "Unknown"), + is_data_type_inferred=True, + ) + + # Add numeric stats if applicable + if col in self.numeric_columns and numeric_df is not None: + profile.minimum = self.safe_float(numeric_df, f"min_{col}") + profile.maximum = self.safe_float(numeric_df, f"max_{col}") + profile.mean = self.safe_float(numeric_df, f"mean_{col}") + profile.sum = self.safe_float(numeric_df, f"sum_{col}") + profile.std_dev = self.safe_float(numeric_df, f"stddev_{col}") + + # Add percentiles if applicable + if col in self.numeric_columns and percentile_df is not None: + p25 = self.safe_float(percentile_df, f"p25_{col}") + p50 = self.safe_float(percentile_df, f"p50_{col}") + p75 = self.safe_float(percentile_df, f"p75_{col}") + profile.approx_percentiles = json.dumps({ + "0.25": p25, + "0.50": p50, + "0.75": p75, + }) + + profiles.append(profile) + + return profiles + + +__all__ = [ + "ColumnProfileOperator", + "MultiColumnProfileOperator", + "NUMERIC_TYPES", + "STRING_TYPES", +] diff --git a/pydeequ/engines/operators/protocols.py b/pydeequ/engines/operators/protocols.py new file mode 100644 index 0000000..3d36c5a --- /dev/null +++ b/pydeequ/engines/operators/protocols.py @@ -0,0 +1,100 @@ +# -*- coding: utf-8 -*- +""" +Protocol definitions for SQL operators. + +This module defines the structural typing contracts that operators must +implement. 
Using Protocol from typing allows for duck typing while still +providing IDE support and type checking. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Protocol, runtime_checkable + +if TYPE_CHECKING: + import pandas as pd + from pydeequ.engines import MetricResult + + +@runtime_checkable +class ScanOperatorProtocol(Protocol): + """ + Contract for single-pass aggregation operators. + + Scan operators compute metrics via SQL aggregations that can be + combined into a single SELECT statement, enabling efficient + batch execution. + """ + + def get_aggregations(self) -> List[str]: + """ + Return SQL aggregation expressions. + + Returns: + List of SQL aggregation expressions with AS alias clauses, + e.g., ["AVG(col) AS mean_col", "COUNT(*) AS count_col"] + """ + ... + + def extract_result(self, df: "pd.DataFrame") -> "MetricResult": + """ + Extract metric from query result DataFrame. + + Args: + df: DataFrame containing query results with columns + matching the aliases from get_aggregations() + + Returns: + MetricResult with extracted value + """ + ... + + +@runtime_checkable +class GroupingOperatorProtocol(Protocol): + """ + Contract for operators requiring GROUP BY queries. + + Grouping operators need to compute intermediate aggregations + via GROUP BY before computing the final metric. They cannot + be batched with scan operators and require separate queries. + """ + + def get_grouping_columns(self) -> List[str]: + """ + Return columns to GROUP BY. + + Returns: + List of column names for the GROUP BY clause + """ + ... + + def build_query(self, table: str) -> str: + """ + Build complete CTE-based query. + + Args: + table: Name of the table to query + + Returns: + Complete SQL query string with CTEs as needed + """ + ... + + def extract_result(self, df: "pd.DataFrame") -> "MetricResult": + """ + Extract metric from query result DataFrame. + + Args: + df: DataFrame containing query results + + Returns: + MetricResult with extracted value + """ + ... + + +__all__ = [ + "ScanOperatorProtocol", + "GroupingOperatorProtocol", +] diff --git a/pydeequ/engines/operators/scan_operators.py b/pydeequ/engines/operators/scan_operators.py new file mode 100644 index 0000000..b9a7f22 --- /dev/null +++ b/pydeequ/engines/operators/scan_operators.py @@ -0,0 +1,502 @@ +# -*- coding: utf-8 -*- +""" +Scan operator implementations. + +Scan operators compute metrics via SQL aggregations that can be combined +into a single SELECT statement, enabling efficient batch execution. +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, List, Optional + +from pydeequ.engines import MetricResult +from pydeequ.engines.operators.base import ScanOperator +from pydeequ.engines.operators.mixins import ( + ColumnAliasMixin, + SafeExtractMixin, + WhereClauseMixin, +) + +if TYPE_CHECKING: + import pandas as pd + + +class SizeOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin): + """ + Computes the number of rows in a table. + + Unlike other scan operators, Size operates on the dataset level + rather than a specific column. 
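+
+    Example:
+        >>> SizeOperator(where="status = 'active'").get_aggregations()
+        ["SUM(CASE WHEN status = 'active' THEN 1 ELSE 0 END) AS size_value"]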
+ """ + + def __init__(self, where: Optional[str] = None): + self.where = where + self.alias = "size_value" + + @property + def metric_name(self) -> str: + return "Size" + + @property + def instance(self) -> str: + return "*" + + @property + def entity(self) -> str: + return "Dataset" + + def get_aggregations(self) -> List[str]: + if self.where: + sql = f"SUM(CASE WHEN {self.where} THEN 1 ELSE 0 END) AS {self.alias}" + else: + sql = f"COUNT(*) AS {self.alias}" + return [sql] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class CompletenessOperator(ScanOperator): + """Computes the fraction of non-null values in a column.""" + + def __init__(self, column: str, where: Optional[str] = None): + super().__init__(column, where) + self.count_alias = self.make_alias("count", column) + self.null_alias = self.make_alias("null_count", column) + + @property + def metric_name(self) -> str: + return "Completeness" + + def get_aggregations(self) -> List[str]: + count_sql = self.wrap_count_with_where("1") + null_sql = self.wrap_count_with_where(f"{self.column} IS NULL") + return [ + f"{count_sql} AS {self.count_alias}", + f"{null_sql} AS {self.null_alias}", + ] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + total = self.safe_float(df, self.count_alias) or 0 + nulls = self.safe_float(df, self.null_alias) or 0 + value = (total - nulls) / total if total > 0 else 1.0 + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class MeanOperator(ScanOperator): + """Computes the average of a numeric column.""" + + def __init__(self, column: str, where: Optional[str] = None): + super().__init__(column, where) + self.alias = self.make_alias("mean", column) + + @property + def metric_name(self) -> str: + return "Mean" + + def get_aggregations(self) -> List[str]: + agg = self.wrap_agg_with_where("AVG", self.column) + return [f"{agg} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class SumOperator(ScanOperator): + """Computes the sum of a numeric column.""" + + def __init__(self, column: str, where: Optional[str] = None): + super().__init__(column, where) + self.alias = self.make_alias("sum", column) + + @property + def metric_name(self) -> str: + return "Sum" + + def get_aggregations(self) -> List[str]: + agg = self.wrap_agg_with_where("SUM", self.column) + return [f"{agg} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class MinimumOperator(ScanOperator): + """Computes the minimum value of a numeric column.""" + + def __init__(self, column: str, where: Optional[str] = None): + super().__init__(column, where) + self.alias = self.make_alias("min", column) + + @property + def metric_name(self) -> str: + return "Minimum" + + def get_aggregations(self) -> List[str]: + agg = self.wrap_agg_with_where("MIN", self.column) + return [f"{agg} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return 
MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class MaximumOperator(ScanOperator): + """Computes the maximum value of a numeric column.""" + + def __init__(self, column: str, where: Optional[str] = None): + super().__init__(column, where) + self.alias = self.make_alias("max", column) + + @property + def metric_name(self) -> str: + return "Maximum" + + def get_aggregations(self) -> List[str]: + agg = self.wrap_agg_with_where("MAX", self.column) + return [f"{agg} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class StandardDeviationOperator(ScanOperator): + """Computes the standard deviation of a numeric column.""" + + def __init__(self, column: str, where: Optional[str] = None): + super().__init__(column, where) + self.alias = self.make_alias("stddev", column) + + @property + def metric_name(self) -> str: + return "StandardDeviation" + + def get_aggregations(self) -> List[str]: + agg = self.wrap_agg_with_where("STDDEV_POP", self.column) + return [f"{agg} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class MaxLengthOperator(ScanOperator): + """Computes the maximum string length in a column.""" + + def __init__(self, column: str, where: Optional[str] = None): + super().__init__(column, where) + self.alias = self.make_alias("max_length", column) + + @property + def metric_name(self) -> str: + return "MaxLength" + + def get_aggregations(self) -> List[str]: + if self.where: + sql = f"MAX(CASE WHEN {self.where} THEN LENGTH({self.column}) ELSE NULL END)" + else: + sql = f"MAX(LENGTH({self.column}))" + return [f"{sql} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class MinLengthOperator(ScanOperator): + """Computes the minimum string length in a column.""" + + def __init__(self, column: str, where: Optional[str] = None): + super().__init__(column, where) + self.alias = self.make_alias("min_length", column) + + @property + def metric_name(self) -> str: + return "MinLength" + + def get_aggregations(self) -> List[str]: + if self.where: + sql = f"MIN(CASE WHEN {self.where} THEN LENGTH({self.column}) ELSE NULL END)" + else: + sql = f"MIN(LENGTH({self.column}))" + return [f"{sql} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class PatternMatchOperator(ScanOperator): + """Computes the fraction of values matching a regex pattern.""" + + def __init__(self, column: str, pattern: str, where: Optional[str] = None): + super().__init__(column, where) + self.pattern = pattern.replace("'", "''") # Escape single quotes + self.count_alias = self.make_alias("count", column) + self.match_alias = self.make_alias("pattern_match", column) + + @property + def metric_name(self) -> str: + return "PatternMatch" + + def get_aggregations(self) -> List[str]: + count_sql = 
self.wrap_count_with_where("1") + match_cond = f"REGEXP_MATCHES({self.column}, '{self.pattern}')" + match_sql = self.wrap_count_with_where(match_cond) + return [ + f"{count_sql} AS {self.count_alias}", + f"{match_sql} AS {self.match_alias}", + ] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + total = self.safe_float(df, self.count_alias) or 0 + matches = self.safe_float(df, self.match_alias) or 0 + value = matches / total if total > 0 else 1.0 + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class ComplianceOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin): + """ + Computes the fraction of rows satisfying a SQL condition. + + Unlike other scan operators, Compliance operates on a predicate + rather than a specific column. + """ + + def __init__(self, instance: str, predicate: str, where: Optional[str] = None): + self.instance_name = instance + self.predicate = predicate + self.where = where + self.count_alias = "compliance_count" + self.match_alias = self.make_alias("compliance_match", instance) + + @property + def metric_name(self) -> str: + return "Compliance" + + @property + def instance(self) -> str: + return self.instance_name + + @property + def entity(self) -> str: + return "Dataset" + + def get_aggregations(self) -> List[str]: + count_sql = self.wrap_count_with_where("1") + match_sql = self.wrap_count_with_where(f"({self.predicate})") + return [ + f"{count_sql} AS {self.count_alias}", + f"{match_sql} AS {self.match_alias}", + ] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + total = self.safe_float(df, self.count_alias) or 0 + matches = self.safe_float(df, self.match_alias) or 0 + value = matches / total if total > 0 else 1.0 + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class CorrelationOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin): + """Computes Pearson correlation between two columns.""" + + def __init__(self, column1: str, column2: str, where: Optional[str] = None): + self.column1 = column1 + self.column2 = column2 + self.where = where + self.alias = self.make_alias("corr", column1, column2) + + @property + def metric_name(self) -> str: + return "Correlation" + + @property + def instance(self) -> str: + return f"{self.column1},{self.column2}" + + @property + def entity(self) -> str: + return "Multicolumn" + + def get_aggregations(self) -> List[str]: + # Note: CORR doesn't support CASE WHEN wrapping in most DBs + # For WHERE clause, the engine should apply it to the whole query + sql = f"CORR({self.column1}, {self.column2})" + return [f"{sql} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class CountDistinctOperator(WhereClauseMixin, SafeExtractMixin, ColumnAliasMixin): + """Computes the count of distinct values in column(s).""" + + def __init__(self, columns: List[str], where: Optional[str] = None): + self.columns = columns + self.where = where + self.alias = self.make_alias("count_distinct", *columns) + + @property + def metric_name(self) -> str: + return "CountDistinct" + + @property + def instance(self) -> str: + return ",".join(self.columns) + + @property + def entity(self) -> str: + return "Multicolumn" if len(self.columns) > 1 else "Column" + + def 
get_aggregations(self) -> List[str]: + cols_str = ", ".join(self.columns) + sql = f"COUNT(DISTINCT ({cols_str}))" + return [f"{sql} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class ApproxCountDistinctOperator(ScanOperator): + """Computes approximate count distinct using HyperLogLog.""" + + def __init__(self, column: str, where: Optional[str] = None): + super().__init__(column, where) + self.alias = self.make_alias("approx_count_distinct", column) + + @property + def metric_name(self) -> str: + return "ApproxCountDistinct" + + def get_aggregations(self) -> List[str]: + agg = self.wrap_agg_with_where("APPROX_COUNT_DISTINCT", self.column) + return [f"{agg} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +class ApproxQuantileOperator(ScanOperator): + """Computes approximate quantile using QUANTILE_CONT.""" + + def __init__(self, column: str, quantile: float = 0.5, where: Optional[str] = None): + super().__init__(column, where) + self.quantile = quantile + self.alias = self.make_alias("approx_quantile", column) + + @property + def metric_name(self) -> str: + return "ApproxQuantile" + + def get_aggregations(self) -> List[str]: + if self.where: + agg = f"QUANTILE_CONT(CASE WHEN {self.where} THEN {self.column} ELSE NULL END, {self.quantile})" + else: + agg = f"QUANTILE_CONT({self.column}, {self.quantile})" + return [f"{agg} AS {self.alias}"] + + def extract_result(self, df: "pd.DataFrame") -> MetricResult: + value = self.safe_float(df, self.alias) + return MetricResult( + name=self.metric_name, + instance=self.instance, + entity=self.entity, + value=value, + ) + + +__all__ = [ + "SizeOperator", + "CompletenessOperator", + "MeanOperator", + "SumOperator", + "MinimumOperator", + "MaximumOperator", + "StandardDeviationOperator", + "MaxLengthOperator", + "MinLengthOperator", + "PatternMatchOperator", + "ComplianceOperator", + "CorrelationOperator", + "CountDistinctOperator", + "ApproxCountDistinctOperator", + "ApproxQuantileOperator", +] diff --git a/pydeequ/engines/spark.py b/pydeequ/engines/spark.py new file mode 100644 index 0000000..2072bac --- /dev/null +++ b/pydeequ/engines/spark.py @@ -0,0 +1,264 @@ +# -*- coding: utf-8 -*- +""" +Spark execution engine for PyDeequ. + +This module provides a Spark-based execution engine that wraps the existing +v2 Spark Connect API, providing a unified engine interface. 
+ +Example usage: + from pyspark.sql import SparkSession + from pydeequ.engines.spark import SparkEngine + from pydeequ.v2.analyzers import Size, Completeness + + spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate() + df = spark.createDataFrame([(1, 2), (3, 4)], ["a", "b"]) + + engine = SparkEngine(spark, dataframe=df) + metrics = engine.compute_metrics([Size(), Completeness("a")]) +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Dict, List, Optional, Sequence + +import pandas as pd + +from pydeequ.engines import ( + BaseEngine, + ColumnProfile, + ConstraintResult, + ConstraintSuggestion, + ConstraintStatus, + CheckStatus, + MetricResult, +) + +if TYPE_CHECKING: + from pyspark.sql import DataFrame, SparkSession + from pydeequ.v2.analyzers import _ConnectAnalyzer + from pydeequ.v2.checks import Check + + +class SparkEngine(BaseEngine): + """ + Spark-based execution engine. + + This engine wraps the existing v2 Spark Connect API to provide + a unified engine interface. It delegates execution to the + Deequ plugin running on the Spark cluster. + + Attributes: + spark: SparkSession + table: Optional table name + dataframe: Optional DataFrame to analyze + """ + + def __init__( + self, + spark: "SparkSession", + table: Optional[str] = None, + dataframe: Optional["DataFrame"] = None, + ): + """ + Create a new SparkEngine. + + Args: + spark: SparkSession (Spark Connect) + table: Optional table name to analyze + dataframe: Optional DataFrame to analyze (preferred over table) + """ + self.spark = spark + self.table = table + self._dataframe = dataframe + + def _get_dataframe(self) -> "DataFrame": + """Get the DataFrame to analyze.""" + if self._dataframe is not None: + return self._dataframe + if self.table: + return self.spark.table(self.table) + raise ValueError("Either dataframe or table must be provided") + + def get_schema(self) -> Dict[str, str]: + """Get the schema of the data source.""" + df = self._get_dataframe() + return {field.name: str(field.dataType) for field in df.schema.fields} + + def compute_metrics( + self, analyzers: Sequence["_ConnectAnalyzer"] + ) -> List[MetricResult]: + """ + Compute metrics using the Spark Connect Deequ plugin. + + Args: + analyzers: Sequence of analyzers to compute metrics for + + Returns: + List of MetricResult objects + """ + from pydeequ.v2.verification import AnalysisRunner + + df = self._get_dataframe() + + # Build and run the analysis + runner = AnalysisRunner(self.spark).onData(df) + for analyzer in analyzers: + runner = runner.addAnalyzer(analyzer) + + result_df = runner.run() + + # Convert Spark DataFrame result to MetricResult objects + results: List[MetricResult] = [] + for row in result_df.collect(): + results.append(MetricResult( + name=row["name"], + instance=row["instance"], + entity=row["entity"], + value=float(row["value"]) if row["value"] is not None else None, + )) + + return results + + def run_checks(self, checks: Sequence["Check"]) -> List[ConstraintResult]: + """ + Run verification checks using the Spark Connect Deequ plugin. 
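+
+        Example (illustrative; ``check`` is any Check built with the v2 API):
+            >>> results = engine.run_checks([check])  # doctest: +SKIP
+            >>> failures = [r for r in results if r.constraint_status != "Success"]  # doctest: +SKIP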
+ + Args: + checks: Sequence of Check objects to evaluate + + Returns: + List of ConstraintResult objects + """ + from pydeequ.v2.verification import VerificationSuite + + df = self._get_dataframe() + + # Build and run the verification + suite = VerificationSuite(self.spark).onData(df) + for check in checks: + suite = suite.addCheck(check) + + result_df = suite.run() + + # Convert Spark DataFrame result to ConstraintResult objects + results: List[ConstraintResult] = [] + for row in result_df.collect(): + results.append(ConstraintResult( + check_description=row["check"], + check_level=row["check_level"], + check_status=row["check_status"], + constraint=row["constraint"], + constraint_status=row["constraint_status"], + constraint_message=row["constraint_message"], + )) + + return results + + def profile_columns( + self, + columns: Optional[Sequence[str]] = None, + low_cardinality_threshold: int = 0, + ) -> List[ColumnProfile]: + """ + Profile columns using the Spark Connect Deequ plugin. + + Args: + columns: Optional list of columns to profile + low_cardinality_threshold: Threshold for histogram computation + + Returns: + List of ColumnProfile objects + """ + from pydeequ.v2.profiles import ColumnProfilerRunner + + df = self._get_dataframe() + + # Build and run the profiler + runner = ColumnProfilerRunner(self.spark).onData(df) + + if columns: + runner = runner.restrictToColumns(columns) + + if low_cardinality_threshold > 0: + runner = runner.withLowCardinalityHistogramThreshold(low_cardinality_threshold) + + result_df = runner.run() + + # Convert Spark DataFrame result to ColumnProfile objects + profiles: List[ColumnProfile] = [] + for row in result_df.collect(): + profiles.append(ColumnProfile( + column=row["column"], + completeness=float(row["completeness"]) if row["completeness"] is not None else 0.0, + approx_distinct_values=int(row["approx_distinct_values"]) if row["approx_distinct_values"] is not None else 0, + data_type=row["data_type"] if row["data_type"] else "Unknown", + is_data_type_inferred=bool(row["is_data_type_inferred"]) if "is_data_type_inferred" in row else True, + type_counts=row["type_counts"] if "type_counts" in row else None, + histogram=row["histogram"] if "histogram" in row else None, + mean=float(row["mean"]) if "mean" in row and row["mean"] is not None else None, + minimum=float(row["minimum"]) if "minimum" in row and row["minimum"] is not None else None, + maximum=float(row["maximum"]) if "maximum" in row and row["maximum"] is not None else None, + sum=float(row["sum"]) if "sum" in row and row["sum"] is not None else None, + std_dev=float(row["std_dev"]) if "std_dev" in row and row["std_dev"] is not None else None, + )) + + return profiles + + def suggest_constraints( + self, + columns: Optional[Sequence[str]] = None, + rules: Optional[Sequence[str]] = None, + ) -> List[ConstraintSuggestion]: + """ + Suggest constraints using the Spark Connect Deequ plugin. 
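+
+        A minimal illustrative sketch:
+
+            suggestions = engine.suggest_constraints(rules=["DEFAULT"])
+            for s in suggestions:
+                print(s.code_for_constraint)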
+ + Args: + columns: Optional list of columns to analyze + rules: Optional list of rule sets to apply + + Returns: + List of ConstraintSuggestion objects + """ + from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules + + df = self._get_dataframe() + + # Build and run the suggestion runner + runner = ConstraintSuggestionRunner(self.spark).onData(df) + + if columns: + runner = runner.restrictToColumns(columns) + + # Map rule strings to Rules enum (accept both strings and enum values) + if rules: + rule_map = { + "DEFAULT": Rules.DEFAULT, + "STRING": Rules.STRING, + "NUMERICAL": Rules.NUMERICAL, + "COMMON": Rules.COMMON, + "EXTENDED": Rules.EXTENDED, + } + for rule in rules: + # Accept both Rules enum and string values + if isinstance(rule, Rules): + runner = runner.addConstraintRules(rule) + elif rule in rule_map: + runner = runner.addConstraintRules(rule_map[rule]) + else: + runner = runner.addConstraintRules(Rules.DEFAULT) + + result_df = runner.run() + + # Convert Spark DataFrame result to ConstraintSuggestion objects + suggestions: List[ConstraintSuggestion] = [] + for row in result_df.collect(): + suggestions.append(ConstraintSuggestion( + column_name=row["column_name"], + constraint_name=row["constraint_name"], + current_value=row["current_value"] if "current_value" in row else None, + description=row["description"], + suggesting_rule=row["suggesting_rule"], + code_for_constraint=row["code_for_constraint"], + )) + + return suggestions diff --git a/pydeequ/engines/suggestions/__init__.py b/pydeequ/engines/suggestions/__init__.py new file mode 100644 index 0000000..6303692 --- /dev/null +++ b/pydeequ/engines/suggestions/__init__.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +""" +Constraint suggestion module. + +This module provides a modular, rule-based system for suggesting data quality +constraints based on column profiles. 
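+
+Rules are registered with a central RuleRegistry when the package is
+imported and selected by rule set name at run time (see registry.py
+and runner.py).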
+ +Architecture: + rules.py - SuggestionRule base class and 10 rule implementations + registry.py - RuleRegistry for organizing rules by rule set + runner.py - SuggestionRunner for orchestrating rule execution + +Available Rule Sets: + - DEFAULT: Basic rules (completeness, non-negative, categorical) + - NUMERICAL: Rules for numeric columns (min, max, mean) + - STRING: Rules for string columns (min/max length) + - COMMON: General rules (uniqueness) + - EXTENDED: All rules combined + +Example usage: + from pydeequ.engines.suggestions import SuggestionRunner + + # Run default rules + runner = SuggestionRunner(rule_sets=["DEFAULT"]) + suggestions = runner.run(profiles, execute_fn=engine._execute_query, table="my_table") + + # Run multiple rule sets + runner = SuggestionRunner(rule_sets=["DEFAULT", "NUMERICAL", "STRING"]) + suggestions = runner.run(profiles, execute_fn=engine._execute_query, table="my_table") +""" + +from pydeequ.engines.suggestions.registry import RuleRegistry +from pydeequ.engines.suggestions.rules import ( + SuggestionRule, + CompleteIfCompleteRule, + RetainCompletenessRule, + NonNegativeNumbersRule, + CategoricalRangeRule, + HasMinRule, + HasMaxRule, + HasMeanRule, + HasMinLengthRule, + HasMaxLengthRule, + UniqueIfApproximatelyUniqueRule, +) +from pydeequ.engines.suggestions.runner import SuggestionRunner + + +__all__ = [ + # Registry + "RuleRegistry", + # Runner + "SuggestionRunner", + # Base class + "SuggestionRule", + # Rules + "CompleteIfCompleteRule", + "RetainCompletenessRule", + "NonNegativeNumbersRule", + "CategoricalRangeRule", + "HasMinRule", + "HasMaxRule", + "HasMeanRule", + "HasMinLengthRule", + "HasMaxLengthRule", + "UniqueIfApproximatelyUniqueRule", +] diff --git a/pydeequ/engines/suggestions/registry.py b/pydeequ/engines/suggestions/registry.py new file mode 100644 index 0000000..f762845 --- /dev/null +++ b/pydeequ/engines/suggestions/registry.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +""" +Suggestion rule registry. + +This module provides a registry for suggestion rules, allowing rules to be +organized by rule sets (DEFAULT, NUMERICAL, STRING, COMMON, EXTENDED). +""" + +from __future__ import annotations + +from typing import List + +from pydeequ.engines.suggestions.rules import ( + SuggestionRule, + CompleteIfCompleteRule, + RetainCompletenessRule, + NonNegativeNumbersRule, + CategoricalRangeRule, + HasMinRule, + HasMaxRule, + HasMeanRule, + HasMinLengthRule, + HasMaxLengthRule, + UniqueIfApproximatelyUniqueRule, +) + + +class RuleRegistry: + """ + Registry of suggestion rules by rule set. + + Provides centralized management of suggestion rules and retrieval + by rule set names. + """ + + _rules: List[SuggestionRule] = [] + + @classmethod + def register(cls, rule: SuggestionRule) -> None: + """ + Register a suggestion rule. + + Args: + rule: SuggestionRule instance to register + """ + cls._rules.append(rule) + + @classmethod + def get_rules_for_sets(cls, rule_sets: List[str]) -> List[SuggestionRule]: + """ + Get all rules that belong to any of the specified rule sets. + + Args: + rule_sets: List of rule set names (e.g., ["DEFAULT", "NUMERICAL"]) + + Returns: + List of rules that belong to any of the specified sets + """ + return [r for r in cls._rules if any(s in r.rule_sets for s in rule_sets)] + + @classmethod + def get_all_rules(cls) -> List[SuggestionRule]: + """ + Get all registered rules. 
+ + Returns: + List of all registered rules + """ + return cls._rules.copy() + + @classmethod + def clear(cls) -> None: + """Clear all registered rules (mainly for testing).""" + cls._rules = [] + + +# Auto-register all default rules +def _register_default_rules() -> None: + """Register all built-in suggestion rules.""" + RuleRegistry.register(CompleteIfCompleteRule()) + RuleRegistry.register(RetainCompletenessRule()) + RuleRegistry.register(NonNegativeNumbersRule()) + RuleRegistry.register(CategoricalRangeRule()) + RuleRegistry.register(HasMinRule()) + RuleRegistry.register(HasMaxRule()) + RuleRegistry.register(HasMeanRule()) + RuleRegistry.register(HasMinLengthRule()) + RuleRegistry.register(HasMaxLengthRule()) + RuleRegistry.register(UniqueIfApproximatelyUniqueRule()) + + +# Register rules on module load +_register_default_rules() + + +__all__ = [ + "RuleRegistry", +] diff --git a/pydeequ/engines/suggestions/rules.py b/pydeequ/engines/suggestions/rules.py new file mode 100644 index 0000000..40a7ebf --- /dev/null +++ b/pydeequ/engines/suggestions/rules.py @@ -0,0 +1,380 @@ +# -*- coding: utf-8 -*- +""" +Suggestion rule implementations. + +This module provides the base class and implementations for constraint +suggestion rules. Each rule analyzes column profiles and generates +appropriate constraint suggestions. +""" + +from __future__ import annotations + +import json +from abc import ABC, abstractmethod +from typing import TYPE_CHECKING, List, Optional, Set + +if TYPE_CHECKING: + from pydeequ.engines import ColumnProfile, ConstraintSuggestion + + +# SQL types that are considered string +STRING_TYPES: Set[str] = {"VARCHAR", "CHAR", "BPCHAR", "TEXT", "STRING"} + + +class SuggestionRule(ABC): + """ + Base class for constraint suggestion rules. + + Each rule examines column profiles and generates appropriate + constraint suggestions based on data characteristics. 
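+
+    A minimal sketch of a custom rule is shown below; the rule name and
+    the std_dev-based condition are purely illustrative:
+
+        class HasStdDevRule(SuggestionRule):
+            @property
+            def name(self) -> str:
+                return "HasStdDev"
+
+            @property
+            def rule_sets(self) -> List[str]:
+                return ["EXTENDED"]
+
+            def applies_to(self, profile):
+                return profile.std_dev is not None
+
+            def generate(self, profile):
+                ...  # build and return a ConstraintSuggestion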
+ """ + + @property + @abstractmethod + def name(self) -> str: + """Rule name for identification.""" + pass + + @property + @abstractmethod + def rule_sets(self) -> List[str]: + """Which rule sets this rule belongs to (DEFAULT, NUMERICAL, etc).""" + pass + + @abstractmethod + def applies_to(self, profile: "ColumnProfile") -> bool: + """Whether this rule applies to the given column profile.""" + pass + + @abstractmethod + def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]: + """Generate a suggestion if applicable, or None.""" + pass + + +class CompleteIfCompleteRule(SuggestionRule): + """Suggests isComplete() constraint for fully complete columns.""" + + @property + def name(self) -> str: + return "CompleteIfComplete" + + @property + def rule_sets(self) -> List[str]: + return ["DEFAULT", "EXTENDED"] + + def applies_to(self, profile: "ColumnProfile") -> bool: + return profile.completeness == 1.0 + + def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]: + from pydeequ.engines import ConstraintSuggestion + + return ConstraintSuggestion( + column_name=profile.column, + constraint_name="Completeness", + current_value="1.0", + description=f"'{profile.column}' is complete", + suggesting_rule=self.name, + code_for_constraint=f'.isComplete("{profile.column}")', + ) + + +class RetainCompletenessRule(SuggestionRule): + """Suggests hasCompleteness() constraint for highly complete columns.""" + + THRESHOLD = 0.9 # Minimum completeness to suggest retaining + + @property + def name(self) -> str: + return "RetainCompleteness" + + @property + def rule_sets(self) -> List[str]: + return ["DEFAULT", "EXTENDED"] + + def applies_to(self, profile: "ColumnProfile") -> bool: + # Apply only if not fully complete but >= threshold + return self.THRESHOLD <= profile.completeness < 1.0 + + def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]: + from pydeequ.engines import ConstraintSuggestion + + return ConstraintSuggestion( + column_name=profile.column, + constraint_name="Completeness", + current_value=f"{profile.completeness:.4f}", + description=f"'{profile.column}' has completeness {profile.completeness:.2%}", + suggesting_rule=self.name, + code_for_constraint=f'.hasCompleteness("{profile.column}", gte({profile.completeness:.2f}))', + ) + + +class NonNegativeNumbersRule(SuggestionRule): + """Suggests isNonNegative() constraint for columns with no negative values.""" + + @property + def name(self) -> str: + return "NonNegativeNumbers" + + @property + def rule_sets(self) -> List[str]: + return ["DEFAULT", "EXTENDED"] + + def applies_to(self, profile: "ColumnProfile") -> bool: + return profile.minimum is not None and profile.minimum >= 0 + + def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]: + from pydeequ.engines import ConstraintSuggestion + + return ConstraintSuggestion( + column_name=profile.column, + constraint_name="NonNegative", + current_value=f"{profile.minimum:.2f}", + description=f"'{profile.column}' has no negative values", + suggesting_rule=self.name, + code_for_constraint=f'.isNonNegative("{profile.column}")', + ) + + +class CategoricalRangeRule(SuggestionRule): + """Suggests isContainedIn() constraint for low cardinality categorical columns.""" + + MAX_CATEGORIES = 10 # Maximum distinct values to suggest containment + + @property + def name(self) -> str: + return "CategoricalRangeRule" + + @property + def rule_sets(self) -> List[str]: + return ["DEFAULT", "EXTENDED"] + + def 
applies_to(self, profile: "ColumnProfile") -> bool:
+        if not profile.histogram:
+            return False
+        hist = json.loads(profile.histogram)
+        return len(hist) <= self.MAX_CATEGORIES
+
+    def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+        from pydeequ.engines import ConstraintSuggestion
+
+        hist = json.loads(profile.histogram)
+        values = list(hist.keys())
+        values_str = ", ".join([f'"{v}"' for v in values])
+
+        return ConstraintSuggestion(
+            column_name=profile.column,
+            constraint_name="Compliance",
+            current_value=f"{len(values)} distinct values",
+            description=f"'{profile.column}' has categorical values",
+            suggesting_rule=self.name,
+            code_for_constraint=f'.isContainedIn("{profile.column}", [{values_str}])',
+        )
+
+
+class HasMinRule(SuggestionRule):
+    """Suggests hasMin() constraint for numeric columns."""
+
+    @property
+    def name(self) -> str:
+        return "HasMin"
+
+    @property
+    def rule_sets(self) -> List[str]:
+        return ["NUMERICAL", "EXTENDED"]
+
+    def applies_to(self, profile: "ColumnProfile") -> bool:
+        return profile.minimum is not None and profile.mean is not None
+
+    def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+        from pydeequ.engines import ConstraintSuggestion
+
+        return ConstraintSuggestion(
+            column_name=profile.column,
+            constraint_name="Minimum",
+            current_value=f"{profile.minimum:.2f}",
+            description=f"'{profile.column}' has minimum {profile.minimum:.2f}",
+            suggesting_rule=self.name,
+            code_for_constraint=f'.hasMin("{profile.column}", gte({profile.minimum:.2f}))',
+        )
+
+
+class HasMaxRule(SuggestionRule):
+    """Suggests hasMax() constraint for numeric columns."""
+
+    @property
+    def name(self) -> str:
+        return "HasMax"
+
+    @property
+    def rule_sets(self) -> List[str]:
+        return ["NUMERICAL", "EXTENDED"]
+
+    def applies_to(self, profile: "ColumnProfile") -> bool:
+        return profile.maximum is not None and profile.mean is not None
+
+    def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+        from pydeequ.engines import ConstraintSuggestion
+
+        return ConstraintSuggestion(
+            column_name=profile.column,
+            constraint_name="Maximum",
+            current_value=f"{profile.maximum:.2f}",
+            description=f"'{profile.column}' has maximum {profile.maximum:.2f}",
+            suggesting_rule=self.name,
+            code_for_constraint=f'.hasMax("{profile.column}", lte({profile.maximum:.2f}))',
+        )
+
+
+class HasMeanRule(SuggestionRule):
+    """Suggests hasMean() constraint for numeric columns."""
+
+    @property
+    def name(self) -> str:
+        return "HasMean"
+
+    @property
+    def rule_sets(self) -> List[str]:
+        return ["NUMERICAL", "EXTENDED"]
+
+    def applies_to(self, profile: "ColumnProfile") -> bool:
+        return profile.mean is not None
+
+    def generate(self, profile: "ColumnProfile") -> Optional["ConstraintSuggestion"]:
+        from pydeequ.engines import ConstraintSuggestion
+
+        # Sort the bounds so the interval stays valid for negative means,
+        # where mean * 0.9 would otherwise exceed mean * 1.1.
+        lower, upper = sorted((profile.mean * 0.9, profile.mean * 1.1))
+
+        return ConstraintSuggestion(
+            column_name=profile.column,
+            constraint_name="Mean",
+            current_value=f"{profile.mean:.2f}",
+            description=f"'{profile.column}' has mean {profile.mean:.2f}",
+            suggesting_rule=self.name,
+            code_for_constraint=f'.hasMean("{profile.column}", between({lower:.2f}, {upper:.2f}))',
+        )
+
+
+class HasMinLengthRule(SuggestionRule):
+    """Suggests hasMinLength() constraint for string columns."""
+
+    @property
+    def name(self) -> str:
+        return "HasMinLength"
+
+    @property
+    def rule_sets(self) -> List[str]:
+        return ["STRING", "EXTENDED"]
+
+    def applies_to(self, profile: "ColumnProfile") -> 
bool: + return profile.data_type in STRING_TYPES + + def generate( + self, + profile: "ColumnProfile", + min_length: Optional[int] = None, + ) -> Optional["ConstraintSuggestion"]: + from pydeequ.engines import ConstraintSuggestion + + if min_length is None or min_length <= 0: + return None + + return ConstraintSuggestion( + column_name=profile.column, + constraint_name="MinLength", + current_value=str(min_length), + description=f"'{profile.column}' has minimum length {min_length}", + suggesting_rule=self.name, + code_for_constraint=f'.hasMinLength("{profile.column}", gte({min_length}))', + ) + + +class HasMaxLengthRule(SuggestionRule): + """Suggests hasMaxLength() constraint for string columns.""" + + @property + def name(self) -> str: + return "HasMaxLength" + + @property + def rule_sets(self) -> List[str]: + return ["STRING", "EXTENDED"] + + def applies_to(self, profile: "ColumnProfile") -> bool: + return profile.data_type in STRING_TYPES + + def generate( + self, + profile: "ColumnProfile", + max_length: Optional[int] = None, + ) -> Optional["ConstraintSuggestion"]: + from pydeequ.engines import ConstraintSuggestion + + if max_length is None or max_length <= 0: + return None + + return ConstraintSuggestion( + column_name=profile.column, + constraint_name="MaxLength", + current_value=str(max_length), + description=f"'{profile.column}' has maximum length {max_length}", + suggesting_rule=self.name, + code_for_constraint=f'.hasMaxLength("{profile.column}", lte({max_length}))', + ) + + +class UniqueIfApproximatelyUniqueRule(SuggestionRule): + """Suggests isUnique() constraint for approximately unique columns.""" + + UNIQUENESS_THRESHOLD = 0.99 # Minimum distinct ratio to consider unique + + @property + def name(self) -> str: + return "UniqueIfApproximatelyUnique" + + @property + def rule_sets(self) -> List[str]: + return ["COMMON", "EXTENDED"] + + def applies_to(self, profile: "ColumnProfile") -> bool: + # Need total row count to determine uniqueness + return True # Check is done in generate with row_count + + def generate( + self, + profile: "ColumnProfile", + row_count: Optional[int] = None, + ) -> Optional["ConstraintSuggestion"]: + from pydeequ.engines import ConstraintSuggestion + + if row_count is None or row_count <= 0: + return None + + if profile.approx_distinct_values >= row_count * self.UNIQUENESS_THRESHOLD: + return ConstraintSuggestion( + column_name=profile.column, + constraint_name="Uniqueness", + current_value="~1.0", + description=f"'{profile.column}' appears to be unique", + suggesting_rule=self.name, + code_for_constraint=f'.isUnique("{profile.column}")', + ) + return None + + +# Export all rule classes +__all__ = [ + "SuggestionRule", + "CompleteIfCompleteRule", + "RetainCompletenessRule", + "NonNegativeNumbersRule", + "CategoricalRangeRule", + "HasMinRule", + "HasMaxRule", + "HasMeanRule", + "HasMinLengthRule", + "HasMaxLengthRule", + "UniqueIfApproximatelyUniqueRule", + "STRING_TYPES", +] diff --git a/pydeequ/engines/suggestions/runner.py b/pydeequ/engines/suggestions/runner.py new file mode 100644 index 0000000..f9b1479 --- /dev/null +++ b/pydeequ/engines/suggestions/runner.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- +""" +Suggestion runner for executing rules against column profiles. + +This module provides the SuggestionRunner class that orchestrates +running suggestion rules against column profiles. 
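+
+A minimal sketch (mirroring the package-level example; the engine's
+_execute_query helper and the table name "t" are assumptions):
+
+    runner = SuggestionRunner(rule_sets=["DEFAULT", "NUMERICAL"])
+    suggestions = runner.run(profiles, execute_fn=engine._execute_query, table="t")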
+""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Callable, List, Optional + +from pydeequ.engines.suggestions.registry import RuleRegistry +from pydeequ.engines.suggestions.rules import ( + HasMinLengthRule, + HasMaxLengthRule, + UniqueIfApproximatelyUniqueRule, + STRING_TYPES, +) + +if TYPE_CHECKING: + import pandas as pd + from pydeequ.engines import ColumnProfile, ConstraintSuggestion + + +class SuggestionRunner: + """ + Runs suggestion rules against column profiles. + + The runner retrieves rules from the registry based on the specified + rule sets and executes them against each column profile. + + Attributes: + rule_sets: List of rule set names to apply + """ + + def __init__(self, rule_sets: Optional[List[str]] = None): + """ + Initialize SuggestionRunner. + + Args: + rule_sets: List of rule set names (e.g., ["DEFAULT", "NUMERICAL"]). + If None, defaults to ["DEFAULT"]. + """ + self.rule_sets = rule_sets or ["DEFAULT"] + + def run( + self, + profiles: List["ColumnProfile"], + execute_fn: Optional[Callable[[str], "pd.DataFrame"]] = None, + table: Optional[str] = None, + row_count: Optional[int] = None, + ) -> List["ConstraintSuggestion"]: + """ + Run suggestion rules against column profiles. + + Args: + profiles: List of column profiles to analyze + execute_fn: Optional function to execute SQL queries (for rules + that need additional data like string lengths) + table: Optional table name for queries + row_count: Optional total row count for uniqueness checks + + Returns: + List of constraint suggestions + """ + rules = RuleRegistry.get_rules_for_sets(self.rule_sets) + suggestions: List["ConstraintSuggestion"] = [] + + for profile in profiles: + for rule in rules: + suggestion = self._apply_rule( + rule, profile, execute_fn, table, row_count + ) + if suggestion: + suggestions.append(suggestion) + + return suggestions + + def _apply_rule( + self, + rule, + profile: "ColumnProfile", + execute_fn: Optional[Callable[[str], "pd.DataFrame"]], + table: Optional[str], + row_count: Optional[int], + ) -> Optional["ConstraintSuggestion"]: + """ + Apply a single rule to a profile. + + Some rules require special handling (e.g., string length rules need + to query the database, uniqueness rules need row count). + + Args: + rule: The rule to apply + profile: Column profile to analyze + execute_fn: Optional SQL execution function + table: Optional table name + row_count: Optional row count + + Returns: + Constraint suggestion or None + """ + # Handle HasMinLengthRule - needs string length from query + if isinstance(rule, HasMinLengthRule): + return self._handle_string_length_rule( + rule, profile, execute_fn, table, is_min=True + ) + + # Handle HasMaxLengthRule - needs string length from query + if isinstance(rule, HasMaxLengthRule): + return self._handle_string_length_rule( + rule, profile, execute_fn, table, is_min=False + ) + + # Handle UniqueIfApproximatelyUniqueRule - needs row count + if isinstance(rule, UniqueIfApproximatelyUniqueRule): + if rule.applies_to(profile): + return rule.generate(profile, row_count=row_count) + return None + + # Standard rule handling + if rule.applies_to(profile): + return rule.generate(profile) + return None + + def _handle_string_length_rule( + self, + rule, + profile: "ColumnProfile", + execute_fn: Optional[Callable[[str], "pd.DataFrame"]], + table: Optional[str], + is_min: bool, + ) -> Optional["ConstraintSuggestion"]: + """ + Handle string length rules that need database queries. 
+ + Args: + rule: HasMinLengthRule or HasMaxLengthRule + profile: Column profile + execute_fn: SQL execution function + table: Table name + is_min: True for min length, False for max length + + Returns: + Constraint suggestion or None + """ + import pandas as pd + + if not rule.applies_to(profile): + return None + + if execute_fn is None or table is None: + return None + + col = profile.column + agg_func = "MIN" if is_min else "MAX" + query = f"SELECT {agg_func}(LENGTH({col})) as len FROM {table} WHERE {col} IS NOT NULL" + + try: + result = execute_fn(query) + length = result["len"].iloc[0] + if length is not None and not pd.isna(length): + length = int(length) + if length > 0: + if is_min: + return rule.generate(profile, min_length=length) + else: + return rule.generate(profile, max_length=length) + except Exception: + pass + + return None + + +__all__ = [ + "SuggestionRunner", +] diff --git a/pydeequ/v2/analyzers.py b/pydeequ/v2/analyzers.py index 53a979c..b796f12 100644 --- a/pydeequ/v2/analyzers.py +++ b/pydeequ/v2/analyzers.py @@ -25,7 +25,7 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import List, Optional, Sequence, Union +from typing import Optional, Sequence, Union from pydeequ.v2.proto import deequ_connect_pb2 as proto diff --git a/pydeequ/v2/checks.py b/pydeequ/v2/checks.py index 2a86ba8..c788f6c 100644 --- a/pydeequ/v2/checks.py +++ b/pydeequ/v2/checks.py @@ -19,7 +19,7 @@ from __future__ import annotations from enum import Enum -from typing import List, Optional, Sequence, Union +from typing import List, Optional, Sequence from pydeequ.v2.predicates import Predicate, is_one from pydeequ.v2.proto import deequ_connect_pb2 as proto diff --git a/pydeequ/v2/predicates.py b/pydeequ/v2/predicates.py index adaf23d..3f2c06c 100644 --- a/pydeequ/v2/predicates.py +++ b/pydeequ/v2/predicates.py @@ -40,6 +40,26 @@ def to_proto(self) -> proto.PredicateMessage: def __repr__(self) -> str: raise NotImplementedError + @abstractmethod + def to_callable(self): + """ + Convert predicate to a callable function. 
+ + Returns: + A callable that takes a value and returns True/False + + Example: + pred = gte(0.95) + func = pred.to_callable() + assert func(0.96) == True + assert func(0.90) == False + """ + raise NotImplementedError + + def __call__(self, value: float) -> bool: + """Allow predicates to be called directly like functions.""" + return self.to_callable()(value) + @dataclass class Comparison(Predicate): @@ -62,6 +82,26 @@ def __repr__(self) -> str: } return f"x {op_map.get(self.operator, '?')} {self.value}" + def to_callable(self): + """Convert to a callable function.""" + op = self.operator + target = self.value + + if op == proto.PredicateMessage.Operator.EQ: + return lambda x: abs(x - target) < 1e-9 if x is not None else False + elif op == proto.PredicateMessage.Operator.NE: + return lambda x: abs(x - target) >= 1e-9 if x is not None else False + elif op == proto.PredicateMessage.Operator.GT: + return lambda x: x > target if x is not None else False + elif op == proto.PredicateMessage.Operator.GE: + return lambda x: x >= target if x is not None else False + elif op == proto.PredicateMessage.Operator.LT: + return lambda x: x < target if x is not None else False + elif op == proto.PredicateMessage.Operator.LE: + return lambda x: x <= target if x is not None else False + else: + return lambda x: False + @dataclass class Between(Predicate): @@ -80,6 +120,12 @@ def to_proto(self) -> proto.PredicateMessage: def __repr__(self) -> str: return f"{self.lower} <= x <= {self.upper}" + def to_callable(self): + """Convert to a callable function.""" + lower = self.lower + upper = self.upper + return lambda x: lower <= x <= upper if x is not None else False + # ============================================================================ # Factory Functions - Convenient way to create predicates diff --git a/pydeequ/v2/profiles.py b/pydeequ/v2/profiles.py index 97f71ef..1e2a373 100644 --- a/pydeequ/v2/profiles.py +++ b/pydeequ/v2/profiles.py @@ -1,12 +1,25 @@ # -*- coding: utf-8 -*- """ -Column Profiler for Deequ Spark Connect. +Column Profiler for PyDeequ v2. This module provides column profiling capabilities that analyze DataFrame columns to compute statistics like completeness, data type distribution, and optional KLL sketch-based quantile estimation. -Example usage: +Example usage with DuckDB: + import duckdb + import pydeequ + from pydeequ.v2.profiles import ColumnProfilerRunner + + con = duckdb.connect() + con.execute("CREATE TABLE test AS SELECT 1 as id, 'foo' as name") + engine = pydeequ.connect(con, table="test") + + profiles = (ColumnProfilerRunner() + .on_engine(engine) + .run()) + +Example usage with Spark Connect: from pyspark.sql import SparkSession from pydeequ.v2.profiles import ColumnProfilerRunner, KLLParameters @@ -30,8 +43,9 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, Optional, Sequence +from typing import TYPE_CHECKING, Dict, List, Optional, Sequence +import pandas as pd from google.protobuf import any_pb2 from pydeequ.v2.proto import deequ_connect_pb2 as proto @@ -39,6 +53,7 @@ if TYPE_CHECKING: from pyspark.sql import DataFrame, SparkSession + from pydeequ.engines import BaseEngine @dataclass @@ -74,9 +89,15 @@ class ColumnProfilerRunner: ColumnProfilerRunner analyzes DataFrame columns to compute statistics including completeness, data type, distinct values, and optionally - KLL sketches for numeric columns. + KLL sketches for numeric columns. Supports both engine-based and Spark-based execution. 
- Example: + Example (Engine-based with DuckDB): + profiles = (ColumnProfilerRunner() + .on_engine(engine) + .restrictToColumns(["col1", "col2"]) + .run()) + + Example (Spark Connect): profiles = (ColumnProfilerRunner(spark) .onData(df) .restrictToColumns(["col1", "col2"]) @@ -84,27 +105,49 @@ class ColumnProfilerRunner: .run()) """ - def __init__(self, spark: "SparkSession"): + def __init__(self, spark: Optional["SparkSession"] = None): """ Create a new ColumnProfilerRunner. Args: - spark: SparkSession (can be either local or Spark Connect) + spark: Optional SparkSession for Spark Connect mode. + Not required for engine-based execution. """ self._spark = spark def onData(self, df: "DataFrame") -> "ColumnProfilerRunBuilder": """ - Specify the DataFrame to profile. + Specify the DataFrame to profile (Spark mode). Args: df: DataFrame to profile Returns: ColumnProfilerRunBuilder for method chaining + + Raises: + ValueError: If SparkSession was not provided in constructor """ + if self._spark is None: + raise ValueError( + "SparkSession required for onData(). " + "Use ColumnProfilerRunner(spark).onData(df) or " + "ColumnProfilerRunner().on_engine(engine) for engine-based execution." + ) return ColumnProfilerRunBuilder(self._spark, df) + def on_engine(self, engine: "BaseEngine") -> "EngineColumnProfilerRunBuilder": + """ + Specify the engine to run profiling on (Engine mode). + + Args: + engine: BaseEngine instance (e.g., DuckDBEngine) + + Returns: + EngineColumnProfilerRunBuilder for method chaining + """ + return EngineColumnProfilerRunBuilder(engine) + class ColumnProfilerRunBuilder: """ @@ -274,9 +317,80 @@ def _run_via_spark_connect( return dataframe_from_plan(plan, self._spark) +class EngineColumnProfilerRunBuilder: + """ + Builder for configuring and executing engine-based column profiling. + + This class works with DuckDB and other SQL backends via the engine abstraction. + """ + + def __init__(self, engine: "BaseEngine"): + """ + Create a new EngineColumnProfilerRunBuilder. + + Args: + engine: BaseEngine instance (e.g., DuckDBEngine) + """ + self._engine = engine + self._restrict_to_columns: Optional[Sequence[str]] = None + self._low_cardinality_threshold: int = 0 + + def restrictToColumns(self, columns: Sequence[str]) -> "EngineColumnProfilerRunBuilder": + """ + Restrict profiling to specific columns. + + Args: + columns: List of column names to profile + + Returns: + self for method chaining + """ + self._restrict_to_columns = columns + return self + + def withLowCardinalityHistogramThreshold( + self, threshold: int + ) -> "EngineColumnProfilerRunBuilder": + """ + Set threshold for computing histograms. + + Columns with distinct values <= threshold will have histograms computed. + + Args: + threshold: Maximum distinct values for histogram computation + + Returns: + self for method chaining + """ + self._low_cardinality_threshold = threshold + return self + + def run(self) -> pd.DataFrame: + """ + Execute the profiling and return results as a pandas DataFrame. 
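+
+        A minimal illustrative sketch (the column names are assumptions):
+
+            profiles_df = (ColumnProfilerRunner()
+                .on_engine(engine)
+                .restrictToColumns(["id", "name"])
+                .run())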
+ + The result DataFrame contains columns: + - column: Column name + - completeness: Non-null ratio (0.0-1.0) + - approx_distinct_values: Approximate cardinality + - data_type: Detected/provided type + - mean, minimum, maximum, sum, std_dev: Numeric stats (null for non-numeric) + - histogram: JSON string of histogram (or null) + + Returns: + pandas DataFrame with profiling results (one row per column) + """ + profiles = self._engine.profile_columns( + columns=self._restrict_to_columns, + low_cardinality_threshold=self._low_cardinality_threshold, + ) + return self._engine.profiles_to_dataframe(profiles) + + # Export all public symbols __all__ = [ "ColumnProfilerRunner", "ColumnProfilerRunBuilder", + "EngineColumnProfilerRunBuilder", "KLLParameters", ] diff --git a/pydeequ/v2/suggestions.py b/pydeequ/v2/suggestions.py index b89b07b..5d6c371 100644 --- a/pydeequ/v2/suggestions.py +++ b/pydeequ/v2/suggestions.py @@ -1,12 +1,26 @@ # -*- coding: utf-8 -*- """ -Constraint Suggestions for Deequ Spark Connect. +Constraint Suggestions for PyDeequ v2. This module provides automatic constraint suggestion capabilities that analyze DataFrame columns and suggest appropriate data quality constraints based on the data characteristics. -Example usage: +Example usage with DuckDB: + import duckdb + import pydeequ + from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules + + con = duckdb.connect() + con.execute("CREATE TABLE test AS SELECT 1 as id, 'foo' as name") + engine = pydeequ.connect(con, table="test") + + suggestions = (ConstraintSuggestionRunner() + .on_engine(engine) + .addConstraintRules(Rules.DEFAULT) + .run()) + +Example usage with Spark Connect: from pyspark.sql import SparkSession from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules @@ -33,6 +47,7 @@ from enum import Enum from typing import TYPE_CHECKING, Dict, List, Optional, Sequence +import pandas as pd from google.protobuf import any_pb2 from pydeequ.v2.profiles import KLLParameters @@ -41,6 +56,7 @@ if TYPE_CHECKING: from pyspark.sql import DataFrame, SparkSession + from pydeequ.engines import BaseEngine class Rules(Enum): @@ -79,35 +95,64 @@ class ConstraintSuggestionRunner: ConstraintSuggestionRunner analyzes DataFrame columns to suggest appropriate data quality constraints based on the data characteristics. + Supports both engine-based and Spark-based execution. - Example: + Example (Engine-based with DuckDB): + suggestions = (ConstraintSuggestionRunner() + .on_engine(engine) + .addConstraintRules(Rules.DEFAULT) + .run()) + + Example (Spark Connect): suggestions = (ConstraintSuggestionRunner(spark) .onData(df) .addConstraintRules(Rules.DEFAULT) .run()) """ - def __init__(self, spark: "SparkSession"): + def __init__(self, spark: Optional["SparkSession"] = None): """ Create a new ConstraintSuggestionRunner. Args: - spark: SparkSession (can be either local or Spark Connect) + spark: Optional SparkSession for Spark Connect mode. + Not required for engine-based execution. """ self._spark = spark def onData(self, df: "DataFrame") -> "ConstraintSuggestionRunBuilder": """ - Specify the DataFrame to analyze. + Specify the DataFrame to analyze (Spark mode). Args: df: DataFrame to analyze for constraint suggestions Returns: ConstraintSuggestionRunBuilder for method chaining + + Raises: + ValueError: If SparkSession was not provided in constructor """ + if self._spark is None: + raise ValueError( + "SparkSession required for onData(). 
" + "Use ConstraintSuggestionRunner(spark).onData(df) or " + "ConstraintSuggestionRunner().on_engine(engine) for engine-based execution." + ) return ConstraintSuggestionRunBuilder(self._spark, df) + def on_engine(self, engine: "BaseEngine") -> "EngineConstraintSuggestionRunBuilder": + """ + Specify the engine to run suggestion analysis on (Engine mode). + + Args: + engine: BaseEngine instance (e.g., DuckDBEngine) + + Returns: + EngineConstraintSuggestionRunBuilder for method chaining + """ + return EngineConstraintSuggestionRunBuilder(engine) + class ConstraintSuggestionRunBuilder: """ @@ -332,9 +377,92 @@ def _run_via_spark_connect( return dataframe_from_plan(plan, self._spark) +class EngineConstraintSuggestionRunBuilder: + """ + Builder for configuring and executing engine-based constraint suggestions. + + This class works with DuckDB and other SQL backends via the engine abstraction. + """ + + def __init__(self, engine: "BaseEngine"): + """ + Create a new EngineConstraintSuggestionRunBuilder. + + Args: + engine: BaseEngine instance (e.g., DuckDBEngine) + """ + self._engine = engine + self._rules: List[Rules] = [] + self._restrict_to_columns: Optional[Sequence[str]] = None + + def addConstraintRules(self, rules: Rules) -> "EngineConstraintSuggestionRunBuilder": + """ + Add a constraint rule set. + + Can be called multiple times to add multiple rule sets. + + Args: + rules: Rules enum value specifying which rules to use + + Returns: + self for method chaining + """ + self._rules.append(rules) + return self + + def restrictToColumns( + self, columns: Sequence[str] + ) -> "EngineConstraintSuggestionRunBuilder": + """ + Restrict suggestions to specific columns. + + Args: + columns: List of column names to analyze + + Returns: + self for method chaining + """ + self._restrict_to_columns = columns + return self + + def run(self) -> pd.DataFrame: + """ + Execute the suggestion analysis and return results as a pandas DataFrame. + + The result DataFrame contains columns: + - column_name: Column the constraint applies to + - constraint_name: Type of constraint (e.g., "Completeness", "IsIn") + - current_value: Current metric value that triggered suggestion + - description: Human-readable description + - suggesting_rule: Rule that generated this suggestion + - code_for_constraint: Python code snippet for the constraint + + Returns: + pandas DataFrame with constraint suggestions + + Raises: + ValueError: If no rules have been added + """ + if not self._rules: + raise ValueError( + "At least one constraint rule set must be added. " + "Use .addConstraintRules(Rules.DEFAULT) to add rules." + ) + + # Convert Rules enum to string list + rule_strs = [r.value for r in self._rules] + + suggestions = self._engine.suggest_constraints( + columns=self._restrict_to_columns, + rules=rule_strs, + ) + return self._engine.suggestions_to_dataframe(suggestions) + + # Export all public symbols __all__ = [ "ConstraintSuggestionRunner", "ConstraintSuggestionRunBuilder", + "EngineConstraintSuggestionRunBuilder", "Rules", ] diff --git a/pydeequ/v2/verification.py b/pydeequ/v2/verification.py index c6d8d2f..10e43d0 100644 --- a/pydeequ/v2/verification.py +++ b/pydeequ/v2/verification.py @@ -1,12 +1,31 @@ # -*- coding: utf-8 -*- """ -VerificationSuite for Deequ Spark Connect. +VerificationSuite for PyDeequ v2. -This module provides the main entry point for running data quality checks -via Spark Connect. It builds protobuf messages and sends them to the -server-side Deequ plugin. 
+This module provides the main entry point for running data quality checks. +It supports two execution modes: -Example usage: +1. Engine-based (DuckDB, etc.) - uses pydeequ.connect() +2. Spark Connect - uses SparkSession with Deequ plugin + +Example usage with DuckDB: + import duckdb + import pydeequ + from pydeequ.v2.verification import VerificationSuite, AnalysisRunner + from pydeequ.v2.checks import Check, CheckLevel + from pydeequ.v2.predicates import gte, eq + + con = duckdb.connect() + con.execute("CREATE TABLE test AS SELECT 1 as id, 'foo@bar.com' as email") + engine = pydeequ.connect(con, table="test") + + check = (Check(CheckLevel.Error, "Data quality check") + .isComplete("id") + .hasCompleteness("email", gte(0.95))) + + result = VerificationSuite().on_engine(engine).addCheck(check).run() + +Example usage with Spark Connect: from pyspark.sql import SparkSession from pydeequ.v2.verification import VerificationSuite from pydeequ.v2.checks import Check, CheckLevel @@ -28,8 +47,9 @@ from __future__ import annotations -from typing import TYPE_CHECKING, List +from typing import TYPE_CHECKING, List, Optional +import pandas as pd from google.protobuf import any_pb2 from pydeequ.v2.analyzers import _ConnectAnalyzer @@ -39,18 +59,27 @@ if TYPE_CHECKING: from pyspark.sql import DataFrame, SparkSession + from pydeequ.engines import BaseEngine class VerificationSuite: """ Main entry point for running data quality verification. - VerificationSuite allows you to define checks and analyzers to run - on a DataFrame. When run() is called, the checks and analyzers are - serialized to protobuf and sent to the Spark Connect server where - the Deequ plugin executes them. + VerificationSuite allows you to define checks and analyzers to run. + It supports two execution modes: + + 1. Engine-based: Use on_engine() for DuckDB and other SQL backends + 2. Spark-based: Use onData() for Spark Connect + + Example (Engine-based): + suite = VerificationSuite() + result = (suite + .on_engine(engine) + .addCheck(check) + .run()) - Example: + Example (Spark-based): suite = VerificationSuite(spark) result = (suite .onData(df) @@ -58,27 +87,66 @@ class VerificationSuite: .run()) """ - def __init__(self, spark: "SparkSession"): + def __init__(self, spark: Optional["SparkSession"] = None): """ Create a new VerificationSuite. Args: - spark: SparkSession connected via Spark Connect + spark: Optional SparkSession for Spark Connect mode. + Not required for engine-based execution. """ self._spark = spark def onData(self, df: "DataFrame") -> "VerificationRunBuilder": """ - Specify the DataFrame to run verification on. + Specify the DataFrame to run verification on (Spark mode). Args: df: DataFrame to verify Returns: VerificationRunBuilder for method chaining + + Raises: + ValueError: If SparkSession was not provided in constructor """ + if self._spark is None: + raise ValueError( + "SparkSession required for onData(). " + "Use VerificationSuite(spark).onData(df) or " + "VerificationSuite().on_engine(engine) for engine-based execution." + ) return VerificationRunBuilder(self._spark, df) + def on_engine(self, engine: "BaseEngine") -> "EngineVerificationRunBuilder": + """ + Specify the engine to run verification on (Engine mode). 
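+
+        A minimal illustrative sketch (assumes an engine created via
+        pydeequ.connect and a Check built with the v2 API):
+
+            result = (VerificationSuite()
+                .on_engine(engine)
+                .addCheck(check)
+                .run())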
+
+        Args:
+            engine: BaseEngine instance (e.g., DuckDBEngine)
+
+        Returns:
+            EngineVerificationRunBuilder for method chaining
+        """
+        return EngineVerificationRunBuilder(engine)
+
+    # Not an execution path: on_table() always raises and redirects callers
+    # to the engine-based entry point.
+    def on_table(self, table: str) -> "EngineVerificationRunBuilder":
+        """
+        Specify a table name for engine-based verification.
+
+        Note: This method requires an engine to be set first via on_engine().
+        For direct table access, use:
+            pydeequ.connect(con, table="my_table")
+
+        Raises:
+            ValueError: Always; use on_engine() with pydeequ.connect() instead.
+        """
+        raise ValueError(
+            "on_table() requires an engine. Use: "
+            "VerificationSuite().on_engine(pydeequ.connect(con, table='my_table'))"
+        )
+
 
 class VerificationRunBuilder:
     """
@@ -173,14 +241,89 @@ def run(self) -> "DataFrame":
         return dataframe_from_plan(plan, self._spark)
 
 
+class EngineVerificationRunBuilder:
+    """
+    Builder for configuring and executing engine-based verification.
+
+    This class works with DuckDB and other SQL backends via the engine abstraction.
+    """
+
+    def __init__(self, engine: "BaseEngine"):
+        """
+        Create a new EngineVerificationRunBuilder.
+
+        Args:
+            engine: BaseEngine instance (e.g., DuckDBEngine)
+        """
+        self._engine = engine
+        self._checks: List[Check] = []
+        self._analyzers: List[_ConnectAnalyzer] = []
+
+    def addCheck(self, check: Check) -> "EngineVerificationRunBuilder":
+        """
+        Add a check to run.
+
+        Args:
+            check: Check to add
+
+        Returns:
+            self for method chaining
+        """
+        self._checks.append(check)
+        return self
+
+    def addAnalyzer(self, analyzer: _ConnectAnalyzer) -> "EngineVerificationRunBuilder":
+        """
+        Add an analyzer to run (in addition to those required by checks).
+
+        Args:
+            analyzer: Analyzer to add
+
+        Returns:
+            self for method chaining
+        """
+        self._analyzers.append(analyzer)
+        return self
+
+    def run(self) -> pd.DataFrame:
+        """
+        Execute the verification and return results as a pandas DataFrame.
+
+        The result DataFrame contains columns:
+        - check: Check description
+        - check_level: Error or Warning
+        - check_status: Success, Warning, or Error
+        - constraint: Constraint description
+        - constraint_status: Success or Failure
+        - constraint_message: Details about failures
+
+        Returns:
+            pandas DataFrame with verification results
+        """
+        # Run checks via engine
+        results = self._engine.run_checks(self._checks)
+        return self._engine.constraints_to_dataframe(results)
+
+
 class AnalysisRunner:
     """
     Entry point for running analyzers without checks.
 
     Use this when you want to compute metrics without defining
-    pass/fail constraints.
+    pass/fail constraints. Supports both engine-based and Spark-based execution.
+
+    Example (Engine-based with DuckDB):
+        from pydeequ.v2.analyzers import Size, Completeness, Mean
+        import pydeequ
+
+        engine = pydeequ.connect(con, table="my_table")
+        result = (AnalysisRunner()
+            .on_engine(engine)
+            .addAnalyzer(Size())
+            .addAnalyzer(Completeness("email"))
+            .run())
 
-    Example:
+    Example (Spark Connect):
         from pydeequ.v2.analyzers import Size, Completeness, Mean
 
         result = (AnalysisRunner(spark)
@@ -191,27 +334,61 @@
             .run())
     """
 
-    def __init__(self, spark: "SparkSession"):
+    def __init__(self, spark: Optional["SparkSession"] = None):
         """
         Create a new AnalysisRunner.
 
         Args:
-            spark: SparkSession connected via Spark Connect
+            spark: Optional SparkSession for Spark Connect mode.
+                Not required for engine-based execution.
         """
         self._spark = spark
 
     def onData(self, df: "DataFrame") -> "AnalysisRunBuilder":
         """
-        Specify the DataFrame to analyze.
+ Specify the DataFrame to analyze (Spark mode). Args: df: DataFrame to analyze Returns: AnalysisRunBuilder for method chaining + + Raises: + ValueError: If SparkSession was not provided in constructor """ + if self._spark is None: + raise ValueError( + "SparkSession required for onData(). " + "Use AnalysisRunner(spark).onData(df) or " + "AnalysisRunner().on_engine(engine) for engine-based execution." + ) return AnalysisRunBuilder(self._spark, df) + def on_engine(self, engine: "BaseEngine") -> "EngineAnalysisRunBuilder": + """ + Specify the engine to run analysis on (Engine mode). + + Args: + engine: BaseEngine instance (e.g., DuckDBEngine) + + Returns: + EngineAnalysisRunBuilder for method chaining + """ + return EngineAnalysisRunBuilder(engine) + + def on_table(self, table: str) -> "EngineAnalysisRunBuilder": + """ + Specify a table name for engine-based analysis. + + Raises: + ValueError: This method requires an engine + """ + raise ValueError( + "on_table() requires an engine. Use: " + "AnalysisRunner().on_engine(pydeequ.connect(con, table='my_table'))" + ) + class AnalysisRunBuilder: """Builder for configuring and executing an analysis run.""" @@ -270,10 +447,49 @@ def run(self) -> "DataFrame": return dataframe_from_plan(plan, self._spark) +class EngineAnalysisRunBuilder: + """Builder for configuring and executing engine-based analysis.""" + + def __init__(self, engine: "BaseEngine"): + """ + Create a new EngineAnalysisRunBuilder. + + Args: + engine: BaseEngine instance (e.g., DuckDBEngine) + """ + self._engine = engine + self._analyzers: List[_ConnectAnalyzer] = [] + + def addAnalyzer(self, analyzer: _ConnectAnalyzer) -> "EngineAnalysisRunBuilder": + """ + Add an analyzer to run. + + Args: + analyzer: Analyzer to add + + Returns: + self for method chaining + """ + self._analyzers.append(analyzer) + return self + + def run(self) -> pd.DataFrame: + """ + Execute the analysis and return metrics as pandas DataFrame. 
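+
+        The result typically has one row per computed metric, with columns
+        mirroring MetricResult (entity, instance, name, value).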
+ + Returns: + pandas DataFrame with computed metrics + """ + results = self._engine.compute_metrics(self._analyzers) + return self._engine.metrics_to_dataframe(results) + + # Export all public symbols __all__ = [ "VerificationSuite", "VerificationRunBuilder", + "EngineVerificationRunBuilder", "AnalysisRunner", "AnalysisRunBuilder", + "EngineAnalysisRunBuilder", ] diff --git a/pyproject.toml b/pyproject.toml index 8168444..26b5237 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,14 +1,18 @@ -[tool.poetry] +[project] name = "pydeequ" version = "2.0.0b1" description = "PyDeequ - Unit Tests for Data" -authors = ["Chenyang Liu ", "Rahul Sharma "] -maintainers = ["Chenyang Liu ","Rahul Sharma "] -license = "Apache-2.0" readme = "README.md" -homepage = "https://pydeequ.readthedocs.io" -repository = "https://github.com/awslabs/python-deequ" -documentation = "https://pydeequ.readthedocs.io" +license = {text = "Apache-2.0"} +requires-python = ">=3.9,<4" +authors = [ + {name = "Chenyang Liu", email = "peterl@amazon.com"}, + {name = "Rahul Sharma", email = "rdsharma@amazon.com"}, +] +maintainers = [ + {name = "Chenyang Liu", email = "peterl@amazon.com"}, + {name = "Rahul Sharma", email = "rdsharma@amazon.com"}, +] keywords = [ "deequ", "pydeequ", @@ -23,17 +27,64 @@ keywords = [ classifiers = [ "Development Status :: 4 - Beta", "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License" + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Topic :: Database", + "Topic :: Software Development :: Quality Assurance", + "Topic :: Software Development :: Testing", +] + +# Core dependencies - minimal set required for the base package +dependencies = [ + "numpy>=1.23.0", + "pandas>=1.5.0", + "protobuf>=4.21.0", + "setuptools>=69.0.0", # Required for Python 3.12+ (distutils removed) +] + +[project.optional-dependencies] +# DuckDB backend - lightweight, no JVM required +duckdb = ["duckdb>=0.9.0"] + +# Spark backend - requires Spark Connect server +spark = ["pyspark[connect]>=3.5.0"] + +# All backends +all = [ + "duckdb>=0.9.0", + "pyspark[connect]>=3.5.0", ] +# Development dependencies +dev = [ + "pytest>=8.0.0", + "pytest-cov>=4.1.0", + "coverage>=7.4.0", + "black>=24.0.0", + "pre-commit>=3.6.0", + "pytest-rerunfailures>=14.0", + "matplotlib>=3.8.0", + "duckdb>=0.9.0", + "pyspark[connect]>=3.5.0", +] -[tool.poetry.dependencies] -python = ">=3.9,<4" -numpy = ">=1.23.0" -pandas = ">=1.5.0" -protobuf = ">=4.21.0" -setuptools = ">=69.0.0" # Required for Python 3.12+ (distutils removed) -pyspark = {version = "3.5.0", extras = ["connect"]} +[project.urls] +Homepage = "https://pydeequ.readthedocs.io" +Repository = "https://github.com/awslabs/python-deequ" +Documentation = "https://pydeequ.readthedocs.io" +Issues = "https://github.com/awslabs/python-deequ/issues" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.poetry] +# Poetry-specific settings (for poetry install compatibility) +packages = [{include = "pydeequ"}] [tool.poetry.group.dev.dependencies] pytest = "^8.0.0" @@ -42,12 +93,9 @@ coverage = "^7.4.0" black = "^24.0.0" pre-commit = "^3.6.0" pytest-rerunfailures = "^14.0" - -[tool.poetry.extras] - -[build-system] -requires = ["poetry-core>=1.0.0"] -build-backend = "poetry.core.masonry.api" 
+matplotlib = "^3.8.0" +duckdb = ">=0.9.0" +pyspark = {version = ">=3.5.0", extras = ["connect"]} [tool.black] # https://github.com/psf/black @@ -57,7 +105,7 @@ include_trailing_comma = true force_grid_wrap = 0 use_parentheses = true ensure_newline_before_comments = true -target_version = ['py38'] +target_version = ['py39'] include = '\.pyi?$' exclude = ''' /( @@ -89,27 +137,10 @@ indent = ' ' multi_line_output = 3 include_trailing_comma = true skip_glob = ['__init__.py'] -#force_grid_wrap = 0 atomic = true -#lines_after_imports = 2 -#lines_between_types = 1 -#src_paths=isort,test - -# [mypy] -# python_version = 3.8 -#warn_return_any = True -#warn_unused_configs = True - -#[mypy-pyspark.*] -#ignore_missing_imports = True -# pytest -n 2 --reruns 3 --reruns-delay 5 --dist loadscope --tx 2*popen//python=python -[pytest] -testpaths = "tests" -norecursedirs = ".git .* *.egg* old docs dist build" +[tool.pytest.ini_options] +testpaths = ["tests"] +norecursedirs = [".git", ".*", "*.egg*", "old", "docs", "dist", "build"] cache_dir = "./.pytest_cache" python_files = "*test_*.py" -looponfailroots = "pydeequ tests" -# addopts = "-n3 --reruns 3 --reruns-delay 5 --dist loadscope" -# rsyncdirs = . mypkg helperpkg -# rsyncignore = .hg diff --git a/tests/conftest.py b/tests/conftest.py index 543a27e..077afec 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,8 +2,7 @@ """ Pytest configuration for PyDeequ tests using Spark Connect. -All tests use the Spark Connect server which must be running before tests. -Start it with: scripts/start-spark-connect.sh +The Spark Connect server is automatically started by the spark_connect_server fixture. """ import os @@ -15,6 +14,33 @@ os.environ.setdefault("SPARK_VERSION", "3.5") +@pytest.fixture(scope="session") +def spark_connect_server(): + """Session-scoped fixture to start Spark Connect server. + + Automatically starts the Spark Connect server if not already running. + The server is NOT stopped after tests complete (to allow reuse across test runs). + """ + from tests.helpers.spark_server import SparkConnectServer, SparkServerConfig + + config = SparkServerConfig() + server = SparkConnectServer(config) + + if not server.is_running(): + print("\nStarting Spark Connect server for tests...") + server.start() + print("Spark Connect server started.") + else: + print("\nSpark Connect server already running.") + + # Set SPARK_REMOTE if not already set + if not os.environ.get("SPARK_REMOTE"): + os.environ["SPARK_REMOTE"] = f"sc://localhost:{config.port}" + + yield server + # Note: We don't stop the server here to allow reuse across test runs + + def create_spark_connect_session() -> SparkSession: """ Create a Spark Connect session for testing. @@ -29,11 +55,12 @@ def create_spark_connect_session() -> SparkSession: @pytest.fixture(scope="module") -def spark() -> SparkSession: +def spark(spark_connect_server) -> SparkSession: """ Pytest fixture providing a Spark Connect session. The session is shared within each test module for efficiency. + Depends on spark_connect_server to ensure server is running. Yields: SparkSession for testing @@ -75,6 +102,6 @@ def config(self, key, value): return self def getOrCreate(self): - return get_spark_connect_session() + return create_spark_connect_session() return SparkConnectBuilder() diff --git a/tests/engines/__init__.py b/tests/engines/__init__.py new file mode 100644 index 0000000..2d41719 --- /dev/null +++ b/tests/engines/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Engine correctness testing module. + +This module contains tests for validating the DuckDB engine implementation +and comparing it against the Spark engine baseline. +""" diff --git a/tests/engines/comparison/__init__.py b/tests/engines/comparison/__init__.py new file mode 100644 index 0000000..6b039c7 --- /dev/null +++ b/tests/engines/comparison/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Cross-engine comparison tests. + +Contains tests that compare DuckDB engine results against Spark engine +to validate correctness and functional parity. +""" diff --git a/tests/engines/comparison/conftest.py b/tests/engines/comparison/conftest.py new file mode 100644 index 0000000..b8fc389 --- /dev/null +++ b/tests/engines/comparison/conftest.py @@ -0,0 +1,230 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Dual-engine test fixtures for cross-engine comparison. + +Provides fixtures for creating both Spark and DuckDB engines with +identical data for parity testing. The Spark Connect server is +automatically started if not already running. 
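+
+A minimal illustrative sketch:
+
+    def test_size_parity(dual_engine_factory):
+        engines = dual_engine_factory("df_full")
+        # compare engines.spark_engine against engines.duckdb_engine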
+""" + +import os +from dataclasses import dataclass +from typing import Callable, Generator, Optional +import pytest +import duckdb +import pandas as pd + +from pydeequ.engines import BaseEngine +from pydeequ.engines.duckdb import DuckDBEngine +from tests.engines.fixtures.datasets import DATASET_FACTORIES + + +# Marker for tests requiring Spark - uses the spark_connect_server fixture +# from the top-level conftest.py which automatically starts the server +requires_spark = pytest.mark.usefixtures("spark_connect_server") + + +@dataclass +class DualEngines: + """Container for both Spark and DuckDB engines with same data.""" + spark_engine: BaseEngine + duckdb_engine: BaseEngine + dataset_name: str + + +@pytest.fixture(scope="module") +def spark_session(spark_connect_server): + """Create a module-scoped Spark Connect session. + + Depends on spark_connect_server fixture to ensure server is running. + """ + from pyspark.sql import SparkSession + spark_remote = os.environ.get("SPARK_REMOTE", "sc://localhost:15002") + spark = SparkSession.builder.remote(spark_remote).getOrCreate() + yield spark + spark.stop() + + +@pytest.fixture(scope="module") +def duckdb_connection() -> Generator[duckdb.DuckDBPyConnection, None, None]: + """Create a module-scoped DuckDB connection.""" + conn = duckdb.connect(":memory:") + yield conn + conn.close() + + +@pytest.fixture(scope="function") +def dual_engine_factory( + spark_session, + duckdb_connection: duckdb.DuckDBPyConnection +) -> Callable[[str], DualEngines]: + """Factory fixture to create both Spark and DuckDB engines with same data. + + Usage: + def test_comparison(dual_engine_factory): + engines = dual_engine_factory("df_full") + spark_metrics = engines.spark_engine.compute_metrics([Size()]) + duckdb_metrics = engines.duckdb_engine.compute_metrics([Size()]) + assert_metrics_match(spark_metrics, duckdb_metrics) + """ + tables_created = [] + + def factory(dataset_name: str) -> DualEngines: + if dataset_name not in DATASET_FACTORIES: + raise ValueError(f"Unknown dataset: {dataset_name}") + + # Get the pandas DataFrame + pdf = DATASET_FACTORIES[dataset_name]() + table_name = f"test_{dataset_name}" + + # Create DuckDB engine + try: + duckdb_connection.unregister(table_name) + except Exception: + pass + duckdb_connection.register(table_name, pdf) + duckdb_engine = DuckDBEngine(duckdb_connection, table_name) + tables_created.append(table_name) + + # Create Spark engine + from pydeequ.engines.spark import SparkEngine + spark_df = spark_session.createDataFrame(pdf) + spark_engine = SparkEngine(spark_session, dataframe=spark_df) + + return DualEngines( + spark_engine=spark_engine, + duckdb_engine=duckdb_engine, + dataset_name=dataset_name + ) + + yield factory + + # Cleanup DuckDB tables + for table_name in tables_created: + try: + duckdb_connection.unregister(table_name) + except Exception: + pass + + +# Convenience fixtures for common datasets + + +@pytest.fixture(scope="function") +def dual_engines_full(dual_engine_factory) -> DualEngines: + """Dual engines with df_full dataset.""" + return dual_engine_factory("df_full") + + +@pytest.fixture(scope="function") +def dual_engines_missing(dual_engine_factory) -> DualEngines: + """Dual engines with df_missing dataset.""" + return dual_engine_factory("df_missing") + + +@pytest.fixture(scope="function") +def dual_engines_numeric(dual_engine_factory) -> DualEngines: + """Dual engines with df_numeric dataset.""" + return dual_engine_factory("df_numeric") + + +@pytest.fixture(scope="function") +def 
+    """Dual engines with df_unique dataset."""
+    return dual_engine_factory("df_unique")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_distinct(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_distinct dataset."""
+    return dual_engine_factory("df_distinct")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_string_lengths(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_string_lengths dataset."""
+    return dual_engine_factory("df_string_lengths")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_correlation(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_correlation dataset."""
+    return dual_engine_factory("df_correlation")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_entropy(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_entropy dataset."""
+    return dual_engine_factory("df_entropy")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_compliance(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_compliance dataset."""
+    return dual_engine_factory("df_compliance")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_pattern(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_pattern dataset."""
+    return dual_engine_factory("df_pattern")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_quantile(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_quantile dataset."""
+    return dual_engine_factory("df_quantile")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_contained_in(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_contained_in dataset."""
+    return dual_engine_factory("df_contained_in")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_histogram(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_histogram dataset."""
+    return dual_engine_factory("df_histogram")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_mutual_info(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_mutual_info dataset."""
+    return dual_engine_factory("df_mutual_info")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_where(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_where dataset."""
+    return dual_engine_factory("df_where")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_all_null(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_all_null dataset."""
+    return dual_engine_factory("df_all_null")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_single(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_single dataset."""
+    return dual_engine_factory("df_single")
+
+
+@pytest.fixture(scope="function")
+def dual_engines_empty(dual_engine_factory) -> DualEngines:
+    """Dual engines with df_empty dataset."""
+    return dual_engine_factory("df_empty")
diff --git a/tests/engines/comparison/test_analyzer_parity.py b/tests/engines/comparison/test_analyzer_parity.py
new file mode 100644
index 0000000..8db0d2e
--- /dev/null
+++ b/tests/engines/comparison/test_analyzer_parity.py
@@ -0,0 +1,393 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Cross-engine analyzer parity tests.
+
+Tests that verify DuckDB engine produces the same analyzer results
+as the Spark engine baseline. Requires Spark Connect to be running.
+"""
+
+import pytest
+
+from pydeequ.v2.analyzers import (
+    Size,
+    Completeness,
+    Mean,
+    Sum,
+    Maximum,
+    Minimum,
+    StandardDeviation,
+    Distinctness,
+    Uniqueness,
+    UniqueValueRatio,
+    CountDistinct,
+    ApproxCountDistinct,
+    ApproxQuantile,
+    Correlation,
+    MutualInformation,
+    MaxLength,
+    MinLength,
+    PatternMatch,
+    Compliance,
+    Entropy,
+    Histogram,
+    DataType,
+)
+
+from tests.engines.comparison.conftest import requires_spark, DualEngines
+from tests.engines.comparison.utils import assert_metrics_match
+
+
+@requires_spark
+class TestSizeAnalyzerParity:
+    """Parity tests for Size analyzer."""
+
+    def test_size_basic(self, dual_engines_full: DualEngines):
+        """Size produces same result on both engines."""
+        analyzers = [Size()]
+        spark_metrics = dual_engines_full.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_full.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Size basic")
+
+    def test_size_with_nulls(self, dual_engines_missing: DualEngines):
+        """Size counts all rows regardless of NULLs on both engines."""
+        analyzers = [Size()]
+        spark_metrics = dual_engines_missing.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_missing.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Size with nulls")
+
+
+@requires_spark
+class TestCompletenessAnalyzerParity:
+    """Parity tests for Completeness analyzer."""
+
+    def test_completeness_full(self, dual_engines_full: DualEngines):
+        """Completeness produces same result for complete columns."""
+        analyzers = [Completeness("att1"), Completeness("att2")]
+        spark_metrics = dual_engines_full.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_full.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Completeness full")
+
+    def test_completeness_partial(self, dual_engines_missing: DualEngines):
+        """Completeness produces same result for partial columns."""
+        analyzers = [Completeness("att1"), Completeness("att2")]
+        spark_metrics = dual_engines_missing.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_missing.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Completeness partial")
+
+    def test_completeness_all_null(self, dual_engines_all_null: DualEngines):
+        """Completeness produces same result for all-NULL columns."""
+        analyzers = [Completeness("value")]
+        spark_metrics = dual_engines_all_null.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_all_null.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Completeness all null")
+
+
+@requires_spark
+class TestStatisticalAnalyzerParity:
+    """Parity tests for statistical analyzers."""
+
+    def test_mean(self, dual_engines_numeric: DualEngines):
+        """Mean produces same result on both engines."""
+        analyzers = [Mean("att1"), Mean("att2")]
+        spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Mean")
+
+    def test_sum(self, dual_engines_numeric: DualEngines):
+        """Sum produces same result on both engines."""
+        analyzers = [Sum("att1"), Sum("att2")]
+        spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Sum")
+
+    def test_minimum(self, dual_engines_numeric: DualEngines):
+        """Minimum produces same result on both engines."""
+        analyzers = [Minimum("att1"), Minimum("att2")]
+        spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Minimum")
+
+    def test_maximum(self, dual_engines_numeric: DualEngines):
+        """Maximum produces same result on both engines."""
+        analyzers = [Maximum("att1"), Maximum("att2")]
+        spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Maximum")
+
+    def test_standard_deviation(self, dual_engines_numeric: DualEngines):
+        """StandardDeviation produces same result on both engines."""
+        analyzers = [StandardDeviation("att1")]
+        spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "StandardDeviation")
+
+
+@requires_spark
+class TestUniquenessAnalyzerParity:
+    """Parity tests for uniqueness-related analyzers."""
+
+    def test_distinctness(self, dual_engines_distinct: DualEngines):
+        """Distinctness produces same result on both engines."""
+        analyzers = [Distinctness(["att1"]), Distinctness(["att2"])]
+        spark_metrics = dual_engines_distinct.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_distinct.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Distinctness")
+
+    def test_uniqueness(self, dual_engines_distinct: DualEngines):
+        """Uniqueness produces same result on both engines."""
+        analyzers = [Uniqueness(["att1"]), Uniqueness(["att2"])]
+        spark_metrics = dual_engines_distinct.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_distinct.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Uniqueness")
+
+    def test_unique_value_ratio(self, dual_engines_distinct: DualEngines):
+        """UniqueValueRatio produces same result on both engines."""
+        analyzers = [UniqueValueRatio(["att1"]), UniqueValueRatio(["att2"])]
+        spark_metrics = dual_engines_distinct.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_distinct.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "UniqueValueRatio")
+
+    def test_count_distinct(self, dual_engines_distinct: DualEngines):
+        """CountDistinct produces same result on both engines."""
+        analyzers = [CountDistinct(["att1"]), CountDistinct(["att2"])]
+        spark_metrics = dual_engines_distinct.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_distinct.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "CountDistinct")
+
+    def test_approx_count_distinct(self, dual_engines_distinct: DualEngines):
+        """ApproxCountDistinct produces approximately same result."""
+        analyzers = [ApproxCountDistinct("att1"), ApproxCountDistinct("att2")]
+        spark_metrics = dual_engines_distinct.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_distinct.duckdb_engine.compute_metrics(analyzers)
+        # Uses APPROX_TOLERANCE (10%) for approximate algorithms
+        assert_metrics_match(spark_metrics, duckdb_metrics, "ApproxCountDistinct")
+
+
+@requires_spark
+class TestStringAnalyzerParity:
+    """Parity tests for string analyzers."""
+
+    def test_min_length(self, dual_engines_string_lengths: DualEngines):
+        """MinLength produces same result on both engines."""
+        analyzers = [MinLength("att1"), MinLength("att2")]
+        spark_metrics = dual_engines_string_lengths.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_string_lengths.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "MinLength")
+
+    def test_max_length(self, dual_engines_string_lengths: DualEngines):
+        """MaxLength produces same result on both engines."""
+        analyzers = [MaxLength("att1"), MaxLength("att2")]
+        spark_metrics = dual_engines_string_lengths.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_string_lengths.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "MaxLength")
+
+    def test_pattern_match(self, dual_engines_pattern: DualEngines):
+        """PatternMatch produces same result on both engines."""
+        analyzers = [PatternMatch("email", r".*@.*\..*")]
+        spark_metrics = dual_engines_pattern.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_pattern.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "PatternMatch")
+
+
+@requires_spark
+class TestCorrelationAnalyzerParity:
+    """Parity tests for Correlation analyzer."""
+
+    def test_correlation_positive(self, dual_engines_correlation: DualEngines):
+        """Correlation produces same result for positively correlated columns."""
+        analyzers = [Correlation("x", "y")]
+        spark_metrics = dual_engines_correlation.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_correlation.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Correlation positive")
+
+    def test_correlation_negative(self, dual_engines_correlation: DualEngines):
+        """Correlation produces same result for negatively correlated columns."""
+        analyzers = [Correlation("x", "z")]
+        spark_metrics = dual_engines_correlation.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_correlation.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Correlation negative")
+
+
+@requires_spark
+class TestEntropyAnalyzerParity:
+    """Parity tests for Entropy analyzer."""
+
+    def test_entropy_uniform(self, dual_engines_entropy: DualEngines):
+        """Entropy produces same result for uniform distribution."""
+        analyzers = [Entropy("uniform")]
+        spark_metrics = dual_engines_entropy.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_entropy.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Entropy uniform")
+
+    def test_entropy_constant(self, dual_engines_entropy: DualEngines):
+        """Entropy produces same result for constant column."""
+        analyzers = [Entropy("constant")]
+        spark_metrics = dual_engines_entropy.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_entropy.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Entropy constant")
+
+
+@requires_spark
+class TestMutualInformationAnalyzerParity:
+    """Parity tests for MutualInformation analyzer."""
+
+    def test_mutual_information(self, dual_engines_mutual_info: DualEngines):
+        """MutualInformation produces same result on both engines."""
+        analyzers = [MutualInformation(["x", "y_dependent"])]
+        spark_metrics = dual_engines_mutual_info.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_mutual_info.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "MutualInformation")
+
+
+@requires_spark
+class TestComplianceAnalyzerParity:
+    """Parity tests for Compliance analyzer."""
+
+    def test_compliance(self, dual_engines_compliance: DualEngines):
+        """Compliance produces same result on both engines."""
+        analyzers = [
+            Compliance("positive_check", "positive > 0"),
+            Compliance("mixed_check", "mixed > 0"),
+        ]
+        spark_metrics = dual_engines_compliance.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_compliance.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Compliance")
+
+
+@requires_spark
+class TestQuantileAnalyzerParity:
+    """Parity tests for ApproxQuantile analyzer."""
+
+    def test_approx_quantile_median(self, dual_engines_quantile: DualEngines):
+        """ApproxQuantile produces same result for median."""
+        analyzers = [ApproxQuantile("value", 0.5)]
+        spark_metrics = dual_engines_quantile.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_quantile.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "ApproxQuantile median")
+
+    def test_approx_quantile_quartiles(self, dual_engines_quantile: DualEngines):
+        """ApproxQuantile produces same result for quartiles."""
+        analyzers = [
+            ApproxQuantile("value", 0.25),
+            ApproxQuantile("value", 0.75),
+        ]
+        spark_metrics = dual_engines_quantile.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_quantile.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "ApproxQuantile quartiles")
+
+
+@requires_spark
+class TestHistogramAnalyzerParity:
+    """Parity tests for Histogram analyzer."""
+
+    def test_histogram(self, dual_engines_histogram: DualEngines):
+        """Histogram produces consistent results on both engines."""
+        analyzers = [Histogram("category")]
+        spark_metrics = dual_engines_histogram.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_histogram.duckdb_engine.compute_metrics(analyzers)
+        # Histogram structure may differ, so just check it exists
+        assert len(spark_metrics) > 0
+        assert len(duckdb_metrics) > 0
+
+
+@requires_spark
+class TestDataTypeAnalyzerParity:
+    """Parity tests for DataType analyzer."""
+
+    def test_data_type(self, dual_engines_full: DualEngines):
+        """DataType produces consistent results on both engines."""
+        analyzers = [DataType("att1")]
+        spark_metrics = dual_engines_full.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_full.duckdb_engine.compute_metrics(analyzers)
+        # DataType format may differ, so just check it exists
+        assert len(spark_metrics) > 0
+        assert len(duckdb_metrics) > 0
+
+
+@requires_spark
+class TestAnalyzersWithWhereParity:
+    """Parity tests for analyzers with WHERE clause."""
+
+    def test_size_with_where(self, dual_engines_where: DualEngines):
+        """Size with WHERE produces same result on both engines."""
+        analyzers = [Size(where="category = 'A'")]
+        spark_metrics = dual_engines_where.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_where.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Size with WHERE")
+
+    def test_completeness_with_where(self, dual_engines_where: DualEngines):
+        """Completeness with WHERE produces same result on both engines."""
+        analyzers = [Completeness("att1", where="category = 'A'")]
+        spark_metrics = dual_engines_where.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_where.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Completeness with WHERE")
+
+    def test_mean_with_where(self, dual_engines_where: DualEngines):
+        """Mean with WHERE produces same result on both engines."""
+        analyzers = [Mean("value", where="category = 'A'")]
+        spark_metrics = dual_engines_where.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_where.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Mean with WHERE")
+
+
+@requires_spark
+class TestMultipleAnalyzersParity:
+    """Parity tests for running multiple analyzers together."""
+
+    def test_all_basic_analyzers(self, dual_engines_numeric: DualEngines):
+        """All basic analyzers produce same results on both engines."""
+        analyzers = [
+            Size(),
+            Completeness("att1"),
+            Mean("att1"),
+            Sum("att1"),
+            Minimum("att1"),
+            Maximum("att1"),
+            StandardDeviation("att1"),
+        ]
+        spark_metrics = dual_engines_numeric.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_numeric.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "All basic analyzers")
+
+    def test_mixed_analyzer_types(self, dual_engines_full: DualEngines):
+        """Mixed analyzer types produce same results on both engines."""
+        analyzers = [
+            Size(),
+            Completeness("att1"),
+            CountDistinct(["att1"]),
+            MaxLength("att1"),
+            MinLength("att1"),
+        ]
+        spark_metrics = dual_engines_full.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_full.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Mixed analyzer types")
+
+
+@requires_spark
+class TestEdgeCasesParity:
+    """Parity tests for edge cases."""
+
+    def test_single_row(self, dual_engines_single: DualEngines):
+        """Analyzers produce same results for single-row dataset."""
+        analyzers = [
+            Size(),
+            Completeness("att1"),
+            Mean("item"),
+            Maximum("item"),
+            Minimum("item"),
+        ]
+        spark_metrics = dual_engines_single.spark_engine.compute_metrics(analyzers)
+        duckdb_metrics = dual_engines_single.duckdb_engine.compute_metrics(analyzers)
+        assert_metrics_match(spark_metrics, duckdb_metrics, "Single row")
diff --git a/tests/engines/comparison/test_constraint_parity.py b/tests/engines/comparison/test_constraint_parity.py
new file mode 100644
index 0000000..0e32232
--- /dev/null
+++ b/tests/engines/comparison/test_constraint_parity.py
@@ -0,0 +1,334 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Cross-engine constraint parity tests.
+
+Tests that verify DuckDB engine produces the same constraint evaluation
+results as the Spark engine baseline. Requires Spark Connect to be running.
+"""
+
+import pytest
+
+from pydeequ.v2.checks import Check, CheckLevel
+from pydeequ.v2.predicates import eq, gt, gte, lt, lte, between, is_one
+
+from tests.engines.comparison.conftest import requires_spark, DualEngines
+from tests.engines.comparison.utils import assert_constraints_match
+
+
+@requires_spark
+class TestSizeConstraintParity:
+    """Parity tests for size constraints."""
+
+    def test_has_size_success(self, dual_engines_full: DualEngines):
+        """hasSize produces same result on both engines when passing."""
+        check = Check(CheckLevel.Error, "size check").hasSize(eq(4))
+        spark_results = dual_engines_full.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasSize success")
+
+    def test_has_size_failure(self, dual_engines_full: DualEngines):
+        """hasSize produces same result on both engines when failing."""
+        check = Check(CheckLevel.Error, "size check").hasSize(eq(100))
+        spark_results = dual_engines_full.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasSize failure")
+
+
+@requires_spark
+class TestCompletenessConstraintParity:
+    """Parity tests for completeness constraints."""
+
+    def test_is_complete_success(self, dual_engines_full: DualEngines):
+        """isComplete produces same result on both engines when passing."""
+        check = Check(CheckLevel.Error, "complete").isComplete("att1")
+        spark_results = dual_engines_full.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "isComplete success")
+
+    def test_is_complete_failure(self, dual_engines_missing: DualEngines):
+        """isComplete produces same result on both engines when failing."""
+        check = Check(CheckLevel.Error, "complete").isComplete("att1")
+        spark_results = dual_engines_missing.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_missing.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "isComplete failure")
+
+    def test_has_completeness(self, dual_engines_missing: DualEngines):
+        """hasCompleteness produces same result on both engines."""
+        check = Check(CheckLevel.Error, "threshold").hasCompleteness("att1", gte(0.5))
+        spark_results = dual_engines_missing.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_missing.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasCompleteness")
+
+    def test_are_complete(self, dual_engines_full: DualEngines):
+        """areComplete produces same result on both engines."""
+        check = Check(CheckLevel.Error, "multi").areComplete(["att1", "att2"])
+        spark_results = dual_engines_full.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "areComplete")
+
+
+@requires_spark
+class TestUniquenessConstraintParity:
+    """Parity tests for uniqueness constraints."""
+
+    def test_is_unique_success(self, dual_engines_unique: DualEngines):
+        """isUnique produces same result on both engines when passing."""
+        check = Check(CheckLevel.Error, "unique").isUnique("unique_col")
+        spark_results = dual_engines_unique.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_unique.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "isUnique success")
+
+    def test_is_unique_failure(self, dual_engines_unique: DualEngines):
+        """isUnique produces same result on both engines when failing."""
+        check = Check(CheckLevel.Error, "not unique").isUnique("non_unique")
+        spark_results = dual_engines_unique.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_unique.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "isUnique failure")
+
+    def test_has_uniqueness(self, dual_engines_distinct: DualEngines):
+        """hasUniqueness produces same result on both engines."""
+        check = Check(CheckLevel.Error, "uniqueness").hasUniqueness(["att2"], is_one())
+        spark_results = dual_engines_distinct.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_distinct.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasUniqueness")
+
+    def test_has_distinctness(self, dual_engines_distinct: DualEngines):
+        """hasDistinctness produces same result on both engines."""
+        check = Check(CheckLevel.Error, "distinct").hasDistinctness(["att1"], gte(0.5))
+        spark_results = dual_engines_distinct.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_distinct.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasDistinctness")
+
+
+@requires_spark
+class TestStatisticalConstraintParity:
+    """Parity tests for statistical constraints."""
+
+    def test_has_min(self, dual_engines_numeric: DualEngines):
+        """hasMin produces same result on both engines."""
+        check = Check(CheckLevel.Error, "min").hasMin("att1", eq(1))
+        spark_results = dual_engines_numeric.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_numeric.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasMin")
+
+    def test_has_max(self, dual_engines_numeric: DualEngines):
+        """hasMax produces same result on both engines."""
+        check = Check(CheckLevel.Error, "max").hasMax("att1", eq(6))
+        spark_results = dual_engines_numeric.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_numeric.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasMax")
+
+    def test_has_mean(self, dual_engines_numeric: DualEngines):
+        """hasMean produces same result on both engines."""
+        check = Check(CheckLevel.Error, "mean").hasMean("att1", eq(3.5))
+        spark_results = dual_engines_numeric.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_numeric.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasMean")
+
+    def test_has_sum(self, dual_engines_numeric: DualEngines):
+        """hasSum produces same result on both engines."""
+        check = Check(CheckLevel.Error, "sum").hasSum("att1", eq(21))
+        spark_results = dual_engines_numeric.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_numeric.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasSum")
+
+    def test_has_standard_deviation(self, dual_engines_numeric: DualEngines):
+        """hasStandardDeviation produces same result on both engines."""
+        check = Check(CheckLevel.Error, "stddev").hasStandardDeviation("att1", between(1.5, 2.0))
+        spark_results = dual_engines_numeric.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_numeric.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasStandardDeviation")
+
+
+@requires_spark
+class TestCorrelationConstraintParity:
+    """Parity tests for correlation constraints."""
+
+    def test_has_correlation(self, dual_engines_correlation: DualEngines):
+        """hasCorrelation produces same result on both engines."""
+        check = Check(CheckLevel.Error, "corr").hasCorrelation("x", "y", is_one())
+        spark_results = dual_engines_correlation.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_correlation.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasCorrelation")
+
+
+@requires_spark
+class TestEntropyConstraintParity:
+    """Parity tests for entropy constraints."""
+
+    def test_has_entropy(self, dual_engines_entropy: DualEngines):
+        """hasEntropy produces same result on both engines."""
+        check = Check(CheckLevel.Error, "entropy").hasEntropy("uniform", eq(2.0))
+        spark_results = dual_engines_entropy.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_entropy.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasEntropy")
+
+
+@requires_spark
+class TestStringConstraintParity:
+    """Parity tests for string constraints."""
+
+    def test_has_min_length(self, dual_engines_string_lengths: DualEngines):
+        """hasMinLength produces same result on both engines."""
+        check = Check(CheckLevel.Error, "min len").hasMinLength("att1", eq(0))
+        spark_results = dual_engines_string_lengths.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_string_lengths.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasMinLength")
+
+    def test_has_max_length(self, dual_engines_string_lengths: DualEngines):
+        """hasMaxLength produces same result on both engines."""
+        check = Check(CheckLevel.Error, "max len").hasMaxLength("att1", lte(5))
+        spark_results = dual_engines_string_lengths.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_string_lengths.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasMaxLength")
+
+    def test_has_pattern(self, dual_engines_pattern: DualEngines):
+        """hasPattern produces same result on both engines."""
+        check = Check(CheckLevel.Error, "pattern").hasPattern("email", r".*@.*\..*", gte(0.5))
+        spark_results = dual_engines_pattern.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_pattern.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "hasPattern")
+
+
+@requires_spark
+class TestNumericConstraintParity:
+    """Parity tests for numeric value constraints."""
+
+    def test_is_positive(self, dual_engines_compliance: DualEngines):
+        """isPositive produces same result on both engines."""
+        check = Check(CheckLevel.Error, "positive").isPositive("positive", is_one())
+        spark_results = dual_engines_compliance.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_compliance.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "isPositive")
+
+    def test_is_non_negative(self, dual_engines_compliance: DualEngines):
+        """isNonNegative produces same result on both engines."""
+        check = Check(CheckLevel.Error, "non-neg").isNonNegative("positive", is_one())
+        spark_results = dual_engines_compliance.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_compliance.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "isNonNegative")
+
+
+@requires_spark
+class TestColumnComparisonConstraintParity:
+    """Parity tests for column comparison constraints."""
+
+    def test_is_less_than(self, dual_engines_correlation: DualEngines):
+        """isLessThan produces same result on both engines."""
+        check = Check(CheckLevel.Error, "less").isLessThan("x", "y")
+        spark_results = dual_engines_correlation.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_correlation.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "isLessThan")
+
+    def test_is_greater_than(self, dual_engines_correlation: DualEngines):
+        """isGreaterThan produces same result on both engines."""
+        check = Check(CheckLevel.Error, "greater").isGreaterThan("y", "x")
+        spark_results = dual_engines_correlation.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_correlation.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "isGreaterThan")
+
+
+@requires_spark
+class TestContainedInConstraintParity:
+    """Parity tests for isContainedIn constraint."""
+
+    def test_is_contained_in_success(self, dual_engines_contained_in: DualEngines):
+        """isContainedIn produces same result on both engines when passing."""
+        check = Check(CheckLevel.Error, "contained").isContainedIn(
+            "status", ["active", "inactive", "pending"], is_one()
+        )
+        spark_results = dual_engines_contained_in.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_contained_in.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "isContainedIn success")
+
+    def test_is_contained_in_failure(self, dual_engines_contained_in: DualEngines):
+        """isContainedIn produces same result on both engines when failing."""
+        check = Check(CheckLevel.Error, "not contained").isContainedIn(
+            "category", ["A", "B", "C"], is_one()
+        )
+        spark_results = dual_engines_contained_in.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_contained_in.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "isContainedIn failure")
+
+
+@requires_spark
+class TestSatisfiesConstraintParity:
+    """Parity tests for satisfies constraint."""
+
+    def test_satisfies(self, dual_engines_compliance: DualEngines):
+        """satisfies produces same result on both engines."""
+        check = Check(CheckLevel.Error, "satisfies").satisfies(
+            "positive > 0", "positive_check", assertion=is_one()
+        )
+        spark_results = dual_engines_compliance.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_compliance.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "satisfies")
+
+
+@requires_spark
+class TestMultipleConstraintsParity:
+    """Parity tests for multiple constraints."""
+
+    def test_multiple_constraints_all_pass(self, dual_engines_full: DualEngines):
+        """Multiple passing constraints produce same results."""
+        check = (Check(CheckLevel.Error, "multi pass")
+                 .hasSize(eq(4))
+                 .isComplete("att1")
+                 .isComplete("att2"))
+        spark_results = dual_engines_full.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "Multiple pass")
+
+    def test_multiple_constraints_some_fail(self, dual_engines_missing: DualEngines):
+        """Mixed pass/fail constraints produce same results."""
+        check = (Check(CheckLevel.Error, "multi mixed")
+                 .hasSize(eq(12))  # Pass
+                 .isComplete("att1"))  # Fail
+        spark_results = dual_engines_missing.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_missing.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "Multiple mixed")
+
+
+@requires_spark
+class TestCheckLevelsParity:
+    """Parity tests for check levels."""
+
+    def test_error_level(self, dual_engines_full: DualEngines):
+        """Error level produces same results on both engines."""
+        check = Check(CheckLevel.Error, "error").hasSize(eq(100))
+        spark_results = dual_engines_full.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "Error level")
+
+    def test_warning_level(self, dual_engines_full: DualEngines):
+        """Warning level produces same results on both engines."""
+        check = Check(CheckLevel.Warning, "warning").hasSize(eq(100))
+        spark_results = dual_engines_full.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_full.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "Warning level")
+
+
+@requires_spark
+class TestConstraintsWithWhereParity:
+    """Parity tests for constraints with WHERE clause."""
+
+    def test_completeness_with_where(self, dual_engines_where: DualEngines):
+        """Completeness with WHERE produces same result on both engines."""
+        check = Check(CheckLevel.Error, "filtered").hasCompleteness(
+            "att1", is_one()
+        ).where("category = 'A'")
+        spark_results = dual_engines_where.spark_engine.run_checks([check])
+        duckdb_results = dual_engines_where.duckdb_engine.run_checks([check])
+        assert_constraints_match(spark_results, duckdb_results, "Completeness with WHERE")
diff --git a/tests/engines/comparison/test_profile_parity.py b/tests/engines/comparison/test_profile_parity.py
new file mode 100644
index 0000000..c980d34
--- /dev/null
+++ b/tests/engines/comparison/test_profile_parity.py
@@ -0,0 +1,142 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Cross-engine column profiling parity tests.
+
+Tests that verify DuckDB engine produces the same column profiling
+results as the Spark engine baseline. Requires Spark Connect to be running.
+""" + +import pytest + +from tests.engines.comparison.conftest import requires_spark, DualEngines +from tests.engines.comparison.utils import assert_profiles_match + + +def get_profile_by_column(profiles, column_name: str): + """Find a column profile by column name.""" + for p in profiles: + if p.column == column_name: + return p + return None + + +@requires_spark +class TestBasicProfilingParity: + """Parity tests for basic profiling functionality.""" + + def test_profile_all_columns(self, dual_engines_full: DualEngines): + """Profile all columns produces same results on both engines.""" + spark_profiles = dual_engines_full.spark_engine.profile_columns() + duckdb_profiles = dual_engines_full.duckdb_engine.profile_columns() + assert_profiles_match(spark_profiles, duckdb_profiles, "All columns") + + def test_profile_specific_columns(self, dual_engines_full: DualEngines): + """Profile specific columns produces same results.""" + spark_profiles = dual_engines_full.spark_engine.profile_columns(columns=["att1", "item"]) + duckdb_profiles = dual_engines_full.duckdb_engine.profile_columns(columns=["att1", "item"]) + assert_profiles_match(spark_profiles, duckdb_profiles, "Specific columns") + + +@requires_spark +class TestCompletenessProfilingParity: + """Parity tests for completeness in profiles.""" + + def test_completeness_full(self, dual_engines_full: DualEngines): + """Completeness is same for complete columns on both engines.""" + spark_profiles = dual_engines_full.spark_engine.profile_columns(columns=["att1"]) + duckdb_profiles = dual_engines_full.duckdb_engine.profile_columns(columns=["att1"]) + assert_profiles_match(spark_profiles, duckdb_profiles, "Completeness full") + + def test_completeness_partial(self, dual_engines_missing: DualEngines): + """Completeness is same for partial columns on both engines.""" + spark_profiles = dual_engines_missing.spark_engine.profile_columns(columns=["att1", "att2"]) + duckdb_profiles = dual_engines_missing.duckdb_engine.profile_columns(columns=["att1", "att2"]) + assert_profiles_match(spark_profiles, duckdb_profiles, "Completeness partial") + + def test_completeness_all_null(self, dual_engines_all_null: DualEngines): + """Completeness is same for all-NULL column on both engines.""" + spark_profiles = dual_engines_all_null.spark_engine.profile_columns(columns=["value"]) + duckdb_profiles = dual_engines_all_null.duckdb_engine.profile_columns(columns=["value"]) + assert_profiles_match(spark_profiles, duckdb_profiles, "Completeness all null") + + +@requires_spark +class TestDistinctValuesProfilingParity: + """Parity tests for distinct values in profiles.""" + + def test_distinct_values(self, dual_engines_distinct: DualEngines): + """Distinct value counts are same on both engines.""" + spark_profiles = dual_engines_distinct.spark_engine.profile_columns(columns=["att1", "att2"]) + duckdb_profiles = dual_engines_distinct.duckdb_engine.profile_columns(columns=["att1", "att2"]) + assert_profiles_match(spark_profiles, duckdb_profiles, "Distinct values") + + +@requires_spark +class TestNumericProfilingParity: + """Parity tests for numeric column profiling.""" + + def test_numeric_statistics(self, dual_engines_numeric: DualEngines): + """Numeric statistics are same on both engines.""" + spark_profiles = dual_engines_numeric.spark_engine.profile_columns(columns=["att1"]) + duckdb_profiles = dual_engines_numeric.duckdb_engine.profile_columns(columns=["att1"]) + assert_profiles_match(spark_profiles, duckdb_profiles, "Numeric statistics") + + def 
+        """Numeric statistics handle NULLs same way on both engines."""
+        spark_profiles = dual_engines_numeric.spark_engine.profile_columns(columns=["att2"])
+        duckdb_profiles = dual_engines_numeric.duckdb_engine.profile_columns(columns=["att2"])
+        assert_profiles_match(spark_profiles, duckdb_profiles, "Numeric with nulls")
+
+
+@requires_spark
+class TestHistogramProfilingParity:
+    """Parity tests for histogram profiling."""
+
+    def test_histogram(self, dual_engines_histogram: DualEngines):
+        """Histogram profiling produces consistent results."""
+        spark_profiles = dual_engines_histogram.spark_engine.profile_columns(
+            columns=["category"],
+            low_cardinality_threshold=10
+        )
+        duckdb_profiles = dual_engines_histogram.duckdb_engine.profile_columns(
+            columns=["category"],
+            low_cardinality_threshold=10
+        )
+        # Check profiles exist and have histogram data
+        # (exact histogram format may differ)
+        assert len(spark_profiles) > 0
+        assert len(duckdb_profiles) > 0
+
+
+@requires_spark
+class TestEdgeCaseProfilingParity:
+    """Parity tests for edge cases in profiling."""
+
+    def test_single_row(self, dual_engines_single: DualEngines):
+        """Single-row profiling produces same results."""
+        spark_profiles = dual_engines_single.spark_engine.profile_columns()
+        duckdb_profiles = dual_engines_single.duckdb_engine.profile_columns()
+        assert_profiles_match(spark_profiles, duckdb_profiles, "Single row")
+
+
+@requires_spark
+class TestMixedTypeProfilingParity:
+    """Parity tests for mixed column types."""
+
+    def test_mixed_types(self, dual_engines_full: DualEngines):
+        """Mixed column types produce same results on both engines."""
+        spark_profiles = dual_engines_full.spark_engine.profile_columns()
+        duckdb_profiles = dual_engines_full.duckdb_engine.profile_columns()
+        assert_profiles_match(spark_profiles, duckdb_profiles, "Mixed types")
diff --git a/tests/engines/comparison/test_suggestion_parity.py b/tests/engines/comparison/test_suggestion_parity.py
new file mode 100644
index 0000000..510e17d
--- /dev/null
+++ b/tests/engines/comparison/test_suggestion_parity.py
@@ -0,0 +1,222 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Cross-engine constraint suggestion parity tests.
+
+Tests that verify DuckDB engine produces the same constraint suggestions
+as the Spark engine baseline. Requires Spark Connect to be running.
+
+Note: Suggestions may differ between engines due to different profiling
+algorithms. These tests focus on structural consistency rather than
+exact match.
+""" + +import pytest + +from pydeequ.v2.suggestions import Rules + +from tests.engines.comparison.conftest import requires_spark, DualEngines + + +def get_suggestions_for_column(suggestions, column_name: str): + """Get all suggestions for a specific column.""" + return [s for s in suggestions if s.column_name == column_name] + + +def get_suggestions_by_constraint(suggestions, constraint_name: str): + """Get all suggestions matching a constraint type.""" + return [s for s in suggestions if constraint_name in s.constraint_name] + + +@requires_spark +class TestSuggestionStructureParity: + """Parity tests for suggestion structure consistency.""" + + def test_default_rules_structure(self, dual_engines_full: DualEngines): + """DEFAULT rules produce structurally similar suggestions.""" + spark_suggestions = dual_engines_full.spark_engine.suggest_constraints(rules=[Rules.DEFAULT]) + duckdb_suggestions = dual_engines_full.duckdb_engine.suggest_constraints(rules=[Rules.DEFAULT]) + + # Both should return a list of suggestions + assert isinstance(spark_suggestions, list) + assert isinstance(duckdb_suggestions, list) + + # Both should have required fields + if spark_suggestions: + s = spark_suggestions[0] + assert hasattr(s, 'column_name') + assert hasattr(s, 'constraint_name') + assert hasattr(s, 'description') + + if duckdb_suggestions: + s = duckdb_suggestions[0] + assert hasattr(s, 'column_name') + assert hasattr(s, 'constraint_name') + assert hasattr(s, 'description') + + def test_numerical_rules_structure(self, dual_engines_numeric: DualEngines): + """NUMERICAL rules produce structurally similar suggestions.""" + spark_suggestions = dual_engines_numeric.spark_engine.suggest_constraints(rules=[Rules.NUMERICAL]) + duckdb_suggestions = dual_engines_numeric.duckdb_engine.suggest_constraints(rules=[Rules.NUMERICAL]) + + assert isinstance(spark_suggestions, list) + assert isinstance(duckdb_suggestions, list) + + def test_string_rules_structure(self, dual_engines_string_lengths: DualEngines): + """STRING rules produce structurally similar suggestions.""" + spark_suggestions = dual_engines_string_lengths.spark_engine.suggest_constraints(rules=[Rules.STRING]) + duckdb_suggestions = dual_engines_string_lengths.duckdb_engine.suggest_constraints(rules=[Rules.STRING]) + + assert isinstance(spark_suggestions, list) + assert isinstance(duckdb_suggestions, list) + + +@requires_spark +class TestSuggestionColumnCoverage: + """Parity tests for column coverage in suggestions.""" + + def test_complete_column_suggestions(self, dual_engines_full: DualEngines): + """Both engines suggest constraints for complete columns.""" + spark_suggestions = dual_engines_full.spark_engine.suggest_constraints( + columns=["att1"], + rules=[Rules.DEFAULT] + ) + duckdb_suggestions = dual_engines_full.duckdb_engine.suggest_constraints( + columns=["att1"], + rules=[Rules.DEFAULT] + ) + + # Both should return results (may differ in content) + assert isinstance(spark_suggestions, list) + assert isinstance(duckdb_suggestions, list) + + def test_numeric_column_suggestions(self, dual_engines_numeric: DualEngines): + """Both engines suggest constraints for numeric columns.""" + spark_suggestions = dual_engines_numeric.spark_engine.suggest_constraints( + columns=["att1"], + rules=[Rules.NUMERICAL] + ) + duckdb_suggestions = dual_engines_numeric.duckdb_engine.suggest_constraints( + columns=["att1"], + rules=[Rules.NUMERICAL] + ) + + assert isinstance(spark_suggestions, list) + assert isinstance(duckdb_suggestions, list) + + +@requires_spark 
+class TestSuggestionConstraintTypes:
+    """Parity tests for suggested constraint types."""
+
+    def test_completeness_suggestions(self, dual_engines_full: DualEngines):
+        """Both engines may suggest completeness constraints."""
+        spark_suggestions = dual_engines_full.spark_engine.suggest_constraints(rules=[Rules.DEFAULT])
+        duckdb_suggestions = dual_engines_full.duckdb_engine.suggest_constraints(rules=[Rules.DEFAULT])
+
+        spark_completeness = get_suggestions_by_constraint(spark_suggestions, "Complete")
+        duckdb_completeness = get_suggestions_by_constraint(duckdb_suggestions, "Complete")
+
+        # Both might suggest completeness (or not - depends on data)
+        # Just verify structure is consistent
+        assert isinstance(spark_completeness, list)
+        assert isinstance(duckdb_completeness, list)
+
+    def test_uniqueness_suggestions(self, dual_engines_unique: DualEngines):
+        """Both engines may suggest uniqueness constraints."""
+        spark_suggestions = dual_engines_unique.spark_engine.suggest_constraints(rules=[Rules.COMMON])
+        duckdb_suggestions = dual_engines_unique.duckdb_engine.suggest_constraints(rules=[Rules.COMMON])
+
+        assert isinstance(spark_suggestions, list)
+        assert isinstance(duckdb_suggestions, list)
+
+
+@requires_spark
+class TestSuggestionRuleSetsParity:
+    """Parity tests for different rule sets."""
+
+    def test_extended_rules(self, dual_engines_full: DualEngines):
+        """EXTENDED rules produce consistent suggestions on both engines."""
+        spark_suggestions = dual_engines_full.spark_engine.suggest_constraints(rules=[Rules.EXTENDED])
+        duckdb_suggestions = dual_engines_full.duckdb_engine.suggest_constraints(rules=[Rules.EXTENDED])
+
+        assert isinstance(spark_suggestions, list)
+        assert isinstance(duckdb_suggestions, list)
+
+    def test_multiple_rule_sets(self, dual_engines_numeric: DualEngines):
+        """Multiple rule sets produce consistent suggestions."""
+        spark_suggestions = dual_engines_numeric.spark_engine.suggest_constraints(
+            rules=[Rules.DEFAULT, Rules.NUMERICAL]
+        )
+        duckdb_suggestions = dual_engines_numeric.duckdb_engine.suggest_constraints(
+            rules=[Rules.DEFAULT, Rules.NUMERICAL]
+        )
+
+        assert isinstance(spark_suggestions, list)
+        assert isinstance(duckdb_suggestions, list)
+
+
+@requires_spark
+class TestSuggestionEdgeCases:
+    """Parity tests for edge cases in suggestions."""
+
+    def test_single_row_suggestions(self, dual_engines_single: DualEngines):
+        """Single-row dataset produces consistent suggestions."""
+        spark_suggestions = dual_engines_single.spark_engine.suggest_constraints(rules=[Rules.DEFAULT])
+        duckdb_suggestions = dual_engines_single.duckdb_engine.suggest_constraints(rules=[Rules.DEFAULT])
+
+        assert isinstance(spark_suggestions, list)
+        assert isinstance(duckdb_suggestions, list)
+
+    def test_all_null_column_suggestions(self, dual_engines_all_null: DualEngines):
+        """All-NULL column produces consistent suggestions."""
+        spark_suggestions = dual_engines_all_null.spark_engine.suggest_constraints(
+            columns=["value"],
+            rules=[Rules.DEFAULT]
+        )
+        duckdb_suggestions = dual_engines_all_null.duckdb_engine.suggest_constraints(
+            columns=["value"],
+            rules=[Rules.DEFAULT]
+        )
+
+        assert isinstance(spark_suggestions, list)
+        assert isinstance(duckdb_suggestions, list)
+
+
+@requires_spark
+class TestSuggestionColumnRestriction:
+    """Parity tests for column restriction in suggestions."""
+
+    def test_restrict_to_columns(self, dual_engines_full: DualEngines):
+        """Column restriction produces consistent suggestions."""
+        spark_suggestions = dual_engines_full.spark_engine.suggest_constraints(
+            columns=["att1", "att2"],
+            rules=[Rules.DEFAULT]
+        )
+        duckdb_suggestions = dual_engines_full.duckdb_engine.suggest_constraints(
+            columns=["att1", "att2"],
+            rules=[Rules.DEFAULT]
+        )
+
+        # Check that suggestions are for the restricted columns
+        spark_columns = {s.column_name for s in spark_suggestions if s.column_name}
+        duckdb_columns = {s.column_name for s in duckdb_suggestions if s.column_name}
+
+        # Both should only include requested columns (or None for dataset-level)
+        for col in spark_columns:
+            if col:
+                assert col in ["att1", "att2"]
+        for col in duckdb_columns:
+            if col:
+                assert col in ["att1", "att2"]
diff --git a/tests/engines/comparison/utils.py b/tests/engines/comparison/utils.py
new file mode 100644
index 0000000..78be3df
--- /dev/null
+++ b/tests/engines/comparison/utils.py
@@ -0,0 +1,434 @@
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Comparison utilities for cross-engine testing.
+
+Provides utilities for comparing results between DuckDB and Spark engines
+with appropriate tolerance levels for different metric types.
+"""
+
+import json
+import math
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
+from pydeequ.engines import MetricResult, ConstraintResult, ColumnProfile
+
+
+# Tolerance levels for comparing floating-point results
+FLOAT_EPSILON = 1e-9     # Exact comparisons: Size, Completeness, Uniqueness
+FLOAT_TOLERANCE = 1e-6   # Statistical: Mean, StdDev, Correlation
+APPROX_TOLERANCE = 0.1   # Approximate algorithms: ApproxCountDistinct (10% relative)
+ENTROPY_TOLERANCE = 1e-4  # Information theory metrics: Entropy, MutualInformation
+
+
+# Mapping of analyzer types to their expected tolerance
+ANALYZER_TOLERANCES: Dict[str, float] = {
+    # Exact metrics
+    "Size": FLOAT_EPSILON,
+    "Completeness": FLOAT_EPSILON,
+    "Uniqueness": FLOAT_EPSILON,
+    "Distinctness": FLOAT_EPSILON,
+    "UniqueValueRatio": FLOAT_EPSILON,
+    "CountDistinct": FLOAT_EPSILON,
+    "MinLength": FLOAT_EPSILON,
+    "MaxLength": FLOAT_EPSILON,
+    "PatternMatch": FLOAT_EPSILON,
+    "Compliance": FLOAT_EPSILON,
+
+    # Statistical metrics
+    "Mean": FLOAT_TOLERANCE,
+    "Sum": FLOAT_TOLERANCE,
+    "Minimum": FLOAT_TOLERANCE,
+    "Maximum": FLOAT_TOLERANCE,
+    "StandardDeviation": FLOAT_TOLERANCE,
+    "Correlation": FLOAT_TOLERANCE,
+    "Entropy": ENTROPY_TOLERANCE,
+    "MutualInformation": ENTROPY_TOLERANCE,
+    "ApproxQuantile": APPROX_TOLERANCE,
+
+    # Approximate metrics
+    "ApproxCountDistinct": APPROX_TOLERANCE,
+}
+
+
+def get_tolerance(analyzer_name: str) -> float:
+    """Get the appropriate tolerance for an analyzer type."""
+    return ANALYZER_TOLERANCES.get(analyzer_name, FLOAT_TOLERANCE)
+
+
+def values_equal(
+    actual: Any,
+    expected: Any,
+    tolerance: float = FLOAT_TOLERANCE
+) -> bool:
+    """Check if two values are equal within tolerance.
+
+    Handles None, NaN, strings, and numeric values appropriately.
+
+    Args:
+        actual: The actual value from DuckDB
+        expected: The expected value from Spark
+        tolerance: The tolerance for numeric comparison
+
+    Returns:
+        True if values are considered equal
+    """
+    # Handle None/null
+    if actual is None and expected is None:
+        return True
+    if actual is None or expected is None:
+        return False
+
+    # Handle NaN
+    if isinstance(actual, float) and isinstance(expected, float):
+        if math.isnan(actual) and math.isnan(expected):
+            return True
+        if math.isnan(actual) or math.isnan(expected):
+            return False
+
+    # Handle JSON strings vs dicts (for Histogram, DataType)
+    if isinstance(actual, str) and not isinstance(expected, str):
+        try:
+            actual = json.loads(actual)
+        except (json.JSONDecodeError, TypeError):
+            pass
+    if isinstance(expected, str) and not isinstance(actual, str):
+        try:
+            expected = json.loads(expected)
+        except (json.JSONDecodeError, TypeError):
+            pass
+
+    # Handle strings
+    if isinstance(actual, str) and isinstance(expected, str):
+        return actual == expected
+
+    # Handle numeric values
+    try:
+        actual_float = float(actual)
+        expected_float = float(expected)
+
+        if tolerance >= APPROX_TOLERANCE:
+            # Relative tolerance for approximate algorithms
+            if expected_float == 0:
+                return abs(actual_float) < tolerance
+            return abs(actual_float - expected_float) / abs(expected_float) < tolerance
+        else:
+            # Absolute tolerance for exact/statistical metrics
+            return abs(actual_float - expected_float) < tolerance
+    except (TypeError, ValueError):
+        # Fall back to exact equality
+        return actual == expected
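+
+
+# A worked illustration of the two tolerance modes (values are hypothetical):
+#
+#   values_equal(105.0, 100.0, APPROX_TOLERANCE)    # True: 5% relative error < 10%
+#   values_equal(1.0 + 1e-7, 1.0, FLOAT_TOLERANCE)  # True: absolute error < 1e-6
+#   values_equal(1.0 + 1e-7, 1.0, FLOAT_EPSILON)    # False: absolute error >= 1e-9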
+ + Args: + actual: The actual value from DuckDB + expected: The expected value from Spark + tolerance: The tolerance for numeric comparison + + Returns: + True if values are considered equal + """ + # Handle None/null + if actual is None and expected is None: + return True + if actual is None or expected is None: + return False + + # Handle NaN + if isinstance(actual, float) and isinstance(expected, float): + if math.isnan(actual) and math.isnan(expected): + return True + if math.isnan(actual) or math.isnan(expected): + return False + + # Handle JSON strings vs dicts (for Histogram, DataType) + if isinstance(actual, str) and not isinstance(expected, str): + try: + actual = json.loads(actual) + except (json.JSONDecodeError, TypeError): + pass + if isinstance(expected, str) and not isinstance(actual, str): + try: + expected = json.loads(expected) + except (json.JSONDecodeError, TypeError): + pass + + # Handle strings + if isinstance(actual, str) and isinstance(expected, str): + return actual == expected + + # Handle numeric values + try: + actual_float = float(actual) + expected_float = float(expected) + + if tolerance >= APPROX_TOLERANCE: + # Relative tolerance for approximate algorithms + if expected_float == 0: + return abs(actual_float) < tolerance + return abs(actual_float - expected_float) / abs(expected_float) < tolerance + else: + # Absolute tolerance for exact/statistical metrics + return abs(actual_float - expected_float) < tolerance + except (TypeError, ValueError): + # Fall back to exact equality + return actual == expected + + +@dataclass +class MetricDifference: + """Represents a difference between two metric results.""" + name: str + instance: Optional[str] + spark_value: Any + duckdb_value: Any + tolerance: float + is_match: bool + message: str = "" + + +@dataclass +class ComparisonReport: + """Report comparing results from two engines.""" + total_metrics: int = 0 + matching_metrics: int = 0 + differing_metrics: int = 0 + spark_only_metrics: int = 0 + duckdb_only_metrics: int = 0 + differences: List[MetricDifference] = field(default_factory=list) + + @property + def success(self) -> bool: + """True if all metrics match within tolerance.""" + return self.differing_metrics == 0 and self.spark_only_metrics == 0 and self.duckdb_only_metrics == 0 + + def summary(self) -> str: + """Generate a summary string.""" + lines = [ + f"Comparison Report:", + f" Total metrics: {self.total_metrics}", + f" Matching: {self.matching_metrics}", + f" Differing: {self.differing_metrics}", + f" Spark-only: {self.spark_only_metrics}", + f" DuckDB-only: {self.duckdb_only_metrics}", + ] + if self.differences: + lines.append(" Differences:") + for diff in self.differences: + lines.append(f" - {diff.name}({diff.instance}): Spark={diff.spark_value}, DuckDB={diff.duckdb_value}") + return "\n".join(lines) + + +def index_metrics(metrics: List[MetricResult]) -> Dict[Tuple[str, str], MetricResult]: + """Index metrics by (name, instance) tuple for efficient lookup.""" + return {(m.name, m.instance or ""): m for m in metrics} + + +def compare_metrics( + spark_metrics: List[MetricResult], + duckdb_metrics: List[MetricResult] +) -> ComparisonReport: + """Compare metric results from Spark and DuckDB engines. 
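+
+    Example (typical call pattern; assert_metrics_match below wraps exactly this):
+        report = compare_metrics(spark_metrics, duckdb_metrics)
+        if not report.success:
+            raise AssertionError(report.summary())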
+ + Args: + spark_metrics: Metrics computed by Spark engine + duckdb_metrics: Metrics computed by DuckDB engine + + Returns: + ComparisonReport with detailed comparison results + """ + report = ComparisonReport() + + # Index by (name, instance) + spark_index = index_metrics(spark_metrics) + duckdb_index = index_metrics(duckdb_metrics) + + all_keys = set(spark_index.keys()) | set(duckdb_index.keys()) + report.total_metrics = len(all_keys) + + for key in all_keys: + name, instance = key + tolerance = get_tolerance(name) + + spark_metric = spark_index.get(key) + duckdb_metric = duckdb_index.get(key) + + if spark_metric is None: + report.duckdb_only_metrics += 1 + report.differences.append(MetricDifference( + name=name, + instance=instance, + spark_value=None, + duckdb_value=duckdb_metric.value if duckdb_metric else None, + tolerance=tolerance, + is_match=False, + message="Metric only in DuckDB" + )) + elif duckdb_metric is None: + report.spark_only_metrics += 1 + report.differences.append(MetricDifference( + name=name, + instance=instance, + spark_value=spark_metric.value, + duckdb_value=None, + tolerance=tolerance, + is_match=False, + message="Metric only in Spark" + )) + else: + is_match = values_equal(spark_metric.value, duckdb_metric.value, tolerance) + if is_match: + report.matching_metrics += 1 + else: + report.differing_metrics += 1 + report.differences.append(MetricDifference( + name=name, + instance=instance, + spark_value=spark_metric.value, + duckdb_value=duckdb_metric.value, + tolerance=tolerance, + is_match=False, + message=f"Values differ (tolerance={tolerance})" + )) + + return report + + +def compare_constraint_results( + spark_results: List[ConstraintResult], + duckdb_results: List[ConstraintResult] +) -> ComparisonReport: + """Compare constraint results from Spark and DuckDB engines. + + Comparison is done by position within each check group, since constraint + names may differ between engines (e.g., Spark uses 'SizeConstraint(Size(None))' + while DuckDB uses 'hasSize(assertion)'). 
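+
+    Example (same call pattern as compare_metrics):
+        report = compare_constraint_results(spark_results, duckdb_results)
+        assert report.success, report.summary()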
+ + Args: + spark_results: Constraint results from Spark engine + duckdb_results: Constraint results from DuckDB engine + + Returns: + ComparisonReport with detailed comparison results + """ + report = ComparisonReport() + + # Group results by check_description to maintain ordering within checks + def group_by_check(results: List[ConstraintResult]) -> Dict[str, List[ConstraintResult]]: + groups: Dict[str, List[ConstraintResult]] = {} + for r in results: + key = r.check_description + if key not in groups: + groups[key] = [] + groups[key].append(r) + return groups + + spark_groups = group_by_check(spark_results) + duckdb_groups = group_by_check(duckdb_results) + + all_checks = set(spark_groups.keys()) | set(duckdb_groups.keys()) + + for check_desc in all_checks: + spark_list = spark_groups.get(check_desc, []) + duckdb_list = duckdb_groups.get(check_desc, []) + + # Compare by position within each check + max_len = max(len(spark_list), len(duckdb_list)) + report.total_metrics += max_len + + for i in range(max_len): + spark_result = spark_list[i] if i < len(spark_list) else None + duckdb_result = duckdb_list[i] if i < len(duckdb_list) else None + + if spark_result is None: + report.duckdb_only_metrics += 1 + elif duckdb_result is None: + report.spark_only_metrics += 1 + else: + # Compare constraint status + spark_status = spark_result.constraint_status + duckdb_status = duckdb_result.constraint_status + + if spark_status == duckdb_status: + report.matching_metrics += 1 + else: + report.differing_metrics += 1 + report.differences.append(MetricDifference( + name="ConstraintStatus", + instance=f"{check_desc}[{i}]", + spark_value=str(spark_status), + duckdb_value=str(duckdb_status), + tolerance=0, + is_match=False, + message="Constraint status differs" + )) + + return report + + +def compare_profiles( + spark_profiles: List[ColumnProfile], + duckdb_profiles: List[ColumnProfile] +) -> ComparisonReport: + """Compare column profiles from Spark and DuckDB engines. 
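+
+    Example (inspecting per-attribute differences on a mismatch):
+        report = compare_profiles(spark_profiles, duckdb_profiles)
+        for diff in report.differences:
+            print(diff.instance, diff.name, diff.spark_value, diff.duckdb_value)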
+ + Args: + spark_profiles: Column profiles from Spark engine + duckdb_profiles: Column profiles from DuckDB engine + + Returns: + ComparisonReport with detailed comparison results + """ + report = ComparisonReport() + + # Index by column name + spark_index = {p.column: p for p in spark_profiles} + duckdb_index = {p.column: p for p in duckdb_profiles} + + all_columns = set(spark_index.keys()) | set(duckdb_index.keys()) + + for column in all_columns: + spark_profile = spark_index.get(column) + duckdb_profile = duckdb_index.get(column) + + if spark_profile is None: + report.duckdb_only_metrics += 1 + continue + if duckdb_profile is None: + report.spark_only_metrics += 1 + continue + + # Compare profile attributes + attrs_to_compare = [ + ("completeness", FLOAT_EPSILON), + ("approx_distinct_values", APPROX_TOLERANCE), + ("mean", FLOAT_TOLERANCE), + ("minimum", FLOAT_TOLERANCE), + ("maximum", FLOAT_TOLERANCE), + ("sum", FLOAT_TOLERANCE), + ("std_dev", APPROX_TOLERANCE), # Use relative tolerance for sample vs pop + ] + + for attr, tolerance in attrs_to_compare: + spark_val = getattr(spark_profile, attr, None) + duckdb_val = getattr(duckdb_profile, attr, None) + + report.total_metrics += 1 + + if values_equal(spark_val, duckdb_val, tolerance): + report.matching_metrics += 1 + else: + report.differing_metrics += 1 + report.differences.append(MetricDifference( + name=attr, + instance=column, + spark_value=spark_val, + duckdb_value=duckdb_val, + tolerance=tolerance, + is_match=False, + message=f"Profile attribute {attr} differs" + )) + + return report + + +def assert_metrics_match( + spark_metrics: List[MetricResult], + duckdb_metrics: List[MetricResult], + msg: str = "" +) -> None: + """Assert that metrics from both engines match within tolerance. + + Raises: + AssertionError: If metrics don't match + """ + report = compare_metrics(spark_metrics, duckdb_metrics) + if not report.success: + raise AssertionError(f"{msg}\n{report.summary()}") + + +def assert_constraints_match( + spark_results: List[ConstraintResult], + duckdb_results: List[ConstraintResult], + msg: str = "" +) -> None: + """Assert that constraint results from both engines match. + + Raises: + AssertionError: If results don't match + """ + report = compare_constraint_results(spark_results, duckdb_results) + if not report.success: + raise AssertionError(f"{msg}\n{report.summary()}") + + +def assert_profiles_match( + spark_profiles: List[ColumnProfile], + duckdb_profiles: List[ColumnProfile], + msg: str = "" +) -> None: + """Assert that profiles from both engines match within tolerance. + + Raises: + AssertionError: If profiles don't match + """ + report = compare_profiles(spark_profiles, duckdb_profiles) + if not report.success: + raise AssertionError(f"{msg}\n{report.summary()}") diff --git a/tests/engines/conftest.py b/tests/engines/conftest.py new file mode 100644 index 0000000..5a53f9d --- /dev/null +++ b/tests/engines/conftest.py @@ -0,0 +1,330 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +"""DuckDB engine test fixtures. + +Provides fixtures for creating DuckDB engines with various test datasets. +These fixtures are used by DuckDB-only tests that don't require Spark. +""" + +from typing import Callable, Generator +import pytest +import duckdb +import pandas as pd + +from pydeequ.engines.duckdb import DuckDBEngine +from tests.engines.fixtures.datasets import ( + create_df_full, + create_df_missing, + create_df_numeric, + create_df_unique, + create_df_distinct, + create_df_string_lengths, + create_df_empty, + create_df_single, + create_df_all_null, + create_df_escape, + create_df_correlation, + create_df_entropy, + create_df_where, + create_df_pattern, + create_df_compliance, + create_df_quantile, + create_df_contained_in, + create_df_histogram, + create_df_mutual_info, + create_df_data_type, + DATASET_FACTORIES, +) + + +@pytest.fixture(scope="module") +def duckdb_connection() -> Generator[duckdb.DuckDBPyConnection, None, None]: + """Create a module-scoped DuckDB connection.""" + conn = duckdb.connect(":memory:") + yield conn + conn.close() + + +def _create_engine_from_df( + conn: duckdb.DuckDBPyConnection, + df: pd.DataFrame, + table_name: str +) -> DuckDBEngine: + """Helper to create a DuckDB engine from a pandas DataFrame.""" + # Register the DataFrame as a table + conn.register(table_name, df) + # Create engine pointing to the table + return DuckDBEngine(conn, table_name) + + +@pytest.fixture(scope="function") +def engine_factory(duckdb_connection: duckdb.DuckDBPyConnection) -> Callable[[str], DuckDBEngine]: + """Factory fixture to create DuckDB engines for any dataset. + + Usage: + def test_something(engine_factory): + engine = engine_factory("df_full") + results = engine.compute_metrics([Size()]) + """ + tables_created = [] + + def factory(dataset_name: str) -> DuckDBEngine: + if dataset_name not in DATASET_FACTORIES: + raise ValueError(f"Unknown dataset: {dataset_name}") + + table_name = f"test_{dataset_name}" + df = DATASET_FACTORIES[dataset_name]() + + # Unregister if already exists (for reuse in same test) + try: + duckdb_connection.unregister(table_name) + except Exception: + pass + + duckdb_connection.register(table_name, df) + tables_created.append(table_name) + + return DuckDBEngine(duckdb_connection, table_name) + + yield factory + + # Cleanup: unregister all tables + for table_name in tables_created: + try: + duckdb_connection.unregister(table_name) + except Exception: + pass + + +# Individual dataset fixtures for convenience + + +@pytest.fixture(scope="function") +def engine_full(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_full dataset.""" + table_name = "test_df_full" + df = create_df_full() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_missing(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_missing dataset.""" + table_name = "test_df_missing" + df = create_df_missing() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_numeric(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with 
df_numeric dataset.""" + table_name = "test_df_numeric" + df = create_df_numeric() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_unique(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_unique dataset.""" + table_name = "test_df_unique" + df = create_df_unique() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_distinct(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_distinct dataset.""" + table_name = "test_df_distinct" + df = create_df_distinct() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_string_lengths(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_string_lengths dataset.""" + table_name = "test_df_string_lengths" + df = create_df_string_lengths() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_empty(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_empty dataset.""" + table_name = "test_df_empty" + df = create_df_empty() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_single(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_single dataset.""" + table_name = "test_df_single" + df = create_df_single() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_all_null(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_all_null dataset.""" + table_name = "test_df_all_null" + df = create_df_all_null() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_escape(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_escape dataset.""" + table_name = "test_df_escape" + df = create_df_escape() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_correlation(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_correlation dataset.""" + table_name = "test_df_correlation" + df = create_df_correlation() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_entropy(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine 
with df_entropy dataset.""" + table_name = "test_df_entropy" + df = create_df_entropy() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_where(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_where dataset.""" + table_name = "test_df_where" + df = create_df_where() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_pattern(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_pattern dataset.""" + table_name = "test_df_pattern" + df = create_df_pattern() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_compliance(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_compliance dataset.""" + table_name = "test_df_compliance" + df = create_df_compliance() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_quantile(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_quantile dataset.""" + table_name = "test_df_quantile" + df = create_df_quantile() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_contained_in(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_contained_in dataset.""" + table_name = "test_df_contained_in" + df = create_df_contained_in() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_histogram(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_histogram dataset.""" + table_name = "test_df_histogram" + df = create_df_histogram() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_mutual_info(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_mutual_info dataset.""" + table_name = "test_df_mutual_info" + df = create_df_mutual_info() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +@pytest.fixture(scope="function") +def engine_data_type(duckdb_connection: duckdb.DuckDBPyConnection) -> Generator[DuckDBEngine, None, None]: + """DuckDB engine with df_data_type dataset.""" + table_name = "test_df_data_type" + df = create_df_data_type() + duckdb_connection.register(table_name, df) + yield DuckDBEngine(duckdb_connection, table_name) + duckdb_connection.unregister(table_name) + + +# Helper function for metric lookup +def get_metric_value(metrics, name: str, instance: str = None) -> float: + """Extract a 
metric value from results by name and optionally instance.""" + for m in metrics: + if m.name == name: + if instance is None or m.instance == instance: + return m.value + return None + + +def get_metric(metrics, name: str, instance: str = None): + """Extract a metric result from results by name and optionally instance.""" + for m in metrics: + if m.name == name: + if instance is None or m.instance == instance: + return m + return None diff --git a/tests/engines/fixtures/__init__.py b/tests/engines/fixtures/__init__.py new file mode 100644 index 0000000..73c4067 --- /dev/null +++ b/tests/engines/fixtures/__init__.py @@ -0,0 +1,36 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test fixtures module. + +Contains test dataset definitions ported from the original Deequ Scala +implementation for comprehensive edge case coverage. +""" + +from .datasets import ( + create_df_full, + create_df_missing, + create_df_numeric, + create_df_unique, + create_df_distinct, + create_df_string_lengths, + create_df_empty, + create_df_single, + create_df_all_null, + create_df_escape, + create_df_correlation, + create_df_entropy, + create_df_where, + EXPECTED_VALUES, +) diff --git a/tests/engines/fixtures/datasets.py b/tests/engines/fixtures/datasets.py new file mode 100644 index 0000000..af002e8 --- /dev/null +++ b/tests/engines/fixtures/datasets.py @@ -0,0 +1,529 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test dataset definitions ported from Deequ Scala FixtureSupport. + +These datasets provide comprehensive edge case coverage for testing +analyzers, constraints, profiles, and suggestions. +""" + +from typing import Any, Dict, List, Tuple +import math + +import pandas as pd + + +def create_df_full() -> pd.DataFrame: + """Basic complete data with no nulls (4 rows). + + Purpose: Basic complete data testing + Edge cases: No nulls, simple values + """ + return pd.DataFrame({ + "att1": ["a", "b", "c", "a"], + "att2": ["d", "e", "f", "d"], + "item": [1, 2, 3, 4], + "price": [10.0, 20.0, 30.0, 40.0], + }) + + +def create_df_missing() -> pd.DataFrame: + """Dataset with NULL handling patterns (12 rows). 
+ + Purpose: NULL handling tests + Edge cases: att1: 50% complete, att2: 75% complete + """ + return pd.DataFrame({ + "att1": ["a", "b", None, None, "e", "f", None, None, "i", "j", None, None], + "att2": ["d", "e", "f", None, "h", "i", "j", None, "l", "m", "n", None], + "item": list(range(1, 13)), + }) + + +def create_df_numeric() -> pd.DataFrame: + """Dataset for statistical tests (6 rows). + + Purpose: Statistical analyzer tests (Mean, Sum, Min, Max, StdDev) + Edge cases: Mean=3.5, includes NULL column + Values: 1, 2, 3, 4, 5, 6 -> Mean=3.5, Sum=21, Min=1, Max=6 + StdDev (population) = sqrt(17.5/6) ≈ 1.7078 + """ + return pd.DataFrame({ + "att1": [1, 2, 3, 4, 5, 6], + "att2": [1.0, 2.0, 3.0, 4.0, 5.0, None], # One NULL + "item": ["a", "b", "c", "d", "e", "f"], + }) + + +def create_df_unique() -> pd.DataFrame: + """Dataset for uniqueness pattern tests (6 rows). + + Purpose: Uniqueness analyzer tests + Edge cases: Various uniqueness scenarios + - unique_col: All unique values (uniqueness=1.0) + - half_null: 50% null (completeness=0.5) + - non_unique: Duplicates present + """ + return pd.DataFrame({ + "unique_col": [1, 2, 3, 4, 5, 6], + "half_null": [1, None, 3, None, 5, None], + "non_unique": [1, 1, 2, 2, 3, 3], + "all_same": [1, 1, 1, 1, 1, 1], + }) + + +def create_df_distinct() -> pd.DataFrame: + """Dataset for distinctness testing with duplicates (6 rows). + + Purpose: Distinctness and uniqueness ratio testing + Edge cases: 3 distinct values in att1 with duplicates + - att1: ["a", "a", "b", "b", "c", "c"] -> 3 distinct, 0 unique + - att2: ["x", "y", "z", "w", "v", "u"] -> 6 distinct, 6 unique + """ + return pd.DataFrame({ + "att1": ["a", "a", "b", "b", "c", "c"], + "att2": ["x", "y", "z", "w", "v", "u"], + "item": [1, 2, 3, 4, 5, 6], + }) + + +def create_df_string_lengths() -> pd.DataFrame: + """Dataset for string length edge cases (5 rows). + + Purpose: MinLength and MaxLength analyzer tests + Edge cases: Empty string (""), varying lengths + Lengths: 0, 1, 2, 3, 4 + """ + return pd.DataFrame({ + "att1": ["", "a", "bb", "ccc", "dddd"], + "att2": ["hello", "world", "test", "data", "value"], + "item": [1, 2, 3, 4, 5], + }) + + +def create_df_empty() -> pd.DataFrame: + """Empty dataset with schema (0 rows). + + Purpose: Edge case testing with zero rows + Edge cases: Size=0, Completeness=1.0 (vacuously true for empty) + """ + return pd.DataFrame({ + "att1": pd.Series([], dtype="object"), + "att2": pd.Series([], dtype="object"), + "item": pd.Series([], dtype="int64"), + }) + + +def create_df_single() -> pd.DataFrame: + """Minimal dataset with single row (1 row). + + Purpose: Minimal dataset edge case testing + Edge cases: StdDev undefined/NaN, Uniqueness=1.0 + """ + return pd.DataFrame({ + "att1": ["a"], + "att2": ["d"], + "item": [1], + "price": [10.0], + }) + + +def create_df_all_null() -> pd.DataFrame: + """Dataset with all-NULL column (3 rows). + + Purpose: 0% completeness edge case testing + Edge cases: Completeness=0, Mean=NULL + """ + return pd.DataFrame({ + "value": [None, None, None], + "item": [1, 2, 3], + }) + + +def create_df_escape() -> pd.DataFrame: + """Dataset with special characters (8 rows). 
+ + Purpose: Special character and regex escaping tests + Edge cases: Quotes, special characters (@#$%^&) + """ + return pd.DataFrame({ + "att1": [ + 'hello "world"', + "it's working", + "test@example.com", + "#hashtag", + "$money$", + "%percent%", + "^caret^", + "&ersand&", + ], + "att2": ["normal", "values", "here", "for", "comparison", "testing", "edge", "cases"], + "item": list(range(1, 9)), + }) + + +def create_df_correlation() -> pd.DataFrame: + """Dataset for correlation testing (5 rows). + + Purpose: Correlation analyzer tests + Edge cases: Perfect +1.0 and -1.0 correlation + - x and y: perfectly positively correlated (1.0) + - x and z: perfectly negatively correlated (-1.0) + """ + return pd.DataFrame({ + "x": [1.0, 2.0, 3.0, 4.0, 5.0], + "y": [2.0, 4.0, 6.0, 8.0, 10.0], # y = 2x, correlation = 1.0 + "z": [5.0, 4.0, 3.0, 2.0, 1.0], # z = 6-x, correlation = -1.0 + "w": [1.0, 1.0, 1.0, 1.0, 1.0], # constant, correlation undefined + }) + + +def create_df_entropy() -> pd.DataFrame: + """Dataset for entropy testing (4 rows). + + Purpose: Entropy analyzer tests + Edge cases: Uniform vs skewed distribution + - uniform: 4 distinct values each appearing once -> entropy = ln(4) ≈ 1.386 + - skewed: 1 value appearing 3 times, 1 appearing once -> entropy < 1.386 + """ + return pd.DataFrame({ + "uniform": ["a", "b", "c", "d"], # Entropy = ln(4) ≈ 1.386 + "skewed": ["a", "a", "a", "b"], # Entropy = -(3/4)ln(3/4) - (1/4)ln(1/4) ≈ 0.562 + "constant": ["x", "x", "x", "x"], # Entropy = 0 (single value) + "item": [1, 2, 3, 4], + }) + + +def create_df_where() -> pd.DataFrame: + """Dataset for WHERE clause filtering tests (4 rows). + + Purpose: WHERE clause filter testing + Edge cases: Mixed completeness by filter + - When filtered by category='A': att1 is complete + - When filtered by category='B': att1 has nulls + """ + return pd.DataFrame({ + "category": ["A", "A", "B", "B"], + "att1": ["x", "y", None, "w"], # A: 2/2 complete, B: 1/2 complete + "att2": [1, None, 3, 4], # A: 1/2 complete, B: 2/2 complete + "value": [10.0, 20.0, 30.0, 40.0], + }) + + +def create_df_pattern() -> pd.DataFrame: + """Dataset for pattern matching tests (6 rows). + + Purpose: PatternMatch analyzer and regex compliance tests + Edge cases: Email patterns, phone patterns, mixed valid/invalid + """ + return pd.DataFrame({ + "email": [ + "test@example.com", + "user@domain.org", + "invalid-email", + "another@test.co.uk", + "bad@", + "good.name@company.com", + ], + "phone": [ + "123-456-7890", + "987-654-3210", + "invalid", + "555-123-4567", + "1234567890", + "800-555-1234", + ], + "code": ["ABC123", "DEF456", "xyz789", "GHI012", "JKL345", "mno678"], + "item": list(range(1, 7)), + }) + + +def create_df_compliance() -> pd.DataFrame: + """Dataset for compliance predicate tests (6 rows). + + Purpose: Compliance and satisfies constraint tests + Edge cases: Positive/negative numbers, boundary conditions + """ + return pd.DataFrame({ + "positive": [1, 2, 3, 4, 5, 6], + "negative": [-1, -2, -3, -4, -5, -6], + "mixed": [-2, -1, 0, 1, 2, 3], + "with_null": [1, 2, None, 4, 5, None], + "item": list(range(1, 7)), + }) + + +def create_df_quantile() -> pd.DataFrame: + """Dataset for quantile testing (10 rows). 
+
+    Purpose: ApproxQuantile analyzer tests
+    Edge cases: Sorted values for predictable quantiles
+    Values: 1-10, Median (50th percentile) = 5.5
+    """
+    return pd.DataFrame({
+        "value": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        "item": list(range(1, 11)),
+    })
+
+
+def create_df_contained_in() -> pd.DataFrame:
+    """Dataset for isContainedIn constraint tests (6 rows).
+
+    Purpose: Testing containment in allowed value sets
+    Edge cases: All valid values vs. some outside the allowed set; no NULLs present
+    """
+    return pd.DataFrame({
+        "status": ["active", "inactive", "pending", "active", "inactive", "active"],
+        "category": ["A", "B", "C", "A", "B", "D"],  # D is not in typical allowed set
+        "priority": [1, 2, 3, 1, 2, 4],  # 4 might be outside allowed range
+        "item": list(range(1, 7)),
+    })
+
+
+def create_df_histogram() -> pd.DataFrame:
+    """Dataset for histogram testing (10 rows).
+
+    Purpose: Histogram analyzer tests
+    Edge cases: Low-cardinality categorical data
+    """
+    return pd.DataFrame({
+        "category": ["A", "A", "A", "B", "B", "C", "C", "C", "C", "D"],
+        "status": ["active", "active", "inactive", "active", "inactive",
+                   "active", "active", "inactive", "inactive", "active"],
+        "item": list(range(1, 11)),
+    })
+
+
+def create_df_mutual_info() -> pd.DataFrame:
+    """Dataset for mutual information testing (8 rows).
+
+    Purpose: MutualInformation analyzer tests
+    Edge cases: Perfectly dependent vs. only partially dependent columns
+    """
+    return pd.DataFrame({
+        "x": ["a", "a", "b", "b", "c", "c", "d", "d"],
+        "y_dependent": ["a", "a", "b", "b", "c", "c", "d", "d"],  # Perfectly dependent on x
+        "y_independent": ["p", "q", "r", "s", "p", "q", "r", "s"],  # Partially dependent: each x value maps to 2 of 4 y values
+        "item": list(range(1, 9)),
+    })
+
+
+def create_df_data_type() -> pd.DataFrame:
+    """Dataset for DataType analyzer testing.
+ + Purpose: Testing data type inference + Edge cases: Mixed numeric strings, pure numeric, non-numeric + """ + return pd.DataFrame({ + "numeric_strings": ["1", "2", "3", "4", "5"], + "mixed": ["1", "2", "three", "4", "five"], + "pure_numeric": [1.0, 2.0, 3.0, 4.0, 5.0], + "strings": ["a", "b", "c", "d", "e"], + "item": list(range(1, 6)), + }) + + +# Expected values registry for DuckDB-only tests +# Key: (dataset_name, analyzer_name, instance) -> expected_value +# instance can be a column name, tuple of columns, or None for dataset-level metrics +EXPECTED_VALUES: Dict[Tuple[str, str, Any], float] = { + # Size analyzer + ("df_full", "Size", None): 4.0, + ("df_missing", "Size", None): 12.0, + ("df_numeric", "Size", None): 6.0, + ("df_empty", "Size", None): 0.0, + ("df_single", "Size", None): 1.0, + + # Completeness analyzer + ("df_full", "Completeness", "att1"): 1.0, + ("df_full", "Completeness", "att2"): 1.0, + ("df_missing", "Completeness", "att1"): 0.5, # 6/12 + ("df_missing", "Completeness", "att2"): 0.75, # 9/12 + ("df_all_null", "Completeness", "value"): 0.0, + ("df_single", "Completeness", "att1"): 1.0, + ("df_unique", "Completeness", "unique_col"): 1.0, + ("df_unique", "Completeness", "half_null"): 0.5, # 3/6 + + # Mean analyzer + ("df_numeric", "Mean", "att1"): 3.5, # (1+2+3+4+5+6)/6 + ("df_numeric", "Mean", "att2"): 3.0, # (1+2+3+4+5)/5, NULL excluded + ("df_single", "Mean", "item"): 1.0, + ("df_single", "Mean", "price"): 10.0, + + # Sum analyzer + ("df_numeric", "Sum", "att1"): 21.0, # 1+2+3+4+5+6 + ("df_numeric", "Sum", "att2"): 15.0, # 1+2+3+4+5, NULL excluded + ("df_single", "Sum", "item"): 1.0, + ("df_single", "Sum", "price"): 10.0, + + # Minimum analyzer + ("df_numeric", "Minimum", "att1"): 1.0, + ("df_numeric", "Minimum", "att2"): 1.0, + ("df_single", "Minimum", "item"): 1.0, + ("df_single", "Minimum", "price"): 10.0, + + # Maximum analyzer + ("df_numeric", "Maximum", "att1"): 6.0, + ("df_numeric", "Maximum", "att2"): 5.0, # 6 is NULL position + ("df_single", "Maximum", "item"): 1.0, + ("df_single", "Maximum", "price"): 10.0, + + # StandardDeviation analyzer (population stddev) + ("df_numeric", "StandardDeviation", "att1"): 1.7078251276599330, # sqrt(17.5/6) + + # String length analyzers + ("df_string_lengths", "MinLength", "att1"): 0.0, # Empty string + ("df_string_lengths", "MaxLength", "att1"): 4.0, # "dddd" + ("df_string_lengths", "MinLength", "att2"): 4.0, # "test", "data" + ("df_string_lengths", "MaxLength", "att2"): 5.0, # "hello", "world", "value" + + # Distinctness analyzer (distinct values / total rows) + ("df_distinct", "Distinctness", "att1"): 0.5, # 3 distinct / 6 rows + ("df_distinct", "Distinctness", "att2"): 1.0, # 6 distinct / 6 rows + ("df_unique", "Distinctness", "all_same"): 1/6, # 1 distinct / 6 rows + + # Uniqueness analyzer (rows with unique values / total rows) + ("df_distinct", "Uniqueness", "att1"): 0.0, # No unique values (all duplicated) + ("df_distinct", "Uniqueness", "att2"): 1.0, # All values are unique + ("df_unique", "Uniqueness", "unique_col"): 1.0, # All values unique + ("df_unique", "Uniqueness", "non_unique"): 0.0, # All values duplicated + + # UniqueValueRatio analyzer (unique values / distinct values) + ("df_distinct", "UniqueValueRatio", "att1"): 0.0, # 0 unique / 3 distinct + ("df_distinct", "UniqueValueRatio", "att2"): 1.0, # 6 unique / 6 distinct + + # Correlation analyzer + ("df_correlation", "Correlation", ("x", "y")): 1.0, # Perfect positive + ("df_correlation", "Correlation", ("x", "z")): -1.0, # Perfect negative + + # Entropy 
analyzer + ("df_entropy", "Entropy", "uniform"): 1.3862943611198906, # ln(4) for 4 uniform values + ("df_entropy", "Entropy", "constant"): 0.0, # Single value = 0 entropy + + # ApproxCountDistinct analyzer + ("df_full", "ApproxCountDistinct", "att1"): 3.0, # "a", "b", "c" (a appears twice) + ("df_full", "ApproxCountDistinct", "item"): 4.0, # 1, 2, 3, 4 + ("df_distinct", "ApproxCountDistinct", "att1"): 3.0, # "a", "b", "c" + ("df_distinct", "ApproxCountDistinct", "att2"): 6.0, # All distinct + + # CountDistinct analyzer + ("df_full", "CountDistinct", "att1"): 3.0, + ("df_distinct", "CountDistinct", "att1"): 3.0, + ("df_distinct", "CountDistinct", "att2"): 6.0, + + # PatternMatch analyzer (fraction of rows matching pattern) + # These will be tested with specific patterns in the tests + + # Compliance analyzer (fraction of rows satisfying predicate) + ("df_compliance", "Compliance", "positive > 0"): 1.0, # All positive + ("df_compliance", "Compliance", "negative < 0"): 1.0, # All negative + ("df_compliance", "Compliance", "mixed > 0"): 0.5, # 3/6 > 0 + + # Quantile analyzer (approximate) + ("df_quantile", "ApproxQuantile", ("value", 0.5)): 5.5, # Median + ("df_quantile", "ApproxQuantile", ("value", 0.25)): 3.0, # 25th percentile (approx) + ("df_quantile", "ApproxQuantile", ("value", 0.75)): 8.0, # 75th percentile (approx) +} + + +# Tolerance levels for comparing floating-point results +FLOAT_EPSILON = 1e-9 # Exact comparisons: Size, Completeness, Uniqueness +FLOAT_TOLERANCE = 1e-6 # Statistical: Mean, StdDev, Correlation +APPROX_TOLERANCE = 0.1 # Approximate algorithms: ApproxCountDistinct (10% relative) + + +# Mapping of analyzer types to their expected tolerance +ANALYZER_TOLERANCES: Dict[str, float] = { + # Exact metrics + "Size": FLOAT_EPSILON, + "Completeness": FLOAT_EPSILON, + "Uniqueness": FLOAT_EPSILON, + "Distinctness": FLOAT_EPSILON, + "UniqueValueRatio": FLOAT_EPSILON, + "CountDistinct": FLOAT_EPSILON, + "MinLength": FLOAT_EPSILON, + "MaxLength": FLOAT_EPSILON, + "PatternMatch": FLOAT_EPSILON, + "Compliance": FLOAT_EPSILON, + + # Statistical metrics + "Mean": FLOAT_TOLERANCE, + "Sum": FLOAT_TOLERANCE, + "Minimum": FLOAT_TOLERANCE, + "Maximum": FLOAT_TOLERANCE, + "StandardDeviation": FLOAT_TOLERANCE, + "Correlation": FLOAT_TOLERANCE, + "Entropy": FLOAT_TOLERANCE, + "MutualInformation": FLOAT_TOLERANCE, + "ApproxQuantile": FLOAT_TOLERANCE, + + # Approximate metrics + "ApproxCountDistinct": APPROX_TOLERANCE, +} + + +def get_tolerance(analyzer_name: str) -> float: + """Get the appropriate tolerance for an analyzer type.""" + return ANALYZER_TOLERANCES.get(analyzer_name, FLOAT_TOLERANCE) + + +def is_close(actual: float, expected: float, tolerance: float) -> bool: + """Check if two values are close within tolerance. + + For APPROX_TOLERANCE, uses relative comparison. + For smaller tolerances, uses absolute comparison. 
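+
+    Example (illustrative):
+        is_close(98.0, 100.0, APPROX_TOLERANCE)  # True: 2% relative error < 10%
+        is_close(3.5, 3.51, FLOAT_TOLERANCE)     # False: 0.01 exceeds 1e-6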
+ """ + if expected is None or actual is None: + return expected is None and actual is None + + if tolerance >= APPROX_TOLERANCE: + # Relative tolerance for approximate algorithms + if expected == 0: + return abs(actual) < tolerance + return abs(actual - expected) / abs(expected) < tolerance + else: + # Absolute tolerance for exact/statistical metrics + return abs(actual - expected) < tolerance + + +# Dataset factory registry +DATASET_FACTORIES = { + "df_full": create_df_full, + "df_missing": create_df_missing, + "df_numeric": create_df_numeric, + "df_unique": create_df_unique, + "df_distinct": create_df_distinct, + "df_string_lengths": create_df_string_lengths, + "df_empty": create_df_empty, + "df_single": create_df_single, + "df_all_null": create_df_all_null, + "df_escape": create_df_escape, + "df_correlation": create_df_correlation, + "df_entropy": create_df_entropy, + "df_where": create_df_where, + "df_pattern": create_df_pattern, + "df_compliance": create_df_compliance, + "df_quantile": create_df_quantile, + "df_contained_in": create_df_contained_in, + "df_histogram": create_df_histogram, + "df_mutual_info": create_df_mutual_info, + "df_data_type": create_df_data_type, +} + + +def get_dataset(name: str) -> pd.DataFrame: + """Get a dataset by name.""" + if name not in DATASET_FACTORIES: + raise ValueError(f"Unknown dataset: {name}. Available: {list(DATASET_FACTORIES.keys())}") + return DATASET_FACTORIES[name]() diff --git a/tests/engines/test_constraint_evaluators.py b/tests/engines/test_constraint_evaluators.py new file mode 100644 index 0000000..2b67027 --- /dev/null +++ b/tests/engines/test_constraint_evaluators.py @@ -0,0 +1,473 @@ +# -*- coding: utf-8 -*- +""" +Unit tests for constraint evaluators. + +These tests verify the constraint evaluator abstractions work correctly +in isolation, testing SQL generation and evaluation logic. 
+""" + +import pandas as pd +import pytest + +from pydeequ.engines.constraints import ( + ConstraintEvaluatorFactory, + BaseEvaluator, + RatioCheckEvaluator, + AnalyzerBasedEvaluator, + # Analyzer-based evaluators + SizeEvaluator, + CompletenessEvaluator, + MeanEvaluator, + MinimumEvaluator, + MaximumEvaluator, + SumEvaluator, + StandardDeviationEvaluator, + UniquenessEvaluator, + DistinctnessEvaluator, + UniqueValueRatioEvaluator, + CorrelationEvaluator, + EntropyEvaluator, + MutualInformationEvaluator, + PatternMatchEvaluator, + MinLengthEvaluator, + MaxLengthEvaluator, + ApproxCountDistinctEvaluator, + ApproxQuantileEvaluator, + ComplianceEvaluator, + # Ratio-check evaluators + IsPositiveEvaluator, + IsNonNegativeEvaluator, + IsContainedInEvaluator, + ContainsEmailEvaluator, + ContainsURLEvaluator, + ContainsCreditCardEvaluator, + ContainsSSNEvaluator, + # Comparison evaluators + ColumnComparisonEvaluator, + # Multi-column evaluators + MultiColumnCompletenessEvaluator, +) + + +class MockConstraintProto: + """Mock constraint protobuf for testing.""" + + def __init__( + self, + type: str = "test", + column: str = "", + columns: list = None, + where: str = "", + pattern: str = "", + column_condition: str = "", + constraint_name: str = "", + allowed_values: list = None, + quantile: float = 0.5, + assertion=None, + ): + self.type = type + self.column = column + self.columns = columns or [] + self.where = where + self.pattern = pattern + self.column_condition = column_condition + self.constraint_name = constraint_name + self.allowed_values = allowed_values or [] + self.quantile = quantile + self._assertion = assertion + + def HasField(self, field_name): + if field_name == "assertion": + return self._assertion is not None + return False + + @property + def assertion(self): + return self._assertion + + +class TestConstraintEvaluatorFactory: + """Tests for ConstraintEvaluatorFactory.""" + + def test_create_size_evaluator(self): + """Test creating SizeEvaluator.""" + proto = MockConstraintProto(type="hasSize") + evaluator = ConstraintEvaluatorFactory.create(proto) + assert evaluator is not None + assert isinstance(evaluator, SizeEvaluator) + + def test_create_completeness_evaluator(self): + """Test creating CompletenessEvaluator for isComplete.""" + proto = MockConstraintProto(type="isComplete", column="col1") + evaluator = ConstraintEvaluatorFactory.create(proto) + assert evaluator is not None + assert isinstance(evaluator, CompletenessEvaluator) + + def test_create_completeness_evaluator_has(self): + """Test creating CompletenessEvaluator for hasCompleteness.""" + proto = MockConstraintProto(type="hasCompleteness", column="col1") + evaluator = ConstraintEvaluatorFactory.create(proto) + assert evaluator is not None + assert isinstance(evaluator, CompletenessEvaluator) + + def test_create_is_positive_evaluator(self): + """Test creating IsPositiveEvaluator.""" + proto = MockConstraintProto(type="isPositive", column="col1") + evaluator = ConstraintEvaluatorFactory.create(proto) + assert evaluator is not None + assert isinstance(evaluator, IsPositiveEvaluator) + + def test_create_is_contained_in_evaluator(self): + """Test creating IsContainedInEvaluator.""" + proto = MockConstraintProto( + type="isContainedIn", + column="col1", + allowed_values=["a", "b", "c"] + ) + evaluator = ConstraintEvaluatorFactory.create(proto) + assert evaluator is not None + assert isinstance(evaluator, IsContainedInEvaluator) + + def test_create_column_comparison_evaluator(self): + """Test creating 
ColumnComparisonEvaluator for isLessThan.""" + proto = MockConstraintProto(type="isLessThan", columns=["col1", "col2"]) + evaluator = ConstraintEvaluatorFactory.create(proto) + assert evaluator is not None + assert isinstance(evaluator, ColumnComparisonEvaluator) + + def test_create_multi_column_completeness_evaluator(self): + """Test creating MultiColumnCompletenessEvaluator.""" + proto = MockConstraintProto(type="areComplete", columns=["col1", "col2"]) + evaluator = ConstraintEvaluatorFactory.create(proto) + assert evaluator is not None + assert isinstance(evaluator, MultiColumnCompletenessEvaluator) + + def test_create_unknown_type_returns_none(self): + """Test that unknown constraint types return None.""" + proto = MockConstraintProto(type="unknownType") + evaluator = ConstraintEvaluatorFactory.create(proto) + assert evaluator is None + + def test_is_supported(self): + """Test is_supported method.""" + assert ConstraintEvaluatorFactory.is_supported("hasSize") + assert ConstraintEvaluatorFactory.is_supported("isComplete") + assert ConstraintEvaluatorFactory.is_supported("isPositive") + assert not ConstraintEvaluatorFactory.is_supported("unknownType") + + def test_supported_types(self): + """Test supported_types method returns all registered types.""" + types = ConstraintEvaluatorFactory.supported_types() + assert "hasSize" in types + assert "isComplete" in types + assert "hasCompleteness" in types + assert "isPositive" in types + assert "isNonNegative" in types + assert "isContainedIn" in types + assert "isLessThan" in types + assert "areComplete" in types + + +class TestRatioCheckEvaluators: + """Tests for ratio-check evaluator condition generation.""" + + def test_is_positive_condition(self): + """Test IsPositiveEvaluator generates correct condition.""" + proto = MockConstraintProto(type="isPositive", column="price") + evaluator = IsPositiveEvaluator(proto) + assert evaluator.get_condition() == "price > 0" + + def test_is_non_negative_condition(self): + """Test IsNonNegativeEvaluator generates correct condition.""" + proto = MockConstraintProto(type="isNonNegative", column="count") + evaluator = IsNonNegativeEvaluator(proto) + assert evaluator.get_condition() == "count >= 0" + + def test_is_contained_in_condition(self): + """Test IsContainedInEvaluator generates correct condition.""" + proto = MockConstraintProto( + type="isContainedIn", + column="status", + allowed_values=["active", "pending"] + ) + evaluator = IsContainedInEvaluator(proto) + condition = evaluator.get_condition() + assert "status IN" in condition + assert "'active'" in condition + assert "'pending'" in condition + + def test_is_contained_in_escapes_quotes(self): + """Test IsContainedInEvaluator properly escapes single quotes.""" + proto = MockConstraintProto( + type="isContainedIn", + column="name", + allowed_values=["O'Brien", "D'Angelo"] + ) + evaluator = IsContainedInEvaluator(proto) + condition = evaluator.get_condition() + assert "O''Brien" in condition + assert "D''Angelo" in condition + + def test_contains_email_pattern(self): + """Test ContainsEmailEvaluator uses email regex pattern.""" + proto = MockConstraintProto(type="containsEmail", column="email") + evaluator = ContainsEmailEvaluator(proto) + condition = evaluator.get_condition() + assert "REGEXP_MATCHES" in condition + assert "email" in condition + + def test_contains_url_pattern(self): + """Test ContainsURLEvaluator uses URL regex pattern.""" + proto = MockConstraintProto(type="containsURL", column="website") + evaluator = ContainsURLEvaluator(proto) 
+ condition = evaluator.get_condition() + assert "REGEXP_MATCHES" in condition + assert "website" in condition + + def test_column_comparison_less_than(self): + """Test ColumnComparisonEvaluator for isLessThan.""" + proto = MockConstraintProto(type="isLessThan", columns=["col_a", "col_b"]) + evaluator = ColumnComparisonEvaluator(proto) + assert evaluator.get_condition() == "col_a < col_b" + + def test_column_comparison_greater_than(self): + """Test ColumnComparisonEvaluator for isGreaterThan.""" + proto = MockConstraintProto(type="isGreaterThan", columns=["col_a", "col_b"]) + evaluator = ColumnComparisonEvaluator(proto) + assert evaluator.get_condition() == "col_a > col_b" + + def test_column_comparison_less_than_or_equal(self): + """Test ColumnComparisonEvaluator for isLessThanOrEqualTo.""" + proto = MockConstraintProto(type="isLessThanOrEqualTo", columns=["col_a", "col_b"]) + evaluator = ColumnComparisonEvaluator(proto) + assert evaluator.get_condition() == "col_a <= col_b" + + def test_column_comparison_greater_than_or_equal(self): + """Test ColumnComparisonEvaluator for isGreaterThanOrEqualTo.""" + proto = MockConstraintProto(type="isGreaterThanOrEqualTo", columns=["col_a", "col_b"]) + evaluator = ColumnComparisonEvaluator(proto) + assert evaluator.get_condition() == "col_a >= col_b" + + +class TestEvaluatorToString: + """Tests for evaluator to_string methods.""" + + def test_size_evaluator_to_string(self): + """Test SizeEvaluator to_string.""" + proto = MockConstraintProto(type="hasSize") + evaluator = SizeEvaluator(proto) + assert "hasSize" in evaluator.to_string() + + def test_completeness_evaluator_to_string_is_complete(self): + """Test CompletenessEvaluator to_string for isComplete.""" + proto = MockConstraintProto(type="isComplete", column="col1") + evaluator = CompletenessEvaluator(proto) + result = evaluator.to_string() + assert "Complete" in result + assert "col1" in result + + def test_is_positive_evaluator_to_string(self): + """Test IsPositiveEvaluator to_string.""" + proto = MockConstraintProto(type="isPositive", column="price") + evaluator = IsPositiveEvaluator(proto) + result = evaluator.to_string() + assert "isPositive" in result + assert "price" in result + + def test_is_contained_in_evaluator_to_string(self): + """Test IsContainedInEvaluator to_string.""" + proto = MockConstraintProto( + type="isContainedIn", + column="status", + allowed_values=["a", "b"] + ) + evaluator = IsContainedInEvaluator(proto) + result = evaluator.to_string() + assert "isContainedIn" in result + assert "status" in result + + def test_column_comparison_to_string(self): + """Test ColumnComparisonEvaluator to_string.""" + proto = MockConstraintProto(type="isLessThan", columns=["a", "b"]) + evaluator = ColumnComparisonEvaluator(proto) + result = evaluator.to_string() + assert "isLessThan" in result + assert "a" in result + assert "b" in result + + def test_multi_column_completeness_to_string(self): + """Test MultiColumnCompletenessEvaluator to_string.""" + proto = MockConstraintProto(type="areComplete", columns=["col1", "col2"]) + evaluator = MultiColumnCompletenessEvaluator(proto) + result = evaluator.to_string() + assert "Complete" in result + assert "col1" in result + assert "col2" in result + + +class TestEvaluatorEvaluation: + """Tests for evaluator evaluation logic.""" + + def test_evaluate_none_value_returns_false(self): + """Test that evaluating None value returns False.""" + proto = MockConstraintProto(type="hasSize") + evaluator = SizeEvaluator(proto) + assert evaluator.evaluate(None) is 
False + + def test_evaluate_1_0_without_assertion_returns_true(self): + """Test that evaluating 1.0 without assertion returns True.""" + proto = MockConstraintProto(type="isComplete", column="col1") + evaluator = CompletenessEvaluator(proto) + assert evaluator.evaluate(1.0) is True + + def test_evaluate_less_than_1_without_assertion_returns_false(self): + """Test that evaluating < 1.0 without assertion returns False.""" + proto = MockConstraintProto(type="isComplete", column="col1") + evaluator = CompletenessEvaluator(proto) + assert evaluator.evaluate(0.5) is False + + +class TestAnalyzerBasedEvaluators: + """Tests for analyzer-based evaluator operator generation.""" + + def test_completeness_evaluator_get_operator(self): + """Test CompletenessEvaluator creates correct operator.""" + from pydeequ.engines.operators import CompletenessOperator + + proto = MockConstraintProto(type="isComplete", column="col1") + evaluator = CompletenessEvaluator(proto) + operator = evaluator.get_operator() + assert isinstance(operator, CompletenessOperator) + assert operator.column == "col1" + + def test_mean_evaluator_get_operator(self): + """Test MeanEvaluator creates correct operator.""" + from pydeequ.engines.operators import MeanOperator + + proto = MockConstraintProto(type="hasMean", column="value") + evaluator = MeanEvaluator(proto) + operator = evaluator.get_operator() + assert isinstance(operator, MeanOperator) + assert operator.column == "value" + + def test_uniqueness_evaluator_get_operator(self): + """Test UniquenessEvaluator creates correct operator.""" + from pydeequ.engines.operators import UniquenessOperator + + proto = MockConstraintProto(type="isUnique", column="id") + evaluator = UniquenessEvaluator(proto) + operator = evaluator.get_operator() + assert isinstance(operator, UniquenessOperator) + + def test_pattern_match_evaluator_get_operator(self): + """Test PatternMatchEvaluator creates correct operator.""" + from pydeequ.engines.operators import PatternMatchOperator + + proto = MockConstraintProto(type="hasPattern", column="email", pattern="^.*@.*$") + evaluator = PatternMatchEvaluator(proto) + operator = evaluator.get_operator() + assert isinstance(operator, PatternMatchOperator) + assert operator.column == "email" + + def test_approx_quantile_evaluator_get_operator(self): + """Test ApproxQuantileEvaluator creates correct operator.""" + from pydeequ.engines.operators import ApproxQuantileOperator + + proto = MockConstraintProto(type="hasApproxQuantile", column="value", quantile=0.75) + evaluator = ApproxQuantileEvaluator(proto) + operator = evaluator.get_operator() + assert isinstance(operator, ApproxQuantileOperator) + assert operator.quantile == 0.75 + + +class TestWhereClauseHandling: + """Tests for WHERE clause handling in evaluators.""" + + def test_ratio_evaluator_with_where_clause(self): + """Test ratio evaluator includes WHERE in query.""" + proto = MockConstraintProto(type="isPositive", column="price", where="status='active'") + evaluator = IsPositiveEvaluator(proto) + assert evaluator.where == "status='active'" + + def test_analyzer_evaluator_with_where_clause(self): + """Test analyzer evaluator passes WHERE to operator.""" + proto = MockConstraintProto(type="hasMean", column="value", where="status='active'") + evaluator = MeanEvaluator(proto) + operator = evaluator.get_operator() + assert operator.where == "status='active'" + + +class TestSpecialConstraintTypes: + """Tests for special constraint types with extra parameters.""" + + def test_compliance_evaluator(self): + """Test 
ComplianceEvaluator with column_condition.""" + proto = MockConstraintProto( + type="satisfies", + column_condition="price > 0 AND quantity > 0", + constraint_name="valid_order" + ) + evaluator = ComplianceEvaluator(proto) + assert evaluator.predicate == "price > 0 AND quantity > 0" + assert evaluator.name == "valid_order" + result = evaluator.to_string() + assert "satisfies" in result + + def test_correlation_evaluator_requires_two_columns(self): + """Test CorrelationEvaluator handles missing columns.""" + proto = MockConstraintProto(type="hasCorrelation", columns=["col1"]) + evaluator = CorrelationEvaluator(proto) + # Should return None for operator when not enough columns + result = evaluator.compute_value("test_table", lambda q: pd.DataFrame()) + assert result is None + + def test_mutual_information_evaluator_requires_two_columns(self): + """Test MutualInformationEvaluator handles missing columns.""" + proto = MockConstraintProto(type="hasMutualInformation", columns=["col1"]) + evaluator = MutualInformationEvaluator(proto) + result = evaluator.compute_value("test_table", lambda q: pd.DataFrame()) + assert result is None + + +class TestAllConstraintTypesSupported: + """Verify all constraint types have evaluators.""" + + @pytest.mark.parametrize("constraint_type", [ + "hasSize", + "isComplete", + "hasCompleteness", + "hasMean", + "hasMin", + "hasMax", + "hasSum", + "hasStandardDeviation", + "isUnique", + "hasUniqueness", + "hasDistinctness", + "hasUniqueValueRatio", + "hasCorrelation", + "hasEntropy", + "hasMutualInformation", + "hasPattern", + "hasMinLength", + "hasMaxLength", + "hasApproxCountDistinct", + "hasApproxQuantile", + "satisfies", + "isPositive", + "isNonNegative", + "isContainedIn", + "containsEmail", + "containsURL", + "containsCreditCardNumber", + "containsSocialSecurityNumber", + "isLessThan", + "isLessThanOrEqualTo", + "isGreaterThan", + "isGreaterThanOrEqualTo", + "areComplete", + "haveCompleteness", + ]) + def test_constraint_type_has_evaluator(self, constraint_type): + """Verify each constraint type maps to an evaluator.""" + assert ConstraintEvaluatorFactory.is_supported(constraint_type) diff --git a/tests/engines/test_duckdb_analyzers.py b/tests/engines/test_duckdb_analyzers.py new file mode 100644 index 0000000..189c80f --- /dev/null +++ b/tests/engines/test_duckdb_analyzers.py @@ -0,0 +1,650 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""DuckDB-only analyzer tests. + +Tests all 22 analyzers against known expected values from the test datasets. +These tests do not require Spark and can run quickly in CI. 
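+
+Example (typical test body; engine_full is a fixture and get_metric_value a
+helper, both from tests/engines/conftest.py):
+
+    metrics = engine_full.compute_metrics([Size()])
+    assert get_metric_value(metrics, "Size") == 4.0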
+"""
+
+import math
+
+from pydeequ.v2.analyzers import (
+    Size,
+    Completeness,
+    Mean,
+    Sum,
+    Maximum,
+    Minimum,
+    StandardDeviation,
+    Distinctness,
+    Uniqueness,
+    UniqueValueRatio,
+    CountDistinct,
+    ApproxCountDistinct,
+    ApproxQuantile,
+    Correlation,
+    MutualInformation,
+    MaxLength,
+    MinLength,
+    PatternMatch,
+    Compliance,
+    Entropy,
+    Histogram,
+    DataType,
+)
+
+from tests.engines.conftest import get_metric_value, get_metric
+from tests.engines.fixtures.datasets import (
+    FLOAT_EPSILON,
+    FLOAT_TOLERANCE,
+    APPROX_TOLERANCE,
+    is_close,
+)
+
+
+class TestSizeAnalyzer:
+    """Tests for the Size analyzer."""
+
+    def test_size_basic(self, engine_full):
+        """Size returns correct row count for basic dataset."""
+        metrics = engine_full.compute_metrics([Size()])
+        value = get_metric_value(metrics, "Size")
+        assert value == 4.0
+
+    def test_size_empty(self, engine_empty):
+        """Size returns 0 for empty dataset."""
+        metrics = engine_empty.compute_metrics([Size()])
+        value = get_metric_value(metrics, "Size")
+        assert value == 0.0
+
+    def test_size_single(self, engine_single):
+        """Size returns 1 for single-row dataset."""
+        metrics = engine_single.compute_metrics([Size()])
+        value = get_metric_value(metrics, "Size")
+        assert value == 1.0
+
+    def test_size_missing(self, engine_missing):
+        """Size counts all rows regardless of NULLs."""
+        metrics = engine_missing.compute_metrics([Size()])
+        value = get_metric_value(metrics, "Size")
+        assert value == 12.0
+
+    def test_size_with_where(self, engine_where):
+        """Size respects WHERE clause."""
+        metrics = engine_where.compute_metrics([Size(where="category = 'A'")])
+        value = get_metric_value(metrics, "Size")
+        assert value == 2.0
+
+
+class TestCompletenessAnalyzer:
+    """Tests for the Completeness analyzer."""
+
+    def test_completeness_full(self, engine_full):
+        """Completeness is 1.0 for columns with no NULLs."""
+        metrics = engine_full.compute_metrics([Completeness("att1")])
+        value = get_metric_value(metrics, "Completeness", 
"att1") + assert is_close(value, 1.0, FLOAT_EPSILON) + + +class TestMeanAnalyzer: + """Tests for the Mean analyzer.""" + + def test_mean_basic(self, engine_numeric): + """Mean calculates correctly for numeric column.""" + metrics = engine_numeric.compute_metrics([Mean("att1")]) + value = get_metric_value(metrics, "Mean", "att1") + assert is_close(value, 3.5, FLOAT_TOLERANCE) + + def test_mean_with_nulls(self, engine_numeric): + """Mean excludes NULL values in calculation.""" + metrics = engine_numeric.compute_metrics([Mean("att2")]) + value = get_metric_value(metrics, "Mean", "att2") + assert is_close(value, 3.0, FLOAT_TOLERANCE) # (1+2+3+4+5)/5 + + def test_mean_single(self, engine_single): + """Mean works for single row.""" + metrics = engine_single.compute_metrics([Mean("price")]) + value = get_metric_value(metrics, "Mean", "price") + assert is_close(value, 10.0, FLOAT_TOLERANCE) + + def test_mean_with_where(self, engine_where): + """Mean respects WHERE clause.""" + metrics = engine_where.compute_metrics([Mean("value", where="category = 'A'")]) + value = get_metric_value(metrics, "Mean", "value") + assert is_close(value, 15.0, FLOAT_TOLERANCE) # (10+20)/2 + + +class TestSumAnalyzer: + """Tests for the Sum analyzer.""" + + def test_sum_basic(self, engine_numeric): + """Sum calculates correctly for numeric column.""" + metrics = engine_numeric.compute_metrics([Sum("att1")]) + value = get_metric_value(metrics, "Sum", "att1") + assert is_close(value, 21.0, FLOAT_TOLERANCE) + + def test_sum_with_nulls(self, engine_numeric): + """Sum excludes NULL values.""" + metrics = engine_numeric.compute_metrics([Sum("att2")]) + value = get_metric_value(metrics, "Sum", "att2") + assert is_close(value, 15.0, FLOAT_TOLERANCE) + + def test_sum_single(self, engine_single): + """Sum works for single row.""" + metrics = engine_single.compute_metrics([Sum("price")]) + value = get_metric_value(metrics, "Sum", "price") + assert is_close(value, 10.0, FLOAT_TOLERANCE) + + +class TestMinimumAnalyzer: + """Tests for the Minimum analyzer.""" + + def test_minimum_basic(self, engine_numeric): + """Minimum finds smallest value.""" + metrics = engine_numeric.compute_metrics([Minimum("att1")]) + value = get_metric_value(metrics, "Minimum", "att1") + assert is_close(value, 1.0, FLOAT_TOLERANCE) + + def test_minimum_with_nulls(self, engine_numeric): + """Minimum ignores NULL values.""" + metrics = engine_numeric.compute_metrics([Minimum("att2")]) + value = get_metric_value(metrics, "Minimum", "att2") + assert is_close(value, 1.0, FLOAT_TOLERANCE) + + def test_minimum_single(self, engine_single): + """Minimum works for single row.""" + metrics = engine_single.compute_metrics([Minimum("price")]) + value = get_metric_value(metrics, "Minimum", "price") + assert is_close(value, 10.0, FLOAT_TOLERANCE) + + +class TestMaximumAnalyzer: + """Tests for the Maximum analyzer.""" + + def test_maximum_basic(self, engine_numeric): + """Maximum finds largest value.""" + metrics = engine_numeric.compute_metrics([Maximum("att1")]) + value = get_metric_value(metrics, "Maximum", "att1") + assert is_close(value, 6.0, FLOAT_TOLERANCE) + + def test_maximum_with_nulls(self, engine_numeric): + """Maximum ignores NULL values.""" + metrics = engine_numeric.compute_metrics([Maximum("att2")]) + value = get_metric_value(metrics, "Maximum", "att2") + assert is_close(value, 5.0, FLOAT_TOLERANCE) + + def test_maximum_single(self, engine_single): + """Maximum works for single row.""" + metrics = engine_single.compute_metrics([Maximum("price")]) + value = 
get_metric_value(metrics, "Maximum", "price") + assert is_close(value, 10.0, FLOAT_TOLERANCE) + + +class TestStandardDeviationAnalyzer: + """Tests for the StandardDeviation analyzer.""" + + def test_stddev_basic(self, engine_numeric): + """StandardDeviation calculates population stddev correctly.""" + metrics = engine_numeric.compute_metrics([StandardDeviation("att1")]) + value = get_metric_value(metrics, "StandardDeviation", "att1") + # Population stddev of [1,2,3,4,5,6] = sqrt(17.5/6) ≈ 1.7078 (matches Spark) + assert is_close(value, 1.7078251276599330, FLOAT_TOLERANCE) + + def test_stddev_single_row(self, engine_single): + """StandardDeviation for single row is NaN or 0.""" + metrics = engine_single.compute_metrics([StandardDeviation("price")]) + value = get_metric_value(metrics, "StandardDeviation", "price") + # Single value: stddev is undefined (NaN) or 0 + assert value is None or math.isnan(value) or value == 0.0 + + +class TestDistinctnessAnalyzer: + """Tests for the Distinctness analyzer.""" + + def test_distinctness_basic(self, engine_distinct): + """Distinctness = distinct values / total rows.""" + metrics = engine_distinct.compute_metrics([Distinctness(["att1"])]) + value = get_metric_value(metrics, "Distinctness", "att1") + # 3 distinct values / 6 rows = 0.5 + assert is_close(value, 0.5, FLOAT_EPSILON) + + def test_distinctness_all_unique(self, engine_distinct): + """Distinctness is 1.0 when all values are distinct.""" + metrics = engine_distinct.compute_metrics([Distinctness(["att2"])]) + value = get_metric_value(metrics, "Distinctness", "att2") + assert is_close(value, 1.0, FLOAT_EPSILON) + + def test_distinctness_all_same(self, engine_unique): + """Distinctness is 1/n when all values are the same.""" + metrics = engine_unique.compute_metrics([Distinctness(["all_same"])]) + value = get_metric_value(metrics, "Distinctness", "all_same") + # 1 distinct / 6 rows ≈ 0.167 + assert is_close(value, 1/6, FLOAT_EPSILON) + + +class TestUniquenessAnalyzer: + """Tests for the Uniqueness analyzer.""" + + def test_uniqueness_all_unique(self, engine_unique): + """Uniqueness is 1.0 when all values appear exactly once.""" + metrics = engine_unique.compute_metrics([Uniqueness(["unique_col"])]) + value = get_metric_value(metrics, "Uniqueness", "unique_col") + assert is_close(value, 1.0, FLOAT_EPSILON) + + def test_uniqueness_all_duplicated(self, engine_distinct): + """Uniqueness is 0.0 when all values are duplicated.""" + metrics = engine_distinct.compute_metrics([Uniqueness(["att1"])]) + value = get_metric_value(metrics, "Uniqueness", "att1") + # All values in att1 appear twice, so 0 unique + assert is_close(value, 0.0, FLOAT_EPSILON) + + def test_uniqueness_mixed(self, engine_unique): + """Uniqueness handles mixed case correctly.""" + metrics = engine_unique.compute_metrics([Uniqueness(["non_unique"])]) + value = get_metric_value(metrics, "Uniqueness", "non_unique") + # [1,1,2,2,3,3] - all duplicated, uniqueness = 0 + assert is_close(value, 0.0, FLOAT_EPSILON) + + +class TestUniqueValueRatioAnalyzer: + """Tests for the UniqueValueRatio analyzer.""" + + def test_unique_value_ratio_all_unique(self, engine_distinct): + """UniqueValueRatio is 1.0 when unique count = distinct count.""" + metrics = engine_distinct.compute_metrics([UniqueValueRatio(["att2"])]) + value = get_metric_value(metrics, "UniqueValueRatio", "att2") + # 6 unique / 6 distinct = 1.0 + assert is_close(value, 1.0, FLOAT_EPSILON) + + def test_unique_value_ratio_no_unique(self, engine_distinct): + """UniqueValueRatio is 0.0 when no 
values are unique.""" + metrics = engine_distinct.compute_metrics([UniqueValueRatio(["att1"])]) + value = get_metric_value(metrics, "UniqueValueRatio", "att1") + # 0 unique / 3 distinct = 0.0 + assert is_close(value, 0.0, FLOAT_EPSILON) + + +class TestCountDistinctAnalyzer: + """Tests for the CountDistinct analyzer.""" + + def test_count_distinct_basic(self, engine_full): + """CountDistinct counts unique values correctly.""" + metrics = engine_full.compute_metrics([CountDistinct(["att1"])]) + value = get_metric_value(metrics, "CountDistinct", "att1") + # "a", "b", "c" (a appears twice) = 3 distinct + assert is_close(value, 3.0, FLOAT_EPSILON) + + def test_count_distinct_all_unique(self, engine_distinct): + """CountDistinct equals row count when all values are distinct.""" + metrics = engine_distinct.compute_metrics([CountDistinct(["att2"])]) + value = get_metric_value(metrics, "CountDistinct", "att2") + assert is_close(value, 6.0, FLOAT_EPSILON) + + def test_count_distinct_with_duplicates(self, engine_distinct): + """CountDistinct counts only distinct values.""" + metrics = engine_distinct.compute_metrics([CountDistinct(["att1"])]) + value = get_metric_value(metrics, "CountDistinct", "att1") + assert is_close(value, 3.0, FLOAT_EPSILON) + + +class TestApproxCountDistinctAnalyzer: + """Tests for the ApproxCountDistinct analyzer.""" + + def test_approx_count_distinct_basic(self, engine_full): + """ApproxCountDistinct approximates distinct count.""" + metrics = engine_full.compute_metrics([ApproxCountDistinct("att1")]) + value = get_metric_value(metrics, "ApproxCountDistinct", "att1") + # Should be approximately 3 + assert is_close(value, 3.0, APPROX_TOLERANCE) + + def test_approx_count_distinct_all_unique(self, engine_distinct): + """ApproxCountDistinct handles all-unique column.""" + metrics = engine_distinct.compute_metrics([ApproxCountDistinct("att2")]) + value = get_metric_value(metrics, "ApproxCountDistinct", "att2") + # HyperLogLog can have higher variance on small datasets (up to 20% error) + assert is_close(value, 6.0, 0.2) + + +class TestApproxQuantileAnalyzer: + """Tests for the ApproxQuantile analyzer.""" + + def test_approx_quantile_median(self, engine_quantile): + """ApproxQuantile calculates median correctly.""" + metrics = engine_quantile.compute_metrics([ApproxQuantile("value", 0.5)]) + value = get_metric_value(metrics, "ApproxQuantile", "value") + # Median of [1,2,3,4,5,6,7,8,9,10] = 5.5 + assert is_close(value, 5.5, FLOAT_TOLERANCE) + + def test_approx_quantile_quartiles(self, engine_quantile): + """ApproxQuantile calculates quartiles.""" + metrics = engine_quantile.compute_metrics([ + ApproxQuantile("value", 0.25), + ApproxQuantile("value", 0.75), + ]) + # For small datasets, quantile calculation may vary slightly + q25 = get_metric_value(metrics, "ApproxQuantile", "value") + # Note: DuckDB uses QUANTILE_CONT which interpolates + assert q25 is not None + + +class TestCorrelationAnalyzer: + """Tests for the Correlation analyzer.""" + + def test_correlation_positive(self, engine_correlation): + """Correlation is 1.0 for perfectly positively correlated columns.""" + metrics = engine_correlation.compute_metrics([Correlation("x", "y")]) + value = get_metric_value(metrics, "Correlation", "x,y") + assert is_close(value, 1.0, FLOAT_TOLERANCE) + + def test_correlation_negative(self, engine_correlation): + """Correlation is -1.0 for perfectly negatively correlated columns.""" + metrics = engine_correlation.compute_metrics([Correlation("x", "z")]) + value = get_metric_value(metrics, 
"Correlation", "x,z") + assert is_close(value, -1.0, FLOAT_TOLERANCE) + + +class TestMutualInformationAnalyzer: + """Tests for the MutualInformation analyzer.""" + + def test_mutual_information_dependent(self, engine_mutual_info): + """MutualInformation is high for perfectly dependent columns.""" + metrics = engine_mutual_info.compute_metrics([ + MutualInformation(["x", "y_dependent"]) + ]) + value = get_metric_value(metrics, "MutualInformation", "x,y_dependent") + # Perfect dependency should have high MI (equal to entropy of x) + assert value is not None and value > 0 + + +class TestMaxLengthAnalyzer: + """Tests for the MaxLength analyzer.""" + + def test_maxlength_basic(self, engine_string_lengths): + """MaxLength finds longest string.""" + metrics = engine_string_lengths.compute_metrics([MaxLength("att1")]) + value = get_metric_value(metrics, "MaxLength", "att1") + assert is_close(value, 4.0, FLOAT_EPSILON) # "dddd" + + def test_maxlength_uniform(self, engine_string_lengths): + """MaxLength works with varying lengths.""" + metrics = engine_string_lengths.compute_metrics([MaxLength("att2")]) + value = get_metric_value(metrics, "MaxLength", "att2") + assert is_close(value, 5.0, FLOAT_EPSILON) # "hello", "world", "value" + + +class TestMinLengthAnalyzer: + """Tests for the MinLength analyzer.""" + + def test_minlength_empty_string(self, engine_string_lengths): + """MinLength handles empty string (length 0).""" + metrics = engine_string_lengths.compute_metrics([MinLength("att1")]) + value = get_metric_value(metrics, "MinLength", "att1") + assert is_close(value, 0.0, FLOAT_EPSILON) # "" + + def test_minlength_basic(self, engine_string_lengths): + """MinLength finds shortest string.""" + metrics = engine_string_lengths.compute_metrics([MinLength("att2")]) + value = get_metric_value(metrics, "MinLength", "att2") + assert is_close(value, 4.0, FLOAT_EPSILON) # "test", "data" + + +class TestPatternMatchAnalyzer: + """Tests for the PatternMatch analyzer.""" + + def test_pattern_match_email(self, engine_pattern): + """PatternMatch detects email pattern.""" + # Simple email regex + metrics = engine_pattern.compute_metrics([ + PatternMatch("email", r".*@.*\..*") + ]) + value = get_metric_value(metrics, "PatternMatch", "email") + # 4 valid emails out of 6 + assert is_close(value, 4/6, FLOAT_TOLERANCE) + + def test_pattern_match_all_match(self, engine_full): + """PatternMatch returns 1.0 when all rows match.""" + metrics = engine_full.compute_metrics([ + PatternMatch("att1", r"^[a-c]$") + ]) + value = get_metric_value(metrics, "PatternMatch", "att1") + # "a", "b", "c", "a" all match + assert is_close(value, 1.0, FLOAT_TOLERANCE) + + +class TestComplianceAnalyzer: + """Tests for the Compliance analyzer.""" + + def test_compliance_all_positive(self, engine_compliance): + """Compliance is 1.0 when all rows satisfy predicate.""" + metrics = engine_compliance.compute_metrics([ + Compliance("positive_check", "positive > 0") + ]) + value = get_metric_value(metrics, "Compliance", "positive_check") + assert is_close(value, 1.0, FLOAT_TOLERANCE) + + def test_compliance_partial(self, engine_compliance): + """Compliance reflects fraction satisfying predicate.""" + metrics = engine_compliance.compute_metrics([ + Compliance("mixed_check", "mixed > 0") + ]) + value = get_metric_value(metrics, "Compliance", "mixed_check") + # [-2,-1,0,1,2,3] -> 3 values > 0 + assert is_close(value, 0.5, FLOAT_TOLERANCE) + + def test_compliance_none(self, engine_compliance): + """Compliance is 0.0 when no rows satisfy predicate.""" + 
metrics = engine_compliance.compute_metrics([ + Compliance("negative_positive", "negative > 0") + ]) + value = get_metric_value(metrics, "Compliance", "negative_positive") + assert is_close(value, 0.0, FLOAT_TOLERANCE) + + +class TestEntropyAnalyzer: + """Tests for the Entropy analyzer.""" + + def test_entropy_uniform(self, engine_entropy): + """Entropy is ln(n) for uniform distribution.""" + metrics = engine_entropy.compute_metrics([Entropy("uniform")]) + value = get_metric_value(metrics, "Entropy", "uniform") + # 4 equally distributed values: entropy = ln(4) ≈ 1.386 (matches Spark) + assert is_close(value, 1.3862943611198906, FLOAT_TOLERANCE) + + def test_entropy_constant(self, engine_entropy): + """Entropy is 0 for constant column.""" + metrics = engine_entropy.compute_metrics([Entropy("constant")]) + value = get_metric_value(metrics, "Entropy", "constant") + assert is_close(value, 0.0, FLOAT_TOLERANCE) + + def test_entropy_skewed(self, engine_entropy): + """Entropy is between 0 and max for skewed distribution.""" + metrics = engine_entropy.compute_metrics([Entropy("skewed")]) + value = get_metric_value(metrics, "Entropy", "skewed") + # Skewed distribution: 0 < entropy < ln(4) ≈ 1.386 + assert value > 0.0 and value < 1.3862943611198906 + + +class TestHistogramAnalyzer: + """Tests for the Histogram analyzer.""" + + def test_histogram_basic(self, engine_histogram): + """Histogram returns value distribution.""" + metrics = engine_histogram.compute_metrics([Histogram("category")]) + result = get_metric(metrics, "Histogram", "category") + assert result is not None + # Histogram value should be non-null (JSON or dict) + + +class TestDataTypeAnalyzer: + """Tests for the DataType analyzer.""" + + def test_datatype_numeric(self, engine_data_type): + """DataType identifies numeric columns.""" + metrics = engine_data_type.compute_metrics([DataType("pure_numeric")]) + result = get_metric(metrics, "DataType", "pure_numeric") + assert result is not None + + def test_datatype_string(self, engine_data_type): + """DataType identifies string columns.""" + metrics = engine_data_type.compute_metrics([DataType("strings")]) + result = get_metric(metrics, "DataType", "strings") + assert result is not None + + +class TestMultipleAnalyzers: + """Tests for running multiple analyzers together.""" + + def test_multiple_basic_analyzers(self, engine_numeric): + """Multiple analyzers can be computed in one call.""" + metrics = engine_numeric.compute_metrics([ + Size(), + Mean("att1"), + Sum("att1"), + Minimum("att1"), + Maximum("att1"), + ]) + + assert len(metrics) >= 5 + assert get_metric_value(metrics, "Size") == 6.0 + assert is_close(get_metric_value(metrics, "Mean", "att1"), 3.5, FLOAT_TOLERANCE) + assert is_close(get_metric_value(metrics, "Sum", "att1"), 21.0, FLOAT_TOLERANCE) + assert is_close(get_metric_value(metrics, "Minimum", "att1"), 1.0, FLOAT_TOLERANCE) + assert is_close(get_metric_value(metrics, "Maximum", "att1"), 6.0, FLOAT_TOLERANCE) + + def test_multiple_columns_same_analyzer(self, engine_full): + """Same analyzer type on multiple columns.""" + metrics = engine_full.compute_metrics([ + Completeness("att1"), + Completeness("att2"), + Completeness("item"), + ]) + + assert len(metrics) >= 3 + assert is_close(get_metric_value(metrics, "Completeness", "att1"), 1.0, FLOAT_EPSILON) + assert is_close(get_metric_value(metrics, "Completeness", "att2"), 1.0, FLOAT_EPSILON) + assert is_close(get_metric_value(metrics, "Completeness", "item"), 1.0, FLOAT_EPSILON) + + def test_mixed_analyzer_types(self, 
engine_full): + """Mix of different analyzer categories.""" + metrics = engine_full.compute_metrics([ + Size(), + Completeness("att1"), + CountDistinct(["att1"]), + MaxLength("att1"), + ]) + + assert get_metric_value(metrics, "Size") == 4.0 + assert is_close(get_metric_value(metrics, "Completeness", "att1"), 1.0, FLOAT_EPSILON) + assert get_metric_value(metrics, "CountDistinct", "att1") == 3.0 + assert get_metric_value(metrics, "MaxLength", "att1") == 1.0 # "a", "b", "c" + + +class TestAnalyzersWithWhere: + """Tests for analyzers with WHERE clause filtering.""" + + def test_size_where_a(self, engine_where): + """Size with WHERE filters correctly.""" + metrics = engine_where.compute_metrics([ + Size(where="category = 'A'"), + Size(where="category = 'B'"), + ]) + assert get_metric_value(metrics, "Size") == 2.0 # Both return 2 + + def test_completeness_where(self, engine_where): + """Completeness varies by WHERE filter.""" + metrics = engine_where.compute_metrics([ + Completeness("att1", where="category = 'A'"), + Completeness("att1", where="category = 'B'"), + ]) + # Category A: att1 = ["x", "y"] -> 2/2 complete + # Category B: att1 = [None, "w"] -> 1/2 complete + a_completeness = get_metric_value(metrics, "Completeness", "att1") + assert a_completeness is not None + + def test_mean_where(self, engine_where): + """Mean varies by WHERE filter.""" + metrics = engine_where.compute_metrics([ + Mean("value", where="category = 'A'"), + ]) + # Category A: value = [10, 20] -> mean = 15 + value = get_metric_value(metrics, "Mean", "value") + assert is_close(value, 15.0, FLOAT_TOLERANCE) + + +class TestEdgeCases: + """Tests for edge cases and boundary conditions.""" + + def test_empty_dataset_all_analyzers(self, engine_empty): + """Empty dataset handles gracefully.""" + metrics = engine_empty.compute_metrics([ + Size(), + Completeness("att1"), + ]) + assert get_metric_value(metrics, "Size") == 0.0 + + def test_all_null_column_stats(self, engine_all_null): + """All-NULL column returns appropriate values.""" + metrics = engine_all_null.compute_metrics([ + Completeness("value"), + Size(), + ]) + assert is_close(get_metric_value(metrics, "Completeness", "value"), 0.0, FLOAT_EPSILON) + assert get_metric_value(metrics, "Size") == 3.0 + + def test_special_characters(self, engine_escape): + """Special characters in data are handled.""" + metrics = engine_escape.compute_metrics([ + Size(), + Completeness("att1"), + MaxLength("att1"), + ]) + assert get_metric_value(metrics, "Size") == 8.0 + assert is_close(get_metric_value(metrics, "Completeness", "att1"), 1.0, FLOAT_EPSILON) diff --git a/tests/engines/test_duckdb_constraints.py b/tests/engines/test_duckdb_constraints.py new file mode 100644 index 0000000..a1124ed --- /dev/null +++ b/tests/engines/test_duckdb_constraints.py @@ -0,0 +1,641 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""DuckDB-only constraint tests. 
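+
+Each test follows the same shape: build a Check, run it through an engine
+fixture, and assert on the resulting ConstraintStatus. A minimal sketch
+(assuming the `engine_*` fixtures from tests/engines/conftest.py):
+
+    check = Check(CheckLevel.Error, "size check").hasSize(eq(4))
+    results = engine_full.run_checks([check])
+    assert results[0].constraint_status == ConstraintStatus.Success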
+
+Tests all 32 constraint types against known expected values from the test datasets.
+These tests do not require Spark and can run quickly in CI.
+"""
+
+import pytest
+
+from pydeequ.v2.checks import Check, CheckLevel
+from pydeequ.v2.predicates import eq, gt, gte, lte, between, is_one
+from pydeequ.engines import ConstraintStatus, CheckStatus
+
+
+def get_constraint_result(results, constraint_substring: str):
+    """Find a constraint result by substring match on constraint name."""
+    for r in results:
+        if constraint_substring in r.constraint:
+            return r
+    return None
+
+
+def get_check_result(results, check_description: str):
+    """Find results for a specific check by description."""
+    return [r for r in results if r.check_description == check_description]
+
+
+class TestSizeConstraint:
+    """Tests for hasSize constraint."""
+
+    def test_has_size_success(self, engine_full):
+        """hasSize succeeds when size equals expected."""
+        check = Check(CheckLevel.Error, "size check").hasSize(eq(4))
+        results = engine_full.run_checks([check])
+        result = results[0]
+        assert result.constraint_status == ConstraintStatus.Success
+
+    def test_has_size_failure(self, engine_full):
+        """hasSize fails when size doesn't match."""
+        check = Check(CheckLevel.Error, "size check").hasSize(eq(10))
+        results = engine_full.run_checks([check])
+        result = results[0]
+        assert result.constraint_status == ConstraintStatus.Failure
+
+    def test_has_size_range(self, engine_full):
+        """hasSize with between predicate."""
+        check = Check(CheckLevel.Error, "size range").hasSize(between(3, 5))
+        results = engine_full.run_checks([check])
+        result = results[0]
+        assert result.constraint_status == ConstraintStatus.Success
+
+    def test_has_size_empty(self, engine_empty):
+        """hasSize correctly reports 0 for empty dataset."""
+        check = Check(CheckLevel.Error, "empty size").hasSize(eq(0))
+        results = engine_empty.run_checks([check])
+        result = results[0]
+        assert result.constraint_status == ConstraintStatus.Success
+
+
+class TestCompletenessConstraints:
+    """Tests for completeness-related constraints."""
+
+    def test_is_complete_success(self, engine_full):
+        """isComplete succeeds for non-NULL column."""
+        check = Check(CheckLevel.Error, "complete").isComplete("att1")
+        results = engine_full.run_checks([check])
+        result = results[0]
+        assert result.constraint_status == ConstraintStatus.Success
+
+    def test_is_complete_failure(self, engine_missing):
+        """isComplete fails for column with NULLs."""
+        check = Check(CheckLevel.Error, "complete").isComplete("att1")
+        results = engine_missing.run_checks([check])
+        result = results[0]
+        assert result.constraint_status == ConstraintStatus.Failure
+
+    def test_has_completeness_success(self, engine_missing):
+        """hasCompleteness succeeds when threshold is met."""
+        # att1 is 50% complete, check for >= 50%
+        check = Check(CheckLevel.Error, "partial complete").hasCompleteness("att1", gte(0.5))
+        results = engine_missing.run_checks([check])
+        result = results[0]
+        assert result.constraint_status == ConstraintStatus.Success
+
+    def test_has_completeness_failure(self, engine_missing):
+        """hasCompleteness fails when threshold not met."""
+        # att1 is 50% complete, check for >= 90%
+        check = Check(CheckLevel.Error, "high threshold").hasCompleteness("att1", gte(0.9))
+        results = engine_missing.run_checks([check])
+        result = results[0]
+        assert result.constraint_status == ConstraintStatus.Failure
+
+    def test_are_complete_success(self, 
engine_full): + """areComplete succeeds when all columns are complete.""" + check = Check(CheckLevel.Error, "multi complete").areComplete(["att1", "att2"]) + results = engine_full.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_are_complete_failure(self, engine_missing): + """areComplete fails when any column has NULLs.""" + check = Check(CheckLevel.Error, "multi complete").areComplete(["att1", "att2"]) + results = engine_missing.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Failure + + def test_have_completeness_success(self, engine_missing): + """haveCompleteness succeeds for combined column threshold.""" + check = Check(CheckLevel.Error, "combined").haveCompleteness(["att1", "att2"], gte(0.5)) + results = engine_missing.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + +class TestUniquenessConstraints: + """Tests for uniqueness-related constraints.""" + + def test_is_unique_success(self, engine_unique): + """isUnique succeeds when all values are unique.""" + check = Check(CheckLevel.Error, "unique").isUnique("unique_col") + results = engine_unique.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_is_unique_failure(self, engine_unique): + """isUnique fails when there are duplicates.""" + check = Check(CheckLevel.Error, "not unique").isUnique("non_unique") + results = engine_unique.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Failure + + def test_has_uniqueness_success(self, engine_unique): + """hasUniqueness succeeds when threshold met.""" + check = Check(CheckLevel.Error, "uniqueness").hasUniqueness(["unique_col"], is_one()) + results = engine_unique.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_uniqueness_failure(self, engine_distinct): + """hasUniqueness fails when uniqueness is below threshold.""" + # att1 has all duplicates, uniqueness = 0 + check = Check(CheckLevel.Error, "low uniqueness").hasUniqueness(["att1"], gte(0.5)) + results = engine_distinct.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Failure + + def test_has_distinctness_success(self, engine_distinct): + """hasDistinctness succeeds when threshold met.""" + # att2 has 6 distinct / 6 rows = 1.0 + check = Check(CheckLevel.Error, "distinct").hasDistinctness(["att2"], is_one()) + results = engine_distinct.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_distinctness_partial(self, engine_distinct): + """hasDistinctness with partial distinctness.""" + # att1 has 3 distinct / 6 rows = 0.5 + check = Check(CheckLevel.Error, "partial distinct").hasDistinctness(["att1"], gte(0.5)) + results = engine_distinct.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_unique_value_ratio_success(self, engine_distinct): + """hasUniqueValueRatio succeeds for all-unique column.""" + check = Check(CheckLevel.Error, "uvr").hasUniqueValueRatio(["att2"], is_one()) + results = engine_distinct.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_unique_value_ratio_zero(self, engine_distinct): + """hasUniqueValueRatio for 
all-duplicated column.""" + # att1: 0 unique / 3 distinct = 0 + check = Check(CheckLevel.Error, "uvr zero").hasUniqueValueRatio(["att1"], eq(0)) + results = engine_distinct.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + +class TestStatisticalConstraints: + """Tests for statistical constraints.""" + + def test_has_min_success(self, engine_numeric): + """hasMin succeeds when minimum matches.""" + check = Check(CheckLevel.Error, "min").hasMin("att1", eq(1)) + results = engine_numeric.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_min_failure(self, engine_numeric): + """hasMin fails when minimum doesn't match.""" + check = Check(CheckLevel.Error, "min fail").hasMin("att1", eq(5)) + results = engine_numeric.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Failure + + def test_has_max_success(self, engine_numeric): + """hasMax succeeds when maximum matches.""" + check = Check(CheckLevel.Error, "max").hasMax("att1", eq(6)) + results = engine_numeric.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_max_failure(self, engine_numeric): + """hasMax fails when maximum doesn't match.""" + check = Check(CheckLevel.Error, "max fail").hasMax("att1", eq(100)) + results = engine_numeric.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Failure + + def test_has_mean_success(self, engine_numeric): + """hasMean succeeds when mean matches.""" + check = Check(CheckLevel.Error, "mean").hasMean("att1", eq(3.5)) + results = engine_numeric.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_mean_range(self, engine_numeric): + """hasMean with range predicate.""" + check = Check(CheckLevel.Error, "mean range").hasMean("att1", between(3.0, 4.0)) + results = engine_numeric.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_sum_success(self, engine_numeric): + """hasSum succeeds when sum matches.""" + check = Check(CheckLevel.Error, "sum").hasSum("att1", eq(21)) + results = engine_numeric.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_standard_deviation_success(self, engine_numeric): + """hasStandardDeviation with range check.""" + check = Check(CheckLevel.Error, "stddev").hasStandardDeviation("att1", between(1.5, 2.0)) + results = engine_numeric.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_approx_count_distinct_success(self, engine_full): + """hasApproxCountDistinct succeeds when count is approximately correct.""" + check = Check(CheckLevel.Error, "approx distinct").hasApproxCountDistinct("att1", between(2, 4)) + results = engine_full.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + +class TestQuantileConstraints: + """Tests for quantile constraints.""" + + def test_has_approx_quantile_median(self, engine_quantile): + """hasApproxQuantile for median.""" + check = Check(CheckLevel.Error, "median").hasApproxQuantile("value", 0.5, between(5.0, 6.0)) + results = engine_quantile.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + 
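+# NOTE: the `engine_correlation` fixture is assumed (from the assertions below
+# and the comments in TestColumnComparisonConstraints) to hold perfectly linear
+# columns, e.g. x = [1, 2, 3, 4, 5] and y = 2 * x, so corr(x, y) = +1, plus a
+# linearly decreasing column z with corr(x, z) = -1.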
+class TestCorrelationConstraints: + """Tests for correlation constraints.""" + + def test_has_correlation_positive(self, engine_correlation): + """hasCorrelation for perfectly correlated columns.""" + check = Check(CheckLevel.Error, "positive corr").hasCorrelation("x", "y", is_one()) + results = engine_correlation.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_correlation_negative(self, engine_correlation): + """hasCorrelation for negative correlation.""" + check = Check(CheckLevel.Error, "negative corr").hasCorrelation("x", "z", eq(-1)) + results = engine_correlation.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + +class TestEntropyConstraints: + """Tests for entropy constraints.""" + + def test_has_entropy_uniform(self, engine_entropy): + """hasEntropy for uniform distribution.""" + # ln(4) ≈ 1.386 (matches Spark's natural log convention) + check = Check(CheckLevel.Error, "entropy").hasEntropy("uniform", between(1.38, 1.39)) + results = engine_entropy.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_entropy_constant(self, engine_entropy): + """hasEntropy for constant column (entropy=0).""" + check = Check(CheckLevel.Error, "zero entropy").hasEntropy("constant", eq(0)) + results = engine_entropy.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + +class TestMutualInformationConstraints: + """Tests for mutual information constraints.""" + + def test_has_mutual_information(self, engine_mutual_info): + """hasMutualInformation for dependent columns.""" + check = Check(CheckLevel.Error, "mi").hasMutualInformation("x", "y_dependent", gt(0)) + results = engine_mutual_info.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + +class TestStringLengthConstraints: + """Tests for string length constraints.""" + + def test_has_min_length_success(self, engine_string_lengths): + """hasMinLength for empty string (0).""" + check = Check(CheckLevel.Error, "min length").hasMinLength("att1", eq(0)) + results = engine_string_lengths.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_min_length_failure(self, engine_string_lengths): + """hasMinLength fails when min length is higher.""" + check = Check(CheckLevel.Error, "min length fail").hasMinLength("att1", gte(2)) + results = engine_string_lengths.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Failure + + def test_has_max_length_success(self, engine_string_lengths): + """hasMaxLength succeeds when max is correct.""" + check = Check(CheckLevel.Error, "max length").hasMaxLength("att1", eq(4)) + results = engine_string_lengths.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_max_length_bound(self, engine_string_lengths): + """hasMaxLength with upper bound.""" + check = Check(CheckLevel.Error, "max bound").hasMaxLength("att1", lte(5)) + results = engine_string_lengths.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + +class TestPatternConstraints: + """Tests for pattern matching constraints.""" + + def test_has_pattern_success(self, engine_full): + """hasPattern succeeds when pattern matches all rows.""" + 
check = Check(CheckLevel.Error, "pattern").hasPattern("att1", r"^[a-c]$", is_one()) + results = engine_full.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_pattern_partial(self, engine_pattern): + """hasPattern with partial match threshold.""" + # Email pattern matches 4/6 rows + check = Check(CheckLevel.Error, "email pattern").hasPattern("email", r".*@.*\..*", gte(0.5)) + results = engine_pattern.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_has_pattern_failure(self, engine_pattern): + """hasPattern fails when match rate is below threshold.""" + check = Check(CheckLevel.Error, "strict pattern").hasPattern("email", r".*@.*\..*", is_one()) + results = engine_pattern.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Failure + + +class TestEmailUrlConstraints: + """Tests for email and URL pattern constraints.""" + + def test_contains_email_success(self, engine_pattern): + """containsEmail with threshold.""" + check = Check(CheckLevel.Error, "email").containsEmail("email", gte(0.5)) + results = engine_pattern.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_contains_url_failure(self, engine_pattern): + """containsURL fails for non-URL column.""" + check = Check(CheckLevel.Error, "url").containsURL("email", gte(0.5)) + results = engine_pattern.run_checks([check]) + result = results[0] + # No URLs in email column + assert result.constraint_status == ConstraintStatus.Failure + + +class TestNumericConstraints: + """Tests for numeric value constraints.""" + + def test_is_positive_success(self, engine_compliance): + """isPositive succeeds for all-positive column.""" + check = Check(CheckLevel.Error, "positive").isPositive("positive", is_one()) + results = engine_compliance.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_is_positive_failure(self, engine_compliance): + """isPositive fails for negative column.""" + check = Check(CheckLevel.Error, "not positive").isPositive("negative", gte(0.5)) + results = engine_compliance.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Failure + + def test_is_non_negative_success(self, engine_compliance): + """isNonNegative for positive column.""" + check = Check(CheckLevel.Error, "non-neg").isNonNegative("positive", is_one()) + results = engine_compliance.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_is_non_negative_partial(self, engine_compliance): + """isNonNegative with partial compliance.""" + # mixed: [-2,-1,0,1,2,3] -> 4/6 non-negative + check = Check(CheckLevel.Error, "partial non-neg").isNonNegative("mixed", gte(0.5)) + results = engine_compliance.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + +class TestColumnComparisonConstraints: + """Tests for column comparison constraints.""" + + def test_is_less_than(self, engine_correlation): + """isLessThan for ordered columns.""" + # x = [1,2,3,4,5], y = [2,4,6,8,10], so x < y always + check = Check(CheckLevel.Error, "less than").isLessThan("x", "y") + results = engine_correlation.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def 
test_is_less_than_or_equal_to(self, engine_correlation): + """isLessThanOrEqualTo for ordered columns.""" + check = Check(CheckLevel.Error, "lte").isLessThanOrEqualTo("x", "y") + results = engine_correlation.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_is_greater_than(self, engine_correlation): + """isGreaterThan for reverse-ordered columns.""" + # y > x always + check = Check(CheckLevel.Error, "greater than").isGreaterThan("y", "x") + results = engine_correlation.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_is_greater_than_or_equal_to(self, engine_correlation): + """isGreaterThanOrEqualTo for ordered columns.""" + check = Check(CheckLevel.Error, "gte").isGreaterThanOrEqualTo("y", "x") + results = engine_correlation.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + +class TestContainedInConstraint: + """Tests for isContainedIn constraint.""" + + def test_is_contained_in_success(self, engine_contained_in): + """isContainedIn succeeds when all values are in allowed set.""" + check = Check(CheckLevel.Error, "contained").isContainedIn( + "status", ["active", "inactive", "pending"], is_one() + ) + results = engine_contained_in.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_is_contained_in_failure(self, engine_contained_in): + """isContainedIn fails when some values are not in set.""" + check = Check(CheckLevel.Error, "not contained").isContainedIn( + "category", ["A", "B", "C"], is_one() + ) + results = engine_contained_in.run_checks([check]) + result = results[0] + # "D" is not in the allowed set + assert result.constraint_status == ConstraintStatus.Failure + + def test_is_contained_in_partial(self, engine_contained_in): + """isContainedIn with threshold for partial match.""" + check = Check(CheckLevel.Error, "partial contained").isContainedIn( + "category", ["A", "B", "C"], gte(0.8) + ) + results = engine_contained_in.run_checks([check]) + result = results[0] + # 5/6 = 0.833 in allowed set + assert result.constraint_status == ConstraintStatus.Success + + +class TestSatisfiesConstraint: + """Tests for satisfies constraint.""" + + def test_satisfies_simple(self, engine_compliance): + """satisfies with simple predicate.""" + check = Check(CheckLevel.Error, "satisfies").satisfies("positive > 0", "positive_check", is_one()) + results = engine_compliance.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_satisfies_complex(self, engine_compliance): + """satisfies with complex predicate.""" + check = Check(CheckLevel.Error, "complex").satisfies( + "mixed >= -2 AND mixed <= 3", "range_check", is_one() + ) + results = engine_compliance.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_satisfies_partial(self, engine_compliance): + """satisfies with partial compliance.""" + check = Check(CheckLevel.Error, "partial satisfies").satisfies("mixed > 0", "partial_check", gte(0.4)) + results = engine_compliance.run_checks([check]) + result = results[0] + # 3/6 = 0.5 > 0.4 + assert result.constraint_status == ConstraintStatus.Success + + +class TestCheckLevels: + """Tests for check levels (Error vs Warning).""" + + def test_error_level_failure(self, engine_full): + """Error level check results in Error status on 
failure.""" + check = Check(CheckLevel.Error, "error check").hasSize(eq(100)) + results = engine_full.run_checks([check]) + result = results[0] + assert result.check_level == "Error" + assert result.check_status == CheckStatus.Error + + def test_warning_level_failure(self, engine_full): + """Warning level check results in Warning status on failure.""" + check = Check(CheckLevel.Warning, "warning check").hasSize(eq(100)) + results = engine_full.run_checks([check]) + result = results[0] + assert result.check_level == "Warning" + assert result.check_status == CheckStatus.Warning + + def test_error_level_success(self, engine_full): + """Error level check results in Success status on pass.""" + check = Check(CheckLevel.Error, "pass check").hasSize(eq(4)) + results = engine_full.run_checks([check]) + result = results[0] + assert result.check_status == CheckStatus.Success + + +class TestMultipleConstraints: + """Tests for multiple constraints in one check.""" + + def test_all_pass(self, engine_full): + """All constraints pass results in Success.""" + check = (Check(CheckLevel.Error, "all pass") + .hasSize(eq(4)) + .isComplete("att1") + .isComplete("att2")) + results = engine_full.run_checks([check]) + assert all(r.constraint_status == ConstraintStatus.Success for r in results) + assert results[0].check_status == CheckStatus.Success + + def test_some_fail(self, engine_missing): + """Some constraints fail results in overall failure.""" + check = (Check(CheckLevel.Error, "some fail") + .hasSize(eq(12)) # Pass + .isComplete("att1") # Fail + .hasCompleteness("att2", gte(0.5))) # Pass + results = engine_missing.run_checks([check]) + # Check that at least one constraint failed + failed = [r for r in results if r.constraint_status == ConstraintStatus.Failure] + assert len(failed) >= 1 + # Overall check should fail + assert results[0].check_status == CheckStatus.Error + + def test_multiple_checks(self, engine_numeric): + """Multiple checks can be run together.""" + check1 = Check(CheckLevel.Error, "size check").hasSize(eq(6)) + check2 = Check(CheckLevel.Error, "mean check").hasMean("att1", eq(3.5)) + check3 = Check(CheckLevel.Warning, "sum check").hasSum("att1", eq(21)) + + results = engine_numeric.run_checks([check1, check2, check3]) + # All should pass + assert len(results) == 3 + assert all(r.constraint_status == ConstraintStatus.Success for r in results) + + +class TestConstraintsWithWhere: + """Tests for constraints with WHERE clause filtering.""" + + @pytest.mark.skip(reason="WHERE clause support not yet implemented in Check API") + def test_completeness_where(self, engine_where): + """Completeness constraint with WHERE filter.""" + check = Check(CheckLevel.Error, "filtered completeness").hasCompleteness( + "att1", is_one(), where="category = 'A'" + ) + results = engine_where.run_checks([check]) + result = results[0] + # Category A: att1 is complete + assert result.constraint_status == ConstraintStatus.Success + + @pytest.mark.skip(reason="WHERE clause support not yet implemented in Check API") + def test_size_where(self, engine_where): + """Size constraint with WHERE filter.""" + check = Check(CheckLevel.Error, "filtered size").hasSize( + eq(2), where="category = 'A'" + ) + results = engine_where.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + +class TestEdgeCases: + """Tests for edge cases and boundary conditions.""" + + def test_empty_dataset(self, engine_empty): + """Constraints on empty dataset.""" + check = (Check(CheckLevel.Error, 
"empty check") + .hasSize(eq(0))) + results = engine_empty.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success + + def test_single_row(self, engine_single): + """Constraints on single-row dataset.""" + check = (Check(CheckLevel.Error, "single row") + .hasSize(eq(1)) + .isComplete("att1") + .hasMin("item", eq(1)) + .hasMax("item", eq(1))) + results = engine_single.run_checks([check]) + assert all(r.constraint_status == ConstraintStatus.Success for r in results) + + def test_all_null_column(self, engine_all_null): + """Constraints on all-NULL column.""" + check = (Check(CheckLevel.Error, "all null") + .hasCompleteness("value", eq(0))) + results = engine_all_null.run_checks([check]) + result = results[0] + assert result.constraint_status == ConstraintStatus.Success diff --git a/tests/engines/test_duckdb_profiles.py b/tests/engines/test_duckdb_profiles.py new file mode 100644 index 0000000..f105fa1 --- /dev/null +++ b/tests/engines/test_duckdb_profiles.py @@ -0,0 +1,267 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""DuckDB-only profiling tests. + +Tests the column profiling functionality of the DuckDB engine. +""" + +import math +import pytest + +from tests.engines.fixtures.datasets import ( + FLOAT_EPSILON, + FLOAT_TOLERANCE, + is_close, +) + + +def get_profile_by_column(profiles, column_name: str): + """Find a column profile by column name.""" + for p in profiles: + if p.column == column_name: + return p + return None + + +class TestBasicProfiling: + """Tests for basic profiling functionality.""" + + def test_profile_all_columns(self, engine_full): + """Profile returns data for all columns.""" + profiles = engine_full.profile_columns() + assert len(profiles) >= 4 # att1, att2, item, price + + def test_profile_specific_columns(self, engine_full): + """Profile can be restricted to specific columns.""" + profiles = engine_full.profile_columns(columns=["att1", "item"]) + column_names = [p.column for p in profiles] + assert "att1" in column_names + assert "item" in column_names + + def test_profile_column_name(self, engine_full): + """Profile contains correct column names.""" + profiles = engine_full.profile_columns() + column_names = [p.column for p in profiles] + assert "att1" in column_names + assert "att2" in column_names + + +class TestCompletenessProfile: + """Tests for completeness in profiles.""" + + def test_completeness_full(self, engine_full): + """Completeness is 1.0 for complete columns.""" + profiles = engine_full.profile_columns(columns=["att1"]) + profile = get_profile_by_column(profiles, "att1") + assert is_close(profile.completeness, 1.0, FLOAT_EPSILON) + + def test_completeness_partial(self, engine_missing): + """Completeness reflects NULL ratio.""" + profiles = engine_missing.profile_columns(columns=["att1", "att2"]) + att1_profile = get_profile_by_column(profiles, "att1") + att2_profile = get_profile_by_column(profiles, "att2") + assert 
is_close(att1_profile.completeness, 0.5, FLOAT_EPSILON) # 6/12 + assert is_close(att2_profile.completeness, 0.75, FLOAT_EPSILON) # 9/12 + + def test_completeness_all_null(self, engine_all_null): + """Completeness is 0 for all-NULL column.""" + profiles = engine_all_null.profile_columns(columns=["value"]) + profile = get_profile_by_column(profiles, "value") + assert is_close(profile.completeness, 0.0, FLOAT_EPSILON) + + +class TestDistinctValuesProfile: + """Tests for approximate distinct values in profiles.""" + + def test_distinct_values_unique(self, engine_unique): + """Distinct count for unique column.""" + profiles = engine_unique.profile_columns(columns=["unique_col"]) + profile = get_profile_by_column(profiles, "unique_col") + assert profile.approx_distinct_values == 6 + + def test_distinct_values_duplicates(self, engine_distinct): + """Distinct count handles duplicates correctly.""" + profiles = engine_distinct.profile_columns(columns=["att1"]) + profile = get_profile_by_column(profiles, "att1") + # att1: ["a", "a", "b", "b", "c", "c"] -> 3 distinct + assert profile.approx_distinct_values == 3 + + +class TestDataTypeProfile: + """Tests for data type detection in profiles.""" + + def test_data_type_string(self, engine_full): + """Data type detection for string column.""" + profiles = engine_full.profile_columns(columns=["att1"]) + profile = get_profile_by_column(profiles, "att1") + assert profile.data_type is not None + # Should be some form of string type + assert "str" in profile.data_type.lower() or "char" in profile.data_type.lower() or "text" in profile.data_type.lower() or "object" in profile.data_type.lower() + + def test_data_type_numeric(self, engine_numeric): + """Data type detection for numeric column.""" + profiles = engine_numeric.profile_columns(columns=["att1"]) + profile = get_profile_by_column(profiles, "att1") + assert profile.data_type is not None + + +class TestNumericProfileStatistics: + """Tests for numeric statistics in profiles.""" + + def test_mean_numeric(self, engine_numeric): + """Mean is calculated for numeric columns.""" + profiles = engine_numeric.profile_columns(columns=["att1"]) + profile = get_profile_by_column(profiles, "att1") + assert profile.mean is not None + assert is_close(profile.mean, 3.5, FLOAT_TOLERANCE) + + def test_min_numeric(self, engine_numeric): + """Minimum is calculated for numeric columns.""" + profiles = engine_numeric.profile_columns(columns=["att1"]) + profile = get_profile_by_column(profiles, "att1") + assert profile.minimum is not None + assert is_close(profile.minimum, 1.0, FLOAT_TOLERANCE) + + def test_max_numeric(self, engine_numeric): + """Maximum is calculated for numeric columns.""" + profiles = engine_numeric.profile_columns(columns=["att1"]) + profile = get_profile_by_column(profiles, "att1") + assert profile.maximum is not None + assert is_close(profile.maximum, 6.0, FLOAT_TOLERANCE) + + def test_sum_numeric(self, engine_numeric): + """Sum is calculated for numeric columns.""" + profiles = engine_numeric.profile_columns(columns=["att1"]) + profile = get_profile_by_column(profiles, "att1") + assert profile.sum is not None + assert is_close(profile.sum, 21.0, FLOAT_TOLERANCE) + + def test_stddev_numeric(self, engine_numeric): + """Standard deviation is calculated for numeric columns.""" + profiles = engine_numeric.profile_columns(columns=["att1"]) + profile = get_profile_by_column(profiles, "att1") + if profile.std_dev is not None: + # Population stddev (matches Spark) + assert is_close(profile.std_dev, 
1.7078251276599330, FLOAT_TOLERANCE)
+
+    def test_numeric_with_nulls(self, engine_numeric):
+        """Numeric statistics handle NULLs correctly."""
+        profiles = engine_numeric.profile_columns(columns=["att2"])
+        profile = get_profile_by_column(profiles, "att2")
+        # att2 has values [1,2,3,4,5,NULL]
+        if profile.mean is not None:
+            assert is_close(profile.mean, 3.0, FLOAT_TOLERANCE)  # (1+2+3+4+5)/5
+
+
+class TestStringProfileStatistics:
+    """Tests for string column profiles."""
+
+    def test_string_column_no_numeric_stats(self, engine_full):
+        """String columns don't have numeric statistics."""
+        profiles = engine_full.profile_columns(columns=["att1"])
+        profile = get_profile_by_column(profiles, "att1")
+        # String column shouldn't have meaningful numeric stats
+        # (or they might be None)
+        # Just verify we get a profile back
+        assert profile is not None
+        assert profile.completeness is not None
+
+
+class TestHistogramProfile:
+    """Tests for histogram in profiles."""
+
+    def test_histogram_low_cardinality(self, engine_histogram):
+        """Histogram is generated for low cardinality columns."""
+        profiles = engine_histogram.profile_columns(
+            columns=["category"],
+            low_cardinality_threshold=10
+        )
+        profile = get_profile_by_column(profiles, "category")
+        # Should have histogram for 4-value column with threshold 10
+        if profile.histogram is not None:
+            assert len(profile.histogram) > 0
+
+    def test_histogram_high_cardinality(self, engine_unique):
+        """Histogram might not be generated for high cardinality columns."""
+        profiles = engine_unique.profile_columns(
+            columns=["unique_col"],
+            low_cardinality_threshold=3
+        )
+        profile = get_profile_by_column(profiles, "unique_col")
+        # With 6 distinct and threshold 3, might skip histogram
+        assert profile is not None
+
+
+class TestQuantileProfile:
+    """Tests for quantile/percentile information in profiles."""
+
+    def test_percentiles_numeric(self, engine_quantile):
+        """Percentiles are calculated for numeric columns."""
+        profiles = engine_quantile.profile_columns(columns=["value"])
+        profile = get_profile_by_column(profiles, "value")
+        # Check for percentile attributes if present
+        if hasattr(profile, 'approx_percentiles') and profile.approx_percentiles:
+            # Should have some percentile data
+            assert len(profile.approx_percentiles) > 0
+
+
+class TestEdgeCases:
+    """Tests for edge cases in profiling."""
+
+    def test_empty_dataset(self, engine_empty):
+        """Profiling empty dataset."""
+        profiles = engine_empty.profile_columns()
+        # Should return profiles (possibly with default/None values)
+        assert isinstance(profiles, list)
+
+    def test_single_row(self, engine_single):
+        """Profiling single-row dataset."""
+        profiles = engine_single.profile_columns(columns=["att1", "item"])
+        att1_profile = get_profile_by_column(profiles, "att1")
+        item_profile = get_profile_by_column(profiles, "item")
+
+        assert att1_profile.completeness == 1.0
+        assert att1_profile.approx_distinct_values == 1
+
+        if item_profile.mean is not None:
+            assert item_profile.mean == 1.0
+        if item_profile.minimum is not None:
+            assert item_profile.minimum == 1.0
+        if item_profile.maximum is not None:
+            assert item_profile.maximum == 1.0
+
+    def test_all_null_column(self, engine_all_null):
+        """Profiling all-NULL column."""
+        profiles = engine_all_null.profile_columns(columns=["value"])
+        profile = get_profile_by_column(profiles, "value")
+        assert profile.completeness == 0.0
+        # Statistics should be None or NaN for all-NULL column
+        if profile.mean is not None and not math.isnan(profile.mean):
+            # 
Some implementations might return 0 or None + pass + + +class TestProfileDataFrame: + """Tests for profile to DataFrame conversion.""" + + def test_profiles_to_dataframe(self, engine_full): + """Profiles can be converted to DataFrame.""" + profiles = engine_full.profile_columns() + df = engine_full.profiles_to_dataframe(profiles) + + assert df is not None + assert len(df) > 0 + assert "column" in df.columns + assert "completeness" in df.columns diff --git a/tests/engines/test_duckdb_suggestions.py b/tests/engines/test_duckdb_suggestions.py new file mode 100644 index 0000000..3da9a51 --- /dev/null +++ b/tests/engines/test_duckdb_suggestions.py @@ -0,0 +1,287 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""DuckDB-only constraint suggestion tests. + +Tests the constraint suggestion functionality of the DuckDB engine. +""" + +import pytest + +from pydeequ.v2.suggestions import Rules + + +def get_suggestions_for_column(suggestions, column_name: str): + """Get all suggestions for a specific column.""" + return [s for s in suggestions if s.column_name == column_name] + + +def get_suggestions_by_constraint(suggestions, constraint_name: str): + """Get all suggestions for a specific constraint type.""" + return [s for s in suggestions if constraint_name in s.constraint_name] + + +class TestBasicSuggestions: + """Tests for basic suggestion functionality.""" + + def test_default_rules_generate_suggestions(self, engine_full): + """DEFAULT rules generate suggestions for complete columns.""" + suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT]) + # Should generate some suggestions for complete data + assert isinstance(suggestions, list) + + def test_suggestions_have_required_fields(self, engine_full): + """Suggestions contain all required fields.""" + suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT]) + if suggestions: + suggestion = suggestions[0] + assert hasattr(suggestion, 'column_name') + assert hasattr(suggestion, 'constraint_name') + assert hasattr(suggestion, 'description') + assert hasattr(suggestion, 'suggesting_rule') + + def test_restrict_to_columns(self, engine_full): + """Suggestions can be restricted to specific columns.""" + suggestions = engine_full.suggest_constraints( + columns=["att1"], + rules=[Rules.DEFAULT] + ) + # All suggestions should be for att1 (or dataset-level) + column_suggestions = [s for s in suggestions if s.column_name] + for s in column_suggestions: + assert s.column_name == "att1" or s.column_name is None + + +class TestCompletenessRuleSuggestions: + """Tests for completeness-related suggestions.""" + + def test_complete_column_suggestions(self, engine_full): + """Complete columns get completeness suggestions.""" + suggestions = engine_full.suggest_constraints( + columns=["att1"], + rules=[Rules.DEFAULT] + ) + # Should suggest isComplete or hasCompleteness for complete column + completeness_suggestions = get_suggestions_by_constraint(suggestions, 
"Complete") + # May or may not generate based on implementation + assert isinstance(suggestions, list) + + def test_incomplete_column_suggestions(self, engine_missing): + """Incomplete columns may get retain completeness suggestions.""" + suggestions = engine_missing.suggest_constraints( + columns=["att1"], + rules=[Rules.DEFAULT] + ) + # att1 is 50% complete - might suggest retaining that level + assert isinstance(suggestions, list) + + +class TestUniquenessRuleSuggestions: + """Tests for uniqueness-related suggestions.""" + + def test_unique_column_suggestions(self, engine_unique): + """Unique columns get uniqueness suggestions with COMMON rules.""" + suggestions = engine_unique.suggest_constraints( + columns=["unique_col"], + rules=[Rules.COMMON] + ) + # Should suggest isUnique or hasUniqueness for unique column + uniqueness_suggestions = get_suggestions_by_constraint(suggestions, "Unique") + # Implementation dependent + assert isinstance(suggestions, list) + + +class TestNumericalRuleSuggestions: + """Tests for numerical constraint suggestions.""" + + def test_numeric_column_suggestions(self, engine_numeric): + """Numeric columns get statistical suggestions with NUMERICAL rules.""" + suggestions = engine_numeric.suggest_constraints( + columns=["att1"], + rules=[Rules.NUMERICAL] + ) + # Should suggest hasMin, hasMax, hasMean for numeric column + assert isinstance(suggestions, list) + + def test_min_max_suggestions(self, engine_numeric): + """Numeric columns may get min/max suggestions.""" + suggestions = engine_numeric.suggest_constraints( + columns=["att1"], + rules=[Rules.NUMERICAL] + ) + min_suggestions = get_suggestions_by_constraint(suggestions, "Min") + max_suggestions = get_suggestions_by_constraint(suggestions, "Max") + # May have min/max suggestions + assert isinstance(suggestions, list) + + +class TestStringRuleSuggestions: + """Tests for string-related suggestions.""" + + def test_string_column_suggestions(self, engine_string_lengths): + """String columns get length suggestions with STRING rules.""" + suggestions = engine_string_lengths.suggest_constraints( + columns=["att1"], + rules=[Rules.STRING] + ) + # Should suggest hasMinLength, hasMaxLength for string column + assert isinstance(suggestions, list) + + +class TestCategoricalRuleSuggestions: + """Tests for categorical constraint suggestions.""" + + def test_categorical_column_suggestions(self, engine_contained_in): + """Low-cardinality columns may get containment suggestions.""" + suggestions = engine_contained_in.suggest_constraints( + columns=["status"], + rules=[Rules.DEFAULT] + ) + # May suggest isContainedIn for categorical column + assert isinstance(suggestions, list) + + +class TestMultipleRules: + """Tests for combining multiple rule sets.""" + + def test_extended_rules(self, engine_numeric): + """EXTENDED rules combine all rule sets.""" + suggestions = engine_numeric.suggest_constraints( + rules=[Rules.EXTENDED] + ) + # Should get suggestions from all rule categories + assert isinstance(suggestions, list) + + def test_multiple_rule_sets(self, engine_numeric): + """Multiple rule sets can be combined.""" + suggestions = engine_numeric.suggest_constraints( + rules=[Rules.DEFAULT, Rules.NUMERICAL] + ) + assert isinstance(suggestions, list) + + +class TestSuggestionContent: + """Tests for suggestion content quality.""" + + def test_suggestion_has_description(self, engine_full): + """Suggestions include human-readable descriptions.""" + suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT]) + if 
suggestions: + for s in suggestions: + assert s.description is not None + assert len(s.description) > 0 + + def test_suggestion_has_rule_name(self, engine_full): + """Suggestions identify the suggesting rule.""" + suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT]) + if suggestions: + for s in suggestions: + assert s.suggesting_rule is not None + + def test_suggestion_has_current_value(self, engine_full): + """Suggestions include current metric value.""" + suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT]) + if suggestions: + for s in suggestions: + # current_value may be present + assert hasattr(s, 'current_value') + + def test_suggestion_has_code_snippet(self, engine_full): + """Suggestions may include code for constraint.""" + suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT]) + if suggestions: + for s in suggestions: + # code_for_constraint may be present + assert hasattr(s, 'code_for_constraint') + + +class TestEdgeCases: + """Tests for edge cases in suggestions.""" + + def test_empty_dataset_suggestions(self, engine_empty): + """Suggestions on empty dataset.""" + suggestions = engine_empty.suggest_constraints(rules=[Rules.DEFAULT]) + # Should handle gracefully + assert isinstance(suggestions, list) + + def test_single_row_suggestions(self, engine_single): + """Suggestions on single-row dataset.""" + suggestions = engine_single.suggest_constraints(rules=[Rules.DEFAULT]) + assert isinstance(suggestions, list) + + def test_all_null_column_suggestions(self, engine_all_null): + """Suggestions on all-NULL column.""" + suggestions = engine_all_null.suggest_constraints( + columns=["value"], + rules=[Rules.DEFAULT] + ) + # Should handle all-NULL gracefully + assert isinstance(suggestions, list) + + +class TestSuggestionDataFrame: + """Tests for suggestion to DataFrame conversion.""" + + def test_suggestions_to_dataframe(self, engine_full): + """Suggestions can be converted to DataFrame.""" + suggestions = engine_full.suggest_constraints(rules=[Rules.DEFAULT]) + df = engine_full.suggestions_to_dataframe(suggestions) + + assert df is not None + if len(suggestions) > 0: + assert len(df) > 0 + assert "column_name" in df.columns + assert "constraint_name" in df.columns + + +class TestDatasetSpecificSuggestions: + """Tests for suggestions on specific dataset types.""" + + def test_numeric_dataset(self, engine_numeric): + """Numeric dataset gets appropriate suggestions.""" + suggestions = engine_numeric.suggest_constraints( + rules=[Rules.DEFAULT, Rules.NUMERICAL] + ) + # Should have suggestions for numeric columns + numeric_suggestions = get_suggestions_for_column(suggestions, "att1") + assert isinstance(suggestions, list) + + def test_string_dataset(self, engine_string_lengths): + """String dataset gets appropriate suggestions.""" + suggestions = engine_string_lengths.suggest_constraints( + rules=[Rules.DEFAULT, Rules.STRING] + ) + string_suggestions = get_suggestions_for_column(suggestions, "att1") + assert isinstance(suggestions, list) + + def test_mixed_type_dataset(self, engine_full): + """Mixed-type dataset handles all columns.""" + suggestions = engine_full.suggest_constraints( + rules=[Rules.EXTENDED] + ) + # Should have suggestions for different column types + assert isinstance(suggestions, list) + + +class TestNonNegativeRuleSuggestions: + """Tests for non-negative number suggestions.""" + + def test_positive_column_suggestions(self, engine_compliance): + """All-positive columns may get non-negative suggestions.""" + suggestions = 
engine_compliance.suggest_constraints( + columns=["positive"], + rules=[Rules.DEFAULT] + ) + # May suggest isNonNegative for positive column + assert isinstance(suggestions, list) diff --git a/tests/engines/test_operators.py b/tests/engines/test_operators.py new file mode 100644 index 0000000..62ccd24 --- /dev/null +++ b/tests/engines/test_operators.py @@ -0,0 +1,484 @@ +# -*- coding: utf-8 -*- +""" +Unit tests for SQL operators. + +These tests verify the operator abstractions work correctly in isolation, +testing SQL generation and result extraction separately from actual +database execution. +""" + +import pandas as pd +import pytest + +from pydeequ.engines import MetricResult +from pydeequ.engines.operators import ( + # Scan operators + SizeOperator, + CompletenessOperator, + MeanOperator, + SumOperator, + MinimumOperator, + MaximumOperator, + StandardDeviationOperator, + MaxLengthOperator, + MinLengthOperator, + PatternMatchOperator, + ComplianceOperator, + CorrelationOperator, + CountDistinctOperator, + ApproxCountDistinctOperator, + # Grouping operators + DistinctnessOperator, + UniquenessOperator, + UniqueValueRatioOperator, + EntropyOperator, + MutualInformationOperator, + # Factory + OperatorFactory, + # Mixins + WhereClauseMixin, + SafeExtractMixin, + ColumnAliasMixin, +) + + +class TestWhereClauseMixin: + """Tests for WhereClauseMixin.""" + + def test_wrap_agg_with_where_no_condition(self): + """Test wrapping aggregation without WHERE clause.""" + class TestClass(WhereClauseMixin): + where = None + + obj = TestClass() + result = obj.wrap_agg_with_where("AVG", "price") + assert result == "AVG(price)" + + def test_wrap_agg_with_where_with_condition(self): + """Test wrapping aggregation with WHERE clause.""" + class TestClass(WhereClauseMixin): + where = "status = 'active'" + + obj = TestClass() + result = obj.wrap_agg_with_where("AVG", "price") + assert result == "AVG(CASE WHEN status = 'active' THEN price ELSE NULL END)" + + def test_wrap_count_with_where_no_condition(self): + """Test wrapping COUNT without WHERE clause.""" + class TestClass(WhereClauseMixin): + where = None + + obj = TestClass() + result = obj.wrap_count_with_where() + assert result == "COUNT(*)" + + def test_wrap_count_with_where_with_condition(self): + """Test wrapping COUNT with WHERE clause.""" + class TestClass(WhereClauseMixin): + where = "status = 'active'" + + obj = TestClass() + result = obj.wrap_count_with_where() + assert result == "SUM(CASE WHEN status = 'active' THEN 1 ELSE 0 END)" + + def test_wrap_count_with_where_custom_condition(self): + """Test wrapping COUNT with custom condition and WHERE clause.""" + class TestClass(WhereClauseMixin): + where = "status = 'active'" + + obj = TestClass() + result = obj.wrap_count_with_where("price > 0") + assert "status = 'active'" in result + assert "price > 0" in result + + +class TestSafeExtractMixin: + """Tests for SafeExtractMixin.""" + + def test_safe_float_valid(self): + """Test extracting valid float value.""" + class TestClass(SafeExtractMixin): + pass + + obj = TestClass() + df = pd.DataFrame({"value": [42.5]}) + result = obj.safe_float(df, "value") + assert result == 42.5 + + def test_safe_float_none(self): + """Test extracting None value.""" + class TestClass(SafeExtractMixin): + pass + + obj = TestClass() + df = pd.DataFrame({"value": [None]}) + result = obj.safe_float(df, "value") + assert result is None + + def test_safe_float_missing_column(self): + """Test extracting from missing column.""" + class TestClass(SafeExtractMixin): + pass + + obj = 
TestClass() + df = pd.DataFrame({"other": [42.5]}) + result = obj.safe_float(df, "value") + assert result is None + + def test_safe_int(self): + """Test extracting integer value.""" + class TestClass(SafeExtractMixin): + pass + + obj = TestClass() + df = pd.DataFrame({"value": [42.7]}) + result = obj.safe_int(df, "value") + assert result == 42 + + +class TestColumnAliasMixin: + """Tests for ColumnAliasMixin.""" + + def test_make_alias_single_part(self): + """Test alias with single part.""" + class TestClass(ColumnAliasMixin): + pass + + obj = TestClass() + result = obj.make_alias("mean", "price") + assert result == "mean_price" + + def test_make_alias_multiple_parts(self): + """Test alias with multiple parts.""" + class TestClass(ColumnAliasMixin): + pass + + obj = TestClass() + result = obj.make_alias("corr", "price", "quantity") + assert result == "corr_price_quantity" + + def test_make_alias_sanitization(self): + """Test alias sanitizes special characters.""" + class TestClass(ColumnAliasMixin): + pass + + obj = TestClass() + result = obj.make_alias("mean", "table.column") + assert result == "mean_table_column" + + +class TestSizeOperator: + """Tests for SizeOperator.""" + + def test_get_aggregations_no_where(self): + """Test SQL generation without WHERE clause.""" + op = SizeOperator() + aggs = op.get_aggregations() + assert len(aggs) == 1 + assert "COUNT(*)" in aggs[0] + assert "size_value" in aggs[0] + + def test_get_aggregations_with_where(self): + """Test SQL generation with WHERE clause.""" + op = SizeOperator(where="status = 'active'") + aggs = op.get_aggregations() + assert len(aggs) == 1 + assert "SUM(CASE WHEN" in aggs[0] + assert "status = 'active'" in aggs[0] + + def test_extract_result(self): + """Test result extraction.""" + op = SizeOperator() + df = pd.DataFrame({"size_value": [100]}) + result = op.extract_result(df) + assert result.name == "Size" + assert result.instance == "*" + assert result.entity == "Dataset" + assert result.value == 100.0 + + +class TestCompletenessOperator: + """Tests for CompletenessOperator.""" + + def test_get_aggregations(self): + """Test SQL generation.""" + op = CompletenessOperator("email") + aggs = op.get_aggregations() + assert len(aggs) == 2 + assert any("count_email" in agg for agg in aggs) + assert any("null_count_email" in agg for agg in aggs) + + def test_extract_result_complete(self): + """Test result extraction with complete data.""" + op = CompletenessOperator("email") + df = pd.DataFrame({ + "count_email": [100], + "null_count_email": [0], + }) + result = op.extract_result(df) + assert result.value == 1.0 + + def test_extract_result_partial(self): + """Test result extraction with partial data.""" + op = CompletenessOperator("email") + df = pd.DataFrame({ + "count_email": [100], + "null_count_email": [20], + }) + result = op.extract_result(df) + assert result.value == 0.8 + + +class TestMeanOperator: + """Tests for MeanOperator.""" + + def test_get_aggregations(self): + """Test SQL generation.""" + op = MeanOperator("price") + aggs = op.get_aggregations() + assert len(aggs) == 1 + assert "AVG(price)" in aggs[0] + assert "mean_price" in aggs[0] + + def test_extract_result(self): + """Test result extraction.""" + op = MeanOperator("price") + df = pd.DataFrame({"mean_price": [42.5]}) + result = op.extract_result(df) + assert result.name == "Mean" + assert result.instance == "price" + assert result.value == 42.5 + + +class TestPatternMatchOperator: + """Tests for PatternMatchOperator.""" + + def test_get_aggregations(self): + """Test 
SQL generation."""
+        op = PatternMatchOperator("email", r"^.+@.+\..+$")
+        aggs = op.get_aggregations()
+        assert len(aggs) == 2
+        assert any("count_email" in agg for agg in aggs)
+        assert any("pattern_match_email" in agg for agg in aggs)
+        assert any("REGEXP_MATCHES" in agg for agg in aggs)
+
+    def test_extract_result(self):
+        """Test result extraction."""
+        op = PatternMatchOperator("email", r"^.+@.+\..+$")
+        df = pd.DataFrame({
+            "count_email": [100],
+            "pattern_match_email": [95],
+        })
+        result = op.extract_result(df)
+        assert result.name == "PatternMatch"
+        assert result.value == 0.95
+
+
+class TestDistinctnessOperator:
+    """Tests for DistinctnessOperator."""
+
+    def test_get_grouping_columns(self):
+        """Test grouping columns."""
+        op = DistinctnessOperator(["category"])
+        assert op.get_grouping_columns() == ["category"]
+
+    def test_build_query(self):
+        """Test query building."""
+        op = DistinctnessOperator(["category"])
+        query = op.build_query("products")
+        assert "SELECT category" in query
+        assert "GROUP BY category" in query
+        assert "distinct_count" in query
+        assert "total_count" in query
+
+    def test_extract_result(self):
+        """Test result extraction."""
+        op = DistinctnessOperator(["category"])
+        df = pd.DataFrame({
+            "distinct_count": [10],
+            "total_count": [100],
+        })
+        result = op.extract_result(df)
+        assert result.name == "Distinctness"
+        assert result.value == 0.1
+
+
+class TestUniquenessOperator:
+    """Tests for UniquenessOperator."""
+
+    def test_build_query(self):
+        """Test query building."""
+        op = UniquenessOperator(["id"])
+        query = op.build_query("users")
+        assert "GROUP BY id" in query
+        assert "HAVING" not in query  # unique rows are counted via CASE, not a HAVING clause
+        assert "unique_count" in query
+        assert "total_count" in query
+
+    def test_extract_result(self):
+        """Test result extraction."""
+        op = UniquenessOperator(["id"])
+        df = pd.DataFrame({
+            "unique_count": [90],
+            "total_count": [100],
+        })
+        result = op.extract_result(df)
+        assert result.name == "Uniqueness"
+        assert result.value == 0.9
+
+
+class TestEntropyOperator:
+    """Tests for EntropyOperator."""
+
+    def test_build_query(self):
+        """Test query building."""
+        op = EntropyOperator("category")
+        query = op.build_query("products")
+        assert "GROUP BY category" in query
+        assert "LN" in query
+        assert "entropy" in query
+
+    def test_extract_result(self):
+        """Test result extraction."""
+        op = EntropyOperator("category")
+        df = pd.DataFrame({"entropy": [2.5]})
+        result = op.extract_result(df)
+        assert result.name == "Entropy"
+        assert result.value == 2.5
+
+
+class TestOperatorFactory:
+    """Tests for OperatorFactory."""
+
+    def test_is_scan_operator(self):
+        """Test scan operator detection."""
+        from pydeequ.v2.analyzers import Mean, Sum, Completeness
+
+        assert OperatorFactory.is_scan_operator(Mean("price"))
+        assert OperatorFactory.is_scan_operator(Sum("amount"))
+        assert OperatorFactory.is_scan_operator(Completeness("email"))
+
+    def test_is_grouping_operator(self):
+        """Test grouping operator detection."""
+        from pydeequ.v2.analyzers import Distinctness, Uniqueness, Entropy
+
+        assert OperatorFactory.is_grouping_operator(Distinctness("category"))
+        assert OperatorFactory.is_grouping_operator(Uniqueness("id"))
+        assert OperatorFactory.is_grouping_operator(Entropy("status"))
+
+    def test_create_scan_operator(self):
+        """Test creating scan operator from analyzer."""
+        from pydeequ.v2.analyzers import Mean
+
+        analyzer = Mean("price", where="status = 'active'")
+        operator = OperatorFactory.create(analyzer)
+
+        assert 
operator is not None + assert isinstance(operator, MeanOperator) + assert operator.column == "price" + assert operator.where == "status = 'active'" + + def test_create_grouping_operator(self): + """Test creating grouping operator from analyzer.""" + from pydeequ.v2.analyzers import Distinctness + + analyzer = Distinctness(["category", "brand"]) + operator = OperatorFactory.create(analyzer) + + assert operator is not None + assert isinstance(operator, DistinctnessOperator) + assert operator.columns == ["category", "brand"] + + def test_is_supported(self): + """Test analyzer support checking.""" + from pydeequ.v2.analyzers import Mean, Histogram, ApproxQuantile, DataType + + assert OperatorFactory.is_supported(Mean("price")) + # Histogram and ApproxQuantile are now supported as operators + assert OperatorFactory.is_supported(Histogram("category")) + assert OperatorFactory.is_supported(ApproxQuantile("price", 0.5)) + # DataType is now supported via the metadata registry + assert OperatorFactory.is_supported(DataType("category")) + assert OperatorFactory.is_metadata_operator(DataType("category")) + + +class TestOperatorIntegration: + """Integration tests for operators with actual DuckDB.""" + + @pytest.fixture + def duckdb_conn(self): + """Create a DuckDB connection with test data.""" + import duckdb + + conn = duckdb.connect(":memory:") + conn.execute(""" + CREATE TABLE test_data AS SELECT * FROM ( + VALUES + (1, 'Alice', 100.0, 'A', 'active'), + (2, 'Bob', 200.0, 'B', 'active'), + (3, 'Carol', 150.0, 'A', 'inactive'), + (4, 'Dave', NULL, 'C', 'active'), + (5, 'Eve', 300.0, 'A', 'active') + ) AS t(id, name, amount, category, status) + """) + yield conn + conn.close() + + def test_scan_operators_batch_execution(self, duckdb_conn): + """Test batch execution of multiple scan operators.""" + operators = [ + SizeOperator(), + MeanOperator("amount"), + MaximumOperator("amount"), + MinimumOperator("amount"), + ] + + # Collect all aggregations + aggregations = [] + for op in operators: + aggregations.extend(op.get_aggregations()) + + # Execute single query + query = f"SELECT {', '.join(aggregations)} FROM test_data" + result = duckdb_conn.execute(query).fetchdf() + + # Extract results + results = [op.extract_result(result) for op in operators] + + assert results[0].value == 5.0 # Size + assert results[1].value == 187.5 # Mean (750/4 non-null) + assert results[2].value == 300.0 # Maximum + assert results[3].value == 100.0 # Minimum + + def test_grouping_operator_execution(self, duckdb_conn): + """Test execution of grouping operator.""" + op = DistinctnessOperator(["category"]) + query = op.build_query("test_data") + result = duckdb_conn.execute(query).fetchdf() + metric = op.extract_result(result) + + # 3 distinct categories / 5 rows = 0.6 + assert metric.name == "Distinctness" + assert metric.value == 0.6 + + def test_completeness_operator(self, duckdb_conn): + """Test completeness operator with NULL values.""" + op = CompletenessOperator("amount") + aggs = op.get_aggregations() + query = f"SELECT {', '.join(aggs)} FROM test_data" + result = duckdb_conn.execute(query).fetchdf() + metric = op.extract_result(result) + + # 4 non-null out of 5 + assert metric.value == 0.8 + + def test_operator_with_where_clause(self, duckdb_conn): + """Test operator with WHERE clause filtering.""" + op = MeanOperator("amount", where="status = 'active'") + aggs = op.get_aggregations() + query = f"SELECT {', '.join(aggs)} FROM test_data" + result = duckdb_conn.execute(query).fetchdf() + metric = op.extract_result(result) + 
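+        # Expected aggregation, per the WhereClauseMixin behavior verified above
+        # (the exact alias is an assumption based on ColumnAliasMixin):
+        #   AVG(CASE WHEN status = 'active' THEN amount ELSE NULL END) AS mean_amount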
+ # Active rows: 100, 200, NULL, 300 -> mean of non-null = 200 + assert metric.value == 200.0 diff --git a/tests/engines/test_suggestion_rules.py b/tests/engines/test_suggestion_rules.py new file mode 100644 index 0000000..248182f --- /dev/null +++ b/tests/engines/test_suggestion_rules.py @@ -0,0 +1,462 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for suggestion rules. + +Tests the individual suggestion rules in isolation using mock column profiles. +""" + +import json +import pytest + +from pydeequ.engines import ColumnProfile +from pydeequ.engines.suggestions import ( + RuleRegistry, + SuggestionRunner, + CompleteIfCompleteRule, + RetainCompletenessRule, + NonNegativeNumbersRule, + CategoricalRangeRule, + HasMinRule, + HasMaxRule, + HasMeanRule, + HasMinLengthRule, + HasMaxLengthRule, + UniqueIfApproximatelyUniqueRule, +) + + +def make_profile( + column: str = "test_col", + completeness: float = 1.0, + approx_distinct_values: int = 10, + data_type: str = "INTEGER", + minimum: float = None, + maximum: float = None, + mean: float = None, + histogram: str = None, +) -> ColumnProfile: + """Create a test column profile with specified attributes.""" + return ColumnProfile( + column=column, + completeness=completeness, + approx_distinct_values=approx_distinct_values, + data_type=data_type, + minimum=minimum, + maximum=maximum, + mean=mean, + histogram=histogram, + ) + + +class TestCompleteIfCompleteRule: + """Tests for CompleteIfComplete rule.""" + + def test_applies_when_fully_complete(self): + """Rule applies when completeness is 1.0.""" + rule = CompleteIfCompleteRule() + profile = make_profile(completeness=1.0) + assert rule.applies_to(profile) is True + + def test_does_not_apply_when_not_complete(self): + """Rule does not apply when completeness < 1.0.""" + rule = CompleteIfCompleteRule() + profile = make_profile(completeness=0.95) + assert rule.applies_to(profile) is False + + def test_generates_correct_suggestion(self): + """Rule generates isComplete suggestion.""" + rule = CompleteIfCompleteRule() + profile = make_profile(column="my_column", completeness=1.0) + suggestion = rule.generate(profile) + + assert suggestion.column_name == "my_column" + assert suggestion.constraint_name == "Completeness" + assert suggestion.suggesting_rule == "CompleteIfComplete" + assert ".isComplete" in suggestion.code_for_constraint + + def test_rule_sets(self): + """Rule belongs to DEFAULT and EXTENDED sets.""" + rule = CompleteIfCompleteRule() + assert "DEFAULT" in rule.rule_sets + assert "EXTENDED" in rule.rule_sets + + +class TestRetainCompletenessRule: + """Tests for RetainCompleteness rule.""" + + def test_applies_when_high_completeness(self): + """Rule applies when completeness >= 0.9 and < 1.0.""" + rule = RetainCompletenessRule() + profile = make_profile(completeness=0.95) + assert rule.applies_to(profile) is True + + def test_does_not_apply_when_fully_complete(self): + """Rule does not apply when 
completeness is 1.0.""" + rule = RetainCompletenessRule() + profile = make_profile(completeness=1.0) + assert rule.applies_to(profile) is False + + def test_does_not_apply_when_low_completeness(self): + """Rule does not apply when completeness < threshold.""" + rule = RetainCompletenessRule() + profile = make_profile(completeness=0.85) + assert rule.applies_to(profile) is False + + def test_generates_correct_suggestion(self): + """Rule generates hasCompleteness suggestion.""" + rule = RetainCompletenessRule() + profile = make_profile(column="my_column", completeness=0.95) + suggestion = rule.generate(profile) + + assert suggestion.column_name == "my_column" + assert suggestion.suggesting_rule == "RetainCompleteness" + assert ".hasCompleteness" in suggestion.code_for_constraint + + +class TestNonNegativeNumbersRule: + """Tests for NonNegativeNumbers rule.""" + + def test_applies_when_minimum_non_negative(self): + """Rule applies when minimum >= 0.""" + rule = NonNegativeNumbersRule() + profile = make_profile(minimum=0.0) + assert rule.applies_to(profile) is True + + def test_applies_when_minimum_positive(self): + """Rule applies when minimum > 0.""" + rule = NonNegativeNumbersRule() + profile = make_profile(minimum=5.0) + assert rule.applies_to(profile) is True + + def test_does_not_apply_when_minimum_negative(self): + """Rule does not apply when minimum < 0.""" + rule = NonNegativeNumbersRule() + profile = make_profile(minimum=-1.0) + assert rule.applies_to(profile) is False + + def test_does_not_apply_when_no_minimum(self): + """Rule does not apply when minimum is None.""" + rule = NonNegativeNumbersRule() + profile = make_profile(minimum=None) + assert rule.applies_to(profile) is False + + def test_generates_correct_suggestion(self): + """Rule generates isNonNegative suggestion.""" + rule = NonNegativeNumbersRule() + profile = make_profile(column="amount", minimum=0.0) + suggestion = rule.generate(profile) + + assert suggestion.column_name == "amount" + assert suggestion.suggesting_rule == "NonNegativeNumbers" + assert ".isNonNegative" in suggestion.code_for_constraint + + +class TestCategoricalRangeRule: + """Tests for CategoricalRange rule.""" + + def test_applies_when_low_cardinality_histogram(self): + """Rule applies when histogram has <= 10 values.""" + rule = CategoricalRangeRule() + histogram = json.dumps({"A": 10, "B": 20, "C": 30}) + profile = make_profile(histogram=histogram) + assert rule.applies_to(profile) is True + + def test_does_not_apply_when_no_histogram(self): + """Rule does not apply when no histogram.""" + rule = CategoricalRangeRule() + profile = make_profile(histogram=None) + assert rule.applies_to(profile) is False + + def test_does_not_apply_when_high_cardinality(self): + """Rule does not apply when histogram has > 10 values.""" + rule = CategoricalRangeRule() + histogram = json.dumps({f"val_{i}": i for i in range(20)}) + profile = make_profile(histogram=histogram) + assert rule.applies_to(profile) is False + + def test_generates_correct_suggestion(self): + """Rule generates isContainedIn suggestion.""" + rule = CategoricalRangeRule() + histogram = json.dumps({"A": 10, "B": 20}) + profile = make_profile(column="status", histogram=histogram) + suggestion = rule.generate(profile) + + assert suggestion.column_name == "status" + assert suggestion.suggesting_rule == "CategoricalRangeRule" + assert ".isContainedIn" in suggestion.code_for_constraint + + +class TestHasMinRule: + """Tests for HasMin rule.""" + + def test_applies_when_numeric_with_stats(self): + """Rule 
applies when minimum and mean are present.""" + rule = HasMinRule() + profile = make_profile(minimum=0.0, mean=5.0) + assert rule.applies_to(profile) is True + + def test_does_not_apply_when_no_minimum(self): + """Rule does not apply when minimum is None.""" + rule = HasMinRule() + profile = make_profile(minimum=None, mean=5.0) + assert rule.applies_to(profile) is False + + def test_generates_correct_suggestion(self): + """Rule generates hasMin suggestion.""" + rule = HasMinRule() + profile = make_profile(column="value", minimum=1.0, mean=5.0) + suggestion = rule.generate(profile) + + assert suggestion.column_name == "value" + assert suggestion.suggesting_rule == "HasMin" + assert ".hasMin" in suggestion.code_for_constraint + + def test_rule_sets(self): + """Rule belongs to NUMERICAL and EXTENDED sets.""" + rule = HasMinRule() + assert "NUMERICAL" in rule.rule_sets + assert "EXTENDED" in rule.rule_sets + + +class TestHasMaxRule: + """Tests for HasMax rule.""" + + def test_applies_when_numeric_with_stats(self): + """Rule applies when maximum and mean are present.""" + rule = HasMaxRule() + profile = make_profile(maximum=10.0, mean=5.0) + assert rule.applies_to(profile) is True + + def test_generates_correct_suggestion(self): + """Rule generates hasMax suggestion.""" + rule = HasMaxRule() + profile = make_profile(column="value", maximum=10.0, mean=5.0) + suggestion = rule.generate(profile) + + assert suggestion.column_name == "value" + assert suggestion.suggesting_rule == "HasMax" + assert ".hasMax" in suggestion.code_for_constraint + + +class TestHasMeanRule: + """Tests for HasMean rule.""" + + def test_applies_when_mean_present(self): + """Rule applies when mean is present.""" + rule = HasMeanRule() + profile = make_profile(mean=5.0) + assert rule.applies_to(profile) is True + + def test_does_not_apply_when_no_mean(self): + """Rule does not apply when mean is None.""" + rule = HasMeanRule() + profile = make_profile(mean=None) + assert rule.applies_to(profile) is False + + def test_generates_correct_suggestion(self): + """Rule generates hasMean suggestion with range.""" + rule = HasMeanRule() + profile = make_profile(column="value", mean=100.0) + suggestion = rule.generate(profile) + + assert suggestion.column_name == "value" + assert suggestion.suggesting_rule == "HasMean" + assert ".hasMean" in suggestion.code_for_constraint + assert "between" in suggestion.code_for_constraint + + +class TestHasMinLengthRule: + """Tests for HasMinLength rule.""" + + def test_applies_to_string_columns(self): + """Rule applies to string data types.""" + rule = HasMinLengthRule() + profile = make_profile(data_type="VARCHAR") + assert rule.applies_to(profile) is True + + def test_does_not_apply_to_numeric_columns(self): + """Rule does not apply to numeric data types.""" + rule = HasMinLengthRule() + profile = make_profile(data_type="INTEGER") + assert rule.applies_to(profile) is False + + def test_generates_correct_suggestion(self): + """Rule generates hasMinLength suggestion.""" + rule = HasMinLengthRule() + profile = make_profile(column="name", data_type="VARCHAR") + suggestion = rule.generate(profile, min_length=3) + + assert suggestion.column_name == "name" + assert suggestion.suggesting_rule == "HasMinLength" + assert ".hasMinLength" in suggestion.code_for_constraint + + def test_returns_none_when_no_length(self): + """Rule returns None when no min_length provided.""" + rule = HasMinLengthRule() + profile = make_profile(data_type="VARCHAR") + suggestion = rule.generate(profile, min_length=None) + assert 
suggestion is None + + def test_rule_sets(self): + """Rule belongs to STRING and EXTENDED sets.""" + rule = HasMinLengthRule() + assert "STRING" in rule.rule_sets + assert "EXTENDED" in rule.rule_sets + + +class TestHasMaxLengthRule: + """Tests for HasMaxLength rule.""" + + def test_applies_to_string_columns(self): + """Rule applies to string data types.""" + rule = HasMaxLengthRule() + profile = make_profile(data_type="TEXT") + assert rule.applies_to(profile) is True + + def test_generates_correct_suggestion(self): + """Rule generates hasMaxLength suggestion.""" + rule = HasMaxLengthRule() + profile = make_profile(column="name", data_type="VARCHAR") + suggestion = rule.generate(profile, max_length=50) + + assert suggestion.column_name == "name" + assert suggestion.suggesting_rule == "HasMaxLength" + assert ".hasMaxLength" in suggestion.code_for_constraint + + +class TestUniqueIfApproximatelyUniqueRule: + """Tests for UniqueIfApproximatelyUnique rule.""" + + def test_generates_suggestion_when_unique(self): + """Rule generates isUnique when distinct values >= 99% of rows.""" + rule = UniqueIfApproximatelyUniqueRule() + profile = make_profile(column="id", approx_distinct_values=100) + suggestion = rule.generate(profile, row_count=100) + + assert suggestion is not None + assert suggestion.column_name == "id" + assert suggestion.suggesting_rule == "UniqueIfApproximatelyUnique" + assert ".isUnique" in suggestion.code_for_constraint + + def test_does_not_generate_when_not_unique(self): + """Rule returns None when distinct values < 99% of rows.""" + rule = UniqueIfApproximatelyUniqueRule() + profile = make_profile(approx_distinct_values=50) + suggestion = rule.generate(profile, row_count=100) + assert suggestion is None + + def test_returns_none_when_no_row_count(self): + """Rule returns None when row_count is not provided.""" + rule = UniqueIfApproximatelyUniqueRule() + profile = make_profile(approx_distinct_values=100) + suggestion = rule.generate(profile, row_count=None) + assert suggestion is None + + def test_rule_sets(self): + """Rule belongs to COMMON and EXTENDED sets.""" + rule = UniqueIfApproximatelyUniqueRule() + assert "COMMON" in rule.rule_sets + assert "EXTENDED" in rule.rule_sets + + +class TestRuleRegistry: + """Tests for RuleRegistry.""" + + def test_registry_has_default_rules(self): + """Registry has rules registered by default.""" + rules = RuleRegistry.get_all_rules() + assert len(rules) > 0 + + def test_get_rules_for_sets_default(self): + """Can retrieve DEFAULT rules.""" + rules = RuleRegistry.get_rules_for_sets(["DEFAULT"]) + rule_names = [r.name for r in rules] + assert "CompleteIfComplete" in rule_names + assert "NonNegativeNumbers" in rule_names + + def test_get_rules_for_sets_numerical(self): + """Can retrieve NUMERICAL rules.""" + rules = RuleRegistry.get_rules_for_sets(["NUMERICAL"]) + rule_names = [r.name for r in rules] + assert "HasMin" in rule_names + assert "HasMax" in rule_names + assert "HasMean" in rule_names + + def test_get_rules_for_sets_string(self): + """Can retrieve STRING rules.""" + rules = RuleRegistry.get_rules_for_sets(["STRING"]) + rule_names = [r.name for r in rules] + assert "HasMinLength" in rule_names + assert "HasMaxLength" in rule_names + + def test_get_rules_for_multiple_sets(self): + """Can retrieve rules from multiple sets.""" + rules = RuleRegistry.get_rules_for_sets(["DEFAULT", "NUMERICAL"]) + rule_names = [r.name for r in rules] + assert "CompleteIfComplete" in rule_names + assert "HasMin" in rule_names + + +class TestSuggestionRunner: 
+ """Tests for SuggestionRunner.""" + + def test_runner_default_rules(self): + """Runner uses DEFAULT rules by default.""" + runner = SuggestionRunner() + assert runner.rule_sets == ["DEFAULT"] + + def test_runner_custom_rules(self): + """Runner can use custom rule sets.""" + runner = SuggestionRunner(rule_sets=["NUMERICAL", "STRING"]) + assert "NUMERICAL" in runner.rule_sets + assert "STRING" in runner.rule_sets + + def test_run_generates_suggestions(self): + """Runner generates suggestions from profiles.""" + runner = SuggestionRunner(rule_sets=["DEFAULT"]) + profiles = [ + make_profile(column="complete_col", completeness=1.0), + make_profile(column="partial_col", completeness=0.95), + ] + suggestions = runner.run(profiles) + + # Should have suggestions for both columns + column_names = [s.column_name for s in suggestions] + assert "complete_col" in column_names + assert "partial_col" in column_names + + def test_run_with_numeric_profiles(self): + """Runner generates numeric suggestions.""" + runner = SuggestionRunner(rule_sets=["NUMERICAL"]) + profiles = [ + make_profile(column="value", minimum=0.0, maximum=100.0, mean=50.0), + ] + suggestions = runner.run(profiles) + + rule_names = [s.suggesting_rule for s in suggestions] + assert "HasMin" in rule_names + assert "HasMax" in rule_names + assert "HasMean" in rule_names + + def test_run_with_row_count_for_uniqueness(self): + """Runner uses row_count for uniqueness checks.""" + runner = SuggestionRunner(rule_sets=["COMMON"]) + profiles = [ + make_profile(column="id", approx_distinct_values=100), + ] + suggestions = runner.run(profiles, row_count=100) + + rule_names = [s.suggesting_rule for s in suggestions] + assert "UniqueIfApproximatelyUnique" in rule_names diff --git a/tests/helpers/__init__.py b/tests/helpers/__init__.py new file mode 100644 index 0000000..9c7f9e4 --- /dev/null +++ b/tests/helpers/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Test helpers for PyDeequ tests.""" + +from tests.helpers.spark_server import SparkConnectServer, SparkServerConfig + +__all__ = ["SparkConnectServer", "SparkServerConfig"] diff --git a/tests/helpers/spark_server.py b/tests/helpers/spark_server.py new file mode 100644 index 0000000..2679433 --- /dev/null +++ b/tests/helpers/spark_server.py @@ -0,0 +1,193 @@ +# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Spark Connect server management for tests. + +This module provides utilities to start and manage a Spark Connect server +for running integration tests that require Spark. +""" + +import os +import socket +import subprocess +import time +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class SparkServerConfig: + """Configuration for Spark Connect server.""" + + java_home: str = field( + default_factory=lambda: os.environ.get( + "JAVA_HOME", + "/Library/Java/JavaVirtualMachines/amazon-corretto-17.jdk/Contents/Home", + ) + ) + spark_home: str = field( + default_factory=lambda: os.environ.get( + "SPARK_HOME", "/Volumes/workplace/deequ_rewrite/spark-3.5.0-bin-hadoop3" + ) + ) + port: int = 15002 + startup_timeout: int = 60 + poll_interval: float = 1.0 + driver_memory: str = "4g" + executor_memory: str = "4g" + deequ_jar: str = field( + default_factory=lambda: os.environ.get( + "DEEQU_JAR", + "/Volumes/workplace/deequ_rewrite/deequ/target/deequ_2.12-2.1.0b-spark-3.5.jar" + ) + ) + + +class SparkConnectServer: + """Manages Spark Connect server lifecycle for tests.""" + + def __init__(self, config: Optional[SparkServerConfig] = None): + """ + Initialize Spark Connect server manager. + + Args: + config: Server configuration (uses defaults if not provided) + """ + self.config = config or SparkServerConfig() + self._process: Optional[subprocess.Popen] = None + self._started_by_us = False + + def is_running(self) -> bool: + """Check if Spark Connect server is running by attempting to connect.""" + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(1) + result = sock.connect_ex(("localhost", self.config.port)) + sock.close() + return result == 0 + except (socket.error, OSError): + return False + + def start(self) -> float: + """ + Start Spark Connect server if not already running. 
+ + Returns: + Time taken to start the server (0 if already running) + + Raises: + RuntimeError: If server fails to start within timeout + """ + if self.is_running(): + print(f"Spark Connect server already running on port {self.config.port}") + return 0.0 + + start_time = time.time() + + # Build the startup command + start_script = os.path.join(self.config.spark_home, "sbin", "start-connect-server.sh") + + if not os.path.exists(start_script): + raise RuntimeError(f"Spark Connect start script not found: {start_script}") + + cmd = [ + start_script, + "--conf", f"spark.driver.memory={self.config.driver_memory}", + "--conf", f"spark.executor.memory={self.config.executor_memory}", + "--packages", "org.apache.spark:spark-connect_2.12:3.5.0", + "--jars", self.config.deequ_jar, + "--conf", "spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin", + ] + + # Set up environment + env = os.environ.copy() + env["JAVA_HOME"] = self.config.java_home + env["SPARK_HOME"] = self.config.spark_home + + print(f"Starting Spark Connect server on port {self.config.port}...") + print(f" JAVA_HOME: {self.config.java_home}") + print(f" SPARK_HOME: {self.config.spark_home}") + + # Start the server + self._process = subprocess.Popen( + cmd, + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + self._started_by_us = True + + # Wait for server to be ready + deadline = time.time() + self.config.startup_timeout + while time.time() < deadline: + if self.is_running(): + elapsed = time.time() - start_time + print(f"Spark Connect server started in {elapsed:.1f}s") + return elapsed + time.sleep(self.config.poll_interval) + + # Timeout - try to get error output + if self._process: + self._process.terminate() + _, stderr = self._process.communicate(timeout=5) + error_msg = stderr.decode() if stderr else "Unknown error" + self._process = None + self._started_by_us = False + raise RuntimeError( + f"Spark Connect server failed to start within {self.config.startup_timeout}s: {error_msg[:500]}" + ) + + raise RuntimeError( + f"Spark Connect server failed to start within {self.config.startup_timeout}s" + ) + + def stop(self) -> None: + """Stop Spark Connect server if we started it.""" + if not self._started_by_us: + print("Spark Connect server was not started by us, skipping stop") + return + + stop_script = os.path.join(self.config.spark_home, "sbin", "stop-connect-server.sh") + + if os.path.exists(stop_script): + print("Stopping Spark Connect server...") + env = os.environ.copy() + env["JAVA_HOME"] = self.config.java_home + env["SPARK_HOME"] = self.config.spark_home + + try: + subprocess.run( + [stop_script], + env=env, + timeout=30, + capture_output=True, + ) + print("Spark Connect server stopped") + except subprocess.TimeoutExpired: + print("Warning: stop script timed out") + except Exception as e: + print(f"Warning: Error stopping server: {e}") + else: + # Fall back to killing the process directly + if self._process: + print("Terminating Spark Connect server process...") + self._process.terminate() + try: + self._process.wait(timeout=10) + except subprocess.TimeoutExpired: + self._process.kill() + print("Spark Connect server process terminated") + + self._started_by_us = False + self._process = None diff --git a/tests/v2/conftest.py b/tests/v2/conftest.py index 0474335..611d170 100644 --- a/tests/v2/conftest.py +++ b/tests/v2/conftest.py @@ -20,11 +20,13 @@ import pytest from pyspark.sql import Row, SparkSession -@pytest.fixture(scope="session") -def spark(): 
+@pytest.fixture(scope="module") +def spark(spark_connect_server): """ - Session-scoped Spark Connect session. - Shared across all tests for efficiency. + Module-scoped Spark Connect session. + + Depends on spark_connect_server fixture from tests/conftest.py + to ensure the server is running before creating the session. """ remote_url = os.environ.get("SPARK_REMOTE", "sc://localhost:15002") session = SparkSession.builder.remote(remote_url).getOrCreate() diff --git a/tests/v2/test_e2e_spark_connect.py b/tests/v2/test_e2e_spark_connect.py index 58c18fd..1dc019d 100644 --- a/tests/v2/test_e2e_spark_connect.py +++ b/tests/v2/test_e2e_spark_connect.py @@ -42,14 +42,8 @@ # Import the new Spark Connect API from pydeequ.v2.verification import AnalysisRunner, VerificationSuite -# Skip all tests if SPARK_REMOTE is not set -pytestmark = pytest.mark.skipif( - "SPARK_REMOTE" not in os.environ, - reason="SPARK_REMOTE environment variable not set. Start Spark Connect server first.", -) - - -# Note: spark fixture is defined in conftest.py (session-scoped) +# Note: spark fixture is defined in conftest.py and depends on spark_connect_server +# which automatically starts the server if needed @pytest.fixture(scope="module") diff --git a/tutorials/data_quality_example_duckdb.py b/tutorials/data_quality_example_duckdb.py new file mode 100644 index 0000000..29d4aff --- /dev/null +++ b/tutorials/data_quality_example_duckdb.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +""" +Testing Data Quality at Scale with PyDeequ + DuckDB + +This example demonstrates using PyDeequ with DuckDB as the execution backend, +enabling data quality checks without a Spark cluster. + +It covers: +- Data analysis (AnalysisRunner) +- Constraint verification (VerificationSuite) +- Column profiling (ColumnProfilerRunner) +- Constraint suggestions (ConstraintSuggestionRunner) + +Prerequisites: +1. Install dependencies: + pip install duckdb pandas + +2. 
Run this script: + python data_quality_example_duckdb.py +""" + +import duckdb +import pydeequ +from pydeequ.v2.analyzers import ( + Size, + Completeness, + Distinctness, + Mean, + Minimum, + Maximum, + StandardDeviation, + Correlation, + Uniqueness, +) +from pydeequ.v2.checks import Check, CheckLevel +from pydeequ.v2.verification import AnalysisRunner, VerificationSuite +from pydeequ.v2.predicates import eq, gte, lte, between +from pydeequ.v2.profiles import ColumnProfilerRunner +from pydeequ.v2.suggestions import ConstraintSuggestionRunner, Rules + + +def create_sample_data(con: duckdb.DuckDBPyConnection): + """Create a sample product reviews dataset for demonstration.""" + con.execute(""" + CREATE TABLE reviews AS SELECT * FROM (VALUES + ('R001', 'C100', 'P001', 'US', 5, 10, 12, 2023, 'Great Product', 'Y'), + ('R002', 'C101', 'P002', 'US', 4, 8, 10, 2023, 'Good Value', 'Y'), + ('R003', 'C102', 'P001', 'UK', 5, 15, 18, 2022, 'Great Product', 'N'), + ('R004', 'C103', 'P003', 'DE', 3, 5, 8, 2022, 'Decent Item', 'Y'), + ('R005', 'C104', 'P002', 'FR', 4, 12, 15, 2021, 'Good Value', 'N'), + ('R006', 'C105', 'P004', 'JP', 5, 20, 22, 2023, 'Excellent!', 'Y'), + ('R007', 'C106', 'P001', 'US', 2, 3, 10, 2020, 'Great Product', 'N'), + ('R008', 'C107', 'P005', 'UK', 1, 25, 30, 2021, 'Disappointing', 'Y'), + ('R009', 'C108', 'P002', NULL, 4, 7, 9, 2023, 'Good Value', 'Y'), + ('R001', 'C109', 'P003', 'US', 3, 4, 6, 2022, 'Decent Item', 'N') + ) AS t(review_id, customer_id, product_id, marketplace, star_rating, + helpful_votes, total_votes, review_year, product_title, insight) + """) + + +def run_data_analysis(engine): + """ + Run data analysis using AnalysisRunner. + + This demonstrates computing various metrics on the dataset: + - Size: Total row count + - Completeness: Ratio of non-null values + - Distinctness: Ratio of distinct values + - Mean, Min, Max: Statistical measures + - Correlation: Relationship between columns + """ + print("\n" + "=" * 60) + print("DATA ANALYSIS") + print("=" * 60) + + result = (AnalysisRunner() + .on_engine(engine) + .addAnalyzer(Size()) + .addAnalyzer(Completeness("review_id")) + .addAnalyzer(Completeness("marketplace")) + .addAnalyzer(Distinctness(["review_id"])) + .addAnalyzer(Mean("star_rating")) + .addAnalyzer(Minimum("star_rating")) + .addAnalyzer(Maximum("star_rating")) + .addAnalyzer(StandardDeviation("star_rating")) + .addAnalyzer(Correlation("total_votes", "helpful_votes")) + .run()) + + print("\nAnalysis Results:") + print(result.to_string(index=False)) + + # Extract key insights + metrics = {(r["name"], r["instance"]): r["value"] for _, r in result.iterrows()} + + print("\nKey Insights:") + print(f" - Dataset contains {int(metrics.get(('Size', '*'), 0))} reviews") + print(f" - review_id completeness: {metrics.get(('Completeness', 'review_id'), 0):.1%}") + print(f" - marketplace completeness: {metrics.get(('Completeness', 'marketplace'), 0):.1%}") + print(f" - review_id distinctness: {metrics.get(('Distinctness', 'review_id'), 0):.1%}") + print(f" - Average star rating: {metrics.get(('Mean', 'star_rating'), 0):.2f}") + print(f" - Star rating range: {metrics.get(('Minimum', 'star_rating'), 0):.0f} - {metrics.get(('Maximum', 'star_rating'), 0):.0f}") + + return result + + +def run_constraint_verification(engine): + """ + Run constraint verification using VerificationSuite. 
+ + This demonstrates defining and verifying data quality rules: + - Size checks + - Completeness checks + - Uniqueness checks + - Range checks (min/max) + - Categorical value checks + """ + print("\n" + "=" * 60) + print("CONSTRAINT VERIFICATION") + print("=" * 60) + + # Define checks using the V2 predicate API + check = (Check(CheckLevel.Warning, "Product Reviews Quality Check") + # Size check: at least 5 reviews + .hasSize(gte(5)) + # Completeness checks + .isComplete("review_id") + .isComplete("customer_id") + .hasCompleteness("marketplace", gte(0.8)) # Allow some missing + # Uniqueness check + .isUnique("review_id") + # Star rating range check + .hasMin("star_rating", eq(1.0)) + .hasMax("star_rating", eq(5.0)) + .hasMean("star_rating", between(1.0, 5.0)) + # Year range check + .hasMin("review_year", gte(2015)) + .hasMax("review_year", lte(2025)) + # Categorical check + .isContainedIn("marketplace", ["US", "UK", "DE", "JP", "FR"]) + .isContainedIn("insight", ["Y", "N"]) + ) + + result = (VerificationSuite() + .on_engine(engine) + .addCheck(check) + .run()) + + print("\nVerification Results:") + print(result.to_string(index=False)) + + # Summarize results + passed = (result["constraint_status"] == "Success").sum() + failed = (result["constraint_status"] == "Failure").sum() + + print(f"\nSummary: {passed} passed, {failed} failed out of {len(result)} constraints") + + if failed > 0: + print("\nFailed Constraints:") + for _, row in result[result["constraint_status"] == "Failure"].iterrows(): + print(f" - {row['constraint']}") + if row["constraint_message"]: + print(f" Message: {row['constraint_message']}") + + return result + + +def run_column_profiling(engine): + """ + Run column profiling using ColumnProfilerRunner. + + This automatically computes statistics for each column: + - Completeness + - Approximate distinct values + - Data type detection + - Numeric statistics (mean, min, max, etc.) + """ + print("\n" + "=" * 60) + print("COLUMN PROFILING") + print("=" * 60) + + result = (ColumnProfilerRunner() + .on_engine(engine) + .withLowCardinalityHistogramThreshold(10) # Generate histograms for low-cardinality columns + .run()) + + print("\nColumn Profiles:") + # Show selected columns for readability + cols_to_show = ["column", "completeness", "approx_distinct_values", "data_type", "mean", "minimum", "maximum"] + available_cols = [c for c in cols_to_show if c in result.columns] + print(result[available_cols].to_string(index=False)) + + return result + + +def run_constraint_suggestions(engine): + """ + Run automated constraint suggestion using ConstraintSuggestionRunner. 
+ + This analyzes the data and suggests appropriate constraints: + - Completeness constraints for complete columns + - Uniqueness constraints for unique columns + - Categorical range constraints for low-cardinality columns + - Non-negative constraints for numeric columns + """ + print("\n" + "=" * 60) + print("CONSTRAINT SUGGESTIONS") + print("=" * 60) + + result = (ConstraintSuggestionRunner() + .on_engine(engine) + .addConstraintRules(Rules.DEFAULT) + .run()) + + print("\nSuggested Constraints:") + cols_to_show = ["column_name", "constraint_name", "description", "code_for_constraint"] + available_cols = [c for c in cols_to_show if c in result.columns] + print(result[available_cols].to_string(index=False)) + + print(f"\nTotal suggestions: {len(result)}") + + return result + + +def main(): + print("PyDeequ Data Quality Example with DuckDB") + print("No Spark cluster required!") + + # Create in-memory DuckDB connection + con = duckdb.connect() + + # Create sample data + print("\nCreating sample product reviews dataset...") + create_sample_data(con) + + # Create engine using pydeequ.connect() + engine = pydeequ.connect(con, table="reviews") + + print("\nDataset Schema:") + schema = engine.get_schema() + for col, dtype in schema.items(): + print(f" {col}: {dtype}") + + print("\nSample Data:") + print(con.execute("SELECT * FROM reviews LIMIT 5").fetchdf().to_string(index=False)) + + # Run all examples + run_data_analysis(engine) + run_constraint_verification(engine) + run_column_profiling(engine) + run_constraint_suggestions(engine) + + print("\n" + "=" * 60) + print("EXAMPLE COMPLETE") + print("=" * 60) + + +if __name__ == "__main__": + main()
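For reference, the tutorial above boils down to a few lines. Below is a minimal sketch distilled from it; the `people` table, its columns, and the seeded NULL are illustrative and not part of the tutorial itself:

```python
import duckdb
import pydeequ
from pydeequ.v2.checks import Check, CheckLevel
from pydeequ.v2.predicates import gte
from pydeequ.v2.verification import VerificationSuite

# Illustrative in-memory table with one NULL name
con = duckdb.connect()
con.execute("""
    CREATE TABLE people AS SELECT * FROM (VALUES
        (1, 'Alice'), (2, 'Bob'), (3, NULL)
    ) AS t(id, name)
""")

engine = pydeequ.connect(con, table="people")

check = (Check(CheckLevel.Warning, "Quickstart")
         .hasSize(gte(3))       # at least 3 rows
         .isUnique("id")        # primary-key style check
         .isComplete("name"))   # expected to fail: row 3 has a NULL name

result = VerificationSuite().on_engine(engine).addCheck(check).run()
print(result[["constraint", "constraint_status"]].to_string(index=False))
```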