Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 0 additions & 15 deletions .flake8

This file was deleted.

14 changes: 7 additions & 7 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
#
# Jobs (all lint jobs run in parallel):
# changes — detect which paths changed (skips heavy jobs on workflow-only PRs)
# lint-core — black, isort, flake8, mypy on packages/parser-core
# lint-free — black, isort, flake8 on packages/parser-free
# lint-core — black, isort, ruff, mypy on packages/parser-core
# lint-free — black, isort, ruff on packages/parser-free
# security — bandit + safety on both packages
# test-core — pytest with 91% coverage gate (Python matrix), needs lint-core
# test-free — pytest on packages/parser-free, needs lint-free
Expand Down Expand Up @@ -115,8 +115,8 @@ jobs:
- name: isort
run: isort --check-only --diff src tests

- name: Flake8
run: flake8 src tests --max-line-length=88 --extend-ignore=E203,W503,E501,W504,D,C420
- name: Ruff
run: ruff check src tests

- name: MyPy
run: mypy src --ignore-missing-imports
Expand Down Expand Up @@ -162,16 +162,16 @@ jobs:
pip install --upgrade pip
pip install -e ../parser-core
pip install -e ".[test]"
pip install black isort flake8
pip install black isort ruff

- name: Black
run: black --check --diff src tests

- name: isort
run: isort --check-only --diff src tests

- name: Flake8
run: flake8 src tests --max-line-length=88 --extend-ignore=E203,W503,E501,W504,D,C420
- name: Ruff
run: ruff check src tests

security:
name: Security — bandit + safety
Expand Down
15 changes: 4 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,13 @@ repos:
- id: black
language_version: python3

# Ruff - fast Python linter and auto-fixer
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.0.265
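# Python linting (ruff)
# NOTE(review): the `ruff-check` hook id appears to exist only in newer ruff-pre-commit
# releases than the pinned rev; confirm that the rev below exposes it, otherwise use
# `id: ruff` or bump `rev`.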
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.8.0
hooks:
- id: ruff
- id: ruff-check
args: ["--fix"]

# Python linting
- repo: https://github.com/pycqa/flake8
rev: 7.0.0
hooks:
- id: flake8
args: ["--max-line-length=88", "--extend-ignore=E203,W503,E501,W504,D,C420"]

# Type checking
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.8.0
Expand Down
34 changes: 32 additions & 2 deletions packages/parser-core/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ dependencies = [
dev = [
"black>=23.0.0,<27.0.0",
"isort>=5.12.0,<9.0.0",
"flake8>=6.0.0,<8.0.0",
"mypy>=1.8.0,<2.0.0",
"pyright>=1.1.350",
"types-python-dateutil>=2.8.0.0",
Expand All @@ -41,7 +40,7 @@ dev = [
"ipython>=8.0.0,<10.0.0",
"ipdb>=0.13.0",
"pre-commit>=3.0.0,<5.0.0",
"ruff>=0.0.265,<1.0.0",
"ruff>=0.8.0,<1.0.0",
"bandit[toml]>=1.7.0,<2.0.0",
"safety>=2.0.0,<4.0.0",
"detect-secrets>=1.4.0,<2.0.0",
Expand Down Expand Up @@ -145,3 +144,34 @@ profile = "black"
multi_line_output = 3
line_length = 88
known_first_party = ["bankstatements_core"]

[tool.ruff]
line-length = 88
target-version = "py311"

[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"W", # pycodestyle warnings
"F", # pyflakes
"B", # flake8-bugbear
"C901", # mccabe complexity
"G", # flake8-logging-format
"PLC", # pylint convention
"PLR", # pylint refactoring
"T201", # flake8-print
"BLE001", # flake8-blind-except
"UP", # pyupgrade
"RUF", # ruff-specific rules
]
ignore = [
"E501", # line too long — handled by black
"PLR2004", # magic value comparison — acceptable in tests and config
"G004", # logging f-string — 214 violations, deferred, see GitHub issue #90
]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]
"tests/**/*.py" = ["F401", "PLC0415", "PLR0913", "C901", "RUF043", "RUF059", "RUF005", "RUF003"]
"tests/integration/*.py" = ["T201"]
"src/bankstatements_core/pdf_table_extractor.py" = ["E402"]
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
)

__all__ = [
"PDFPlumberReaderAdapter",
"PDFPlumberDocumentAdapter",
"PDFPlumberPageAdapter",
"PDFPlumberReaderAdapter",
]
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
class PDFPlumberPageAdapter:
"""Adapter wrapping pdfplumber Page to implement IPDFPage protocol."""

def __init__(self, page: "Page"):
def __init__(self, page: Page):
"""Initialize page adapter.

Args:
Expand Down Expand Up @@ -118,7 +118,7 @@ def crop(self, bbox: tuple[float, float, float, float]) -> PDFPlumberPageAdapter
class PDFPlumberDocumentAdapter:
"""Adapter wrapping pdfplumber PDF to implement IPDFDocument protocol."""

def __init__(self, pdf_doc: "PDF"):
def __init__(self, pdf_doc: PDF):
"""Initialize document adapter.

Args:
Expand Down Expand Up @@ -174,12 +174,12 @@ def open(self, pdf_path: Path) -> PDFPlumberDocumentAdapter:
# pdfplumber.open returns pdfplumber.PDF but type system expects pdfplumber.pdf.PDF
return PDFPlumberDocumentAdapter(pdf_doc) # type: ignore[arg-type]
except FileNotFoundError:
raise FileNotFoundError(f"PDF file not found: {pdf_path}")
raise FileNotFoundError(f"PDF file not found: {pdf_path}") from None
except (OSError, ValueError, TypeError, RuntimeError) as e:
# Expected errors: file I/O errors, invalid PDF structure, type errors, PDF library errors
# PDFSyntaxError and other pdfminer exceptions inherit from RuntimeError or are library-specific
raise IOError(f"Failed to open PDF {pdf_path}: {e}") from e
raise OSError(f"Failed to open PDF {pdf_path}: {e}") from e
except Exception as e:
# Catch any other PDF library exceptions (PDFSyntaxError, etc.)
# These are library-specific errors that indicate corrupted/invalid PDFs
raise IOError(f"Failed to open PDF {pdf_path}: {e}") from e
raise OSError(f"Failed to open PDF {pdf_path}: {e}") from e
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
"""

from dataclasses import dataclass
from typing import List, Tuple


@dataclass
Expand Down Expand Up @@ -133,7 +132,7 @@ def expand_bbox(bbox: BBox, margin: float) -> BBox:
)


def merge_bboxes(bboxes: List[BBox]) -> BBox:
def merge_bboxes(bboxes: list[BBox]) -> BBox:
"""Merge multiple bounding boxes into a single container bbox.

Args:
Expand All @@ -156,7 +155,7 @@ def merge_bboxes(bboxes: List[BBox]) -> BBox:
return BBox(x0=min_x0, y0=min_y0, x1=max_x1, y1=max_y1)


def bbox_from_words(words: List[dict]) -> BBox:
def bbox_from_words(words: list[dict]) -> BBox:
"""Create a bounding box that contains all given words.

Args:
Expand All @@ -179,7 +178,7 @@ def bbox_from_words(words: List[dict]) -> BBox:
return BBox(x0=min_x0, y0=min_y0, x1=max_x1, y1=max_y1)


def bbox_intersection(bbox1: BBox, bbox2: BBox) -> Tuple[float, float]:
def bbox_intersection(bbox1: BBox, bbox2: BBox) -> tuple[float, float]:
"""Calculate the intersection dimensions of two bounding boxes.

Args:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"""

import logging
from typing import Any, Dict, List, Optional, Tuple
from typing import Any

from bankstatements_core.analysis.bbox_utils import BBox

Expand Down Expand Up @@ -34,7 +34,7 @@ def __init__(

def analyze_columns(
self, page: Any, table_bbox: BBox
) -> Dict[str, Tuple[float, float]]:
) -> dict[str, tuple[float, float]]:
"""Analyze table and detect column boundaries.

Args:
Expand Down Expand Up @@ -95,7 +95,7 @@ def analyze_columns(
logger.info(f"Detected {len(columns)} columns")
return columns

def _cluster_x_coordinates(self, words: List[dict]) -> List[float]:
def _cluster_x_coordinates(self, words: list[dict]) -> list[float]:
"""Cluster word X coordinates to find column alignment points.

Args:
Expand Down Expand Up @@ -135,8 +135,8 @@ def _cluster_x_coordinates(self, words: List[dict]) -> List[float]:
return sorted(clusters)

def _detect_boundaries_from_clusters(
self, clusters: List[float]
) -> List[Tuple[float, float]]:
self, clusters: list[float]
) -> list[tuple[float, float]]:
"""Detect column boundaries from cluster centers.

Args:
Expand Down Expand Up @@ -165,21 +165,20 @@ def _detect_boundaries_from_clusters(
else:
# Small gap - columns are close, use midpoint
x_max = (clusters[i] + clusters[i + 1]) / 2
# Last column - extend to reasonable width
elif i > 0:
avg_width = (clusters[i] - clusters[0]) / i
x_max = clusters[i] + avg_width
else:
# Last column - extend to reasonable width
if i > 0:
avg_width = (clusters[i] - clusters[0]) / i
x_max = clusters[i] + avg_width
else:
x_max = clusters[i] + 100 # Default width
x_max = clusters[i] + 100 # Default width

boundaries.append((x_min, x_max))

return boundaries

def _find_header_words(
self, table_words: List[dict], table_bbox: BBox
) -> List[dict]:
self, table_words: list[dict], table_bbox: BBox
) -> list[dict]:
"""Find words in the header row of the table.

Args:
Expand Down Expand Up @@ -208,8 +207,8 @@ def _find_header_words(
return header_words

def _assign_column_names(
self, boundaries: List[Tuple[float, float]], header_words: List[dict]
) -> List[str]:
self, boundaries: list[tuple[float, float]], header_words: list[dict]
) -> list[str]:
"""Assign names to columns based on header words.

Strategy: Each header word should be assigned to its BEST matching column only.
Expand Down Expand Up @@ -246,7 +245,7 @@ def _assign_column_names(
word_groups.append(current_group)

# Assign each word group to the best matching column boundary
column_names: List[Optional[str]] = [None] * len(boundaries)
column_names: list[str | None] = [None] * len(boundaries)

for group in word_groups:
# Calculate group center
Expand Down Expand Up @@ -278,7 +277,7 @@ def _assign_column_names(
)

# Fill in any unassigned columns with generic names
result_names: List[str] = []
result_names: list[str] = []
for i in range(len(column_names)):
name_val = column_names[i]
if name_val is None:
Expand All @@ -294,8 +293,8 @@ def _assign_column_names(
return result_names

def _resolve_overlapping_boundaries(
self, boundaries: List[Tuple[float, float]]
) -> List[Tuple[float, float]]:
self, boundaries: list[tuple[float, float]]
) -> list[tuple[float, float]]:
"""Resolve overlapping column boundaries.

When columns overlap, adjust boundaries so column i ends just before
Expand Down Expand Up @@ -337,8 +336,8 @@ def _resolve_overlapping_boundaries(
return resolved

def _create_columns_from_headers(
self, header_words: List[dict], table_bbox: BBox
) -> Tuple[List[Tuple[float, float]], List[str]]:
self, header_words: list[dict], table_bbox: BBox
) -> tuple[list[tuple[float, float]], list[str]]:
"""Create column boundaries and names directly from header words.

Args:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import logging
import re
from dataclasses import dataclass
from typing import Any, List, Optional
from typing import Any

from bankstatements_core.analysis.bbox_utils import BBox, overlaps
from bankstatements_core.extraction.iban_extractor import IBANExtractor
Expand All @@ -33,7 +33,7 @@ class IBANCandidate:
masked: str
bbox: BBox
confidence_score: float = 0.0
rejection_reason: Optional[str] = None
rejection_reason: str | None = None


class IBANSpatialFilter:
Expand All @@ -46,7 +46,7 @@ def __init__(self) -> None:
"""Initialize IBAN spatial filter."""
self.iban_extractor = IBANExtractor()

def extract_iban_candidates(self, page: Any) -> List[IBANCandidate]:
def extract_iban_candidates(self, page: Any) -> list[IBANCandidate]: # noqa: C901
"""Extract IBAN candidates with spatial coordinates from page.

Uses two strategies:
Expand Down Expand Up @@ -169,10 +169,10 @@ def extract_iban_candidates(self, page: Any) -> List[IBANCandidate]:

def filter_by_table_overlap(
self,
candidates: List[IBANCandidate],
table_regions: List[BBox],
candidates: list[IBANCandidate],
table_regions: list[BBox],
overlap_threshold: float = 0.0,
) -> List[IBANCandidate]:
) -> list[IBANCandidate]:
"""Filter out IBANs that overlap with table regions.

Args:
Expand Down Expand Up @@ -216,8 +216,8 @@ def filter_by_table_overlap(
return filtered

def score_candidates(
self, candidates: List[IBANCandidate], page_height: float
) -> List[IBANCandidate]:
self, candidates: list[IBANCandidate], page_height: float
) -> list[IBANCandidate]:
"""Score IBAN candidates based on location and context.

Higher scores are given to:
Expand Down Expand Up @@ -265,9 +265,7 @@ def score_candidates(

return candidates_sorted

def select_best_iban(
self, candidates: List[IBANCandidate]
) -> Optional[IBANCandidate]:
def select_best_iban(self, candidates: list[IBANCandidate]) -> IBANCandidate | None:
"""Select the best IBAN from scored candidates.

Args:
Expand Down
Loading
Loading