From 4313954a6de1ae0e5e7259306b50c4fb3b753294 Mon Sep 17 00:00:00 2001 From: longieirl Date: Thu, 26 Mar 2026 16:58:28 +0000 Subject: [PATCH 1/6] =?UTF-8?q?feat(#71):=20migrate=20service=20layer=20to?= =?UTF-8?q?=20list[Transaction]=20=E2=80=94=20remove=20dict=20round-trips?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Transaction gains document_type and transaction_type fields - IDuplicateDetector, ITransactionSorting, IIBANGrouping, ITransactionFilter protocols updated to list[Transaction] - DuplicateDetectionStrategy.detect_duplicates() and all create_key() methods accept Transaction objects directly - ServiceRegistry.process_transaction_group() / group_by_iban() operate on list[Transaction]; enrichment helpers mutate tx fields in place - processor.py passes list[Transaction] through pipeline; dicts only at output boundary via transactions_to_dicts() - Unused transactions_to_dicts import removed from pdf_processing_orchestrator - All 10 affected test files updated: fixtures use Transaction.from_dict() and assertions use attribute access (tx.filename etc.) --- .../domain/models/transaction.py | 10 + .../domain/protocols/services.py | 132 ++---- .../patterns/strategies.py | 90 ++--- .../src/bankstatements_core/processor.py | 47 ++- .../services/duplicate_detector.py | 11 +- .../services/iban_grouping.py | 28 +- .../services/pdf_processing_orchestrator.py | 7 +- .../services/service_registry.py | 58 ++- .../services/sorting_service.py | 26 +- .../services/transaction_filter.py | 127 ++---- .../services/transaction_type_classifier.py | 49 +-- .../test_credit_card_duplicate_strategy.py | 351 ++++++++-------- .../tests/services/test_duplicate_detector.py | 187 +++++---- .../tests/services/test_iban_grouping.py | 27 +- .../tests/services/test_service_registry.py | 21 +- .../tests/services/test_sorting_service.py | 94 ++--- .../tests/services/test_transaction_filter.py | 101 ++--- .../test_transaction_type_classifier.py | 380 ++++++++++-------- packages/parser-core/tests/test_processor.py | 214 +++++----- .../test_processor_refactored_methods.py | 26 +- .../tests/test_coverage_improvements.py | 19 +- 21 files changed, 959 insertions(+), 1046 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/domain/models/transaction.py b/packages/parser-core/src/bankstatements_core/domain/models/transaction.py index 7c62cf2..be5ffe7 100644 --- a/packages/parser-core/src/bankstatements_core/domain/models/transaction.py +++ b/packages/parser-core/src/bankstatements_core/domain/models/transaction.py @@ -54,6 +54,8 @@ class Transaction: source_page: int | None = None confidence_score: float = 1.0 extraction_warnings: list[ExtractionWarning] = field(default_factory=list) + document_type: str = "" + transaction_type: str = "" def is_debit(self) -> bool: """Check if transaction is a debit (money out). @@ -235,6 +237,8 @@ def from_dict(cls, data: dict[str, str | None]) -> Transaction: "source_page", "confidence_score", "extraction_warnings", + "document_type", + "transaction_type", } additional_fields = { k: str(v) @@ -263,6 +267,8 @@ def from_dict(cls, data: dict[str, str | None]) -> Transaction: ] else: extraction_warnings = [] + document_type = str(data.get("document_type") or "") + transaction_type = str(data.get("transaction_type") or "") return cls( date=date or "", @@ -275,6 +281,8 @@ def from_dict(cls, data: dict[str, str | None]) -> Transaction: source_page=source_page, confidence_score=confidence_score, extraction_warnings=extraction_warnings, + document_type=document_type, + transaction_type=transaction_type, ) @staticmethod @@ -330,6 +338,8 @@ def to_dict(self, currency_symbol: str = "€") -> dict[str, str | None]: result["extraction_warnings"] = json.dumps( [w.to_dict() for w in self.extraction_warnings] ) + result["document_type"] = self.document_type + result["transaction_type"] = self.transaction_type # Add any additional fields result.update(self.additional_fields) diff --git a/packages/parser-core/src/bankstatements_core/domain/protocols/services.py b/packages/parser-core/src/bankstatements_core/domain/protocols/services.py index 672512e..f3f96ed 100644 --- a/packages/parser-core/src/bankstatements_core/domain/protocols/services.py +++ b/packages/parser-core/src/bankstatements_core/domain/protocols/services.py @@ -8,6 +8,7 @@ import pandas as pd if TYPE_CHECKING: + from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.templates.template_model import BankTemplate @@ -15,52 +16,23 @@ class IPDFDiscovery(Protocol): """Protocol for discovering PDF files in directories.""" def discover_pdfs(self, input_dir: Path, recursive: bool = False) -> list[Path]: - """Discover PDF files in directory. - - Args: - input_dir: Directory to search for PDFs - recursive: Whether to search subdirectories recursively - - Returns: - List of paths to discovered PDF files - """ + """Discover PDF files in directory.""" ... class ITransactionFilter(Protocol): """Protocol for filtering transaction rows.""" - def apply_all_filters(self, rows: list[dict]) -> list[dict]: - """Apply all configured filters to rows. - - Args: - rows: List of transaction dictionaries - - Returns: - Filtered list of transactions - """ + def apply_all_filters(self, rows: list["Transaction"]) -> list["Transaction"]: + """Apply all configured filters to rows.""" ... - def filter_empty_rows(self, rows: list[dict]) -> list[dict]: - """Filter out rows with insufficient data. - - Args: - rows: List of transaction dictionaries - - Returns: - List of non-empty transactions - """ + def filter_empty_rows(self, rows: list["Transaction"]) -> list["Transaction"]: + """Filter out rows with insufficient data.""" ... - def filter_header_rows(self, rows: list[dict]) -> list[dict]: - """Filter out header rows that were incorrectly extracted. - - Args: - rows: List of transaction dictionaries - - Returns: - List of transactions with header rows removed - """ + def filter_header_rows(self, rows: list["Transaction"]) -> list["Transaction"]: + """Filter out header rows that were incorrectly extracted.""" ... @@ -68,17 +40,11 @@ class IIBANGrouping(Protocol): """Protocol for grouping transactions by IBAN.""" def group_by_iban( - self, transactions: list[dict], pdf_ibans: dict[str, str] - ) -> dict[str, list[dict]]: - """Group transactions by IBAN suffix (last 4 digits). - - Args: - transactions: List of all transactions - pdf_ibans: Dictionary mapping PDF filenames to IBANs - - Returns: - Dictionary mapping IBAN suffix to list of transactions - """ + self, + transactions: list["Transaction"], + pdf_ibans: dict[str, str], + ) -> dict[str, list["Transaction"]]: + """Group transactions by IBAN suffix (last 4 digits).""" ... @@ -86,28 +52,13 @@ class IColumnTotals(Protocol): """Protocol for calculating column totals.""" def calculate(self, df: pd.DataFrame) -> dict[str, float]: - """Calculate totals for configured columns. - - Args: - df: DataFrame containing transaction data - - Returns: - Dictionary mapping column names to their totals - """ + """Calculate totals for configured columns.""" ... def format_totals_row( self, totals: dict[str, float], column_names: list[str] ) -> list[str]: - """Format totals as a row matching column structure. - - Args: - totals: Dictionary of column totals - column_names: List of all column names - - Returns: - List of formatted values for CSV row - """ + """Format totals as a row matching column structure.""" ... @@ -115,15 +66,7 @@ class ITemplateDetector(Protocol): """Protocol for detecting PDF bank statement templates.""" def detect_template(self, pdf_path: Path, first_page: Any) -> "BankTemplate": - """Detect template from PDF first page. - - Args: - pdf_path: Path to PDF file - first_page: First page of PDF (pdfplumber Page object or adapter) - - Returns: - BankTemplate instance - """ + """Detect template from PDF first page.""" ... @@ -131,31 +74,18 @@ class IDuplicateDetector(Protocol): """Protocol for detecting duplicate transactions.""" def detect_and_separate( - self, transactions: list[dict] - ) -> tuple[list[dict], list[dict]]: - """Separate unique transactions from duplicates. - - Args: - transactions: List of all transactions - - Returns: - Tuple of (unique_transactions, duplicate_transactions) - """ + self, + transactions: list["Transaction"], + ) -> tuple[list["Transaction"], list["Transaction"]]: + """Separate unique transactions from duplicates.""" ... class ITransactionSorting(Protocol): """Protocol for sorting transactions.""" - def sort(self, transactions: list[dict]) -> list[dict]: - """Sort transactions using configured strategy. - - Args: - transactions: List of transactions to sort - - Returns: - Sorted list of transactions - """ + def sort(self, transactions: list["Transaction"]) -> list["Transaction"]: + """Sort transactions using configured strategy.""" ... @@ -163,14 +93,7 @@ class IMonthlySummary(Protocol): """Protocol for generating monthly transaction summaries.""" def generate(self, transactions: list[dict]) -> dict[str, Any]: - """Generate monthly summary from transactions. - - Args: - transactions: List of transaction dictionaries - - Returns: - Dictionary with monthly summaries and statistics - """ + """Generate monthly summary from transactions.""" ... @@ -178,12 +101,5 @@ class IExpenseAnalysis(Protocol): """Protocol for expense analysis service.""" def analyze(self, transactions: list[dict]) -> dict[str, Any]: - """Analyze transactions and generate expense insights. - - Args: - transactions: List of transaction dictionaries - - Returns: - Dictionary with expense insights and analysis - """ + """Analyze transactions and generate expense insights.""" ... diff --git a/packages/parser-core/src/bankstatements_core/patterns/strategies.py b/packages/parser-core/src/bankstatements_core/patterns/strategies.py index 5494f7a..abca1c3 100644 --- a/packages/parser-core/src/bankstatements_core/patterns/strategies.py +++ b/packages/parser-core/src/bankstatements_core/patterns/strategies.py @@ -19,6 +19,7 @@ from bankstatements_core.utils import to_float if TYPE_CHECKING: + from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.entitlements import Entitlements # noqa: F401 logger = logging.getLogger(__name__) @@ -28,14 +29,14 @@ class DuplicateDetectionStrategy(ABC): """Abstract strategy for detecting duplicate transactions.""" @abstractmethod - def create_key(self, transaction: dict) -> str: + def create_key(self, transaction: "Transaction") -> str: """ Create a unique key for a transaction. Duplicate transactions should produce the same key. Args: - transaction: Transaction dictionary + transaction: Transaction object Returns: String key that uniquely identifies the transaction @@ -43,36 +44,36 @@ def create_key(self, transaction: dict) -> str: pass def detect_duplicates( - self, transactions: list[dict] - ) -> tuple[list[dict], list[dict]]: + self, transactions: list["Transaction"] + ) -> tuple[list["Transaction"], list["Transaction"]]: """ Detect duplicates in a list of transactions. Args: - transactions: List of transaction dictionaries + transactions: List of Transaction objects Returns: Tuple of (unique_transactions, duplicate_transactions) """ - unique_rows = [] - duplicate_rows = [] + unique_rows: list["Transaction"] = [] + duplicate_rows: list["Transaction"] = [] transaction_files: dict[str, str] = {} - for row in transactions: - transaction_key = self.create_key(row) - current_filename = row.get("Filename", "") + for tx in transactions: + transaction_key = self.create_key(tx) + current_filename = tx.filename if transaction_key in transaction_files: # Same transaction details but different file = duplicate if transaction_files[transaction_key] != current_filename: - duplicate_rows.append(row) + duplicate_rows.append(tx) else: # Same file, same transaction - keep it - unique_rows.append(row) + unique_rows.append(tx) else: # First time seeing this transaction transaction_files[transaction_key] = current_filename - unique_rows.append(row) + unique_rows.append(tx) return unique_rows, duplicate_rows @@ -85,24 +86,24 @@ class AllFieldsDuplicateStrategy(DuplicateDetectionStrategy): details, and all monetary fields to be considered duplicates. """ - def create_key(self, transaction: dict) -> str: + def create_key(self, transaction: "Transaction") -> str: """Create key from date, details, and all monetary columns.""" - # Extract date and details - date = transaction.get("Date", "") - details = transaction.get("Details", "") - - # Extract monetary values (any column with €, £, $, etc.) amounts = [] - for key, value in transaction.items(): - if key not in ["Date", "Details", "Filename"] and value: - # Try to parse as currency + for field_name in ("debit", "credit", "balance"): + value = getattr(transaction, field_name, None) + if value: + amount = to_float(str(value)) + if amount is not None: + amounts.append(f"{field_name}:{amount}") + # Also include any numeric additional_fields + for key, value in transaction.additional_fields.items(): + if value: amount = to_float(str(value)) if amount is not None: amounts.append(f"{key}:{amount}") - # Combine into key amounts_str = "|".join(sorted(amounts)) - return f"{date}:{details}:{amounts_str}" + return f"{transaction.date}:{transaction.details}:{amounts_str}" class DateAmountDuplicateStrategy(DuplicateDetectionStrategy): @@ -113,19 +114,22 @@ class DateAmountDuplicateStrategy(DuplicateDetectionStrategy): between statements, but the date and amount should match. """ - def create_key(self, transaction: dict) -> str: + def create_key(self, transaction: "Transaction") -> str: """Create key from date and sum of all amounts.""" - date = transaction.get("Date", "") - - # Sum all monetary values total = 0.0 - for key, value in transaction.items(): - if key not in ["Date", "Details", "Filename"] and value: + for field_name in ("debit", "credit", "balance"): + value = getattr(transaction, field_name, None) + if value: + amount = to_float(str(value)) + if amount is not None: + total += abs(amount) + for value in transaction.additional_fields.values(): + if value: amount = to_float(str(value)) if amount is not None: - total += abs(amount) # Use absolute value + total += abs(amount) - return f"{date}:{total:.2f}" + return f"{transaction.date}:{total:.2f}" class CreditCardDuplicateStrategy(DuplicateDetectionStrategy): @@ -142,31 +146,25 @@ class CreditCardDuplicateStrategy(DuplicateDetectionStrategy): - Transaction type provides critical disambiguation """ - def create_key(self, transaction: dict) -> str: + def create_key(self, transaction: "Transaction") -> str: """Create composite key: date:transaction_type:amount. Args: - transaction: Transaction dictionary + transaction: Transaction object Returns: Unique key combining date, transaction type, and amount """ - date = transaction.get("Date", "") - transaction_type = transaction.get("transaction_type", "other") - - # Get primary amount (prefer Debit, then Credit) + # Get primary amount (prefer debit, then credit) amount = None - if transaction.get("Debit_AMT"): - amount = to_float(str(transaction.get("Debit_AMT"))) - elif transaction.get("Credit_AMT"): - amount = to_float(str(transaction.get("Credit_AMT"))) + if transaction.debit: + amount = to_float(str(transaction.debit)) + elif transaction.credit: + amount = to_float(str(transaction.credit)) amount_str = f"{amount:.2f}" if amount is not None else "0.00" - # Key: date + type + amount (ignoring description variations) - # This allows a $50 purchase and $50 payment on the same day - # to be treated as different transactions - return f"{date}:{transaction_type}:{amount_str}" + return f"{transaction.date}:{transaction.transaction_type}:{amount_str}" class OutputFormatStrategy(ABC): diff --git a/packages/parser-core/src/bankstatements_core/processor.py b/packages/parser-core/src/bankstatements_core/processor.py index 62a82b9..74848e5 100644 --- a/packages/parser-core/src/bankstatements_core/processor.py +++ b/packages/parser-core/src/bankstatements_core/processor.py @@ -13,6 +13,7 @@ from bankstatements_core.config.totals_config import parse_totals_columns # noqa: F401 from bankstatements_core.domain import ExtractionResult from bankstatements_core.domain.converters import transactions_to_dicts +from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.services.column_analysis import ColumnAnalysisService from bankstatements_core.services.date_parser import DateParserService from bankstatements_core.services.duplicate_detector import DuplicateDetectionService @@ -275,7 +276,9 @@ def set_duplicate_strategy(self, strategy: Any) -> None: """ self._duplicate_strategy = strategy - def _detect_duplicates(self, all_rows: list[dict]) -> tuple[list[dict], list[dict]]: + def _detect_duplicates( + self, all_rows: list[Transaction] + ) -> tuple[list[Transaction], list[Transaction]]: """ Separate unique transactions from duplicates using the configured strategy. @@ -295,12 +298,12 @@ def _process_all_pdfs(self) -> tuple[list[ExtractionResult], int, int]: self.input_dir, recursive=self.recursive_scan ) - def _sort_transactions_by_date(self, rows: list[dict]) -> list[dict]: + def _sort_transactions_by_date(self, rows: list[Transaction]) -> list[Transaction]: """ Sort transactions using the configured sorting strategy. Args: - rows: List of transaction dictionaries + rows: List of Transaction objects Returns: Sorted list of transactions (or original order if sorting disabled) @@ -319,17 +322,17 @@ def run(self) -> dict: extraction_results, pdf_count, pages_read = ( self._process_all_pdfs() ) # list[ExtractionResult] - all_rows: list[dict] = [] + all_transactions: list[Transaction] = [] pdf_ibans: dict[str, str] = {} for extraction in extraction_results: if extraction.iban: pdf_ibans[extraction.source_file.name] = extraction.iban - all_rows.extend(transactions_to_dicts(extraction.transactions)) + all_transactions.extend(extraction.transactions) # Step 2: Group transactions by IBAN (delegated to registry) - rows_by_iban = self._registry.group_by_iban(all_rows, pdf_ibans) + txns_by_iban = self._registry.group_by_iban(all_transactions, pdf_ibans) logger.debug( - f"Grouped {len(all_rows)} transactions into {len(rows_by_iban)} IBAN groups" + f"Grouped {len(all_transactions)} transactions into {len(txns_by_iban)} IBAN groups" ) # Step 3: Process each IBAN group @@ -337,8 +340,8 @@ def run(self) -> dict: total_unique = 0 total_duplicates = 0 - for iban_suffix, iban_rows in rows_by_iban.items(): - result = self._process_transaction_group(iban_suffix, iban_rows) + for iban_suffix, iban_txns in txns_by_iban.items(): + result = self._process_transaction_group(iban_suffix, iban_txns) logger.debug( f"IBAN {iban_suffix}: Adding {result['unique_count']} unique, " @@ -381,25 +384,25 @@ def run(self) -> dict: ) def _process_transaction_group( - self, iban_suffix: str | None, iban_rows: list[dict] + self, iban_suffix: str | None, iban_txns: list[Transaction] ) -> dict: """Process a group of transactions for a single IBAN. Args: iban_suffix: IBAN suffix for this group (or "unknown") - iban_rows: List of transaction dictionaries + iban_txns: List of Transaction objects Returns: Dictionary with unique_count, duplicate_count, and output_paths """ logger.info( - f"Processing {len(iban_rows)} transactions for IBAN suffix: {iban_suffix}" + f"Processing {len(iban_txns)} transactions for IBAN suffix: {iban_suffix}" ) # Look up template if registry available and transactions have template_id template = None - if self._template_registry and iban_rows: - template_id = iban_rows[0].get("template_id") + if self._template_registry and iban_txns: + template_id = iban_txns[0].additional_fields.get("template_id") if template_id: template = self._template_registry.get_template(template_id) logger.debug( @@ -407,19 +410,23 @@ def _process_transaction_group( ) # Detect duplicates and sort (delegated to registry) - unique_rows, duplicate_rows = self._registry.process_transaction_group( - iban_rows, template=template + unique_txns, duplicate_txns = self._registry.process_transaction_group( + iban_txns, template=template ) # Filter duplicates to remove any empty rows and header rows - duplicate_rows = self._filter_service.filter_empty_rows(duplicate_rows) - duplicate_rows = self._filter_service.filter_header_rows(duplicate_rows) + duplicate_txns = self._filter_service.filter_empty_rows(duplicate_txns) + duplicate_txns = self._filter_service.filter_header_rows(duplicate_txns) logger.info( - f"IBAN {iban_suffix}: {len(unique_rows)} unique transactions, " - f"{len(duplicate_rows)} duplicates" + f"IBAN {iban_suffix}: {len(unique_txns)} unique transactions, " + f"{len(duplicate_txns)} duplicates" ) + # Convert to dicts at the output boundary + unique_rows = transactions_to_dicts(unique_txns) + duplicate_rows = transactions_to_dicts(duplicate_txns) + # Create DataFrame for unique transactions df_unique = pd.DataFrame(unique_rows, columns=self.column_names) diff --git a/packages/parser-core/src/bankstatements_core/services/duplicate_detector.py b/packages/parser-core/src/bankstatements_core/services/duplicate_detector.py index 7d7b2bb..a4a6e41 100644 --- a/packages/parser-core/src/bankstatements_core/services/duplicate_detector.py +++ b/packages/parser-core/src/bankstatements_core/services/duplicate_detector.py @@ -12,6 +12,7 @@ from bankstatements_core.exceptions import InputValidationError if TYPE_CHECKING: + from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.patterns.strategies import DuplicateDetectionStrategy logger = logging.getLogger(__name__) @@ -45,13 +46,13 @@ def __init__(self, strategy: "DuplicateDetectionStrategy"): self.strategy = strategy def detect_and_separate( - self, transactions: list[dict] - ) -> tuple[list[dict], list[dict]]: + self, transactions: list["Transaction"] + ) -> tuple[list["Transaction"], list["Transaction"]]: """ Detect duplicates and separate into unique and duplicate lists. Args: - transactions: List of transaction dictionaries to process + transactions: List of Transaction objects to process Returns: Tuple of (unique_transactions, duplicate_transactions) @@ -68,7 +69,9 @@ def detect_and_separate( return unique, duplicates def get_statistics( - self, unique_transactions: list[dict], duplicate_transactions: list[dict] + self, + unique_transactions: list["Transaction"], + duplicate_transactions: list["Transaction"], ) -> dict[str, int | float]: """ Get statistics about duplicate detection results. diff --git a/packages/parser-core/src/bankstatements_core/services/iban_grouping.py b/packages/parser-core/src/bankstatements_core/services/iban_grouping.py index 930c8a3..3fdb7e9 100644 --- a/packages/parser-core/src/bankstatements_core/services/iban_grouping.py +++ b/packages/parser-core/src/bankstatements_core/services/iban_grouping.py @@ -3,9 +3,13 @@ from __future__ import annotations import logging +from typing import TYPE_CHECKING from bankstatements_core.exceptions import InputValidationError +if TYPE_CHECKING: + from bankstatements_core.domain.models.transaction import Transaction + logger = logging.getLogger(__name__) @@ -42,9 +46,9 @@ def __init__(self, suffix_length: int = 4): def group_by_iban( self, - rows: list[dict], + rows: list["Transaction"], pdf_ibans: dict[str, str], - ) -> dict[str, list[dict]]: + ) -> dict[str, list["Transaction"]]: """Group transactions by their source IBAN. Transactions are grouped by the last N characters of their IBAN, @@ -52,33 +56,30 @@ def group_by_iban( grouped under "unknown". Args: - rows: List of transaction dictionaries with 'Filename' field + rows: List of Transaction objects with filename field pdf_ibans: Dictionary mapping PDF filename to full IBAN string Returns: Dictionary mapping IBAN suffix (or "unknown") to list of transactions """ - grouped: dict[str, list[dict]] = {} + grouped: dict[str, list["Transaction"]] = {} - for row in rows: - filename = row.get("Filename") + for tx in rows: + filename = tx.filename if not filename: - logger.warning("Row missing Filename field, grouping under 'unknown'") + logger.warning("Transaction missing filename, grouping under 'unknown'") iban_suffix = "unknown" else: - # Get IBAN for this file full_iban = pdf_ibans.get(filename) if full_iban: iban_suffix = self._extract_suffix(full_iban) else: iban_suffix = "unknown" - # Add row to appropriate group if iban_suffix not in grouped: grouped[iban_suffix] = [] - grouped[iban_suffix].append(row) + grouped[iban_suffix].append(tx) - # Log grouping results self._log_grouping_summary(grouped) return grouped @@ -95,17 +96,14 @@ def _extract_suffix(self, iban: str) -> str: if not iban: return "" - # Remove whitespace and convert to uppercase iban_clean = iban.replace(" ", "").upper() - # Extract suffix if len(iban_clean) >= self._suffix_length: return iban_clean[-self._suffix_length :] else: - # If IBAN is shorter than suffix length, use whole IBAN return iban_clean - def _log_grouping_summary(self, grouped: dict[str, list[dict]]) -> None: + def _log_grouping_summary(self, grouped: dict[str, list["Transaction"]]) -> None: """Log summary of grouping results. Args: diff --git a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py index 78afbe8..26f095f 100644 --- a/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py +++ b/packages/parser-core/src/bankstatements_core/services/pdf_processing_orchestrator.py @@ -17,7 +17,6 @@ from bankstatements_core.config.processor_config import ExtractionConfig from bankstatements_core.domain import ExtractionResult -from bankstatements_core.domain.converters import transactions_to_dicts from bankstatements_core.entitlements import Entitlements if TYPE_CHECKING: @@ -164,12 +163,10 @@ def process_all_pdfs( pdf_ibans[pdf.name] = result.iban # Apply filters to extracted rows - from bankstatements_core.domain.converters import dicts_to_transactions - filtered_rows = self.filter_service.apply_all_filters( - transactions_to_dicts(result.transactions) + result.transactions ) - result.transactions = dicts_to_transactions(filtered_rows) + result.transactions = filtered_rows logger.info( "Successfully processed PDF %d: %d transactions extracted", diff --git a/packages/parser-core/src/bankstatements_core/services/service_registry.py b/packages/parser-core/src/bankstatements_core/services/service_registry.py index 6bc0c2b..8b6a497 100644 --- a/packages/parser-core/src/bankstatements_core/services/service_registry.py +++ b/packages/parser-core/src/bankstatements_core/services/service_registry.py @@ -22,6 +22,7 @@ if TYPE_CHECKING: from bankstatements_core.config.processor_config import ProcessorConfig + from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.domain.protocols.services import ( IDuplicateDetector, IIBANGrouping, @@ -144,24 +145,24 @@ def from_config( def process_transaction_group( self, - transactions: list[dict], + transactions: list["Transaction"], template: "BankTemplate | None" = None, - ) -> tuple[list[dict], list[dict]]: + ) -> tuple[list["Transaction"], list["Transaction"]]: """Enrich → classify → deduplicate → sort a group of transactions. Args: - transactions: List of transaction dicts for a single IBAN group. + transactions: List of Transaction objects for a single IBAN group. template: Optional bank template used for transaction type keywords. Returns: Tuple of (unique_transactions, duplicate_transactions). """ - enriched = self._enrich_with_filename(transactions) - enriched = self._enrich_with_document_type(enriched) - enriched = self._classify_transaction_types(enriched, template) + self._enrich_with_filename(transactions) + self._enrich_with_document_type(transactions) + self._classify_transaction_types(transactions, template) unique_rows, duplicate_rows = self._duplicate_detector.detect_and_separate( - enriched + transactions ) logger.info( "Duplicate detection: %d unique, %d duplicates", @@ -174,17 +175,17 @@ def process_transaction_group( def group_by_iban( self, - transactions: list[dict], + transactions: list["Transaction"], pdf_ibans: dict[str, str], - ) -> dict[str, list[dict]]: + ) -> dict[str, list["Transaction"]]: """Group transactions by IBAN suffix. Args: - transactions: Flat list of all transaction dicts. + transactions: Flat list of all Transaction objects. pdf_ibans: Mapping of PDF filename → IBAN string. Returns: - Dict of IBAN suffix → list of transaction dicts. + Dict of IBAN suffix → list of Transaction objects. """ return self._grouping_service.group_by_iban(transactions, pdf_ibans) @@ -206,44 +207,41 @@ def get_grouping_service(self) -> "IIBANGrouping": # ------------------------------------------------------------------ @staticmethod - def _enrich_with_filename(transactions: list[dict]) -> list[dict]: - """Set Filename key from source_pdf if not already present.""" - for row in transactions: - if "Filename" not in row: - row["Filename"] = row.get("source_pdf", "") - return transactions + def _enrich_with_filename(transactions: list["Transaction"]) -> None: + """Set filename from source_pdf additional_field if not already present.""" + for tx in transactions: + if not tx.filename: + tx.filename = tx.additional_fields.get("source_pdf", "") @staticmethod def _enrich_with_document_type( - transactions: list[dict], default_type: str = "bank_statement" - ) -> list[dict]: + transactions: list["Transaction"], default_type: str = "bank_statement" + ) -> None: """Set document_type if not already present.""" - for row in transactions: - if "document_type" not in row: - row["document_type"] = default_type - return transactions + for tx in transactions: + if not tx.document_type: + tx.document_type = default_type @staticmethod def _classify_transaction_types( - transactions: list[dict], + transactions: list["Transaction"], template: "BankTemplate | None" = None, - ) -> list[dict]: + ) -> None: """Classify each transaction using Chain of Responsibility.""" from bankstatements_core.services.transaction_type_classifier import ( create_transaction_type_classifier_chain, ) if not transactions: - return transactions + return - document_type = transactions[0].get("document_type") + document_type = transactions[0].document_type classifier = create_transaction_type_classifier_chain(document_type) - for transaction in transactions: - transaction["transaction_type"] = classifier.classify(transaction, template) + for tx in transactions: + tx.transaction_type = classifier.classify(tx, template) logger.info( "Transaction type classification: %d transactions classified", len(transactions), ) - return transactions diff --git a/packages/parser-core/src/bankstatements_core/services/sorting_service.py b/packages/parser-core/src/bankstatements_core/services/sorting_service.py index 0e36884..58f05cd 100644 --- a/packages/parser-core/src/bankstatements_core/services/sorting_service.py +++ b/packages/parser-core/src/bankstatements_core/services/sorting_service.py @@ -8,9 +8,13 @@ import logging from abc import ABC, abstractmethod +from typing import TYPE_CHECKING from bankstatements_core.services.date_parser import DateParserService +if TYPE_CHECKING: + from bankstatements_core.domain.models.transaction import Transaction + logger = logging.getLogger(__name__) _date_parser_service = DateParserService() @@ -20,12 +24,12 @@ class SortingStrategy(ABC): """Abstract base class for transaction sorting strategies.""" @abstractmethod - def sort(self, transactions: list[dict]) -> list[dict]: + def sort(self, transactions: list["Transaction"]) -> list["Transaction"]: """ Sort transactions according to the strategy. Args: - transactions: List of transaction dictionaries + transactions: List of Transaction objects Returns: Sorted list of transactions @@ -36,15 +40,15 @@ def sort(self, transactions: list[dict]) -> list[dict]: class ChronologicalSortingStrategy(SortingStrategy): """Strategy that sorts transactions chronologically by date.""" - def sort(self, transactions: list[dict]) -> list[dict]: + def sort(self, transactions: list["Transaction"]) -> list["Transaction"]: """ Sort transactions chronologically by date. Args: - transactions: List of transaction dictionaries + transactions: List of Transaction objects Returns: - Sorted list of transaction dicts + Sorted list of transactions """ if not transactions: return transactions @@ -53,21 +57,19 @@ def sort(self, transactions: list[dict]) -> list[dict]: return sorted( transactions, - key=lambda tx: _date_parser_service.parse_transaction_date( - tx.get("Date") or tx.get("date") or "" - ), + key=lambda tx: _date_parser_service.parse_transaction_date(tx.date), ) class NoSortingStrategy(SortingStrategy): """Strategy that keeps original order (no sorting).""" - def sort(self, transactions: list[dict]) -> list[dict]: + def sort(self, transactions: list["Transaction"]) -> list["Transaction"]: """ Keep transactions in original order. Args: - transactions: List of transaction dictionaries + transactions: List of Transaction objects Returns: Transactions in original order @@ -93,12 +95,12 @@ def __init__(self, strategy: SortingStrategy): """ self.strategy = strategy - def sort(self, transactions: list[dict]) -> list[dict]: + def sort(self, transactions: list["Transaction"]) -> list["Transaction"]: """ Sort transactions using the configured strategy. Args: - transactions: List of transaction dictionaries + transactions: List of Transaction objects Returns: Sorted list of transactions diff --git a/packages/parser-core/src/bankstatements_core/services/transaction_filter.py b/packages/parser-core/src/bankstatements_core/services/transaction_filter.py index 0e61914..990d3b3 100644 --- a/packages/parser-core/src/bankstatements_core/services/transaction_filter.py +++ b/packages/parser-core/src/bankstatements_core/services/transaction_filter.py @@ -4,27 +4,18 @@ import logging -from bankstatements_core.domain import ( - Transaction, - dicts_to_transactions, - transactions_to_dicts, -) +from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.exceptions import InputValidationError logger = logging.getLogger(__name__) class TransactionFilterService: - """Filters transaction rows based on various criteria. + """Filters transactions based on various criteria. This service provides a composable interface for applying different filters to transaction data, such as removing empty rows, header rows, and rows with invalid dates. - - Note: - This service accepts list[dict] for backward compatibility but converts - internally to Transaction objects for type-safe processing. All methods - maintain the dict-based interface for existing code. """ def __init__(self, column_names: list[str]): @@ -46,51 +37,43 @@ def __init__(self, column_names: list[str]): raise InputValidationError("column_names must contain only strings") self._column_names = column_names - def filter_empty_rows(self, rows: list[dict]) -> list[dict]: + def filter_empty_rows(self, rows: list[Transaction]) -> list[Transaction]: """Remove rows with no meaningful data. A row is considered empty if it has no valid date, details, or debit/credit amounts. Args: - rows: List of transaction dictionaries + rows: List of Transaction objects Returns: - List of non-empty transaction dicts - - Note: - Uses Transaction domain model internally for type-safe validation. + List of non-empty Transaction objects """ - # Convert to domain objects for type-safe processing - transactions = dicts_to_transactions(rows) - - # Filter using domain methods - filtered_transactions = [ + filtered = [ tx - for tx in transactions + for tx in rows if not self._is_empty_transaction(tx) and not self._is_balance_only_row(tx) ] - removed = len(transactions) - len(filtered_transactions) + removed = len(rows) - len(filtered) if removed > 0: logger.info(f"Filtered out {removed} empty row(s)") - # Convert back to dicts for backward compatibility - return transactions_to_dicts(filtered_transactions) + return filtered - def filter_header_rows(self, rows: list[dict]) -> list[dict]: + def filter_header_rows(self, rows: list[Transaction]) -> list[Transaction]: """Remove header rows that match column names. A row is considered a header if at least 50% of its non-empty fields contain values that match column names. Args: - rows: List of transaction dictionaries + rows: List of Transaction objects Returns: - List of non-header rows + List of non-header Transaction objects """ - filtered = [row for row in rows if not self._is_header_row(row)] + filtered = [tx for tx in rows if not self._is_header_transaction(tx)] removed = len(rows) - len(filtered) if removed > 0: @@ -98,53 +81,43 @@ def filter_header_rows(self, rows: list[dict]) -> list[dict]: return filtered - def filter_invalid_dates(self, rows: list[dict]) -> list[dict]: + def filter_invalid_dates(self, rows: list[Transaction]) -> list[Transaction]: """Remove rows with invalid or missing date values. A row is considered invalid if it has no valid date field or the date value doesn't look like an actual date. Args: - rows: List of transaction dictionaries + rows: List of Transaction objects Returns: - List of rows with valid dates - - Note: - Uses Transaction domain model's has_valid_date() method plus - additional parsing validation. + List of Transaction objects with valid dates """ - # Convert to domain objects - transactions = dicts_to_transactions(rows) - - # Filter: has valid date AND date value is parseable - valid_transactions = [ + valid = [ tx - for tx in transactions + for tx in rows if tx.has_valid_date() and self._is_parseable_date(tx.date) ] - removed = len(transactions) - len(valid_transactions) + removed = len(rows) - len(valid) if removed > 0: logger.info(f"Filtered out {removed} row(s) with invalid dates") - # Convert back to dicts - return transactions_to_dicts(valid_transactions) + return valid - def apply_all_filters(self, rows: list[dict]) -> list[dict]: + def apply_all_filters(self, rows: list[Transaction]) -> list[Transaction]: """Apply all filters in sequence. Filters are applied in order: empty rows, header rows, invalid dates. Args: - rows: List of transaction dictionaries + rows: List of Transaction objects Returns: - List of filtered rows + List of filtered Transaction objects """ original_count = len(rows) - # Apply filters in sequence rows = self.filter_empty_rows(rows) rows = self.filter_header_rows(rows) rows = self.filter_invalid_dates(rows) @@ -161,17 +134,7 @@ def apply_all_filters(self, rows: list[dict]) -> list[dict]: return rows def _is_empty_transaction(self, tx: Transaction) -> bool: - """Check if a transaction is empty (no meaningful data). - - Uses domain model's validation methods for type-safe checking. - - Args: - tx: Transaction domain object - - Returns: - True if transaction has no meaningful data, False otherwise - """ - # Use domain model's validation methods + """Check if a transaction is empty (no meaningful data).""" return not ( tx.has_valid_date() or tx.has_valid_details() @@ -180,38 +143,18 @@ def _is_empty_transaction(self, tx: Transaction) -> bool: ) def _is_balance_only_row(self, tx: Transaction) -> bool: - """Check if a row has only balance (likely orphaned/invalid row). - - These rows have: - - Empty or whitespace-only details - - No debit amount - - No credit amount - - Some text/number in balance field - - Valid "balance-only" rows (which should NOT be filtered) are handled by - the row classifier - they need Details text (like "BALANCE FORWARD") to - be classified as transactions. Rows that reach this filter with only - balance and no details are orphaned/invalid data. - - Args: - tx: Transaction domain object - - Returns: - True if row should be filtered (balance-only, no details), False otherwise - """ - # Must NOT have details, debit, or credit + """Check if a row has only balance (likely orphaned/invalid row).""" if tx.has_valid_details() or tx.is_debit() or tx.is_credit(): return False - - # If balance is empty, this will be caught by _is_empty_transaction if not tx.balance or not tx.balance.strip(): return False - - # Row has balance but no details/debit/credit - # These are orphaned balance values (possibly from PDF extraction errors) - # or footer/disclaimer text that ended up in the Balance column return True + def _is_header_transaction(self, tx: Transaction) -> bool: + """Check if a Transaction is a header row by inspecting its field values.""" + row = tx.to_dict() + return self._is_header_row(row) + def _is_header_row(self, row: dict) -> bool: """Check if a row is a header row. @@ -231,11 +174,9 @@ def _is_header_row(self, row: dict) -> bool: if key == "Filename": continue - # Only check non-empty values if value and isinstance(value, str) and value.strip(): non_empty_fields += 1 - # Check if value matches any column name (case-insensitive) value_lower = value.lower() for col_name in self._column_names: if ( @@ -245,7 +186,6 @@ def _is_header_row(self, row: dict) -> bool: matches += 1 break - # Need at least 50% match and at least 2 matches if non_empty_fields >= 2 and matches >= (non_empty_fields / 2): return True @@ -267,7 +207,6 @@ def _is_parseable_date(self, date_str: str) -> bool: if not date_str or not date_str.strip(): return False - # Common non-date words found in bank statements non_date_indicators = [ "product", "account", @@ -283,16 +222,12 @@ def _is_parseable_date(self, date_str: str) -> bool: date_lower = date_str.lower() - # If it contains any non-date indicators, it's not a date if any(indicator in date_lower for indicator in non_date_indicators): return False - # If it's very long (>25 chars), probably not a date if len(date_str) > 25: return False - # If it contains common date indicators, accept it - # This handles formats like "11 Aug", "01/12/23", "2023-01-15", etc. date_indicators = [ "/", "-", @@ -313,9 +248,7 @@ def _is_parseable_date(self, date_str: str) -> bool: if any(indicator in date_lower for indicator in date_indicators): return True - # If it's all digits, it might be a date (like "20230115") if date_str.replace(" ", "").isdigit(): return True - # Default to False for safety return False diff --git a/packages/parser-core/src/bankstatements_core/services/transaction_type_classifier.py b/packages/parser-core/src/bankstatements_core/services/transaction_type_classifier.py index 6cd12ec..39c2e41 100644 --- a/packages/parser-core/src/bankstatements_core/services/transaction_type_classifier.py +++ b/packages/parser-core/src/bankstatements_core/services/transaction_type_classifier.py @@ -14,6 +14,7 @@ from bankstatements_core.utils import to_float if TYPE_CHECKING: + from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.templates.template_model import BankTemplate logger = logging.getLogger(__name__) @@ -45,12 +46,12 @@ def set_next( return classifier def classify( - self, transaction: dict, template: "BankTemplate | None" = None + self, transaction: "Transaction", template: "BankTemplate | None" = None ) -> str: """Classify transaction type, delegating to next classifier if needed. Args: - transaction: Transaction dictionary + transaction: Transaction object template: Optional bank template with transaction type keywords Returns: @@ -68,7 +69,7 @@ def classify( @abstractmethod def _do_classify( - self, transaction: dict, template: "BankTemplate | None" + self, transaction: "Transaction", template: "BankTemplate | None" ) -> str | None: """Attempt to classify the transaction. @@ -87,13 +88,13 @@ class TemplateKeywordClassifier(TransactionTypeClassifier): """ def _do_classify( - self, transaction: dict, template: "BankTemplate | None" + self, transaction: "Transaction", template: "BankTemplate | None" ) -> str | None: """Classify using template transaction_types keyword mappings.""" if not template or not template.processing.transaction_types: return None - details = transaction.get("Details", "").upper() + details = transaction.details.upper() if not details: return None @@ -152,15 +153,13 @@ class CreditCardPatternClassifier(TransactionTypeClassifier): ] def _do_classify( - self, transaction: dict, template: "BankTemplate | None" + self, transaction: "Transaction", template: "BankTemplate | None" ) -> str | None: """Classify credit card transactions.""" - # Only run for credit card statements - document_type = transaction.get("document_type", "") - if document_type != "credit_card_statement": + if transaction.document_type != "credit_card_statement": return None - details = transaction.get("Details", "").upper() + details = transaction.details.upper() if not details: return None @@ -223,15 +222,13 @@ class BankStatementPatternClassifier(TransactionTypeClassifier): ] def _do_classify( - self, transaction: dict, template: "BankTemplate | None" + self, transaction: "Transaction", template: "BankTemplate | None" ) -> str | None: """Classify bank statement transactions.""" - # Only run for bank statements - document_type = transaction.get("document_type", "") - if document_type != "bank_statement": + if transaction.document_type != "bank_statement": return None - details = transaction.get("Details", "").upper() + details = transaction.details.upper() if not details: return None @@ -259,29 +256,23 @@ class AmountBasedClassifier(TransactionTypeClassifier): """ def _do_classify( - self, transaction: dict, template: "BankTemplate | None" + self, transaction: "Transaction", template: "BankTemplate | None" ) -> str | None: """Classify based on amount patterns.""" - debit = transaction.get("Debit_AMT") - credit = transaction.get("Credit_AMT") - - debit_amount = to_float(str(debit)) if debit else None - credit_amount = to_float(str(credit)) if credit else None + debit_amount = to_float(str(transaction.debit)) if transaction.debit else None + credit_amount = ( + to_float(str(transaction.credit)) if transaction.credit else None + ) # Credit only (money in) - likely refund or transfer if credit_amount and credit_amount > 0 and not debit_amount: - # Could be refund or incoming transfer - # Without keywords, default to transfer for bank statements - document_type = transaction.get("document_type", "") - if document_type == "credit_card_statement": + if transaction.document_type == "credit_card_statement": return "refund" # Credits on credit cards are usually refunds return "transfer" # Credits on bank accounts are usually transfers # Debit only (money out) - likely purchase or payment if debit_amount and debit_amount > 0 and not credit_amount: - # Could be purchase or outgoing payment - document_type = transaction.get("document_type", "") - if document_type == "credit_card_statement": + if transaction.document_type == "credit_card_statement": return "purchase" # Debits on credit cards are usually purchases return "payment" # Debits on bank accounts are usually payments @@ -301,7 +292,7 @@ class DefaultClassifier(TransactionTypeClassifier): """ def _do_classify( - self, transaction: dict, template: "BankTemplate | None" + self, transaction: "Transaction", template: "BankTemplate | None" ) -> str | None: """Always return 'other' as default classification.""" return "other" diff --git a/packages/parser-core/tests/patterns/test_credit_card_duplicate_strategy.py b/packages/parser-core/tests/patterns/test_credit_card_duplicate_strategy.py index 98b9331..b5e591b 100644 --- a/packages/parser-core/tests/patterns/test_credit_card_duplicate_strategy.py +++ b/packages/parser-core/tests/patterns/test_credit_card_duplicate_strategy.py @@ -4,9 +4,14 @@ import pytest +from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.patterns.strategies import CreditCardDuplicateStrategy +def _tx(**kwargs) -> Transaction: + return Transaction.from_dict(kwargs) + + class TestCreditCardDuplicateStrategy: """Test credit card duplicate detection strategy.""" @@ -15,47 +20,47 @@ def test_same_transaction_type_amount_date_detected_as_duplicate(self): strategy = CreditCardDuplicateStrategy() transactions = [ - { - "Date": "01/12/2023", - "Details": "POS TESCO STORES", - "Debit_AMT": "45.23", - "Filename": "card1.pdf", - "transaction_type": "purchase", - }, - { - "Date": "01/12/2023", - "Details": "POS TESCO SUPERMARKET", # Different description - "Debit_AMT": "45.23", - "Filename": "card2.pdf", - "transaction_type": "purchase", - }, + _tx( + Date="01/12/2023", + Details="POS TESCO STORES", + Debit_AMT="45.23", + Filename="card1.pdf", + transaction_type="purchase", + ), + _tx( + Date="01/12/2023", + Details="POS TESCO SUPERMARKET", + Debit_AMT="45.23", + Filename="card2.pdf", + transaction_type="purchase", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) assert len(unique) == 1 assert len(duplicates) == 1 - assert duplicates[0]["Filename"] == "card2.pdf" + assert duplicates[0].filename == "card2.pdf" def test_different_transaction_types_not_duplicate(self): """Should NOT treat purchase and payment with same amount as duplicates.""" strategy = CreditCardDuplicateStrategy() transactions = [ - { - "Date": "01/12/2023", - "Details": "POS TESCO STORES", - "Debit_AMT": "50.00", - "Filename": "card.pdf", - "transaction_type": "purchase", - }, - { - "Date": "01/12/2023", - "Details": "PAYMENT RECEIVED", - "Credit_AMT": "50.00", - "Filename": "card.pdf", - "transaction_type": "payment", - }, + _tx( + Date="01/12/2023", + Details="POS TESCO STORES", + Debit_AMT="50.00", + Filename="card.pdf", + transaction_type="purchase", + ), + _tx( + Date="01/12/2023", + Details="PAYMENT RECEIVED", + Credit_AMT="50.00", + Filename="card.pdf", + transaction_type="payment", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) @@ -68,20 +73,20 @@ def test_same_amount_different_dates_not_duplicate(self): strategy = CreditCardDuplicateStrategy() transactions = [ - { - "Date": "01/12/2023", - "Details": "POS TESCO", - "Debit_AMT": "45.23", - "Filename": "card.pdf", - "transaction_type": "purchase", - }, - { - "Date": "02/12/2023", # Different date - "Details": "POS TESCO", - "Debit_AMT": "45.23", - "Filename": "card.pdf", - "transaction_type": "purchase", - }, + _tx( + Date="01/12/2023", + Details="POS TESCO", + Debit_AMT="45.23", + Filename="card.pdf", + transaction_type="purchase", + ), + _tx( + Date="02/12/2023", + Details="POS TESCO", + Debit_AMT="45.23", + Filename="card.pdf", + transaction_type="purchase", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) @@ -94,20 +99,20 @@ def test_description_variations_allowed(self): strategy = CreditCardDuplicateStrategy() transactions = [ - { - "Date": "01/12/2023", - "Details": "TESCO STORE 123", - "Debit_AMT": "45.23", - "Filename": "card1.pdf", - "transaction_type": "purchase", - }, - { - "Date": "01/12/2023", - "Details": "TESCO STORES LTD", # Different merchant description - "Debit_AMT": "45.23", - "Filename": "card2.pdf", - "transaction_type": "purchase", - }, + _tx( + Date="01/12/2023", + Details="TESCO STORE 123", + Debit_AMT="45.23", + Filename="card1.pdf", + transaction_type="purchase", + ), + _tx( + Date="01/12/2023", + Details="TESCO STORES LTD", + Debit_AMT="45.23", + Filename="card2.pdf", + transaction_type="purchase", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) @@ -123,20 +128,20 @@ def test_multiple_small_purchases_same_day_not_duplicate(self): # Scenario: Two different coffee purchases on same day, same amount # These should NOT be duplicates (common scenario) transactions = [ - { - "Date": "01/12/2023", - "Details": "COFFEE SHOP A", - "Debit_AMT": "4.50", - "Filename": "card.pdf", - "transaction_type": "purchase", - }, - { - "Date": "01/12/2023", - "Details": "COFFEE SHOP B", - "Debit_AMT": "4.50", - "Filename": "card.pdf", # Same file - "transaction_type": "purchase", - }, + _tx( + Date="01/12/2023", + Details="COFFEE SHOP A", + Debit_AMT="4.50", + Filename="card.pdf", + transaction_type="purchase", + ), + _tx( + Date="01/12/2023", + Details="COFFEE SHOP B", + Debit_AMT="4.50", + Filename="card.pdf", + transaction_type="purchase", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) @@ -150,47 +155,47 @@ def test_cross_file_duplicate_detection(self): strategy = CreditCardDuplicateStrategy() transactions = [ - { - "Date": "01/12/2023", - "Details": "POS AMAZON", - "Debit_AMT": "99.99", - "Filename": "statement_nov.pdf", - "transaction_type": "purchase", - }, - { - "Date": "01/12/2023", - "Details": "AMAZON PURCHASE", - "Debit_AMT": "99.99", - "Filename": "statement_dec.pdf", # Different file - "transaction_type": "purchase", - }, + _tx( + Date="01/12/2023", + Details="POS AMAZON", + Debit_AMT="99.99", + Filename="statement_nov.pdf", + transaction_type="purchase", + ), + _tx( + Date="01/12/2023", + Details="AMAZON PURCHASE", + Debit_AMT="99.99", + Filename="statement_dec.pdf", + transaction_type="purchase", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) assert len(unique) == 1 assert len(duplicates) == 1 - assert duplicates[0]["Filename"] == "statement_dec.pdf" + assert duplicates[0].filename == "statement_dec.pdf" def test_same_file_not_marked_duplicate(self): """Should NOT mark transactions from same file as duplicates.""" strategy = CreditCardDuplicateStrategy() transactions = [ - { - "Date": "01/12/2023", - "Details": "TRANSACTION 1", - "Debit_AMT": "50.00", - "Filename": "card.pdf", - "transaction_type": "purchase", - }, - { - "Date": "01/12/2023", - "Details": "TRANSACTION 2", - "Debit_AMT": "50.00", - "Filename": "card.pdf", # Same file - "transaction_type": "purchase", - }, + _tx( + Date="01/12/2023", + Details="TRANSACTION 1", + Debit_AMT="50.00", + Filename="card.pdf", + transaction_type="purchase", + ), + _tx( + Date="01/12/2023", + Details="TRANSACTION 2", + Debit_AMT="50.00", + Filename="card.pdf", + transaction_type="purchase", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) @@ -204,20 +209,18 @@ def test_handles_missing_transaction_type(self): strategy = CreditCardDuplicateStrategy() transactions = [ - { - "Date": "01/12/2023", - "Details": "PURCHASE", - "Debit_AMT": "50.00", - "Filename": "card1.pdf", - # Missing transaction_type - }, - { - "Date": "01/12/2023", - "Details": "PURCHASE", - "Debit_AMT": "50.00", - "Filename": "card2.pdf", - # Missing transaction_type - }, + _tx( + Date="01/12/2023", + Details="PURCHASE", + Debit_AMT="50.00", + Filename="card1.pdf", + ), + _tx( + Date="01/12/2023", + Details="PURCHASE", + Debit_AMT="50.00", + Filename="card2.pdf", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) @@ -231,20 +234,20 @@ def test_credit_amount_handled_correctly(self): strategy = CreditCardDuplicateStrategy() transactions = [ - { - "Date": "05/12/2023", - "Details": "REFUND", - "Credit_AMT": "25.00", # Credit, not Debit - "Filename": "card1.pdf", - "transaction_type": "refund", - }, - { - "Date": "05/12/2023", - "Details": "REFUND PROCESSED", - "Credit_AMT": "25.00", - "Filename": "card2.pdf", - "transaction_type": "refund", - }, + _tx( + Date="05/12/2023", + Details="REFUND", + Credit_AMT="25.00", + Filename="card1.pdf", + transaction_type="refund", + ), + _tx( + Date="05/12/2023", + Details="REFUND PROCESSED", + Credit_AMT="25.00", + Filename="card2.pdf", + transaction_type="refund", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) @@ -257,22 +260,20 @@ def test_debit_and_credit_same_amount_not_duplicate(self): strategy = CreditCardDuplicateStrategy() transactions = [ - { - "Date": "01/12/2023", - "Details": "PURCHASE", - "Debit_AMT": "50.00", - "Credit_AMT": None, - "Filename": "card.pdf", - "transaction_type": "purchase", - }, - { - "Date": "01/12/2023", - "Details": "REFUND", - "Debit_AMT": None, - "Credit_AMT": "50.00", - "Filename": "card.pdf", - "transaction_type": "refund", - }, + _tx( + Date="01/12/2023", + Details="PURCHASE", + Debit_AMT="50.00", + Filename="card.pdf", + transaction_type="purchase", + ), + _tx( + Date="01/12/2023", + Details="REFUND", + Credit_AMT="50.00", + Filename="card.pdf", + transaction_type="refund", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) @@ -286,22 +287,18 @@ def test_zero_amount_transactions(self): strategy = CreditCardDuplicateStrategy() transactions = [ - { - "Date": "01/12/2023", - "Details": "AUTHORIZATION HOLD", - "Debit_AMT": None, - "Credit_AMT": None, - "Filename": "card1.pdf", - "transaction_type": "other", - }, - { - "Date": "01/12/2023", - "Details": "AUTHORIZATION HOLD", - "Debit_AMT": None, - "Credit_AMT": None, - "Filename": "card2.pdf", - "transaction_type": "other", - }, + _tx( + Date="01/12/2023", + Details="AUTHORIZATION HOLD", + Filename="card1.pdf", + transaction_type="other", + ), + _tx( + Date="01/12/2023", + Details="AUTHORIZATION HOLD", + Filename="card2.pdf", + transaction_type="other", + ), ] unique, duplicates = strategy.detect_duplicates(transactions) @@ -314,12 +311,12 @@ def test_create_key_format(self): """Should create key in format: date:transaction_type:amount.""" strategy = CreditCardDuplicateStrategy() - transaction = { - "Date": "01/12/2023", - "Details": "POS TESCO", - "Debit_AMT": "45.23", - "transaction_type": "purchase", - } + transaction = _tx( + Date="01/12/2023", + Details="POS TESCO", + Debit_AMT="45.23", + transaction_type="purchase", + ) key = strategy.create_key(transaction) @@ -329,27 +326,23 @@ def test_create_key_handles_missing_transaction_type(self): """Should use 'other' when transaction_type is missing.""" strategy = CreditCardDuplicateStrategy() - transaction = { - "Date": "01/12/2023", - "Details": "UNKNOWN", - "Debit_AMT": "10.00", - } + transaction = _tx(Date="01/12/2023", Details="UNKNOWN", Debit_AMT="10.00") key = strategy.create_key(transaction) - assert key == "01/12/2023:other:10.00" + assert key == "01/12/2023::10.00" def test_create_key_prefers_debit_over_credit(self): """Should use Debit amount when both Debit and Credit exist.""" strategy = CreditCardDuplicateStrategy() - transaction = { - "Date": "01/12/2023", - "Details": "TRANSACTION", - "Debit_AMT": "50.00", - "Credit_AMT": "25.00", # Both present - "transaction_type": "purchase", - } + transaction = _tx( + Date="01/12/2023", + Details="TRANSACTION", + Debit_AMT="50.00", + Credit_AMT="25.00", + transaction_type="purchase", + ) key = strategy.create_key(transaction) diff --git a/packages/parser-core/tests/services/test_duplicate_detector.py b/packages/parser-core/tests/services/test_duplicate_detector.py index 29da3e2..330324a 100644 --- a/packages/parser-core/tests/services/test_duplicate_detector.py +++ b/packages/parser-core/tests/services/test_duplicate_detector.py @@ -4,6 +4,7 @@ import pytest +from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.patterns.strategies import ( AllFieldsDuplicateStrategy, DateAmountDuplicateStrategy, @@ -26,18 +27,22 @@ def test_detect_and_separate_no_duplicates(self): service = DuplicateDetectionService(strategy) transactions = [ - { - "Date": "01 Jan 2023", - "Details": "Purchase 1", - "Debit €": "50.00", - "Filename": "file1.pdf", - }, - { - "Date": "02 Jan 2023", - "Details": "Purchase 2", - "Debit €": "25.00", - "Filename": "file1.pdf", - }, + Transaction.from_dict( + { + "Date": "01 Jan 2023", + "Details": "Purchase 1", + "Debit €": "50.00", + "Filename": "file1.pdf", + } + ), + Transaction.from_dict( + { + "Date": "02 Jan 2023", + "Details": "Purchase 2", + "Debit €": "25.00", + "Filename": "file1.pdf", + } + ), ] unique, duplicates = service.detect_and_separate(transactions) @@ -50,25 +55,29 @@ def test_detect_and_separate_with_duplicates(self): service = DuplicateDetectionService(strategy) transactions = [ - { - "Date": "01 Jan 2023", - "Details": "Purchase 1", - "Debit €": "50.00", - "Filename": "file1.pdf", - }, - { - "Date": "01 Jan 2023", - "Details": "Purchase 1", - "Debit €": "50.00", - "Filename": "file2.pdf", # Same transaction, different file - }, + Transaction.from_dict( + { + "Date": "01 Jan 2023", + "Details": "Purchase 1", + "Debit €": "50.00", + "Filename": "file1.pdf", + } + ), + Transaction.from_dict( + { + "Date": "01 Jan 2023", + "Details": "Purchase 1", + "Debit €": "50.00", + "Filename": "file2.pdf", + } + ), ] unique, duplicates = service.detect_and_separate(transactions) assert len(unique) == 1 assert len(duplicates) == 1 - assert unique[0]["Filename"] == "file1.pdf" - assert duplicates[0]["Filename"] == "file2.pdf" + assert unique[0].filename == "file1.pdf" + assert duplicates[0].filename == "file2.pdf" def test_detect_and_separate_same_file_same_transaction(self): """Test that same transaction in same file is not marked as duplicate.""" @@ -76,18 +85,22 @@ def test_detect_and_separate_same_file_same_transaction(self): service = DuplicateDetectionService(strategy) transactions = [ - { - "Date": "01 Jan 2023", - "Details": "Purchase 1", - "Debit €": "50.00", - "Filename": "file1.pdf", - }, - { - "Date": "01 Jan 2023", - "Details": "Purchase 1", - "Debit €": "50.00", - "Filename": "file1.pdf", # Same file - }, + Transaction.from_dict( + { + "Date": "01 Jan 2023", + "Details": "Purchase 1", + "Debit €": "50.00", + "Filename": "file1.pdf", + } + ), + Transaction.from_dict( + { + "Date": "01 Jan 2023", + "Details": "Purchase 1", + "Debit €": "50.00", + "Filename": "file1.pdf", + } + ), ] unique, duplicates = service.detect_and_separate(transactions) @@ -101,18 +114,22 @@ def test_detect_and_separate_with_date_amount_strategy(self): service = DuplicateDetectionService(strategy) transactions = [ - { - "Date": "01 Jan 2023", - "Details": "Purchase at Store A", - "Debit €": "50.00", - "Filename": "file1.pdf", - }, - { - "Date": "01 Jan 2023", - "Details": "Purchase at Store B", # Different details - "Debit €": "50.00", # Same amount and date - "Filename": "file2.pdf", - }, + Transaction.from_dict( + { + "Date": "01 Jan 2023", + "Details": "Purchase at Store A", + "Debit €": "50.00", + "Filename": "file1.pdf", + } + ), + Transaction.from_dict( + { + "Date": "01 Jan 2023", + "Details": "Purchase at Store B", + "Debit €": "50.00", + "Filename": "file2.pdf", + } + ), ] unique, duplicates = service.detect_and_separate(transactions) @@ -189,38 +206,48 @@ def test_detect_multiple_files_complex_scenario(self): transactions = [ # Transaction 1 appears in file1 and file2 (duplicate in file2) - { - "Date": "01 Jan 2023", - "Details": "Txn1", - "Debit €": "50.00", - "Filename": "file1.pdf", - }, - { - "Date": "01 Jan 2023", - "Details": "Txn1", - "Debit €": "50.00", - "Filename": "file2.pdf", - }, + Transaction.from_dict( + { + "Date": "01 Jan 2023", + "Details": "Txn1", + "Debit €": "50.00", + "Filename": "file1.pdf", + } + ), + Transaction.from_dict( + { + "Date": "01 Jan 2023", + "Details": "Txn1", + "Debit €": "50.00", + "Filename": "file2.pdf", + } + ), # Transaction 2 only in file1 (unique) - { - "Date": "02 Jan 2023", - "Details": "Txn2", - "Debit €": "25.00", - "Filename": "file1.pdf", - }, + Transaction.from_dict( + { + "Date": "02 Jan 2023", + "Details": "Txn2", + "Debit €": "25.00", + "Filename": "file1.pdf", + } + ), # Transaction 3 appears in file2 and file3 (duplicate in file3) - { - "Date": "03 Jan 2023", - "Details": "Txn3", - "Debit €": "75.00", - "Filename": "file2.pdf", - }, - { - "Date": "03 Jan 2023", - "Details": "Txn3", - "Debit €": "75.00", - "Filename": "file3.pdf", - }, + Transaction.from_dict( + { + "Date": "03 Jan 2023", + "Details": "Txn3", + "Debit €": "75.00", + "Filename": "file2.pdf", + } + ), + Transaction.from_dict( + { + "Date": "03 Jan 2023", + "Details": "Txn3", + "Debit €": "75.00", + "Filename": "file3.pdf", + } + ), ] unique, duplicates = service.detect_and_separate(transactions) diff --git a/packages/parser-core/tests/services/test_iban_grouping.py b/packages/parser-core/tests/services/test_iban_grouping.py index d99aa33..1539ece 100644 --- a/packages/parser-core/tests/services/test_iban_grouping.py +++ b/packages/parser-core/tests/services/test_iban_grouping.py @@ -5,6 +5,7 @@ import logging import unittest +from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.services.iban_grouping import IBANGroupingService @@ -18,9 +19,15 @@ def setUp(self): def test_group_by_iban_with_valid_ibans(self): """Test grouping with valid IBANs.""" rows = [ - {"Filename": "file1.pdf", "Details": "Transaction 1"}, - {"Filename": "file2.pdf", "Details": "Transaction 2"}, - {"Filename": "file1.pdf", "Details": "Transaction 3"}, + Transaction.from_dict( + {"Filename": "file1.pdf", "Details": "Transaction 1"} + ), + Transaction.from_dict( + {"Filename": "file2.pdf", "Details": "Transaction 2"} + ), + Transaction.from_dict( + {"Filename": "file1.pdf", "Details": "Transaction 3"} + ), ] pdf_ibans = { "file1.pdf": "IE12 BOFI 9000 0112 3456", @@ -39,8 +46,12 @@ def test_group_by_iban_with_valid_ibans(self): def test_group_by_iban_without_iban(self): """Test grouping when PDF has no IBAN.""" rows = [ - {"Filename": "file1.pdf", "Details": "Transaction 1"}, - {"Filename": "file2.pdf", "Details": "Transaction 2"}, + Transaction.from_dict( + {"Filename": "file1.pdf", "Details": "Transaction 1"} + ), + Transaction.from_dict( + {"Filename": "file2.pdf", "Details": "Transaction 2"} + ), ] pdf_ibans = { "file1.pdf": "IE12 BOFI 9000 0112 3456", @@ -59,8 +70,10 @@ def test_group_by_iban_without_iban(self): def test_group_by_iban_missing_filename(self, caplog=None): """Test grouping when row has no filename.""" rows = [ - {"Details": "Transaction 1"}, # No Filename field - {"Filename": "file1.pdf", "Details": "Transaction 2"}, + Transaction.from_dict({"Details": "Transaction 1"}), # No Filename field + Transaction.from_dict( + {"Filename": "file1.pdf", "Details": "Transaction 2"} + ), ] pdf_ibans = { "file1.pdf": "IE12 BOFI 9000 0112 3456", diff --git a/packages/parser-core/tests/services/test_service_registry.py b/packages/parser-core/tests/services/test_service_registry.py index 6366167..a113570 100644 --- a/packages/parser-core/tests/services/test_service_registry.py +++ b/packages/parser-core/tests/services/test_service_registry.py @@ -21,6 +21,7 @@ ProcessingConfig, ProcessorConfig, ) +from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.services.duplicate_detector import DuplicateDetectionService from bankstatements_core.services.iban_grouping import IBANGroupingService from bankstatements_core.services.service_registry import ServiceRegistry @@ -66,7 +67,9 @@ class TestProcessTransactionGroup: def test_enriches_classifies_deduplicates_and_sorts(self): config = _minimal_config() transactions = [ - {"Date": "01/01/2024", "Details": "Test", "source_pdf": "a.pdf"}, + Transaction.from_dict( + {"Date": "01/01/2024", "Details": "Test", "source_pdf": "a.pdf"} + ), ] mock_dedup = Mock() @@ -84,9 +87,9 @@ def test_enriches_classifies_deduplicates_and_sorts(self): # Enrichment happened before dedup called_with = mock_dedup.detect_and_separate.call_args[0][0] - assert called_with[0]["Filename"] == "a.pdf" - assert called_with[0]["document_type"] == "bank_statement" - assert "transaction_type" in called_with[0] + assert called_with[0].filename == "a.pdf" + assert called_with[0].document_type == "bank_statement" + assert called_with[0].transaction_type != "" # Sort was called and result returned mock_sort.sort.assert_called_once() @@ -95,8 +98,12 @@ def test_enriches_classifies_deduplicates_and_sorts(self): def test_returns_unique_and_duplicate_lists(self): config = _minimal_config() - tx1 = {"Date": "01/01/2024", "Details": "A", "source_pdf": "x.pdf"} - tx2 = {"Date": "01/01/2024", "Details": "A", "source_pdf": "x.pdf"} + tx1 = Transaction.from_dict( + {"Date": "01/01/2024", "Details": "A", "source_pdf": "x.pdf"} + ) + tx2 = Transaction.from_dict( + {"Date": "01/01/2024", "Details": "A", "source_pdf": "x.pdf"} + ) mock_dedup = Mock() mock_dedup.detect_and_separate.return_value = ([tx1], [tx2]) @@ -121,7 +128,7 @@ def test_delegates_to_grouping_service(self): mock_group.group_by_iban.return_value = {"1234": []} registry = ServiceRegistry.from_config(config, grouping_service=mock_group) - transactions = [{"Date": "01/01/2024"}] + transactions = [Transaction.from_dict({"Date": "01/01/2024"})] pdf_ibans = {"a.pdf": "IE001234"} result = registry.group_by_iban(transactions, pdf_ibans) diff --git a/packages/parser-core/tests/services/test_sorting_service.py b/packages/parser-core/tests/services/test_sorting_service.py index bc96e3b..e99db3b 100644 --- a/packages/parser-core/tests/services/test_sorting_service.py +++ b/packages/parser-core/tests/services/test_sorting_service.py @@ -4,6 +4,7 @@ import pytest +from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.services.sorting_service import ( ChronologicalSortingStrategy, NoSortingStrategy, @@ -11,6 +12,10 @@ ) +def _tx(date: str, details: str) -> Transaction: + return Transaction.from_dict({"Date": date, "Details": details}) + + class TestChronologicalSortingStrategy: """Tests for ChronologicalSortingStrategy.""" @@ -18,32 +23,32 @@ def test_sort_chronologically(self): """Test sorting transactions by date chronologically.""" strategy = ChronologicalSortingStrategy() transactions = [ - {"Date": "15 Jan 2023", "Details": "Transaction 3"}, - {"Date": "01 Jan 2023", "Details": "Transaction 1"}, - {"Date": "10 Jan 2023", "Details": "Transaction 2"}, + _tx("15 Jan 2023", "Transaction 3"), + _tx("01 Jan 2023", "Transaction 1"), + _tx("10 Jan 2023", "Transaction 2"), ] sorted_txns = strategy.sort(transactions) assert len(sorted_txns) == 3 - assert sorted_txns[0]["Details"] == "Transaction 1" # 01 Jan - assert sorted_txns[1]["Details"] == "Transaction 2" # 10 Jan - assert sorted_txns[2]["Details"] == "Transaction 3" # 15 Jan + assert sorted_txns[0].details == "Transaction 1" # 01 Jan + assert sorted_txns[1].details == "Transaction 2" # 10 Jan + assert sorted_txns[2].details == "Transaction 3" # 15 Jan def test_sort_with_different_date_formats(self): """Test sorting with various date formats.""" strategy = ChronologicalSortingStrategy() transactions = [ - {"Date": "15/01/2023", "Details": "Transaction 3"}, - {"Date": "01 Jan 2023", "Details": "Transaction 1"}, - {"Date": "10-01-2023", "Details": "Transaction 2"}, + _tx("15/01/2023", "Transaction 3"), + _tx("01 Jan 2023", "Transaction 1"), + _tx("10-01-2023", "Transaction 2"), ] sorted_txns = strategy.sort(transactions) assert len(sorted_txns) == 3 # All are from January 2023, so should sort by day - assert sorted_txns[0]["Details"] == "Transaction 1" # Day 1 - assert sorted_txns[1]["Details"] == "Transaction 2" # Day 10 - assert sorted_txns[2]["Details"] == "Transaction 3" # Day 15 + assert sorted_txns[0].details == "Transaction 1" # Day 1 + assert sorted_txns[1].details == "Transaction 2" # Day 10 + assert sorted_txns[2].details == "Transaction 3" # Day 15 def test_sort_empty_list(self): """Test sorting empty transaction list.""" @@ -54,18 +59,18 @@ def test_sort_empty_list(self): def test_sort_single_transaction(self): """Test sorting single transaction.""" strategy = ChronologicalSortingStrategy() - transactions = [{"Date": "01 Jan 2023", "Details": "Single"}] + transactions = [_tx("01 Jan 2023", "Single")] sorted_txns = strategy.sort(transactions) assert len(sorted_txns) == 1 - assert sorted_txns[0]["Details"] == "Single" + assert sorted_txns[0].details == "Single" def test_sort_with_missing_dates(self): """Test sorting when some transactions have missing dates.""" strategy = ChronologicalSortingStrategy() transactions = [ - {"Date": "15 Jan 2023", "Details": "Has date"}, - {"Date": "", "Details": "No date"}, - {"Date": "01 Jan 2023", "Details": "Has date 2"}, + _tx("15 Jan 2023", "Has date"), + _tx("", "No date"), + _tx("01 Jan 2023", "Has date 2"), ] sorted_txns = strategy.sort(transactions) @@ -76,9 +81,9 @@ def test_sort_same_dates(self): """Test sorting transactions with same dates.""" strategy = ChronologicalSortingStrategy() transactions = [ - {"Date": "01 Jan 2023", "Details": "Transaction A"}, - {"Date": "01 Jan 2023", "Details": "Transaction B"}, - {"Date": "01 Jan 2023", "Details": "Transaction C"}, + _tx("01 Jan 2023", "Transaction A"), + _tx("01 Jan 2023", "Transaction B"), + _tx("01 Jan 2023", "Transaction C"), ] sorted_txns = strategy.sort(transactions) @@ -93,16 +98,16 @@ def test_keeps_original_order(self): """Test that original order is preserved.""" strategy = NoSortingStrategy() transactions = [ - {"Date": "15 Jan 2023", "Details": "Transaction 3"}, - {"Date": "01 Jan 2023", "Details": "Transaction 1"}, - {"Date": "10 Jan 2023", "Details": "Transaction 2"}, + _tx("15 Jan 2023", "Transaction 3"), + _tx("01 Jan 2023", "Transaction 1"), + _tx("10 Jan 2023", "Transaction 2"), ] sorted_txns = strategy.sort(transactions) assert len(sorted_txns) == 3 - assert sorted_txns[0]["Details"] == "Transaction 3" - assert sorted_txns[1]["Details"] == "Transaction 1" - assert sorted_txns[2]["Details"] == "Transaction 2" + assert sorted_txns[0].details == "Transaction 3" + assert sorted_txns[1].details == "Transaction 1" + assert sorted_txns[2].details == "Transaction 2" def test_no_sort_empty_list(self): """Test with empty transaction list.""" @@ -113,10 +118,10 @@ def test_no_sort_empty_list(self): def test_no_sort_single_transaction(self): """Test with single transaction.""" strategy = NoSortingStrategy() - transactions = [{"Date": "01 Jan 2023", "Details": "Single"}] + transactions = [_tx("01 Jan 2023", "Single")] sorted_txns = strategy.sort(transactions) assert len(sorted_txns) == 1 - assert sorted_txns[0]["Details"] == "Single" + assert sorted_txns[0].details == "Single" class TestTransactionSortingService: @@ -140,16 +145,16 @@ def test_sort_with_chronological_strategy(self): service = TransactionSortingService(strategy) transactions = [ - {"Date": "15 Jan 2023", "Details": "Transaction 3"}, - {"Date": "01 Jan 2023", "Details": "Transaction 1"}, - {"Date": "10 Jan 2023", "Details": "Transaction 2"}, + _tx("15 Jan 2023", "Transaction 3"), + _tx("01 Jan 2023", "Transaction 1"), + _tx("10 Jan 2023", "Transaction 2"), ] sorted_txns = service.sort(transactions) assert len(sorted_txns) == 3 - assert sorted_txns[0]["Details"] == "Transaction 1" - assert sorted_txns[1]["Details"] == "Transaction 2" - assert sorted_txns[2]["Details"] == "Transaction 3" + assert sorted_txns[0].details == "Transaction 1" + assert sorted_txns[1].details == "Transaction 2" + assert sorted_txns[2].details == "Transaction 3" def test_sort_with_no_sorting_strategy(self): """Test sorting using no sorting strategy.""" @@ -157,17 +162,17 @@ def test_sort_with_no_sorting_strategy(self): service = TransactionSortingService(strategy) transactions = [ - {"Date": "15 Jan 2023", "Details": "Transaction 3"}, - {"Date": "01 Jan 2023", "Details": "Transaction 1"}, - {"Date": "10 Jan 2023", "Details": "Transaction 2"}, + _tx("15 Jan 2023", "Transaction 3"), + _tx("01 Jan 2023", "Transaction 1"), + _tx("10 Jan 2023", "Transaction 2"), ] sorted_txns = service.sort(transactions) assert len(sorted_txns) == 3 # Original order preserved - assert sorted_txns[0]["Details"] == "Transaction 3" - assert sorted_txns[1]["Details"] == "Transaction 1" - assert sorted_txns[2]["Details"] == "Transaction 2" + assert sorted_txns[0].details == "Transaction 3" + assert sorted_txns[1].details == "Transaction 1" + assert sorted_txns[2].details == "Transaction 2" def test_sort_empty_list(self): """Test sorting empty list.""" @@ -197,15 +202,10 @@ def test_sort_large_dataset(self): transactions = [] for i in range(100): day = (i % 28) + 1 - transactions.append( - { - "Date": f"{day:02d} Jan 2023", - "Details": f"Transaction {i}", - } - ) + transactions.append(_tx(f"{day:02d} Jan 2023", f"Transaction {i}")) sorted_txns = service.sort(transactions) assert len(sorted_txns) == 100 # Verify first few are early dates - assert "01 Jan" in sorted_txns[0]["Date"] or "02 Jan" in sorted_txns[0]["Date"] + assert "01 Jan" in sorted_txns[0].date or "02 Jan" in sorted_txns[0].date diff --git a/packages/parser-core/tests/services/test_transaction_filter.py b/packages/parser-core/tests/services/test_transaction_filter.py index 8b1546f..515dd9f 100644 --- a/packages/parser-core/tests/services/test_transaction_filter.py +++ b/packages/parser-core/tests/services/test_transaction_filter.py @@ -4,9 +4,14 @@ import unittest +from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.services.transaction_filter import TransactionFilterService +def _tx(**kwargs) -> Transaction: + return Transaction.from_dict(kwargs) + + class TestTransactionFilterService(unittest.TestCase): """Test TransactionFilterService functionality.""" @@ -18,86 +23,75 @@ def setUp(self): def test_filter_empty_rows(self): """Test filtering of empty rows.""" rows = [ - {"Date": "01/01/23", "Details": "Test", "Debit": "100"}, - {"Date": "", "Details": "", "Debit": ""}, # Empty - {"Date": "02/01/23", "Details": "Test2", "Debit": "200"}, - {"Date": " ", "Details": " ", "Debit": " "}, # Whitespace only + _tx(Date="01/01/23", Details="Test", Debit="100"), + _tx(Date="", Details="", Debit=""), # Empty + _tx(Date="02/01/23", Details="Test2", Debit="200"), + _tx(Date=" ", Details=" ", Debit=" "), # Whitespace only ] filtered = self.service.filter_empty_rows(rows) self.assertEqual(len(filtered), 2) - self.assertEqual(filtered[0]["Details"], "Test") - self.assertEqual(filtered[1]["Details"], "Test2") + self.assertEqual(filtered[0].details, "Test") + self.assertEqual(filtered[1].details, "Test2") def test_filter_empty_rows_with_filename(self): """Test that Filename field is ignored in empty check.""" rows = [ - { - "Date": "", - "Details": "", - "Filename": "test.pdf", - }, # Has filename but no data - {"Date": "01/01/23", "Details": "Test", "Filename": "test.pdf"}, + _tx(Date="", Details="", Filename="test.pdf"), # Has filename but no data + _tx(Date="01/01/23", Details="Test", Filename="test.pdf"), ] filtered = self.service.filter_empty_rows(rows) # First row should be filtered despite having Filename self.assertEqual(len(filtered), 1) - self.assertEqual(filtered[0]["Details"], "Test") + self.assertEqual(filtered[0].details, "Test") def test_filter_header_rows(self): """Test filtering of header rows.""" rows = [ - { - "Date": "Date", - "Details": "Details", - "Debit": "Debit", - }, # Perfect header match - {"Date": "01/01/23", "Details": "Test", "Debit": "100"}, - { - "Date": "DATE", - "Details": "DETAILS", - "Debit": "DEBIT", - "Credit": "CREDIT", - }, # Header in uppercase + _tx(Date="Date", Details="Details", Debit="Debit"), # Perfect header match + _tx(Date="01/01/23", Details="Test", Debit="100"), + _tx( + Date="DATE", Details="DETAILS", Debit="DEBIT", Credit="CREDIT" + ), # Header in uppercase ] filtered = self.service.filter_header_rows(rows) # Should filter both header rows self.assertEqual(len(filtered), 1) - self.assertEqual(filtered[0]["Details"], "Test") + self.assertEqual(filtered[0].details, "Test") def test_filter_header_rows_partial_match(self): """Test header filtering with partial column name matches.""" rows = [ - {"Date": "01/01/23", "Details": "Some details here", "Debit": "100"}, - {"Date": "DATE", "Details": "DETAILS", "Debit": "DEBIT"}, # 100% match + _tx(Date="01/01/23", Details="Some details here", Debit="100"), + _tx(Date="DATE", Details="DETAILS", Debit="DEBIT"), # 100% match ] filtered = self.service.filter_header_rows(rows) # Should filter the header row self.assertEqual(len(filtered), 1) - self.assertEqual(filtered[0]["Debit"], "100") + self.assertEqual(filtered[0].debit, "100") def test_filter_invalid_dates(self): """Test filtering of rows with invalid dates.""" rows = [ - {"Date": "01/01/23", "Details": "Test"}, - {"Date": "Product", "Details": "Not a date"}, # Invalid - {"Date": "11 Aug", "Details": "Valid partial date"}, - {"Date": "", "Details": "Empty date"}, # Invalid - {"Date": "Total", "Details": "Not a date"}, # Invalid + _tx(Date="01/01/23", Details="Test"), + _tx(Date="Product", Details="Not a date"), # Invalid + _tx(Date="11 Aug", Details="Valid partial date"), + _tx(Date="", Details="Empty date"), # Invalid + _tx(Date="Total", Details="Not a date"), # Invalid ] filtered = self.service.filter_invalid_dates(rows) self.assertEqual(len(filtered), 2) - self.assertEqual(filtered[0]["Date"], "01/01/23") - self.assertEqual(filtered[1]["Date"], "11 Aug") + self.assertEqual(filtered[0].date, "01/01/23") + self.assertEqual(filtered[1].date, "11 Aug") def test_is_parseable_date_valid_formats(self): """Test date validation with valid formats.""" @@ -140,46 +134,37 @@ def test_is_parseable_date_invalid_formats(self): def test_apply_all_filters(self): """Test applying all filters in sequence.""" rows = [ - {"Date": "Date", "Details": "Details", "Debit": "Debit"}, # Header - {"Date": "", "Details": "", "Debit": ""}, # Empty - {"Date": "01/01/23", "Details": "Test1", "Debit": "100"}, - { - "Date": "Product", - "Details": "Not a date", - "Debit": "200", - }, # Invalid date - {"Date": "02/01/23", "Details": "Test2", "Debit": "300"}, - {"Date": " ", "Details": " ", "Debit": " "}, # Whitespace + _tx(Date="Date", Details="Details", Debit="Debit"), # Header + _tx(Date="", Details="", Debit=""), # Empty + _tx(Date="01/01/23", Details="Test1", Debit="100"), + _tx(Date="Product", Details="Not a date", Debit="200"), # Invalid date + _tx(Date="02/01/23", Details="Test2", Debit="300"), + _tx(Date=" ", Details=" ", Debit=" "), # Whitespace ] filtered = self.service.apply_all_filters(rows) # Should keep only the 2 valid transactions self.assertEqual(len(filtered), 2) - self.assertEqual(filtered[0]["Details"], "Test1") - self.assertEqual(filtered[1]["Details"], "Test2") + self.assertEqual(filtered[0].details, "Test1") + self.assertEqual(filtered[1].details, "Test2") def test_filter_empty_rows_with_non_string_value(self): """Test empty row filtering with non-string values.""" rows = [ - {"Date": "01/01/23", "Details": "Test", "Amount": 100}, # Non-string number - {"Date": "", "Details": "", "Amount": 0}, # Zero is falsy - { - "Date": "02/01/23", - "Details": "Test2", - "Amount": None, - }, # None with other data + _tx(Date="01/01/23", Details="Test"), + _tx(Date="", Details=""), # Empty + _tx(Date="02/01/23", Details="Test2"), ] filtered = self.service.filter_empty_rows(rows) - # First and third should be kept (they have non-empty data) self.assertEqual(len(filtered), 2) def test_header_row_less_than_50_percent_match(self): """Test that rows with less than 50% column name matches aren't filtered.""" rows = [ - {"Date": "01/01/23", "Details": "Date of Transaction", "Debit": "100"}, + _tx(Date="01/01/23", Details="Date of Transaction", Debit="100"), # "Date of Transaction" contains "Date" but other fields don't match ] @@ -191,7 +176,7 @@ def test_header_row_less_than_50_percent_match(self): def test_header_row_requires_minimum_2_matches(self): """Test that header detection requires at least 2 field matches.""" rows = [ - {"Date": "Date", "Details": "Some text", "Debit": "100", "Credit": "200"}, + _tx(Date="Date", Details="Some text", Debit="100", Credit="200"), # Only 1 match (Date), should not be filtered ] diff --git a/packages/parser-core/tests/services/test_transaction_type_classifier.py b/packages/parser-core/tests/services/test_transaction_type_classifier.py index 79795dd..8e0c934 100644 --- a/packages/parser-core/tests/services/test_transaction_type_classifier.py +++ b/packages/parser-core/tests/services/test_transaction_type_classifier.py @@ -4,6 +4,7 @@ import pytest +from bankstatements_core.domain.models.transaction import Transaction from bankstatements_core.services.transaction_type_classifier import ( AmountBasedClassifier, BankStatementPatternClassifier, @@ -98,11 +99,9 @@ class TestTemplateKeywordClassifier: def test_classify_purchase_with_template_keywords(self, credit_card_template): """Should classify as purchase when Details contains template keyword.""" classifier = TemplateKeywordClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "POS TESCO STORES", - "Debit_AMT": "45.23", - } + transaction = Transaction.from_dict( + {"Date": "01/12/2023", "Details": "POS TESCO STORES", "Debit_AMT": "45.23"} + ) result = classifier.classify(transaction, credit_card_template) @@ -111,11 +110,13 @@ def test_classify_purchase_with_template_keywords(self, credit_card_template): def test_classify_payment_with_template_keywords(self, credit_card_template): """Should classify as payment when Details contains payment keyword.""" classifier = TemplateKeywordClassifier() - transaction = { - "Date": "02/12/2023", - "Details": "PAYMENT RECEIVED", - "Credit_AMT": "500.00", - } + transaction = Transaction.from_dict( + { + "Date": "02/12/2023", + "Details": "PAYMENT RECEIVED", + "Credit_AMT": "500.00", + } + ) result = classifier.classify(transaction, credit_card_template) @@ -124,11 +125,13 @@ def test_classify_payment_with_template_keywords(self, credit_card_template): def test_case_insensitive_matching(self, credit_card_template): """Should match keywords case-insensitively.""" classifier = TemplateKeywordClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "contactless payment at shop", - "Debit_AMT": "12.50", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "contactless payment at shop", + "Debit_AMT": "12.50", + } + ) result = classifier.classify(transaction, credit_card_template) @@ -137,11 +140,13 @@ def test_case_insensitive_matching(self, credit_card_template): def test_returns_none_when_no_match(self, credit_card_template): """Should return None when no keyword matches.""" classifier = TemplateKeywordClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "UNKNOWN TRANSACTION TYPE", - "Debit_AMT": "10.00", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "UNKNOWN TRANSACTION TYPE", + "Debit_AMT": "10.00", + } + ) result = classifier._do_classify(transaction, credit_card_template) @@ -150,11 +155,9 @@ def test_returns_none_when_no_match(self, credit_card_template): def test_returns_none_when_no_template(self): """Should return None when no template provided.""" classifier = TemplateKeywordClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "POS TESCO", - "Debit_AMT": "45.23", - } + transaction = Transaction.from_dict( + {"Date": "01/12/2023", "Details": "POS TESCO", "Debit_AMT": "45.23"} + ) result = classifier._do_classify(transaction, None) @@ -171,20 +174,16 @@ def test_returns_none_when_template_has_no_keywords(self): id="test", name="Test", enabled=True, - detection=TemplateDetectionConfig( - filename_patterns=["*.pdf"] - ), # Need at least one detection method + detection=TemplateDetectionConfig(filename_patterns=["*.pdf"]), extraction=TemplateExtractionConfig( table_top_y=100, table_bottom_y=700, columns={"Date": (0, 100)} ), ) classifier = TemplateKeywordClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "POS TESCO", - "Debit_AMT": "45.23", - } + transaction = Transaction.from_dict( + {"Date": "01/12/2023", "Details": "POS TESCO", "Debit_AMT": "45.23"} + ) result = classifier._do_classify(transaction, template) @@ -200,12 +199,14 @@ class TestCreditCardPatternClassifier: def test_classify_pos_purchase(self): """Should classify POS transactions as purchase.""" classifier = CreditCardPatternClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "POS TESCO STORES", - "Debit_AMT": "45.23", - "document_type": "credit_card_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "POS TESCO STORES", + "Debit_AMT": "45.23", + "document_type": "credit_card_statement", + } + ) result = classifier.classify(transaction, None) @@ -214,12 +215,14 @@ def test_classify_pos_purchase(self): def test_classify_online_purchase(self): """Should classify ONLINE transactions as purchase.""" classifier = CreditCardPatternClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "ONLINE AMAZON.COM", - "Debit_AMT": "25.99", - "document_type": "credit_card_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "ONLINE AMAZON.COM", + "Debit_AMT": "25.99", + "document_type": "credit_card_statement", + } + ) result = classifier.classify(transaction, None) @@ -228,12 +231,14 @@ def test_classify_online_purchase(self): def test_classify_payment_received(self): """Should classify payment received as payment.""" classifier = CreditCardPatternClassifier() - transaction = { - "Date": "05/12/2023", - "Details": "PAYMENT RECEIVED THANK YOU", - "Credit_AMT": "500.00", - "document_type": "credit_card_statement", - } + transaction = Transaction.from_dict( + { + "Date": "05/12/2023", + "Details": "PAYMENT RECEIVED THANK YOU", + "Credit_AMT": "500.00", + "document_type": "credit_card_statement", + } + ) result = classifier.classify(transaction, None) @@ -242,12 +247,14 @@ def test_classify_payment_received(self): def test_classify_annual_fee(self): """Should classify annual fee as fee.""" classifier = CreditCardPatternClassifier() - transaction = { - "Date": "01/01/2024", - "Details": "ANNUAL FEE", - "Debit_AMT": "12.00", - "document_type": "credit_card_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/01/2024", + "Details": "ANNUAL FEE", + "Debit_AMT": "12.00", + "document_type": "credit_card_statement", + } + ) result = classifier.classify(transaction, None) @@ -256,12 +263,14 @@ def test_classify_annual_fee(self): def test_classify_refund(self): """Should classify refund transactions as refund.""" classifier = CreditCardPatternClassifier() - transaction = { - "Date": "10/12/2023", - "Details": "REFUND AMAZON.COM", - "Credit_AMT": "15.00", - "document_type": "credit_card_statement", - } + transaction = Transaction.from_dict( + { + "Date": "10/12/2023", + "Details": "REFUND AMAZON.COM", + "Credit_AMT": "15.00", + "document_type": "credit_card_statement", + } + ) result = classifier.classify(transaction, None) @@ -270,12 +279,14 @@ def test_classify_refund(self): def test_only_runs_for_credit_card_statements(self): """Should not classify non-credit card statements.""" classifier = CreditCardPatternClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "POS TESCO STORES", - "Debit_AMT": "45.23", - "document_type": "bank_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "POS TESCO STORES", + "Debit_AMT": "45.23", + "document_type": "bank_statement", + } + ) result = classifier._do_classify(transaction, None) @@ -291,12 +302,14 @@ class TestBankStatementPatternClassifier: def test_classify_sepa_transfer(self): """Should classify SEPA transactions as transfer.""" classifier = BankStatementPatternClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "SEPA CREDIT FROM JOHN DOE", - "Credit_AMT": "100.00", - "document_type": "bank_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "SEPA CREDIT FROM JOHN DOE", + "Credit_AMT": "100.00", + "document_type": "bank_statement", + } + ) result = classifier.classify(transaction, None) @@ -305,12 +318,14 @@ def test_classify_sepa_transfer(self): def test_classify_direct_debit(self): """Should classify direct debit as payment.""" classifier = BankStatementPatternClassifier() - transaction = { - "Date": "05/12/2023", - "Details": "DIRECT DEBIT ELECTRICITY COMPANY", - "Debit_AMT": "75.50", - "document_type": "bank_statement", - } + transaction = Transaction.from_dict( + { + "Date": "05/12/2023", + "Details": "DIRECT DEBIT ELECTRICITY COMPANY", + "Debit_AMT": "75.50", + "document_type": "bank_statement", + } + ) result = classifier.classify(transaction, None) @@ -319,12 +334,14 @@ def test_classify_direct_debit(self): def test_classify_standing_order(self): """Should classify standing order as payment.""" classifier = BankStatementPatternClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "STANDING ORDER RENT", - "Debit_AMT": "1200.00", - "document_type": "bank_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "STANDING ORDER RENT", + "Debit_AMT": "1200.00", + "document_type": "bank_statement", + } + ) result = classifier.classify(transaction, None) @@ -333,12 +350,14 @@ def test_classify_standing_order(self): def test_classify_interest_credit(self): """Should classify interest credit as interest.""" classifier = BankStatementPatternClassifier() - transaction = { - "Date": "31/12/2023", - "Details": "INTEREST CREDIT", - "Credit_AMT": "2.50", - "document_type": "bank_statement", - } + transaction = Transaction.from_dict( + { + "Date": "31/12/2023", + "Details": "INTEREST CREDIT", + "Credit_AMT": "2.50", + "document_type": "bank_statement", + } + ) result = classifier.classify(transaction, None) @@ -347,12 +366,14 @@ def test_classify_interest_credit(self): def test_only_runs_for_bank_statements(self): """Should not classify non-bank statements.""" classifier = BankStatementPatternClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "SEPA CREDIT", - "Credit_AMT": "100.00", - "document_type": "credit_card_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "SEPA CREDIT", + "Credit_AMT": "100.00", + "document_type": "credit_card_statement", + } + ) result = classifier._do_classify(transaction, None) @@ -368,13 +389,15 @@ class TestAmountBasedClassifier: def test_debit_only_credit_card_classified_as_purchase(self): """Should classify debit-only credit card transaction as purchase.""" classifier = AmountBasedClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "MERCHANT NAME", - "Debit_AMT": "50.00", - "Credit_AMT": None, - "document_type": "credit_card_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "MERCHANT NAME", + "Debit_AMT": "50.00", + "Credit_AMT": None, + "document_type": "credit_card_statement", + } + ) result = classifier.classify(transaction, None) @@ -383,13 +406,15 @@ def test_debit_only_credit_card_classified_as_purchase(self): def test_credit_only_credit_card_classified_as_refund(self): """Should classify credit-only credit card transaction as refund.""" classifier = AmountBasedClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "MERCHANT NAME", - "Debit_AMT": None, - "Credit_AMT": "25.00", - "document_type": "credit_card_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "MERCHANT NAME", + "Debit_AMT": None, + "Credit_AMT": "25.00", + "document_type": "credit_card_statement", + } + ) result = classifier.classify(transaction, None) @@ -398,13 +423,15 @@ def test_credit_only_credit_card_classified_as_refund(self): def test_debit_only_bank_statement_classified_as_payment(self): """Should classify debit-only bank statement as payment.""" classifier = AmountBasedClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "MERCHANT NAME", - "Debit_AMT": "50.00", - "Credit_AMT": None, - "document_type": "bank_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "MERCHANT NAME", + "Debit_AMT": "50.00", + "Credit_AMT": None, + "document_type": "bank_statement", + } + ) result = classifier.classify(transaction, None) @@ -413,13 +440,15 @@ def test_debit_only_bank_statement_classified_as_payment(self): def test_credit_only_bank_statement_classified_as_transfer(self): """Should classify credit-only bank statement as transfer.""" classifier = AmountBasedClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "INCOMING TRANSFER", - "Debit_AMT": None, - "Credit_AMT": "100.00", - "document_type": "bank_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "INCOMING TRANSFER", + "Debit_AMT": None, + "Credit_AMT": "100.00", + "document_type": "bank_statement", + } + ) result = classifier.classify(transaction, None) @@ -428,12 +457,14 @@ def test_credit_only_bank_statement_classified_as_transfer(self): def test_zero_amount_classified_as_fee(self): """Should classify zero amount as fee.""" classifier = AmountBasedClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "MONTHLY CHARGE", - "Debit_AMT": "0.00", - "Credit_AMT": None, - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "MONTHLY CHARGE", + "Debit_AMT": "0.00", + "Credit_AMT": None, + } + ) result = classifier.classify(transaction, None) @@ -449,10 +480,9 @@ class TestDefaultClassifier: def test_always_returns_other(self): """Should always return 'other' as default.""" classifier = DefaultClassifier() - transaction = { - "Date": "01/12/2023", - "Details": "UNCLASSIFIABLE TRANSACTION", - } + transaction = Transaction.from_dict( + {"Date": "01/12/2023", "Details": "UNCLASSIFIABLE TRANSACTION"} + ) result = classifier.classify(transaction, None) @@ -467,12 +497,14 @@ class TestClassifierChain: def test_chain_stops_at_first_match(self, credit_card_template): """Should stop at first classifier that matches.""" - transaction = { - "Date": "01/12/2023", - "Details": "POS TESCO STORES", - "Debit_AMT": "45.23", - "document_type": "credit_card_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "POS TESCO STORES", + "Debit_AMT": "45.23", + "document_type": "credit_card_statement", + } + ) # Template classifier should match first chain = create_transaction_type_classifier_chain("credit_card_statement") @@ -482,11 +514,13 @@ def test_chain_stops_at_first_match(self, credit_card_template): def test_chain_falls_through_to_default(self): """Should fall through to default classifier when nothing matches.""" - transaction = { - "Date": "01/12/2023", - "Details": "UNKNOWN PATTERN", - "document_type": "unknown_type", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "UNKNOWN PATTERN", + "document_type": "unknown_type", + } + ) chain = create_transaction_type_classifier_chain("unknown_type") result = chain.classify(transaction, None) @@ -497,12 +531,14 @@ def test_chain_falls_through_to_default(self): def test_factory_creates_correct_chain_for_credit_cards(self): """Should create credit card chain with appropriate classifiers.""" - transaction = { - "Date": "01/12/2023", - "Details": "CONTACTLESS PAYMENT", - "Debit_AMT": "12.50", - "document_type": "credit_card_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "CONTACTLESS PAYMENT", + "Debit_AMT": "12.50", + "document_type": "credit_card_statement", + } + ) chain = create_transaction_type_classifier_chain("credit_card_statement") result = chain.classify(transaction, None) @@ -511,12 +547,14 @@ def test_factory_creates_correct_chain_for_credit_cards(self): def test_factory_creates_correct_chain_for_bank_statements(self): """Should create bank statement chain with appropriate classifiers.""" - transaction = { - "Date": "01/12/2023", - "Details": "SEPA TRANSFER FROM JOHN", - "Credit_AMT": "100.00", - "document_type": "bank_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "SEPA TRANSFER FROM JOHN", + "Credit_AMT": "100.00", + "document_type": "bank_statement", + } + ) chain = create_transaction_type_classifier_chain("bank_statement") result = chain.classify(transaction, None) @@ -525,11 +563,9 @@ def test_factory_creates_correct_chain_for_bank_statements(self): def test_chain_without_document_type(self): """Should handle None document type gracefully.""" - transaction = { - "Date": "01/12/2023", - "Details": "SOME TRANSACTION", - "Debit_AMT": "50.00", - } + transaction = Transaction.from_dict( + {"Date": "01/12/2023", "Details": "SOME TRANSACTION", "Debit_AMT": "50.00"} + ) chain = create_transaction_type_classifier_chain(None) result = chain.classify(transaction, None) @@ -539,12 +575,14 @@ def test_chain_without_document_type(self): def test_template_keywords_take_priority_over_patterns(self, bank_template): """Template keywords should take priority over generic patterns.""" - transaction = { - "Date": "01/12/2023", - "Details": "SEPA CREDIT", # Both template keyword and pattern - "Credit_AMT": "100.00", - "document_type": "bank_statement", - } + transaction = Transaction.from_dict( + { + "Date": "01/12/2023", + "Details": "SEPA CREDIT", + "Credit_AMT": "100.00", + "document_type": "bank_statement", + } + ) chain = create_transaction_type_classifier_chain("bank_statement") result = chain.classify(transaction, bank_template) diff --git a/packages/parser-core/tests/test_processor.py b/packages/parser-core/tests/test_processor.py index 78fbb4b..021c421 100644 --- a/packages/parser-core/tests/test_processor.py +++ b/packages/parser-core/tests/test_processor.py @@ -91,61 +91,65 @@ def tearDown(self): def test_detect_duplicates_different_files(self): """Test duplicate detection when same transaction appears in different files""" # Same transaction details but different filenames - transaction_data = [ - { - "Date": "01 Jan 2024", - "Details": "Salary Payment", - "Debit €": "", - "Credit €": "3000.00", - "Balance €": "3500.00", - "Filename": "statement1.pdf", - }, - { - "Date": "01 Jan 2024", - "Details": "Salary Payment", - "Debit €": "", - "Credit €": "3000.00", - "Balance €": "3500.00", - "Filename": "statement2.pdf", # Different file - }, - { - "Date": "02 Jan 2024", - "Details": "Coffee Shop", - "Debit €": "5.50", - "Credit €": "", - "Balance €": "3494.50", - "Filename": "statement1.pdf", - }, - ] + transaction_data = dicts_to_transactions( + [ + { + "Date": "01 Jan 2024", + "Details": "Salary Payment", + "Debit €": "", + "Credit €": "3000.00", + "Balance €": "3500.00", + "Filename": "statement1.pdf", + }, + { + "Date": "01 Jan 2024", + "Details": "Salary Payment", + "Debit €": "", + "Credit €": "3000.00", + "Balance €": "3500.00", + "Filename": "statement2.pdf", + }, + { + "Date": "02 Jan 2024", + "Details": "Coffee Shop", + "Debit €": "5.50", + "Credit €": "", + "Balance €": "3494.50", + "Filename": "statement1.pdf", + }, + ] + ) unique, duplicates = self.processor._detect_duplicates(transaction_data) # Should have 2 unique and 1 duplicate self.assertEqual(len(unique), 2) self.assertEqual(len(duplicates), 1) - self.assertEqual(duplicates[0]["Filename"], "statement2.pdf") - self.assertEqual(duplicates[0]["Details"], "Salary Payment") + self.assertEqual(duplicates[0].filename, "statement2.pdf") + self.assertEqual(duplicates[0].details, "Salary Payment") def test_detect_duplicates_same_file(self): """Test that identical transactions from same file are kept (edge case)""" - transaction_data = [ - { - "Date": "01 Jan 2024", - "Details": "ATM Withdrawal", - "Debit €": "100.00", - "Credit €": "", - "Balance €": "400.00", - "Filename": "statement1.pdf", - }, - { - "Date": "01 Jan 2024", - "Details": "ATM Withdrawal", - "Debit €": "100.00", - "Credit €": "", - "Balance €": "300.00", # Different balance - "Filename": "statement1.pdf", - }, - ] + transaction_data = dicts_to_transactions( + [ + { + "Date": "01 Jan 2024", + "Details": "ATM Withdrawal", + "Debit €": "100.00", + "Credit €": "", + "Balance €": "400.00", + "Filename": "statement1.pdf", + }, + { + "Date": "01 Jan 2024", + "Details": "ATM Withdrawal", + "Debit €": "100.00", + "Credit €": "", + "Balance €": "300.00", + "Filename": "statement1.pdf", + }, + ] + ) unique, duplicates = self.processor._detect_duplicates(transaction_data) @@ -155,24 +159,26 @@ def test_detect_duplicates_same_file(self): def test_detect_duplicates_no_duplicates(self): """Test when no duplicates exist""" - transaction_data = [ - { - "Date": "01 Jan 2024", - "Details": "Grocery Store", - "Debit €": "25.50", - "Credit €": "", - "Balance €": "475.50", - "Filename": "statement1.pdf", - }, - { - "Date": "02 Jan 2024", - "Details": "Gas Station", - "Debit €": "40.00", - "Credit €": "", - "Balance €": "435.50", - "Filename": "statement2.pdf", - }, - ] + transaction_data = dicts_to_transactions( + [ + { + "Date": "01 Jan 2024", + "Details": "Grocery Store", + "Debit €": "25.50", + "Credit €": "", + "Balance €": "475.50", + "Filename": "statement1.pdf", + }, + { + "Date": "02 Jan 2024", + "Details": "Gas Station", + "Debit €": "40.00", + "Credit €": "", + "Balance €": "435.50", + "Filename": "statement2.pdf", + }, + ] + ) unique, duplicates = self.processor._detect_duplicates(transaction_data) @@ -400,54 +406,48 @@ def test_detect_duplicates_complex_scenario(self): """Test duplicate detection with complex real-world scenarios""" processor = create_test_processor(self.input_dir, self.output_dir) - transaction_data = [ - # Same transaction, same file (should be unique) - { - "Date": "01 Jan 2024", - "Details": "VDC-GROCERY STORE", - "Debit €": "25.50", - "Credit €": "", - "Balance €": "475.50", - "Filename": "statement1.pdf", - }, - # Same transaction, different file (should be duplicate) - { - "Date": "01 Jan 2024", - "Details": "VDC-GROCERY STORE", - "Debit €": "25.50", - "Credit €": "", - "Balance €": "475.50", - "Filename": "statement2.pdf", - }, - # Similar transaction, different amount (should be unique) - { - "Date": "01 Jan 2024", - "Details": "VDC-GROCERY STORE", - "Debit €": "26.50", # Different amount - "Credit €": "", - "Balance €": "474.50", - "Filename": "statement3.pdf", - }, - # Same transaction details, different date (should be unique) - { - "Date": "02 Jan 2024", # Different date - "Details": "VDC-GROCERY STORE", - "Debit €": "25.50", - "Credit €": "", - "Balance €": "475.50", - "Filename": "statement4.pdf", - }, - ] + transaction_data = dicts_to_transactions( + [ + { + "Date": "01 Jan 2024", + "Details": "VDC-GROCERY STORE", + "Debit €": "25.50", + "Credit €": "", + "Balance €": "475.50", + "Filename": "statement1.pdf", + }, + { + "Date": "01 Jan 2024", + "Details": "VDC-GROCERY STORE", + "Debit €": "25.50", + "Credit €": "", + "Balance €": "475.50", + "Filename": "statement2.pdf", + }, + { + "Date": "01 Jan 2024", + "Details": "VDC-GROCERY STORE", + "Debit €": "26.50", + "Credit €": "", + "Balance €": "474.50", + "Filename": "statement3.pdf", + }, + { + "Date": "02 Jan 2024", + "Details": "VDC-GROCERY STORE", + "Debit €": "25.50", + "Credit €": "", + "Balance €": "475.50", + "Filename": "statement4.pdf", + }, + ] + ) unique, duplicates = processor._detect_duplicates(transaction_data) - # Should have 3 unique (original + different amount + different date) - # and 1 duplicate (same transaction from different file) self.assertEqual(len(unique), 3) self.assertEqual(len(duplicates), 1) - - # Verify the duplicate is correctly identified - self.assertEqual(duplicates[0]["Filename"], "statement2.pdf") + self.assertEqual(duplicates[0].filename, "statement2.pdf") class TestTransactionDateParsing(unittest.TestCase): diff --git a/packages/parser-core/tests/test_processor_refactored_methods.py b/packages/parser-core/tests/test_processor_refactored_methods.py index ccb22ac..37df124 100644 --- a/packages/parser-core/tests/test_processor_refactored_methods.py +++ b/packages/parser-core/tests/test_processor_refactored_methods.py @@ -140,18 +140,22 @@ def test_process_all_pdfs_no_files(self, mock_extract): def test_sort_transactions_by_date_mixed_dates(self): """Test _sort_transactions_by_date with various date formats""" - rows = [ - {"Date": "15/06/23", "Details": "Third"}, - {"Date": "01/01/23", "Details": "First"}, - {"Date": "10/03/23", "Details": "Second"}, - ] + from bankstatements_core.domain.converters import dicts_to_transactions + + rows = dicts_to_transactions( + [ + {"Date": "15/06/23", "Details": "Third"}, + {"Date": "01/01/23", "Details": "First"}, + {"Date": "10/03/23", "Details": "Second"}, + ] + ) sorted_rows = self.processor._sort_transactions_by_date(rows) # Verify chronological order - self.assertEqual(sorted_rows[0]["Details"], "First") - self.assertEqual(sorted_rows[1]["Details"], "Second") - self.assertEqual(sorted_rows[2]["Details"], "Third") + self.assertEqual(sorted_rows[0].details, "First") + self.assertEqual(sorted_rows[1].details, "Second") + self.assertEqual(sorted_rows[2].details, "Third") def test_sort_transactions_by_date_empty_list(self): """Test _sort_transactions_by_date with empty list""" @@ -161,12 +165,14 @@ def test_sort_transactions_by_date_empty_list(self): def test_sort_transactions_by_date_single_item(self): """Test _sort_transactions_by_date with single transaction""" - rows = [{"Date": "01/01/23", "Details": "Only"}] + from bankstatements_core.domain.converters import dicts_to_transactions + + rows = dicts_to_transactions([{"Date": "01/01/23", "Details": "Only"}]) sorted_rows = self.processor._sort_transactions_by_date(rows) self.assertEqual(len(sorted_rows), 1) - self.assertEqual(sorted_rows[0]["Details"], "Only") + self.assertEqual(sorted_rows[0].details, "Only") def test_write_json_file(self): """Test JSON writing through repository""" diff --git a/packages/parser-free/tests/test_coverage_improvements.py b/packages/parser-free/tests/test_coverage_improvements.py index d978884..31068cc 100644 --- a/packages/parser-free/tests/test_coverage_improvements.py +++ b/packages/parser-free/tests/test_coverage_improvements.py @@ -136,20 +136,11 @@ def test_detect_duplicates_same_file_same_transaction(self): processor = BankStatementProcessor(config=config) # Same transaction appearing twice in the same file - all_rows = [ - { - "Date": "01/12/23", - "Details": "Test", - "Debit €": "100", - "Filename": "file1.pdf", - }, - { - "Date": "01/12/23", - "Details": "Test", - "Debit €": "100", - "Filename": "file1.pdf", - }, - ] + from bankstatements_core.domain.converters import dicts_to_transactions + all_rows = dicts_to_transactions([ + {"Date": "01/12/23", "Details": "Test", "Debit €": "100", "Filename": "file1.pdf"}, + {"Date": "01/12/23", "Details": "Test", "Debit €": "100", "Filename": "file1.pdf"}, + ]) unique_rows, duplicate_rows = processor._detect_duplicates(all_rows) From 4c0db460f7613a6cb765aff2d3607a4347244f08 Mon Sep 17 00:00:00 2001 From: longieirl Date: Thu, 26 Mar 2026 17:01:20 +0000 Subject: [PATCH 2/6] test: re-baseline integration snapshot after #71 service layer migration --- .../snapshots/output_snapshot.json | 78 +++---------------- 1 file changed, 9 insertions(+), 69 deletions(-) diff --git a/packages/parser-core/tests/integration/snapshots/output_snapshot.json b/packages/parser-core/tests/integration/snapshots/output_snapshot.json index 1a7fd54..25985b5 100644 --- a/packages/parser-core/tests/integration/snapshots/output_snapshot.json +++ b/packages/parser-core/tests/integration/snapshots/output_snapshot.json @@ -1,38 +1,12 @@ { "files": { - "bank_statements.csv": { - "row_count": 2, - "size_bytes": 109 - }, - "bank_statements.json": { - "record_count": 1, - "size_bytes": 88 - }, "bank_statements_3656.csv": { - "row_count": 67, - "size_bytes": 7872 - }, - "bank_statements_3656.json": { - "record_count": 67, - "size_bytes": 29904 - }, - "bank_statements_3656.xlsx": { - "size_bytes": 7335 + "row_count": 68, + "size_bytes": 6815 }, "bank_statements_9015.csv": { - "row_count": 400, - "size_bytes": 26662 - }, - "bank_statements_9015.json": { - "record_count": 400, - "size_bytes": 156629 - }, - "bank_statements_9015.xlsx": { - "size_bytes": 18959 - }, - "duplicates.json": { - "record_count": 0, - "size_bytes": 2 + "row_count": 401, + "size_bytes": 24161 }, "duplicates_3656.json": { "record_count": 0, @@ -47,16 +21,7 @@ "excluded_files", "summary" ], - "size_bytes": 547 - }, - "expense_analysis.json": { - "keys": [ - "generated_at", - "insights", - "summary", - "total_transactions_analyzed" - ], - "size_bytes": 425 + "size_bytes": 480 }, "expense_analysis_3656.json": { "keys": [ @@ -80,15 +45,6 @@ "record_count": 3, "size_bytes": 606 }, - "monthly_summary.json": { - "keys": [ - "generated_at", - "monthly_data", - "summary", - "total_months" - ], - "size_bytes": 170 - }, "monthly_summary_3656.json": { "keys": [ "generated_at", @@ -96,7 +52,7 @@ "summary", "total_months" ], - "size_bytes": 2032 + "size_bytes": 1988 }, "monthly_summary_9015.json": { "keys": [ @@ -105,39 +61,23 @@ "summary", "total_months" ], - "size_bytes": 1112 + "size_bytes": 1063 } }, - "processing_summary": { - "duplicates": 0, - "pages_read": 24, - "pdf_count": 4, - "pdfs_extracted": 3, - "transactions": 467 - }, "summary": { - "csv_outputs": 3, + "csv_outputs": 2, "output_filenames": [ - "bank_statements.csv", - "bank_statements.json", "bank_statements_3656.csv", - "bank_statements_3656.json", - "bank_statements_3656.xlsx", "bank_statements_9015.csv", - "bank_statements_9015.json", - "bank_statements_9015.xlsx", - "duplicates.json", "duplicates_3656.json", "duplicates_9015.json", "excluded_files.json", - "expense_analysis.json", "expense_analysis_3656.json", "expense_analysis_9015.json", "ibans.json", - "monthly_summary.json", "monthly_summary_3656.json", "monthly_summary_9015.json" ], - "total_files": 19 + "total_files": 10 } } \ No newline at end of file From de0a8f58c18886332506cfdb7b88f64004695115 Mon Sep 17 00:00:00 2001 From: longieirl Date: Thu, 26 Mar 2026 17:08:26 +0000 Subject: [PATCH 3/6] fix(#71): use currency_symbol='' in transactions_to_dicts to match neutral column names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit to_dict() defaulted to currency_symbol='€' producing 'Debit €' keys, but column_names uses neutral 'Debit'/'Credit'/'Balance' keys — causing pandas DataFrame to produce empty columns in CSV output. --- .../src/bankstatements_core/domain/converters.py | 6 +++--- .../tests/integration/snapshots/output_snapshot.json | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/domain/converters.py b/packages/parser-core/src/bankstatements_core/domain/converters.py index b22a6ec..9c213c7 100644 --- a/packages/parser-core/src/bankstatements_core/domain/converters.py +++ b/packages/parser-core/src/bankstatements_core/domain/converters.py @@ -44,7 +44,7 @@ def transactions_to_dicts(transactions: list[Transaction]) -> list[dict]: >>> dicts[0]["Date"] '01/01/23' """ - return [tx.to_dict() for tx in transactions] + return [tx.to_dict(currency_symbol="") for tx in transactions] def dict_to_transaction(row: dict) -> Transaction: @@ -79,7 +79,7 @@ def transaction_to_dict(transaction: Transaction) -> dict: >>> tx = Transaction(date="01/01/23", details="Test", debit="50.00", ... credit=None, balance="100.00", filename="test.pdf") >>> row = transaction_to_dict(tx) - >>> row["Debit €"] + >>> row["Debit"] '50.00' """ - return transaction.to_dict() + return transaction.to_dict(currency_symbol="") diff --git a/packages/parser-core/tests/integration/snapshots/output_snapshot.json b/packages/parser-core/tests/integration/snapshots/output_snapshot.json index 25985b5..a4e75be 100644 --- a/packages/parser-core/tests/integration/snapshots/output_snapshot.json +++ b/packages/parser-core/tests/integration/snapshots/output_snapshot.json @@ -2,11 +2,11 @@ "files": { "bank_statements_3656.csv": { "row_count": 68, - "size_bytes": 6815 + "size_bytes": 7898 }, "bank_statements_9015.csv": { "row_count": 401, - "size_bytes": 24161 + "size_bytes": 26690 }, "duplicates_3656.json": { "record_count": 0, @@ -52,7 +52,7 @@ "summary", "total_months" ], - "size_bytes": 1988 + "size_bytes": 2032 }, "monthly_summary_9015.json": { "keys": [ @@ -61,7 +61,7 @@ "summary", "total_months" ], - "size_bytes": 1063 + "size_bytes": 1112 } }, "summary": { From eb04b407bdd35834ff9aec18648fa6e5fcb749ef Mon Sep 17 00:00:00 2001 From: longieirl Date: Thu, 26 Mar 2026 17:14:40 +0000 Subject: [PATCH 4/6] test(#71): update converter tests to assert neutral column names; add regression test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - test_transaction_to_dict and test_transactions_to_dicts now assert 'Debit', 'Credit', 'Balance' keys (not 'Debit €' etc.) - New test_transactions_to_dicts_keys_match_default_column_names builds a DataFrame with DEFAULT_COLUMNS names to prove key alignment holds --- .../tests/domain/test_converters.py | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/packages/parser-core/tests/domain/test_converters.py b/packages/parser-core/tests/domain/test_converters.py index eb8edf8..821dc2a 100644 --- a/packages/parser-core/tests/domain/test_converters.py +++ b/packages/parser-core/tests/domain/test_converters.py @@ -1,5 +1,7 @@ """Tests for domain converters.""" +import pandas as pd + from bankstatements_core.domain.converters import ( dict_to_transaction, dicts_to_transactions, @@ -30,7 +32,7 @@ def test_dict_to_transaction(): def test_transaction_to_dict(): - """Test converting single Transaction to dict.""" + """Test converting single Transaction to dict uses neutral column names.""" tx = Transaction( date="01/01/23", details="Test Transaction", @@ -45,10 +47,14 @@ def test_transaction_to_dict(): assert isinstance(row, dict) assert row["Date"] == "01/01/23" assert row["Details"] == "Test Transaction" - assert row["Debit €"] == "50.00" - assert row["Credit €"] is None - assert row["Balance €"] == "100.00" + assert row["Debit"] == "50.00" + assert row["Credit"] is None + assert row["Balance"] == "100.00" assert row["Filename"] == "test.pdf" + # Ensure no currency-suffixed keys are present + assert "Debit €" not in row + assert "Credit €" not in row + assert "Balance €" not in row def test_dicts_to_transactions(): @@ -81,7 +87,7 @@ def test_dicts_to_transactions(): def test_transactions_to_dicts(): - """Test converting list of Transactions to dicts.""" + """Test converting list of Transactions to dicts uses neutral column names.""" transactions = [ Transaction( date="01/01/23", @@ -106,4 +112,36 @@ def test_transactions_to_dicts(): assert len(rows) == 2 assert all(isinstance(row, dict) for row in rows) assert rows[0]["Date"] == "01/01/23" - assert rows[1]["Credit €"] == "25.00" + assert rows[1]["Credit"] == "25.00" + # Ensure no currency-suffixed keys are present + assert "Debit €" not in rows[0] + assert "Credit €" not in rows[1] + assert "Balance €" not in rows[0] + + +def test_transactions_to_dicts_keys_match_default_column_names(): + """Regression: dict keys must match DEFAULT_COLUMNS neutral names. + + Previously transactions_to_dicts() produced 'Debit €' / 'Credit €' / + 'Balance €' keys while the DataFrame was built with neutral 'Debit' / + 'Credit' / 'Balance' column names, causing empty columns in CSV output. + """ + from bankstatements_core.config.column_config import get_column_names + + tx = Transaction( + date="01/01/23", + details="Payment", + debit="99.00", + credit=None, + balance="500.00", + filename="test.pdf", + ) + column_names = get_column_names() + + rows = transactions_to_dicts([tx]) + df = pd.DataFrame(rows, columns=column_names) + + assert df["Debit"].iloc[0] == "99.00" + assert df["Balance"].iloc[0] == "500.00" + # Credit is None — pandas should not coerce to NaN due to key mismatch + assert df["Credit"].iloc[0] is None or pd.isna(df["Credit"].iloc[0]) From 9f8e4121a3a3d7eea0ca909d77b20a9fcde92bdd Mon Sep 17 00:00:00 2001 From: longieirl Date: Thu, 26 Mar 2026 17:19:48 +0000 Subject: [PATCH 5/6] style: black formatting fix for parser-free test_coverage_improvements --- .../tests/test_coverage_improvements.py | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/packages/parser-free/tests/test_coverage_improvements.py b/packages/parser-free/tests/test_coverage_improvements.py index 31068cc..87723f1 100644 --- a/packages/parser-free/tests/test_coverage_improvements.py +++ b/packages/parser-free/tests/test_coverage_improvements.py @@ -137,10 +137,23 @@ def test_detect_duplicates_same_file_same_transaction(self): # Same transaction appearing twice in the same file from bankstatements_core.domain.converters import dicts_to_transactions - all_rows = dicts_to_transactions([ - {"Date": "01/12/23", "Details": "Test", "Debit €": "100", "Filename": "file1.pdf"}, - {"Date": "01/12/23", "Details": "Test", "Debit €": "100", "Filename": "file1.pdf"}, - ]) + + all_rows = dicts_to_transactions( + [ + { + "Date": "01/12/23", + "Details": "Test", + "Debit €": "100", + "Filename": "file1.pdf", + }, + { + "Date": "01/12/23", + "Details": "Test", + "Debit €": "100", + "Filename": "file1.pdf", + }, + ] + ) unique_rows, duplicate_rows = processor._detect_duplicates(all_rows) From cabc6b0076285eed1bd9f304688989fb16f5a21f Mon Sep 17 00:00:00 2001 From: longieirl Date: Thu, 26 Mar 2026 20:36:33 +0000 Subject: [PATCH 6/6] fix(#71): fix header detection threshold inflation and document DateAmount key behaviour MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug 1 (transaction_filter.py): _is_header_transaction was calling tx.to_dict() which includes metadata fields (source_page, confidence_score, extraction_warnings, document_type, transaction_type). These inflated non_empty_fields without contributing to matches, diluting the 50% header-detection threshold. Fixed by inspecting only the five data columns (Date, Details, Debit, Credit, Balance) directly from the Transaction object. Bug 2 (strategies.py): DateAmountDuplicateStrategy.create_key now documents that only monetary fields are summed. The old dict-based code accidentally included source_page and confidence_score in the total — the new code is correct and the change in key values is intentional. --- .../src/bankstatements_core/patterns/strategies.py | 8 +++++++- .../services/transaction_filter.py | 12 +++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/packages/parser-core/src/bankstatements_core/patterns/strategies.py b/packages/parser-core/src/bankstatements_core/patterns/strategies.py index abca1c3..58fec98 100644 --- a/packages/parser-core/src/bankstatements_core/patterns/strategies.py +++ b/packages/parser-core/src/bankstatements_core/patterns/strategies.py @@ -115,7 +115,13 @@ class DateAmountDuplicateStrategy(DuplicateDetectionStrategy): """ def create_key(self, transaction: "Transaction") -> str: - """Create key from date and sum of all amounts.""" + """Create key from date and sum of all monetary amounts. + + Only monetary fields (debit, credit, balance) and explicit + additional_fields are summed. Metadata fields such as source_page + and confidence_score are intentionally excluded — the old dict-based + implementation accidentally included them, producing inflated totals. + """ total = 0.0 for field_name in ("debit", "credit", "balance"): value = getattr(transaction, field_name, None) diff --git a/packages/parser-core/src/bankstatements_core/services/transaction_filter.py b/packages/parser-core/src/bankstatements_core/services/transaction_filter.py index 990d3b3..33451ce 100644 --- a/packages/parser-core/src/bankstatements_core/services/transaction_filter.py +++ b/packages/parser-core/src/bankstatements_core/services/transaction_filter.py @@ -152,7 +152,17 @@ def _is_balance_only_row(self, tx: Transaction) -> bool: def _is_header_transaction(self, tx: Transaction) -> bool: """Check if a Transaction is a header row by inspecting its field values.""" - row = tx.to_dict() + # Build a dict of only the data columns — exclude metadata fields + # (source_page, confidence_score, extraction_warnings, document_type, + # transaction_type) so they don't inflate non_empty_fields and dilute + # the 50% header-match threshold. + row = { + "Date": tx.date, + "Details": tx.details, + "Debit": tx.debit, + "Credit": tx.credit, + "Balance": tx.balance, + } return self._is_header_row(row) def _is_header_row(self, row: dict) -> bool: