From e6fa1b33543678efe47e7b7504f6e0200de2e93d Mon Sep 17 00:00:00 2001 From: przemyslawbialon Date: Tue, 21 Apr 2026 15:06:01 +0200 Subject: [PATCH 1/4] fix: Revolut CSV with BOM, lowercase headers, and unknown operations (#33) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by @inobrevi: real Revolut stock export produced `tax: 0 PLN` silently instead of a correct calculation. Three collaborating root causes, all silenced by the loader's `except (ValueError, KeyError)` clause — so users saw "Loaded 0 operations" with no error trace. 1. UTF-8 BOM at start of file Revolut exports begin with `\xef\xbb\xbf`. Plain `open(path, "r")` doesn't strip it, so the first column name becomes `date` and `row['date']` raises KeyError. 2. Lowercase column names Revolut changed `Date` → `date`. Our parsers hardcoded `row['Date']`, `row['Ticker']`, etc. All broke. 3. Unknown operation types (CASH WITHDRAWAL, DEPOSIT, TRANSFER) Not listed in `RowParser.OPERATIONS`. `_operation_type()` returned None, downstream parser returned None, row silently dropped. Central CSV opener at `pit38/data_sources/csv_utils.py`: - `encoding="utf-8-sig"` — strips BOM if present, no-op otherwise. - `newline=""` — Python csv module best practice for quoted fields. - Normalize fieldnames to `strip().lower()` at read time so downstream code uses a stable lowercase form regardless of broker capitalization. Write side (`generic_saver.py`) unchanged — emits plain UTF-8 without BOM. Postel's law: liberal in what we accept, strict in what we send. All 5 broker readers migrated to the new util: - `data_sources/stock_loader/csv_loader.py` - `data_sources/crypto_loader/csv_loader.py` - `plugins/stock/revolut/csv.py` - `plugins/stock/etrade/csv.py` - `plugins/crypto/revolut/csv.py` - `plugins/crypto/binance/csv.py` Column refs updated to lowercase across Revolut stock/crypto, E*Trade, and Binance parsers. `CsvService.read_with_summary()` returns a `ReadResult(records, skipped_by_type: Counter)`. `pit38 import revolut-stock` prints a post-import summary: Skipped 3 rows (operation types not recognized as tax-relevant): • CASH WITHDRAWAL: 1 rows • DEPOSIT: 1 rows • TRANSFER: 1 rows No more silent drops. User can see what was ignored and flag misclassifications (e.g. "INTEREST: 200 rows — is this taxable?"). New fixture `tests/e2e/fixtures/revolut_stock_real.csv` has BOM, lowercase headers, and mixed known/unknown operation types — exactly what the synthetic fixture lacked. `test_revolut_real_export_with_bom_and_unknown_operations` verifies parser handles all three problems. - 93 tests pass (was 92; added one regression test) - `pit38 import revolut-stock` on the real-world fixture produces 2 tx + 1 dividend, reports 3 skipped rows, writes clean UTF-8 output - `pit38 stock` on that output produces 564 PLN profit / 107 PLN tax (i.e. non-zero, no silent 0) Closes #33. --- pit38/cli.py | 33 ++++++++++-- .../data_sources/crypto_loader/csv_loader.py | 7 ++- pit38/data_sources/csv_utils.py | 35 +++++++++++++ pit38/data_sources/stock_loader/csv_loader.py | 7 ++- pit38/plugins/crypto/binance/csv.py | 13 +++-- pit38/plugins/crypto/revolut/csv.py | 6 +-- pit38/plugins/crypto/revolut/row_parser.py | 14 +++--- pit38/plugins/stock/etrade/csv.py | 23 ++++----- pit38/plugins/stock/revolut/csv.py | 50 +++++++++++++++---- pit38/plugins/stock/revolut/row_parser.py | 16 +++--- .../stock/revolut/transaction_row_parser.py | 2 +- tests/e2e/fixtures/revolut_stock_real.csv | 7 +++ tests/e2e/test_stock_e2e.py | 28 +++++++++++ tests/test_revolut_row_parser.py | 26 +++++----- 14 files changed, 196 insertions(+), 71 deletions(-) create mode 100644 pit38/data_sources/csv_utils.py create mode 100644 tests/e2e/fixtures/revolut_stock_real.csv diff --git a/pit38/cli.py b/pit38/cli.py index fc10cae..c15d9bd 100644 --- a/pit38/cli.py +++ b/pit38/cli.py @@ -24,6 +24,24 @@ def import_cmd(): pass +def _print_skipped_summary(skipped_by_type: "Counter") -> None: + """Pretty-print a skip summary to stderr for the user to review.""" + if not skipped_by_type: + return + total = sum(skipped_by_type.values()) + click.echo( + f"\nSkipped {total} rows (operation types not recognized as tax-relevant):", + err=True, + ) + for op_type, count in skipped_by_type.most_common(): + click.echo(f" • {op_type}: {count} rows", err=True) + click.echo( + "\nIf you believe any of these should be taxed, please check with your\n" + "tax advisor and open an issue: https://github.com/pbialon/pit-38/issues", + err=True, + ) + + @import_cmd.command("revolut-stock") @click.option("-i", "--input", "input_path", type=click.Path(exists=True), required=True, help="Revolut stock export CSV") @click.option("-o", "--output", "output_path", type=click.Path(), required=True, help="Output standardized CSV") @@ -36,10 +54,17 @@ def import_revolut_stock(input_path, output_path, log_level): from pit38.plugins.stock.revolut.operation_row_parser import OperationRowParser from pit38.plugins.stock.generic_saver import GenericCsvSaver - transactions = CsvService(input_path, TransactionRowParser).read() - operations = CsvService(input_path, OperationRowParser).read() - GenericCsvSaver.save(transactions, operations, output_path) - click.echo(f"Saved {len(transactions)} transactions and {len(operations)} operations to {output_path}") + # Both parsers scan every row; they skip the same unknown-type rows, so + # the skip counters are identical. Report just once (from the first pass). + transaction_result = CsvService(input_path, TransactionRowParser).read_with_summary() + operation_result = CsvService(input_path, OperationRowParser).read_with_summary() + + GenericCsvSaver.save(transaction_result.records, operation_result.records, output_path) + click.echo( + f"Saved {len(transaction_result.records)} transactions and " + f"{len(operation_result.records)} operations to {output_path}" + ) + _print_skipped_summary(transaction_result.skipped_by_type) @import_cmd.command("revolut-crypto") diff --git a/pit38/data_sources/crypto_loader/csv_loader.py b/pit38/data_sources/crypto_loader/csv_loader.py index 9b77dd7..95d4eee 100644 --- a/pit38/data_sources/crypto_loader/csv_loader.py +++ b/pit38/data_sources/crypto_loader/csv_loader.py @@ -1,8 +1,8 @@ -import csv from typing import List, Optional import pendulum from loguru import logger +from pit38.data_sources.csv_utils import open_csv_reader from pit38.domain.transactions import Transaction, AssetValue, Action from pit38.domain.currency_exchange_service.currencies import parse_currency, FiatValue, Currency, InvalidCurrencyException @@ -11,9 +11,8 @@ class Loader: def load(cls, file_path: str) -> List[Transaction]: transactions = [] logger.info(f"Loading transactions from {file_path}...") - - with open(file_path, "r") as csvfile: - reader = csv.DictReader(csvfile) + + with open_csv_reader(file_path) as reader: for row in reader: transaction = cls._parse_row(row) if transaction: diff --git a/pit38/data_sources/csv_utils.py b/pit38/data_sources/csv_utils.py new file mode 100644 index 0000000..67f671e --- /dev/null +++ b/pit38/data_sources/csv_utils.py @@ -0,0 +1,35 @@ +"""Shared CSV reading utilities. + +Centralizes two quirks that broke real-world broker exports (#33): + +1. UTF-8 BOM — Revolut (and many Excel-generated CSVs) prefix files with + `\xef\xbb\xbf`. Opening with plain `"r"` leaves the BOM in the first + column name (e.g. `'\\ufeffdate'` instead of `'date'`). `utf-8-sig` + strips it cleanly and is a no-op for files without BOM. + +2. Header case drift — brokers change column capitalization between + exports (`Date` → `date`). We normalize headers to `lower().strip()` + once at read time so downstream code uses a stable form. + +Write side stays as plain `utf-8` (no BOM) — Postel's law: liberal in +what we accept, strict in what we emit. +""" +import csv +from contextlib import contextmanager +from typing import Iterator + + +@contextmanager +def open_csv_reader(path: str, delimiter: str = ",") -> Iterator[csv.DictReader]: + """Open CSV tolerantly: strips UTF-8 BOM and normalizes headers. + + Usage: + with open_csv_reader("export.csv") as reader: + for row in reader: + value = row["date"] # always lowercase + """ + with open(path, "r", encoding="utf-8-sig", newline="") as f: + reader = csv.DictReader(f, delimiter=delimiter) + if reader.fieldnames: + reader.fieldnames = [name.strip().lower() for name in reader.fieldnames] + yield reader diff --git a/pit38/data_sources/stock_loader/csv_loader.py b/pit38/data_sources/stock_loader/csv_loader.py index 9497a34..85449c1 100644 --- a/pit38/data_sources/stock_loader/csv_loader.py +++ b/pit38/data_sources/stock_loader/csv_loader.py @@ -1,8 +1,8 @@ -import csv from typing import List, Optional import pendulum from loguru import logger +from pit38.data_sources.csv_utils import open_csv_reader from pit38.domain.stock.operations.operation import Operation, OperationType from pit38.data_sources.stock_loader.factory import OperationFactory from pit38.domain.currency_exchange_service.currencies import parse_currency, FiatValue, InvalidCurrencyException @@ -14,9 +14,8 @@ class Loader: def load(cls, file_path: str) -> List[Operation | Transaction]: operations = [] logger.info(f"Loading stock operations from {file_path}...") - - with open(file_path, "r") as csvfile: - reader = csv.DictReader(csvfile) + + with open_csv_reader(file_path) as reader: for row in reader: operation = cls._parse_row(row) if operation: diff --git a/pit38/plugins/crypto/binance/csv.py b/pit38/plugins/crypto/binance/csv.py index d34a786..8042241 100644 --- a/pit38/plugins/crypto/binance/csv.py +++ b/pit38/plugins/crypto/binance/csv.py @@ -1,10 +1,10 @@ -import csv from datetime import datetime from enum import Enum from typing import List import pendulum from loguru import logger +from pit38.data_sources.csv_utils import open_csv_reader from pit38.domain.currency_exchange_service.currencies import CURRENCY_MAP, FiatValue from pit38.domain.transactions.action import Action from pit38.domain.transactions.asset import AssetValue @@ -19,10 +19,10 @@ class BinanceOperationType(Enum): class BinanceTransaction: def __init__(self, row: dict): - self.utc_time = datetime.strptime(row["UTC_Time"], "%Y-%m-%d %H:%M:%S") - self.operation = row["Operation"] - self.coin = row["Coin"] - self.change = float(row["Change"]) + self.utc_time = datetime.strptime(row["utc_time"], "%Y-%m-%d %H:%M:%S") + self.operation = row["operation"] + self.coin = row["coin"] + self.change = float(row["change"]) def operation_type(self) -> BinanceOperationType: if self.operation == BinanceOperationType.CONVERT.value: @@ -75,8 +75,7 @@ def _process_transaction_operations(self, binance_transactions: List[BinanceTran def read(self, file_path: str) -> List[Transaction]: convert_operations = [] transaction_operations = [] - with open(file_path, 'r') as file: - reader = csv.DictReader(file) + with open_csv_reader(file_path) as reader: for row in reader: binance_transaction = BinanceTransaction(row) if binance_transaction.operation_type() == BinanceOperationType.DEPOSIT: diff --git a/pit38/plugins/crypto/revolut/csv.py b/pit38/plugins/crypto/revolut/csv.py index b71833a..873caea 100644 --- a/pit38/plugins/crypto/revolut/csv.py +++ b/pit38/plugins/crypto/revolut/csv.py @@ -1,7 +1,8 @@ from typing import List + from loguru import logger -import csv +from pit38.data_sources.csv_utils import open_csv_reader from pit38.domain.transactions.transaction import Transaction from pit38.plugins.crypto.revolut.row_parser import RowParser @@ -13,8 +14,7 @@ def __init__(self, row_parser: RowParser): def read(self, input_path: str) -> List[Transaction]: transactions = [] logger.info(f"Reading transactions from {input_path}...") - with open(input_path, 'r') as csvfile: - reader = csv.DictReader(csvfile, delimiter=',') + with open_csv_reader(input_path) as reader: for row in reader: transaction = self.row_parser.parse(row) if not transaction: diff --git a/pit38/plugins/crypto/revolut/row_parser.py b/pit38/plugins/crypto/revolut/row_parser.py index 31386c3..a2bd1e3 100644 --- a/pit38/plugins/crypto/revolut/row_parser.py +++ b/pit38/plugins/crypto/revolut/row_parser.py @@ -29,16 +29,16 @@ def parse(cls, row: Dict) -> Transaction: @classmethod def _crypto_value(cls, row: dict) -> AssetValue: - currency = row["Symbol"] - amount = row["Quantity"].replace(",", "") + currency = row["symbol"] + amount = row["quantity"].replace(",", "") return AssetValue(float(amount), currency) @classmethod def _fiat_value(cls, row: dict) -> FiatValue: - if row["Value"] == "": + if row["value"] == "": return FiatValue(0, Currency.ZLOTY) - value = row["Value"].replace(",", "") + value = row["value"].replace(",", "") amount_match = re.search(r'\d+\.?\d{2}', value) if not amount_match: @@ -53,16 +53,16 @@ def _fiat_value(cls, row: dict) -> FiatValue: @classmethod def _action(cls, row: dict) -> Action: - if row["Type"] in BUY_OPERATION_TYPES: + if row["type"] in BUY_OPERATION_TYPES: return Action.BUY - if row["Type"] in SELL_OPERATION_TYPES: + if row["type"] in SELL_OPERATION_TYPES: return Action.SELL # Staking and unstaking are not important for tax purposes return None @classmethod def _datetime(cls, row: dict) -> pendulum.DateTime: - date_str = row["Date"].replace("Sept", "Sep") # revolut uses Sept instead of Sep + date_str = row["date"].replace("Sept", "Sep") # revolut uses Sept instead of Sep supported_formats = ( "DD MMM YYYY, HH:mm:ss", "MMM DD, YYYY, hh:mm:ss A", diff --git a/pit38/plugins/stock/etrade/csv.py b/pit38/plugins/stock/etrade/csv.py index e990d4b..1850df6 100644 --- a/pit38/plugins/stock/etrade/csv.py +++ b/pit38/plugins/stock/etrade/csv.py @@ -1,8 +1,8 @@ -import csv from typing import Dict, List, Tuple from loguru import logger import pendulum +from pit38.data_sources.csv_utils import open_csv_reader from pit38.domain.currency_exchange_service.currencies import FiatValue from pit38.domain.transactions.action import Action from pit38.domain.transactions.asset import AssetValue @@ -15,10 +15,9 @@ class EtradeCsvReader: def read(cls, file_path: str) -> List[Transaction]: transactions = [] logger.info(f"Reading transactions from {file_path}...") - with open(file_path, "r") as csvfile: - reader = csv.DictReader(csvfile, delimiter=",") + with open_csv_reader(file_path) as reader: for row in reader: - if row["Record Type"] == "Summary": + if row["record type"] == "Summary": continue buy, sell = cls.parse_row(row) transactions.extend((buy, sell)) @@ -42,28 +41,28 @@ def _buy_transaction(cls, row: Dict) -> Transaction: asset=cls._asset(row), fiat_value=cls._buy_cost(row), action=Action.BUY, - date=pendulum.parse(str(row["Date Acquired"]), strict=False), + date=pendulum.parse(str(row["date acquired"]), strict=False), ) - + @classmethod def _sell_transaction(cls, row: Dict) -> Transaction: return Transaction( asset=cls._asset(row), fiat_value=cls._sell_cost(row), action=Action.SELL, - date=pendulum.parse(str(row["Date Sold"]), strict=False), + date=pendulum.parse(str(row["date sold"]), strict=False), ) @classmethod def _asset(cls, row: Dict) -> AssetValue: - quantity = float(row['Qty.']) - stock_name = row['Symbol'] + quantity = float(row['qty.']) + stock_name = row['symbol'] return AssetValue(quantity, stock_name) - + @classmethod def _buy_cost(cls, row: Dict) -> FiatValue: - return FiatValueParser.parse(row["Acquisition Cost"]) + return FiatValueParser.parse(row["acquisition cost"]) @classmethod def _sell_cost(cls, row: Dict) -> FiatValue: - return FiatValueParser.parse(row["Total Proceeds"]) \ No newline at end of file + return FiatValueParser.parse(row["total proceeds"]) \ No newline at end of file diff --git a/pit38/plugins/stock/revolut/csv.py b/pit38/plugins/stock/revolut/csv.py index 240ee3e..214415f 100644 --- a/pit38/plugins/stock/revolut/csv.py +++ b/pit38/plugins/stock/revolut/csv.py @@ -1,28 +1,60 @@ -import csv +from collections import Counter +from dataclasses import dataclass, field from typing import List from loguru import logger +from pit38.data_sources.csv_utils import open_csv_reader from pit38.domain.stock.operations.operation import Operation from pit38.domain.transactions.transaction import Transaction from pit38.plugins.stock.revolut.row_parser import RowParser +@dataclass +class ReadResult: + """Result of reading a broker CSV: parsed records plus a summary of + rows we intentionally skipped (unknown operation types, empty rows).""" + records: List[Operation | Transaction] + skipped_by_type: Counter = field(default_factory=Counter) + + @property + def total_skipped(self) -> int: + return sum(self.skipped_by_type.values()) + + class CsvService: def __init__(self, path: str, row_parser: RowParser): self.path = path self.row_parser = row_parser def read(self) -> List[Operation | Transaction]: - records = [] + """Backward-compatible: return only the records list. + + For the full result with skip summary, use read_with_summary(). + """ + return self.read_with_summary().records + + def read_with_summary(self) -> ReadResult: + records: List[Operation | Transaction] = [] + skipped: Counter = Counter() logger.info(f"Reading records from {self.path}...") - with open(self.path, 'r') as csvfile: - reader = csv.DictReader(csvfile, delimiter=',') + + with open_csv_reader(self.path) as reader: for row in reader: - operation = self.row_parser.parse(row) - if not operation: + op_type = self.row_parser._operation_type(row) + if op_type is None: + raw_type = (row.get("type") or "").strip() or "" + skipped[raw_type] += 1 + continue + + record = self.row_parser.parse(row) + if record is None: + # Parser decided this specific row doesn't belong to its + # OPERATIONS_HANDLED set (e.g. TransactionRowParser sees + # a DIVIDEND row). Not an error — different parser will + # pick it up on another pass. continue + records.append(record) - records.append(operation) - logger.info(f"Parsed {len(records)} records") - return records \ No newline at end of file + logger.info(f"Parsed {len(records)} records ({sum(skipped.values())} rows skipped)") + return ReadResult(records=records, skipped_by_type=skipped) diff --git a/pit38/plugins/stock/revolut/row_parser.py b/pit38/plugins/stock/revolut/row_parser.py index 5581d2d..cf89834 100644 --- a/pit38/plugins/stock/revolut/row_parser.py +++ b/pit38/plugins/stock/revolut/row_parser.py @@ -25,7 +25,7 @@ def parse(cls, row: Dict) -> Transaction: @classmethod def _fiat_value(cls, row: Dict) -> FiatValue: - raw = row['Total Amount'] + raw = row['total amount'] if raw.startswith("-"): raw = raw[1:] @@ -45,26 +45,26 @@ def _fiat_value(cls, row: Dict) -> FiatValue: cls._validate_currency(row, currency) return FiatValue(amount, currency) - raise ValueError(f"Cannot parse Total Amount: '{row['Total Amount']}')") + raise ValueError(f"Cannot parse Total Amount: '{row['total amount']}')") @classmethod def _validate_currency(cls, row: Dict, parsed_currency: Currency) -> None: - if 'Currency' in row and row['Currency']: - expected = parse_currency(row['Currency']) + if 'currency' in row and row['currency']: + expected = parse_currency(row['currency']) if expected != parsed_currency: raise ValueError( f"Currency mismatch: Total Amount implies {parsed_currency}, " - f"but Currency column says {expected} (row: {row['Total Amount']})" + f"but Currency column says {expected} (row: {row['total amount']})" ) @classmethod def _stock(cls, row: Dict) -> str: - return row['Ticker'] + return row['ticker'] @classmethod def _date(cls, row: dict) -> pendulum.DateTime: - return pendulum.parse(row['Date']) + return pendulum.parse(row['date']) @classmethod def _operation_type(cls, row: dict) -> OperationType: - return cls.OPERATIONS.get(row['Type']) + return cls.OPERATIONS.get(row['type']) diff --git a/pit38/plugins/stock/revolut/transaction_row_parser.py b/pit38/plugins/stock/revolut/transaction_row_parser.py index bf98aea..c6406a4 100644 --- a/pit38/plugins/stock/revolut/transaction_row_parser.py +++ b/pit38/plugins/stock/revolut/transaction_row_parser.py @@ -38,4 +38,4 @@ def _asset(cls, row: Dict) -> AssetValue: @classmethod def _quantity(cls, row: Dict) -> float: - return float(row['Quantity']) \ No newline at end of file + return float(row['quantity']) \ No newline at end of file diff --git a/tests/e2e/fixtures/revolut_stock_real.csv b/tests/e2e/fixtures/revolut_stock_real.csv new file mode 100644 index 0000000..02ad688 --- /dev/null +++ b/tests/e2e/fixtures/revolut_stock_real.csv @@ -0,0 +1,7 @@ +date,Ticker,Type,Quantity,Price per share,Total Amount,Currency,FX Rate +2024-01-15T10:00:00.000000Z,AAPL,BUY - MARKET,10,USD 150.00,USD 1500.00,USD,0.25 +2024-02-01T14:30:00.000000Z,,CASH WITHDRAWAL,,,USD -100.00,USD,0.25 +2024-03-10T09:15:00.000000Z,AAPL,SELL - MARKET,5,USD 180.00,USD 900.00,USD,0.26 +2024-04-05T16:00:00.000000Z,,DEPOSIT,,,USD 500.00,USD,0.25 +2024-05-20T11:45:00.000000Z,MSFT,DIVIDEND,,,USD 2.50,USD,0.25 +2024-06-01T12:00:00.000000Z,,TRANSFER,,,USD -50.00,USD,0.25 diff --git a/tests/e2e/test_stock_e2e.py b/tests/e2e/test_stock_e2e.py index 498ad04..f62f27a 100644 --- a/tests/e2e/test_stock_e2e.py +++ b/tests/e2e/test_stock_e2e.py @@ -34,6 +34,34 @@ def test_revolut_import_then_calculate(self): loaded = StockLoader.load(tmp_path) self.assertEqual(len(loaded), len(transactions)) + def test_revolut_real_export_with_bom_and_unknown_operations(self): + """Regression for #33 — real Revolut CSV has UTF-8 BOM, lowercase + headers (`date` not `Date`), and non-tax operations (CASH WITHDRAWAL, + DEPOSIT, TRANSFER). Each broke the parser before; now: + - BOM is stripped by utf-8-sig encoding + - Headers normalized to lowercase at read time + - Unknown operation types are counted and skipped, not crashed + """ + from pit38.plugins.stock.revolut.operation_row_parser import OperationRowParser + + revolut_csv = FIXTURES / "revolut_stock_real.csv" + # Sanity: fixture actually has BOM + self.assertEqual(revolut_csv.read_bytes()[:3], b"\xef\xbb\xbf") + + # Transactions: BUY + SELL rows parsed + tx_result = CsvService(str(revolut_csv), TransactionRowParser).read_with_summary() + self.assertEqual(len(tx_result.records), 2, "Should find BUY and SELL") + + # Operations: DIVIDEND row parsed + op_result = CsvService(str(revolut_csv), OperationRowParser).read_with_summary() + self.assertEqual(len(op_result.records), 1, "Should find DIVIDEND") + + # Skipped summary includes all non-tax rows + self.assertEqual(tx_result.skipped_by_type["CASH WITHDRAWAL"], 1) + self.assertEqual(tx_result.skipped_by_type["DEPOSIT"], 1) + self.assertEqual(tx_result.skipped_by_type["TRANSFER"], 1) + self.assertEqual(tx_result.total_skipped, 3) + def test_standardized_csv_to_tax_result(self): csv_path = FIXTURES / "standardized_stock.csv" diff --git a/tests/test_revolut_row_parser.py b/tests/test_revolut_row_parser.py index db40b61..7f1960e 100644 --- a/tests/test_revolut_row_parser.py +++ b/tests/test_revolut_row_parser.py @@ -5,54 +5,56 @@ class TestFiatValueParsing(TestCase): + # NOTE: column names are lowercase because `open_csv_reader` normalizes + # headers to `strip().lower()` at read time (see data_sources/csv_utils.py). def test_old_format_usd(self): - row = {'Total Amount': '$500', 'Currency': 'USD'} + row = {'total amount': '$500', 'currency': 'USD'} self.assertEqual(RowParser._fiat_value(row), FiatValue(500, Currency.DOLLAR)) def test_old_format_usd_with_comma(self): - row = {'Total Amount': '$1,003.01', 'Currency': 'USD'} + row = {'total amount': '$1,003.01', 'currency': 'USD'} self.assertEqual(RowParser._fiat_value(row), FiatValue(1003.01, Currency.DOLLAR)) def test_old_format_negative(self): - row = {'Total Amount': '-$529.68', 'Currency': 'USD'} + row = {'total amount': '-$529.68', 'currency': 'USD'} self.assertEqual(RowParser._fiat_value(row), FiatValue(529.68, Currency.DOLLAR)) def test_old_format_eur(self): - row = {'Total Amount': '€250.00', 'Currency': 'EUR'} + row = {'total amount': '€250.00', 'currency': 'EUR'} self.assertEqual(RowParser._fiat_value(row), FiatValue(250, Currency.EURO)) def test_new_format_usd(self): - row = {'Total Amount': 'USD 1317.06', 'Currency': 'USD'} + row = {'total amount': 'USD 1317.06', 'currency': 'USD'} self.assertEqual(RowParser._fiat_value(row), FiatValue(1317.06, Currency.DOLLAR)) def test_new_format_eur(self): - row = {'Total Amount': 'EUR 500.00', 'Currency': 'EUR'} + row = {'total amount': 'EUR 500.00', 'currency': 'EUR'} self.assertEqual(RowParser._fiat_value(row), FiatValue(500, Currency.EURO)) def test_new_format_negative(self): - row = {'Total Amount': '-USD 529.68', 'Currency': 'USD'} + row = {'total amount': '-USD 529.68', 'currency': 'USD'} self.assertEqual(RowParser._fiat_value(row), FiatValue(529.68, Currency.DOLLAR)) def test_new_format_with_comma(self): - row = {'Total Amount': 'USD 1,317.06', 'Currency': 'USD'} + row = {'total amount': 'USD 1,317.06', 'currency': 'USD'} self.assertEqual(RowParser._fiat_value(row), FiatValue(1317.06, Currency.DOLLAR)) def test_currency_mismatch_raises_error(self): - row = {'Total Amount': '$500', 'Currency': 'EUR'} + row = {'total amount': '$500', 'currency': 'EUR'} with self.assertRaises(ValueError) as ctx: RowParser._fiat_value(row) self.assertIn("Currency mismatch", str(ctx.exception)) def test_missing_currency_column(self): - row = {'Total Amount': 'USD 1317.06'} + row = {'total amount': 'USD 1317.06'} self.assertEqual(RowParser._fiat_value(row), FiatValue(1317.06, Currency.DOLLAR)) def test_empty_currency_column(self): - row = {'Total Amount': 'EUR 500.00', 'Currency': ''} + row = {'total amount': 'EUR 500.00', 'currency': ''} self.assertEqual(RowParser._fiat_value(row), FiatValue(500, Currency.EURO)) def test_unparseable_amount_raises_error(self): - row = {'Total Amount': '???', 'Currency': 'USD'} + row = {'total amount': '???', 'currency': 'USD'} with self.assertRaises(ValueError) as ctx: RowParser._fiat_value(row) self.assertIn("Cannot parse Total Amount", str(ctx.exception)) From 4f94ee156918949494c84d891947af244cc376c8 Mon Sep 17 00:00:00 2001 From: przemyslawbialon Date: Tue, 21 Apr 2026 15:14:28 +0200 Subject: [PATCH 2/4] test: Add unit coverage for csv_utils, CsvService, and no-BOM output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part of #33 fix — adds focused tests for the three building blocks introduced in the preceding commit. Total: +24 tests (117 passing, was 93). tests/test_csv_utils.py (11 tests) Pins the behaviour of open_csv_reader: - BOM stripping on leading position, preserved in middle of file - Header normalization (lowercase, stripped, multi-word preserved) - Edge cases (empty file, header-only, custom delimiter) - Context manager file-close semantics tests/test_revolut_csv_service.py (9 tests) Integration tests for CsvService.read_with_summary: - All-known-ops → empty skip counter - Mixed known/unknown → correct per-type counts - Empty operation column → tagged as - OperationRowParser vs TransactionRowParser skip asymmetry - ReadResult dataclass invariants (total_skipped) - BOM tolerance at the CsvService boundary tests/test_generic_saver_no_bom.py (4 tests) Postel's law regression — our output must NOT contain BOM: - Stock and crypto savers produce BOM-free output - Output is valid UTF-8 - Round-trip: saver output reads back through our loader cleanly These tests would have caught #33 before the user hit it. The synthetic E2E fixture didn't exercise BOM or case drift — the new unit tests do, in isolation, so future regressions are caught early. --- tests/test_csv_utils.py | 137 ++++++++++++++++++++++++ tests/test_generic_saver_no_bom.py | 94 ++++++++++++++++ tests/test_revolut_csv_service.py | 165 +++++++++++++++++++++++++++++ 3 files changed, 396 insertions(+) create mode 100644 tests/test_csv_utils.py create mode 100644 tests/test_generic_saver_no_bom.py create mode 100644 tests/test_revolut_csv_service.py diff --git a/tests/test_csv_utils.py b/tests/test_csv_utils.py new file mode 100644 index 0000000..eac3532 --- /dev/null +++ b/tests/test_csv_utils.py @@ -0,0 +1,137 @@ +"""Unit tests for pit38.data_sources.csv_utils. + +The open_csv_reader utility is the centerpiece of the fix for #33: it +tolerates UTF-8 BOM and normalizes headers so downstream code doesn't +need to worry about broker-specific quirks. These tests pin the +behaviour so future refactors don't regress it. +""" +import os +import tempfile +from unittest import TestCase + +from pit38.data_sources.csv_utils import open_csv_reader + + +BOM = b"\xef\xbb\xbf" + + +class TestOpenCsvReader(TestCase): + def setUp(self): + self.tmp = tempfile.NamedTemporaryFile( + mode="wb", suffix=".csv", delete=False + ) + self.tmp.close() + self.path = self.tmp.name + + def tearDown(self): + os.unlink(self.path) + + def _write_bytes(self, content: bytes) -> None: + with open(self.path, "wb") as f: + f.write(content) + + # ─── BOM handling ─────────────────────────────────────── + + def test_strips_utf8_bom_from_first_column(self): + """File with UTF-8 BOM at the start: first column name should + come out clean, no \\ufeff prefix.""" + self._write_bytes(BOM + b"date,ticker\n2024-01-01,AAPL\n") + + with open_csv_reader(self.path) as reader: + self.assertEqual(reader.fieldnames, ["date", "ticker"]) + rows = list(reader) + self.assertEqual(len(rows), 1) + self.assertEqual(rows[0]["date"], "2024-01-01") + self.assertEqual(rows[0]["ticker"], "AAPL") + + def test_no_bom_still_works(self): + """Files without BOM (our own output, Linux-generated) read + identically — utf-8-sig is a no-op in that case.""" + self._write_bytes(b"date,ticker\n2024-01-01,AAPL\n") + + with open_csv_reader(self.path) as reader: + self.assertEqual(reader.fieldnames, ["date", "ticker"]) + + def test_bom_in_middle_of_file_is_not_stripped(self): + """Sanity: only leading BOM is handled. We don't aggressively + strip BOMs anywhere in the file (that would be destructive).""" + self._write_bytes(b"date,ticker\n2024-01-01," + BOM + b"AAPL\n") + + with open_csv_reader(self.path) as reader: + rows = list(reader) + # The BOM is preserved in the data (we only cared about the header) + self.assertEqual(rows[0]["ticker"], "AAPL") + + # ─── Header normalization ─────────────────────────────── + + def test_headers_lowercased(self): + """Mixed-case broker headers become stable lowercase form.""" + self._write_bytes(b"Date,Ticker,TYPE\n2024-01-01,AAPL,BUY\n") + + with open_csv_reader(self.path) as reader: + self.assertEqual(reader.fieldnames, ["date", "ticker", "type"]) + + def test_headers_stripped(self): + """Leading/trailing whitespace in headers (common in exports + generated by Excel) is stripped.""" + self._write_bytes(b" Date , Ticker\n2024-01-01,AAPL\n") + + with open_csv_reader(self.path) as reader: + self.assertEqual(reader.fieldnames, ["date", "ticker"]) + + def test_multi_word_headers_preserve_internal_spaces(self): + """`Total Amount` becomes `total amount` — internal spaces + preserved, only edges stripped.""" + self._write_bytes(b"Date,Total Amount\n2024-01-01,100.00\n") + + with open_csv_reader(self.path) as reader: + self.assertEqual(reader.fieldnames, ["date", "total amount"]) + + def test_bom_and_case_together(self): + """The combination that broke #33 — BOM plus capital D — + should yield a clean lowercase 'date' header.""" + self._write_bytes(BOM + b"Date,Ticker\n2024-01-01,AAPL\n") + + with open_csv_reader(self.path) as reader: + self.assertEqual(reader.fieldnames, ["date", "ticker"]) + + # ─── Edge cases ───────────────────────────────────────── + + def test_empty_file_has_no_fieldnames(self): + """Empty file: no crash, no fieldnames.""" + self._write_bytes(b"") + + with open_csv_reader(self.path) as reader: + self.assertIsNone(reader.fieldnames) + + def test_header_only_yields_no_rows(self): + """Header but no data rows: iteration yields nothing.""" + self._write_bytes(b"date,ticker\n") + + with open_csv_reader(self.path) as reader: + self.assertEqual(reader.fieldnames, ["date", "ticker"]) + self.assertEqual(list(reader), []) + + def test_custom_delimiter(self): + """Supports tab-separated files as a use case for future + brokers (e.g. some older exports).""" + self._write_bytes(b"date\tticker\n2024-01-01\tAAPL\n") + + with open_csv_reader(self.path, delimiter="\t") as reader: + self.assertEqual(reader.fieldnames, ["date", "ticker"]) + rows = list(reader) + self.assertEqual(rows[0]["ticker"], "AAPL") + + def test_context_manager_closes_file(self): + """Verify file is closed when context exits. Regression guard + against leaked file descriptors in long-running tools.""" + with open_csv_reader(self.path) as reader: + handle = reader.reader # underlying _csv.reader + # We can't easily check the backing file is open, but we can + # at least ensure no exception is raised on context exit. + pass + # After context exit, attempting to iterate the raw reader + # should fail because the file is closed + with self.assertRaises(ValueError): + # ValueError: I/O operation on closed file + next(iter(handle)) diff --git a/tests/test_generic_saver_no_bom.py b/tests/test_generic_saver_no_bom.py new file mode 100644 index 0000000..2d8daaa --- /dev/null +++ b/tests/test_generic_saver_no_bom.py @@ -0,0 +1,94 @@ +"""Regression tests: our output CSVs must NOT contain a UTF-8 BOM. + +Postel's law applied to #33: we tolerate BOM on input (some brokers +include it), but we emit clean UTF-8 on output. This keeps our +standardized CSV format minimal and interoperable with anything +downstream that reads plain UTF-8. +""" +import os +import pathlib +import tempfile +from unittest import TestCase + +import pendulum + +from pit38.domain.currency_exchange_service.currencies import Currency, FiatValue +from pit38.domain.transactions import Action, AssetValue, Transaction +from pit38.plugins.crypto.generic_saver import GenericCsvSaver as CryptoSaver +from pit38.plugins.stock.generic_saver import GenericCsvSaver as StockSaver + + +BOM = b"\xef\xbb\xbf" + + +def _sample_transaction(): + return Transaction( + asset=AssetValue(1.0, "AAPL"), + fiat_value=FiatValue(150.00, Currency.DOLLAR), + action=Action.BUY, + date=pendulum.parse("2024-01-01T10:00:00Z"), + ) + + +class TestStockSaverNoBOM(TestCase): + def setUp(self): + self.tmp = tempfile.NamedTemporaryFile( + mode="w", suffix=".csv", delete=False + ) + self.tmp.close() + self.path = self.tmp.name + + def tearDown(self): + os.unlink(self.path) + + def test_output_does_not_start_with_bom(self): + """Stock saver must emit plain UTF-8, no BOM. If someone changes + the encoding default, this test fires.""" + StockSaver.save([_sample_transaction()], [], self.path) + + raw = pathlib.Path(self.path).read_bytes() + self.assertFalse( + raw.startswith(BOM), + f"Output CSV starts with BOM ({raw[:3]!r}). " + f"Writer should use plain utf-8, not utf-8-sig.", + ) + + def test_output_is_valid_utf8(self): + """Defensive: output is decodable as utf-8 (no stray bytes).""" + StockSaver.save([_sample_transaction()], [], self.path) + + raw = pathlib.Path(self.path).read_bytes() + # Will raise UnicodeDecodeError if malformed + raw.decode("utf-8") + + def test_output_readable_back_by_our_loader(self): + """Round-trip: write with StockSaver, read with our loader, + verify no BOM-related breakage.""" + from pit38.data_sources.stock_loader.csv_loader import Loader + + StockSaver.save([_sample_transaction()], [], self.path) + + loaded = Loader.load(self.path) + self.assertEqual(len(loaded), 1) + + +class TestCryptoSaverNoBOM(TestCase): + def setUp(self): + self.tmp = tempfile.NamedTemporaryFile( + mode="w", suffix=".csv", delete=False + ) + self.tmp.close() + self.path = self.tmp.name + + def tearDown(self): + os.unlink(self.path) + + def test_output_does_not_start_with_bom(self): + """Crypto saver must also emit plain UTF-8, no BOM.""" + CryptoSaver.save([_sample_transaction()], self.path) + + raw = pathlib.Path(self.path).read_bytes() + self.assertFalse( + raw.startswith(BOM), + f"Output CSV starts with BOM ({raw[:3]!r}).", + ) diff --git a/tests/test_revolut_csv_service.py b/tests/test_revolut_csv_service.py new file mode 100644 index 0000000..ad6c6ff --- /dev/null +++ b/tests/test_revolut_csv_service.py @@ -0,0 +1,165 @@ +"""Tests for pit38.plugins.stock.revolut.csv.CsvService. + +Focuses on the `read_with_summary()` flow introduced for #33: liberal +parsing that counts skipped-by-type instead of failing silently. + +These are mid-level tests — they exercise the full CsvService + row +parser integration, but read from temp files rather than hitting the +real broker fixtures. That lets us dial in specific edge cases. +""" +import os +import tempfile +from unittest import TestCase + +from pit38.plugins.stock.revolut.csv import CsvService, ReadResult +from pit38.plugins.stock.revolut.transaction_row_parser import TransactionRowParser +from pit38.plugins.stock.revolut.operation_row_parser import OperationRowParser + + +HEADER = "date,Ticker,Type,Quantity,Price per share,Total Amount,Currency,FX Rate\n" + + +class TestReadResultDataclass(TestCase): + def test_total_skipped_empty_counter(self): + result = ReadResult(records=[]) + self.assertEqual(result.total_skipped, 0) + + def test_total_skipped_sums_all_types(self): + from collections import Counter + result = ReadResult( + records=[], + skipped_by_type=Counter({"CASH WITHDRAWAL": 3, "DEPOSIT": 2}) + ) + self.assertEqual(result.total_skipped, 5) + + +class TestCsvServiceReadWithSummary(TestCase): + def setUp(self): + self.tmp = tempfile.NamedTemporaryFile( + mode="w", suffix=".csv", delete=False + ) + self.tmp.close() + self.path = self.tmp.name + + def tearDown(self): + os.unlink(self.path) + + def _write(self, body: str) -> None: + with open(self.path, "w", encoding="utf-8") as f: + f.write(HEADER + body) + + # ─── Transaction parser: only BUY/SELL produce records ── + + def test_all_known_tax_operations_produce_records(self): + """BUY + SELL rows yield transactions, empty skip counter.""" + self._write( + "2024-01-01T10:00:00Z,AAPL,BUY - MARKET,10,USD 150.00,USD 1500.00,USD,0.25\n" + "2024-02-01T11:00:00Z,AAPL,SELL - MARKET,5,USD 180.00,USD 900.00,USD,0.26\n" + ) + + result = CsvService(self.path, TransactionRowParser).read_with_summary() + + self.assertEqual(len(result.records), 2) + self.assertEqual(result.total_skipped, 0) + self.assertEqual(dict(result.skipped_by_type), {}) + + def test_unknown_operations_counted_separately_by_type(self): + """Different unknown operation types are tallied by type — user + sees CASH WITHDRAWAL and DEPOSIT as separate counts.""" + self._write( + "2024-01-01T10:00:00Z,,CASH WITHDRAWAL,,,USD -100.00,USD,0.25\n" + "2024-01-02T10:00:00Z,,DEPOSIT,,,USD 500.00,USD,0.25\n" + "2024-01-03T10:00:00Z,,CASH WITHDRAWAL,,,USD -50.00,USD,0.25\n" + ) + + result = CsvService(self.path, TransactionRowParser).read_with_summary() + + self.assertEqual(len(result.records), 0) + self.assertEqual(result.total_skipped, 3) + self.assertEqual(result.skipped_by_type["CASH WITHDRAWAL"], 2) + self.assertEqual(result.skipped_by_type["DEPOSIT"], 1) + + def test_mixed_known_and_unknown_operations(self): + """Real-world mix: some taxable, some not. Each lands in the + right bucket.""" + self._write( + "2024-01-01T10:00:00Z,AAPL,BUY - MARKET,10,USD 150.00,USD 1500.00,USD,0.25\n" + "2024-01-02T10:00:00Z,,CASH WITHDRAWAL,,,USD -100.00,USD,0.25\n" + "2024-01-03T10:00:00Z,AAPL,SELL - MARKET,5,USD 180.00,USD 900.00,USD,0.26\n" + "2024-01-04T10:00:00Z,,TRANSFER,,,USD -50.00,USD,0.25\n" + ) + + result = CsvService(self.path, TransactionRowParser).read_with_summary() + + self.assertEqual(len(result.records), 2, "BUY and SELL") + self.assertEqual(result.total_skipped, 2, "CASH WITHDRAWAL and TRANSFER") + self.assertEqual(result.skipped_by_type["CASH WITHDRAWAL"], 1) + self.assertEqual(result.skipped_by_type["TRANSFER"], 1) + + def test_empty_operation_column_tagged_explicitly(self): + """Row with empty Type column gets reported as `` so the + user knows the CSV has malformed rows, not just unknown types.""" + self._write( + "2024-01-01T10:00:00Z,AAPL,,10,USD 150.00,USD 1500.00,USD,0.25\n" + ) + + result = CsvService(self.path, TransactionRowParser).read_with_summary() + + self.assertEqual(len(result.records), 0) + self.assertEqual(result.total_skipped, 1) + self.assertIn("", result.skipped_by_type) + + # ─── Operation parser: only DIVIDEND/SERVICE_FEE/STOCK_SPLIT ── + + def test_operation_parser_handles_dividend_and_skips_transactions(self): + """OperationRowParser skips BUY/SELL (their parser handles + them). But BUY/SELL are *known* operations — they go in their + own `records` list of this parser as None via parse() returning + None, not as skipped-by-type. Only truly unknown ops count. + """ + self._write( + "2024-01-01T10:00:00Z,AAPL,BUY - MARKET,10,USD 150.00,USD 1500.00,USD,0.25\n" + "2024-02-01T11:00:00Z,MSFT,DIVIDEND,,,USD 2.50,USD,0.25\n" + "2024-03-01T12:00:00Z,,CASH WITHDRAWAL,,,USD -50.00,USD,0.25\n" + ) + + result = CsvService(self.path, OperationRowParser).read_with_summary() + + # Only 1 record (the dividend). BUY is known, skipped by parse() + # (returns None), not counted. + self.assertEqual(len(result.records), 1) + # CASH WITHDRAWAL is unknown — counted + self.assertEqual(result.total_skipped, 1) + self.assertEqual(result.skipped_by_type["CASH WITHDRAWAL"], 1) + # BUY is a known operation type, not in skipped summary + self.assertNotIn("BUY - MARKET", result.skipped_by_type) + + # ─── Backward compatibility: read() returns records only ── + + def test_read_returns_records_without_summary(self): + """Legacy `read()` call site (if any) gets a plain list.""" + self._write( + "2024-01-01T10:00:00Z,AAPL,BUY - MARKET,10,USD 150.00,USD 1500.00,USD,0.25\n" + ) + + records = CsvService(self.path, TransactionRowParser).read() + self.assertEqual(len(records), 1) + # Not a ReadResult — just a list + self.assertIsInstance(records, list) + + # ─── BOM tolerance at CsvService level ───────────────── + + def test_bom_prefixed_file_reads_correctly(self): + """End-to-end: CsvService opens a BOM-prefixed file via + open_csv_reader and parses rows — no KeyError on 'date'.""" + with open(self.path, "wb") as f: + f.write(b"\xef\xbb\xbf") + f.write(HEADER.encode("utf-8")) + f.write( + b"2024-01-01T10:00:00Z,AAPL,BUY - MARKET,10,USD 150.00,USD 1500.00,USD,0.25\n" + ) + + result = CsvService(self.path, TransactionRowParser).read_with_summary() + + self.assertEqual(len(result.records), 1) + self.assertEqual(result.total_skipped, 0) From 10453442b7237e47964de393bd00760bef2a721d Mon Sep 17 00:00:00 2001 From: przemyslawbialon Date: Fri, 24 Apr 2026 12:43:30 +0200 Subject: [PATCH 3/4] fix: support "USD -0.07" + EU number formats via shared normalization (#33) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up on @inobrevi's CUSTODY FEE comment in #33: USD -0.07 (minus BETWEEN code and amount, not before) raised ValueError in the previous regex ([\d,.]+) which didn't cover a leading minus. Silent row-drop in downstream pipeline meant affected users were over-taxed by missing fee costs. Rather than bolting another regex variant on, extracts the two kinds of CSV-value normalization used across broker plugins into a shared module. Number parsing itself delegates to babel (CLDR locale-based). pit38/plugins/normalization.py (new) normalize_currency_layout(raw) — rewrites to canonical "" form, covering all Revolut and E*Trade variants seen in real exports. parse_amount(s) — tries en_US then de_DE locales (strict=True). Handles US "1,234.56", EU "1.234,56" and "1317,06", negatives, nbsp thousand separator. Babel replaces the hand-rolled comma/ space/replace chains both Revolut and E*Trade previously had. Migrated: - Revolut row_parser._fiat_value: single regex split + shared helpers - E*Trade FiatValueParser: drops _clean_up_raw_number / _resolve_currency pyproject.toml: adds babel~=2.14 (~6MB; lighter than pandas). Tests: +29 (total 146, was 117) - tests/test_normalization.py (new) — 22 unit tests pinning behaviour of both normalize_currency_layout and parse_amount - tests/test_revolut_row_parser.py — 3 regression tests for @inobrevi's CUSTODY FEE + EU format + EU-negative combo - tests/test_etrade_fiat_value_parser.py — 4 edge cases (euro, neg, nbsp thousand, unknown symbol) - tests/e2e/fixtures/revolut_stock_real.csv — anonymized CUSTODY FEE row (2024-03-15, USD -0.15) preserving BOM - tests/e2e/test_stock_e2e.py — asserts CUSTODY FEE now reaches the ServiceFee pipeline (was silently dropped before) Verification: pit38 import revolut-stock -i revolut_stock_real.csv -o out.csv now yields "Saved 2 transactions and 2 operations" (vs 2+1 before — CUSTODY FEE was dropped). No silent underreporting. abs() on parse_amount result preserves existing absolute-magnitude semantics — FiatValue(0.07, USD) matches current tax-calc expectations. Signed-amount propagation is a separate refactor (#61 Decimal migration). --- pit38/plugins/normalization.py | 95 ++++++++++++++++++ pit38/plugins/stock/etrade/row_parser.py | 38 +++++--- pit38/plugins/stock/revolut/row_parser.py | 39 ++++---- pyproject.toml | 1 + tests/e2e/fixtures/revolut_stock_real.csv | 1 + tests/e2e/test_stock_e2e.py | 14 ++- tests/test_etrade_fiat_value_parser.py | 34 ++++++- tests/test_normalization.py | 114 ++++++++++++++++++++++ tests/test_revolut_row_parser.py | 20 ++++ 9 files changed, 317 insertions(+), 39 deletions(-) create mode 100644 pit38/plugins/normalization.py create mode 100644 tests/test_normalization.py diff --git a/pit38/plugins/normalization.py b/pit38/plugins/normalization.py new file mode 100644 index 0000000..241f437 --- /dev/null +++ b/pit38/plugins/normalization.py @@ -0,0 +1,95 @@ +"""Shared CSV value normalization helpers for broker plugins. + +Brokers emit various layouts for the same logical value (amounts, +currencies). This module centralizes rewrite rules so plugins can focus +on broker-specific shape differences rather than re-inventing number +parsing or locale detection. + +Used by: + - pit38.plugins.stock.revolut.row_parser + - pit38.plugins.stock.etrade.row_parser +""" +import re + +from babel.numbers import parse_decimal, NumberFormatError + + +# Locales tried in order. en_US covers Revolut's historical exports +# (1,234.56); de_DE covers E*Trade's European output (1 234,56 / 1.234,56) +# and forward-compatible Revolut EU exports. If a new broker needs +# something else (e.g. fr_FR with nbsp thousand separator), add it here. +_NUMBER_LOCALES = ("en_US", "de_DE") + + +def parse_amount(s: str) -> float: + """Parse a number string, trying configured locales in order. + + Handles thousand/decimal separator combinations: + "1,234.56" (en_US) → 1234.56 + "1.234,56" (de_DE) → 1234.56 + "1 234,56" (whitespace stripped, de_DE) → 1234.56 + "-0.07" (en_US) → -0.07 + "-0,07" (de_DE) → -0.07 + + Whitespace (regular space and non-breaking space U+00A0) is stripped + before parsing. Some brokers use these as thousand separators, but + CLDR locales don't uniformly recognize them — stripping first keeps + the code simple. + + Uses babel's strict=True so ambiguous inputs (e.g. "1,317.06" + under de_DE locale, which would otherwise be misinterpreted as + 1.31706) fail fast instead of being silently mis-parsed. The + locale chain then tries the next locale. + + Returns a float for compatibility with existing FiatValue.amount. + When #61 migrates to Decimal, this can return babel's Decimal + directly (it already does internally). + + Raises ValueError if no configured locale can parse the input. + """ + cleaned = s.replace("\xa0", "").replace(" ", "") + for locale in _NUMBER_LOCALES: + try: + return float(parse_decimal(cleaned, locale=locale, strict=True)) + except NumberFormatError: + continue + raise ValueError( + f"Cannot parse amount in any supported locale " + f"({', '.join(_NUMBER_LOCALES)}): {s!r}" + ) + + +def normalize_currency_layout(raw: str) -> str: + """Rewrite a currency+amount string to canonical layout. + + Canonical form: ```` + + Input variants collected from real Revolut and E*Trade exports: + + ============== ================ + input → canonical + ============== ================ + "USD 529.68" → "USD 529.68" (already canonical) + "-USD 529.68" → "USD -529.68" (move sign past currency code) + "USD -0.07" → "USD -0.07" (already; #33 @inobrevi case) + "$500" → "$ 500" (insert space) + "-$529.68" → "$ -529.68" (move sign + insert space) + "$-0.07" → "$ -0.07" (insert space) + "€250.00" → "€ 250.00" (insert space) + "$25 001,75" → "$ 25 001,75" (only insert space; amount preserved) + + Sign stays attached to the amount side. parse_amount() handles the + signed number per-locale. + + Format-only: does NOT strip the sign and does NOT touch the inside + of the amount (digits/separators). That's parse_amount's job. + """ + # Move leading minus past currency code: "-USD X" → "USD -X" + raw = re.sub(r"^-([A-Z]{3}\s+)", r"\1-", raw) + # Move leading minus past currency symbol + insert space: "-$X" → "$ -X" + raw = re.sub(r"^-([^\w\s])", r"\1 -", raw) + # Insert space between currency symbol and amount: "$X" → "$ X" + # Currency codes already have a space after them (3-letter codes are + # word characters, so the regex starts with [^\w\s] = symbol only). + raw = re.sub(r"^([^\w\s])(?=[-\d])", r"\1 ", raw) + return raw diff --git a/pit38/plugins/stock/etrade/row_parser.py b/pit38/plugins/stock/etrade/row_parser.py index 69a1660..e07eac7 100644 --- a/pit38/plugins/stock/etrade/row_parser.py +++ b/pit38/plugins/stock/etrade/row_parser.py @@ -1,5 +1,9 @@ +import re + from loguru import logger + from pit38.domain.currency_exchange_service.currencies import Currency, FiatValue +from pit38.plugins.normalization import normalize_currency_layout, parse_amount class FiatValueParser: @@ -8,23 +12,25 @@ class FiatValueParser: "€": Currency.EURO, } - @classmethod - def _resolve_currency(cls, raw_currency: str) -> Currency: - currency_symbol = raw_currency[0] - return cls.SYMBOL_TO_CURRENCY.get(currency_symbol) - - @classmethod - def _clean_up_raw_number(cls, raw_fiat_value: str) -> str: - without_currency_symbol = raw_fiat_value[1:] - without_spaces = without_currency_symbol.replace(" ", "").replace("\xa0", "") - corrected_commas = without_spaces.replace(",", ".") - - return corrected_commas - @classmethod def parse(cls, raw_fiat_value: str) -> FiatValue: logger.debug(f"Parsing fiat value: {raw_fiat_value}") - currency = cls._resolve_currency(raw_fiat_value) - cleaned_raw_number = cls._clean_up_raw_number(raw_fiat_value) - return FiatValue(amount=float(cleaned_raw_number), currency=currency) \ No newline at end of file + normalized = normalize_currency_layout(raw_fiat_value) + # After normalization: "" + # Split on FIRST whitespace only — amount may legitimately contain + # space (as EU thousand separator) that parse_amount strips. + match = re.match(r'(\S+)\s+(.+)$', normalized) + if not match: + raise ValueError(f"Cannot parse fiat value: '{raw_fiat_value}'") + + symbol, amount_str = match.groups() + currency = cls.SYMBOL_TO_CURRENCY.get(symbol) + if currency is None: + raise ValueError( + f"Unknown currency symbol: {symbol!r} in {raw_fiat_value!r}" + ) + # abs() preserves existing absolute-magnitude semantic — see Revolut + # parser for rationale. Will change when #61 propagates signed amounts. + amount = abs(parse_amount(amount_str)) + return FiatValue(amount=amount, currency=currency) diff --git a/pit38/plugins/stock/revolut/row_parser.py b/pit38/plugins/stock/revolut/row_parser.py index cf89834..0cad0af 100644 --- a/pit38/plugins/stock/revolut/row_parser.py +++ b/pit38/plugins/stock/revolut/row_parser.py @@ -4,8 +4,9 @@ import pendulum from pit38.domain.currency_exchange_service.currencies import Currency, FiatValue, parse_currency -from pit38.domain.transactions import Transaction from pit38.domain.stock.operations.operation import OperationType +from pit38.domain.transactions import Transaction +from pit38.plugins.normalization import normalize_currency_layout, parse_amount class RowParser: @@ -25,27 +26,27 @@ def parse(cls, row: Dict) -> Transaction: @classmethod def _fiat_value(cls, row: Dict) -> FiatValue: - raw = row['total amount'] - if raw.startswith("-"): - raw = raw[1:] + normalized = normalize_currency_layout(row['total amount']) - # "USD 1317.06" or "EUR 500.00" — currency code + space + amount - code_match = re.match(r'([A-Z]{3})\s+([\d,.]+)', raw) - if code_match: - currency = parse_currency(code_match.group(1)) - amount = float(code_match.group(2).replace(",", "")) - cls._validate_currency(row, currency) - return FiatValue(amount, currency) + # After normalization: "" + # (both code "USD 529.68" and symbol "$ 529.68" reach the same form) + match = re.match(r'(\S+)\s+(\S+)$', normalized) + if not match: + raise ValueError(f"Cannot parse Total Amount: '{row['total amount']}'") - # "$1,003.01" or "€500.00" — currency symbol + amount - symbol_match = re.match(r'([^\d\s])([\d,.]+)', raw) - if symbol_match: - currency = parse_currency(symbol_match.group(1)) - amount = float(symbol_match.group(2).replace(",", "")) - cls._validate_currency(row, currency) - return FiatValue(amount, currency) + currency_str, amount_str = match.groups() + try: + currency = parse_currency(currency_str) # accepts "USD" and "$" + except Exception: + raise ValueError(f"Cannot parse Total Amount: '{row['total amount']}'") + # abs() preserves existing "all fiat values are absolute magnitudes" + # semantic — callers treat ServiceFee/Dividend/etc. as cost/income + # regardless of original sign. See #61 for the future refactor that + # will propagate signed amounts through the domain. + amount = abs(parse_amount(amount_str)) - raise ValueError(f"Cannot parse Total Amount: '{row['total amount']}')") + cls._validate_currency(row, currency) + return FiatValue(amount, currency) @classmethod def _validate_currency(cls, row: Dict, parsed_currency: Currency) -> None: diff --git a/pyproject.toml b/pyproject.toml index fba403b..b525526 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "rich~=13.7", "pandas~=2.2.2", "openpyxl~=3.1.2", + "babel~=2.14", ] [project.optional-dependencies] diff --git a/tests/e2e/fixtures/revolut_stock_real.csv b/tests/e2e/fixtures/revolut_stock_real.csv index 02ad688..d3ae6ad 100644 --- a/tests/e2e/fixtures/revolut_stock_real.csv +++ b/tests/e2e/fixtures/revolut_stock_real.csv @@ -5,3 +5,4 @@ 2024-04-05T16:00:00.000000Z,,DEPOSIT,,,USD 500.00,USD,0.25 2024-05-20T11:45:00.000000Z,MSFT,DIVIDEND,,,USD 2.50,USD,0.25 2024-06-01T12:00:00.000000Z,,TRANSFER,,,USD -50.00,USD,0.25 +2024-03-15T12:00:00.000000Z,,CUSTODY FEE,,,USD -0.15,USD,0.25 diff --git a/tests/e2e/test_stock_e2e.py b/tests/e2e/test_stock_e2e.py index f62f27a..d281d3d 100644 --- a/tests/e2e/test_stock_e2e.py +++ b/tests/e2e/test_stock_e2e.py @@ -52,16 +52,24 @@ def test_revolut_real_export_with_bom_and_unknown_operations(self): tx_result = CsvService(str(revolut_csv), TransactionRowParser).read_with_summary() self.assertEqual(len(tx_result.records), 2, "Should find BUY and SELL") - # Operations: DIVIDEND row parsed + # Operations: DIVIDEND + CUSTODY FEE rows parsed op_result = CsvService(str(revolut_csv), OperationRowParser).read_with_summary() - self.assertEqual(len(op_result.records), 1, "Should find DIVIDEND") + self.assertEqual(len(op_result.records), 2, "Should find DIVIDEND and CUSTODY FEE") - # Skipped summary includes all non-tax rows + # Skipped summary includes all non-tax rows (same for both parsers) self.assertEqual(tx_result.skipped_by_type["CASH WITHDRAWAL"], 1) self.assertEqual(tx_result.skipped_by_type["DEPOSIT"], 1) self.assertEqual(tx_result.skipped_by_type["TRANSFER"], 1) self.assertEqual(tx_result.total_skipped, 3) + # Regression for @inobrevi's follow-up: CUSTODY FEE with "USD -X" + # format must parse (was silently dropping before the + # normalize_currency_layout migration) + from pit38.domain.stock.operations.service_fee import ServiceFee + service_fees = [r for r in op_result.records if isinstance(r, ServiceFee)] + self.assertEqual(len(service_fees), 1) + self.assertAlmostEqual(service_fees[0].value.amount, 0.15, places=4) + def test_standardized_csv_to_tax_result(self): csv_path = FIXTURES / "standardized_stock.csv" diff --git a/tests/test_etrade_fiat_value_parser.py b/tests/test_etrade_fiat_value_parser.py index 71daa52..977eea9 100644 --- a/tests/test_etrade_fiat_value_parser.py +++ b/tests/test_etrade_fiat_value_parser.py @@ -18,4 +18,36 @@ def test_parse_dollar_big_number(self): parsed_fiat_value = FiatValueParser.parse(raw_fiat_value) - self.assertEqual(parsed_fiat_value, expected_fiat_value) \ No newline at end of file + self.assertEqual(parsed_fiat_value, expected_fiat_value) + + # ─── Tests added with the shared-normalization migration ── + + def test_parse_euro(self): + """Euro symbol variant (same code path as dollar after migration).""" + self.assertEqual( + FiatValueParser.parse("€250,00"), + FiatValue(amount=250.00, currency=Currency.EURO), + ) + + def test_parse_negative_symbol_amount(self): + """Negative with minus between symbol and amount: '$-0,15'.""" + self.assertEqual( + FiatValueParser.parse("$-0,15"), + FiatValue(amount=0.15, currency=Currency.DOLLAR), + ) + + def test_parse_nbsp_thousand_separator(self): + """Some E*Trade exports use non-breaking space (U+00A0) as the + thousand separator instead of regular space. parse_amount strips + both uniformly.""" + self.assertEqual( + FiatValueParser.parse("$25\xa0001,75"), + FiatValue(amount=25001.75, currency=Currency.DOLLAR), + ) + + def test_parse_unknown_symbol_raises(self): + """Currency symbol outside SYMBOL_TO_CURRENCY raises cleanly + (not a silent None that blows up downstream).""" + with self.assertRaises(ValueError) as ctx: + FiatValueParser.parse("£100,00") + self.assertIn("Unknown currency symbol", str(ctx.exception)) diff --git a/tests/test_normalization.py b/tests/test_normalization.py new file mode 100644 index 0000000..1d2c067 --- /dev/null +++ b/tests/test_normalization.py @@ -0,0 +1,114 @@ +"""Unit tests for pit38.plugins.normalization. + +The shared module centralizes two kinds of CSV-value normalization used +across broker plugins (Revolut, E*Trade). These tests pin the behaviour +so migrations to a new broker don't accidentally break assumptions. +""" +from unittest import TestCase + +from pit38.plugins.normalization import normalize_currency_layout, parse_amount + + +class TestNormalizeCurrencyLayout(TestCase): + # ─── Already canonical (no-op cases) ────────────── + + def test_code_with_amount_unchanged(self): + self.assertEqual(normalize_currency_layout("USD 529.68"), "USD 529.68") + + def test_minus_after_code_already_canonical(self): + """Primary regression for @inobrevi's #33 comment.""" + self.assertEqual(normalize_currency_layout("USD -0.07"), "USD -0.07") + + # ─── Sign rewrites ──────────────────────────────── + + def test_leading_minus_code_moved_past_currency(self): + self.assertEqual(normalize_currency_layout("-USD 529.68"), "USD -529.68") + + def test_leading_minus_symbol_moved_and_spaced(self): + self.assertEqual(normalize_currency_layout("-$529.68"), "$ -529.68") + + def test_minus_between_symbol_and_amount_spaced(self): + self.assertEqual(normalize_currency_layout("$-0.07"), "$ -0.07") + + # ─── Space insertion ────────────────────────────── + + def test_symbol_with_amount_gets_space(self): + self.assertEqual(normalize_currency_layout("$500"), "$ 500") + + def test_euro_symbol_spaced(self): + self.assertEqual(normalize_currency_layout("€250.00"), "€ 250.00") + + def test_preserves_internal_whitespace_for_european_format(self): + """E*Trade's '$25 001,75' must keep the internal space + intact — it's part of the amount (thousand separator), not + currency/amount delimiter.""" + self.assertEqual( + normalize_currency_layout("$25 001,75"), "$ 25 001,75" + ) + + # ─── Defensive cases ────────────────────────────── + + def test_empty_string_passes_through(self): + self.assertEqual(normalize_currency_layout(""), "") + + def test_unrecognized_format_passes_through(self): + """Inputs that don't match any pattern fall through unchanged. + Downstream will raise a parse error.""" + self.assertEqual(normalize_currency_layout("???"), "???") + + +class TestParseAmount(TestCase): + # ─── US format (en_US — tried first) ────────────── + + def test_us_thousand_decimal(self): + self.assertEqual(parse_amount("1,317.06"), 1317.06) + + def test_us_multiple_thousands(self): + self.assertEqual(parse_amount("1,234,567.89"), 1234567.89) + + def test_us_simple_decimal(self): + self.assertEqual(parse_amount("1317.06"), 1317.06) + + def test_us_integer(self): + self.assertEqual(parse_amount("1317"), 1317.0) + + def test_us_negative(self): + self.assertEqual(parse_amount("-0.07"), -0.07) + + # ─── EU format (de_DE — fallback) ───────────────── + + def test_eu_dot_thousand_comma_decimal(self): + self.assertEqual(parse_amount("1.317,06"), 1317.06) + + def test_eu_simple_decimal(self): + self.assertEqual(parse_amount("1317,06"), 1317.06) + + def test_eu_negative(self): + self.assertEqual(parse_amount("-0,07"), -0.07) + + # ─── Whitespace handling ────────────────────────── + + def test_eu_space_thousand_stripped(self): + """'1 317,06' — space as thousand separator, not recognized + by de_DE CLDR; we pre-strip whitespace before parsing.""" + self.assertEqual(parse_amount("1 317,06"), 1317.06) + + def test_eu_nbsp_thousand_stripped(self): + """Non-breaking space (U+00A0) as thousand separator — some + European exports (including older Excel-generated CSVs) use + this. Stripped like regular space.""" + self.assertEqual(parse_amount("1\xa0317,06"), 1317.06) + + # ─── Error paths ────────────────────────────────── + + def test_unparseable_raises_value_error(self): + with self.assertRaises(ValueError) as ctx: + parse_amount("abc") + msg = str(ctx.exception) + self.assertIn("en_US", msg) + self.assertIn("de_DE", msg) + self.assertIn("abc", msg) + + def test_empty_string_raises(self): + with self.assertRaises(ValueError): + parse_amount("") diff --git a/tests/test_revolut_row_parser.py b/tests/test_revolut_row_parser.py index 7f1960e..88435f6 100644 --- a/tests/test_revolut_row_parser.py +++ b/tests/test_revolut_row_parser.py @@ -58,3 +58,23 @@ def test_unparseable_amount_raises_error(self): with self.assertRaises(ValueError) as ctx: RowParser._fiat_value(row) self.assertIn("Cannot parse Total Amount", str(ctx.exception)) + + # ─── Regression tests for #33 follow-up (@inobrevi CUSTODY FEE) ── + + def test_negative_between_code_and_amount(self): + """Primary regression for @inobrevi's CUSTODY FEE row: 'USD -0.07' + (minus between code and amount). Before the shared normalization + fix, this raised ValueError.""" + row = {'total amount': 'USD -0.07', 'currency': 'USD'} + self.assertEqual(RowParser._fiat_value(row), FiatValue(0.07, Currency.DOLLAR)) + + def test_eu_format_with_comma_decimal(self): + """Forward-looking: Revolut hypothetically emits EU-formatted + numbers. Babel's de_DE locale handles this via the locale chain.""" + row = {'total amount': 'EUR 1.317,06', 'currency': 'EUR'} + self.assertEqual(RowParser._fiat_value(row), FiatValue(1317.06, Currency.EURO)) + + def test_eu_format_negative_mixed(self): + """Combined: EU decimal format + minus between code and amount.""" + row = {'total amount': 'EUR -1.317,06', 'currency': 'EUR'} + self.assertEqual(RowParser._fiat_value(row), FiatValue(1317.06, Currency.EURO)) From 3afd041fcc9b4e2b0af2cdf88e7adffba6fc0e71 Mon Sep 17 00:00:00 2001 From: przemyslawbialon Date: Fri, 24 Apr 2026 14:37:44 +0200 Subject: [PATCH 4/4] fix: import Counter in cli.py to satisfy F821 linter The `_print_skipped_summary` helper used `Counter` as a forward-ref string, but Counter was never imported. CI flake8 (F821) caught it. Replaced the string hint with a direct annotation and added the import. --- pit38/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pit38/cli.py b/pit38/cli.py index c15d9bd..d88988d 100644 --- a/pit38/cli.py +++ b/pit38/cli.py @@ -1,4 +1,5 @@ import sys +from collections import Counter import click from loguru import logger @@ -24,7 +25,7 @@ def import_cmd(): pass -def _print_skipped_summary(skipped_by_type: "Counter") -> None: +def _print_skipped_summary(skipped_by_type: Counter) -> None: """Pretty-print a skip summary to stderr for the user to review.""" if not skipped_by_type: return