def _emit_results(input_df, results_df, out_stream):
    """Write simulation results to ``out_stream``.

    When the input has no ``idtl`` column, or no row in it equals 5, the
    whole ``results_df`` is written as CSV. Otherwise each input row with
    ``idtl=5`` is rendered as a labeled-section text record via
    ``format_row`` and the remaining rows are written as CSV.

    Ordering for mixed-idtl input: all text records are written first (in
    input order), followed by ONE CSV block with a single header. Records
    are not interleaved row by row, because every CSV fragment would need
    its own header; ``taxsimid`` identifies a record in either format.

    Args:
        input_df: Input records. Must have a ``taxsimid`` column whenever
            any row has ``idtl=5``.
        results_df: Result rows; matched to input rows by ``taxsimid``.
        out_stream: Writable text stream (the cli passes ``sys.stdout``).
    """
    # Default: no idtl=5 anywhere -> plain CSV. This path deliberately runs
    # before the formatter import so CSV output never depends on it.
    if "idtl" not in input_df.columns or not (input_df["idtl"] == 5).any():
        results_df.to_csv(out_stream, index=False)
        return

    try:
        from .core.text_formatter import format_row
    except ImportError:
        from policyengine_taxsim.core.text_formatter import format_row

    # Plain dict lookup by taxsimid; results_df itself is left untouched so
    # the CSV block below keeps its original column order.
    results_by_id = {
        row["taxsimid"]: row for row in results_df.to_dict(orient="records")
    }

    text_chunks = []
    csv_ids = []
    for _, in_row in input_df.iterrows():
        taxsimid = in_row["taxsimid"]
        result_dict = results_by_id.get(taxsimid)
        if result_dict is None:
            # No result for this input record: nothing to emit.
            continue

        # A blank idtl cell is NaN after CSV parsing and int(float(nan))
        # raises; treat anything unparseable as "not idtl=5" (CSV).
        try:
            idtl = int(float(in_row.get("idtl", 0)))
        except (TypeError, ValueError, OverflowError):
            idtl = 0

        if idtl == 5:
            text_chunks.append(format_row(in_row.to_dict(), result_dict))
        else:
            csv_ids.append(taxsimid)

    out_stream.writelines(chunk + "\n" for chunk in text_chunks)

    if csv_ids:
        csv_rows = results_df[results_df["taxsimid"].isin(csv_ids)]
        csv_rows.to_csv(out_stream, index=False)
# Layout constants — match legacy generate_text_description_output for
# bit-identical output formatting.
_LEFT_MARGIN = 4
_LABEL_INDENT = 4
_LABEL_WIDTH = 45
_VALUE_WIDTH = 15
_SECOND_VALUE_WIDTH = 12
_GROUP_MARGIN = _LEFT_MARGIN


def _format_value(value):
    """Format an output value: ``f'{value:>8.1f}'`` for int/float, ``str`` otherwise."""
    if isinstance(value, (int, float)):
        return f"{value:>8.1f}"
    return str(value)


def _format_label_line(desc: str, formatted_value: str) -> str:
    """Return one output line: indent, left-justified label, right-justified value."""
    indent = _LEFT_MARGIN + _LABEL_INDENT
    return f"{' ' * indent}{desc:<{_LABEL_WIDTH}}{formatted_value:>{_VALUE_WIDTH}}"


def _fixed_or_text(value, width: int) -> str:
    """Right-justify ``value`` in ``width`` columns.

    Numeric values (anything ``float()`` accepts) get two decimals; anything
    else is rendered with ``str`` so a stray non-numeric cell cannot abort
    the whole record.
    """
    try:
        return f"{float(value):>{width}.2f}"
    except (TypeError, ValueError):
        return f"{str(value):>{width}}"


def _coerce_result_value(raw):
    """Convert a result cell to ``float``, mapping missing values to 0.0.

    ``None`` and NaN both become 0.0. Values that cannot be converted to
    ``float`` are returned unchanged.
    """
    import math

    if raw is None:
        return 0.0
    try:
        number = float(raw)
    except (TypeError, ValueError):
        return raw
    return 0.0 if math.isnan(number) else number


def _int_field(field: str, *rows: Mapping, default: int = 0) -> int:
    """Return ``field`` as an int from the first row holding a usable value.

    Missing, ``None``, NaN, infinite or non-numeric values are skipped;
    ``default`` is returned when no row has a usable value.
    """
    for row in rows:
        try:
            return int(float(row.get(field)))
        except (TypeError, ValueError, OverflowError):
            continue
    return default


def _input_data_section(input_row: Mapping, state_name: str) -> list:
    """Render the 'Input Data:' section from the original TAXSIM input.

    Walks ``taxsim_input_definition`` from the variable mappings in order
    and emits one line per field present in ``input_row``. A field whose
    config names a ``pair`` that is also present is printed with both
    values on one line; the ``state`` field is followed by ``state_name``.
    Fields absent from ``input_row`` are skipped.

    Returns:
        The section as a list of lines (no trailing newlines).
    """
    mappings = load_variable_mappings()["taxsim_input_definition"]
    # NOTE(review): this header literal and the gap before the state name
    # were copied from a whitespace-collapsed view of the file — confirm
    # the exact number of spaces against the legacy renderer.
    lines = ["", " Input Data:"]
    pad = " " * (_LEFT_MARGIN + _LABEL_INDENT)

    for mapping in mappings:
        # Each mapping entry is a single-item dict: {field: config}.
        field, config = next(iter(mapping.items()))
        if field not in input_row:
            continue

        label = f"{pad}{config['name']:<{_LABEL_WIDTH}}"
        first = _fixed_or_text(input_row[field], _VALUE_WIDTH)
        pair = config.get("pair")

        if pair and pair in input_row:
            second = _fixed_or_text(input_row[pair], _SECOND_VALUE_WIDTH)
            lines.append(f"{label}{first}{second}")
        elif field == "state":
            lines.append(f"{label}{first} {state_name}")
        else:
            lines.append(f"{label}{first}")
    return lines


def _grouped_output_sections(result_row: Mapping, year, state_name: str) -> list:
    """Render the grouped output sections that follow 'Input Data:'.

    A variable from ``policyengine_to_taxsim`` is included when it has both
    ``full_text_group`` and ``text_description``, has ``implemented`` set to
    ``True``, and lists an ``idtl`` entry with ``full_text == 5``. Groups are
    ordered by ``group_order`` (default 999; the value from the last
    variable seen in a group wins) and variables within a group by their
    description. Values are read from ``result_row``; ``taxsimid``, ``year``
    and ``state`` are special-cased.

    Returns:
        The sections as a list of lines, each group followed by a blank line.
    """
    pe_to_taxsim = load_variable_mappings()["policyengine_to_taxsim"]

    groups = {}
    group_orders = {}
    for var_name, var_info in pe_to_taxsim.items():
        wanted = (
            "full_text_group" in var_info
            and "text_description" in var_info
            and var_info.get("implemented") is True
            and any(item.get("full_text", 0) == 5 for item in var_info.get("idtl", []))
        )
        if not wanted:
            continue
        group = var_info["full_text_group"]
        group_orders[group] = var_info.get("group_order", 999)
        groups.setdefault(group, []).append((var_info["text_description"], var_name))

    lines = []
    for group_name in sorted(groups, key=lambda g: group_orders[g]):
        lines.append(f"{' ' * _GROUP_MARGIN}{group_name}:")
        # Sort on the description only so ties keep their mapping order.
        for desc, var_name in sorted(groups[group_name], key=lambda item: item[0]):
            if var_name == "taxsimid":
                value = result_row.get("taxsimid", 0)
            elif var_name == "year":
                value = year
            elif var_name == "state":
                value = (
                    f"{get_state_number(state_name)}{' ' * _LEFT_MARGIN}{state_name}"
                )
            else:
                # Absent column, None and NaN all render as 0.0.
                value = _coerce_result_value(result_row.get(var_name, 0))

            formatted_value = _format_value(value)
            # Multi-line descriptions repeat the value on every line.
            lines.extend(
                _format_label_line(desc_line, formatted_value)
                for desc_line in desc.split("\n")
            )
        lines.append("")
    return lines


def format_row(input_row: Mapping, result_row: Mapping) -> str:
    """Format a single record's full TAXSIM idtl=5 output.

    ``year`` and ``state`` are taken from ``input_row`` when it holds a
    usable numeric value, otherwise from ``result_row``, otherwise 0.

    Args:
        input_row: The original TAXSIM input record.
        result_row: The computed result record for the same taxsimid.

    Returns:
        The 'Input Data:' section, a blank line, then the grouped output
        sections, joined with newlines.
    """
    year = _int_field("year", input_row, result_row)
    state_code = _int_field("state", input_row, result_row)
    state_name = get_state_code(state_code)

    lines = _input_data_section(input_row, state_name)
    lines.append("")
    lines.extend(_grouped_output_sections(result_row, year, state_name))
    return "\n".join(lines)
def _run_cli(input_csv: str):
    """Invoke the ``cli`` group with ``input_csv`` on stdin and return the click result."""
    return CliRunner().invoke(cli, [], input=input_csv)


def _record(idtl=5):
    """Build a one-record TAXSIM CSV (header line + data line) with the given ``idtl``."""
    header = "taxsimid,year,state,mstat,page,sage,depx,pwages,idtl\n"
    row = f"1,2024,5,1,40,0,0,50000,{idtl}\n"
    return header + row


class TestIdtl5Format:
    """Output-format checks for the cli stdin/stdout flow, keyed on ``idtl``."""

    def test_idtl5_input_emits_full_text(self):
        """An idtl=5 record is rendered as labeled sections rather than CSV."""
        outcome = _run_cli(_record(idtl=5))
        assert outcome.exit_code == 0, f"CLI failed: {outcome.output}\n{outcome.exception}"
        section_headers = (
            "Input Data:",
            "Basic Output:",
            "Federal Tax Calculation:",
            "State Tax Calculation:",
        )
        for header in section_headers:
            assert header in outcome.output

    def test_idtl5_does_not_emit_csv_header(self):
        """The labeled-section output must not carry the CSV header that
        the idtl=0/2 format starts with."""
        outcome = _run_cli(_record(idtl=5))
        assert outcome.exit_code == 0
        # The CSV format would begin with this recognizable header prefix.
        csv_header_prefix = "taxsimid,year,state,fiitax"
        assert csv_header_prefix not in outcome.output

    def test_idtl2_input_still_emits_csv(self):
        """Regression guard: an idtl=2 record keeps the CSV format."""
        outcome = _run_cli(_record(idtl=2))
        assert outcome.exit_code == 0
        # CSV header columns are present ...
        for column in ("taxsimid", "fiitax"):
            assert column in outcome.output
        # ... and no labeled-section marker is.
        assert "Input Data:" not in outcome.output

    def test_idtl5_shows_federal_tax(self):
        """The idtl=5 text mentions the federal income tax, whichever of the
        expected label spellings the variable mappings use."""
        outcome = _run_cli(_record(idtl=5))
        assert outcome.exit_code == 0
        lowered = outcome.output.lower()
        assert "federal" in lowered
        assert "iit" in lowered or "income tax" in lowered