diff --git a/README.md b/README.md index bce9887..a75b735 100644 --- a/README.md +++ b/README.md @@ -126,10 +126,12 @@ Typical next steps are: ```bash python -m dfode_kit.cli.main augment \ + --source /path/to/run/oneD_flame_CH4_phi1/ch4_phi1_sample.h5 \ --mech /path/to/mechanisms/CH4/gri30.yaml \ - --h5_file /path/to/run/oneD_flame_CH4_phi1/ch4_phi1_sample.h5 \ - --output_file /path/to/data/ch4_phi1_aug.npy \ - --dataset_num 20000 + --save /path/to/data/ch4_phi1_aug.npy \ + --preset random-local-combustion-v1 \ + --target-size 20000 \ + --apply python -m dfode_kit.cli.main label \ --mech /path/to/mechanisms/CH4/gri30.yaml \ diff --git a/dfode_kit/cli/commands/augment.py b/dfode_kit/cli/commands/augment.py index a036aba..acbc733 100644 --- a/dfode_kit/cli/commands/augment.py +++ b/dfode_kit/cli/commands/augment.py @@ -1,62 +1,84 @@ -def add_command_parser(subparsers): - augment_parser = subparsers.add_parser('augment', help='Perform data augmentation.') +from __future__ import annotations - augment_parser.add_argument( - '--mech', - required=True, - type=str, - help='Path to the YAML mechanism file.' - ) - augment_parser.add_argument( - '--h5_file', - required=True, - type=str, - help='Path to the h5 file to augment.' +import json + +from dfode_kit.cli.commands.augment_helpers import DEFAULT_AUGMENT_PRESET + + +def add_command_parser(subparsers): + augment_parser = subparsers.add_parser( + 'augment', + help='Perform data augmentation from sampled state data.', ) + augment_parser.add_argument('--source', type=str, help='Path to the sampled HDF5 source file.') + augment_parser.add_argument('--mech', type=str, help='Path to the YAML mechanism file.') + augment_parser.add_argument('--save', type=str, help='Path to save the augmented NumPy array.') augment_parser.add_argument( - '--output_file', - required=True, + '--preset', type=str, - help='Path to the output NUMPY file.' + default=DEFAULT_AUGMENT_PRESET, + help='Named augmentation preset.', ) augment_parser.add_argument( - '--heat_limit', - type=bool, - default=False, - help='contraint perturbed data with heat release.' - ) - augment_parser.add_argument( - '--element_limit', - type=bool, - default=True, - help='contraint perturbed data with element ratio.' - ) - augment_parser.add_argument( - '--dataset_num', - required=True, + '--target-size', + dest='target_size', type=int, - help='num of dataset.' - ) - augment_parser.add_argument( - '--perturb_factor', - type=float, - default=0.1, - help='Factor to perturb the data by.' + help='Requested number of augmented rows.', ) + augment_parser.add_argument('--seed', type=int, help='Random seed for reproducible augmentation.') + augment_parser.add_argument('--from-config', type=str, help='Load an augment plan/config JSON.') + augment_parser.add_argument('--write-config', type=str, help='Write the resolved augment plan/config to JSON.') + augment_parser.add_argument('--preview', action='store_true', help='Preview the resolved plan without executing augmentation.') + augment_parser.add_argument('--apply', action='store_true', help='Execute augmentation and write the output array.') + augment_parser.add_argument('--json', action='store_true', help='Print structured JSON output.') def handle_command(args): - import numpy as np + from dfode_kit.cli.commands.augment_helpers import apply_augment_plan, dump_plan_json, resolve_augment_plan + + if not args.preview and not args.apply and not args.write_config: + raise ValueError('Specify at least one action: --preview, --apply, or --write-config.') + + plan = resolve_augment_plan(args) + json_result = {'command_type': 'augment'} if args.json else None + + if args.write_config: + config_path = dump_plan_json(plan, args.write_config) + if args.json: + json_result['config_written'] = {'path': str(config_path)} + else: + print(f'Wrote augment config: {config_path}') + + if args.preview: + if args.json: + json_result['plan'] = plan + else: + _print_human_plan(plan) - from dfode_kit.data import get_TPY_from_h5, random_perturb + if args.apply: + result = apply_augment_plan(plan, quiet=args.json) + if args.json: + json_result['apply'] = result + else: + print(f"Completed augmentation from: {result['source']}") + print(f"output_path: {result['output_path']}") + print(f"returned_count: {result['returned_count']}") - print('Handling augment command') - print(f'Loading data from h5 file: {args.h5_file}') - data = get_TPY_from_h5(args.h5_file) - print('Data shape:', data.shape) + if args.json: + print(json.dumps(json_result, indent=2, sort_keys=True)) - all_data = random_perturb(data, args.mech, args.dataset_num, args.heat_limit, args.element_limit) - np.save(args.output_file, all_data) - print('Saved augmented data shape:', all_data.shape) - print(f'Saved augmented data to {args.output_file}') +def _print_human_plan(plan: dict): + print('Resolved augment plan') + print(f"preset: {plan['preset']}") + print(f"source: {plan['source']}") + print(f"mechanism: {plan['mechanism']}") + print(f"save: {plan['save']}") + print(f"target_size: {plan['target_size']}") + print(f"seed: {plan['seed']}") + print('resolved:') + for key in sorted(plan['resolved']): + print(f" {key}: {plan['resolved'][key]}") + print('notes:') + for note in plan['notes']: + print(f' - {note}') diff --git a/dfode_kit/cli/commands/augment_helpers.py b/dfode_kit/cli/commands/augment_helpers.py new file mode 100644 index 0000000..1822b2f --- /dev/null +++ b/dfode_kit/cli/commands/augment_helpers.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +import io +import json +from contextlib import redirect_stdout +from dataclasses import dataclass +from pathlib import Path +from typing import Any + + +DEFAULT_AUGMENT_PRESET = 'random-local-combustion-v1' + + +@dataclass(frozen=True) +class AugmentPreset: + name: str + summary: str + resolved: dict[str, Any] + notes: list[str] + + +AUGMENT_PRESETS: dict[str, AugmentPreset] = { + DEFAULT_AUGMENT_PRESET: AugmentPreset( + name=DEFAULT_AUGMENT_PRESET, + summary='Current random local perturbation workflow with combustion-oriented defaults.', + resolved={ + 'heat_limit': False, + 'element_limit': True, + }, + notes=[ + 'This preset preserves the current default augmentation behavior on main.', + 'The CLI intentionally keeps the public surface minimal; detailed tuning should happen through config round-trip or future preset revisions.', + ], + ) +} + + +def get_augment_preset(name: str) -> AugmentPreset: + try: + return AUGMENT_PRESETS[name] + except KeyError as exc: + raise ValueError( + f"Unknown augment preset: {name}. Available presets: {', '.join(sorted(AUGMENT_PRESETS))}" + ) from exc + + +def resolve_augment_plan(args) -> dict[str, Any]: + if args.from_config: + plan = load_plan_json(args.from_config) + if plan.get('command_type') != 'augment': + raise ValueError(f"Unsupported command_type in config: {plan.get('command_type')}") + + source = args.source or plan.get('source') + mech = args.mech or plan.get('mechanism') + save = args.save or plan.get('save') + preset_name = args.preset or plan.get('preset', DEFAULT_AUGMENT_PRESET) + target_size = args.target_size if args.target_size is not None else plan.get('target_size') + seed = args.seed if args.seed is not None else plan.get('seed') + else: + _validate_required_args(args, ('source', 'mech', 'preset', 'target_size')) + source = args.source + mech = args.mech + save = args.save + preset_name = args.preset + target_size = args.target_size + seed = args.seed + + if args.apply and not save: + raise ValueError('The --save path is required when using --apply.') + + preset = get_augment_preset(preset_name) + source_path = Path(source).resolve() + if not source_path.is_file(): + raise ValueError(f'Source file does not exist: {source_path}') + + mechanism_path = Path(mech).resolve() + if not mechanism_path.is_file(): + raise ValueError(f'Mechanism file does not exist: {mechanism_path}') + + plan = { + 'schema_version': 1, + 'command_type': 'augment', + 'preset': preset.name, + 'preset_summary': preset.summary, + 'source': str(source_path), + 'mechanism': str(mechanism_path), + 'save': str(Path(save).resolve()) if save else None, + 'target_size': int(target_size), + 'seed': int(seed) if seed is not None else None, + 'config_path': str(Path(args.from_config).resolve()) if args.from_config else None, + 'notes': preset.notes, + 'resolved': dict(preset.resolved), + } + return plan + + +def apply_augment_plan(plan: dict[str, Any], quiet: bool = False) -> dict[str, Any]: + import numpy as np + + from dfode_kit.data import get_TPY_from_h5, random_perturb + + source_path = Path(plan['source']).resolve() + output_path = Path(plan['save']).resolve() + output_path.parent.mkdir(parents=True, exist_ok=True) + + if quiet: + with redirect_stdout(io.StringIO()): + data = get_TPY_from_h5(source_path) + augmented = random_perturb( + data, + plan['mechanism'], + plan['target_size'], + plan['resolved']['heat_limit'], + plan['resolved']['element_limit'], + seed=plan.get('seed'), + ) + else: + print('Handling augment command') + print(f'Loading data from h5 file: {source_path}') + data = get_TPY_from_h5(source_path) + print('Data shape:', data.shape) + augmented = random_perturb( + data, + plan['mechanism'], + plan['target_size'], + plan['resolved']['heat_limit'], + plan['resolved']['element_limit'], + seed=plan.get('seed'), + ) + print('Saved augmented data shape:', augmented.shape) + print(f'Saved augmented data to {output_path}') + + np.save(output_path, augmented) + + return { + 'event': 'augmentation_completed', + 'source': str(source_path), + 'output_path': str(output_path), + 'preset': plan['preset'], + 'requested_count': int(plan['target_size']), + 'returned_count': int(augmented.shape[0]), + 'feature_count': int(augmented.shape[1]) if augmented.ndim == 2 else None, + 'seed': plan.get('seed'), + } + + +def dump_plan_json(plan: dict[str, Any], path: str | Path) -> Path: + output_path = Path(path).resolve() + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(json.dumps(plan, indent=2, sort_keys=True) + '\n', encoding='utf-8') + return output_path + + +def load_plan_json(path: str | Path) -> dict[str, Any]: + input_path = Path(path).resolve() + return json.loads(input_path.read_text(encoding='utf-8')) + + +def _validate_required_args(args, names: tuple[str, ...]): + missing = [f'--{name.replace("_", "-")}' for name in names if getattr(args, name) is None] + if missing: + raise ValueError(f'Missing required arguments: {", ".join(missing)}') diff --git a/dfode_kit/data/augment.py b/dfode_kit/data/augment.py index f0a5d5d..9be8876 100644 --- a/dfode_kit/data/augment.py +++ b/dfode_kit/data/augment.py @@ -33,9 +33,12 @@ def random_perturb( cq: float = 10, inert_idx: int = -1, time_step: float = 1e-6, + seed: int | None = None, ) -> np.ndarray: import cantera as ct + rng = np.random.default_rng(seed) + array = array[array[:, 0] > frozenTem] gas = ct.Solution(mech_path) @@ -78,11 +81,11 @@ def random_perturb( test_r = np.copy(array[j]) - test_tmp[0] = test_r[0] + (maxT - minT) * (2 * np.random.rand() - 1.0) * alpha - test_tmp[1] = test_r[1] + (maxP - minP) * (2 * np.random.rand() - 1.0) * alpha * 20 - test_tmp[-1] = test_r[-1] + (maxN2 - minN2) * (2 * np.random.rand() - 1) * alpha + test_tmp[0] = test_r[0] + (maxT - minT) * (2 * rng.random() - 1.0) * alpha + test_tmp[1] = test_r[1] + (maxP - minP) * (2 * rng.random() - 1.0) * alpha * 20 + test_tmp[-1] = test_r[-1] + (maxN2 - minN2) * (2 * rng.random() - 1) * alpha for i in range(2, array.shape[1] - 1): - test_tmp[i] = np.abs(test_r[i]) ** (1 + (2 * np.random.rand() - 1) * alpha) + test_tmp[i] = np.abs(test_r[i]) ** (1 + (2 * rng.random() - 1) * alpha) test_tmp[2:-1] = test_tmp[2:-1] / np.sum(test_tmp[2:-1]) * (1 - test_tmp[-1]) if heat_limit: @@ -133,7 +136,7 @@ def random_perturb( print(num) new_array = np.array(new_array) - new_array = new_array[np.random.choice(new_array.shape[0], size=dataset)] + new_array = new_array[rng.choice(new_array.shape[0], size=dataset)] unique_array = np.unique(new_array, axis=0) print(unique_array.shape) return unique_array diff --git a/docs/augment.md b/docs/augment.md new file mode 100644 index 0000000..291f2da --- /dev/null +++ b/docs/augment.md @@ -0,0 +1,145 @@ +# Data Augmentation + +DFODE-kit provides a preset-driven CLI for data augmentation: + +```bash +dfode-kit augment ... +``` + +This page is the canonical reference for the current augmentation command. + +## Purpose + +Create an augmented NumPy dataset from a sampled HDF5 state file using a named augmentation preset. + +The current design goal is to keep the public CLI surface small while still supporting: + +- preview before execution +- machine-readable JSON output +- config round-trip through JSON files +- reproducible execution through an explicit seed + +## Command + +```bash +dfode-kit augment [options] +``` + +## Minimal public contract + +### Required + +- `--source` +- `--mech` + +### Usually required + +- `--save` (required for `--apply`) +- `--preset` +- `--target-size` + +### Workflow controls + +- `--preview` +- `--apply` +- `--json` +- `--write-config` +- `--from-config` + +### Optional but high-value + +- `--seed` + +## Current preset + +- `random-local-combustion-v1` + +This preset currently preserves the mainline augmentation behavior while keeping the public flag surface minimal. + +## Common workflow + +### Preview the resolved plan + +```bash +dfode-kit augment \ + --source /path/to/sample.h5 \ + --mech /path/to/gri30.yaml \ + --preset random-local-combustion-v1 \ + --target-size 20000 \ + --preview --json +``` + +### Write a machine-readable config + +```bash +dfode-kit augment \ + --source /path/to/sample.h5 \ + --mech /path/to/gri30.yaml \ + --preset random-local-combustion-v1 \ + --target-size 20000 \ + --preview \ + --write-config /path/to/augment-plan.json +``` + +### Apply directly + +```bash +dfode-kit augment \ + --source /path/to/sample.h5 \ + --mech /path/to/gri30.yaml \ + --save /path/to/aug.npy \ + --preset random-local-combustion-v1 \ + --target-size 20000 \ + --seed 1234 \ + --apply +``` + +### Apply from a saved config + +```bash +dfode-kit augment \ + --from-config /path/to/augment-plan.json \ + --save /path/to/aug.npy \ + --apply +``` + +## Output behavior + +### `--preview --json` +Prints a JSON object containing the resolved augmentation plan. + +### `--write-config` +Writes the resolved plan to a JSON file for review, editing, and later execution. + +### `--apply` +Runs augmentation and writes the output `.npy` dataset. + +In `--json` mode, the command reports a structured completion record including: + +- source path +- output path +- preset +- requested row count +- returned row count +- seed + +## Action rule + +At least one of the following must be specified: + +- `--preview` +- `--apply` +- `--write-config` + +## Design note + +The current augmentation CLI intentionally avoids exposing a large number of tuning flags. + +The preferred model is: + +1. choose a named preset +2. preview the resolved plan +3. optionally persist the plan with `--write-config` +4. apply the plan directly or via `--from-config` + +This keeps everyday CLI usage short while still supporting reproducible, machine-readable workflows. diff --git a/docs/cli.md b/docs/cli.md index 1b3d269..1f98ab3 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -78,10 +78,12 @@ Example: ```bash dfode-kit augment \ + --source /path/to/sample.h5 \ --mech /path/to/gri30.yaml \ - --h5_file /path/to/sample.h5 \ - --output_file /path/to/augmented.npy \ - --dataset_num 20000 + --save /path/to/augmented.npy \ + --preset random-local-combustion-v1 \ + --target-size 20000 \ + --apply ``` ### `label` @@ -131,6 +133,7 @@ Recent CLI refactors improved: The new `init` command already supports machine-readable JSON output for planning/provenance, and `run-case` supports JSON output for preview/apply results. +For augmentation specifically, see [Data Augmentation](augment.md). For the end-to-end artifact flow between `sample`, `augment`, `label`, `h52npy`, and `train`, see [Data Preparation and Training Workflow](data-workflow.md). Future work should still add: diff --git a/docs/data-workflow.md b/docs/data-workflow.md index 47d8287..2821255 100644 --- a/docs/data-workflow.md +++ b/docs/data-workflow.md @@ -68,20 +68,30 @@ Example: ```bash dfode-kit augment \ + --source /path/to/run/oneD_flame_CH4_phi1/ch4_phi1_sample.h5 \ --mech /path/to/gri30.yaml \ - --h5_file /path/to/run/oneD_flame_CH4_phi1/ch4_phi1_sample.h5 \ - --output_file /path/to/data/ch4_phi1_aug.npy \ - --dataset_num 20000 + --save /path/to/data/ch4_phi1_aug.npy \ + --preset random-local-combustion-v1 \ + --target-size 20000 \ + --apply ``` -Current optional controls: -- `--heat_limit` -- `--element_limit` -- `--perturb_factor` +Minimal public contract: +- `--source` +- `--mech` +- `--save` (required for `--apply`) +- `--preset` +- `--target-size` +- `--seed` (optional) +- `--preview` +- `--apply` +- `--json` +- `--write-config` +- `--from-config` ## Current note on `augment` -The current CLI surface exposes `--perturb_factor`, but the present command implementation does not yet thread that value through to the underlying augmentation routine. Treat the command as functional, but the public option surface here is not yet fully normalized. +The augmentation CLI is intentionally preset-driven and keeps the public flag surface small. For more advanced tuning, use `--preview --write-config` and apply later with `--from-config`. ### 4. `label` Input: @@ -148,8 +158,8 @@ This keeps: The CLI surface for the data pipeline is usable, but not yet as normalized as `init` and `run-case`. Current gaps include: -- limited machine-readable JSON output for `sample`, `augment`, `label`, and `train` -- older option naming conventions such as `--h5_file` and `--source_file` +- limited machine-readable JSON output for `sample`, `label`, and `train` +- older option naming conventions still present on some commands such as `--source_file` - thinner published documentation for training outputs and configuration detail than for case init/run These are good future cleanup targets, but the commands above describe the current behavior on `main`. diff --git a/docs/index.md b/docs/index.md index 2056f14..7ce9bc8 100644 --- a/docs/index.md +++ b/docs/index.md @@ -13,6 +13,7 @@ DFODE-kit is a Python toolkit for accelerating combustion chemistry integration - **CLI**: current `dfode-kit` commands and their purpose - **Canonical Case Initialization**: preset-based case setup with preview/apply/config workflows - **Runtime Configuration and Case Execution**: persistent machine-local environment config plus reproducible case launching +- **Data Augmentation**: the minimal preset-driven augment CLI, including preview/apply/config round-trip - **Data Preparation and Training Workflow**: the current artifact flow from sampled HDF5 to labeled datasets and models - **Architecture**: repo layout and current refactor direction - **Tutorials and Workflow**: how to think about the DFODE pipeline diff --git a/mkdocs.yml b/mkdocs.yml index 8b2e06c..742c766 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -31,6 +31,7 @@ nav: - CLI: cli.md - Canonical Case Initialization: init.md - Runtime Configuration and Case Execution: run-case.md + - Data Augmentation: augment.md - Data Preparation and Training Workflow: data-workflow.md - Architecture: architecture.md - Tutorials and Workflow: tutorials.md diff --git a/tests/test_augment_cli.py b/tests/test_augment_cli.py new file mode 100644 index 0000000..0387f38 --- /dev/null +++ b/tests/test_augment_cli.py @@ -0,0 +1,102 @@ +import json +from pathlib import Path +from types import SimpleNamespace + +from dfode_kit.cli.commands import augment, augment_helpers + + +class DummyArgs(SimpleNamespace): + pass + + +def make_args(tmp_path, **overrides): + source = tmp_path / 'sample.h5' + source.write_text('stub', encoding='utf-8') + mech = tmp_path / 'mech.yaml' + mech.write_text('stub', encoding='utf-8') + data = { + 'command': 'augment', + 'source': str(source), + 'mech': str(mech), + 'save': str(tmp_path / 'aug.npy'), + 'preset': augment_helpers.DEFAULT_AUGMENT_PRESET, + 'target_size': 12, + 'seed': 123, + 'from_config': None, + 'write_config': None, + 'preview': True, + 'apply': False, + 'json': True, + } + data.update(overrides) + return DummyArgs(**data) + + +def test_resolve_augment_plan_uses_minimal_contract(tmp_path): + args = make_args(tmp_path) + + plan = augment_helpers.resolve_augment_plan(args) + + assert plan['command_type'] == 'augment' + assert plan['preset'] == augment_helpers.DEFAULT_AUGMENT_PRESET + assert plan['target_size'] == 12 + assert plan['seed'] == 123 + assert plan['resolved'] == {'heat_limit': False, 'element_limit': True} + + +def test_resolve_augment_plan_from_config_allows_save_override(tmp_path): + args = make_args(tmp_path, write_config=str(tmp_path / 'augment-plan.json')) + plan = augment_helpers.resolve_augment_plan(args) + config_path = augment_helpers.dump_plan_json(plan, args.write_config) + + override_path = tmp_path / 'override.npy' + from_config_args = make_args( + tmp_path, + source=None, + mech=None, + save=str(override_path), + preset=None, + target_size=None, + seed=None, + from_config=str(config_path), + preview=True, + apply=False, + ) + + loaded = augment_helpers.resolve_augment_plan(from_config_args) + + assert loaded['save'] == str(override_path.resolve()) + assert loaded['target_size'] == 12 + assert loaded['seed'] == 123 + + +def test_handle_command_json_preview_and_apply(tmp_path, monkeypatch, capsys): + args = make_args(tmp_path, preview=True, apply=True, json=True) + + monkeypatch.setattr(augment_helpers, 'apply_augment_plan', lambda plan, quiet=False: { + 'event': 'augmentation_completed', + 'output_path': plan['save'], + 'returned_count': 9, + 'source': plan['source'], + 'preset': plan['preset'], + 'requested_count': plan['target_size'], + 'seed': plan['seed'], + }) + + augment.handle_command(args) + + payload = json.loads(capsys.readouterr().out) + assert payload['command_type'] == 'augment' + assert payload['plan']['target_size'] == 12 + assert payload['apply']['returned_count'] == 9 + + +def test_handle_command_requires_action(tmp_path): + args = make_args(tmp_path, preview=False, apply=False, write_config=None) + + try: + augment.handle_command(args) + except ValueError as exc: + assert 'Specify at least one action' in str(exc) + else: + raise AssertionError('expected ValueError')