def finish_training(self) -> bool:
    """Report, exactly once, that the training phase has ended.

    Returns:
        True on the first call made while ``_training_done`` is set and
        ``training_finished`` is still unset; that call also sets
        ``training_finished``. False on every other call.
    """
    # Guard: either training has not ended yet, or it was already reported.
    if not self._training_done or self.training_finished:
        return False

    self.training_finished = True
    return True
def validate_config_coverage(
    detector_name: str,
    config_events: EventsConfig | dict[str, Any],
    persistency: EventPersistency,
) -> None:
    """Log a warning for each configured EventID that training left empty.

    Every configured EventID is compared with what ``persistency`` recorded.
    An ID absent from ``get_events_seen()`` is reported as never observed;
    an ID that was seen but is absent from ``get_events_data()`` is reported
    as having had no configured variables extracted. Nothing is logged when
    the configuration contains no events.

    Args:
        detector_name: Name of the detector (used as the message prefix).
        config_events: The detector's events configuration, either an
            ``EventsConfig`` or a plain mapping keyed by EventID.
        persistency: The persistency object populated during training.
    """
    if isinstance(config_events, EventsConfig):
        configured_ids = config_events.events.keys()
    else:
        configured_ids = config_events.keys()

    if not configured_ids:
        return

    observed_ids = persistency.get_events_seen()
    populated_ids = set(persistency.get_events_data().keys())

    for event_id in configured_ids:
        if event_id not in observed_ids:
            message = (
                f"[{detector_name}] EventID {event_id!r} is configured but was "
                "never observed in training data. Verify that EventIDs in your "
                "config match those produced by the parser."
            )
        elif event_id not in populated_ids:
            message = (
                f"[{detector_name}] EventID {event_id!r} was observed in training "
                "data but no configured variables were extracted. Verify that "
                "variable names/positions in your config match those in the data."
            )
        else:
            # Seen and populated: nothing to report for this EventID.
            continue
        logger.warning(message)
@override
def post_train(self) -> None:
    """Run the config-coverage check once training has finished.

    Does nothing when ``self.config.auto_config`` is truthy; otherwise it
    passes the detector's name, configured events and training persistency
    to ``validate_config_coverage``.
    """
    if self.config.auto_config:
        return

    validate_config_coverage(self.name, self.config.events, self.persistency)
isinstance(events, EventsConfig) and not events.events: + logger.warning( + f"[{self.name}] auto_config=True generated an empty configuration. " + "No stable variables were found in configure-phase data. " + "The detector will produce no alerts." + ) diff --git a/src/detectmatelibrary/utils/persistency/event_persistency.py b/src/detectmatelibrary/utils/persistency/event_persistency.py index c21cb76..3c6d178 100644 --- a/src/detectmatelibrary/utils/persistency/event_persistency.py +++ b/src/detectmatelibrary/utils/persistency/event_persistency.py @@ -26,6 +26,7 @@ def __init__( event_data_kwargs: Optional[dict[str, Any]] = None, ): self.events_data: Dict[int | str, EventDataStructure] = {} + self.events_seen: set[int | str] = set() self.event_data_class = event_data_class self.event_data_kwargs = event_data_kwargs or {} self.variable_blacklist = variable_blacklist or [] @@ -39,6 +40,7 @@ def ingest_event( named_variables: Dict[str, Any] = {} ) -> None: """Ingest event data into the appropriate EventData store.""" + self.events_seen.add(event_id) if not variables and not named_variables: return self.event_templates[event_id] = event_template @@ -52,6 +54,11 @@ def ingest_event( data = data_structure.to_data(all_variables) data_structure.add_data(data) + def get_events_seen(self) -> set[int | str]: + """Retrieve all event IDs observed via ingest_event(), regardless of + whether variables were extracted.""" + return self.events_seen + def get_event_data(self, event_id: int | str) -> Any | None: """Retrieve the data for a specific event ID.""" data_structure = self.events_data.get(event_id) diff --git a/tests/test_common/test_core.py b/tests/test_common/test_core.py index f8d0183..2b4e594 100644 --- a/tests/test_common/test_core.py +++ b/tests/test_common/test_core.py @@ -330,3 +330,59 @@ def test_set_configuration_called_once(self) -> None: component.process(self._make_log(i)) assert component.set_configuration_called == 1 + + +class 
class TestPostTrain:
    """Checks when process() invokes the component's post_train() hook."""

    def _make_log(self, i: int) -> schemas.LogSchema:
        """Build a minimal LogSchema whose logID is ``str(i)``."""
        fields = {
            "__version__": "1.0.0",
            "logID": str(i),
            "logSource": "test",
            "hostname": "test_hostname"
        }
        return schemas.LogSchema(fields)

    def _feed(self, component, indices) -> None:
        """Process one generated log per index, in the given order."""
        for idx in indices:
            component.process(self._make_log(idx))

    def test_post_train_called_once_after_training(self) -> None:
        """Ten items through the mock component trigger the hook once."""
        worker = MockComponentWithPostTrain(name="PostTrain1")
        self._feed(worker, range(10))
        assert worker.post_train_called == 1

    def test_post_train_not_called_without_training(self) -> None:
        """With a plain CoreConfig the hook is never triggered."""
        worker = MockComponentWithPostTrain(name="PostTrain2", config=CoreConfig())
        self._feed(worker, range(10))
        assert worker.post_train_called == 0

    def test_post_train_called_on_first_detection_item(self) -> None:
        """post_train fires on the item immediately after training ends."""
        worker = MockComponentWithPostTrain(name="PostTrain3")

        # data_use_training=3: the first three items are consumed by training
        self._feed(worker, range(3))
        assert worker.post_train_called == 0

        # the 4th item is the one that triggers post_train
        self._feed(worker, [3])
        assert worker.post_train_called == 1

        # later items must not trigger it again
        self._feed(worker, [4])
        assert worker.post_train_called == 1
class TestFinishTraining:
    """Test that finish_training() fires exactly once after bounded training."""

    @staticmethod
    def _drive(logic, steps):
        """Call run() then finish_training() ``steps`` times.

        Returns the list of run() states and the list of finish_training()
        results, both in call order.
        """
        states = []
        finishes = []
        for _ in range(steps):
            states.append(logic.run())
            finishes.append(logic.finish_training())
        return states, finishes

    def test_finish_training_fires_once_after_bounded_training(self) -> None:
        fit = FitLogic(data_use_configure=None, data_use_training=3)
        _, finishes = self._drive(fit, 6)
        assert finishes.count(True) == 1

    def test_finish_training_fires_on_first_nothing_after_training(self) -> None:
        fit = FitLogic(data_use_configure=None, data_use_training=2)
        states, finishes = self._drive(fit, 5)
        # First two calls are DO_TRAIN, third is first NOTHING
        assert states[:2] == [FitLogicState.DO_TRAIN, FitLogicState.DO_TRAIN]
        assert states[2] == FitLogicState.NOTHING
        assert finishes[2] is True
        assert not any(finishes[:2])
        assert not any(finishes[3:])

    def test_finish_training_not_called_without_training(self) -> None:
        fit = FitLogic(data_use_configure=None, data_use_training=None)
        _, finishes = self._drive(fit, 5)
        assert all(flag is False for flag in finishes)

    def test_finish_training_not_called_during_training(self) -> None:
        fit = FitLogic(data_use_configure=None, data_use_training=5)
        states, finishes = self._drive(fit, 5)
        assert all(state == FitLogicState.DO_TRAIN for state in states)
        assert all(flag is False for flag in finishes)

    def test_finish_training_not_called_with_keep_training(self) -> None:
        fit = FitLogic(data_use_configure=None, data_use_training=None)
        fit.train_state = TrainState.KEEP_TRAINING
        states, finishes = self._drive(fit, 10)
        assert all(state == FitLogicState.DO_TRAIN for state in states)
        assert all(flag is False for flag in finishes)

    def test_finish_training_after_configure_and_training(self) -> None:
        """finish_training fires correctly even when configure phase precedes training."""
        fit = FitLogic(data_use_configure=2, data_use_training=3)
        _, finishes = self._drive(fit, 8)
        assert finishes.count(True) == 1
class TestNewValueDetectorMismatchWarnings:
    """Warnings NewValueDetector emits when config and training data disagree."""

    @staticmethod
    def _train_and_finalize(detector, caplog, variables):
        """Train three times on EventID 1, run post_train(), return the log records.

        Each training item carries a fresh copy of ``variables`` and the
        header variables ``{"status": "ok"}``. post_train() runs with caplog
        capturing at WARNING level.
        """
        for _ in range(3):
            detector.train(_make_parser_schema(1, list(variables), {"status": "ok"}))

        with caplog.at_level(logging.WARNING):
            detector.post_train()

        return caplog.records

    def test_warn_event_id_never_seen(self, caplog: pytest.LogCaptureFixture) -> None:
        """Warn when configured EventID is not present in training data at
        all."""
        detector = NewValueDetector(name="TestDetector", config=_nvd_config(event_id=99))
        # Only EventID=1 is trained on, so EventID=99 is never seen
        records = self._train_and_finalize(detector, caplog, ["val"])

        assert any("99" in r.message and "never observed" in r.message for r in records)

    def test_warn_event_id_seen_but_no_variables_extracted(
        self, caplog: pytest.LogCaptureFixture
    ) -> None:
        """Warn when EventID is seen but configured positional variable is out
        of bounds."""
        # Config expects a variable at position 0, but the data carries none
        detector = NewValueDetector(name="TestDetector", config=_nvd_config(event_id=1, pos=0))
        records = self._train_and_finalize(detector, caplog, [])

        assert any(
            "1" in r.message and "no configured variables were extracted" in r.message
            for r in records
        )

    def test_warn_header_variable_not_in_log_format(
        self, caplog: pytest.LogCaptureFixture
    ) -> None:
        """Warn when EventID is seen but configured header_variable key is
        absent."""
        # Config expects header 'Time', but the data has only 'status'
        detector = NewValueDetector(
            name="TestDetector", config=_nvd_config(event_id=1, header="Time")
        )
        records = self._train_and_finalize(detector, caplog, [])

        assert any("no configured variables were extracted" in r.message for r in records)

    def test_no_warning_when_config_matches_data(
        self, caplog: pytest.LogCaptureFixture
    ) -> None:
        """No warning when configured variables are correctly extracted during
        training."""
        detector = NewValueDetector(
            name="TestDetector", config=_nvd_config(event_id=1, header="status")
        )
        records = self._train_and_finalize(detector, caplog, [])

        assert not any(r.levelno == logging.WARNING for r in records)

    def test_auto_config_skips_mismatch_check(
        self, caplog: pytest.LogCaptureFixture
    ) -> None:
        """post_train() does not run coverage check for auto_config=True
        detectors."""
        detector = NewValueDetector()  # auto_config=True by default
        records = self._train_and_finalize(detector, caplog, ["x"])

        mismatch_markers = ("never observed", "no configured variables")
        mismatch_warnings = [
            r for r in records
            if any(marker in r.message for marker in mismatch_markers)
        ]
        assert not mismatch_warnings

    def test_auto_config_empty_config_warning(
        self, caplog: pytest.LogCaptureFixture
    ) -> None:
        """Warn in set_configuration() when auto_config produces an empty
        events config."""
        detector = NewValueDetector()
        # No configure() data was supplied, so the generated config is empty
        with caplog.at_level(logging.WARNING):
            detector.set_configuration()

        assert any("empty configuration" in r.message for r in caplog.records)
class TestEventsSeen:
    """Test that events_seen tracks all event IDs passed to ingest_event()."""

    def setup_method(self) -> None:
        """Create a fresh EventPersistency for every test."""
        self.persistency = EventPersistency(event_data_class=EventStabilityTracker)

    def _ingest(self, event_id, **extra) -> None:
        """Ingest one event with template "t" plus any extra keyword arguments."""
        self.persistency.ingest_event(event_id=event_id, event_template="t", **extra)

    def test_events_seen_recorded_on_early_return(self) -> None:
        """Event ID is tracked even when variables are empty (early-return
        path)."""
        store = self.persistency
        store.ingest_event(
            event_id="E1",
            event_template="some template",
            variables=[],
            named_variables={}
        )
        assert "E1" in store.get_events_seen()
        assert "E1" not in store.get_events_data()

    def test_events_seen_recorded_with_data(self) -> None:
        """Event ID is tracked when variables are present."""
        store = self.persistency
        store.ingest_event(
            event_id="E2",
            event_template="some template",
            named_variables={"status": "ok"}
        )
        assert "E2" in store.get_events_seen()
        assert "E2" in store.get_events_data()

    def test_events_seen_not_duplicated(self) -> None:
        """Repeated calls for the same event ID produce a single entry."""
        for _ in range(5):
            self._ingest(42, named_variables={"x": "v"})

        assert len(self.persistency.get_events_seen()) == 1
        assert 42 in self.persistency.get_events_seen()

    def test_events_seen_tracks_multiple_ids(self) -> None:
        """Multiple distinct event IDs are all tracked."""
        for eid in (1, 2, 3):
            self._ingest(eid, named_variables={"x": str(eid)})

        assert self.persistency.get_events_seen() == {1, 2, 3}

    def test_get_events_seen_returns_set(self) -> None:
        """get_events_seen() returns a set."""
        assert isinstance(self.persistency.get_events_seen(), set)

    def test_events_seen_mixed_empty_and_nonempty(self) -> None:
        """Events seen with and without data are both in events_seen."""
        self._ingest(1)
        self._ingest(2, named_variables={"k": "v"})

        assert {1, 2} == self.persistency.get_events_seen()
        assert set(self.persistency.get_events_data().keys()) == {2}