Commit 6cdb3ac

Fix label s3 ingestion (#309)
1 parent 392f1cb commit 6cdb3ac

4 files changed: +88 -64 lines changed

backend/deepchecks_monitoring/bgtasks/tasks_runner.py

Lines changed: 1 addition & 1 deletion

@@ -133,8 +133,8 @@ async def _run_task(self, task: Task, session, queued_timestamp, lock):
             else:
                 self.logger.info({'message': f'Unknown task type: {task.bg_worker_task}'})
         except Exception:  # pylint: disable=broad-except
-            await session.rollback()
             self.logger.exception({'message': 'Exception running task', 'task': task.bg_worker_task})
+            await session.rollback()


 class BaseWorkerSettings():
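
The only functional change in this file is the ordering inside the except block: the exception is now logged before the session rollback, presumably so the failure is still recorded even if the rollback itself raises. A minimal sketch of the resulting pattern, using hypothetical names in place of the real worker objects:

    async def run_task_safely(task, session, logger):
        """Log first, roll back second: a failing rollback can no longer hide the original error."""
        try:
            await task.run(session)
        except Exception:  # pylint: disable=broad-except
            logger.exception({'message': 'Exception running task'})
            await session.rollback()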

backend/deepchecks_monitoring/ee/bgtasks/object_storage_ingestor.py

Lines changed: 16 additions & 15 deletions

@@ -146,9 +146,9 @@ async def run(self, task: 'Task', session: AsyncSession, resources_provider: Res
             version_prefixes = model_prefixes if version.latest_file_time is not None else ['']
             for prefix in version_prefixes:
                 for df, time in self.ingest_prefix(s3, bucket, f'{version_path}/{prefix}', version.latest_file_time,
-                                                   errors, version.model_id, version.id):
-                    # For each file, set lock expiry to 240 seconds from now
-                    await lock.extend(240, replace_ttl=True)
+                                                   errors, version.model_id, version.id, need_ts=True):
+                    # For each file, set lock expiry to 360 seconds from now
+                    await lock.extend(360, replace_ttl=True)
                     await self.ingestion_backend.log_samples(version, df, session, organization_id, new_scan_time)
                     version.latest_file_time = max(version.latest_file_time or
                                                    pdl.datetime(year=1970, month=1, day=1), time)
@@ -158,8 +158,8 @@ async def run(self, task: 'Task', session: AsyncSession, resources_provider: Res
                 labels_path = f'{model_path}/labels/{prefix}'
                 for df, time in self.ingest_prefix(s3, bucket, labels_path, model.latest_labels_file_time,
                                                    errors, model_id):
-                    # For each file, set lock expiry to 240 seconds from now
-                    await lock.extend(240, replace_ttl=True)
+                    # For each file, set lock expiry to 360 seconds from now
+                    await lock.extend(360, replace_ttl=True)
                     await self.ingestion_backend.log_labels(model, df, session, organization_id)
                     model.latest_labels_file_time = max(model.latest_labels_file_time
                                                         or pdl.datetime(year=1970, month=1, day=1), time)
@@ -175,7 +175,8 @@ async def run(self, task: 'Task', session: AsyncSession, resources_provider: Res
         self.logger.info({'message': 'finished job', 'worker name': str(type(self)),
                           'task': task.id, 'model_id': model_id, 'org_id': organization_id})

-    def ingest_prefix(self, s3, bucket, prefix, last_file_time, errors, model_id, version_id=None):
+    def ingest_prefix(self, s3, bucket, prefix, last_file_time, errors,
+                      model_id, version_id=None, need_ts: bool = False):
         """Ingest all files in prefix, return df and file time"""
         last_file_time = last_file_time or pdl.datetime(year=1970, month=1, day=1)
         # First read all file names, then retrieve them sorted by date
@@ -226,15 +227,15 @@ def ingest_prefix(self, s3, bucket, prefix, last_file_time, errors, model_id, ve
                 self._handle_error(errors, f'Invalid file extension: {file["extension"]}, for file: {file["key"]}',
                                    model_id, version_id)
                 continue
-
-            if SAMPLE_TS_COL not in df or not is_integer_dtype(df[SAMPLE_TS_COL]):
-                self._handle_error(errors, f'Invalid timestamp column: {SAMPLE_TS_COL}, in file: {file["key"]}',
-                                   model_id, version_id)
-                continue
-            # The user facing API requires unix timestamps, but for the ingestion we convert it to ISO format
-            df[SAMPLE_TS_COL] = df[SAMPLE_TS_COL].apply(lambda x: pdl.from_timestamp(x).isoformat())
-            # Sort by timestamp
-            df = df.sort_values(by=[SAMPLE_TS_COL])
+            if need_ts:
+                if SAMPLE_TS_COL not in df or not is_integer_dtype(df[SAMPLE_TS_COL]):
+                    self._handle_error(errors, f'Invalid timestamp column: {SAMPLE_TS_COL}, in file: {file["key"]}',
+                                       model_id, version_id)
+                    continue
+                # The user facing API requires unix timestamps, but for the ingestion we convert it to ISO format
+                df[SAMPLE_TS_COL] = df[SAMPLE_TS_COL].apply(lambda x: pdl.from_timestamp(x).isoformat())
+                # Sort by timestamp
+                df = df.sort_values(by=[SAMPLE_TS_COL])
             yield df, file['time']

     def _handle_error(self, errors, error_message, model_id=None, model_version_id=None, set_warning_in_logs=True):
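
These hunks look like the core of the label-ingestion fix: label files presumably carry no timestamp column, so the previously unconditional SAMPLE_TS_COL check rejected them, while prediction files still need it. The new need_ts flag is passed as True only for the prediction-data prefixes and left at its default False for the labels prefix; the per-file lock TTL is also raised from 240 to 360 seconds. A minimal sketch of the gating logic under those assumptions (simplified signature and a hypothetical file iterator, not the real ingestor API):

    import pendulum as pdl
    from pandas.api.types import is_integer_dtype

    SAMPLE_TS_COL = '_dc_time'  # assumed value of the constant used in the diff

    def ingest_files(files, need_ts: bool = False):
        """Yield (df, file_time); only prediction data (need_ts=True) gets timestamp handling."""
        for df, file_time in files:
            if need_ts:
                # prediction files must carry an integer unix-timestamp column
                if SAMPLE_TS_COL not in df or not is_integer_dtype(df[SAMPLE_TS_COL]):
                    continue  # report-and-skip rather than failing the whole scan
                # convert unix timestamps to ISO format and sort, as the real code does
                df[SAMPLE_TS_COL] = df[SAMPLE_TS_COL].apply(lambda x: pdl.from_timestamp(x).isoformat())
                df = df.sort_values(by=[SAMPLE_TS_COL])
            yield df, file_time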

backend/deepchecks_monitoring/logic/data_ingestion.py

Lines changed: 70 additions & 47 deletions

@@ -185,7 +185,8 @@ async def log_labels(
         cache_functions,
         logger
 ):
-    valid_data = {}
+    unbatched_valid_data = pd.Series()
+    logged_ids = set()
     labels_table_columns = model.get_sample_labels_columns()
     labels_table_json_schema = {
         "type": "object",
@@ -198,29 +199,51 @@ async def log_labels(
     }

     validator = t.cast(t.Callable[..., t.Any], fastjsonschema.compile(labels_table_json_schema))
-
+    errors = []
     for sample in data:
         try:
             validator(sample)
-        except fastjsonschema.JsonSchemaValueException:
-            pass
-            # TODO: new table for model ingestion errors?
+        except fastjsonschema.JsonSchemaValueException as e:
+            errors.append({
+                "sample": str(sample),
+                "sample_id": sample.get(SAMPLE_ID_COL),
+                "error": f"Exception saving label: {str(e)}, for id: {sample.get(SAMPLE_ID_COL)}",
+                "model_id": model.id,
+            })
         else:
-            error = None
             # If got same index more than once, log it as error
-            if sample[SAMPLE_ID_COL] in valid_data:
-                error = f"Got duplicate sample id: {sample[SAMPLE_ID_COL]}"
+            if sample[SAMPLE_ID_COL] in logged_ids:
+                errors.append({
+                    "sample": str(sample),
+                    "sample_id": sample.get(SAMPLE_ID_COL),
+                    "error": f"Got duplicate label for sample id: {sample[SAMPLE_ID_COL]}. "
+                             f"{sample.get(SAMPLE_LABEL_COL)} vs "
+                             f"{unbatched_valid_data[sample[SAMPLE_ID_COL]].get(SAMPLE_LABEL_COL)}",
+                    "model_id": model.id,
+                })
+            else:
+                unbatched_valid_data[sample[SAMPLE_ID_COL]] = sample
+                logged_ids.add(sample[SAMPLE_ID_COL])

-            if not error:
-                valid_data[sample[SAMPLE_ID_COL]] = sample
+    await save_failures(session, errors, logger)

-    if valid_data:
+    if len(unbatched_valid_data) == 0:
+        return
+    max_messages_per_insert = QUERY_PARAM_LIMIT // 5
+    ids_to_log = unbatched_valid_data.keys()
+    for start_index in range(0, len(ids_to_log), max_messages_per_insert):
+        valid_data = unbatched_valid_data[ids_to_log[start_index:start_index + max_messages_per_insert]]
         # Query from the ids mapping all the relevant versions per each version. This is needed in order to query
         # the timestamps to invalidate the monitors cache
         versions_table = model.get_samples_versions_map_table(session)
-        versions_select = (select(versions_table.c["version_id"], array_agg(versions_table.c[SAMPLE_ID_COL]))
-                           .where(versions_table.c[SAMPLE_ID_COL].in_(list(valid_data.keys())))
-                           .group_by(versions_table.c["version_id"]))
+        versions_select = (
+            select(
+                versions_table.c["version_id"],
+                array_agg(versions_table.c[SAMPLE_ID_COL])
+            )
+            .where(versions_table.c[SAMPLE_ID_COL].in_(list(valid_data.keys())))
+            .group_by(versions_table.c["version_id"])
+        )
         results = (await session.execute(versions_select)).all()

         # Validation of classes amount for binary tasks
@@ -245,39 +268,39 @@ async def log_labels(
                 del valid_data[sample_id]
         await save_failures(session, errors, logger)

-    if valid_data:
-        # update label statistics
-        for row in results:
-            version_id = row[0]
-            sample_ids = [sample_id for sample_id in row[1] if sample_id in valid_data]
-            model_version: ModelVersion = \
-                (await session.execute(select(ModelVersion).where(ModelVersion.id == version_id))).scalars().first()
-            updated_statistics = copy.deepcopy(model_version.statistics)
-            for sample_id in sample_ids:
-                update_statistics_from_sample(updated_statistics, valid_data[sample_id])
-            if model_version.statistics != updated_statistics:
-                await model_version.update_statistics(updated_statistics, session)
-
-        # Insert or update all labels
-        labels_table = model.get_sample_labels_table(session)
-        insert_statement = postgresql.insert(labels_table)
-        upsert_statement = insert_statement.on_conflict_do_update(
-            index_elements=[SAMPLE_ID_COL],
-            set_={SAMPLE_LABEL_COL: insert_statement.excluded[SAMPLE_LABEL_COL]}
-        )
-        await session.execute(upsert_statement, list(valid_data.values()))
-
-        for row in results:
-            version_id = row[0]
-            sample_ids = [sample_id for sample_id in row[1] if sample_id in valid_data]
-            monitor_table_name = get_monitor_table_name(model.id, version_id)
-            ts_select = (select(Column(SAMPLE_TS_COL))
-                         .select_from(text(monitor_table_name))
-                         .where(Column(SAMPLE_ID_COL).in_(sample_ids)))
-            timestamps_affected = [pdl.instance(x) for x in (await session.execute(ts_select)).scalars()]
-            await add_cache_invalidation(org_id, version_id, timestamps_affected, session, cache_functions)
-
-        model.last_update_time = pdl.now()
+        if len(valid_data) > 0:
+            # update label statistics
+            for row in results:
+                version_id = row[0]
+                sample_ids = [sample_id for sample_id in row[1] if sample_id in valid_data]
+                model_version: ModelVersion = \
+                    (await session.execute(select(ModelVersion).where(ModelVersion.id == version_id))).scalars().first()
+                updated_statistics = copy.deepcopy(model_version.statistics)
+                for sample_id in sample_ids:
+                    update_statistics_from_sample(updated_statistics, valid_data[sample_id])
+                if model_version.statistics != updated_statistics:
+                    await model_version.update_statistics(updated_statistics, session)
+
+            # Insert or update all labels
+            labels_table = model.get_sample_labels_table(session)
+            insert_statement = postgresql.insert(labels_table)
+            upsert_statement = insert_statement.on_conflict_do_update(
+                index_elements=[SAMPLE_ID_COL],
+                set_={SAMPLE_LABEL_COL: insert_statement.excluded[SAMPLE_LABEL_COL]}
+            )
+            await session.execute(upsert_statement, valid_data.tolist())
+
+            for row in results:
+                version_id = row[0]
+                sample_ids = [sample_id for sample_id in row[1] if sample_id in valid_data]
+                monitor_table_name = get_monitor_table_name(model.id, version_id)
+                ts_select = (select(Column(SAMPLE_TS_COL))
+                             .select_from(text(monitor_table_name))
+                             .where(Column(SAMPLE_ID_COL).in_(sample_ids)))
+                timestamps_affected = [pdl.instance(x) for x in (await session.execute(ts_select)).scalars()]
+                await add_cache_invalidation(org_id, version_id, timestamps_affected, session, cache_functions)
+
+            model.last_update_time = pdl.now()


 async def add_cache_invalidation(organization_id, model_version_id, timestamps_updated, session, cache_functions):
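
Two things change in log_labels: schema-validation failures and duplicate sample ids are now recorded via save_failures instead of being silently dropped, and the valid samples are upserted in chunks of QUERY_PARAM_LIMIT // 5 rather than in a single statement, presumably to stay under the database driver's bind-parameter limit when a large label file arrives. A minimal sketch of the chunking pattern (the constant's value here is only an illustrative assumption; the real one is defined elsewhere in the module):

    QUERY_PARAM_LIMIT = 32_000              # illustrative assumption only
    MAX_ROWS_PER_INSERT = QUERY_PARAM_LIMIT // 5

    def batched(rows, batch_size):
        """Yield successive slices of rows, each small enough for a single INSERT/UPSERT."""
        for start in range(0, len(rows), batch_size):
            yield rows[start:start + batch_size]

    # usage sketch:
    # for batch in batched(list(valid_rows), MAX_ROWS_PER_INSERT):
    #     await session.execute(upsert_statement, batch)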

backend/tests/api/test_data_input.py

Lines changed: 1 addition & 1 deletion

@@ -225,7 +225,7 @@ async def test_log_labels_non_existing_samples(
         model_id=classification_model["id"],
         data=[{
             "_dc_sample_id": "not exists",
-            "c": 0
+            "_dc_label": "0"
         }]
     )
     # Assert
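
The test payload is corrected to use the label column rather than what looks like a stray feature key. Based on this test, a labels payload is a list of objects keyed by the sample-id and label columns; for example (ids and values are illustrative):

    labels_payload = [
        {"_dc_sample_id": "sample-1", "_dc_label": "0"},
        {"_dc_sample_id": "sample-2", "_dc_label": "1"},
    ]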
