From a1217ab430c6e9a604918554c3dc16868265775a Mon Sep 17 00:00:00 2001 From: Michael Cochez Date: Tue, 29 Oct 2024 17:34:02 +0100 Subject: [PATCH 1/2] Implemented escaping of non-IRIs, closes #58. Trigger CI --- src/gqs/import_.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/gqs/import_.py b/src/gqs/import_.py index 740fce7..b383adf 100644 --- a/src/gqs/import_.py +++ b/src/gqs/import_.py @@ -20,6 +20,8 @@ import pickle import pathlib from typing import Any, Callable, Literal, Type, TypeVar, cast +from urllib.parse import ParseResult, urlparse, quote + from gqs.conversion import protobuf_builder from gqs.dataset import Dataset from gqs.mapping import RelationMapper, EntityMapper @@ -55,7 +57,16 @@ def _convert_mapper(id2X_file: pathlib.Path, target_file: pathlib.Path) -> None: with open(target_file, "w") as output: sep = "" for i in range(num_ids): - output.write(f"{sep}{mapping[i]}") + iri_raw: str = mapping[i] + try: + parsed: ParseResult = urlparse(iri_raw) + assert parsed.scheme + assert parsed.scheme != "gqs", f"the iri {iri_raw} in the imported dataset contains the protocol used by gqs to escape strings which are not valid URIs" + iri_clean: str = iri_raw + except ValueError | AssertionError: + iri_clean = f"gqs:{quote(iri_raw, safe='/')}" + logger.warning(f"identifier {iri_raw} is not a valid IRI, escaping to {iri_clean}") + output.write(f"{sep}{iri_clean}") sep = "\n" @@ -339,7 +350,7 @@ def _convert_queries(import_source: pathlib.Path, dataset: Dataset, lenient: boo output_file.write(proto_query_data.SerializeToString()) # We also need a stats file for this, creating that here stats_file_name = output_folder / f"{split}_stats.json" - stats = {"name": split, "count": len(query_instances), "hash": f"converted_from_{import_source}_{query_shape}"} + stats: dict[str, Any] = {"name": split, "count": len(query_instances), "hash": f"converted_from_{import_source}_{query_shape}"} try: with stats_file_name.open(mode="w") as stats_file: json.dump(stats, stats_file) From 053c18379dc4d813e032735e8401b9f78f70adb5 Mon Sep 17 00:00:00 2001 From: Michael Cochez Date: Thu, 7 Nov 2024 18:35:30 +0100 Subject: [PATCH 2/2] Corrected exception type --- src/gqs/import_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gqs/import_.py b/src/gqs/import_.py index b383adf..41c8127 100644 --- a/src/gqs/import_.py +++ b/src/gqs/import_.py @@ -63,7 +63,7 @@ def _convert_mapper(id2X_file: pathlib.Path, target_file: pathlib.Path) -> None: assert parsed.scheme assert parsed.scheme != "gqs", f"the iri {iri_raw} in the imported dataset contains the protocol used by gqs to escape strings which are not valid URIs" iri_clean: str = iri_raw - except ValueError | AssertionError: + except (ValueError , AssertionError): iri_clean = f"gqs:{quote(iri_raw, safe='/')}" logger.warning(f"identifier {iri_raw} is not a valid IRI, escaping to {iri_clean}") output.write(f"{sep}{iri_clean}")