From e73d1bd7b21ee8161c60431575a29c4c9fe9b986 Mon Sep 17 00:00:00 2001 From: Hubert Krzywonos Date: Fri, 13 Feb 2026 13:05:48 +0100 Subject: [PATCH 1/2] - renamed "redis" folder with import/export scripts to avoid dependency naming issues - config.ini - added a variable for the output directory of dump_index - dump_index - added a main function call, added reading output directory from config - cits2redis - added main function call, removed broken if checks, Windows fix for csv field limit - cnc - added main function call - meta2redis - added main function call, fix for reading unarchived CSV files, Windows fix for CSV field limit --- config.ini | 4 ++ scripts/cits2redis.py | 36 ++++++++++-------- scripts/cnc.py | 3 ++ scripts/dump_index.py | 5 ++- scripts/meta2redis.py | 14 +++++-- .../__export_redis_db__.py | 0 .../__import_redis_db__.py | 0 .../{redis => redis_importexport}/__init__.py | 0 .../__pycache__/__init__.cpython-313.pyc | Bin 9 files changed, 42 insertions(+), 20 deletions(-) rename scripts/{redis => redis_importexport}/__export_redis_db__.py (100%) mode change 100755 => 100644 rename scripts/{redis => redis_importexport}/__import_redis_db__.py (100%) mode change 100755 => 100644 rename scripts/{redis => redis_importexport}/__init__.py (100%) rename scripts/{redis => redis_importexport}/__pycache__/__init__.cpython-313.pyc (100%) diff --git a/config.ini b/config.ini index 80805ed..8c329ed 100755 --- a/config.ini +++ b/config.ini @@ -14,6 +14,10 @@ host=127.0.0.1 port=6379 batch_size=10000 +# Output folder for dump_index +[dump] +output="_out_" + [cnc] # ORCID API key to be used to query the ORCID API orcid= diff --git a/scripts/cits2redis.py b/scripts/cits2redis.py index 2112319..19d5648 100755 --- a/scripts/cits2redis.py +++ b/scripts/cits2redis.py @@ -29,11 +29,15 @@ _config = get_config() _logger = get_logger() -csv.field_size_limit(sys.maxsize) +if os.name != "nt": + csv.field_size_limit(sys.maxsize) +else: + csv.field_size_limit(2**31 - 1) + rconn = Redis( host=_config.get("redis", "host"), port=_config.get("redis", "port"), - db=_config.get("redis", "db_cits") + db=_config.get("cnc", "db_cits") ) def upload2redis(dump_path="", intype=""): @@ -65,23 +69,22 @@ def upload2redis(dump_path="", intype=""): if filename.endswith(".ttl"): with open(filename, "r", encoding="utf-8") as f: for line in f: - if needle in line: - # extract the part between "ci/" and ">" - start = line.find("ci/") + 3 - end = line.find(">", start) - oci = line[start:end] - all_ocis.append(oci) + # extract the part between "ci/" and ">" + start = line.find("ci/") + 3 + end = line.find(">", start) + oci = line[start:end] + all_ocis.append(oci) elif intype == "TTL": for filename in os.listdir(dump_path): if filename.endswith(".ttl"): - with open(filename, "r", encoding="utf-8") as f: + full_path = os.path.join(dump_path, filename) + with open(full_path, "r", encoding="utf-8") as f: for line in f: - if needle in line: - # extract the part between "ci/" and ">" - start = line.find("ci/") + 3 - end = line.find(">", start) - oci = line[start:end] - all_ocis.append(oci) + # extract the part between "ci/" and ">" + start = line.find("ci/") + 3 + end = line.find(">", start) + oci = line[start:end] + all_ocis.append(oci) for oci in all_ocis: citing = oci.split("-")[0] @@ -112,3 +115,6 @@ def main(): _logger.info("Uploading citations in RDF format to Redis ...") upload2redis(args.dump,args.intype) _logger.info("Done!") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/cnc.py b/scripts/cnc.py index 7778a31..b58603d 100755 --- a/scripts/cnc.py +++ b/scripts/cnc.py @@ -405,3 +405,6 @@ def main(): # 4. Continue with the rest of your code **after all files are done** # e.g., merging outputs, generating RDF/CSV summary, logging, etc. # >> post_processing(output_dir) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/dump_index.py b/scripts/dump_index.py index 1989ccd..773f13c 100644 --- a/scripts/dump_index.py +++ b/scripts/dump_index.py @@ -45,12 +45,12 @@ source = _config.get("INDEX", "source") service_name = _config.get("INDEX", "service") index_identifier = _config.get("INDEX", "identifier") +FILE_OUTPUT_DIR = _config.get("dump", "output") # === CONFIGURATION === CITED_BATCH_SIZE = 1500 CITED_PER_FILE = 10000 FILES_PER_ZIP = 1000 -FILE_OUTPUT_DIR = "_out_" def zip_and_cleanup(csv_dir, rdf_dir, slx_dir, files_per_zip, force = False, pnum=1): @@ -331,3 +331,6 @@ def process_pair(pairs, pnum, br_meta, end_cursor = False): force = end_cursor, pnum = pnum ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/meta2redis.py b/scripts/meta2redis.py index f49b5d2..75f78f4 100755 --- a/scripts/meta2redis.py +++ b/scripts/meta2redis.py @@ -31,7 +31,10 @@ from oc.index.utils.config import get_config _config = get_config() -csv.field_size_limit(sys.maxsize) +if os.name != "nt": + csv.field_size_limit(sys.maxsize) +else: + csv.field_size_limit(2**31 - 1) # glob indexes br_ids = _config.get("cnc", "br_ids").split(",") @@ -97,7 +100,7 @@ def _p_csvfile(a_csv_file,csv_name,rconn_db_br, rconn_db_ra, rconn_db_metadata): db_ra_buffer = [] db_metadata_buffer = [] - l_brs = list(csv.DictReader(io.TextIOWrapper(a_csv_file))) + l_brs = list(csv.DictReader(io.TextIOWrapper(a_csv_file, encoding="utf-8"))) # walk through each citation in the CSV logger.info("Walking through all the "+str( len(l_brs) )+" BRs (rows) in: "+str(csv_name) ) @@ -203,7 +206,7 @@ def upload2redis(dump_path="", redishost="localhost", redisport="6379", redisbat # Handle single CSV file csv_name = os.path.basename(dump_path) logger.info(f"CSV: Processing direct CSV file: {csv_name}") - with open(dump_path, 'r', encoding='utf-8') as csv_file: + with open(dump_path, 'rb') as csv_file: _p_csvfile(csv_file,csv_name, rconn_db_br, rconn_db_ra, rconn_db_metadata) else: logger.warning(f"Unsupported file type: {dump_path}") @@ -219,7 +222,7 @@ def upload2redis(dump_path="", redishost="localhost", redisport="6379", redisbat if filename.endswith(".csv"): logger.info(f"CSV: Processing direct CSV file: {filename}") - with open(filepath, 'r', encoding='utf-8') as csv_file: + with open(filepath, 'rb') as csv_file: _p_csvfile(csv_file, filename, rconn_db_br, rconn_db_ra, rconn_db_metadata) else: logger.error(f"Path does not exist or is neither a file nor directory: {dump_path}") @@ -265,3 +268,6 @@ def main(): ) logger.info("A total of unique "+str(res[0])+" BR OMIDs and "+str(res[1])+" RA OMIDs have been found and added to Redis.") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/redis/__export_redis_db__.py b/scripts/redis_importexport/__export_redis_db__.py old mode 100755 new mode 100644 similarity index 100% rename from scripts/redis/__export_redis_db__.py rename to scripts/redis_importexport/__export_redis_db__.py diff --git a/scripts/redis/__import_redis_db__.py b/scripts/redis_importexport/__import_redis_db__.py old mode 100755 new mode 100644 similarity index 100% rename from scripts/redis/__import_redis_db__.py rename to scripts/redis_importexport/__import_redis_db__.py diff --git a/scripts/redis/__init__.py b/scripts/redis_importexport/__init__.py similarity index 100% rename from scripts/redis/__init__.py rename to scripts/redis_importexport/__init__.py diff --git a/scripts/redis/__pycache__/__init__.cpython-313.pyc b/scripts/redis_importexport/__pycache__/__init__.cpython-313.pyc similarity index 100% rename from scripts/redis/__pycache__/__init__.cpython-313.pyc rename to scripts/redis_importexport/__pycache__/__init__.cpython-313.pyc From 92a54fe9ac7706e4c0db04b539ebcb786ab97a94 Mon Sep 17 00:00:00 2001 From: Hubert Krzywonos Date: Fri, 13 Feb 2026 15:39:06 +0100 Subject: [PATCH 2/2] dump_index - added hardcoded output value for the automated workflow script (can't get the ConfigParser to read ["dump"]) --- scripts/dump_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/dump_index.py b/scripts/dump_index.py index 773f13c..ce3dda3 100644 --- a/scripts/dump_index.py +++ b/scripts/dump_index.py @@ -45,7 +45,7 @@ source = _config.get("INDEX", "source") service_name = _config.get("INDEX", "service") index_identifier = _config.get("INDEX", "identifier") -FILE_OUTPUT_DIR = _config.get("dump", "output") +FILE_OUTPUT_DIR = "../oc_autoworkflow/dir/output/index" # === CONFIGURATION === CITED_BATCH_SIZE = 1500