Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ host=127.0.0.1
port=6379
batch_size=10000

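# Output folder for dump_index.
# NOTE(review): other values in this file are unquoted; if it is parsed with
# Python's configparser, the quotes below become part of the value - confirm.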
[dump]
output="_out_"

[cnc]
# ORCID API key to be used to query the ORCID API
orcid=
Expand Down
36 changes: 21 additions & 15 deletions scripts/cits2redis.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,15 @@

_config = get_config()
_logger = get_logger()
csv.field_size_limit(sys.maxsize)
if os.name != "nt":
csv.field_size_limit(sys.maxsize)
else:
csv.field_size_limit(2**31 - 1)

rconn = Redis(
host=_config.get("redis", "host"),
port=_config.get("redis", "port"),
db=_config.get("redis", "db_cits")
db=_config.get("cnc", "db_cits")
)

def upload2redis(dump_path="", intype=""):
Expand Down Expand Up @@ -65,23 +69,22 @@ def upload2redis(dump_path="", intype=""):
if filename.endswith(".ttl"):
with open(filename, "r", encoding="utf-8") as f:
for line in f:
if needle in line:
# extract the part between "ci/" and ">"
start = line.find("ci/") + 3
end = line.find(">", start)
oci = line[start:end]
all_ocis.append(oci)
# extract the part between "ci/" and ">"
start = line.find("ci/") + 3
end = line.find(">", start)
oci = line[start:end]
all_ocis.append(oci)
elif intype == "TTL":
for filename in os.listdir(dump_path):
if filename.endswith(".ttl"):
with open(filename, "r", encoding="utf-8") as f:
full_path = os.path.join(dump_path, filename)
with open(full_path, "r", encoding="utf-8") as f:
for line in f:
if needle in line:
# extract the part between "ci/" and ">"
start = line.find("ci/") + 3
end = line.find(">", start)
oci = line[start:end]
all_ocis.append(oci)
# extract the part between "ci/" and ">"
start = line.find("ci/") + 3
end = line.find(">", start)
oci = line[start:end]
all_ocis.append(oci)

for oci in all_ocis:
citing = oci.split("-")[0]
Expand Down Expand Up @@ -112,3 +115,6 @@ def main():
_logger.info("Uploading citations in RDF format to Redis ...")
upload2redis(args.dump,args.intype)
_logger.info("Done!")

if __name__ == "__main__":
main()
3 changes: 3 additions & 0 deletions scripts/cnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,3 +405,6 @@ def main():
# 4. Post-processing placeholder: continue here only after all files are done,
#    e.g. merging outputs, generating an RDF/CSV summary, logging, etc.
#    Example call (not implemented here): post_processing(output_dir)

if __name__ == "__main__":
main()
5 changes: 4 additions & 1 deletion scripts/dump_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,12 @@
source = _config.get("INDEX", "source")
service_name = _config.get("INDEX", "service")
index_identifier = _config.get("INDEX", "identifier")
FILE_OUTPUT_DIR = "../oc_autoworkflow/dir/output/index"

# === CONFIGURATION ===
CITED_BATCH_SIZE = 1500
CITED_PER_FILE = 10000
FILES_PER_ZIP = 1000
FILE_OUTPUT_DIR = "_out_"


def zip_and_cleanup(csv_dir, rdf_dir, slx_dir, files_per_zip, force = False, pnum=1):
Expand Down Expand Up @@ -331,3 +331,6 @@ def process_pair(pairs, pnum, br_meta, end_cursor = False):
force = end_cursor,
pnum = pnum
)

if __name__ == "__main__":
main()
14 changes: 10 additions & 4 deletions scripts/meta2redis.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@
from oc.index.utils.config import get_config

_config = get_config()
csv.field_size_limit(sys.maxsize)
if os.name != "nt":
csv.field_size_limit(sys.maxsize)
else:
csv.field_size_limit(2**31 - 1)

# glob indexes
br_ids = _config.get("cnc", "br_ids").split(",")
Expand Down Expand Up @@ -97,7 +100,7 @@ def _p_csvfile(a_csv_file,csv_name,rconn_db_br, rconn_db_ra, rconn_db_metadata):
db_ra_buffer = []
db_metadata_buffer = []

l_brs = list(csv.DictReader(io.TextIOWrapper(a_csv_file)))
l_brs = list(csv.DictReader(io.TextIOWrapper(a_csv_file, encoding="utf-8")))

# walk through each citation in the CSV
logger.info("Walking through all the "+str( len(l_brs) )+" BRs (rows) in: "+str(csv_name) )
Expand Down Expand Up @@ -203,7 +206,7 @@ def upload2redis(dump_path="", redishost="localhost", redisport="6379", redisbat
# Handle single CSV file
csv_name = os.path.basename(dump_path)
logger.info(f"CSV: Processing direct CSV file: {csv_name}")
with open(dump_path, 'r', encoding='utf-8') as csv_file:
with open(dump_path, 'rb') as csv_file:
_p_csvfile(csv_file,csv_name, rconn_db_br, rconn_db_ra, rconn_db_metadata)
else:
logger.warning(f"Unsupported file type: {dump_path}")
Expand All @@ -219,7 +222,7 @@ def upload2redis(dump_path="", redishost="localhost", redisport="6379", redisbat

if filename.endswith(".csv"):
logger.info(f"CSV: Processing direct CSV file: {filename}")
with open(filepath, 'r', encoding='utf-8') as csv_file:
with open(filepath, 'rb') as csv_file:
_p_csvfile(csv_file, filename, rconn_db_br, rconn_db_ra, rconn_db_metadata)
else:
logger.error(f"Path does not exist or is neither a file nor directory: {dump_path}")
Expand Down Expand Up @@ -265,3 +268,6 @@ def main():
)

logger.info("A total of unique "+str(res[0])+" BR OMIDs and "+str(res[1])+" RA OMIDs have been found and added to Redis.")

if __name__ == "__main__":
main()
File renamed without changes.
File renamed without changes.
File renamed without changes.