diff --git a/rorapi/common/views.py b/rorapi/common/views.py index b28c770..a180c01 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -303,11 +303,10 @@ class IndexDataDump(APIView): permission_classes = [OurTokenPermission] def get(self, request, filename, dataenv, version=REST_FRAMEWORK["DEFAULT_VERSION"]): - schema = 1 + # Always use v2 schema - v1 indexing support has been removed + schema = 2 testdata = True st = 200 - if version == 'v2': - schema = 2 if dataenv == 'prod': testdata = False msg = management.call_command("setup", filename, schema=schema, testdata=testdata) diff --git a/rorapi/management/commands/createindex.py b/rorapi/management/commands/createindex.py index 68c37f6..5b936ef 100644 --- a/rorapi/management/commands/createindex.py +++ b/rorapi/management/commands/createindex.py @@ -13,12 +13,8 @@ def create_index(self, index, template_file): self.stdout.write('Created index {}'.format(index)) class Command(BaseCommand): - help = 'Create ROR API index' + help = 'Create ROR API v2 index' def handle(self, *args, **options): - if(options['schema']==1 or options['schema'] is None): - print("creating v1 index") - create_index(self, ES_VARS['INDEX_V1'], ES_VARS['INDEX_TEMPLATE_ES7_V1']) - if(options['schema']==2 or options['schema'] is None): - print("creating v2 index") - create_index(self, ES_VARS['INDEX_V2'], ES_VARS['INDEX_TEMPLATE_ES7_V2']) \ No newline at end of file + self.stdout.write("creating v2 index") + create_index(self, ES_VARS['INDEX_V2'], ES_VARS['INDEX_TEMPLATE_ES7_V2']) \ No newline at end of file diff --git a/rorapi/management/commands/deleteindex.py b/rorapi/management/commands/deleteindex.py index 9d389fa..2d39877 100644 --- a/rorapi/management/commands/deleteindex.py +++ b/rorapi/management/commands/deleteindex.py @@ -10,13 +10,11 @@ def delete_index(self, index): self.stdout.write('Index {} does not exist'.format(index)) class Command(BaseCommand): - help = 'Deletes ROR API index' + help = 'Deletes ROR API v2 index' def handle(self, *args, **options): - if(options['schema']==1 or options['schema'] is None): - print("deleting v1 index") - delete_index(self, ES_VARS['INDEX_V1']) - if(options['schema']==2 or options['schema'] is None): - print("deleting v2 index") + schema = options.get('schema', 2) + if schema == 2 or schema is None: + self.stdout.write("deleting v2 index") delete_index(self, ES_VARS['INDEX_V2']) diff --git a/rorapi/management/commands/indexror.py b/rorapi/management/commands/indexror.py index c86b720..228f17b 100644 --- a/rorapi/management/commands/indexror.py +++ b/rorapi/management/commands/indexror.py @@ -13,30 +13,10 @@ from django.core.management.base import BaseCommand from elasticsearch import TransportError -def get_nested_names_v1(org): - yield org['name'] - for label in org['labels']: - yield label['label'] - for alias in org['aliases']: - yield alias - for acronym in org['acronyms']: - yield acronym - def get_nested_names_v2(org): for name in org['names']: yield name['value'] -def get_nested_ids_v1(org): - yield org['id'] - yield re.sub('https://', '', org['id']) - yield re.sub('https://ror.org/', '', org['id']) - for ext_name, ext_id in org['external_ids'].items(): - if ext_name == 'GRID': - yield ext_id['all'] - else: - for eid in ext_id['all']: - yield eid - def get_nested_ids_v2(org): yield org['id'] yield re.sub('https://', '', org['id']) @@ -150,10 +130,10 @@ def process_files(dir, version): def index(dataset, version): err = {} - if version == 'v2': - index = ES_VARS['INDEX_V2'] - else: - index = ES_VARS['INDEX_V1'] + if version != 'v2': + err[index.__name__] = f"Only v2 schema version is supported. Received: {version}" + return err + index = ES_VARS['INDEX_V2'] backup_index = '{}-tmp'.format(index) ES7.reindex(body={ 'source': { @@ -174,22 +154,14 @@ def index(dataset, version): '_id': org['id'] } }) - if 'v2' in index: - org['names_ids'] = [{ - 'name': n - } for n in get_nested_names_v2(org)] - org['names_ids'] += [{ - 'id': n - } for n in get_nested_ids_v2(org)] - # experimental affiliations_match nested doc - org['affiliation_match'] = get_affiliation_match_doc(org) - else: - org['names_ids'] = [{ - 'name': n - } for n in get_nested_names_v1(org)] - org['names_ids'] += [{ - 'id': n - } for n in get_nested_ids_v1(org)] + org['names_ids'] = [{ + 'name': n + } for n in get_nested_names_v2(org)] + org['names_ids'] += [{ + 'id': n + } for n in get_nested_ids_v2(org)] + # experimental affiliations_match nested doc + org['affiliation_match'] = get_affiliation_match_doc(org) body.append(org) ES7.bulk(body) except TransportError: @@ -211,11 +183,10 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('dir', type=str, help='add directory name for S3 bucket to be processed') - parser.add_argument('version', type=str, help='schema version of files to be processed') def handle(self,*args, **options): dir = options['dir'] - version = options['version'] + version = 'v2' process_files(dir, version) diff --git a/rorapi/management/commands/indexrordump.py b/rorapi/management/commands/indexrordump.py index 6719705..e4ce244 100644 --- a/rorapi/management/commands/indexrordump.py +++ b/rorapi/management/commands/indexrordump.py @@ -12,30 +12,10 @@ HEADERS = {'Accept': 'application/vnd.github.v3+json'} -def get_nested_names_v1(org): - yield org['name'] - for label in org['labels']: - yield label['label'] - for alias in org['aliases']: - yield alias - for acronym in org['acronyms']: - yield acronym - def get_nested_names_v2(org): for name in org['names']: yield name['value'] -def get_nested_ids_v1(org): - yield org['id'] - yield re.sub('https://', '', org['id']) - yield re.sub('https://ror.org/', '', org['id']) - for ext_name, ext_id in org['external_ids'].items(): - if ext_name == 'GRID': - yield ext_id['all'] - else: - for eid in ext_id['all']: - yield eid - def get_nested_ids_v2(org): yield org['id'] yield re.sub('https://', '', org['id']) @@ -81,22 +61,14 @@ def index_dump(self, filename, index, dataset): '_id': org['id'] } }) - if 'v2' in index: - org['names_ids'] = [{ - 'name': n - } for n in get_nested_names_v2(org)] - org['names_ids'] += [{ - 'id': n - } for n in get_nested_ids_v2(org)] - # experimental affiliations_match nested doc - org['affiliation_match'] = get_affiliation_match_doc(org) - else: - org['names_ids'] = [{ - 'name': n - } for n in get_nested_names_v1(org)] - org['names_ids'] += [{ - 'id': n - } for n in get_nested_ids_v1(org)] + org['names_ids'] = [{ + 'name': n + } for n in get_nested_names_v2(org)] + org['names_ids'] += [{ + 'id': n + } for n in get_nested_ids_v2(org)] + # experimental affiliations_match nested doc + org['affiliation_match'] = get_affiliation_match_doc(org) body.append(org) ES7.bulk(body) except TransportError: @@ -134,22 +106,25 @@ def handle(self, *args, **options): json_files.append(file) if json_files: for json_file in json_files: - index = None json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file - if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None): + # Check if file is v2.0+ format or legacy schema_v2 format + version_match = re.match(r'v(\d+)\.(\d+)', json_file) + is_v2_format = False + if version_match: + major, minor = map(int, version_match.groups()) + if major >= 2: + is_v2_format = True + elif 'schema_v2' in json_file: + # Legacy format with schema_v2 in filename + is_v2_format = True + + if is_v2_format and (options.get('schema') == 2 or options.get('schema') is None): self.stdout.write('Loading JSON') with open(json_path, 'r') as it: dataset = json.load(it) self.stdout.write('Indexing ROR dataset ' + json_file) index = ES_VARS['INDEX_V2'] index_dump(self, json_file, index, dataset) - if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None): - self.stdout.write('Loading JSON') - with open(json_path, 'r') as it: - dataset = json.load(it) - self.stdout.write('Indexing ROR dataset ' + json_file) - index = ES_VARS['INDEX_V1'] - index_dump(self, json_file, index, dataset) else: self.stdout.write("ROR data dump does not contain any JSON files") diff --git a/rorapi/management/commands/setup.py b/rorapi/management/commands/setup.py index a87b7f4..0795078 100644 --- a/rorapi/management/commands/setup.py +++ b/rorapi/management/commands/setup.py @@ -36,7 +36,7 @@ class Command(BaseCommand): def add_arguments(self, parser): parser.add_argument('filename', type=str, help='Name of data dump zip file to index without extension') - parser.add_argument('-s', '--schema', type=int, choices=[1, 2], help='Schema version to index if only indexing 1 version. Only set if not indexing both versions.') + parser.add_argument('-s', '--schema', type=int, choices=[2], default=2, help='Schema version to index (v2 only)') parser.add_argument('-t', '--testdata', action='store_true', help='Set flag to pull data dump from ror-data-test instead of ror-data') def handle(self, *args, **options): @@ -57,7 +57,7 @@ def handle(self, *args, **options): DeleteIndexCommand().handle(*args, **options) CreateIndexCommand().handle(*args, **options) IndexRorDumpCommand().handle(*args, **options) - msg = 'SUCCESS: ROR dataset {} indexed in version {}. Using test repo: {}'.format(filename, str(options['schema']), str(use_test_data)) + msg = 'SUCCESS: ROR dataset {} indexed in v2. Using test repo: {}'.format(filename, str(use_test_data)) except: msg = 'ERROR: Could not index ROR data dump. Check API logs for details.' else: diff --git a/rorapi/settings.py b/rorapi/settings.py index 513550e..b0ade37 100644 --- a/rorapi/settings.py +++ b/rorapi/settings.py @@ -151,8 +151,7 @@ STATIC_ROOT = os.path.join(BASE_DIR, 'static/') ES_VARS = { - 'INDEX_V1': 'organizations', - 'INDEX_TEMPLATE_ES7_V1': os.path.join(BASE_DIR, 'rorapi', 'v1', 'index_template_es7.json'), + 'INDEX_V1': 'organizations', # Kept for v1 API queries (backward compatibility) 'INDEX_V2': 'organizations-v2', 'INDEX_TEMPLATE_ES7_V2': os.path.join(BASE_DIR, 'rorapi', 'v2', 'index_template_es7.json'), 'BATCH_SIZE': 20,