Merged
5 changes: 2 additions & 3 deletions rorapi/common/views.py
@@ -303,11 +303,10 @@ class IndexDataDump(APIView):
     permission_classes = [OurTokenPermission]
 
     def get(self, request, filename, dataenv, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
-        schema = 1
+        # Always use v2 schema - v1 indexing support has been removed
+        schema = 2
         testdata = True
         st = 200
-        if version == 'v2':
-            schema = 2
         if dataenv == 'prod':
             testdata = False
         msg = management.call_command("setup", filename, schema=schema, testdata=testdata)
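For reference (not part of the diff): the view above now always drives a v2 index build. The snippet below mirrors the management.call_command line shown; the dump filename is a hypothetical example.

    from django.core import management

    # Index a v2 data dump from the test repo; schema is always 2 after this change
    management.call_command("setup", "v1.58-2024-10-31-ror-data", schema=2, testdata=True)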
10 changes: 3 additions & 7 deletions rorapi/management/commands/createindex.py
@@ -13,12 +13,8 @@ def create_index(self, index, template_file):
     self.stdout.write('Created index {}'.format(index))
 
 class Command(BaseCommand):
-    help = 'Create ROR API index'
+    help = 'Create ROR API v2 index'
 
     def handle(self, *args, **options):
-        if(options['schema']==1 or options['schema'] is None):
-            print("creating v1 index")
-            create_index(self, ES_VARS['INDEX_V1'], ES_VARS['INDEX_TEMPLATE_ES7_V1'])
-        if(options['schema']==2 or options['schema'] is None):
-            print("creating v2 index")
-            create_index(self, ES_VARS['INDEX_V2'], ES_VARS['INDEX_TEMPLATE_ES7_V2'])
+        self.stdout.write("creating v2 index")
+        create_index(self, ES_VARS['INDEX_V2'], ES_VARS['INDEX_TEMPLATE_ES7_V2'])
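
For reference (not part of the diff): the body of create_index is collapsed above; a minimal sketch of what it presumably does, assuming it loads the template JSON and creates the index through the ES7 client configured in rorapi.settings.

    import json

    def create_index_sketch(es_client, index, template_file):
        # Load the v2 index template (settings/mappings) and create the index
        with open(template_file) as f:
            template = json.load(f)
        es_client.indices.create(index=index, body=template)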
10 changes: 4 additions & 6 deletions rorapi/management/commands/deleteindex.py
@@ -10,13 +10,11 @@ def delete_index(self, index):
         self.stdout.write('Index {} does not exist'.format(index))
 
 class Command(BaseCommand):
-    help = 'Deletes ROR API index'
+    help = 'Deletes ROR API v2 index'
 
     def handle(self, *args, **options):
-        if(options['schema']==1 or options['schema'] is None):
-            print("deleting v1 index")
-            delete_index(self, ES_VARS['INDEX_V1'])
-        if(options['schema']==2 or options['schema'] is None):
-            print("deleting v2 index")
+        schema = options.get('schema', 2)
+        if schema == 2 or schema is None:
+            self.stdout.write("deleting v2 index")
         delete_index(self, ES_VARS['INDEX_V2'])
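
For reference (not part of the diff): delete_index is likewise collapsed; a minimal sketch consistent with the 'does not exist' message above, assuming an existence check via the ES7 client before deletion.

    def delete_index_sketch(es_client, index):
        # Remove the index if present; otherwise report that it does not exist
        if es_client.indices.exists(index=index):
            es_client.indices.delete(index=index)
            print('Deleted index {}'.format(index))
        else:
            print('Index {} does not exist'.format(index))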

55 changes: 13 additions & 42 deletions rorapi/management/commands/indexror.py
@@ -13,30 +13,10 @@
 from django.core.management.base import BaseCommand
 from elasticsearch import TransportError
 
-def get_nested_names_v1(org):
-    yield org['name']
-    for label in org['labels']:
-        yield label['label']
-    for alias in org['aliases']:
-        yield alias
-    for acronym in org['acronyms']:
-        yield acronym
-
 def get_nested_names_v2(org):
     for name in org['names']:
         yield name['value']
 
-def get_nested_ids_v1(org):
-    yield org['id']
-    yield re.sub('https://', '', org['id'])
-    yield re.sub('https://ror.org/', '', org['id'])
-    for ext_name, ext_id in org['external_ids'].items():
-        if ext_name == 'GRID':
-            yield ext_id['all']
-        else:
-            for eid in ext_id['all']:
-                yield eid
-
 def get_nested_ids_v2(org):
     yield org['id']
     yield re.sub('https://', '', org['id'])
@@ -150,10 +130,10 @@ def process_files(dir, version):

 def index(dataset, version):
     err = {}
-    if version == 'v2':
-        index = ES_VARS['INDEX_V2']
-    else:
-        index = ES_VARS['INDEX_V1']
+    if version != 'v2':
+        err[index.__name__] = f"Only v2 schema version is supported. Received: {version}"
+        return err
+    index = ES_VARS['INDEX_V2']
     backup_index = '{}-tmp'.format(index)
     ES7.reindex(body={
         'source': {
@@ -174,22 +154,14 @@
                     '_id': org['id']
                 }
             })
-            if 'v2' in index:
-                org['names_ids'] = [{
-                    'name': n
-                } for n in get_nested_names_v2(org)]
-                org['names_ids'] += [{
-                    'id': n
-                } for n in get_nested_ids_v2(org)]
-                # experimental affiliations_match nested doc
-                org['affiliation_match'] = get_affiliation_match_doc(org)
-            else:
-                org['names_ids'] = [{
-                    'name': n
-                } for n in get_nested_names_v1(org)]
-                org['names_ids'] += [{
-                    'id': n
-                } for n in get_nested_ids_v1(org)]
+            org['names_ids'] = [{
+                'name': n
+            } for n in get_nested_names_v2(org)]
+            org['names_ids'] += [{
+                'id': n
+            } for n in get_nested_ids_v2(org)]
+            # experimental affiliations_match nested doc
+            org['affiliation_match'] = get_affiliation_match_doc(org)
             body.append(org)
         ES7.bulk(body)
     except TransportError:
Expand All @@ -211,11 +183,10 @@ class Command(BaseCommand):

     def add_arguments(self, parser):
         parser.add_argument('dir', type=str, help='add directory name for S3 bucket to be processed')
-        parser.add_argument('version', type=str, help='schema version of files to be processed')
 
     def handle(self,*args, **options):
         dir = options['dir']
-        version = options['version']
+        version = 'v2'
         process_files(dir, version)
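
For reference (not part of the diff): how the remaining v2 generators feed the names_ids field built in index(). The sample record is hypothetical and only uses fields that get_nested_names_v2 reads; the tail of get_nested_ids_v2 is collapsed in this diff.

    sample_org = {
        'id': 'https://ror.org/02mhbdp94',   # hypothetical ROR ID
        'names': [
            {'value': 'Example University', 'types': ['ror_display', 'label']},
            {'value': 'EU', 'types': ['acronym']},
        ],
    }

    # One nested entry per name value:
    names_ids = [{'name': n} for n in get_nested_names_v2(sample_org)]
    # -> [{'name': 'Example University'}, {'name': 'EU'}]

    # id entries come from get_nested_ids_v2; its first yields produce e.g.
    # [{'id': 'https://ror.org/02mhbdp94'}, {'id': 'ror.org/02mhbdp94'}, ...]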


65 changes: 20 additions & 45 deletions rorapi/management/commands/indexrordump.py
@@ -12,30 +12,10 @@

 HEADERS = {'Accept': 'application/vnd.github.v3+json'}
 
-def get_nested_names_v1(org):
-    yield org['name']
-    for label in org['labels']:
-        yield label['label']
-    for alias in org['aliases']:
-        yield alias
-    for acronym in org['acronyms']:
-        yield acronym
-
 def get_nested_names_v2(org):
     for name in org['names']:
         yield name['value']
 
-def get_nested_ids_v1(org):
-    yield org['id']
-    yield re.sub('https://', '', org['id'])
-    yield re.sub('https://ror.org/', '', org['id'])
-    for ext_name, ext_id in org['external_ids'].items():
-        if ext_name == 'GRID':
-            yield ext_id['all']
-        else:
-            for eid in ext_id['all']:
-                yield eid
-
 def get_nested_ids_v2(org):
     yield org['id']
     yield re.sub('https://', '', org['id'])
@@ -81,22 +61,14 @@ def index_dump(self, filename, index, dataset):
                     '_id': org['id']
                 }
             })
-            if 'v2' in index:
-                org['names_ids'] = [{
-                    'name': n
-                } for n in get_nested_names_v2(org)]
-                org['names_ids'] += [{
-                    'id': n
-                } for n in get_nested_ids_v2(org)]
-                # experimental affiliations_match nested doc
-                org['affiliation_match'] = get_affiliation_match_doc(org)
-            else:
-                org['names_ids'] = [{
-                    'name': n
-                } for n in get_nested_names_v1(org)]
-                org['names_ids'] += [{
-                    'id': n
-                } for n in get_nested_ids_v1(org)]
+            org['names_ids'] = [{
+                'name': n
+            } for n in get_nested_names_v2(org)]
+            org['names_ids'] += [{
+                'id': n
+            } for n in get_nested_ids_v2(org)]
+            # experimental affiliations_match nested doc
+            org['affiliation_match'] = get_affiliation_match_doc(org)
             body.append(org)
         ES7.bulk(body)
     except TransportError:
@@ -134,22 +106,25 @@ def handle(self, *args, **options):
                 json_files.append(file)
         if json_files:
             for json_file in json_files:
-                index = None
                 json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
-                if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
+                # Check if file is v2.0+ format or legacy schema_v2 format
+                version_match = re.match(r'v(\d+)\.(\d+)', json_file)
+                is_v2_format = False
+                if version_match:
+                    major, minor = map(int, version_match.groups())
+                    if major >= 2:
+                        is_v2_format = True
+                elif 'schema_v2' in json_file:
+                    # Legacy format with schema_v2 in filename
+                    is_v2_format = True
+
+                if is_v2_format and (options.get('schema') == 2 or options.get('schema') is None):
                     self.stdout.write('Loading JSON')
                     with open(json_path, 'r') as it:
                         dataset = json.load(it)
                     self.stdout.write('Indexing ROR dataset ' + json_file)
                     index = ES_VARS['INDEX_V2']
                     index_dump(self, json_file, index, dataset)
-                if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
-                    self.stdout.write('Loading JSON')
-                    with open(json_path, 'r') as it:
-                        dataset = json.load(it)
-                    self.stdout.write('Indexing ROR dataset ' + json_file)
-                    index = ES_VARS['INDEX_V1']
-                    index_dump(self, json_file, index, dataset)
         else:
             self.stdout.write("ROR data dump does not contain any JSON files")

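For reference (not part of the diff): a standalone illustration of the filename check added above; the helper name and filenames are hypothetical examples.

    import re

    def is_v2_dump(json_file):
        # A vN.N prefix decides by major version; otherwise fall back to the
        # legacy 'schema_v2' filename marker
        version_match = re.match(r'v(\d+)\.(\d+)', json_file)
        if version_match:
            major, _minor = map(int, version_match.groups())
            return major >= 2
        return 'schema_v2' in json_file

    print(is_v2_dump('v2.1-2024-10-31-ror-data.json'))   # True  (v2.0+ naming)
    print(is_v2_dump('ror-data_schema_v2.json'))         # True  (legacy marker)
    print(is_v2_dump('v1.58-2024-10-31-ror-data.json'))  # False (v1 dump)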
4 changes: 2 additions & 2 deletions rorapi/management/commands/setup.py
@@ -36,7 +36,7 @@ class Command(BaseCommand):

     def add_arguments(self, parser):
         parser.add_argument('filename', type=str, help='Name of data dump zip file to index without extension')
-        parser.add_argument('-s', '--schema', type=int, choices=[1, 2], help='Schema version to index if only indexing 1 version. Only set if not indexing both versions.')
+        parser.add_argument('-s', '--schema', type=int, choices=[2], default=2, help='Schema version to index (v2 only)')
         parser.add_argument('-t', '--testdata', action='store_true', help='Set flag to pull data dump from ror-data-test instead of ror-data')
 
     def handle(self, *args, **options):
@@ -57,7 +57,7 @@ def handle(self, *args, **options):
             DeleteIndexCommand().handle(*args, **options)
             CreateIndexCommand().handle(*args, **options)
             IndexRorDumpCommand().handle(*args, **options)
-            msg = 'SUCCESS: ROR dataset {} indexed in version {}. Using test repo: {}'.format(filename, str(options['schema']), str(use_test_data))
+            msg = 'SUCCESS: ROR dataset {} indexed in v2. Using test repo: {}'.format(filename, str(use_test_data))
         except:
             msg = 'ERROR: Could not index ROR data dump. Check API logs for details.'
         else:
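With choices=[2] and default=2 on --schema, argparse now rejects any other value; a quick standalone illustration mirroring the argument definition above (hypothetical parser, not the actual command).

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--schema', type=int, choices=[2], default=2)

    print(parser.parse_args([]).schema)           # 2 (default)
    print(parser.parse_args(['-s', '2']).schema)  # 2
    # parser.parse_args(['-s', '1']) exits: argument -s/--schema: invalid choice: 1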
3 changes: 1 addition & 2 deletions rorapi/settings.py
@@ -151,8 +151,7 @@
 STATIC_ROOT = os.path.join(BASE_DIR, 'static/')
 
 ES_VARS = {
-    'INDEX_V1': 'organizations',
-    'INDEX_TEMPLATE_ES7_V1': os.path.join(BASE_DIR, 'rorapi', 'v1', 'index_template_es7.json'),
+    'INDEX_V1': 'organizations', # Kept for v1 API queries (backward compatibility)
     'INDEX_V2': 'organizations-v2',
     'INDEX_TEMPLATE_ES7_V2': os.path.join(BASE_DIR, 'rorapi', 'v2', 'index_template_es7.json'),
     'BATCH_SIZE': 20,