Skip to content

Commit 2eb66b4

Browse files
committed
added column guesser based on utype and ucd, changed parse_json to static function
1 parent f69127c commit 2eb66b4

File tree

2 files changed

+97
-113
lines changed

2 files changed

+97
-113
lines changed

pyvo/dal/adhoc.py

Lines changed: 58 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import warnings
77
import copy
88
import requests
9+
import json
910

1011
from .query import DALResults, DALQuery, DALService, Record
1112
from .exceptions import DALServiceError
@@ -396,51 +397,6 @@ def iter_datalinks(self, preserve_order=False):
396397
yield from self._iter_datalinks_from_dlblock(
397398
preserve_order=preserve_order)
398399

399-
def iter_parse_json_params(
400-
self,
401-
json_key: str,
402-
colname: str="cloud_access",
403-
verbose: bool=False,
404-
**match_params
405-
):
406-
"""
407-
Iterate over all Records in a DalResult and return parsed json parameters.
408-
409-
Parameters
410-
----------
411-
json_key : str
412-
The primary key by which to filter JSON results.
413-
colname : str, optional
414-
The column containing JSON to be parsed, by default "cloud_access".
415-
verbose : bool, optional
416-
Whether to print progress and errors, by default False.
417-
**match_params : str, optional
418-
Further parameters on which to match beyond `json_key`.
419-
420-
Returns
421-
-------
422-
astropy.Table
423-
A table containing the JSON parameters separated into columns, each
424-
row corresponding to a matching JSON entry for each DataLinkRecord
425-
for each row of the original DalResult.
426-
427-
"""
428-
for irow, record in enumerate(self):
429-
access_points = record.parse_json_params(
430-
colname=colname,
431-
json_key=json_key,
432-
verbose=verbose,
433-
**match_params
434-
)
435-
access_points.add_column([irow]*len(access_points), name="record_row", index=0)
436-
if irow == 0:
437-
new_table = access_points
438-
else:
439-
for row in access_points.iterrows():
440-
new_table.add_row(row)
441-
442-
return new_table
443-
444400
def iter_get_cloud_params(
445401
self,
446402
provider: str,
@@ -475,9 +431,10 @@ def iter_get_cloud_params(
475431

476432
for jrow, row in enumerate(products):
477433
# if no colname column, there is nothing to do
478-
try:
434+
jsontxt = row._guess_cloud_column(colname=colname)
435+
if jsontxt:
479436
access_points = row.parse_json_params(
480-
colname=colname,
437+
json_txt=jsontxt,
481438
json_key=provider,
482439
verbose=verbose,
483440
**match_params
@@ -488,15 +445,14 @@ def iter_get_cloud_params(
488445
else:
489446
for row in access_points.iterrows():
490447
new_dl_table.add_row(row)
491-
except KeyError:
448+
else:
492449
# no json column, continue
493450
if verbose:
494-
print(f'No column {colname} found for row {irow}, datalink {jrow}')
451+
print(f'No column {colname} found for Results row {irow}, datalink row {jrow}')
495452
new_dl_table = TableElement(VOTableFile()).to_table()
496-
continue
497453

498454
# do the json parsing
499-
cloud_params = access_points
455+
cloud_params = new_dl_table
500456
cloud_params.add_column([irow]*len(cloud_params), name="record_row", index=0)
501457
if irow == 0:
502458
new_table = cloud_params
@@ -554,21 +510,21 @@ def getdataset(self, timeout=None):
554510
# this should go to Record.getdataset()
555511
return super().getdataset(timeout=timeout)
556512

513+
@staticmethod
557514
def parse_json_params(
558-
self,
515+
json_txt: str,
559516
json_key: str,
560-
colname: str="cloud_access",
561517
verbose: bool=False,
562518
**match_params
563519
):
564520
"""Parse information stored as JSON by key
565521
566522
Parameters
567523
----------
524+
json_txt : str
525+
Text interpreted as JSON
568526
json_key : str
569527
The primary key by which to filter JSON results
570-
colname : str, optional
571-
The column containing JSON to be parsed, by default "cloud_access"
572528
verbose : bool, optional
573529
Whether to print progress and errors, by default False
574530
**match_params : str, optional
@@ -581,43 +537,52 @@ def parse_json_params(
581537
row representing a matching JSON entry.
582538
583539
"""
584-
import json
585540

586541
# init results table (avoiding adding import of astropy.table.Table)
587542
new_table = TableElement(VOTableFile()).to_table()
588-
589-
if verbose:
590-
print(f'searching for and processing json column {colname}')
591543

592-
try:
593-
jsontxt = self[colname]
594-
jsonDict = json.loads(jsontxt)
595-
if json_key not in jsonDict and verbose:
596-
print(f'No key "{json_key}" found for record'
597-
'in column "{colname}"')
598-
else:
599-
p_params = jsonDict[json_key]
600-
checks = []
601-
for k, value in match_params.items():
602-
checks.append(p_params.getitem(k, value) == value)
603-
604-
if all(checks):
605-
if not isinstance(p_params, list):
606-
p_params = [p_params]
607-
colnames = list(p_params[0].keys())
608-
colvals = [[] for _ in colnames]
609-
for ppar in p_params:
610-
for idx, val in enumerate(ppar.values()):
611-
colvals[idx].append(val)
612-
new_table.add_columns(cols=colvals, names=colnames)
613-
614-
except KeyError:
615-
# no json column, return empty list
616-
if verbose:
617-
print(f'No column {colname} found for record.')
544+
jsonDict = json.loads(json_txt)
545+
if json_key not in jsonDict and verbose:
546+
print(f'No key "{json_key}" found in json_txt given.')
547+
else:
548+
p_params = jsonDict[json_key]
549+
checks = []
550+
for k, value in match_params.items():
551+
checks.append(p_params.getitem(k, value) == value)
552+
553+
if all(checks):
554+
if not isinstance(p_params, list):
555+
p_params = [p_params]
556+
colnames = list(p_params[0].keys())
557+
colvals = [[] for _ in colnames]
558+
for ppar in p_params:
559+
for idx, val in enumerate(ppar.values()):
560+
colvals[idx].append(val)
561+
new_table.add_columns(cols=colvals, names=colnames)
618562

619563
return new_table
620564

565+
def _guess_cloud_column(self, colname="cloud_access"):
566+
"""returns a guess for a URI to a data product in row.
567+
568+
This tries a few heuristics based on how cloud access or records might
569+
be marked up. This will return None if row does not look as if
570+
it contained a cloud access column.
571+
"""
572+
if hasattr(self, colname):
573+
return getattr(self, colname)
574+
575+
if colname in self:
576+
return self[colname]
577+
578+
cloud_access = self.getbyutype("adhoc:cloudstorage")
579+
if cloud_access:
580+
return cloud_access
581+
582+
cloud_access = self.getbyucd("meta.ref.cloudstorage")
583+
if cloud_access:
584+
return cloud_access
585+
621586
def get_cloud_params(
622587
self,
623588
provider: str,
@@ -651,10 +616,11 @@ def get_cloud_params(
651616

652617
for irow, row in enumerate(products):
653618
# if no colname column, there is nothing to do
654-
try:
619+
cloud_json = row._guess_cloud_column(colname=colname)
620+
if cloud_json:
655621
access_points = row.parse_json_params(
656-
colname=colname,
657-
key=provider,
622+
json_txt=cloud_json,
623+
json_key=provider,
658624
verbose=verbose,
659625
**match_params
660626
)
@@ -664,12 +630,12 @@ def get_cloud_params(
664630
else:
665631
for row in access_points.iterrows():
666632
new_table.add_row(row)
667-
except KeyError:
668-
# no json column, continue
633+
else:
634+
# no json column, return None
669635
if verbose:
670636
print(f'No column {colname} found for row {irow}')
671-
new_table = TableElement(VOTableFile()).to_table()
672-
continue
637+
new_table = None
638+
break
673639

674640
return new_table
675641

pyvo/dal/tests/test_datalink.py

Lines changed: 39 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -77,18 +77,38 @@ def callback(request, context):
7777
yield matcher
7878

7979
@pytest.fixture()
80-
def datalink_cloud(mocker):
80+
def datalink_cloud1(mocker):
8181
def callback(request, context):
8282
dl_base = parse('pyvo/dal/tests/data/datalink/datalink.xml')
8383
dl_base_table = dl_base.get_first_table().to_table()
8484
cloud_access_str = '{"aws": {"bucket_name": "test", "key":"path/to/cloudfile.fits", "region": "us-west-2"}}'
8585
dl_base_table.add_column([cloud_access_str]*4, name='cloud_access')
8686
out = BytesIO()
87-
writeto(from_table(dl_base_table), out)
87+
votable = from_table(dl_base_table)
88+
votable.get_first_table().get_field_by_id("cloud_access").utype = "adhoc:cloudstorage"
89+
writeto(votable, out)
8890
return out.getvalue()
8991

9092
with mocker.register_uri(
91-
'GET', 'http://example.com/datalink-cloud.xml', content=callback
93+
'GET', 'http://example.com/datalink-cloud1.xml', content=callback
94+
) as matcher:
95+
yield matcher
96+
97+
@pytest.fixture()
98+
def datalink_cloud2(mocker):
99+
def callback(request, context):
100+
dl_base = parse('pyvo/dal/tests/data/datalink/datalink.xml')
101+
dl_base_table = dl_base.get_first_table().to_table()
102+
cloud_access_str = '{"aws": {"bucket_name": "test", "key":"path/to/cloudfile.fits", "region": "us-west-2"}}'
103+
dl_base_table.add_column([cloud_access_str]*4, name='cloud_access')
104+
out = BytesIO()
105+
votable = from_table(dl_base_table)
106+
votable.get_first_table().get_field_by_id("cloud_access").ucd = "meta.ref.cloudstorage"
107+
writeto(votable, out)
108+
return out.getvalue()
109+
110+
with mocker.register_uri(
111+
'GET', 'http://example.com/datalink-cloud2.xml', content=callback
92112
) as matcher:
93113
yield matcher
94114

@@ -355,7 +375,7 @@ def test_no_datalink():
355375
result.getdatalink()
356376

357377
@pytest.mark.filterwarnings("ignore::astropy.io.votable.exceptions.E02")
358-
@pytest.mark.usefixtures('datalink_cloud')
378+
@pytest.mark.usefixtures('datalink_cloud1', 'datalink_cloud2')
359379
class TestJsonColumns:
360380
"""Tests for producing datalinks from tables containing links to
361381
datalink documents.
@@ -366,41 +386,39 @@ class TestJsonColumns:
366386
"ucd": "meta.ref.url"},
367387
{"name": "access_format", "datatype": "char", "arraysize": "*",
368388
"utype": "meta.code.mime"},
369-
{"name": "cloud_access", "datatype": "char", "arraysize": "*"},],
370-
[("http://example.com/datalink-cloud.xml",
389+
{"name": "cloud_access", "datatype": "char", "arraysize": "*",
390+
"utype": "adhoc:cloudstorage", "ucd": "meta.ref.cloudstorage"},],
391+
[("http://example.com/datalink-cloud1.xml",
371392
"application/x-votable+xml;content=datalink",
372393
'{"aws": {"bucket_name": "test", "key":"path/to/file1.fits", "region": "us-west-2"}}',),
373-
("http://example.com/datalink-cloud.xml",
394+
("http://example.com/datalink-cloud2.xml",
374395
"application/x-votable+xml;content=datalink",
375396
'{"aws": {"bucket_name": "test", "key":"path/to/file2.fits", "region": "us-west-2"}}',)],
376397
resultsClass=SIA2Results
377398
)
378399
def test_record_w_json(self):
379400

380-
381-
parsed_json_matches = self.res[0].parse_json_params("cloud_access", "aws")
401+
jsontxt = '{"aws": {"bucket_name": "test", "key":"path/to/file1.fits", "region": "us-west-2"}}'
402+
parsed_json_matches = self.res[0].parse_json_params(json_txt=jsontxt, json_key="aws")
382403
assert parsed_json_matches[0]["bucket_name"] == "test"
383404
assert parsed_json_matches[0]["key"] == "path/to/file1.fits"
384405
assert parsed_json_matches[0]["region"] == "us-west-2"
385406

386-
def test_iter_json(self):
387-
388-
parsed_json_matches = self.res.iter_parse_json_params("cloud_access", "aws")
389-
assert parsed_json_matches[0]["record_row"] == 0
390-
assert parsed_json_matches[0]["bucket_name"] == "test"
391-
assert parsed_json_matches[0]["key"] == "path/to/file1.fits"
392-
assert parsed_json_matches[0]["region"] == "us-west-2"
393-
assert parsed_json_matches[1]["record_row"] == 1
394-
assert parsed_json_matches[1]["key"] == "path/to/file2.fits"
395-
396407
def test_datalink_json(self):
397-
parsed_cloud_params = self.res[0].get_cloud_params("cloud_access", "aws")
408+
parsed_cloud_params = self.res[0].get_cloud_params(provider="aws", colname="cloud_access")
398409
assert parsed_cloud_params[0]["bucket_name"] == "test"
399410
assert parsed_cloud_params[0]["key"] == "path/to/cloudfile.fits"
400411
assert parsed_cloud_params[0]["region"] == "us-west-2"
401412

413+
parsed2 = self.res[0].get_cloud_params(provider="aws", colname="bad_col_name")
414+
assert parsed_cloud_params == parsed2
415+
416+
parsed3 = self.res[1].get_cloud_params(provider="aws", colname="bad_col_name")
417+
parsed4 = self.res[1].get_cloud_params(provider="aws", colname="cloud_access")
418+
assert parsed3 == parsed4
419+
402420
def test_iter_datalink_json(self):
403-
parsed_json_matches = self.res.iter_get_cloud_params("cloud_access", "aws")
421+
parsed_json_matches = self.res.iter_get_cloud_params(provider="aws", colname="cloud_access")
404422
assert parsed_json_matches[0]["record_row"] == 0
405423
assert parsed_json_matches[0]["datalink_row"] == 0
406424
assert parsed_json_matches[0]["bucket_name"] == "test"

0 commit comments

Comments
 (0)