diff --git a/lib/pyld/canon.py b/lib/pyld/canon.py new file mode 100644 index 0000000..b7bac44 --- /dev/null +++ b/lib/pyld/canon.py @@ -0,0 +1,554 @@ + +import hashlib +from pyld.parser import parse_nquads, to_nquad +from pyld.identifier_issuer import IdentifierIssuer +import copy + + +class URDNA2015(object): + """ + URDNA2015 implements the URDNA2015 RDF Dataset Normalization Algorithm. + """ + + def __init__(self): + self.blank_node_info = {} + self.hash_to_blank_nodes = {} + self.canonical_issuer = IdentifierIssuer('_:c14n') + self.quads = [] + self.POSITIONS = {'subject': 's', 'object': 'o', 'name': 'g'} + + # 4.4) Normalization Algorithm + def main(self, dataset, options): + # handle invalid output format + if 'format' in options: + if (options['format'] != 'application/n-quads' and + options['format'] != 'application/nquads'): + raise UnknownFormatError( + 'Unknown output format.', options['format']) + + # 1) Create the normalization state. + + # 2) For every quad in input dataset: + for graph_name, triples in dataset.items(): + if graph_name == '@default': + graph_name = None + for triple in triples: + quad = triple + if graph_name is not None: + if graph_name.startswith('_:'): + quad['name'] = {'type': 'blank node'} + else: + quad['name'] = {'type': 'IRI'} + quad['name']['value'] = graph_name + self.quads.append(quad) + + # 2.1) For each blank node that occurs in the quad, add a + # reference to the quad using the blank node identifier in the + # blank node to quads map, creating a new entry if necessary. + for key, component in quad.items(): + if key == 'predicate' or component['type'] != 'blank node': + continue + id_ = component['value'] + self.blank_node_info.setdefault( + id_, {'quads': []})['quads'].append(quad) + + # 3) Create a list of non-normalized blank node identifiers and + # populate it using the keys from the blank node to quads map. + non_normalized = set(self.blank_node_info.keys()) + + # 4) Initialize simple, a boolean flag, to true. + simple = True + + # 5) While simple is true, issue canonical identifiers for blank nodes: + while simple: + # 5.1) Set simple to false. + simple = False + + # 5.2) Clear hash to blank nodes map. + self.hash_to_blank_nodes = {} + + # 5.3) For each blank node identifier identifier in non-normalized + # identifiers: + for id_ in non_normalized: + # 5.3.1) Create a hash, hash, according to the Hash First + # Degree Quads algorithm. + hash = self.hash_first_degree_quads(id_) + + # 5.3.2) Add hash and identifier to hash to blank nodes map, + # creating a new entry if necessary. + self.hash_to_blank_nodes.setdefault(hash, []).append(id_) + + # 5.4) For each hash to identifier list mapping in hash to blank + # nodes map, lexicographically-sorted by hash: + for hash, id_list in sorted(self.hash_to_blank_nodes.items()): + # 5.4.1) If the length of identifier list is greater than 1, + # continue to the next mapping. + if len(id_list) > 1: + continue + + # 5.4.2) Use the Issue Identifier algorithm, passing canonical + # issuer and the single blank node identifier in identifier + # list, identifier, to issue a canonical replacement identifier + # for identifier. + # TODO: consider changing `get_id` to `issue` + id_ = id_list[0] + self.canonical_issuer.get_id(id_) + + # 5.4.3) Remove identifier from non-normalized identifiers. + non_normalized.remove(id_) + + # 5.4.4) Remove hash from the hash to blank nodes map. + del self.hash_to_blank_nodes[hash] + + # 5.4.5) Set simple to true. 
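Step 5 above is a fixed-point loop: a blank node whose first-degree hash is unique in its bucket gets a canonical identifier immediately, and the pass repeats because removing it can leave other hashes unique on the next round. A minimal sketch of that loop, where `first_degree_hash` and `issue` are illustrative stand-ins rather than pyld API:

```python
def issue_unique(non_normalized, first_degree_hash, issue):
    """Sketch of step 5: issue ids for uniquely-hashed nodes until stable."""
    simple = True
    while simple:
        simple = False
        buckets = {}
        for bnode in non_normalized:
            buckets.setdefault(first_degree_hash(bnode), []).append(bnode)
        for h in sorted(buckets):
            if len(buckets[h]) == 1:          # hash is unambiguous
                issue(buckets[h][0])
                non_normalized.discard(buckets[h][0])
                simple = True                 # partition may shrink; re-run
```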
+ simple = True + + # 6) For each hash to identifier list mapping in hash to blank nodes + # map, lexicographically-sorted by hash: + for hash, id_list in sorted(self.hash_to_blank_nodes.items()): + # 6.1) Create hash path list where each item will be a result of + # running the Hash N-Degree Quads algorithm. + hash_path_list = [] + + # 6.2) For each blank node identifier identifier in identifier + # list: + for id_ in id_list: + # 6.2.1) If a canonical identifier has already been issued for + # identifier, continue to the next identifier. + if self.canonical_issuer.has_id(id_): + continue + + # 6.2.2) Create temporary issuer, an identifier issuer + # initialized with the prefix _:b. + issuer = IdentifierIssuer('_:b') + + # 6.2.3) Use the Issue Identifier algorithm, passing temporary + # issuer and identifier, to issue a new temporary blank node + # identifier for identifier. + issuer.get_id(id_) + + # 6.2.4) Run the Hash N-Degree Quads algorithm, passing + # temporary issuer, and append the result to the hash path + # list. + hash_path_list.append(self.hash_n_degree_quads(id_, issuer)) + + # 6.3) For each result in the hash path list, + # lexicographically-sorted by the hash in result: + for result in sorted(hash_path_list, key=lambda r: r['hash']): + # 6.3.1) For each blank node identifier, existing identifier, + # that was issued a temporary identifier by identifier issuer + # in result, issue a canonical identifier, in the same order, + # using the Issue Identifier algorithm, passing canonical + # issuer and existing identifier. + for existing in result['issuer'].order: + self.canonical_issuer.get_id(existing) + + # Note: At this point all blank nodes in the set of RDF quads have been + # assigned canonical identifiers, which have been stored in the + # canonical issuer. Here each quad is updated by assigning each of its + # blank nodes its new identifier. + + # 7) For each quad, quad, in input dataset: + normalized = [] + for quad in self.quads: + # 7.1) Create a copy, quad copy, of quad and replace any existing + # blank node identifiers using the canonical identifiers previously + # issued by canonical issuer. Note: We optimize away the copy here. + for key, component in quad.items(): + if key == 'predicate': + continue + if(component['type'] == 'blank node' and not + component['value'].startswith( + self.canonical_issuer.prefix)): + component['value'] = self.canonical_issuer.get_id( + component['value']) + + # 7.2) Add quad copy to the normalized dataset. + normalized.append(to_nquad(quad)) + + # sort normalized output + normalized.sort() + + # 8) Return the normalized dataset. + if (options.get('format') == 'application/n-quads' or + options.get('format') == 'application/nquads'): + return ''.join(normalized) + return parse_nquads(''.join(normalized)) + + # 4.6) Hash First Degree Quads + def hash_first_degree_quads(self, id_): + # return cached hash + info = self.blank_node_info[id_] + if 'hash' in info: + return info['hash'] + + # 1) Initialize nquads to an empty list. It will be used to store quads + # in N-Quads format. + nquads = [] + + # 2) Get the list of quads quads associated with the reference blank + # node identifier in the blank node to quads map. 
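Step 7 above rewrites every blank node value through the canonical issuer unless it already carries the canonical prefix. Distilled to one helper (using the `IdentifierIssuer` added by this change; the helper name is illustrative):

```python
from pyld.identifier_issuer import IdentifierIssuer

def relabel(component, issuer):
    # predicates are skipped by the caller; only subject/object/name arrive here
    if (component['type'] == 'blank node' and
            not component['value'].startswith(issuer.prefix)):
        component['value'] = issuer.get_id(component['value'])
    return component

issuer = IdentifierIssuer('_:c14n')
relabel({'type': 'blank node', 'value': '_:b0'}, issuer)  # value -> '_:c14n0'
```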
+ quads = info['quads'] + + # 3) For each quad quad in quads: + for quad in quads: + # 3.1) Serialize the quad in N-Quads format with the following + # special rule: + + # 3.1.1) If any component in quad is an blank node, then serialize + # it using a special identifier as follows: + copy = {} + for key, component in quad.items(): + if key == 'predicate': + copy[key] = component + continue + # 3.1.2) If the blank node's existing blank node identifier + # matches the reference blank node identifier then use the + # blank node identifier _:a, otherwise, use the blank node + # identifier _:z. + copy[key] = self.modify_first_degree_component( + id_, component, key) + nquads.append(to_nquad(copy)) + + # 4) Sort nquads in lexicographical order. + nquads.sort() + + # 5) Return the hash that results from passing the sorted, joined + # nquads through the hash algorithm. + info['hash'] = self.hash_nquads(nquads) + return info['hash'] + + # helper for modifying component during Hash First Degree Quads + def modify_first_degree_component(self, id_, component, key): + if component['type'] != 'blank node': + return component + component = copy.deepcopy(component) + component['value'] = '_:a' if component['value'] == id_ else '_:z' + return component + + # 4.7) Hash Related Blank Node + def hash_related_blank_node(self, related, quad, issuer, position): + # 1) Set the identifier to use for related, preferring first the + # canonical identifier for related if issued, second the identifier + # issued by issuer if issued, and last, if necessary, the result of + # the Hash First Degree Quads algorithm, passing related. + if self.canonical_issuer.has_id(related): + id_ = self.canonical_issuer.get_id(related) + elif issuer.has_id(related): + id_ = issuer.get_id(related) + else: + id_ = self.hash_first_degree_quads(related) + + # 2) Initialize a string input to the value of position. + # Note: We use a hash object instead. + md = self.create_hash() + md.update(position.encode('utf8')) + + # 3) If position is not g, append <, the value of the predicate in + # quad, and > to input. + if position != 'g': + md.update(self.get_related_predicate(quad).encode('utf8')) + + # 4) Append identifier to input. + md.update(id_.encode('utf8')) + + # 5) Return the hash that results from passing input through the hash + # algorithm. + return md.hexdigest() + + # helper for getting a related predicate + def get_related_predicate(self, quad): + return '<' + quad['predicate']['value'] + '>' + + # 4.8) Hash N-Degree Quads + def hash_n_degree_quads(self, id_, issuer): + # 1) Create a hash to related blank nodes map for storing hashes that + # identify related blank nodes. + # Note: 2) and 3) handled within `createHashToRelated` + hash_to_related = self.create_hash_to_related(id_, issuer) + + # 4) Create an empty string, data to hash. + # Note: We create a hash object instead. + md = self.create_hash() + + # 5) For each related hash to blank node list mapping in hash to + # related blank nodes map, sorted lexicographically by related hash: + for hash, blank_nodes in sorted(hash_to_related.items()): + # 5.1) Append the related hash to the data to hash. + md.update(hash.encode('utf8')) + + # 5.2) Create a string chosen path. + chosen_path = '' + + # 5.3) Create an unset chosen issuer variable. + chosen_issuer = None + + # 5.4) For each permutation of blank node list: + for permutation in permutations(blank_nodes): + # 5.4.1) Create a copy of issuer, issuer copy. + issuer_copy = copy.deepcopy(issuer) + + # 5.4.2) Create a string path. 
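`hash_first_degree_quads` above boils down to: mask blank node identifiers (`_:a` for the reference node, `_:z` for every other), sort the resulting N-Quads lines, and digest their concatenation. A toy version with the masking already applied:

```python
import hashlib

def hash_sorted_nquads(lines):
    md = hashlib.sha256()
    for line in sorted(lines):
        md.update(line.encode('utf8'))
    return md.hexdigest()

# isomorphic graphs produce identical digests once bnode labels are masked
hash_sorted_nquads(['_:a <http://example.com/p> _:z .\n'])
```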
+ path = '' + + # 5.4.3) Create a recursion list, to store blank node + # identifiers that must be recursively processed by this + # algorithm. + recursion_list = [] + + # 5.4.4) For each related in permutation: + skip_to_next_permutation = False + for related in permutation: + # 5.4.4.1) If a canonical identifier has been issued for + # related, append it to path. + if(self.canonical_issuer.has_id(related)): + path += self.canonical_issuer.get_id(related) + # 5.4.4.2) Otherwise: + else: + # 5.4.4.2.1) If issuer copy has not issued an + # identifier for related, append related to recursion + # list. + if not issuer_copy.has_id(related): + recursion_list.append(related) + + # 5.4.4.2.2) Use the Issue Identifier algorithm, + # passing issuer copy and related and append the result + # to path. + path += issuer_copy.get_id(related) + + # 5.4.4.3) If chosen path is not empty and the length of + # path is greater than or equal to the length of chosen + # path and path is lexicographically greater than chosen + # path, then skip to the next permutation. + if(len(chosen_path) != 0 and + len(path) >= len(chosen_path) and + path > chosen_path): + skip_to_next_permutation = True + break + + if skip_to_next_permutation: + continue + + # 5.4.5) For each related in recursion list: + for related in recursion_list: + # 5.4.5.1) Set result to the result of recursively + # executing the Hash N-Degree Quads algorithm, passing + # related for identifier and issuer copy for path + # identifier issuer. + result = self.hash_n_degree_quads(related, issuer_copy) + + # 5.4.5.2) Use the Issue Identifier algorithm, passing + # issuer copy and related and append the result to path. + path += issuer_copy.get_id(related) + + # 5.4.5.3) Append <, the hash in result, and > to path. + path += '<' + result['hash'] + '>' + + # 5.4.5.4) Set issuer copy to the identifier issuer in + # result. + issuer_copy = result['issuer'] + + # 5.4.5.5) If chosen path is not empty and the length of + # path is greater than or equal to the length of chosen + # path and path is lexicographically greater than chosen + # path, then skip to the next permutation. + if(len(chosen_path) != 0 and + len(path) >= len(chosen_path) and + path > chosen_path): + skip_to_next_permutation = True + break + + if skip_to_next_permutation: + continue + + # 5.4.6) If chosen path is empty or path is lexicographically + # less than chosen path, set chosen path to path and chosen + # issuer to issuer copy. + if len(chosen_path) == 0 or path < chosen_path: + chosen_path = path + chosen_issuer = issuer_copy + + # 5.5) Append chosen path to data to hash. + md.update(chosen_path.encode('utf8')) + + # 5.6) Replace issuer, by reference, with chosen issuer. + issuer = chosen_issuer + + # 6) Return issuer and the hash that results from passing data to hash + # through the hash algorithm. + return {'hash': md.hexdigest(), 'issuer': issuer} + + # helper for creating hash to related blank nodes map + def create_hash_to_related(self, id_, issuer): + # 1) Create a hash to related blank nodes map for storing hashes that + # identify related blank nodes. + hash_to_related = {} + + # 2) Get a reference, quads, to the list of quads in the blank node to + # quads map for the key identifier. 
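Both pruning checks in the permutation loop here test the same condition; factored out for clarity (canon.py deliberately inlines it), a partial path is abandoned as soon as it can no longer sort below the best complete path found so far:

```python
def cannot_beat(path, chosen_path):
    # True once a partial path is already >= the best complete path
    return (chosen_path != '' and
            len(path) >= len(chosen_path) and
            path > chosen_path)
```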
+ quads = self.blank_node_info[id_]['quads'] + + # 3) For each quad in quads: + for quad in quads: + # 3.1) For each component in quad, if component is the subject, + # object, and graph name and it is a blank node that is not + # identified by identifier: + for key, component in quad.items(): + if(key != 'predicate' and + component['type'] == 'blank node' and + component['value'] != id_): + # 3.1.1) Set hash to the result of the Hash Related Blank + # Node algorithm, passing the blank node identifier for + # component as related, quad, path identifier issuer as + # issuer, and position as either s, o, or g based on + # whether component is a subject, object, graph name, + # respectively. + related = component['value'] + position = self.POSITIONS[key] + hash = self.hash_related_blank_node( + related, quad, issuer, position) + + # 3.1.2) Add a mapping of hash to the blank node identifier + # for component to hash to related blank nodes map, adding + # an entry as necessary. + hash_to_related.setdefault(hash, []).append(related) + + return hash_to_related + + # helper to create appropriate hash object + def create_hash(self): + return hashlib.sha256() + + # helper to hash a list of nquads + def hash_nquads(self, nquads): + md = self.create_hash() + for nquad in nquads: + md.update(nquad.encode('utf8')) + return md.hexdigest() + + +class URGNA2012(URDNA2015): + """ + URGNA2012 implements the URGNA2012 RDF Graph Normalization Algorithm. + """ + + def __init__(self): + URDNA2015.__init__(self) + + # helper for modifying component during Hash First Degree Quads + def modify_first_degree_component(self, id_, component, key): + if component['type'] != 'blank node': + return component + component = copy.deepcopy(component) + if key == 'name': + component['value'] = '_:g' + else: + component['value'] = '_:a' if component['value'] == id_ else '_:z' + return component + + # helper for getting a related predicate + def get_related_predicate(self, quad): + return quad['predicate']['value'] + + # helper for creating hash to related blank nodes map + def create_hash_to_related(self, id_, issuer): + # 1) Create a hash to related blank nodes map for storing hashes that + # identify related blank nodes. + hash_to_related = {} + + # 2) Get a reference, quads, to the list of quads in the blank node to + # quads map for the key identifier. + quads = self.blank_node_info[id_]['quads'] + + # 3) For each quad in quads: + for quad in quads: + # 3.1) If the quad's subject is a blank node that does not match + # identifier, set hash to the result of the Hash Related Blank Node + # algorithm, passing the blank node identifier for subject as + # related, quad, path identifier issuer as issuer, and p as + # position. + if(quad['subject']['type'] == 'blank node' and + quad['subject']['value'] != id_): + related = quad['subject']['value'] + position = 'p' + # 3.2) Otherwise, if quad's object is a blank node that does + # not match identifier, to the result of the Hash Related Blank + # Node algorithm, passing the blank node identifier for object + # as related, quad, path identifier issuer as issuer, and r + # as position. + elif(quad['object']['type'] == 'blank node' and + quad['object']['value'] != id_): + related = quad['object']['value'] + position = 'r' + # 3.3) Otherwise, continue to the next quad. + else: + continue + + # 3.4) Add a mapping of hash to the blank node identifier for the + # component that matched (subject or object) to hash to related + # blank nodes map, adding an entry as necessary. 
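Besides swapping SHA-256 for SHA-1, URGNA2012 (below) masks graph names to `_:g`, hashes the bare predicate value without angle brackets, and classifies related nodes by position `p` or `r` instead of `s`, `o`, `g`. The position rule in isolation, as an illustrative helper rather than pyld API:

```python
def urgna_position(quad, id_):
    subj, obj = quad['subject'], quad['object']
    if subj['type'] == 'blank node' and subj['value'] != id_:
        return 'p'   # related node reached as subject
    if obj['type'] == 'blank node' and obj['value'] != id_:
        return 'r'   # related node reached as object
    return None      # quad contributes nothing for this node
```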
+ hash = self.hash_related_blank_node( + related, quad, issuer, position) + hash_to_related.setdefault(hash, []).append(related) + + return hash_to_related + + # helper to create appropriate hash object + def create_hash(self): + return hashlib.sha1() + + +def permutations(elements): + """ + Generates all of the possible permutations for the given list of elements. + + :param elements: the list of elements to permutate. + """ + # begin with sorted elements + elements.sort() + # initialize directional info for permutation algorithm + left = {} + for v in elements: + left[v] = True + + length = len(elements) + last = length - 1 + while True: + yield elements + + # Calculate the next permutation using the Steinhaus-Johnson-Trotter + # permutation algorithm. + + # get largest mobile element k + # (mobile: element is greater than the one it is looking at) + k, pos = None, 0 + for i in range(length): + e = elements[i] + is_left = left[e] + if((k is None or e > k) and + ((is_left and i > 0 and e > elements[i - 1]) or + (not is_left and i < last and e > elements[i + 1]))): + k, pos = e, i + + # no more permutations + if k is None: + return + + # swap k and the element it is looking at + swap = pos - 1 if left[k] else pos + 1 + elements[pos], elements[swap] = elements[swap], k + + # reverse the direction of all elements larger than k + for i in range(length): + if elements[i] > k: + left[elements[i]] = not left[elements[i]] + + +class UnknownFormatError(ValueError): + """ + Base class for unknown format errors. + """ + + def __init__(self, message, format): + Exception.__init__(self, message) + self.format = format \ No newline at end of file diff --git a/lib/pyld/context_resolver.py b/lib/pyld/context_resolver.py index 784821a..36d36e1 100644 --- a/lib/pyld/context_resolver.py +++ b/lib/pyld/context_resolver.py @@ -153,8 +153,8 @@ def _fetch_context(self, active_ctx, url, cycles): 'non-JSON response, or more than one HTTP Link Header was ' + 'provided for a remote context.', 'jsonld.InvalidUrl', - {'url': url, 'cause': cause}, - code='loading remote context failed') + {'url': url}, + code='loading remote context failed') from cause # ensure ctx is an object if not isinstance(context, dict) and not isinstance(context, frozendict): diff --git a/lib/pyld/documentloader/aiohttp.py b/lib/pyld/documentloader/aiohttp.py index 96f786e..123f666 100644 --- a/lib/pyld/documentloader/aiohttp.py +++ b/lib/pyld/documentloader/aiohttp.py @@ -123,8 +123,8 @@ async def async_loader(url, headers): except Exception as cause: raise JsonLdError( 'Could not retrieve a JSON-LD document from the URL.', - 'jsonld.LoadDocumentError', code='loading document failed', - cause=cause) + 'jsonld.LoadDocumentError', + code='loading document failed') from cause def loader(url, options=None): """ diff --git a/lib/pyld/documentloader/requests.py b/lib/pyld/documentloader/requests.py index 570f6fa..0015bdf 100644 --- a/lib/pyld/documentloader/requests.py +++ b/lib/pyld/documentloader/requests.py @@ -101,7 +101,7 @@ def loader(url, options={}): except Exception as cause: raise JsonLdError( 'Could not retrieve a JSON-LD document from the URL.', - 'jsonld.LoadDocumentError', code='loading document failed', - cause=cause) + 'jsonld.LoadDocumentError', + code='loading document failed') from cause return loader diff --git a/lib/pyld/identifier_issuer.py b/lib/pyld/identifier_issuer.py new file mode 100644 index 0000000..a05d40b --- /dev/null +++ b/lib/pyld/identifier_issuer.py @@ -0,0 +1,52 @@ +class IdentifierIssuer(object): + """ + An 
IdentifierIssuer issues unique identifiers, keeping track of any + previously issued identifiers. + """ + + def __init__(self, prefix): + """ + Initializes a new IdentifierIssuer. + + :param prefix: the prefix to use (''). + """ + self.prefix = prefix + self.counter = 0 + self.existing = {} + self.order = [] + + """ + Gets the new identifier for the given old identifier, where if no old + identifier is given a new identifier will be generated. + + :param [old]: the old identifier to get the new identifier for. + + :return: the new identifier. + """ + def get_id(self, old=None): + # return existing old identifier + if old and old in self.existing: + return self.existing[old] + + # get next identifier + id_ = self.prefix + str(self.counter) + self.counter += 1 + + # save mapping + if old is not None: + self.existing[old] = id_ + self.order.append(old) + + return id_ + + def has_id(self, old): + """ + Returns True if the given old identifier has already been assigned a + new identifier. + + :param old: the old identifier to check. + + :return: True if the old identifier has been assigned a new identifier, + False if not. + """ + return old in self.existing diff --git a/lib/pyld/jsonld.py b/lib/pyld/jsonld.py index 40201c2..bf359fb 100644 --- a/lib/pyld/jsonld.py +++ b/lib/pyld/jsonld.py @@ -15,7 +15,6 @@ """ import copy -import hashlib import json import re import sys @@ -23,20 +22,19 @@ from urllib.parse import urlparse import warnings import uuid + +from pyld.canon import URDNA2015, URGNA2012, UnknownFormatError +from pyld.parser import ParserError, parse_nquads, to_nquad, to_nquads +from pyld.identifier_issuer import IdentifierIssuer from .context_resolver import ContextResolver from c14n.Canonicalize import canonicalize from cachetools import LRUCache -from collections import namedtuple -from functools import cmp_to_key import lxml.html from numbers import Integral, Real from frozendict import frozendict from pyld.__about__ import (__copyright__, __license__, __version__) from .iri_resolver import resolve, unresolve -def cmp(a, b): - return (a > b) - (a < b) - __all__ = [ '__copyright__', '__license__', '__version__', 'compact', 'expand', 'flatten', 'frame', 'link', 'from_rdf', 'to_rdf', @@ -518,7 +516,7 @@ def compact(self, input_, ctx, options): except JsonLdError as cause: raise JsonLdError( 'Could not expand input before compaction.', - 'jsonld.CompactError', cause=cause) + 'jsonld.CompactError') from cause # process context active_ctx = self._get_initial_context(options) @@ -527,7 +525,7 @@ def compact(self, input_, ctx, options): except JsonLdError as cause: raise JsonLdError( 'Could not process context before compaction.', - 'jsonld.CompactError', cause=cause) + 'jsonld.CompactError') from cause # do compaction compacted = self._compact(active_ctx, None, expanded, options) @@ -634,8 +632,7 @@ def expand(self, input_, options): raise JsonLdError( 'Could not retrieve a JSON-LD document from the URL.', 'jsonld.LoadDocumentError', - {'remoteDoc': remote_doc}, code='loading document failed', - cause=cause) + {'remoteDoc': remote_doc}, code='loading document failed') from cause # set default base options.setdefault('base', remote_doc['documentUrl'] or '') @@ -713,7 +710,7 @@ def flatten(self, input_, ctx, options): except Exception as cause: raise JsonLdError( 'Could not expand input before flattening.', - 'jsonld.FlattenError', cause=cause) + 'jsonld.FlattenError') from cause # do flattening flattened = self._flatten(expanded) @@ -729,7 +726,7 @@ def flatten(self, input_, ctx, options): 
except Exception as cause: raise JsonLdError( 'Could not compact flattened output.', - 'jsonld.FlattenError', cause=cause) + 'jsonld.FlattenError') from cause return compacted @@ -796,8 +793,7 @@ def frame(self, input_, frame, options): raise JsonLdError( 'Could not retrieve a JSON-LD document from the URL.', 'jsonld.LoadDocumentError', - {'remoteDoc': remote_frame}, code='loading document failed', - cause=cause) + {'remoteDoc': remote_frame}, code='loading document failed') from cause # preserve frame context frame = remote_frame['document'] @@ -819,7 +815,7 @@ def frame(self, input_, frame, options): except JsonLdError as cause: raise JsonLdError( 'Could not process context before framing.', - 'jsonld.FrameError', cause=cause) + 'jsonld.FrameError') from cause # mode specific defaluts if 'omitGraph' not in options: @@ -833,7 +829,7 @@ def frame(self, input_, frame, options): except JsonLdError as cause: raise JsonLdError( 'Could not expand input before framing.', - 'jsonld.FrameError', cause=cause) + 'jsonld.FrameError') from cause try: # expand frame @@ -844,7 +840,7 @@ def frame(self, input_, frame, options): except JsonLdError as cause: raise JsonLdError( 'Could not expand frame before framing.', - 'jsonld.FrameError', cause=cause) + 'jsonld.FrameError') from cause # if the unexpanded frame includes a key expanding to @graph, frame the # default graph, otherwise, the merged graph @@ -870,7 +866,7 @@ def frame(self, input_, frame, options): except JsonLdError as cause: raise JsonLdError( 'Could not compact framed output.', - 'jsonld.FrameError', cause=cause) + 'jsonld.FrameError') from cause options['link'] = {} return self._cleanup_null(result, options) @@ -906,7 +902,7 @@ def normalize(self, input_, options): options.setdefault('extractAllScripts', True) options.setdefault('processingMode', 'json-ld-1.1') - if not options['algorithm'] in ['URDNA2015', 'URGNA2012']: + if options['algorithm'] not in ['URDNA2015', 'URGNA2012']: raise JsonLdError( 'Unsupported normalization algorithm.', 'jsonld.NormalizeError') @@ -929,11 +925,17 @@ def normalize(self, input_, options): except JsonLdError as cause: raise JsonLdError( 'Could not convert input to RDF dataset before normalization.', - 'jsonld.NormalizeError', cause=cause) + 'jsonld.NormalizeError') from cause # do normalization if options['algorithm'] == 'URDNA2015': - return URDNA2015().main(dataset, options) + try: + return URDNA2015().main(dataset, options) + except UnknownFormatError as cause: + raise JsonLdError( + str(cause), + 'jsonld.UnknownFormat', {'format': cause.format}) from cause + # assume URGNA2012 return URGNA2012().main(dataset, options) @@ -969,9 +971,9 @@ def from_rdf(self, dataset, options): if 'format' in options: # supported formats (processor-specific and global) if ((self.rdf_parsers is not None and - not options['format'] in self.rdf_parsers) or + options['format'] not in self.rdf_parsers) or (self.rdf_parsers is None and - not options['format'] in _rdf_parsers)): + options['format'] not in _rdf_parsers)): raise JsonLdError( 'Unknown input format.', 'jsonld.UnknownFormat', {'format': options['format']}) @@ -1020,7 +1022,7 @@ def to_rdf(self, input_, options): except JsonLdError as cause: raise JsonLdError( 'Could not expand input before serialization to ' - 'RDF.', 'jsonld.RdfError', cause=cause) + 'RDF.', 'jsonld.RdfError') from cause # create node map for default graph (and any named graphs) issuer = IdentifierIssuer('_:b') @@ -1256,10 +1258,8 @@ def compare_values(v1, v2): """ # 1. 
equal primitives if not _is_object(v1) and not _is_object(v2) and v1 == v2: - type1 = type(v1) - type2 = type(v2) - if type1 == bool or type2 == bool: - return type1 == type2 + if isinstance(v1, bool) or isinstance(v2, bool): + return type(v1) is type(v2) return True # 2. equal @values @@ -1268,10 +1268,9 @@ def compare_values(v1, v2): v1.get('@type') == v2.get('@type') and v1.get('@language') == v2.get('@language') and v1.get('@index') == v2.get('@index')): - type1 = type(v1['@value']) - type2 = type(v2['@value']) - if type1 == bool or type2 == bool: - return type1 == type2 + + if isinstance(v1['@value'], bool) or isinstance(v2['@value'], bool): + return type(v1['@value']) is type(v2['@value']) return True # 3. equal @ids @@ -1332,112 +1331,11 @@ def parse_nquads(input_): :return: an RDF dataset. """ - # define partial regexes - iri = '(?:<([^:]+:[^>]*)>)' - bnode = '(_:(?:[A-Za-z][A-Za-z0-9]*))' - plain = '"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"' - datatype = '(?:\\^\\^' + iri + ')' - language = '(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*))' - literal = '(?:' + plain + '(?:' + datatype + '|' + language + ')?)' - ws = '[ \\t]+' - wso = '[ \\t]*' - eoln = r'(?:\r\n)|(?:\n)|(?:\r)' - empty = r'^' + wso + '$' - - # define quad part regexes - subject = '(?:' + iri + '|' + bnode + ')' + ws - property = iri + ws - object = '(?:' + iri + '|' + bnode + '|' + literal + ')' + wso - graph = '(?:\\.|(?:(?:' + iri + '|' + bnode + ')' + wso + '\\.))' - - # Note: Notice that the graph position does not include literals - # even though they are specified as a possible value in the - # N-Quads note (http://sw.deri.org/2008/07/n-quads/). This is - # intentional, as literals in that position are not supported by the - # RDF data model or the JSON-LD data model. - # See: https://github.com/digitalbazaar/pyld/pull/19 - - # full quad regex - quad = r'^' + wso + subject + property + object + graph + wso + '$' - - # build RDF dataset - dataset = {} - - # split N-Quad input into lines - lines = re.split(eoln, input_) - line_number = 0 - for line in lines: - line_number += 1 - - # skip empty lines - if re.search(empty, line) is not None: - continue - - # parse quad - match = re.search(quad, line) - if match is None: - raise JsonLdError( - 'Error while parsing N-Quads invalid quad.', - 'jsonld.ParseError', {'line': line_number}) - match = match.groups() - - # create RDF triple - triple = {'subject': {}, 'predicate': {}, 'object': {}} - - # get subject - if match[0] is not None: - triple['subject'] = {'type': 'IRI', 'value': match[0]} - else: - triple['subject'] = {'type': 'blank node', 'value': match[1]} - - # get predicate - triple['predicate'] = {'type': 'IRI', 'value': match[2]} - - # get object - if match[3] is not None: - triple['object'] = {'type': 'IRI', 'value': match[3]} - elif match[4] is not None: - triple['object'] = {'type': 'blank node', 'value': match[4]} - else: - triple['object'] = {'type': 'literal'} - unescaped = ( - match[5] - .replace('\\"', '\"') - .replace('\\t', '\t') - .replace('\\n', '\n') - .replace('\\r', '\r') - .replace('\\\\', '\\')) - if match[6] is not None: - triple['object']['datatype'] = match[6] - elif match[7] is not None: - triple['object']['datatype'] = RDF_LANGSTRING - triple['object']['language'] = match[7] - else: - triple['object']['datatype'] = XSD_STRING - triple['object']['value'] = unescaped - - # get graph name ('@default' is used for the default graph) - name = '@default' - if match[8] is not None: - name = match[8] - elif match[9] is not None: - name = match[9] - - # initialize 
graph in dataset - if name not in dataset: - dataset[name] = [triple] - # add triple if unique to its graph - else: - unique = True - triples = dataset[name] - for t in dataset[name]: - if JsonLdProcessor._compare_rdf_triples(t, triple): - unique = False - break - if unique: - triples.append(triple) - - return dataset + try: + result = parse_nquads(input_) + return result + except ParserError as cause: + raise JsonLdError(str(cause), 'jsonld.ParseError', details={'line': cause.line_number}) from cause @staticmethod def to_nquads(dataset): @@ -1448,80 +1346,11 @@ def to_nquads(dataset): :return: the N-Quads string. """ - quads = [] - for graph_name, triples in dataset.items(): - for triple in triples: - if graph_name == '@default': - graph_name = None - quads.append(JsonLdProcessor.to_nquad(triple, graph_name)) - quads.sort() - return ''.join(quads) + return to_nquads(dataset) @staticmethod def to_nquad(triple, graph_name=None): - """ - Converts an RDF triple and graph name to an N-Quad string (a single - quad). - - :param triple: the RDF triple or quad to convert (a triple or quad - may be passed, if a triple is passed then `graph_name` should be - given to specify the name of the graph the triple is in, `None` - for the default graph). - :param graph_name: the name of the graph containing the triple, None - for the default graph. - - :return: the N-Quad string. - """ - s = triple['subject'] - p = triple['predicate'] - o = triple['object'] - g = triple.get('name', {'value': graph_name})['value'] - - quad = '' - - # subject is an IRI - if s['type'] == 'IRI': - quad += '<' + s['value'] + '>' - else: - quad += s['value'] - quad += ' ' - - # property is an IRI - if p['type'] == 'IRI': - quad += '<' + p['value'] + '>' - else: - quad += p['value'] - quad += ' ' - - # object is IRI, bnode, or literal - if o['type'] == 'IRI': - quad += '<' + o['value'] + '>' - elif(o['type'] == 'blank node'): - quad += o['value'] - else: - escaped = ( - o['value'] - .replace('\\', '\\\\') - .replace('\t', '\\t') - .replace('\n', '\\n') - .replace('\r', '\\r') - .replace('\"', '\\"')) - quad += '"' + escaped + '"' - if o['datatype'] == RDF_LANGSTRING: - if o['language']: - quad += '@' + o['language'] - elif o['datatype'] != XSD_STRING: - quad += '^^<' + o['datatype'] + '>' - - # graph - if g is not None: - if not g.startswith('_:'): - quad += ' <' + g + '>' - else: - quad += ' ' + g - - quad += ' .\n' - return quad + return to_nquad(triple, graph_name) @staticmethod def arrayify(value): @@ -1535,28 +1364,6 @@ def arrayify(value): """ return value if _is_array(value) else [value] - @staticmethod - def _compare_rdf_triples(t1, t2): - """ - Compares two RDF triples for equality. - - :param t1: the first triple. - :param t2: the second triple. - - :return: True if the triples are the same, False if not. - """ - for attr in ['subject', 'predicate', 'object']: - if(t1[attr]['type'] != t2[attr]['type'] or - t1[attr]['value'] != t2[attr]['value']): - return False - - if t1['object'].get('language') != t2['object'].get('language'): - return False - if t1['object'].get('datatype') != t2['object'].get('datatype'): - return False - - return True - def _compact(self, active_ctx, active_property, element, options): """ Recursively compacts an element using the given active context. All @@ -2835,7 +2642,6 @@ def _process_context(self, active_ctx, local_ctx, options, :return: the new active context. 
""" - has_related = 'related' in active_ctx['mappings'] # normalize local context to an array if _is_object(local_ctx) and _is_array(local_ctx.get('@context')): local_ctx = local_ctx['@context'] @@ -3067,7 +2873,7 @@ def _process_context(self, active_ctx, local_ctx, options, 'json-ld-1.0', 'jsonld.SyntaxError', {'context': ctx}, code='invalid context entry') - if type(value) != bool: + if not isinstance(value, bool): raise JsonLdError( 'Invalid JSON-LD syntax; @propagate value must be a boolean.', 'jsonld.SyntaxError', {'context': ctx}, @@ -3111,7 +2917,7 @@ def _process_context(self, active_ctx, local_ctx, options, raise JsonLdError( 'Invalid JSON-LD syntax; invalid scoped context.', 'jsonld.SyntaxError', {'context': key_ctx, 'term': k}, - code='invalid scoped context') + code='invalid scoped context') from cause # cache processed result (only Python >= 3.6) # and give the context a unique identifier @@ -3539,8 +3345,7 @@ def _rdf_to_object(self, o, use_native_types, rdf_direction): 'JSON literal could not be parsed.', 'jsonld.InvalidJsonLiteral', {"value": rval['@value']}, - code='invalid JSON literal', - cause=cause) + code='invalid JSON literal') from cause # use native types for certain xsd types if use_native_types: @@ -3777,7 +3582,7 @@ def _match_frame(self, state, subjects, frame, parent, property): # when the property is None, which only occurs at the top-level. if property is None: state['uniqueEmbeds'] = {state['graph']: {}} - elif not state['graph'] in state['uniqueEmbeds']: + elif state['graph'] not in state['uniqueEmbeds']: state['uniqueEmbeds'][state['graph']] = {} if flags['embed'] == '@link' and id_ in link: @@ -3852,9 +3657,7 @@ def _match_frame(self, state, subjects, frame, parent, property): recurse = state['graph'] != '@merged' subframe = {} else: - subframe = frame['@graph'][0] - if not _is_object(subframe): - subFrame = {} + subframe = frame['@graph'][0] if not _is_object(subframe) else {} recurse = not (id_ == '@merged' or id_ == '@default') if recurse: @@ -4357,7 +4160,7 @@ def _cleanup_preserve(self, input_, options): idx = options['link'][id_].index(input_) # already visited return options['link'][id_][idx] - except: + except ValueError: # prevent circular visitation options['link'][id_].append(input_) else: @@ -4397,7 +4200,7 @@ def _cleanup_null(self, input_, options): idx = options['link'][id_].index(input_) # already visited return options['link'][id_][idx] - except: + except ValueError: # prevent circular visitation options['link'][id_].append(input_) else: @@ -4663,7 +4466,7 @@ def _compact_iri( # lexicographically less than the current choice if (is_usable_curie and ( candidate is None or - _compare_shortest_least(curie, candidate) < 0)): + (len(curie), curie) < (len(candidate), candidate))): candidate = curie # return curie candidate @@ -5030,7 +4833,7 @@ def _create_term_definition(self, active_ctx, local_ctx, term, defined, options, mapping['@id'] = active_ctx['@vocab'] + term if (value.get('@protected') or - (defined.get('@protected') and value.get('@protected') != False)): + (defined.get('@protected') and value.get('@protected', True))): mapping['protected'] = True if '@type' in value: @@ -5131,7 +4934,7 @@ def _create_term_definition(self, active_ctx, local_ctx, term, defined, options, mapping['@container'] = container if '@index' in value: - if not '@container' in value or not '@index' in mapping['@container']: + if '@container' not in value or '@index' not in mapping['@container']: raise JsonLdError( 'Invalid JSON-LD syntax; @index without @index 
in @container.', 'jsonld.SyntaxError', @@ -5353,8 +5156,8 @@ def _get_inverse_context(self, active_ctx): # create term selections for each mapping in the context, ordered by # shortest and then lexicographically least for term, mapping in sorted( - active_ctx['mappings'].items(), - key=cmp_to_key(_compare_shortest_least)): + active_ctx['mappings'].items(), + key=lambda kv: (len(kv[0]), kv[0])): if mapping is None or not mapping.get('@id'): continue @@ -5438,18 +5241,16 @@ def _clone_active_context(self, active_ctx): child['@vocab'] = active_ctx['@vocab'] return child - class JsonLdError(Exception): """ Base class for JSON-LD errors. """ - def __init__(self, message, type_, details=None, code=None, cause=None): + def __init__(self, message, type_, details=None, code=None): Exception.__init__(self, message) self.type = type_ self.details = details self.code = code - self.cause = cause self.causeTrace = traceback.extract_tb(*sys.exc_info()[2:]) def __str__(self): @@ -5459,622 +5260,10 @@ def __str__(self): rval += '\nCode: ' + self.code if self.details: rval += '\nDetails: ' + repr(self.details) - if self.cause: - rval += '\nCause: ' + str(self.cause) - rval += ''.join(traceback.format_list(self.causeTrace)) + rval += '\nCause: ' + str(self.__cause__) + rval += ''.join(traceback.format_list(self.causeTrace)) return rval - -class IdentifierIssuer(object): - """ - An IdentifierIssuer issues unique identifiers, keeping track of any - previously issued identifiers. - """ - - def __init__(self, prefix): - """ - Initializes a new IdentifierIssuer. - - :param prefix: the prefix to use (''). - """ - self.prefix = prefix - self.counter = 0 - self.existing = {} - self.order = [] - - """ - Gets the new identifier for the given old identifier, where if no old - identifier is given a new identifier will be generated. - - :param [old]: the old identifier to get the new identifier for. - - :return: the new identifier. - """ - def get_id(self, old=None): - # return existing old identifier - if old and old in self.existing: - return self.existing[old] - - # get next identifier - id_ = self.prefix + str(self.counter) - self.counter += 1 - - # save mapping - if old is not None: - self.existing[old] = id_ - self.order.append(old) - - return id_ - - def has_id(self, old): - """ - Returns True if the given old identifier has already been assigned a - new identifier. - - :param old: the old identifier to check. - - :return: True if the old identifier has been assigned a new identifier, - False if not. - """ - return old in self.existing - - -class URDNA2015(object): - """ - URDNA2015 implements the URDNA2015 RDF Dataset Normalization Algorithm. - """ - - def __init__(self): - self.blank_node_info = {} - self.hash_to_blank_nodes = {} - self.canonical_issuer = IdentifierIssuer('_:c14n') - self.quads = [] - self.POSITIONS = {'subject': 's', 'object': 'o', 'name': 'g'} - - # 4.4) Normalization Algorithm - def main(self, dataset, options): - # handle invalid output format - if 'format' in options: - if (options['format'] != 'application/n-quads' and - options['format'] != 'application/nquads'): - raise JsonLdError( - 'Unknown output format.', - 'jsonld.UnknownFormat', {'format': options['format']}) - - # 1) Create the normalization state. 
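The removed `cause=` keyword is replaced throughout by PEP 3134 exception chaining: `raise ... from cause` stores the original exception on `__cause__`, which the new `JsonLdError.__str__` reads back. The mechanism in miniature:

```python
try:
    try:
        raise ValueError('low-level failure')
    except ValueError as cause:
        raise RuntimeError('high-level wrapper') from cause
except RuntimeError as err:
    assert isinstance(err.__cause__, ValueError)  # chained original
```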
- - # 2) For every quad in input dataset: - for graph_name, triples in dataset.items(): - if graph_name == '@default': - graph_name = None - for triple in triples: - quad = triple - if graph_name is not None: - if graph_name.startswith('_:'): - quad['name'] = {'type': 'blank node'} - else: - quad['name'] = {'type': 'IRI'} - quad['name']['value'] = graph_name - self.quads.append(quad) - - # 2.1) For each blank node that occurs in the quad, add a - # reference to the quad using the blank node identifier in the - # blank node to quads map, creating a new entry if necessary. - for key, component in quad.items(): - if key == 'predicate' or component['type'] != 'blank node': - continue - id_ = component['value'] - self.blank_node_info.setdefault( - id_, {'quads': []})['quads'].append(quad) - - # 3) Create a list of non-normalized blank node identifiers and - # populate it using the keys from the blank node to quads map. - non_normalized = set(self.blank_node_info.keys()) - - # 4) Initialize simple, a boolean flag, to true. - simple = True - - # 5) While simple is true, issue canonical identifiers for blank nodes: - while simple: - # 5.1) Set simple to false. - simple = False - - # 5.2) Clear hash to blank nodes map. - self.hash_to_blank_nodes = {} - - # 5.3) For each blank node identifier identifier in non-normalized - # identifiers: - for id_ in non_normalized: - # 5.3.1) Create a hash, hash, according to the Hash First - # Degree Quads algorithm. - hash = self.hash_first_degree_quads(id_) - - # 5.3.2) Add hash and identifier to hash to blank nodes map, - # creating a new entry if necessary. - self.hash_to_blank_nodes.setdefault(hash, []).append(id_) - - # 5.4) For each hash to identifier list mapping in hash to blank - # nodes map, lexicographically-sorted by hash: - for hash, id_list in sorted(self.hash_to_blank_nodes.items()): - # 5.4.1) If the length of identifier list is greater than 1, - # continue to the next mapping. - if len(id_list) > 1: - continue - - # 5.4.2) Use the Issue Identifier algorithm, passing canonical - # issuer and the single blank node identifier in identifier - # list, identifier, to issue a canonical replacement identifier - # for identifier. - # TODO: consider changing `get_id` to `issue` - id_ = id_list[0] - self.canonical_issuer.get_id(id_) - - # 5.4.3) Remove identifier from non-normalized identifiers. - non_normalized.remove(id_) - - # 5.4.4) Remove hash from the hash to blank nodes map. - del self.hash_to_blank_nodes[hash] - - # 5.4.5) Set simple to true. - simple = True - - # 6) For each hash to identifier list mapping in hash to blank nodes - # map, lexicographically-sorted by hash: - for hash, id_list in sorted(self.hash_to_blank_nodes.items()): - # 6.1) Create hash path list where each item will be a result of - # running the Hash N-Degree Quads algorithm. - hash_path_list = [] - - # 6.2) For each blank node identifier identifier in identifier - # list: - for id_ in id_list: - # 6.2.1) If a canonical identifier has already been issued for - # identifier, continue to the next identifier. - if self.canonical_issuer.has_id(id_): - continue - - # 6.2.2) Create temporary issuer, an identifier issuer - # initialized with the prefix _:b. - issuer = IdentifierIssuer('_:b') - - # 6.2.3) Use the Issue Identifier algorithm, passing temporary - # issuer and identifier, to issue a new temporary blank node - # identifier for identifier. 
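The change also drops the `cmp`/`cmp_to_key` comparators (such as the `cmp_hashes` sort removed just below) in favor of plain key functions, both for sorting hash-path results and for shortest-then-lexicographically-least term order:

```python
results = [{'hash': 'b'}, {'hash': 'a'}]
assert sorted(results, key=lambda r: r['hash'])[0]['hash'] == 'a'

terms = ['abc', 'b', 'aa']
assert sorted(terms, key=lambda t: (len(t), t)) == ['b', 'aa', 'abc']
```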
- issuer.get_id(id_) - - # 6.2.4) Run the Hash N-Degree Quads algorithm, passing - # temporary issuer, and append the result to the hash path - # list. - hash_path_list.append(self.hash_n_degree_quads(id_, issuer)) - - # 6.3) For each result in the hash path list, - # lexicographically-sorted by the hash in result: - cmp_hashes = cmp_to_key(lambda x, y: cmp(x['hash'], y['hash'])) - for result in sorted(hash_path_list, key=cmp_hashes): - # 6.3.1) For each blank node identifier, existing identifier, - # that was issued a temporary identifier by identifier issuer - # in result, issue a canonical identifier, in the same order, - # using the Issue Identifier algorithm, passing canonical - # issuer and existing identifier. - for existing in result['issuer'].order: - self.canonical_issuer.get_id(existing) - - # Note: At this point all blank nodes in the set of RDF quads have been - # assigned canonical identifiers, which have been stored in the - # canonical issuer. Here each quad is updated by assigning each of its - # blank nodes its new identifier. - - # 7) For each quad, quad, in input dataset: - normalized = [] - for quad in self.quads: - # 7.1) Create a copy, quad copy, of quad and replace any existing - # blank node identifiers using the canonical identifiers previously - # issued by canonical issuer. Note: We optimize away the copy here. - for key, component in quad.items(): - if key == 'predicate': - continue - if(component['type'] == 'blank node' and not - component['value'].startswith( - self.canonical_issuer.prefix)): - component['value'] = self.canonical_issuer.get_id( - component['value']) - - # 7.2) Add quad copy to the normalized dataset. - normalized.append(JsonLdProcessor.to_nquad(quad)) - - # sort normalized output - normalized.sort() - - # 8) Return the normalized dataset. - if (options.get('format') == 'application/n-quads' or - options.get('format') == 'application/nquads'): - return ''.join(normalized) - return JsonLdProcessor.parse_nquads(''.join(normalized)) - - # 4.6) Hash First Degree Quads - def hash_first_degree_quads(self, id_): - # return cached hash - info = self.blank_node_info[id_] - if 'hash' in info: - return info['hash'] - - # 1) Initialize nquads to an empty list. It will be used to store quads - # in N-Quads format. - nquads = [] - - # 2) Get the list of quads quads associated with the reference blank - # node identifier in the blank node to quads map. - quads = info['quads'] - - # 3) For each quad quad in quads: - for quad in quads: - # 3.1) Serialize the quad in N-Quads format with the following - # special rule: - - # 3.1.1) If any component in quad is an blank node, then serialize - # it using a special identifier as follows: - copy = {} - for key, component in quad.items(): - if key == 'predicate': - copy[key] = component - continue - # 3.1.2) If the blank node's existing blank node identifier - # matches the reference blank node identifier then use the - # blank node identifier _:a, otherwise, use the blank node - # identifier _:z. - copy[key] = self.modify_first_degree_component( - id_, component, key) - nquads.append(JsonLdProcessor.to_nquad(copy)) - - # 4) Sort nquads in lexicographical order. - nquads.sort() - - # 5) Return the hash that results from passing the sorted, joined - # nquads through the hash algorithm. 
- info['hash'] = self.hash_nquads(nquads) - return info['hash'] - - # helper for modifying component during Hash First Degree Quads - def modify_first_degree_component(self, id_, component, key): - if component['type'] != 'blank node': - return component - component = copy.deepcopy(component) - component['value'] = '_:a' if component['value'] == id_ else '_:z' - return component - - # 4.7) Hash Related Blank Node - def hash_related_blank_node(self, related, quad, issuer, position): - # 1) Set the identifier to use for related, preferring first the - # canonical identifier for related if issued, second the identifier - # issued by issuer if issued, and last, if necessary, the result of - # the Hash First Degree Quads algorithm, passing related. - if self.canonical_issuer.has_id(related): - id_ = self.canonical_issuer.get_id(related) - elif issuer.has_id(related): - id_ = issuer.get_id(related) - else: - id_ = self.hash_first_degree_quads(related) - - # 2) Initialize a string input to the value of position. - # Note: We use a hash object instead. - md = self.create_hash() - md.update(position.encode('utf8')) - - # 3) If position is not g, append <, the value of the predicate in - # quad, and > to input. - if position != 'g': - md.update(self.get_related_predicate(quad).encode('utf8')) - - # 4) Append identifier to input. - md.update(id_.encode('utf8')) - - # 5) Return the hash that results from passing input through the hash - # algorithm. - return md.hexdigest() - - # helper for getting a related predicate - def get_related_predicate(self, quad): - return '<' + quad['predicate']['value'] + '>' - - # 4.8) Hash N-Degree Quads - def hash_n_degree_quads(self, id_, issuer): - # 1) Create a hash to related blank nodes map for storing hashes that - # identify related blank nodes. - # Note: 2) and 3) handled within `createHashToRelated` - hash_to_related = self.create_hash_to_related(id_, issuer) - - # 4) Create an empty string, data to hash. - # Note: We create a hash object instead. - md = self.create_hash() - - # 5) For each related hash to blank node list mapping in hash to - # related blank nodes map, sorted lexicographically by related hash: - for hash, blank_nodes in sorted(hash_to_related.items()): - # 5.1) Append the related hash to the data to hash. - md.update(hash.encode('utf8')) - - # 5.2) Create a string chosen path. - chosen_path = '' - - # 5.3) Create an unset chosen issuer variable. - chosen_issuer = None - - # 5.4) For each permutation of blank node list: - for permutation in permutations(blank_nodes): - # 5.4.1) Create a copy of issuer, issuer copy. - issuer_copy = copy.deepcopy(issuer) - - # 5.4.2) Create a string path. - path = '' - - # 5.4.3) Create a recursion list, to store blank node - # identifiers that must be recursively processed by this - # algorithm. - recursion_list = [] - - # 5.4.4) For each related in permutation: - skip_to_next_permutation = False - for related in permutation: - # 5.4.4.1) If a canonical identifier has been issued for - # related, append it to path. - if(self.canonical_issuer.has_id(related)): - path += self.canonical_issuer.get_id(related) - # 5.4.4.2) Otherwise: - else: - # 5.4.4.2.1) If issuer copy has not issued an - # identifier for related, append related to recursion - # list. - if not issuer_copy.has_id(related): - recursion_list.append(related) - - # 5.4.4.2.2) Use the Issue Identifier algorithm, - # passing issuer copy and related and append the result - # to path. 
- path += issuer_copy.get_id(related) - - # 5.4.4.3) If chosen path is not empty and the length of - # path is greater than or equal to the length of chosen - # path and path is lexicographically greater than chosen - # path, then skip to the next permutation. - if(len(chosen_path) != 0 and - len(path) >= len(chosen_path) and - path > chosen_path): - skip_to_next_permutation = True - break - - if skip_to_next_permutation: - continue - - # 5.4.5) For each related in recursion list: - for related in recursion_list: - # 5.4.5.1) Set result to the result of recursively - # executing the Hash N-Degree Quads algorithm, passing - # related for identifier and issuer copy for path - # identifier issuer. - result = self.hash_n_degree_quads(related, issuer_copy) - - # 5.4.5.2) Use the Issue Identifier algorithm, passing - # issuer copy and related and append the result to path. - path += issuer_copy.get_id(related) - - # 5.4.5.3) Append <, the hash in result, and > to path. - path += '<' + result['hash'] + '>' - - # 5.4.5.4) Set issuer copy to the identifier issuer in - # result. - issuer_copy = result['issuer'] - - # 5.4.5.5) If chosen path is not empty and the length of - # path is greater than or equal to the length of chosen - # path and path is lexicographically greater than chosen - # path, then skip to the next permutation. - if(len(chosen_path) != 0 and - len(path) >= len(chosen_path) and - path > chosen_path): - skip_to_next_permutation = True - break - - if skip_to_next_permutation: - continue - - # 5.4.6) If chosen path is empty or path is lexicographically - # less than chosen path, set chosen path to path and chosen - # issuer to issuer copy. - if len(chosen_path) == 0 or path < chosen_path: - chosen_path = path - chosen_issuer = issuer_copy - - # 5.5) Append chosen path to data to hash. - md.update(chosen_path.encode('utf8')) - - # 5.6) Replace issuer, by reference, with chosen issuer. - issuer = chosen_issuer - - # 6) Return issuer and the hash that results from passing data to hash - # through the hash algorithm. - return {'hash': md.hexdigest(), 'issuer': issuer} - - # helper for creating hash to related blank nodes map - def create_hash_to_related(self, id_, issuer): - # 1) Create a hash to related blank nodes map for storing hashes that - # identify related blank nodes. - hash_to_related = {} - - # 2) Get a reference, quads, to the list of quads in the blank node to - # quads map for the key identifier. - quads = self.blank_node_info[id_]['quads'] - - # 3) For each quad in quads: - for quad in quads: - # 3.1) For each component in quad, if component is the subject, - # object, and graph name and it is a blank node that is not - # identified by identifier: - for key, component in quad.items(): - if(key != 'predicate' and - component['type'] == 'blank node' and - component['value'] != id_): - # 3.1.1) Set hash to the result of the Hash Related Blank - # Node algorithm, passing the blank node identifier for - # component as related, quad, path identifier issuer as - # issuer, and position as either s, o, or g based on - # whether component is a subject, object, graph name, - # respectively. - related = component['value'] - position = self.POSITIONS[key] - hash = self.hash_related_blank_node( - related, quad, issuer, position) - - # 3.1.2) Add a mapping of hash to the blank node identifier - # for component to hash to related blank nodes map, adding - # an entry as necessary. 
- hash_to_related.setdefault(hash, []).append(related) - - return hash_to_related - - # helper to create appropriate hash object - def create_hash(self): - return hashlib.sha256() - - # helper to hash a list of nquads - def hash_nquads(self, nquads): - md = self.create_hash() - for nquad in nquads: - md.update(nquad.encode('utf8')) - return md.hexdigest() - - -class URGNA2012(URDNA2015): - """ - URGNA2012 implements the URGNA2012 RDF Graph Normalization Algorithm. - """ - - def __init__(self): - URDNA2015.__init__(self) - - # helper for modifying component during Hash First Degree Quads - def modify_first_degree_component(self, id_, component, key): - if component['type'] != 'blank node': - return component - component = copy.deepcopy(component) - if key == 'name': - component['value'] = '_:g' - else: - component['value'] = '_:a' if component['value'] == id_ else '_:z' - return component - - # helper for getting a related predicate - def get_related_predicate(self, quad): - return quad['predicate']['value'] - - # helper for creating hash to related blank nodes map - def create_hash_to_related(self, id_, issuer): - # 1) Create a hash to related blank nodes map for storing hashes that - # identify related blank nodes. - hash_to_related = {} - - # 2) Get a reference, quads, to the list of quads in the blank node to - # quads map for the key identifier. - quads = self.blank_node_info[id_]['quads'] - - # 3) For each quad in quads: - for quad in quads: - # 3.1) If the quad's subject is a blank node that does not match - # identifier, set hash to the result of the Hash Related Blank Node - # algorithm, passing the blank node identifier for subject as - # related, quad, path identifier issuer as issuer, and p as - # position. - if(quad['subject']['type'] == 'blank node' and - quad['subject']['value'] != id_): - related = quad['subject']['value'] - position = 'p' - # 3.2) Otherwise, if quad's object is a blank node that does - # not match identifier, to the result of the Hash Related Blank - # Node algorithm, passing the blank node identifier for object - # as related, quad, path identifier issuer as issuer, and r - # as position. - elif(quad['object']['type'] == 'blank node' and - quad['object']['value'] != id_): - related = quad['object']['value'] - position = 'r' - # 3.3) Otherwise, continue to the next quad. - else: - continue - - # 3.4) Add a mapping of hash to the blank node identifier for the - # component that matched (subject or object) to hash to related - # blank nodes map, adding an entry as necessary. - hash = self.hash_related_blank_node( - related, quad, issuer, position) - hash_to_related.setdefault(hash, []).append(related) - - return hash_to_related - - # helper to create appropriate hash object - def create_hash(self): - return hashlib.sha1() - - -def permutations(elements): - """ - Generates all of the possible permutations for the given list of elements. - - :param elements: the list of elements to permutate. - """ - # begin with sorted elements - elements.sort() - # initialize directional info for permutation algorithm - left = {} - for v in elements: - left[v] = True - - length = len(elements) - last = length - 1 - while True: - yield elements - - # Calculate the next permutation using the Steinhaus-Johnson-Trotter - # permutation algorithm. 
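The `permutations` generator (kept in canon.py, removed here from jsonld.py) yields the Steinhaus-Johnson-Trotter sequence by mutating and re-yielding the same list, so callers that need to keep a permutation must copy it:

```python
from pyld.canon import permutations

# input is sorted first; each yield reuses one list object
perms = [list(p) for p in permutations(['b', 'a'])]
assert perms == [['a', 'b'], ['b', 'a']]
```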
-def permutations(elements):
-    """
-    Generates all of the possible permutations for the given list of elements.
-
-    :param elements: the list of elements to permute.
-    """
-    # begin with sorted elements
-    elements.sort()
-    # initialize directional info for permutation algorithm
-    left = {}
-    for v in elements:
-        left[v] = True
-
-    length = len(elements)
-    last = length - 1
-    while True:
-        yield elements
-
-        # Calculate the next permutation using the Steinhaus-Johnson-Trotter
-        # permutation algorithm.
-
-        # get largest mobile element k
-        # (mobile: element is greater than the one it is looking at)
-        k, pos = None, 0
-        for i in range(length):
-            e = elements[i]
-            is_left = left[e]
-            if((k is None or e > k) and
-                    ((is_left and i > 0 and e > elements[i - 1]) or
-                    (not is_left and i < last and e > elements[i + 1]))):
-                k, pos = e, i
-
-        # no more permutations
-        if k is None:
-            return
-
-        # swap k and the element it is looking at
-        swap = pos - 1 if left[k] else pos + 1
-        elements[pos], elements[swap] = elements[swap], k
-
-        # reverse the direction of all elements larger than k
-        for i in range(length):
-            if elements[i] > k:
-                left[elements[i]] = not left[elements[i]]
-
-
-def _compare_shortest_least(a, b):
-    """
-    Compares two strings first based on length and then lexicographically.
-
-    :param a: the first string.
-    :param b: the second string.
-
-    :return: -1 if a < b, 1 if a > b, 0 if a == b.
-    """
-    rval = cmp(len(a), len(b))
-    if rval == 0:
-        rval = cmp(a, b)
-    return rval
-
-
 def _is_keyword(v):
     """
     Returns whether or not the given value is a keyword.
@@ -6413,8 +5602,7 @@ def load_document(url,
         raise JsonLdError(
             'Could not retrieve a JSON-LD document from the URL.',
             'jsonld.LoadDocumentError',
-            {'remoteDoc': remote_doc}, code='loading document failed',
-            cause=cause)
+            {'remoteDoc': remote_doc}, code='loading document failed') from cause
     return remote_doc
@@ -6469,7 +5657,7 @@ def load_html(input, url, profile, options):
         raise JsonLdError(
             'Invalid JSON syntax.', 'jsonld.SyntaxError',
-            {'content': content}, code='invalid script element', cause=cause)
+            {'content': content}, code='invalid script element') from cause
     elements = []
     if profile:
@@ -6489,7 +5677,7 @@ def load_html(input, url, profile, options):
             raise JsonLdError(
                 'Invalid JSON syntax.', 'jsonld.SyntaxError',
-                {'content': element.text}, code='invalid script element', cause=cause)
+                {'content': element.text}, code='invalid script element') from cause
             return result
     elif elements:
         try:
@@ -6498,7 +5686,7 @@
             raise JsonLdError(
                 'Invalid JSON syntax.', 'jsonld.SyntaxError',
-                {'content': elements[0].text}, code='invalid script element', cause=cause)
+                {'content': elements[0].text}, code='invalid script element') from cause
     else:
         raise JsonLdError(
             'No script tag found.',
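The hunks above replace the library's hand-rolled `cause` keyword with PEP 3134 exception chaining (`raise ... from cause`), so the originating exception now travels on the standard `__cause__` attribute. A minimal sketch of the observable behavior, using builtin exceptions rather than JsonLdError:

    import json

    def parse_strict(text):
        try:
            return json.loads(text)
        except ValueError as cause:
            # chain instead of passing cause= through the constructor
            raise RuntimeError('Invalid JSON syntax.') from cause

    try:
        parse_strict('{not json')
    except RuntimeError as err:
        assert isinstance(err.__cause__, ValueError)  # original error preserved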
+ """ + # define partial regexes + iri = '(?:<([^:]+:[^>]*)>)' + bnode = '(_:(?:[A-Za-z][A-Za-z0-9]*))' + plain = '"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"' + datatype = '(?:\\^\\^' + iri + ')' + language = '(?:@([a-zA-Z]+(?:-[a-zA-Z0-9]+)*))' + literal = '(?:' + plain + '(?:' + datatype + '|' + language + ')?)' + ws = '[ \\t]+' + wso = '[ \\t]*' + eoln = r'(?:\r\n)|(?:\n)|(?:\r)' + empty = r'^' + wso + '$' + + # define quad part regexes + subject = '(?:' + iri + '|' + bnode + ')' + ws + property = iri + ws + object = '(?:' + iri + '|' + bnode + '|' + literal + ')' + wso + graph = '(?:\\.|(?:(?:' + iri + '|' + bnode + ')' + wso + '\\.))' + + # Note: Notice that the graph position does not include literals + # even though they are specified as a possible value in the + # N-Quads note (http://sw.deri.org/2008/07/n-quads/). This is + # intentional, as literals in that position are not supported by the + # RDF data model or the JSON-LD data model. + # See: https://github.com/digitalbazaar/pyld/pull/19 + + # full quad regex + quad = r'^' + wso + subject + property + object + graph + wso + '$' + + # build RDF dataset + dataset = {} + + # split N-Quad input into lines + lines = re.split(eoln, input_) + line_number = 0 + for line in lines: + line_number += 1 + + # skip empty lines + if re.search(empty, line) is not None: + continue + + # parse quad + match = re.search(quad, line) + if match is None: + raise ParserError('Error while parsing N-Quads invalid quad.', line_number=line_number) + match = match.groups() + + # create RDF triple + triple = {'subject': {}, 'predicate': {}, 'object': {}} + + # get subject + if match[0] is not None: + triple['subject'] = {'type': 'IRI', 'value': match[0]} + else: + triple['subject'] = {'type': 'blank node', 'value': match[1]} + + # get predicate + triple['predicate'] = {'type': 'IRI', 'value': match[2]} + + # get object + if match[3] is not None: + triple['object'] = {'type': 'IRI', 'value': match[3]} + elif match[4] is not None: + triple['object'] = {'type': 'blank node', 'value': match[4]} + else: + triple['object'] = {'type': 'literal'} + unescaped = ( + match[5] + .replace('\\"', '\"') + .replace('\\t', '\t') + .replace('\\n', '\n') + .replace('\\r', '\r') + .replace('\\\\', '\\')) + if match[6] is not None: + triple['object']['datatype'] = match[6] + elif match[7] is not None: + triple['object']['datatype'] = RDF_LANGSTRING + triple['object']['language'] = match[7] + else: + triple['object']['datatype'] = XSD_STRING + triple['object']['value'] = unescaped + + # get graph name ('@default' is used for the default graph) + name = '@default' + if match[8] is not None: + name = match[8] + elif match[9] is not None: + name = match[9] + + # initialize graph in dataset + if name not in dataset: + dataset[name] = [triple] + # add triple if unique to its graph + else: + unique = True + triples = dataset[name] + for t in dataset[name]: + if _compare_rdf_triples(t, triple): + unique = False + break + if unique: + triples.append(triple) + + return dataset + +def to_nquads(dataset): + """ + Converts an RDF dataset to N-Quads. + + :param dataset: the RDF dataset to convert. + + :return: the N-Quads string. + """ + quads = [] + for graph_name, triples in dataset.items(): + for triple in triples: + if graph_name == '@default': + graph_name = None + quads.append(to_nquad(triple, graph_name)) + quads.sort() + return ''.join(quads) + +def to_nquad(triple, graph_name=None): + """ + Converts an RDF triple and graph name to an N-Quad string (a single + quad). 
+
+
+def to_nquad(triple, graph_name=None):
+    """
+    Converts an RDF triple and graph name to an N-Quad string (a single
+    quad).
+
+    :param triple: the RDF triple or quad to convert. A triple or quad
+        may be passed; if a triple is passed, `graph_name` should be
+        given to name the graph the triple is in (`None` for the
+        default graph).
+    :param graph_name: the name of the graph containing the triple, None
+        for the default graph.
+
+    :return: the N-Quad string.
+    """
+    s = triple['subject']
+    p = triple['predicate']
+    o = triple['object']
+    g = triple.get('name', {'value': graph_name})['value']
+
+    quad = ''
+
+    # subject is an IRI
+    if s['type'] == 'IRI':
+        quad += '<' + s['value'] + '>'
+    else:
+        quad += s['value']
+    quad += ' '
+
+    # property is an IRI
+    if p['type'] == 'IRI':
+        quad += '<' + p['value'] + '>'
+    else:
+        quad += p['value']
+    quad += ' '
+
+    # object is IRI, bnode, or literal
+    if o['type'] == 'IRI':
+        quad += '<' + o['value'] + '>'
+    elif o['type'] == 'blank node':
+        quad += o['value']
+    else:
+        escaped = (
+            o['value']
+            .replace('\\', '\\\\')
+            .replace('\t', '\\t')
+            .replace('\n', '\\n')
+            .replace('\r', '\\r')
+            .replace('"', '\\"'))
+        quad += '"' + escaped + '"'
+        if o['datatype'] == RDF_LANGSTRING:
+            if o['language']:
+                quad += '@' + o['language']
+        elif o['datatype'] != XSD_STRING:
+            quad += '^^<' + o['datatype'] + '>'
+
+    # graph
+    if g is not None:
+        if not g.startswith('_:'):
+            quad += ' <' + g + '>'
+        else:
+            quad += ' ' + g
+
+    quad += ' .\n'
+    return quad
+
+
+def _compare_rdf_triples(t1, t2):
+    """
+    Compares two RDF triples for equality.
+
+    :param t1: the first triple.
+    :param t2: the second triple.
+
+    :return: True if the triples are the same, False if not.
+    """
+    for attr in ['subject', 'predicate', 'object']:
+        if(t1[attr]['type'] != t2[attr]['type'] or
+                t1[attr]['value'] != t2[attr]['value']):
+            return False
+
+    if t1['object'].get('language') != t2['object'].get('language'):
+        return False
+    if t1['object'].get('datatype') != t2['object'].get('datatype'):
+        return False
+
+    return True
+
+
+class ParserError(ValueError):
+    """
+    An error raised when N-Quads input cannot be parsed.
+    """
+
+    def __init__(self, message, line_number=None):
+        ValueError.__init__(self, message)
+        self.line_number = line_number
diff --git a/tests/runtests.py b/tests/runtests.py
index f12df9d..2affd92 100644
--- a/tests/runtests.py
+++ b/tests/runtests.py
@@ -544,8 +544,8 @@ def get_jsonld_error_code(err):
     if isinstance(err, jsonld.JsonLdError):
         if err.code:
             return err.code
-        elif err.cause:
-            return get_jsonld_error_code(err.cause)
+        elif err.__cause__:
+            return get_jsonld_error_code(err.__cause__)
     return str(err)
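After this change, the test runner discovers error codes by walking the standard `__cause__` chain rather than the removed `cause` attribute. A self-contained sketch of that lookup; `KeyedError` is a hypothetical stand-in for jsonld.JsonLdError:

    class KeyedError(Exception):
        def __init__(self, message, code=None):
            Exception.__init__(self, message)
            self.code = code

    def error_code(err):
        # same shape as get_jsonld_error_code above
        if isinstance(err, KeyedError):
            if err.code:
                return err.code
            elif err.__cause__:
                return error_code(err.__cause__)
        return str(err)

    try:
        try:
            raise KeyedError('inner', code='invalid script element')
        except KeyedError as cause:
            raise KeyedError('outer') from cause  # wrapper carries no code
    except KeyedError as err:
        assert error_code(err) == 'invalid script element'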