From bc0d7f84542e7bf645ac9d992adc1e454c569007 Mon Sep 17 00:00:00 2001 From: kouralex <1723419+kouralex@users.noreply.github.com> Date: Tue, 24 Nov 2020 12:39:52 +0200 Subject: [PATCH] introduce alphanumerical sorting policy; fixes #81 --- skosify/check.py | 20 +++++++++++++++++-- test/test_check.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/skosify/check.py b/skosify/check.py index 12b5214..0a13dea 100644 --- a/skosify/check.py +++ b/skosify/check.py @@ -3,6 +3,9 @@ import logging import time +import locale +import sys +import re from rdflib.namespace import RDF, SKOS from .rdftools.namespace import SKOSEXT from .rdftools import localname, find_prop_overlap @@ -120,17 +123,19 @@ def preflabel_uniqueness(rdf, policy='all'): :param Graph rdf: An rdflib.graph.Graph object. :param str policy: Policy for deciding which value to keep as prefLabel - when multiple prefLabels are found. Possible values are 'shortest' + when multiple prefLabels are found. Possible values are 'natural' + (alphanumerical sort that follows the language collation order of the literal), 'shortest' (keep the shortest label), 'longest' (keep the longest label), 'uppercase' (prefer uppercase), 'lowercase' (prefer uppercase) or 'all' (keep all, just log the problems). Alternatively, a list of policies to apply in order, such as ['shortest', 'lowercase'], may - be used. + be used. Alphanumerical sorting is appended by default to any applicable list of policies to provide a deterministic tie-breaker. """ resources = set( (res for res, label in rdf.subject_objects(SKOS.prefLabel))) policy_fn = { + 'natural': locale.strxfrm, 'shortest': len, 'longest': lambda x: -len(x), 'uppercase': lambda x: int(x[0].islower()), @@ -157,6 +162,13 @@ def key_fn(label): if lang not in prefLabels: prefLabels[lang] = [] prefLabels[lang].append(label) + + if 'natural' not in policies: + policies.append('natural') + if sys.version_info.major < 3: + reload(sys) + sys.setdefaultencoding('utf8') + for lang, labels in prefLabels.items(): if len(labels) > 1: if policies[0] == 'all': @@ -165,6 +177,10 @@ def key_fn(label): "but keeping all of them due to preflabel-policy=all.", res, lang) continue + try: + locale.setlocale(locale.LC_ALL, re.sub(r'\..*', '.UTF-8', locale.normalize(lang))) + except locale.Error as err: + locale.setlocale(locale.LC_ALL, 'C') chosen = sorted(labels, key=key_fn)[0] diff --git a/test/test_check.py b/test/test_check.py index f457a02..a1786f1 100644 --- a/test/test_check.py +++ b/test/test_check.py @@ -173,3 +173,53 @@ def test_preflabel_uniqueness_shortest_uppercase(): assert (a, SKOS.altLabel, Literal('short', 'en')) in rdf assert (a, SKOS.altLabel, Literal('longer', 'en')) in rdf assert (a, SKOS.altLabel, Literal('Longer', 'en')) in rdf + + +def test_preflabel_uniqueness_alphanumeric(): + rdf = Graph() + a = BNode() + + rdf.add((a, RDF.type, SKOS.Concept)) + rdf.add((a, SKOS.prefLabel, Literal('äaa', 'en'))) # keep + rdf.add((a, SKOS.prefLabel, Literal('Äba', 'en'))) # remove + rdf.add((a, SKOS.prefLabel, Literal('aab', 'en'))) # remove + rdf.add((a, SKOS.prefLabel, Literal('aba', 'en'))) # remove + + rdf.add((a, SKOS.prefLabel, Literal('äa', 'fi'))) # remove + rdf.add((a, SKOS.prefLabel, Literal('Äb', 'fi'))) # remove + rdf.add((a, SKOS.prefLabel, Literal('aä', 'fi'))) # remove + rdf.add((a, SKOS.prefLabel, Literal('ab', 'fi'))) # keep + + len_before = len(rdf) + + skosify.check.preflabel_uniqueness(rdf, policy=['shortest']) + assert len(rdf) == len_before + assert (a, SKOS.prefLabel, Literal('äaa', 'en')) in rdf + assert (a, SKOS.altLabel, Literal('Äba', 'en')) in rdf + assert (a, SKOS.altLabel, Literal('aab', 'en')) in rdf + assert (a, SKOS.altLabel, Literal('aba', 'en')) in rdf + + assert (a, SKOS.prefLabel, Literal('ab', 'fi')) in rdf + assert (a, SKOS.altLabel, Literal('äa', 'fi')) in rdf + assert (a, SKOS.altLabel, Literal('Äb', 'fi')) in rdf + assert (a, SKOS.altLabel, Literal('aä', 'fi')) in rdf + + +def test_preflabel_uniqueness_alphanumeric2(): + rdf = Graph() + a = BNode() + + rdf.add((a, RDF.type, SKOS.Concept)) + rdf.add((a, SKOS.prefLabel, Literal('AAa', 'en'))) # remove + rdf.add((a, SKOS.prefLabel, Literal('Aaa', 'en'))) # remove + rdf.add((a, SKOS.prefLabel, Literal('aaa', 'en'))) # keep + rdf.add((a, SKOS.prefLabel, Literal('Äää', 'en'))) # remove + + len_before = len(rdf) + + skosify.check.preflabel_uniqueness(rdf, policy=[]) + assert len(rdf) == len_before + assert (a, SKOS.altLabel, Literal('AAa', 'en')) in rdf + assert (a, SKOS.altLabel, Literal('Aaa', 'en')) in rdf + assert (a, SKOS.prefLabel, Literal('aaa', 'en')) in rdf + assert (a, SKOS.altLabel, Literal('Äää', 'en')) in rdf