Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions skosify/check.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@

import logging
import time
import locale
import sys
import re
from rdflib.namespace import RDF, SKOS
from .rdftools.namespace import SKOSEXT
from .rdftools import localname, find_prop_overlap
Expand Down Expand Up @@ -120,17 +123,19 @@ def preflabel_uniqueness(rdf, policy='all'):

:param Graph rdf: An rdflib.graph.Graph object.
:param str policy: Policy for deciding which value to keep as prefLabel
when multiple prefLabels are found. Possible values are 'shortest'
when multiple prefLabels are found. Possible values are 'natural'
(alphanumerical sort that follows the language collation order of the literal), 'shortest'
(keep the shortest label), 'longest' (keep the longest label),
'uppercase' (prefer uppercase), 'lowercase' (prefer lowercase) or
'all' (keep all, just log the problems). Alternatively, a list of
policies to apply in order, such as ['shortest', 'lowercase'], may
be used.
be used. Alphanumerical sorting is appended by default to any applicable list of policies to provide a deterministic tie-breaker.
"""
resources = set(
(res for res, label in rdf.subject_objects(SKOS.prefLabel)))

policy_fn = {
'natural': locale.strxfrm,
'shortest': len,
'longest': lambda x: -len(x),
'uppercase': lambda x: int(x[0].islower()),
Expand All @@ -157,6 +162,13 @@ def key_fn(label):
if lang not in prefLabels:
prefLabels[lang] = []
prefLabels[lang].append(label)

if 'natural' not in policies:
policies.append('natural')
if sys.version_info.major < 3:
reload(sys)
sys.setdefaultencoding('utf8')

for lang, labels in prefLabels.items():
if len(labels) > 1:
if policies[0] == 'all':
Expand All @@ -165,6 +177,10 @@ def key_fn(label):
"but keeping all of them due to preflabel-policy=all.",
res, lang)
continue
try:
locale.setlocale(locale.LC_ALL, re.sub(r'\..*', '.UTF-8', locale.normalize(lang)))
except locale.Error as err:
locale.setlocale(locale.LC_ALL, 'C')

chosen = sorted(labels, key=key_fn)[0]

Expand Down
50 changes: 50 additions & 0 deletions test/test_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,53 @@ def test_preflabel_uniqueness_shortest_uppercase():
assert (a, SKOS.altLabel, Literal('short', 'en')) in rdf
assert (a, SKOS.altLabel, Literal('longer', 'en')) in rdf
assert (a, SKOS.altLabel, Literal('Longer', 'en')) in rdf


def test_preflabel_uniqueness_alphanumeric():
    """Check the tie-break between equally long prefLabels per language.

    Every English candidate is three characters long and every Finnish
    candidate is two, so the 'shortest' policy alone cannot pick a winner.
    The test expects 'äaa' (en) and 'ab' (fi) to stay as prefLabel while
    all other candidates are turned into altLabels.
    """
    graph = Graph()
    concept = BNode()
    graph.add((concept, RDF.type, SKOS.Concept))

    # Candidate prefLabels, inserted in the same order as before.
    candidates = [
        ('äaa', 'en'),  # keep
        ('Äba', 'en'),  # remove
        ('aab', 'en'),  # remove
        ('aba', 'en'),  # remove
        ('äa', 'fi'),  # remove
        ('Äb', 'fi'),  # remove
        ('aä', 'fi'),  # remove
        ('ab', 'fi'),  # keep
    ]
    for text, lang in candidates:
        graph.add((concept, SKOS.prefLabel, Literal(text, lang)))

    triples_before = len(graph)

    skosify.check.preflabel_uniqueness(graph, policy=['shortest'])

    # Labels are only expected to change property, never to disappear.
    assert len(graph) == triples_before

    expected = [
        (SKOS.prefLabel, 'äaa', 'en'),
        (SKOS.altLabel, 'Äba', 'en'),
        (SKOS.altLabel, 'aab', 'en'),
        (SKOS.altLabel, 'aba', 'en'),
        (SKOS.prefLabel, 'ab', 'fi'),
        (SKOS.altLabel, 'äa', 'fi'),
        (SKOS.altLabel, 'Äb', 'fi'),
        (SKOS.altLabel, 'aä', 'fi'),
    ]
    for prop, text, lang in expected:
        assert (concept, prop, Literal(text, lang)) in graph


def test_preflabel_uniqueness_alphanumeric2():
    """Check prefLabel selection when an empty policy list is passed.

    Four English candidates differing only in case and diacritics are
    added; the test expects 'aaa' to remain the prefLabel and the other
    three to become altLabels.
    """
    graph = Graph()
    concept = BNode()
    graph.add((concept, RDF.type, SKOS.Concept))

    # Insertion order matches the original: AAa, Aaa, aaa (keep), Äää.
    for text in ('AAa', 'Aaa', 'aaa', 'Äää'):
        graph.add((concept, SKOS.prefLabel, Literal(text, 'en')))

    triples_before = len(graph)

    skosify.check.preflabel_uniqueness(graph, policy=[])

    # Labels are only expected to change property, never to disappear.
    assert len(graph) == triples_before

    expected = (
        (SKOS.altLabel, 'AAa'),
        (SKOS.altLabel, 'Aaa'),
        (SKOS.prefLabel, 'aaa'),
        (SKOS.altLabel, 'Äää'),
    )
    for prop, text in expected:
        assert (concept, prop, Literal(text, 'en')) in graph