Skip to content

Commit ff953b2

Browse files
committed
updates to new script
1 parent 530309f commit ff953b2

File tree

1 file changed

+48
-9
lines changed

1 file changed

+48
-9
lines changed

scripts/investigate-amendment-dups.py

Lines changed: 48 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
#!/usr/bin/env python3
22
import sys
33
import re
4-
from peyutil import read_as_json
4+
import os
5+
from peyutil import read_as_json, write_as_json
56

67
no_app_pat = re.compile(r"^amende?ment [#](\d+) not applied:")
78
homonym_pat = re.compile(r"^amende?ment [#](\d+) not applied: ([A-Za-z][-A-Za-z0-9 ]+[A-Za-z0-9]) is a homonym of (\d+)")
89

10+
src_fn_pat = re.compile(r"^(additions\-\d+\-\d+):(\d+)$")
11+
by_study_id = {}
12+
913
def check_if_del_works(amend_num, name, ott_id, edott, amendments_repo):
1014
exp_amend_idx = amend_num - 1
1115
rel_amends = []
@@ -20,13 +24,28 @@ def check_if_del_works(amend_num, name, ott_id, edott, amendments_repo):
2024
return (False, "solo")
2125
if rel_amends[0][0] == exp_amend_idx:
2226
return False, "first"
23-
found = False
27+
found = None
2428
for ra in rel_amends[1:]:
2529
if ra[0] == exp_amend_idx:
26-
found = True
30+
found = ra[1]
2731
break
28-
if not found:
32+
if found is None:
2933
return False, "notfound"
34+
taxon = found["taxon"]
35+
src = taxon["sourceinfo"]
36+
m = src_fn_pat.match(src)
37+
if not m:
38+
raise ValueError(f"'sourceinfo' {src} does not fit pattern.")
39+
fn_frag = m.group(1)
40+
bogus_id = int(m.group(2))
41+
fn = f"{fn_frag}.json"
42+
fp = os.path.join(amendments_repo, "amendments", fn)
43+
if not os.path.isfile(fp):
44+
raise RuntimeError(f"amendments file {fp} does not exist")
45+
offending_amend = read_as_json(fp)
46+
study_id = offending_amend['study_id']
47+
name_set = by_study_id.setdefault(study_id, set())
48+
name_set.add(name)
3049
return True, ""
3150

3251
def main(edott_fp,
@@ -43,22 +62,42 @@ def main(edott_fp,
4362
amend_num = int(hm.group(1))
4463
name = hm.group(2)
4564
ott_id = int(hm.group(3))
46-
print(amend_num, name, ott_id)
4765
rc = check_if_del_works(amend_num, name, ott_id, edott, amendments_repo)
4866
if rc[0]:
4967
to_del.append(amend_num)
5068
else:
5169
prob = rc[1]
5270
if prob == "solo":
53-
print("Atypical homonym. Solo in amendments {amend_num}:", edott[amend_num -1])
71+
sys.stderr.write(f"Atypical homonym. Solo in amendments {amend_num}: {edott[amend_num -1]}\n")
5472
elif prob == "notfound":
55-
print("PROBLEM {amend_num} does not match:", edott[amend_num -1])
73+
sys.stderr.write(f"PROBLEM {amend_num} does not match: {edott[amend_num -1]}\n")
5674
else:
5775
assert(prob == "first")
58-
print("Atypical homonym. First in amendments {amend_num} is bad:", edott[amend_num -1])
76+
sys.stderr.write(f"Atypical homonym. First in amendments {amend_num} is bad: {edott[amend_num -1]}\n")
5977
else:
60-
print(m.group(1), "not a homonym")
78+
# Diagnostic for a "not applied" message that is not a homonym report.
# End it with a newline, as every other stderr diagnostic here does, so
# consecutive messages do not run together on one line.
sys.stderr.write(m.group(1) + " not a homonym\n")
79+
# Report, study by study (sorted by study id), the taxon names collected in
# by_study_id that need to be remapped in the curator app.
for study_id in sorted(by_study_id):
    # Sort once; the sorted list serves both the single-name and the
    # multi-name branch. (A set is not an iterator, so next(name_set)
    # would raise TypeError -- index the sorted list instead.)
    names = sorted(by_study_id[study_id])
    url = f"https://tree.opentreeoflife.org/curator/study/view/{study_id}"
    if len(names) == 1:
        sys.stderr.write(f"In {url} need to remap 1 taxon: \"{names[0]}\"\n")
    else:
        sys.stderr.write(f"In {url} need to remap {len(names)} taxa:\n")
        for name in names:
            sys.stderr.write(f" \"{name}\"\n")
6192

93+
# Emit the amendment list minus the entries flagged for deletion.
if to_del:
    # to_del holds 1-based amendment numbers; edott is indexed from 0.
    doomed_indices = {amend_num - 1 for amend_num in to_del}
    surviving = [
        amend
        for position, amend in enumerate(edott)
        if position not in doomed_indices
    ]
    write_as_json(surviving, sys.stdout, indent=2)
100+
62101
if __name__ == "__main__":
63102
try:
64103
_args = list(sys.argv[1:4])

0 commit comments

Comments
 (0)