-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest_blake_xml.py
More file actions
74 lines (51 loc) · 2.16 KB
/
test_blake_xml.py
File metadata and controls
74 lines (51 loc) · 2.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import blake_xml
def get_obj(xml_path, obj_number):
    """Load *xml_path* and pick out one of its objects.

    Returns a ``(parser, object)`` tuple: the ``blake_xml.BlakeXML`` instance
    built from *xml_path*, and the entry at index *obj_number* of the
    sequence returned by its ``objects()`` method.
    """
    parser = blake_xml.BlakeXML(xml_path)
    return parser, parser.objects()[obj_number]
def text_equality(corr_file, test_obj):
    """Return True when ``test_obj.text()`` equals the contents of *corr_file*.

    Args:
        corr_file: Path to a UTF-8 encoded text file holding the expected
            text.
        test_obj: Any object exposing a ``text()`` method whose result is
            compared against the file contents.

    Returns:
        bool: ``True`` if the file contents and ``test_obj.text()`` are
        equal, ``False`` otherwise.
    """
    # Read the file in one call instead of concatenating it line by line;
    # the resulting string is the same, without the repeated += copies.
    with open(corr_file, encoding='utf-8') as infile:
        expected = infile.read()
    return expected == test_obj.text()
# Shared module-level fixture: built once when this module is imported, from
# 'data/vda.h.xml' with the object at index 4. Tests below that reference
# `bxml` or `obj` directly use these values.
bxml, obj = get_obj('data/vda.h.xml', 4)
def test_path():
    """``bxml.path`` should be the path of the vda.h.xml data file."""
    expected_path = 'data/vda.h.xml'
    assert bxml.path == expected_path
def test_xml_parse():
    """``bxml.xml`` should be a ``blake_xml.etree._ElementTree`` instance."""
    # isinstance() is the idiomatic type check; unlike an exact type()
    # comparison it also accepts subclasses of _ElementTree.
    assert isinstance(bxml.xml, blake_xml.etree._ElementTree)
def test_objects():
    """``bxml.objects()`` should contain exactly 11 entries."""
    object_count = len(bxml.objects())
    assert object_count == 11
def test_text_equality_001():
    """Text of the shared ``obj`` should match its reference text file."""
    reference_file = 'data/vda.h.illbk.05.txt'
    matches = text_equality(reference_file, obj)
    assert matches
# .text should provide text for any <l> node under <phystext>.
# test that phystext/*/l text is included
def test_text_for_all_l_children_001():
    """Object 4 of vda.h.xml should render the text in its reference file."""
    loaded = get_obj('data/vda.h.xml', 4)
    target = loaded[1]  # the parser in loaded[0] is not needed here
    assert text_equality('data/vda.h.illbk.05.txt', target)
# test that phystext/*/*/*/l text is included
def test_text_for_all_l_children_002():
    """Object 4 of ahania.a.xml should render the text in its reference file."""
    loaded = get_obj('data/ahania.a.xml', 4)
    target = loaded[1]  # the parser in loaded[0] is not needed here
    assert text_equality('data/ahania.a.illbk.05.txt', target)
# Note text should not be rendered in the transcription.
# e.g. don't print:
# <note>In Copy O, the etched number "7" in the upper right-hand corner
# is obscured by washes.</note>
def test_note_stripping():
    """Rendered text of vda.o.xml object 9 must not contain the note text."""
    _parser, target = get_obj('data/vda.o.xml', 9)
    rendered = target.text()
    assert 'In Copy O' not in rendered
# "space" nodes need to be replaced with a space, so that the following, for
# example, will have proper spacing:
# Jehovah<space extent="1"/>What Vengeance dost thou require
def test_spacing():
    """Rendered text of abel.a.xml object 0 must not run two words together."""
    _parser, target = get_obj('data/abel.a.xml', 0)
    rendered = target.text()
    assert 'JehovahWhat' not in rendered
# Known failure: this test currently fails.
# Some xml files have '\u0097' characters that should be cleaned up in
# the xml. They were probably meant to be em dashes in windows-1252. If the
# characters are included in the SuperfastMatch input, it may or may not
# affect fragments/matching, but in any case, if the chars are to be removed,
# it ought to be done in the xml files rather than the transformation code.
def test_control_chars():
    """Rendered text of abel.a.xml object 0 must not contain U+0097."""
    _parser, target = get_obj('data/abel.a.xml', 0)
    rendered = target.text()
    assert '\u0097' not in rendered