-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathblake_xml.py
More file actions
89 lines (73 loc) · 3.27 KB
/
blake_xml.py
File metadata and controls
89 lines (73 loc) · 3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python
import os
import re
import glob
from lxml import etree
class BlakeXML:
    """One Blake Archive copy, backed by a single Blake xml file.

    The parsed document is kept on ``self.xml``; ``objects()`` exposes the
    description of every object (plate/page/etc) found in the copy.
    """

    def __init__(self, path):
        """Remember *path* and parse the xml file it points to."""
        self.path = path
        self.xml = self.parse_xml()

    def parse_xml(self):
        """Open ``self.path`` as UTF-8 text and return the parsed etree."""
        with open(self.path, 'r', encoding='utf-8') as handle:
            tree = etree.parse(handle)
        return tree

    def objects(self):
        """Return a list with one XMLObject per object (plate/page) in the xml.

        Objects are the elements matched by ``//objdesc/desc``; each one is
        wrapped together with a reference back to this BlakeXML.
        """
        wrapped = []
        for desc in self.xml.xpath('//objdesc/desc'):
            wrapped.append(XMLObject(desc, self))
        return wrapped
class XMLObject:
    """XML representation of an object (plate/page/etc) in a copy.

    Attributes:
        xml: the element describing this object; it must carry an ``id``
            attribute.
        parent: the owning object passed in as *blake_xml*.
        desc_id: value of the element's ``id`` attribute.
    """

    # Any run of whitespace (spaces, tabs, newlines).
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self, xml, blake_xml):
        """Wrap element *xml*; raises KeyError if it has no ``id`` attribute."""
        self.xml = xml
        self.parent = blake_xml
        self.desc_id = self.xml.attrib['id']

    @classmethod
    def _collect_text(cls, node):
        """Return all text nested in *node*, minus notes, with spaces expanded.

        * ``<note>`` subtrees are skipped, but the text that follows the
          closing ``</note>`` tag (its tail) is kept.
        * ``<space/>`` elements, at any depth, contribute a single space, e.g.
          ``Jehovah<space extent="1"/>What`` becomes ``Jehovah What``. They may
          stand for several spaces, but contiguous whitespace is collapsed
          by the caller anyway.
        * Comments and processing instructions (whose ``tag`` is not a
          string) contribute only their tail.
        """
        pieces = [' ' if node.tag == 'space' else (node.text or '')]
        for child in node:
            tag = child.tag
            if tag != 'note' and isinstance(tag, str):
                pieces.append(cls._collect_text(child))
            if child.tail:
                pieces.append(child.tail)
        return ''.join(pieces)

    def text(self):
        """Return transcription text, with some transformations.

        Every ``<l>`` element below a ``phystext`` child yields one output
        line. Text is mainly in the ``<l>`` nodes themselves, but nested
        child nodes (e.g. child ``<phystext>`` nodes) can hold desired text
        too, so all nested text is included except for ``<note>`` content.

        The text is gathered by walking the tree instead of deleting the
        ``<note>`` nodes: lxml's ``remove()`` also discards the text that
        follows the removed element, and deleting would alter the parsed
        document for every later reader.

        Some xml files split a line's text across several lines of the file;
        those line breaks are not part of the transcription. Each run of
        whitespace is therefore replaced by a single space (matching what
        has previously been done for superfastmatch) and the line is
        trimmed. Lines left with no visible text are omitted.

        Returns:
            The lines joined with ``'\\n'``, without a trailing newline;
            ``''`` when there is no text.
        """
        lines = []
        for line in self.xml.xpath('phystext//l'):
            raw = self._collect_text(line)
            cleaned = self._WHITESPACE.sub(' ', raw).strip()
            if cleaned:
                lines.append(cleaned)
        return '\n'.join(lines)

    def write_text(self, path=None):
        """Write transformed transcription text to file.

        The file is named after the lower-cased ``desc_id`` with a ``.txt``
        extension and is encoded as UTF-8.

        Args:
            path: directory to write into. ``None`` (the default) means the
                current working directory.
        """
        directory = os.curdir if path is None else path
        filename = self.desc_id.lower() + '.txt'
        fullpath = os.path.join(directory, filename)
        with open(fullpath, 'w', encoding='utf-8') as ofile:
            ofile.write(self.text())
if __name__ == '__main__':
    # Script entry point: read every Blake xml file in xml_path and write
    # one transcription text file per object (plate/page) into txt_path.
    xml_path = 'works/xml/'
    txt_path = 'works/text/'
    pattern = os.path.join(xml_path, '*.xml')
    for xml_file in glob.glob(pattern):
        print('Extracting transcriptions from: ' + xml_file)
        for page in BlakeXML(xml_file).objects():
            page.write_text(txt_path)