Skip to content

Commit 5ff6f5c

Browse files
authored
fix: tail issue in prune_unwanted_nodes() (#808)
* Remove incorrect tail handling: remove the incorrect tail handling in prune_unwanted_nodes(), as the correct handling is already done in delete_element().
* Update unit_tests.py
1 parent badd594 commit 5ff6f5c

File tree

2 files changed

+7
-7
lines changed

2 files changed

+7
-7
lines changed

tests/unit_tests.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -930,6 +930,12 @@ def test_htmlprocessing():
930930
processed = trafilatura.htmlprocessing.handle_textnode(node, options)
931931
assert processed.tail == "tail" and processed.text == "text"
932932

933+
# fix for bug 807
934+
node = html.fragment_fromstring("<div><p><span>span</span> span tail</p> p tail </div>")
935+
assert node.text_content() == "span span tail p tail "
936+
prune = etree.XPath(".//span")
937+
processed = trafilatura.htmlprocessing.prune_unwanted_nodes(node, [prune])
938+
assert node.text_content() == " span tail p tail "
933939

934940

935941
def test_extraction_options():

trafilatura/htmlprocessing.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -103,13 +103,7 @@ def prune_unwanted_nodes(
103103
for expression in nodelist:
104104
for subtree in expression(tree):
105105
# preserve tail text from deletion
106-
if subtree.tail is not None:
107-
prev = subtree.getprevious()
108-
if prev is None:
109-
prev = subtree.getparent()
110-
if prev is not None:
111-
# There is a previous node, append text to its tail
112-
prev.tail = (prev.tail or "") + " " + subtree.tail
106+
# tail is by default preserved by delete_element()
113107
# remove the node
114108
delete_element(subtree)
115109

0 commit comments

Comments
 (0)