diff --git a/microdata.py b/microdata.py index 19581a1..6132bba 100755 --- a/microdata.py +++ b/microdata.py @@ -153,6 +153,11 @@ def __repr__(self): 'time': 'datetime', } +block_elements = [ + "div", + "p", + "h1", "h2", "h3", "h4", "h5", "h6" +] def _find_items(e): items = [] @@ -232,8 +237,15 @@ def _text(e): chunks.append(e.data) elif hasattr(e, 'tagName') and e.tagName == 'script': return '' + elif hasattr(e, 'tagName') and e.tagName == 'br': + chunks.append("\n") + for child in e.childNodes: chunks.append(_text(child)) + + if hasattr(e, 'tagName') and e.tagName in block_elements: + chunks.append("\n") + return ''.join(chunks) diff --git a/test-data/example-dirty.html b/test-data/example-dirty.html new file mode 100644 index 0000000..7bf66d7 --- /dev/null +++ b/test-data/example-dirty.html @@ -0,0 +1,40 @@ + + + + + person example + + + +
+ Jane Doe + + + Professor +
A Professor that likes
Linebreaks
+
+ +

20341 Whitworth Institute

405 N. Whitworth

+ + +
+ Seattle, + WA + 98052 +
+ (425) 123-4567 + jane-doe@xyz.edu + + Jane's home page: + + + Graduate students: + + Alice Jones + + Bob Smith +
+ + diff --git a/test.py b/test.py index bf1a6c6..703419b 100644 --- a/test.py +++ b/test.py @@ -108,6 +108,23 @@ def test_skip_level(self): self.assertEqual(len(items), 1) self.assertEqual(items[0].name, "Jane Doe") + def test_newlines(self): + items = get_items(open("test-data/example-dirty.html")) + # this html should have just one main item + self.assertEqual(len(items), 1) + + item = items[0] + + # description contains a br tag so it should have a newline + self.assertEqual(item.description.strip(), "A Professor that likes\nLinebreaks") + + self.assertEqual(item.address.itemtype, [URI("http://schema.org/PostalAddress")]) + # street address should contain newlines because p is a block element + self.assertEqual(item.address.streetAddress.strip(), "20341 Whitworth Institute\n405 N. Whitworth") + self.assertEqual(item.address.addressLocality, "Seattle") + + item = items[0] + def test_parse_multiple_props(self): items = get_items(open("test-data/multiple-props.html"))