diff --git a/microdata.py b/microdata.py
index 19581a1..6132bba 100755
--- a/microdata.py
+++ b/microdata.py
@@ -153,6 +153,11 @@ def __repr__(self):
'time': 'datetime',
}
+block_elements = [  # tag names after which _text() appends a "\n" to the extracted text
+ "div",
+ "p",
+ "h1", "h2", "h3", "h4", "h5", "h6"
+]
def _find_items(e):
items = []
@@ -232,8 +237,15 @@ def _text(e):
chunks.append(e.data)
elif hasattr(e, 'tagName') and e.tagName == 'script':
return ''
+ elif hasattr(e, 'tagName') and e.tagName == 'br':
+ chunks.append("\n")
+
for child in e.childNodes:
chunks.append(_text(child))
+
+ if hasattr(e, 'tagName') and e.tagName in block_elements:
+ chunks.append("\n")
+
return ''.join(chunks)
diff --git a/test-data/example-dirty.html b/test-data/example-dirty.html
new file mode 100644
index 0000000..7bf66d7
--- /dev/null
+++ b/test-data/example-dirty.html
@@ -0,0 +1,40 @@
+
+
+
+
+ person example
+
+
+
+
+
Jane Doe
+

+
+
Professor
+
A Professor that likes
Linebreaks
+
+
+ 20341 Whitworth Institute
405 N. Whitworth
+
+
+
+
Seattle,
+
WA
+
98052
+
+
(425) 123-4567
+
jane-doe@xyz.edu
+
+ Jane's home page:
+
janedoe.com
+
+ Graduate students:
+
+ Alice Jones
+
+ Bob Smith
+
+
+
diff --git a/test.py b/test.py
index bf1a6c6..703419b 100644
--- a/test.py
+++ b/test.py
@@ -108,6 +108,23 @@ def test_skip_level(self):
self.assertEqual(len(items), 1)
self.assertEqual(items[0].name, "Jane Doe")
+    def test_newlines(self):
+        """<br> tags and block-level elements must yield newlines in property text."""
+        with open("test-data/example-dirty.html") as html:
+            items = get_items(html)
+        # this html should have just one main item
+        self.assertEqual(len(items), 1)
+
+        item = items[0]
+
+        # description contains a br tag so it should have a newline
+        self.assertEqual(item.description.strip(), "A Professor that likes\nLinebreaks")
+
+        self.assertEqual(item.address.itemtype, [URI("http://schema.org/PostalAddress")])
+        # street address should contain newlines because p is a block element
+        self.assertEqual(item.address.streetAddress.strip(), "20341 Whitworth Institute\n405 N. Whitworth")
+        self.assertEqual(item.address.addressLocality, "Seattle")
+
def test_parse_multiple_props(self):
items = get_items(open("test-data/multiple-props.html"))