Skip to content

Commit f230971

Browse files
authored
Add notebook for customizing LayoutParser Models with Label Studio Annotation (#124)
* Add notebook for lp+label studio annotation * better naming * Re-org and add file downloading * Better Docs * better gitignore
1 parent 0809fa8 commit f230971

File tree

6 files changed

+1398
-0
lines changed

6 files changed

+1398
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# Example files
2+
examples/Customizing Layout Models with Label Studio Annotation/downloaded-annotations
3+
14
*.bak
25
.gitattributes
36
.last_checked

examples/Customizing Layout Models with Label Studio Annotation/Customizing Layout Models with Label Studio Annotation.ipynb

Lines changed: 1340 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
<div align="center">
2+
<h2> Customizing LayoutParser Models with Label Studio Annotation </h2>
3+
With Scientific Document Parsing as an example
4+
5+
---
6+
7+
[Webinar Video](https://www.youtube.com/watch?v=puOKTFXRyr4) | [Slides](https://szj.io/assets/files/talks/2022-Feb-LayoutParser-and-Label-Studio-Webinar.pdf) | [Notebooks](Customizing%20Layout%20Models%20with%20Label%20Studio%20Annotation.ipynb)
8+
</div>
9+
10+
![Overview of the Pipeline](pipeline-overview.jpg)
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import pdf2image
2+
import tempfile
3+
import urllib.request
4+
import pandas as pd
5+
import zipfile
6+
7+
# Install a process-wide default opener so that every later
# urllib.request call (urlopen / urlretrieve) sends a browser-like
# User-Agent header.
# NOTE(review): presumably the download hosts reject urllib's default
# agent string -- confirm.
opener = urllib.request.build_opener()
opener.addheaders = [
    (
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36",
    ),
]
urllib.request.install_opener(opener)
10+
11+
def download_auxiliary_paper_images(target_path: str = "downloaded-annotations"):
    """Download selected arXiv papers and save specific pages as images.

    Each listed paper is fetched from arxiv.org into a temporary directory,
    rendered with ``pdf2image`` at 72 dpi, and the listed pages are written
    to ``{target_path}/images/`` under the listed filenames.

    Args:
        target_path: Directory that contains (or will contain) the
            ``images`` sub-folder the page images are written to.
    """
    # Function-scope import keeps this block self-contained.
    import os

    data_to_download = pd.DataFrame(
        [
            ["1810.04805v2", 10, "1810.04805v2-10_ea8f.jpg"],
            ["1810.04805v2", 11, "1810.04805v2-11_213f.jpg"],
            ["1810.04805v2", 9, "1810.04805v2-9_dc05.jpg"],
            ["1908.03557v1", 10, "1908.03557v1-10_fa12.jpg"],
            ["1908.03557v1", 11, "1908.03557v1-11_a737.jpg"],
        ],
        columns=["arxiv_id", "page", "filename"],
    )

    # Image.save() does not create missing parent directories, so make sure
    # the output folder exists before writing the first page.
    images_dir = f"{target_path}/images"
    os.makedirs(images_dir, exist_ok=True)

    # Group by paper so each PDF is downloaded and rendered only once.
    for arxiv_id, gp in data_to_download.groupby("arxiv_id"):
        with tempfile.TemporaryDirectory() as tempdir:
            pdf_path = f"{tempdir}/{arxiv_id}.pdf"
            arxiv_link = f"http://arxiv.org/pdf/{arxiv_id}.pdf"
            urllib.request.urlretrieve(arxiv_link, pdf_path)
            pdf_images = pdf2image.convert_from_path(pdf_path, dpi=72)
            for _, row in gp.iterrows():
                # "page" is used as-is as an index into the list of
                # rendered pages.
                pdf_images[row["page"]].save(f"{images_dir}/{row['filename']}")
33+
34+
35+
# URL of the zip archive that download_zipped_annotations() fetches and unpacks.
ANNOTATION_FILE_PATH = "http://szj.io/assets/files/data/layoutparser-webinar-annotations-2022-Feb.zip"
36+
37+
def download_zipped_annotations():
    """Download the annotation archive and unpack it into the current directory.

    The zip file at ``ANNOTATION_FILE_PATH`` is fetched into a temporary
    directory and extracted to ``./``; the downloaded archive itself is
    removed when the temporary directory is cleaned up.
    """
    # Download into a TemporaryDirectory so the archive does not linger as
    # an orphaned temp file. (urllib.request.urlcleanup() is avoided on
    # purpose: it also resets the globally installed opener.)
    with tempfile.TemporaryDirectory() as tempdir:
        zip_path, _ = urllib.request.urlretrieve(
            ANNOTATION_FILE_PATH, f"{tempdir}/annotations.zip"
        )
        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall("./")
42+
43+
if __name__ == "__main__":
    # Script entry point: unpack the annotation archive first, then fetch
    # the extra paper page images.
    # NOTE(review): the archive presumably provides the
    # "downloaded-annotations" folder the images are saved into -- confirm
    # before reordering these calls.
    download_zipped_annotations()
    download_auxiliary_paper_images()
200 KB
Loading
797 KB
Loading

0 commit comments

Comments
 (0)