Layout-Parser
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎examples/Customizing Layout Models with Label Studio Annotation/Customizing Layout Models with Label Studio Annotation.ipynb‎
Lines changed: 1340 additions & 0 deletions b/‎examples/Customizing Layout Models with Label Studio Annotation/Customizing Layout Models with Label Studio Annotation.ipynb‎
Lines changed: 1340 additions & 0 deletions
diff --git a/‎examples/Customizing Layout Models with Label Studio Annotation/README.md‎
Lines changed: 10 additions & 0 deletions b/‎examples/Customizing Layout Models with Label Studio Annotation/README.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎examples/Customizing Layout Models with Label Studio Annotation/download_annotation.py‎
Lines changed: 45 additions & 0 deletions b/‎examples/Customizing Layout Models with Label Studio Annotation/download_annotation.py‎
Lines changed: 45 additions & 0 deletions
diff --git a/‎examples/Customizing Layout Models with Label Studio Annotation/pipeline-overview.jpg‎
200 KB b/‎examples/Customizing Layout Models with Label Studio Annotation/pipeline-overview.jpg‎
200 KB
diff --git a/‎examples/Customizing Layout Models with Label Studio Annotation/task-overview.png‎
797 KB b/‎examples/Customizing Layout Models with Label Studio Annotation/task-overview.png‎
797 KB
@@ -1,3 +1,6 @@
+# Example files
+examples/Customizing Layout Models with Label Studio Annotation/downloaded-annotations
+
 *.bak
 .gitattributes
 .last_checked
 
@@ -0,0 +1,10 @@
+<div align="center">
+    <h2> Customizing LayoutParser Models with Label Studio Annotation </h2>
+    With Scientific Document Parsing as an example
+
+--- 
+
+[Webinar Video](https://www.youtube.com/watch?v=puOKTFXRyr4) | [Slides](https://szj.io/assets/files/talks/2022-Feb-LayoutParser-and-Label-Studio-Webinar.pdf) | [Notebooks](Customizing%20Layout%20Models%20with%20Label%20Studio%20Annotation.ipynb)
+</div>
+
+![Overview of the Pipeline](pipeline-overview.jpg)
@@ -0,0 +1,45 @@
+import pdf2image
+import tempfile
+import urllib.request
+import pandas as pd
+import zipfile
+
+# Build an opener that sends a browser-like User-Agent header and install it
+# globally, so every later urllib.request call in this script (including
+# urlretrieve) uses it.
+# NOTE(review): presumably needed because the download hosts reject urllib's
+# default User-Agent — confirm.
+opener = urllib.request.build_opener()
+opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
+urllib.request.install_opener(opener)
+
+def download_auxiliary_paper_images(target_path: str = "downloaded-annotations"):
+
+    data_to_download = pd.DataFrame(
+        [
+            ["1810.04805v2", 10, "1810.04805v2-10_ea8f.jpg"],
+            ["1810.04805v2", 11, "1810.04805v2-11_213f.jpg"],
+            ["1810.04805v2", 9, "1810.04805v2-9_dc05.jpg"],
+            ["1908.03557v1", 10, "1908.03557v1-10_fa12.jpg"],
+            ["1908.03557v1", 11, "1908.03557v1-11_a737.jpg"],
+        ],
+        columns=["arxiv_id", "page", "filename"],
+    )
+
+    for arxiv_id, gp in data_to_download.groupby("arxiv_id"):
+        with tempfile.TemporaryDirectory() as tempdir:
+            arxiv_link = f"http://arxiv.org/pdf/{arxiv_id}.pdf"
+            urllib.request.urlretrieve(arxiv_link, f"{tempdir}/{arxiv_id}.pdf")
+            pdf_images = pdf2image.convert_from_path(
+                f"{tempdir}/{arxiv_id}.pdf", dpi=72
+            )
+            for _, row in gp.iterrows():
+                pdf_images[row["page"]].save(f"{target_path}/images/{row['filename']}")
+
+
+# URL of the zipped annotation archive fetched by download_zipped_annotations().
+ANNOTATION_FILE_PATH = "http://szj.io/assets/files/data/layoutparser-webinar-annotations-2022-Feb.zip"
+
+def download_zipped_annotations(): 
+    filehandle, _ = urllib.request.urlretrieve(ANNOTATION_FILE_PATH)
+    zip_ref = zipfile.ZipFile(filehandle, 'r')
+    zip_ref.extractall("./") # extract file to dir
+    zip_ref.close() # close file
+
+if __name__ == "__main__":
+    # Extract the annotation archive first, then fetch the auxiliary page images.
+    # NOTE(review): presumably the archive provides the
+    # downloaded-annotations/images folder that the second step writes into —
+    # confirm before reordering these calls.
+    download_zipped_annotations()
+    download_auxiliary_paper_images()