1+ import pdf2image
2+ import tempfile
3+ import urllib .request
4+ import pandas as pd
5+ import zipfile
6+
# Install a process-wide URL opener that sends a browser-like User-Agent
# header, so every later urllib.request call in this module uses it.
opener = urllib.request.build_opener()
opener.addheaders = [
    (
        "User-Agent",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36",
    ),
]
urllib.request.install_opener(opener)
10+
def download_auxiliary_paper_images(target_path: str = "downloaded-annotations") -> None:
    """Download selected arXiv papers and save specific pages as JPEG files.

    Each paper in the table below is fetched as a PDF into a temporary
    directory, rasterized with ``pdf2image`` at 72 dpi, and the requested
    pages are written to ``<target_path>/images/<filename>``.

    Args:
        target_path: Root output directory. Images are written to its
            ``images`` subdirectory, which is created if it does not exist.

    Raises:
        urllib.error.URLError: If a PDF cannot be downloaded.
        IndexError: If a requested page index is beyond the PDF's page count.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import os

    data_to_download = pd.DataFrame(
        [
            ["1810.04805v2", 10, "1810.04805v2-10_ea8f.jpg"],
            ["1810.04805v2", 11, "1810.04805v2-11_213f.jpg"],
            ["1810.04805v2", 9, "1810.04805v2-9_dc05.jpg"],
            ["1908.03557v1", 10, "1908.03557v1-10_fa12.jpg"],
            ["1908.03557v1", 11, "1908.03557v1-11_a737.jpg"],
        ],
        columns=["arxiv_id", "page", "filename"],
    )

    # Make sure the output directory exists; saving would otherwise fail
    # whenever it has not been created beforehand.
    image_dir = os.path.join(target_path, "images")
    os.makedirs(image_dir, exist_ok=True)

    # Group by paper so each PDF is downloaded and rasterized only once.
    for arxiv_id, gp in data_to_download.groupby("arxiv_id"):
        with tempfile.TemporaryDirectory() as tempdir:
            arxiv_link = f"http://arxiv.org/pdf/{arxiv_id}.pdf"
            pdf_path = os.path.join(tempdir, f"{arxiv_id}.pdf")
            urllib.request.urlretrieve(arxiv_link, pdf_path)
            pdf_images = pdf2image.convert_from_path(pdf_path, dpi=72)
            for _, row in gp.iterrows():
                # NOTE(review): "page" is used directly as an index into the
                # list of rendered pages (i.e. 0-based) -- confirm this matches
                # the page numbering used by the annotations.
                pdf_images[row["page"]].save(
                    os.path.join(image_dir, row["filename"])
                )
33+
34+
# URL of the zipped annotation archive fetched by download_zipped_annotations().
# NOTE: despite the "_PATH" suffix this is a remote URL, not a filesystem path.
ANNOTATION_FILE_PATH = "http://szj.io/assets/files/data/layoutparser-webinar-annotations-2022-Feb.zip"
36+
def download_zipped_annotations() -> None:
    """Download the annotation archive and extract it into the current directory.

    The archive at ``ANNOTATION_FILE_PATH`` is downloaded into a temporary
    directory, unpacked into ``./``, and the downloaded zip file is removed
    afterwards, including when extraction fails.

    Raises:
        urllib.error.URLError: If the archive cannot be downloaded.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    # Download to an explicit location inside a TemporaryDirectory so the
    # archive is always cleaned up. urllib.request.urlcleanup() is avoided on
    # purpose: it would also discard the globally installed opener.
    with tempfile.TemporaryDirectory() as tempdir:
        archive_path, _ = urllib.request.urlretrieve(
            ANNOTATION_FILE_PATH, f"{tempdir}/annotations.zip"
        )
        # The context manager closes the archive even if extraction raises.
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall("./")
42+
if __name__ == "__main__":
    # Script entry point: run the download steps in a fixed order,
    # annotations archive first, then the auxiliary paper page images.
    for download_step in (
        download_zipped_annotations,
        download_auxiliary_paper_images,
    ):
        download_step()
0 commit comments