-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
38 lines (32 loc) · 996 Bytes
/
main.py
File metadata and controls
38 lines (32 loc) · 996 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from PyPDF2 import PdfFileReader
import requests
import io
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import os
def OCR(pdf):
    """Extract the text of a PDF by rasterising its pages and running OCR.

    Every page is rendered with ``convert_from_path(pdf, 500)``, saved as a
    temporary JPEG named ``<base>page_<n>.jpg`` (where ``<base>`` is the
    input path without its trailing ``.pdf``), passed to
    ``pytesseract.image_to_string`` and then deleted again. The combined
    text is written, UTF-8 encoded, to ``<base>.txt`` and also returned.

    Args:
        pdf: Path of the PDF file to process.

    Returns:
        The recognised text of all pages, concatenated in page order.
    """
    # Strip only a trailing ".pdf". Splitting on the first ".pdf" occurrence
    # would truncate paths that contain it earlier (e.g. "a.pdfs/b.pdf").
    suffix = '.pdf'
    pdf_name = pdf[:-len(suffix)] if pdf.endswith(suffix) else pdf

    page_texts = []
    for number, page in enumerate(convert_from_path(pdf, 500), start=1):
        image_path = pdf_name + "page_" + str(number) + ".jpg"
        page.save(image_path, 'JPEG')
        try:
            # The context manager releases the image's file handle, which
            # must happen before the file can be removed on some platforms.
            with Image.open(image_path) as image:
                page_text = pytesseract.image_to_string(image)
        finally:
            # Never leave the temporary page image behind, even if OCR fails.
            os.remove(image_path)

        # Clean each page exactly once. Running these replacements on the
        # accumulated text would add another space before every newline of
        # the earlier pages for each additional page processed.
        page_text = page_text.replace('-\n', '')  # re-join hyphenated words
        page_texts.append(page_text.replace('\n', ' \n'))

    text = ''.join(page_texts)

    # Written only after OCR succeeded, and closed deterministically.
    with open(pdf_name + ".txt", "wb") as out_file:
        out_file.write(text.encode('utf-8', 'replace'))
    return text
def main():
    """Run OCR on ``1.pdf`` in the working directory and print the result."""
    print(OCR('1.pdf'))
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()