-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtestreadpdf.py
More file actions
139 lines (102 loc) · 4.55 KB
/
Copy pathtestreadpdf.py
File metadata and controls
139 lines (102 loc) · 4.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
""" Try module for reading text from pdf documents
https://github.com/pdfminer/pdfminer.six
Usage:
Example:
"""
# # Have to comment out line 131-132 in pdfpage.py in pdfminer to extract text:
# # if check_extractable and not doc.is_extractable:
# # raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
# from pdfminer.high_level import extract_text
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from io import StringIO
from dateparser import parse
from datetime import datetime
from pikepdf import Pdf
import os
""" Gets any text in a protected pdf document provided in path argument
"""
def extract_protected_pdf_text(path: str, password: str) -> str:
output = StringIO()
resource_manager = PDFResourceManager()
laparams = LAParams()
device = TextConverter(resource_manager, output, codec='utf-8', laparams=laparams)
pdf_infile = open(path, 'rb')
interpreter = PDFPageInterpreter(resource_manager, device)
page_numbers = set()
if password is None:
for page in PDFPage.get_pages(pdf_infile, page_numbers, maxpages=0, caching=True, check_extractable=False):
interpreter.process_page(page)
else:
for page in PDFPage.get_pages(pdf_infile, page_numbers, maxpages=0, password=password, caching=True, check_extractable=False):
interpreter.process_page(page)
text = output.getvalue()
pdf_infile.close()
device.close()
output.close()
return text
""" Returns a filename based on extracted text from pdf file provided in text argument
"""
def create_file_name(text_or_filename: str) -> str:
index_string = 'Udskrevet '
if index_string in text_or_filename:
pdf_text = text_or_filename
index_string_lenght = len(index_string)
temp_date_string = pdf_text[pdf_text.index(index_string)+index_string_lenght:-1]
date_string = temp_date_string[0:temp_date_string.index('\n')]
printed_date = parse(date_string)
# PDF naming standard:
# <YEAR>-<MONTH>-<NUMBER>.pdf
# example: 2015-november-1.pdf
file_name_string = '{0:%Y}-{0:%B}-{1}.pdf'.format(printed_date, 1)
else:
file_name = text_or_filename
temp_list = file_name.split('\\')
temp_string = temp_list[-1]
temp_date = temp_string[0:temp_string.index('_', 0, -1)]
file_name_string = temp_date + '-1.pdf'
return file_name_string
""" Removes encryption from PDF and safe it to disk
"""
def remove_password_and_save(input_path: str, output_path: str):
with Pdf.open(input_path) as pdf:
pdf.save(filename = output_path, encryption = None)
"""
"""
def get_pdf_files_list(path: str):
files_return = []
for root, directories, files in os.walk(path):
for file in files:
if '.pdf' in file:
files_return.append(os.path.join(root, file))
return files_return
def main():
source_path = 'C:\\Users\\dhjensen\\OneDrive\\salarytest'
files = get_pdf_files_list(source_path)
for file in files:
if 'Daniel_Jensen' in file:
print ('Daniel_Jensen: ' + file)
file_name = create_file_name(text_or_filename = file)
output_file_name = 'pdfout\\' + file_name
print (output_file_name)
elif '22801 - Daniel - Lønspecifikation maj 2018.pdf' in file:
print('22801 - Daniel - Lønspecifikation maj 2018.pdf')
extracted_text = extract_protected_pdf_text(path=file, password=None)
file_name = create_file_name(text_or_filename = extracted_text)
output_file_name = 'pdfout\\' + file_name
print (output_file_name)
elif '22801' in file:
print('22801 baby: ' + file)
# TODO: Remove password found here from git history:
# https://help.github.com/en/github/authenticating-to-github/removing-sensitive-data-from-a-repository
# Move the content of this file to another file without the password
extracted_text = extract_protected_pdf_text(path=file, password='2505832461')
file_name = create_file_name(text_or_filename = extracted_text)
output_file_name = 'pdfout\\' + file_name
print (output_file_name)
else:
print('File we do not handle: ' + file)
if __name__ == '__main__':
main()