Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,5 @@ dmypy.json

# Pyre type checker
.pyre/

.idea
29 changes: 25 additions & 4 deletions paperlessngx_postprocessor/paperless_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,28 @@ def __init__(self, api_url, auth_token, paperless_src_dir, logger=None):

self._auth_token = auth_token
self._cache = {}
self._cachable_types = ["correspondents", "document_types", "storage_paths", "tags"]
self._cachable_types = ["correspondents", "document_types", "storage_paths", "tags", "custom_fields"]
self._paperless_api_version = 3

self._common_headers = {"Authorization": f"Token {self._auth_token}",
"Content-Type": "application/json",
"Accept": f"application/json; version={self._paperless_api_version}"}

def _get_custom_fields(self):
return self._get_list("custom_fields")

def get_custom_field_by_name(self, search_name):
    """Look up a custom-field definition by name, case-insensitively.

    Underscores in `search_name` are treated as spaces, so e.g.
    "entry_date" matches a field named "Entry Date".

    Parameters:
        search_name: the field name to search for.

    Returns:
        The first matching custom-field dict, or {} if none matches
        (a debug message is logged in that case).
    """
    # Refresh the field list; the return value is ignored because the
    # lookup below reads self._cache, which is presumably populated as a
    # side effect of the fetch — TODO confirm _get_list caches.
    self._get_custom_fields()

    # Normalize once, instead of recomputing lower()/replace() per field.
    normalized_name = search_name.lower().replace("_", " ")
    match = next(
        (field for field in self._cache["custom_fields"]
         if field["name"].lower() == normalized_name),
        None,
    )

    if match is not None:
        return match
    self._logger.debug(f"Custom field with name {search_name} cannot be found.")
    return {}

def delete_document_by_id(self, document_id):
item_type = "documents"
item_id = document_id
Expand Down Expand Up @@ -94,7 +110,7 @@ def get_item_id_by_name(self, item_type, item_name):
def patch_document(self, document_id, data):
    """PATCH (partially update) a document via the paperless-ngx REST API.

    Parameters:
        document_id: id of the document to update.
        data: dict of fields to change. Sent with `json=` so requests
              JSON-encodes the body — this matches the
              "Content-Type: application/json" declared in
              self._common_headers (form-encoded `data=` would not).

    Returns:
        The requests.Response object; request errors are logged but not
        raised, so callers can inspect `response.ok` themselves.
    """
    response = requests.patch(f"{self._api_url}/documents/{document_id}/",
                              headers=self._common_headers,
                              json=data)
    if not response.ok:
        self._log_request_error(response)
    return response
Expand Down Expand Up @@ -179,6 +195,9 @@ def get_storage_path_by_id(self, storage_path_id):
def get_tag_by_id(self, tag_id):
    """Return the tag with the given id, delegating to the generic item lookup."""
    item_type = "tags"
    return self._get_item_by_id(item_type, tag_id)

def get_custom_field_by_id(self, custom_field_id):
    """Return the custom field with the given id, via the generic item lookup."""
    item_type = "custom_fields"
    return self._get_item_by_id(item_type, custom_field_id)

def get_metadata_in_filename_format(self, metadata):
new_metadata = {}
new_metadata["document_id"] = metadata["id"]
Expand All @@ -202,7 +221,8 @@ def get_metadata_in_filename_format(self, metadata):
new_metadata["added_day"] = f"{added_date.day:02d}"
new_metadata["added_date"] = added_date.strftime("%F")
new_metadata["added_date_object"] = added_date

new_metadata["custom_fields"] = metadata["custom_fields"]

return new_metadata

def get_metadata_from_filename_format(self, metadata_in_filename_format):
Expand All @@ -217,7 +237,8 @@ def get_metadata_from_filename_format(self, metadata_in_filename_format):
#result["created"] = metadata_in_filename_format["created"]
result["created_date"] = dateutil.parser.isoparse(metadata_in_filename_format["created"]).strftime("%F")
result["added"] = metadata_in_filename_format["added"]

result["custom_fields"] = metadata_in_filename_format["custom_fields"]

return result

def get_metadata_for_post_consume_script(self, document_id):
Expand Down
45 changes: 35 additions & 10 deletions paperlessngx_postprocessor/postprocessor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import calendar
import copy
import dateutil.parser
import jinja2
import logging
Expand All @@ -22,6 +23,8 @@ def __init__(self, api, spec, logger = None):
self._match = spec[self.name].get("match")
self._metadata_regex = spec[self.name].get("metadata_regex")
self._metadata_postprocessing = spec[self.name].get("metadata_postprocessing")
#self._custom_fields_regex = spec[self.name].get("custom_fields_regex")
#self._custom_fields_postprocessing = spec[self.name].get("custom_fields_postprocessing")
self._validation_rule = spec[self.name].get("validation_rule")
#self._title_format = spec[self.name].get("title_format")

Expand Down Expand Up @@ -138,7 +141,7 @@ def _jinja_filter_regex_sub(self, string, pattern, repl):
return regex.sub(pattern, repl, string)

def _normalize_created_dates(self, new_metadata, old_metadata):
result = new_metadata.copy()
result = copy.deepcopy(new_metadata)
try:
result["created_year"] = str(int(new_metadata["created_year"]))
except:
Expand Down Expand Up @@ -186,17 +189,27 @@ def get_new_metadata(self, metadata, content):
read_only_metadata = {key: metadata[key] for key in read_only_metadata_keys if key in metadata}
writable_metadata_keys = list(set(metadata.keys()) - set(read_only_metadata_keys))
writable_metadata = {key: metadata[key] for key in writable_metadata_keys if key in metadata}

# Extract the regex_data
if self._metadata_regex is not None:
match_object = regex.search(self._metadata_regex, content)
if match_object is not None:
regex_data = match_object.groupdict()
#writable_metadata.update(match_object.groupdict())
matches = regex.finditer(self._metadata_regex, content)
# Iterate over all matches and merge them
regex_data = {}
for match_object in matches:
if match_object is not None:
current_groups = match_object.groupdict()
for group_name, value in current_groups.items():
if group_name in regex_data and value not in regex_data[group_name]:
regex_data[group_name] = f"{regex_data[group_name]},{value}"
else:
regex_data[group_name] = value

# Process all merged matches at once
if regex_data: # Only process if we found any matches
writable_metadata.update([(k, regex_data[k]) for k in regex_data if regex_data[k] is not None])
writable_metadata = self._normalize_created_dates(writable_metadata, metadata)
self._logger.debug(f"Regex results are {writable_metadata}")
else:
else: # No matches found
self._logger.warning(f"Regex '{self._metadata_regex}' for '{self.name}' didn't match for document_id={metadata['document_id']}")

# Cycle through the postprocessing rules
Expand All @@ -205,8 +218,20 @@ def get_new_metadata(self, metadata, content):
try:
old_value = writable_metadata.get(variable_name)
merged_metadata = {**writable_metadata, **read_only_metadata}
template = self._env.from_string(self._metadata_postprocessing[variable_name])
writable_metadata[variable_name] = template.render(**merged_metadata)

if variable_name != "custom_fields":
template = self._env.from_string(self._metadata_postprocessing[variable_name])
writable_metadata[variable_name] = template.render(**merged_metadata)
elif variable_name == "custom_fields":
for custom_field_name in (
self._metadata_postprocessing["custom_fields"]
).keys():
custom_field_definition = self._api.get_custom_field_by_name(custom_field_name)
for index, custom_field_metadata_iterate in enumerate(writable_metadata["custom_fields"]):
if custom_field_definition and custom_field_metadata_iterate["field"] == custom_field_definition["id"]:
template = self._env.from_string(self._metadata_postprocessing[variable_name][custom_field_name])
writable_metadata[variable_name][index]["value"] = template.render(**merged_metadata)

writable_metadata = self._normalize_created_dates(writable_metadata, metadata)
self._logger.debug(f"Updating '{variable_name}' using template {self._metadata_postprocessing[variable_name]} and metadata {merged_metadata}\n: '{old_value}'->'{writable_metadata[variable_name]}'")
except Exception as e:
Expand Down Expand Up @@ -257,7 +282,7 @@ def __init__(self, api, rules_dir, postprocessing_tag = None, invalid_tag = None


def _get_new_metadata_in_filename_format(self, metadata_in_filename_format, content):
new_metadata = metadata_in_filename_format.copy()
new_metadata = copy.deepcopy(metadata_in_filename_format)

for processor in self._processors:
if processor.matches(metadata_in_filename_format):
Expand Down
8 changes: 8 additions & 0 deletions rulesets.d/example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,11 @@ Parse creation date from filename:
created_month: '{{ title_old | regex_sub("^(?P<created_year>\d{4})-(?P<created_month>\d{2})-(?P<created_day>\d{2}) (?P<title>.*)$", "\g<created_month>") }}'
created_day: '{{ title_old | regex_sub("^(?P<created_year>\d{4})-(?P<created_month>\d{2})-(?P<created_day>\d{2}) (?P<title>.*)$", "\g<created_day>") }}'
validation_rule: '{{ num_documents(correspondent=correspondent, document_type=document_type, created_date_object=created_date_object) == 1 }}'
---
# Example ruleset demonstrating the custom_fields postprocessing support.
Ruleset for Custom Field:
  match: True
  # The separator dots are escaped (\.) so they match only a literal ".".
  # An unescaped "." matches ANY character, so e.g. "Eingegangen 12x05x2023"
  # would previously have parsed as a date.
  metadata_regex: 'Eingegangen (?P<entry_day>\d{1,2})\.(?P<entry_month>\d{1,2})\.(?P<entry_year>\d{4})'
  metadata_postprocessing:
    title: "Test Custom Fields Functionality"
    custom_fields:
      # NOTE(review): day/month are not zero-padded here ("2023-1-5");
      # confirm the target custom field accepts that format.
      Eingegangen: '{{entry_year}}-{{entry_month}}-{{entry_day}}'