Skip to content

Commit 49ed780

Browse files
♻️ update fix_pdf syntax accessors, fix typo (#197)
1 parent 458a48a commit 49ed780

File tree

11 files changed

+49
-32
lines changed

11 files changed

+49
-32
lines changed

docs/code_samples/default_v2.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ model_id = 'MY_MODEL_ID'
88
mindee_client = Mindee::ClientV2.new(api_key: api_key)
99

1010
# Set inference parameters
11-
params = Mindee::Input::InferenceParameters.new(
11+
inference_params = Mindee::Input::InferenceParameters.new(
1212
# ID of the model, required.
1313
model_id,
1414
# If set to `true`, will enable Retrieval-Augmented Generation.
@@ -21,7 +21,7 @@ input_source = Mindee::Input::Source::PathInputSource.new(input_path)
2121
# Send for processing
2222
response = mindee_client.enqueue_and_get_inference(
2323
input_source,
24-
params # Note: this parameter can also be provided as a Hash.
24+
inference_params # Note: this parameter can also be provided as a Hash.
2525
)
2626

2727
# Print a brief summary of the parsed data

lib/mindee/errors/mindee_input_error.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class MindeeMimeTypeError < MindeeSourceError
1616
# @param mime_type [String]
1717
def initialize(mime_type)
1818
@invalid_mimetype = mime_type
19-
super("'#{@invalid_mimetype}' mime type not allowed, must be one of" \
19+
super("'#{@invalid_mimetype}' mime type not allowed, must be one of " \
2020
"#{Mindee::Input::Source::ALLOWED_MIME_TYPES.join(', ')}")
2121
end
2222
end

lib/mindee/image/image_extractor.rb

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,10 @@ module ImageExtractor
1818
# @return [Origami::PDF] A PdfDocument handle.
1919
def self.attach_image_as_new_file(input_buffer, format: 'jpg')
2020
magick_image = MiniMagick::Image.read(input_buffer)
21-
# NOTE: some jpeg images get rendered as three different versions of themselves per output if the format isn't
22-
# converted.
21+
# NOTE: We force format consolidation to a single format to avoid frames being interpreted as the final output.
2322
magick_image.format(format)
2423
original_density = magick_image.resolution
25-
scale_factor = original_density[0].to_f / 4.166666 # No clue why the resolution needs to be reduced for
26-
# the pdf otherwise the resulting image shrinks.
24+
scale_factor = original_density[0].to_f / 4.166666 # Convert from default 300 DPI to 72.
2725
magick_image.format('pdf', 0, { density: scale_factor.to_s })
2826
Origami::PDF.read(StringIO.new(magick_image.to_blob))
2927
end
@@ -32,8 +30,7 @@ def self.attach_image_as_new_file(input_buffer, format: 'jpg')
3230
#
3331
# @param [Input::Source::LocalInputSource] input_source
3432
# @param [Integer] page_id ID of the Page to extract from.
35-
# @param [Array<Array<Geometry::Point>>, Array<Geometry::Quadrilateral>] polygons List of coordinates.
36-
# to extract.
33+
# @param [Array<Array<Geometry::Point>>, Array<Geometry::Quadrilateral>] polygons List of coordinates to extract.
3734
# @return [Array<Image::ExtractedImage>] Extracted Images.
3835
def self.extract_multiple_images_from_source(input_source, page_id, polygons)
3936
new_stream = load_input_source_pdf_page_as_stringio(input_source, page_id)

lib/mindee/input/sources/local_input_source.rb

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,7 @@ def initialize(io_stream, filename, repair_pdf: false)
4747
end
4848

4949
if filename.end_with?('.pdf') && repair_pdf
50-
rescue_broken_pdf(@io_stream)
51-
@file_mimetype = Marcel::MimeType.for @io_stream
50+
fix_pdf!
5251

5352
logger.debug("Loaded new input #{@filename} from #{self.class}")
5453
return if ALLOWED_MIME_TYPES.include? @file_mimetype
@@ -57,27 +56,40 @@ def initialize(io_stream, filename, repair_pdf: false)
5756
raise Errors::MindeeMimeTypeError, @file_mimetype.to_s
5857
end
5958

60-
# Attempts to fix pdf files if mimetype is rejected.
61-
# "Broken PDFs" are often a result of third-party injecting invalid headers.
62-
# This attempts to remove them and send the file
63-
# @param stream [StringIO, File]
64-
def rescue_broken_pdf(stream)
65-
stream.gets('%PDF-')
66-
raise Errors::MindeePDFError if stream.eof? || stream.pos > 500
67-
68-
stream.pos = stream.pos - 5
69-
data = stream.read
70-
@io_stream.close
71-
72-
@io_stream = StringIO.new
73-
@io_stream << data
59+
# @deprecated See {#fix_pdf!} or {#self.fix_pdf} instead.
60+
def rescue_broken_pdf(_)
61+
fix_pdf!
7462
end
7563

76-
# Shorthand for pdf mimetype validation.
64+
# Shorthand for PDF mimetype validation.
7765
def pdf?
7866
@file_mimetype.to_s == 'application/pdf'
7967
end
8068

69+
# Attempts to fix the PDF data in the file.
70+
# @param maximum_offset [Integer] Maximum offset to look for the PDF header.
71+
# @return [void]
72+
# @raise [Mindee::Errors::MindeePDFError]
73+
def fix_pdf!(maximum_offset: 500)
74+
@io_stream = LocalInputSource.fix_pdf(@io_stream, maximum_offset: maximum_offset)
75+
@io_stream.rewind
76+
@file_mimetype = Marcel::MimeType.for @io_stream
77+
end
78+
79+
# Attempts to fix the PDF data in the given stream.
80+
# @param stream [StringIO, File] The stream to fix.
81+
# @param maximum_offset [Integer] Maximum offset to look for the PDF header.
82+
# @return [StringIO] The fixed stream.
83+
# @raise [Mindee::Errors::MindeePDFError]
84+
def self.fix_pdf(stream, maximum_offset: 500)
85+
out_stream = StringIO.new
86+
stream.gets('%PDF-')
87+
raise Errors::MindeePDFError if stream.eof? || stream.pos > maximum_offset
88+
89+
stream.pos = stream.pos - 5
90+
out_stream << stream.read
91+
end
92+
8193
# Cuts a PDF file according to provided options.
8294
# @param options [PageOptions, nil] Page cutting/merge options:
8395
#

lib/mindee/parsing/v2/field/list_field.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def [](index)
7474
end
7575

7676
# Iterator for Enumerator inheritance.
77-
# NOTE: Untyped due to incomplete support in steep.
77+
# NOTE: Untyped due to incomplete support in the currently supported version of RBS.
7878
def each(&block)
7979
return to_enum(:each) unless block_given?
8080

lib/mindee/parsing/v2/field/simple_field.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@ def to_s
2626
if @value.is_a?(TrueClass) || @value.is_a?(FalseClass)
2727
@value ? 'True' : 'False'
2828
elsif @value.is_a?(Integer) || @value.is_a?(Float)
29-
# NOTE: explicitly typing because steep is very, very dumb
3029
num = @value # @type var num: Integer | Float
3130
format_numeric_value(num)
3231
else

sig/custom/mini_magick.rbs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Stub for the mini_magick library.
2-
# Note: though typing annotations for the MiniMagick library now exist, it seems that they aren't strict enough
2+
# NOTE: though typing annotations for the MiniMagick library now exist, it seems that they aren't strict enough
33
# to match the rules we have on the repo, hence the existence of this file and the overrides present below.
44
module MiniMagick
55
class Image

sig/mindee/input/sources/local_input_source.rbs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,14 @@ module Mindee
88
attr_reader filename: String
99
attr_reader io_stream: StringIO | File
1010
def initialize: (StringIO | File, String, ?repair_pdf: bool) -> void
11+
12+
def fix_pdf!: (?maximum_offset: Integer) -> void
13+
def self.fix_pdf: (StringIO | File, ?maximum_offset: Integer) -> StringIO
14+
1115
def logger: () -> Logger
1216

1317

14-
def rescue_broken_pdf: (StringIO | File) -> (StringIO | File)
18+
def rescue_broken_pdf: (untyped) -> void
1519
def pdf?: -> bool
1620
def apply_page_options: (PageOptions) -> StringIO?
1721
def process_pdf: (PageOptions) -> StringIO?

sig/mindee/parsing/v2/field/list_field.rbs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ module Mindee
1212
def empty?: -> bool
1313
def size: -> Integer
1414
def length: -> Integer
15-
# NOTE: Steep is incapable of handling typing of `each` when multiple types are used.
1615
def each: () { (untyped) -> untyped } -> untyped
1716
def []: (Integer) -> (BaseField)
1817
end

spec/parsing/v2/inference_spec.rb

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,12 @@ def load_v2_inference(resource_path)
9898
expect(fields['line_items'][0]).to be_a(object_field)
9999
expect(fields['line_items'][0]['quantity'].value).to eq(1.0)
100100

101+
expect(fields).to have_key('line_items')
102+
expect(fields['line_items']).not_to be_nil
103+
expect(fields['line_items']).to be_a(list_field)
104+
expect(fields['line_items'][0]).to be_a(object_field)
105+
expect(fields['line_items'][0]['quantity'].value).to eq(1.0)
106+
101107
tax_item_obj = first_tax_item
102108
expect(tax_item_obj.fields.size).to eq(3)
103109

0 commit comments

Comments
 (0)