From 481421a42df387c4416a885fe11c389abefb9fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Mon, 22 Jun 2026 16:59:57 +0200 Subject: [PATCH 1/2] :bug: :boom: harmonize Crop and Split --- mindee/image/extracted_image.py | 12 ++++------- mindee/image/extracted_images.py | 5 +++++ mindee/pdf/extracted_pdfs.py | 5 +++++ mindee/v2/file_operations/crop.py | 6 +++--- mindee/v2/file_operations/crop_files.py | 20 ------------------- mindee/v2/file_operations/split.py | 6 +++--- mindee/v2/file_operations/split_files.py | 20 ------------------- mindee/v2/product/crop/crop_result.py | 6 ++++-- mindee/v2/product/split/split_result.py | 6 ++++-- .../v2/file_operations/test_crop_operation.py | 11 ++++------ .../file_operations/test_split_operation.py | 10 ++++------ 11 files changed, 36 insertions(+), 71 deletions(-) create mode 100644 mindee/image/extracted_images.py create mode 100644 mindee/pdf/extracted_pdfs.py delete mode 100644 mindee/v2/file_operations/crop_files.py delete mode 100644 mindee/v2/file_operations/split_files.py diff --git a/mindee/image/extracted_image.py b/mindee/image/extracted_image.py index 6d0df276..1a686dec 100644 --- a/mindee/image/extracted_image.py +++ b/mindee/image/extracted_image.py @@ -55,24 +55,20 @@ def __init__( self._element_id = 0 if element_id is None else element_id @requires_pillow - def save_to_file(self, output_path: Path | str, file_format: str | None = None): + def save_to_file(self, output_path: Path | str): """ Saves the document to a file. :param output_path: Path to save the file to. - :param file_format: Optional PIL-compatible format for the file. Inferred from file extension if not provided. :raises MindeeError: If an invalid path or filename is provided. """ try: resolved_path = Path(output_path).resolve() - if not file_format and len(resolved_path.suffix) < 1: + if not len(resolved_path.suffix) < 1: raise ValueError("Invalid file format.") self.buffer.seek(0) image = Image.open(self.buffer) - if file_format: - image.save(resolved_path, format=file_format) - else: - image.save(resolved_path) + image.save(resolved_path) logger.info("File saved successfully to '%s'.", resolved_path) except TypeError as e: raise MindeeError("Invalid path/filename provided.") from e @@ -92,7 +88,7 @@ def as_input_source(self) -> FileInput: @property def page_id(self): """ - ID of the page the receipt was found on. + ID of the page the image was found on. :return: A valid page ID. """ diff --git a/mindee/image/extracted_images.py b/mindee/image/extracted_images.py new file mode 100644 index 00000000..535a99d0 --- /dev/null +++ b/mindee/image/extracted_images.py @@ -0,0 +1,5 @@ +from mindee.image.extracted_image import ExtractedImage + + +class ExtractedImages(list[ExtractedImage]): + """List of extracted images.""" diff --git a/mindee/pdf/extracted_pdfs.py b/mindee/pdf/extracted_pdfs.py new file mode 100644 index 00000000..bd7fdeaa --- /dev/null +++ b/mindee/pdf/extracted_pdfs.py @@ -0,0 +1,5 @@ +from mindee.pdf.extracted_pdf import ExtractedPDF + + +class ExtractedPDFs(list[ExtractedPDF]): + """List of extracted PDFs.""" diff --git a/mindee/v2/file_operations/crop.py b/mindee/v2/file_operations/crop.py index 3f69b88c..15c0fe80 100644 --- a/mindee/v2/file_operations/crop.py +++ b/mindee/v2/file_operations/crop.py @@ -1,9 +1,9 @@ from mindee.error import MindeeError from mindee.geometry import Point, Polygon from mindee.image.extracted_image import ExtractedImage +from mindee.image.extracted_images import ExtractedImages from mindee.image.image_extractor import extract_multiple_images_from_source from mindee.input.local_input_source import LocalInputSource -from mindee.v2.file_operations.crop_files import CropFiles from mindee.v2.parsing.inference.field import FieldLocation from mindee.v2.product.crop.crop_item import CropItem @@ -25,7 +25,7 @@ def extract_single_crop( def extract_multiple_crops( input_source: LocalInputSource, crops: list[CropItem] -) -> CropFiles: +) -> ExtractedImages: """ Extracts individual receipts from multi-receipts documents. @@ -49,4 +49,4 @@ def extract_multiple_crops( polygon, ) ) - return CropFiles(images) + return ExtractedImages(images) diff --git a/mindee/v2/file_operations/crop_files.py b/mindee/v2/file_operations/crop_files.py deleted file mode 100644 index 4bb9f341..00000000 --- a/mindee/v2/file_operations/crop_files.py +++ /dev/null @@ -1,20 +0,0 @@ -from pathlib import Path - -from mindee.image.extracted_image import ExtractedImage - - -class CropFiles(list[ExtractedImage]): - """Crop files.""" - - def save_all_to_disk(self, path: Path | str, prefix: str = "crop"): - """ - Save all extracted crops to disk. - - :param path: Path to save the extracted splits to. - :param prefix: Prefix to add to the filename, defaults to 'crop'. - """ - if isinstance(path, str): - path = Path(path) - path.mkdir(parents=True, exist_ok=True) - for idx, split in enumerate(self, start=1): - split.save_to_file(path / f"{prefix}_{idx:03}.jpg") diff --git a/mindee/v2/file_operations/split.py b/mindee/v2/file_operations/split.py index 686f3929..8259b65f 100644 --- a/mindee/v2/file_operations/split.py +++ b/mindee/v2/file_operations/split.py @@ -1,8 +1,8 @@ from mindee.error import MindeeError from mindee.input.local_input_source import LocalInputSource from mindee.pdf.extracted_pdf import ExtractedPDF +from mindee.pdf.extracted_pdfs import ExtractedPDFs from mindee.pdf.pdf_extractor import PDFExtractor -from mindee.v2.file_operations.split_files import SplitFiles def extract_single_split( @@ -21,7 +21,7 @@ def extract_single_split( def extract_multiple_splits( input_source: LocalInputSource, splits: list[list[int]], -) -> SplitFiles: +) -> ExtractedPDFs: """ Extracts splits as complete PDFs from the document. @@ -35,4 +35,4 @@ def extract_multiple_splits( page_groups.append(list(range(split[0], split[1] + 1))) if len(splits) < 1: raise MindeeError("No indexes provided.") - return SplitFiles(pdf_extractor.extract_sub_documents(page_groups)) + return ExtractedPDFs(pdf_extractor.extract_sub_documents(page_groups)) diff --git a/mindee/v2/file_operations/split_files.py b/mindee/v2/file_operations/split_files.py deleted file mode 100644 index 8c23057b..00000000 --- a/mindee/v2/file_operations/split_files.py +++ /dev/null @@ -1,20 +0,0 @@ -from pathlib import Path - -from mindee.pdf.extracted_pdf import ExtractedPDF - - -class SplitFiles(list[ExtractedPDF]): - """Split files.""" - - def save_all_to_disk(self, path: str | Path, prefix: str = "split"): - """ - Save all extracted splits to disk. - - :param path: Path to save the extracted splits to. - :param prefix: Prefix to add to the filename, defaults to 'split'. - """ - if isinstance(path, str): - path = Path(path) - path.mkdir(parents=True, exist_ok=True) - for idx, split in enumerate(self, start=1): - split.save_to_file(path / f"{prefix}_{idx:03}.pdf") diff --git a/mindee/v2/product/crop/crop_result.py b/mindee/v2/product/crop/crop_result.py index 47561e90..d103e5b9 100644 --- a/mindee/v2/product/crop/crop_result.py +++ b/mindee/v2/product/crop/crop_result.py @@ -1,7 +1,7 @@ +from mindee.image.extracted_images import ExtractedImages from mindee.input.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict from mindee.v2.file_operations.crop import extract_multiple_crops -from mindee.v2.file_operations.crop_files import CropFiles from mindee.v2.product.crop.crop_item import CropItem @@ -20,7 +20,9 @@ def __str__(self) -> str: out_str = f"Crops\n====={crops}" return out_str - def extract_from_input_source(self, input_source: LocalInputSource) -> CropFiles: + def extract_from_input_source( + self, input_source: LocalInputSource + ) -> ExtractedImages: """ Apply all the crops to a file and return a single extracted PDF. diff --git a/mindee/v2/product/split/split_result.py b/mindee/v2/product/split/split_result.py index ab3921bf..dd3ac9da 100644 --- a/mindee/v2/product/split/split_result.py +++ b/mindee/v2/product/split/split_result.py @@ -1,7 +1,7 @@ from mindee.input.local_input_source import LocalInputSource from mindee.parsing.common.string_dict import StringDict +from mindee.pdf.extracted_pdfs import ExtractedPDFs from mindee.v2.file_operations.split import extract_multiple_splits -from mindee.v2.file_operations.split_files import SplitFiles from mindee.v2.product.split.split_range import SplitRange @@ -20,7 +20,9 @@ def __str__(self) -> str: out_str = f"Splits\n======{splits}" return out_str - def extract_from_input_source(self, input_source: LocalInputSource) -> SplitFiles: + def extract_from_input_source( + self, input_source: LocalInputSource + ) -> ExtractedPDFs: """ Apply all the crops to a file and return a single extracted PDF. diff --git a/tests/v2/file_operations/test_crop_operation.py b/tests/v2/file_operations/test_crop_operation.py index 22fe809a..c887dbb3 100644 --- a/tests/v2/file_operations/test_crop_operation.py +++ b/tests/v2/file_operations/test_crop_operation.py @@ -5,7 +5,6 @@ import pytest from mindee.input.path_input import PathInput -from mindee.v2.file_operations.crop import extract_multiple_crops from mindee.v2.product.crop.crop_response import ( CropResponse, ) @@ -39,9 +38,8 @@ def crops_multi_page_json_path(): def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_path): input_sample = PathInput(crops_single_page_path) with open(crops_single_page_json_path, "rb") as f: - response = json.load(f) - doc = CropResponse(response) - extracted_crops = extract_multiple_crops(input_sample, doc.inference.result.crops) + response = CropResponse(json.load(f)) + extracted_crops = response.inference.result.extract_from_input_source(input_sample) assert len(extracted_crops) == 1 assert extracted_crops[0].page_id == 0 @@ -55,9 +53,8 @@ def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_p def test_multi_page_receipt_crop(crops_multi_page_path, crops_multi_page_json_path): input_sample = PathInput(crops_multi_page_path) with open(crops_multi_page_json_path, "rb") as f: - response = json.load(f) - doc = CropResponse(response) - extracted_crops = extract_multiple_crops(input_sample, doc.inference.result.crops) + response = CropResponse(json.load(f)) + extracted_crops = response.inference.result.extract_from_input_source(input_sample) assert len(extracted_crops) == 2 assert extracted_crops[0].page_id == 0 diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py index e7cf3ddd..eb9b21cd 100644 --- a/tests/v2/file_operations/test_split_operation.py +++ b/tests/v2/file_operations/test_split_operation.py @@ -35,9 +35,8 @@ def splits_multi_page_json_path(): def test_single_page_split(splits_default, splits_single_page_json_path): input_sample = PathInput(splits_default) with open(splits_single_page_json_path, "rb") as f: - response = json.load(f) - doc = SplitResponse(response) - extracted_splits = doc.inference.result.extract_from_input_source(input_sample) + response = SplitResponse(json.load(f)) + extracted_splits = response.inference.result.extract_from_input_source(input_sample) assert len(extracted_splits) == 1 assert extracted_splits[0].get_page_count() == 1 @@ -47,9 +46,8 @@ def test_single_page_split(splits_default, splits_single_page_json_path): def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path): input_sample = PathInput(splits_5p) with open(splits_multi_page_json_path, "rb") as f: - response = json.load(f) - doc = SplitResponse(response) - extracted_splits = doc.inference.result.extract_from_input_source(input_sample) + response = SplitResponse(json.load(f)) + extracted_splits = response.inference.result.extract_from_input_source(input_sample) assert len(extracted_splits) == 3 assert extracted_splits[0].get_page_count() == 1 From 122e9fba4a00ac63a0f51eb1dce5bce2140ece84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Mon, 22 Jun 2026 17:43:51 +0200 Subject: [PATCH 2/2] other fixes --- mindee/image/__init__.py | 3 -- mindee/image/extracted_image.py | 33 +++++++++++-------- mindee/image/image_extractor.py | 14 ++++---- mindee/input/local_input_source.py | 2 +- mindee/pdf/extracted_pdf.py | 16 ++++----- tests/input/test_compression.py | 2 +- .../test_invoice_splitter_auto_extraction.py | 4 +-- .../file_operations/test_split_operation.py | 11 +++++++ 8 files changed, 47 insertions(+), 38 deletions(-) diff --git a/mindee/image/__init__.py b/mindee/image/__init__.py index f562ff76..e69de29b 100644 --- a/mindee/image/__init__.py +++ b/mindee/image/__init__.py @@ -1,3 +0,0 @@ -from mindee.image.image_compressor import compress_image - -__all__ = ["compress_image"] diff --git a/mindee/image/extracted_image.py b/mindee/image/extracted_image.py index 1a686dec..23957eae 100644 --- a/mindee/image/extracted_image.py +++ b/mindee/image/extracted_image.py @@ -1,14 +1,12 @@ from __future__ import annotations -import io from pathlib import Path -from typing import Any +from typing import Any, BinaryIO from mindee.dependencies.checkers import PILLOW_AVAILABLE from mindee.dependencies.decorators import requires_pillow from mindee.error.mindee_error import MindeeError -from mindee.input.file_input import FileInput -from mindee.input.local_input_source import LocalInputSource +from mindee.input.bytes_input import BytesInput from mindee.logger import logger if PILLOW_AVAILABLE: @@ -21,6 +19,7 @@ class ExtractedImage: """Generic class for image extraction.""" + buffer: BinaryIO _page_id: int """Id of the page the image was extracted from.""" _element_id: int @@ -29,27 +28,33 @@ class ExtractedImage: """Name of the file the image was extracted from.""" def __init__( - self, input_source: LocalInputSource, page_id: int, element_id: int + self, + img_byte_stream: BinaryIO, + orig_filename: str, + orig_extension: str, + page_id: int, + element_id: int, ) -> None: """ Initialize the ExtractedImage with a buffer and an internal file name. - :param input_source: Local source for input. + :param img_byte_stream: The raw image bytes. + :param orig_filename: Name of the file the image was extracted from. :param page_id: ID of the page the element was found on. :param element_id: ID of the element in a page. """ - self.buffer = io.BytesIO(input_source.file_object.read()) - self.buffer.name = input_source.filename - self.filename = input_source.filename - if input_source.is_pdf(): + self.buffer = img_byte_stream + self.filename = orig_filename + + if orig_extension.lower().endswith("pdf"): extension = "jpg" else: - extension = Path(input_source.filename).resolve().suffix + extension = orig_extension.lower() self.buffer.seek(0) pg_number = str(page_id).zfill(3) elem_number = str(element_id).zfill(3) self.internal_file_name = ( - f"{input_source.filename}_page{pg_number}-{elem_number}.{extension}" + f"{orig_filename}_page{pg_number}-{elem_number}.{extension}" ) self._page_id = page_id self._element_id = 0 if element_id is None else element_id @@ -76,14 +81,14 @@ def save_to_file(self, output_path: Path | str): print(e) raise MindeeError(f"Could not save file {Path(output_path).name}.") from e - def as_input_source(self) -> FileInput: + def as_input_source(self) -> BytesInput: """ Return the file as a Mindee-compatible BufferInput source. :returns: A BufferInput source. """ self.buffer.seek(0) - return FileInput(self.buffer) + return BytesInput(self.buffer.read(), self.internal_file_name) @property def page_id(self): diff --git a/mindee/image/image_extractor.py b/mindee/image/image_extractor.py index 0a33168a..fb87c81e 100644 --- a/mindee/image/image_extractor.py +++ b/mindee/image/image_extractor.py @@ -10,7 +10,6 @@ from mindee.geometry.point import Point from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y from mindee.image.extracted_image import ExtractedImage -from mindee.input.bytes_input import BytesInput from mindee.input.local_input_source import LocalInputSource if PYPDFIUM2_AVAILABLE: @@ -66,7 +65,7 @@ def extract_image_from_polygon( width: float, height: float, file_format: str, -) -> bytes: +) -> BinaryIO: """ Crops the image from the given polygon. @@ -91,7 +90,7 @@ def extract_image_from_polygon( @requires_pillow -def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes: +def save_image_to_buffer(image: Image.Image, file_format: str) -> BinaryIO: """ Saves an image as a buffer. @@ -102,7 +101,7 @@ def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes: buffer = io.BytesIO() image.save(buffer, format=file_format) buffer.seek(0) - return buffer.read() + return buffer @requires_pillow @@ -159,10 +158,9 @@ def extract_multiple_images_from_source( ) extracted_elements.append( ExtractedImage( - BytesInput( - image_data, - f"{input_source.filename}_page{page_id + 1}-{element_id}.{file_extension}", - ), + image_data, + input_source.filename, + file_extension, page_id, element_id, ) diff --git a/mindee/input/local_input_source.py b/mindee/input/local_input_source.py index 1a8ecf15..0fe78842 100644 --- a/mindee/input/local_input_source.py +++ b/mindee/input/local_input_source.py @@ -10,7 +10,7 @@ from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE from mindee.error.mimetype_error import MimeTypeError from mindee.error.mindee_error import MindeeError, MindeeSourceError -from mindee.image import compress_image +from mindee.image.image_compressor import compress_image from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions from mindee.logger import logger from mindee.pdf.pdf_compressor import compress_pdf diff --git a/mindee/pdf/extracted_pdf.py b/mindee/pdf/extracted_pdf.py index 21a0137f..a0325f9c 100644 --- a/mindee/pdf/extracted_pdf.py +++ b/mindee/pdf/extracted_pdf.py @@ -18,18 +18,18 @@ class ExtractedPDF: """An extracted sub-Pdf.""" - pdf_bytes: BinaryIO + buffer: BinaryIO filename: str - def __init__(self, pdf_bytes: BinaryIO, filename: str): - self.pdf_bytes = pdf_bytes + def __init__(self, pdf_byte_stream: BinaryIO, filename: str): + self.buffer = pdf_byte_stream self.filename = filename @requires_pypdfium2 def get_page_count(self) -> int: """Get the number of pages in the PDF file.""" try: - pdf = pdfium.PdfDocument(self.pdf_bytes) + pdf = pdfium.PdfDocument(self.buffer) return len(pdf) except Exception as e: raise MindeeError( @@ -50,11 +50,11 @@ def save_to_file(self, output_path: Path | str): raise MindeeError("Invalid save path provided {}.") if out_path.suffix.lower() != "pdf": out_path = out_path.parent / (out_path.stem + "." + "pdf") - self.pdf_bytes.seek(0) + self.buffer.seek(0) with open(out_path, "wb") as out_file: - out_file.write(self.pdf_bytes.read()) + out_file.write(self.buffer.read()) def as_input_source(self) -> BytesInput: """Returns the current PDF object as a usable BytesInput source.""" - self.pdf_bytes.seek(0) - return BytesInput(self.pdf_bytes.read(), self.filename) + self.buffer.seek(0) + return BytesInput(self.buffer.read(), self.filename) diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py index 6b4371ab..5f9a941e 100644 --- a/tests/input/test_compression.py +++ b/tests/input/test_compression.py @@ -6,7 +6,7 @@ import pytest -from mindee.image import compress_image +from mindee.image.image_compressor import compress_image from mindee.input import PathInput from mindee.pdf.pdf_compressor import compress_pdf from mindee.pdf.pdf_utils import extract_text_from_pdf diff --git a/tests/v1/extraction/test_invoice_splitter_auto_extraction.py b/tests/v1/extraction/test_invoice_splitter_auto_extraction.py index 8c4d46c5..10aef482 100644 --- a/tests/v1/extraction/test_invoice_splitter_auto_extraction.py +++ b/tests/v1/extraction/test_invoice_splitter_auto_extraction.py @@ -53,9 +53,7 @@ def test_pdf_should_extract_invoices_strict(): ) for i, extracted_pdf in enumerate(extracted_base_pdfs): assert extracted_pdf.filename == extracted_pdfs_strict[i].filename - assert ( - extracted_pdf.pdf_bytes.read() == extracted_pdfs_strict[i].pdf_bytes.read() - ) + assert extracted_pdf.buffer.read() == extracted_pdfs_strict[i].buffer.read() assert len(extracted_pdfs_not_strict) == 2 assert extracted_pdfs_not_strict[0].filename == "default_sample_001-001.pdf" diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py index eb9b21cd..4ca8d27e 100644 --- a/tests/v2/file_operations/test_split_operation.py +++ b/tests/v2/file_operations/test_split_operation.py @@ -53,3 +53,14 @@ def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path): assert extracted_splits[0].get_page_count() == 1 assert extracted_splits[1].get_page_count() == 3 assert extracted_splits[2].get_page_count() == 1 + + +@pytest.mark.pypdfium2 +def test_multi_page_receipt_single_split(splits_5p, splits_multi_page_json_path): + input_sample = PathInput(splits_5p) + with open(splits_multi_page_json_path, "rb") as f: + response = SplitResponse(json.load(f)) + split = response.inference.result.splits[1] + extracted_split = split.extract_from_input_source(input_sample) + + assert extracted_split.get_page_count() == 3