From 481421a42df387c4416a885fe11c389abefb9fd9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= <ianare@mindee.co>
Date: Mon, 22 Jun 2026 16:59:57 +0200
Subject: [PATCH 1/2] :bug: :boom: harmonize Crop and Split

---
 mindee/image/extracted_image.py               | 12 ++++-------
 mindee/image/extracted_images.py              |  5 +++++
 mindee/pdf/extracted_pdfs.py                  |  5 +++++
 mindee/v2/file_operations/crop.py             |  6 +++---
 mindee/v2/file_operations/crop_files.py       | 20 -------------------
 mindee/v2/file_operations/split.py            |  6 +++---
 mindee/v2/file_operations/split_files.py      | 20 -------------------
 mindee/v2/product/crop/crop_result.py         |  6 ++++--
 mindee/v2/product/split/split_result.py       |  6 ++++--
 .../v2/file_operations/test_crop_operation.py | 11 ++++------
 .../file_operations/test_split_operation.py   | 10 ++++------
 11 files changed, 36 insertions(+), 71 deletions(-)
 create mode 100644 mindee/image/extracted_images.py
 create mode 100644 mindee/pdf/extracted_pdfs.py
 delete mode 100644 mindee/v2/file_operations/crop_files.py
 delete mode 100644 mindee/v2/file_operations/split_files.py

diff --git a/mindee/image/extracted_image.py b/mindee/image/extracted_image.py
index 6d0df276..1a686dec 100644
--- a/mindee/image/extracted_image.py
+++ b/mindee/image/extracted_image.py
@@ -55,24 +55,20 @@ def __init__(
         self._element_id = 0 if element_id is None else element_id
 
     @requires_pillow
-    def save_to_file(self, output_path: Path | str, file_format: str | None = None):
+    def save_to_file(self, output_path: Path | str):
         """
         Saves the document to a file.
 
         :param output_path: Path to save the file to.
-        :param file_format: Optional PIL-compatible format for the file. Inferred from file extension if not provided.
         :raises MindeeError: If an invalid path or filename is provided.
         """
         try:
             resolved_path = Path(output_path).resolve()
-            if not file_format and len(resolved_path.suffix) < 1:
+            if not resolved_path.suffix:
                 raise ValueError("Invalid file format.")
             self.buffer.seek(0)
             image = Image.open(self.buffer)
-            if file_format:
-                image.save(resolved_path, format=file_format)
-            else:
-                image.save(resolved_path)
+            image.save(resolved_path)
             logger.info("File saved successfully to '%s'.", resolved_path)
         except TypeError as e:
             raise MindeeError("Invalid path/filename provided.") from e
@@ -92,7 +88,7 @@ def as_input_source(self) -> FileInput:
     @property
     def page_id(self):
         """
-        ID of the page the receipt was found on.
+        ID of the page the image was found on.
 
         :return: A valid page ID.
         """
diff --git a/mindee/image/extracted_images.py b/mindee/image/extracted_images.py
new file mode 100644
index 00000000..535a99d0
--- /dev/null
+++ b/mindee/image/extracted_images.py
@@ -0,0 +1,5 @@
+from mindee.image.extracted_image import ExtractedImage
+
+
+class ExtractedImages(list[ExtractedImage]):
+    """List of extracted images."""
diff --git a/mindee/pdf/extracted_pdfs.py b/mindee/pdf/extracted_pdfs.py
new file mode 100644
index 00000000..bd7fdeaa
--- /dev/null
+++ b/mindee/pdf/extracted_pdfs.py
@@ -0,0 +1,5 @@
+from mindee.pdf.extracted_pdf import ExtractedPDF
+
+
+class ExtractedPDFs(list[ExtractedPDF]):
+    """List of extracted PDFs."""
diff --git a/mindee/v2/file_operations/crop.py b/mindee/v2/file_operations/crop.py
index 3f69b88c..15c0fe80 100644
--- a/mindee/v2/file_operations/crop.py
+++ b/mindee/v2/file_operations/crop.py
@@ -1,9 +1,9 @@
 from mindee.error import MindeeError
 from mindee.geometry import Point, Polygon
 from mindee.image.extracted_image import ExtractedImage
+from mindee.image.extracted_images import ExtractedImages
 from mindee.image.image_extractor import extract_multiple_images_from_source
 from mindee.input.local_input_source import LocalInputSource
-from mindee.v2.file_operations.crop_files import CropFiles
 from mindee.v2.parsing.inference.field import FieldLocation
 from mindee.v2.product.crop.crop_item import CropItem
 
@@ -25,7 +25,7 @@ def extract_single_crop(
 
 def extract_multiple_crops(
     input_source: LocalInputSource, crops: list[CropItem]
-) -> CropFiles:
+) -> ExtractedImages:
     """
     Extracts individual receipts from multi-receipts documents.
 
@@ -49,4 +49,4 @@ def extract_multiple_crops(
                 polygon,
             )
         )
-    return CropFiles(images)
+    return ExtractedImages(images)
diff --git a/mindee/v2/file_operations/crop_files.py b/mindee/v2/file_operations/crop_files.py
deleted file mode 100644
index 4bb9f341..00000000
--- a/mindee/v2/file_operations/crop_files.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from pathlib import Path
-
-from mindee.image.extracted_image import ExtractedImage
-
-
-class CropFiles(list[ExtractedImage]):
-    """Crop files."""
-
-    def save_all_to_disk(self, path: Path | str, prefix: str = "crop"):
-        """
-        Save all extracted crops to disk.
-
-        :param path: Path to save the extracted splits to.
-        :param prefix: Prefix to add to the filename, defaults to 'crop'.
-        """
-        if isinstance(path, str):
-            path = Path(path)
-        path.mkdir(parents=True, exist_ok=True)
-        for idx, split in enumerate(self, start=1):
-            split.save_to_file(path / f"{prefix}_{idx:03}.jpg")
diff --git a/mindee/v2/file_operations/split.py b/mindee/v2/file_operations/split.py
index 686f3929..8259b65f 100644
--- a/mindee/v2/file_operations/split.py
+++ b/mindee/v2/file_operations/split.py
@@ -1,8 +1,8 @@
 from mindee.error import MindeeError
 from mindee.input.local_input_source import LocalInputSource
 from mindee.pdf.extracted_pdf import ExtractedPDF
+from mindee.pdf.extracted_pdfs import ExtractedPDFs
 from mindee.pdf.pdf_extractor import PDFExtractor
-from mindee.v2.file_operations.split_files import SplitFiles
 
 
 def extract_single_split(
@@ -21,7 +21,7 @@ def extract_single_split(
 def extract_multiple_splits(
     input_source: LocalInputSource,
     splits: list[list[int]],
-) -> SplitFiles:
+) -> ExtractedPDFs:
     """
     Extracts splits as complete PDFs from the document.
 
@@ -35,4 +35,4 @@ def extract_multiple_splits(
         page_groups.append(list(range(split[0], split[1] + 1)))
     if len(splits) < 1:
         raise MindeeError("No indexes provided.")
-    return SplitFiles(pdf_extractor.extract_sub_documents(page_groups))
+    return ExtractedPDFs(pdf_extractor.extract_sub_documents(page_groups))
diff --git a/mindee/v2/file_operations/split_files.py b/mindee/v2/file_operations/split_files.py
deleted file mode 100644
index 8c23057b..00000000
--- a/mindee/v2/file_operations/split_files.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from pathlib import Path
-
-from mindee.pdf.extracted_pdf import ExtractedPDF
-
-
-class SplitFiles(list[ExtractedPDF]):
-    """Split files."""
-
-    def save_all_to_disk(self, path: str | Path, prefix: str = "split"):
-        """
-        Save all extracted splits to disk.
-
-        :param path: Path to save the extracted splits to.
-        :param prefix: Prefix to add to the filename, defaults to 'split'.
-        """
-        if isinstance(path, str):
-            path = Path(path)
-        path.mkdir(parents=True, exist_ok=True)
-        for idx, split in enumerate(self, start=1):
-            split.save_to_file(path / f"{prefix}_{idx:03}.pdf")
diff --git a/mindee/v2/product/crop/crop_result.py b/mindee/v2/product/crop/crop_result.py
index 47561e90..d103e5b9 100644
--- a/mindee/v2/product/crop/crop_result.py
+++ b/mindee/v2/product/crop/crop_result.py
@@ -1,7 +1,7 @@
+from mindee.image.extracted_images import ExtractedImages
 from mindee.input.local_input_source import LocalInputSource
 from mindee.parsing.common.string_dict import StringDict
 from mindee.v2.file_operations.crop import extract_multiple_crops
-from mindee.v2.file_operations.crop_files import CropFiles
 from mindee.v2.product.crop.crop_item import CropItem
 
 
@@ -20,7 +20,9 @@ def __str__(self) -> str:
         out_str = f"Crops\n====={crops}"
         return out_str
 
-    def extract_from_input_source(self, input_source: LocalInputSource) -> CropFiles:
+    def extract_from_input_source(
+        self, input_source: LocalInputSource
+    ) -> ExtractedImages:
         """
-        Apply all the crops to a file and return a single extracted PDF.
+        Apply all the crops to a file and return the extracted images.
 
diff --git a/mindee/v2/product/split/split_result.py b/mindee/v2/product/split/split_result.py
index ab3921bf..dd3ac9da 100644
--- a/mindee/v2/product/split/split_result.py
+++ b/mindee/v2/product/split/split_result.py
@@ -1,7 +1,7 @@
 from mindee.input.local_input_source import LocalInputSource
 from mindee.parsing.common.string_dict import StringDict
+from mindee.pdf.extracted_pdfs import ExtractedPDFs
 from mindee.v2.file_operations.split import extract_multiple_splits
-from mindee.v2.file_operations.split_files import SplitFiles
 from mindee.v2.product.split.split_range import SplitRange
 
 
@@ -20,7 +20,9 @@ def __str__(self) -> str:
         out_str = f"Splits\n======{splits}"
         return out_str
 
-    def extract_from_input_source(self, input_source: LocalInputSource) -> SplitFiles:
+    def extract_from_input_source(
+        self, input_source: LocalInputSource
+    ) -> ExtractedPDFs:
         """
-        Apply all the crops to a file and return a single extracted PDF.
+        Apply all the splits to a file and return the extracted PDFs.
 
diff --git a/tests/v2/file_operations/test_crop_operation.py b/tests/v2/file_operations/test_crop_operation.py
index 22fe809a..c887dbb3 100644
--- a/tests/v2/file_operations/test_crop_operation.py
+++ b/tests/v2/file_operations/test_crop_operation.py
@@ -5,7 +5,6 @@
 import pytest
 
 from mindee.input.path_input import PathInput
-from mindee.v2.file_operations.crop import extract_multiple_crops
 from mindee.v2.product.crop.crop_response import (
     CropResponse,
 )
@@ -39,9 +38,8 @@ def crops_multi_page_json_path():
 def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_path):
     input_sample = PathInput(crops_single_page_path)
     with open(crops_single_page_json_path, "rb") as f:
-        response = json.load(f)
-    doc = CropResponse(response)
-    extracted_crops = extract_multiple_crops(input_sample, doc.inference.result.crops)
+        response = CropResponse(json.load(f))
+    extracted_crops = response.inference.result.extract_from_input_source(input_sample)
     assert len(extracted_crops) == 1
 
     assert extracted_crops[0].page_id == 0
@@ -55,9 +53,8 @@ def test_single_page_crop_split(crops_single_page_path, crops_single_page_json_p
 def test_multi_page_receipt_crop(crops_multi_page_path, crops_multi_page_json_path):
     input_sample = PathInput(crops_multi_page_path)
     with open(crops_multi_page_json_path, "rb") as f:
-        response = json.load(f)
-    doc = CropResponse(response)
-    extracted_crops = extract_multiple_crops(input_sample, doc.inference.result.crops)
+        response = CropResponse(json.load(f))
+    extracted_crops = response.inference.result.extract_from_input_source(input_sample)
     assert len(extracted_crops) == 2
 
     assert extracted_crops[0].page_id == 0
diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py
index e7cf3ddd..eb9b21cd 100644
--- a/tests/v2/file_operations/test_split_operation.py
+++ b/tests/v2/file_operations/test_split_operation.py
@@ -35,9 +35,8 @@ def splits_multi_page_json_path():
 def test_single_page_split(splits_default, splits_single_page_json_path):
     input_sample = PathInput(splits_default)
     with open(splits_single_page_json_path, "rb") as f:
-        response = json.load(f)
-    doc = SplitResponse(response)
-    extracted_splits = doc.inference.result.extract_from_input_source(input_sample)
+        response = SplitResponse(json.load(f))
+    extracted_splits = response.inference.result.extract_from_input_source(input_sample)
     assert len(extracted_splits) == 1
 
     assert extracted_splits[0].get_page_count() == 1
@@ -47,9 +46,8 @@ def test_single_page_split(splits_default, splits_single_page_json_path):
 def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path):
     input_sample = PathInput(splits_5p)
     with open(splits_multi_page_json_path, "rb") as f:
-        response = json.load(f)
-    doc = SplitResponse(response)
-    extracted_splits = doc.inference.result.extract_from_input_source(input_sample)
+        response = SplitResponse(json.load(f))
+    extracted_splits = response.inference.result.extract_from_input_source(input_sample)
     assert len(extracted_splits) == 3
 
     assert extracted_splits[0].get_page_count() == 1

From 122e9fba4a00ac63a0f51eb1dce5bce2140ece84 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= <ianare@mindee.co>
Date: Mon, 22 Jun 2026 17:43:51 +0200
Subject: [PATCH 2/2] decouple ExtractedImage from input sources, rename pdf_bytes to buffer

---
 mindee/image/__init__.py                      |  3 --
 mindee/image/extracted_image.py               | 33 +++++++++++--------
 mindee/image/image_extractor.py               | 14 ++++----
 mindee/input/local_input_source.py            |  2 +-
 mindee/pdf/extracted_pdf.py                   | 16 ++++-----
 tests/input/test_compression.py               |  2 +-
 .../test_invoice_splitter_auto_extraction.py  |  4 +--
 .../file_operations/test_split_operation.py   | 11 +++++++
 8 files changed, 47 insertions(+), 38 deletions(-)

diff --git a/mindee/image/__init__.py b/mindee/image/__init__.py
index f562ff76..e69de29b 100644
--- a/mindee/image/__init__.py
+++ b/mindee/image/__init__.py
@@ -1,3 +0,0 @@
-from mindee.image.image_compressor import compress_image
-
-__all__ = ["compress_image"]
diff --git a/mindee/image/extracted_image.py b/mindee/image/extracted_image.py
index 1a686dec..23957eae 100644
--- a/mindee/image/extracted_image.py
+++ b/mindee/image/extracted_image.py
@@ -1,14 +1,12 @@
 from __future__ import annotations
 
-import io
 from pathlib import Path
-from typing import Any
+from typing import Any, BinaryIO
 
 from mindee.dependencies.checkers import PILLOW_AVAILABLE
 from mindee.dependencies.decorators import requires_pillow
 from mindee.error.mindee_error import MindeeError
-from mindee.input.file_input import FileInput
-from mindee.input.local_input_source import LocalInputSource
+from mindee.input.bytes_input import BytesInput
 from mindee.logger import logger
 
 if PILLOW_AVAILABLE:
@@ -21,6 +19,7 @@
 class ExtractedImage:
     """Generic class for image extraction."""
 
+    buffer: BinaryIO
     _page_id: int
     """Id of the page the image was extracted from."""
     _element_id: int
@@ -29,27 +28,34 @@ class ExtractedImage:
     """Name of the file the image was extracted from."""
 
     def __init__(
-        self, input_source: LocalInputSource, page_id: int, element_id: int
+        self,
+        img_byte_stream: BinaryIO,
+        orig_filename: str,
+        orig_extension: str,
+        page_id: int,
+        element_id: int,
     ) -> None:
         """
         Initialize the ExtractedImage with a buffer and an internal file name.
 
-        :param input_source: Local source for input.
+        :param img_byte_stream: Binary stream holding the image data.
+        :param orig_filename: Name of the file the image was extracted from.
+        :param orig_extension: Extension of the source file ("pdf" is mapped to "jpg").
         :param page_id: ID of the page the element was found on.
         :param element_id: ID of the element in a page.
         """
-        self.buffer = io.BytesIO(input_source.file_object.read())
-        self.buffer.name = input_source.filename
-        self.filename = input_source.filename
-        if input_source.is_pdf():
+        self.buffer = img_byte_stream
+        self.filename = orig_filename
+
+        if orig_extension.lower().endswith("pdf"):
             extension = "jpg"
         else:
-            extension = Path(input_source.filename).resolve().suffix
+            extension = orig_extension.lower()
         self.buffer.seek(0)
         pg_number = str(page_id).zfill(3)
         elem_number = str(element_id).zfill(3)
         self.internal_file_name = (
-            f"{input_source.filename}_page{pg_number}-{elem_number}.{extension}"
+            f"{orig_filename}_page{pg_number}-{elem_number}.{extension}"
         )
         self._page_id = page_id
         self._element_id = 0 if element_id is None else element_id
@@ -76,14 +81,14 @@ def save_to_file(self, output_path: Path | str):
             print(e)
             raise MindeeError(f"Could not save file {Path(output_path).name}.") from e
 
-    def as_input_source(self) -> FileInput:
+    def as_input_source(self) -> BytesInput:
         """
-        Return the file as a Mindee-compatible BufferInput source.
+        Return the image as a Mindee-compatible BytesInput source.
 
-        :returns: A BufferInput source.
+        :returns: A BytesInput source.
         """
         self.buffer.seek(0)
-        return FileInput(self.buffer)
+        return BytesInput(self.buffer.read(), self.internal_file_name)
 
     @property
     def page_id(self):
diff --git a/mindee/image/image_extractor.py b/mindee/image/image_extractor.py
index 0a33168a..fb87c81e 100644
--- a/mindee/image/image_extractor.py
+++ b/mindee/image/image_extractor.py
@@ -10,7 +10,6 @@
 from mindee.geometry.point import Point
 from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y
 from mindee.image.extracted_image import ExtractedImage
-from mindee.input.bytes_input import BytesInput
 from mindee.input.local_input_source import LocalInputSource
 
 if PYPDFIUM2_AVAILABLE:
@@ -66,7 +65,7 @@ def extract_image_from_polygon(
     width: float,
     height: float,
     file_format: str,
-) -> bytes:
+) -> BinaryIO:
     """
     Crops the image from the given polygon.
 
@@ -91,7 +90,7 @@ def extract_image_from_polygon(
 
 
 @requires_pillow
-def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes:
+def save_image_to_buffer(image: Image.Image, file_format: str) -> BinaryIO:
     """
     Saves an image as a buffer.
 
@@ -102,7 +101,7 @@ def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes:
     buffer = io.BytesIO()
     image.save(buffer, format=file_format)
     buffer.seek(0)
-    return buffer.read()
+    return buffer
 
 
 @requires_pillow
@@ -159,10 +158,9 @@ def extract_multiple_images_from_source(
         )
         extracted_elements.append(
             ExtractedImage(
-                BytesInput(
-                    image_data,
-                    f"{input_source.filename}_page{page_id + 1}-{element_id}.{file_extension}",
-                ),
+                image_data,
+                input_source.filename,
+                file_extension,
                 page_id,
                 element_id,
             )
diff --git a/mindee/input/local_input_source.py b/mindee/input/local_input_source.py
index 1a8ecf15..0fe78842 100644
--- a/mindee/input/local_input_source.py
+++ b/mindee/input/local_input_source.py
@@ -10,7 +10,7 @@
 from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE
 from mindee.error.mimetype_error import MimeTypeError
 from mindee.error.mindee_error import MindeeError, MindeeSourceError
-from mindee.image import compress_image
+from mindee.image.image_compressor import compress_image
 from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions
 from mindee.logger import logger
 from mindee.pdf.pdf_compressor import compress_pdf
diff --git a/mindee/pdf/extracted_pdf.py b/mindee/pdf/extracted_pdf.py
index 21a0137f..a0325f9c 100644
--- a/mindee/pdf/extracted_pdf.py
+++ b/mindee/pdf/extracted_pdf.py
@@ -18,18 +18,18 @@
 class ExtractedPDF:
     """An extracted sub-Pdf."""
 
-    pdf_bytes: BinaryIO
+    buffer: BinaryIO
     filename: str
 
-    def __init__(self, pdf_bytes: BinaryIO, filename: str):
-        self.pdf_bytes = pdf_bytes
+    def __init__(self, pdf_byte_stream: BinaryIO, filename: str):
+        self.buffer = pdf_byte_stream
         self.filename = filename
 
     @requires_pypdfium2
     def get_page_count(self) -> int:
         """Get the number of pages in the PDF file."""
         try:
-            pdf = pdfium.PdfDocument(self.pdf_bytes)
+            pdf = pdfium.PdfDocument(self.buffer)
             return len(pdf)
         except Exception as e:
             raise MindeeError(
@@ -50,11 +50,11 @@ def save_to_file(self, output_path: Path | str):
             raise MindeeError("Invalid save path provided {}.")
         if out_path.suffix.lower() != "pdf":
             out_path = out_path.parent / (out_path.stem + "." + "pdf")
-        self.pdf_bytes.seek(0)
+        self.buffer.seek(0)
         with open(out_path, "wb") as out_file:
-            out_file.write(self.pdf_bytes.read())
+            out_file.write(self.buffer.read())
 
     def as_input_source(self) -> BytesInput:
         """Returns the current PDF object as a usable BytesInput source."""
-        self.pdf_bytes.seek(0)
-        return BytesInput(self.pdf_bytes.read(), self.filename)
+        self.buffer.seek(0)
+        return BytesInput(self.buffer.read(), self.filename)
diff --git a/tests/input/test_compression.py b/tests/input/test_compression.py
index 6b4371ab..5f9a941e 100644
--- a/tests/input/test_compression.py
+++ b/tests/input/test_compression.py
@@ -6,7 +6,7 @@
 
 import pytest
 
-from mindee.image import compress_image
+from mindee.image.image_compressor import compress_image
 from mindee.input import PathInput
 from mindee.pdf.pdf_compressor import compress_pdf
 from mindee.pdf.pdf_utils import extract_text_from_pdf
diff --git a/tests/v1/extraction/test_invoice_splitter_auto_extraction.py b/tests/v1/extraction/test_invoice_splitter_auto_extraction.py
index 8c4d46c5..10aef482 100644
--- a/tests/v1/extraction/test_invoice_splitter_auto_extraction.py
+++ b/tests/v1/extraction/test_invoice_splitter_auto_extraction.py
@@ -53,9 +53,7 @@ def test_pdf_should_extract_invoices_strict():
     )
     for i, extracted_pdf in enumerate(extracted_base_pdfs):
         assert extracted_pdf.filename == extracted_pdfs_strict[i].filename
-        assert (
-            extracted_pdf.pdf_bytes.read() == extracted_pdfs_strict[i].pdf_bytes.read()
-        )
+        assert extracted_pdf.buffer.read() == extracted_pdfs_strict[i].buffer.read()
 
     assert len(extracted_pdfs_not_strict) == 2
     assert extracted_pdfs_not_strict[0].filename == "default_sample_001-001.pdf"
diff --git a/tests/v2/file_operations/test_split_operation.py b/tests/v2/file_operations/test_split_operation.py
index eb9b21cd..4ca8d27e 100644
--- a/tests/v2/file_operations/test_split_operation.py
+++ b/tests/v2/file_operations/test_split_operation.py
@@ -53,3 +53,14 @@ def test_multi_page_receipt_split(splits_5p, splits_multi_page_json_path):
     assert extracted_splits[0].get_page_count() == 1
     assert extracted_splits[1].get_page_count() == 3
     assert extracted_splits[2].get_page_count() == 1
+
+
+@pytest.mark.pypdfium2
+def test_multi_page_receipt_single_split(splits_5p, splits_multi_page_json_path):
+    input_sample = PathInput(splits_5p)
+    with open(splits_multi_page_json_path, "rb") as f:
+        response = SplitResponse(json.load(f))
+    split = response.inference.result.splits[1]
+    extracted_split = split.extract_from_input_source(input_sample)
+
+    assert extracted_split.get_page_count() == 3