Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions mindee/image/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +0,0 @@
from mindee.image.image_compressor import compress_image

__all__ = ["compress_image"]
45 changes: 23 additions & 22 deletions mindee/image/extracted_image.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
from __future__ import annotations

import io
from pathlib import Path
from typing import Any
from typing import Any, BinaryIO

from mindee.dependencies.checkers import PILLOW_AVAILABLE
from mindee.dependencies.decorators import requires_pillow
from mindee.error.mindee_error import MindeeError
from mindee.input.file_input import FileInput
from mindee.input.local_input_source import LocalInputSource
from mindee.input.bytes_input import BytesInput
from mindee.logger import logger

if PILLOW_AVAILABLE:
Expand All @@ -21,6 +19,7 @@
class ExtractedImage:
"""Generic class for image extraction."""

buffer: BinaryIO
_page_id: int
"""Id of the page the image was extracted from."""
_element_id: int
Expand All @@ -29,70 +28,72 @@ class ExtractedImage:
"""Name of the file the image was extracted from."""

def __init__(
self, input_source: LocalInputSource, page_id: int, element_id: int
self,
img_byte_stream: BinaryIO,
orig_filename: str,
orig_extension: str,
page_id: int,
element_id: int,
) -> None:
"""
Initialize the ExtractedImage with a buffer and an internal file name.

:param input_source: Local source for input.
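:param img_byte_stream: Binary stream containing the image data.
:param orig_filename: Name of the file the image was extracted from.
:param orig_extension: Extension of the original file; a value ending in "pdf" results in a "jpg" extension.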
:param page_id: ID of the page the element was found on.
:param element_id: ID of the element in a page.
"""
self.buffer = io.BytesIO(input_source.file_object.read())
self.buffer.name = input_source.filename
self.filename = input_source.filename
if input_source.is_pdf():
self.buffer = img_byte_stream
self.filename = orig_filename

if orig_extension.lower().endswith("pdf"):
extension = "jpg"
else:
extension = Path(input_source.filename).resolve().suffix
extension = orig_extension.lower()
self.buffer.seek(0)
pg_number = str(page_id).zfill(3)
elem_number = str(element_id).zfill(3)
self.internal_file_name = (
f"{input_source.filename}_page{pg_number}-{elem_number}.{extension}"
f"{orig_filename}_page{pg_number}-{elem_number}.{extension}"
)
self._page_id = page_id
self._element_id = 0 if element_id is None else element_id

@requires_pillow
def save_to_file(self, output_path: Path | str, file_format: str | None = None):
def save_to_file(self, output_path: Path | str):
"""
Saves the document to a file.

:param output_path: Path to save the file to.
:param file_format: Optional PIL-compatible format for the file. Inferred from file extension if not provided.
:raises MindeeError: If an invalid path or filename is provided.
"""
try:
resolved_path = Path(output_path).resolve()
if not file_format and len(resolved_path.suffix) < 1:
if not len(resolved_path.suffix) < 1:
raise ValueError("Invalid file format.")
self.buffer.seek(0)
image = Image.open(self.buffer)
if file_format:
image.save(resolved_path, format=file_format)
else:
image.save(resolved_path)
image.save(resolved_path)
logger.info("File saved successfully to '%s'.", resolved_path)
except TypeError as e:
raise MindeeError("Invalid path/filename provided.") from e
except Exception as e:
print(e)
raise MindeeError(f"Could not save file {Path(output_path).name}.") from e

def as_input_source(self) -> FileInput:
def as_input_source(self) -> BytesInput:
"""
Return the file as a Mindee-compatible BytesInput source.

:returns: A BytesInput source.
"""
self.buffer.seek(0)
return FileInput(self.buffer)
return BytesInput(self.buffer.read(), self.internal_file_name)

@property
def page_id(self):
"""
ID of the page the receipt was found on.
ID of the page the image was found on.

:return: A valid page ID.
"""
Expand Down
5 changes: 5 additions & 0 deletions mindee/image/extracted_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from mindee.image.extracted_image import ExtractedImage


class ExtractedImages(list[ExtractedImage]):
    """
    Collection of ``ExtractedImage`` objects.

    A plain ``list`` subclass that adds no attributes or methods of its own.
    """
14 changes: 6 additions & 8 deletions mindee/image/image_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from mindee.geometry.point import Point
from mindee.geometry.polygon import Polygon, get_min_max_x, get_min_max_y
from mindee.image.extracted_image import ExtractedImage
from mindee.input.bytes_input import BytesInput
from mindee.input.local_input_source import LocalInputSource

if PYPDFIUM2_AVAILABLE:
Expand Down Expand Up @@ -66,7 +65,7 @@ def extract_image_from_polygon(
width: float,
height: float,
file_format: str,
) -> bytes:
) -> BinaryIO:
"""
Crops the image from the given polygon.
Expand All @@ -91,7 +90,7 @@ def extract_image_from_polygon(


@requires_pillow
def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes:
def save_image_to_buffer(image: Image.Image, file_format: str) -> BinaryIO:
"""
Saves an image as a buffer.
Expand All @@ -102,7 +101,7 @@ def save_image_to_buffer(image: Image.Image, file_format: str) -> bytes:
buffer = io.BytesIO()
image.save(buffer, format=file_format)
buffer.seek(0)
return buffer.read()
return buffer


@requires_pillow
Expand Down Expand Up @@ -159,10 +158,9 @@ def extract_multiple_images_from_source(
)
extracted_elements.append(
ExtractedImage(
BytesInput(
image_data,
f"{input_source.filename}_page{page_id + 1}-{element_id}.{file_extension}",
),
image_data,
input_source.filename,
file_extension,
page_id,
element_id,
)
Expand Down
2 changes: 1 addition & 1 deletion mindee/input/local_input_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from mindee.dependencies.checkers import PYPDFIUM2_AVAILABLE
from mindee.error.mimetype_error import MimeTypeError
from mindee.error.mindee_error import MindeeError, MindeeSourceError
from mindee.image import compress_image
from mindee.image.image_compressor import compress_image
from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions
from mindee.logger import logger
from mindee.pdf.pdf_compressor import compress_pdf
Expand Down
16 changes: 8 additions & 8 deletions mindee/pdf/extracted_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,18 @@
class ExtractedPDF:
"""An extracted sub-Pdf."""

pdf_bytes: BinaryIO
buffer: BinaryIO
filename: str

def __init__(self, pdf_bytes: BinaryIO, filename: str):
self.pdf_bytes = pdf_bytes
def __init__(self, pdf_byte_stream: BinaryIO, filename: str):
self.buffer = pdf_byte_stream
self.filename = filename

@requires_pypdfium2
def get_page_count(self) -> int:
"""Get the number of pages in the PDF file."""
try:
pdf = pdfium.PdfDocument(self.pdf_bytes)
pdf = pdfium.PdfDocument(self.buffer)
return len(pdf)
except Exception as e:
raise MindeeError(
Expand All @@ -50,11 +50,11 @@ def save_to_file(self, output_path: Path | str):
raise MindeeError("Invalid save path provided {}.")
if out_path.suffix.lower() != "pdf":
out_path = out_path.parent / (out_path.stem + "." + "pdf")
self.pdf_bytes.seek(0)
self.buffer.seek(0)
with open(out_path, "wb") as out_file:
out_file.write(self.pdf_bytes.read())
out_file.write(self.buffer.read())

def as_input_source(self) -> BytesInput:
"""Returns the current PDF object as a usable BytesInput source."""
self.pdf_bytes.seek(0)
return BytesInput(self.pdf_bytes.read(), self.filename)
self.buffer.seek(0)
return BytesInput(self.buffer.read(), self.filename)
5 changes: 5 additions & 0 deletions mindee/pdf/extracted_pdfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from mindee.pdf.extracted_pdf import ExtractedPDF


class ExtractedPDFs(list[ExtractedPDF]):
    """
    Collection of ``ExtractedPDF`` objects.

    A plain ``list`` subclass that adds no attributes or methods of its own.
    """
6 changes: 3 additions & 3 deletions mindee/v2/file_operations/crop.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
from mindee.error import MindeeError
from mindee.geometry import Point, Polygon
from mindee.image.extracted_image import ExtractedImage
from mindee.image.extracted_images import ExtractedImages
from mindee.image.image_extractor import extract_multiple_images_from_source
from mindee.input.local_input_source import LocalInputSource
from mindee.v2.file_operations.crop_files import CropFiles
from mindee.v2.parsing.inference.field import FieldLocation
from mindee.v2.product.crop.crop_item import CropItem

Expand All @@ -25,7 +25,7 @@ def extract_single_crop(

def extract_multiple_crops(
input_source: LocalInputSource, crops: list[CropItem]
) -> CropFiles:
) -> ExtractedImages:
"""
Extracts the individual crops from a document as images.

Expand All @@ -49,4 +49,4 @@ def extract_multiple_crops(
polygon,
)
)
return CropFiles(images)
return ExtractedImages(images)
20 changes: 0 additions & 20 deletions mindee/v2/file_operations/crop_files.py

This file was deleted.

6 changes: 3 additions & 3 deletions mindee/v2/file_operations/split.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from mindee.error import MindeeError
from mindee.input.local_input_source import LocalInputSource
from mindee.pdf.extracted_pdf import ExtractedPDF
from mindee.pdf.extracted_pdfs import ExtractedPDFs
from mindee.pdf.pdf_extractor import PDFExtractor
from mindee.v2.file_operations.split_files import SplitFiles


def extract_single_split(
Expand All @@ -21,7 +21,7 @@ def extract_single_split(
def extract_multiple_splits(
input_source: LocalInputSource,
splits: list[list[int]],
) -> SplitFiles:
) -> ExtractedPDFs:
"""
Extracts splits as complete PDFs from the document.

Expand All @@ -35,4 +35,4 @@ def extract_multiple_splits(
page_groups.append(list(range(split[0], split[1] + 1)))
if len(splits) < 1:
raise MindeeError("No indexes provided.")
return SplitFiles(pdf_extractor.extract_sub_documents(page_groups))
return ExtractedPDFs(pdf_extractor.extract_sub_documents(page_groups))
20 changes: 0 additions & 20 deletions mindee/v2/file_operations/split_files.py

This file was deleted.

6 changes: 4 additions & 2 deletions mindee/v2/product/crop/crop_result.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from mindee.image.extracted_images import ExtractedImages
from mindee.input.local_input_source import LocalInputSource
from mindee.parsing.common.string_dict import StringDict
from mindee.v2.file_operations.crop import extract_multiple_crops
from mindee.v2.file_operations.crop_files import CropFiles
from mindee.v2.product.crop.crop_item import CropItem


Expand All @@ -20,7 +20,9 @@ def __str__(self) -> str:
out_str = f"Crops\n====={crops}"
return out_str

def extract_from_input_source(self, input_source: LocalInputSource) -> CropFiles:
def extract_from_input_source(
self, input_source: LocalInputSource
) -> ExtractedImages:
"""
Apply all the crops to a file and return the extracted images.

Expand Down
6 changes: 4 additions & 2 deletions mindee/v2/product/split/split_result.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from mindee.input.local_input_source import LocalInputSource
from mindee.parsing.common.string_dict import StringDict
from mindee.pdf.extracted_pdfs import ExtractedPDFs
from mindee.v2.file_operations.split import extract_multiple_splits
from mindee.v2.file_operations.split_files import SplitFiles
from mindee.v2.product.split.split_range import SplitRange


Expand All @@ -20,7 +20,9 @@ def __str__(self) -> str:
out_str = f"Splits\n======{splits}"
return out_str

def extract_from_input_source(self, input_source: LocalInputSource) -> SplitFiles:
def extract_from_input_source(
self, input_source: LocalInputSource
) -> ExtractedPDFs:
"""
Apply all the splits to a file and return the extracted PDFs.

Expand Down
2 changes: 1 addition & 1 deletion tests/input/test_compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

import pytest

from mindee.image import compress_image
from mindee.image.image_compressor import compress_image
from mindee.input import PathInput
from mindee.pdf.pdf_compressor import compress_pdf
from mindee.pdf.pdf_utils import extract_text_from_pdf
Expand Down
4 changes: 1 addition & 3 deletions tests/v1/extraction/test_invoice_splitter_auto_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,7 @@ def test_pdf_should_extract_invoices_strict():
)
for i, extracted_pdf in enumerate(extracted_base_pdfs):
assert extracted_pdf.filename == extracted_pdfs_strict[i].filename
assert (
extracted_pdf.pdf_bytes.read() == extracted_pdfs_strict[i].pdf_bytes.read()
)
assert extracted_pdf.buffer.read() == extracted_pdfs_strict[i].buffer.read()

assert len(extracted_pdfs_not_strict) == 2
assert extracted_pdfs_not_strict[0].filename == "default_sample_001-001.pdf"
Expand Down
Loading
Loading