From b531afa101fd988befda034d51698ee050844665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cle=CC=81ment=20Doumouro?= Date: Wed, 24 Jun 2026 15:21:48 +0200 Subject: [PATCH] feature(extract-core): output page byte ranges --- extract-core/extract_core/__init__.py | 6 +- extract-core/extract_core/objects.py | 24 ++++-- extract-python/benches/compare.py | 4 +- extract-python/extract_python/constants.py | 2 +- extract-python/extract_python/docling_.py | 56 +++++++------- extract-python/extract_python/marker_.py | 41 ++++------- extract-python/extract_python/miner_u.py | 23 ++---- extract-python/extract_python/utils.py | 21 +++++- extract-python/tests/test_docling.py | 2 +- extract-python/tests/test_marker.py | 2 +- extract-python/tests/test_miner_u.py | 2 +- extract-python/tests/test_utils.py | 85 ++++++++++++++++++++++ 12 files changed, 178 insertions(+), 90 deletions(-) create mode 100644 extract-python/tests/test_utils.py diff --git a/extract-core/extract_core/__init__.py b/extract-core/extract_core/__init__.py index 1e16f3b..d5eda82 100644 --- a/extract-core/extract_core/__init__.py +++ b/extract-core/extract_core/__init__.py @@ -12,7 +12,8 @@ InputDoc, MarkdownDoc, OutputFormat, - PageIndexes, + Pages, + Ranges, Result, Status, SupportedExt, @@ -58,7 +59,8 @@ "MinerUConfig", "MinerUPipelineConfig", "OutputFormat", - "PageIndexes", + "Ranges", + "Pages", "Pipeline", "PipelineType", "Result", diff --git a/extract-core/extract_core/objects.py b/extract-core/extract_core/objects.py index 7954bbc..eac8f67 100644 --- a/extract-core/extract_core/objects.py +++ b/extract-core/extract_core/objects.py @@ -3,6 +3,7 @@ import traceback import uuid from abc import ABC +from collections.abc import Sequence from enum import StrEnum from functools import cache from io import BytesIO @@ -16,7 +17,7 @@ no_enum_values_config, safe_copy, ) -from pydantic import AfterValidator, RootModel, TypeAdapter +from pydantic import AfterValidator, Field, TypeAdapter from pydantic import BaseModel as _BaseModel logger = logging.getLogger(__name__) @@ -203,18 +204,25 @@ def without_content(self) -> Self: return safe_copy(self, update={"content": None}) -class PageIndexes(RootModel[list[tuple[int, int]]]): - # Stores page end index +Ranges = list[tuple[int, int]] + + +class Pages(BaseModel): + total: int = 0 + byte_ranges: Ranges = [] + @classmethod - def from_page_end_indices(cls, lengths: list[int]) -> Self: - return [ - ((lengths[p - 1] if p > 0 else 0), lengths[p]) for p in range(len(lengths)) - ] + def from_pages_bytes_sizes(cls, sizes: Sequence[int]) -> Self: + bytes_ranges = [] + for p, end in enumerate(sizes): + start = 0 if p == 0 else bytes_ranges[-1][1] + bytes_ranges.append((start, start + end)) + return cls(total=len(sizes), byte_ranges=bytes_ranges) class ConversionOutput(BaseModel): path: Path - pages: PageIndexes = [] + pages: Pages = Field(default_factory=Pages) class MarkdownDoc(ConversionOutput): diff --git a/extract-python/benches/compare.py b/extract-python/benches/compare.py index 16cbd19..1b97f33 100644 --- a/extract-python/benches/compare.py +++ b/extract-python/benches/compare.py @@ -3,7 +3,7 @@ import markdown2 import pypdfium2 -from extract_core import BaseModel, OutputFormat, PageIndexes +from extract_core import BaseModel, OutputFormat, PageRanges from extract_python.utils import chdir from html2image import Html2Image from PIL import Image, ImageDraw @@ -140,7 +140,7 @@ def _scan_pages( root: Path, comparison: ComparisonItem ) -> list[dict[str, tuple[int, int]]]: all_pages = [ - PageIndexes.model_validate_json( + PageRanges.model_validate_json( (root / compared / "artifacts" / "pages.json").read_text() ).root for compared in comparison.compared diff --git a/extract-python/extract_python/constants.py b/extract-python/extract_python/constants.py index 9a5cc9c..71c44ae 100644 --- a/extract-python/extract_python/constants.py +++ b/extract-python/extract_python/constants.py @@ -1,2 +1,2 @@ ARTIFACTS = "artifacts" -DEFAULT_MD_PAGE_SEP = '
' +DEFAULT_MD_PAGE_SEP = '\n
\n' diff --git a/extract-python/extract_python/docling_.py b/extract-python/extract_python/docling_.py index 2e50983..9d63401 100644 --- a/extract-python/extract_python/docling_.py +++ b/extract-python/extract_python/docling_.py @@ -23,7 +23,6 @@ InputDoc, MarkdownDoc, OutputFormat, - PageIndexes, Pipeline, PipelineType, Result, @@ -34,7 +33,7 @@ from pydantic_core.core_schema import SerializerFunctionWrapHandler from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP -from .utils import chdir, map_and_preserve, path_to_artifacts_dirname +from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages logger = logging.getLogger(__name__) @@ -115,39 +114,38 @@ def _to_markdown_doc( raise FileExistsError(f"directory {md_dir} already exists") # Let's avoid issue of duplicated input file names flattened top level md_filename = md_dir_name + OutputFormat.MARKDOWN - total_length = 0 - n_pages = len(res.pages) - with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: tmp_dir = Path(td) - page_path = Path("page.md") - # We do a chdir to bypass a Docling bug which only allows to maintain relative - # image ref when saving the markdown to a relative path - with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir): - end_indices = [] - for page_i in range(n_pages): - res.document.save_as_markdown( - page_path, - page_no=page_i + 1, - image_mode=ImageRefMode.REFERENCED, - artifacts_dir=Path(ARTIFACTS), - **kwargs, - ) - content = page_path.read_text() - if page_i > 0: - content += "\n" - if page_i < n_pages - 1: - content += page_sep - total_length += len(content) - end_indices.append(total_length) - f.write(content) - f.flush() - page_path.unlink() + md_path = tmp_dir / md_filename + current_page_path = tmp_dir / "page.md" + with chdir(tmp_dir): + # We do a chdir to bypass a Docling bug which only allows to maintain + # relative image ref when saving the markdown to a relative path + pages = _docling_pages_it(res, current_page_path, **kwargs) + with md_path.open("wb") as f: + pages = write_pages(pages, page_sep, f) + # Clean up the tmp page file before move everything to the end destination + current_page_path.unlink() shutil.move(tmp_dir, md_dir) - pages = PageIndexes.from_page_end_indices(end_indices) return MarkdownDoc(path=Path(md_dir_name), pages=pages) +def _docling_pages_it( + res: ConversionResult, output_path: Path, **kwargs +) -> Iterable[str]: + n_pages = len(res.pages) + for page_i in range(n_pages): + res.document.save_as_markdown( + output_path, + page_no=page_i + 1, + image_mode=ImageRefMode.REFERENCED, + artifacts_dir=Path(ARTIFACTS), + **kwargs, + ) + content = output_path.read_text() + yield content + + class SerializableFormatOptions(DoclingFormatOption): # Utility class to serialize Python format options into a JSON which can be # correctly deserialized into a docling FormatOption diff --git a/extract-python/extract_python/marker_.py b/extract-python/extract_python/marker_.py index 6200c6b..162faa0 100644 --- a/extract-python/extract_python/marker_.py +++ b/extract-python/extract_python/marker_.py @@ -9,15 +9,14 @@ InputDoc, MarkdownDoc, OutputFormat, - PageIndexes, Pipeline, PipelineType, Result, Status, ) -from .constants import ARTIFACTS -from .utils import path_to_artifacts_dirname, report_recoverable_errors +from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP +from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages if TYPE_CHECKING: from marker.converters.pdf import PdfConverter @@ -63,7 +62,9 @@ async def _process_doc( content, _, images = text_from_rendered(rendered) match output_format: case OutputFormat.MARKDOWN: - output = _to_markdown_doc(doc, content, images, output_path) + output = _to_markdown_doc( + doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP + ) case _: raise NotImplementedError(f"unsupported output format {output_format}") input_doc = doc.without_content() @@ -71,7 +72,12 @@ async def _process_doc( def _to_markdown_doc( - input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path + input_doc: InputDoc, + content: str, + images: dict[str, "Image"], + output_path: Path, + *, + page_sep: str = DEFAULT_MD_PAGE_SEP, ) -> MarkdownDoc: from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415 @@ -85,24 +91,9 @@ def _to_markdown_doc( im.save(artifacts_dir / im_name) del images gc.collect() - page_sep = MarkdownRenderer.page_separator - content = content.split(page_sep) - n_pages = len(content) - md_path = (output_path / md_dir_name / md_dir_name).with_suffix( - OutputFormat.MARKDOWN.value - ) - total_length = 0 - end_indices = [] - with md_path.open("w", encoding="utf-8") as f: - for page_i, page_content in enumerate(content): - content = page_content - if page_i > 0: - content += "\n" - if page_i < n_pages - 1: - content += page_sep - total_length += len(content) - end_indices.append(total_length) - f.write(content) - f.flush() - pages = PageIndexes.from_page_end_indices(end_indices) + pages = content.split(MarkdownRenderer.page_separator) + md_path = output_path / md_dir_name / md_dir_name + md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value) + with md_path.open("wb") as f: + pages = write_pages(pages, page_sep, f) return MarkdownDoc(path=Path(md_dir_name), pages=pages) diff --git a/extract-python/extract_python/miner_u.py b/extract-python/extract_python/miner_u.py index 73e8879..399253c 100644 --- a/extract-python/extract_python/miner_u.py +++ b/extract-python/extract_python/miner_u.py @@ -12,7 +12,6 @@ MinerUBackend, MinerUPipelineConfig, OutputFormat, - PageIndexes, Pipeline, PipelineType, Result, @@ -20,7 +19,7 @@ ) from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP -from .utils import path_to_artifacts_dirname, reset_env +from .utils import path_to_artifacts_dirname, reset_env, write_pages _MINER_U_CONVERSION_ERRORS = tuple() MDMakeFunction = Callable[[list, str, str], str | None] @@ -148,21 +147,9 @@ def _dump_md_content( if md_make_mode is None: md_make_mode = MakeMode.MM_MD - total_length = 0 - end_indices = [] - with md_path.open("w") as f: - n_pages = len(pdf_info) - for page_i, page in enumerate(pdf_info): - content = md_make_fn([page], md_make_mode, str(im_dir)) - if page_i > 0: - content += "\n" - if page_i < n_pages - 1: - content += page_sep - total_length += len(content) - end_indices.append(total_length) - f.write(content) - f.flush() - end_indices = PageIndexes.from_page_end_indices(end_indices) + pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info) + with md_path.open("wb") as f: + pages = write_pages(pages, page_sep, f) output_path = md_path.parent.relative_to(output_path) - output = ConversionOutput(path=output_path, pages=end_indices) + output = ConversionOutput(path=output_path, pages=pages) return output diff --git a/extract-python/extract_python/utils.py b/extract-python/extract_python/utils.py index 6f8ee2a..74a977c 100644 --- a/extract-python/extract_python/utils.py +++ b/extract-python/extract_python/utils.py @@ -5,9 +5,9 @@ from functools import wraps from itertools import tee from pathlib import Path, PurePath -from typing import Protocol, TypeVar +from typing import BinaryIO, Protocol, TypeVar -from extract_core import Error, InputDoc, Result, Status +from extract_core import Error, InputDoc, Pages, Result, Status R = TypeVar("R") In = TypeVar("In") @@ -73,3 +73,20 @@ def reset_env() -> Generator[None, None, None]: finally: os.environ.clear() os.environ.update(old_env) + + +def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages: + pages = iter(pages) + next_page = None + pages_byte_sizes = [] + sentinel = object() + while True: + content = next(pages, sentinel) if next_page is None else next_page + if content is sentinel: + break + next_page = next(pages, sentinel) + if next_page is not sentinel: + content += page_sep + content = content.encode() + pages_byte_sizes.append(out.write(content)) + return Pages.from_pages_bytes_sizes(pages_byte_sizes) diff --git a/extract-python/tests/test_docling.py b/extract-python/tests/test_docling.py index 8632458..b233276 100644 --- a/extract-python/tests/test_docling.py +++ b/extract-python/tests/test_docling.py @@ -51,7 +51,7 @@ async def test_docling_pdf_to_markdown( assert (output_path / p).is_dir() assert (output_path / p / p.name).with_suffix(".md").exists() assert any((output_path / p).glob("artifacts/*.png")) - assert all(r.output.pages for r in res) + assert all(r.output.pages.byte_ranges for r in res) assert not any(r.errors for r in res) input_path = [r.input.path for r in res] expected_input_path = [ diff --git a/extract-python/tests/test_marker.py b/extract-python/tests/test_marker.py index 7ca52a1..55a142f 100644 --- a/extract-python/tests/test_marker.py +++ b/extract-python/tests/test_marker.py @@ -45,7 +45,7 @@ async def test_marker_pdf_to_markdown( assert (output_path / p).is_dir() assert (output_path / p / p.name).with_suffix(".md").exists() assert any((output_path / p).glob("artifacts/*.jpeg")) - assert all(r.output.pages for r in res) + assert all(r.output.pages.byte_ranges for r in res) assert not any(r.errors for r in res) input_path = [r.input.path for r in res] expected_path = [ diff --git a/extract-python/tests/test_miner_u.py b/extract-python/tests/test_miner_u.py index 908b326..2a71ce7 100644 --- a/extract-python/tests/test_miner_u.py +++ b/extract-python/tests/test_miner_u.py @@ -46,7 +46,7 @@ async def test_miner_u_pdf_to_markdown( assert (output_path / p).is_dir() assert (output_path / p / p.name).with_suffix(".md").exists() assert any((output_path / p).glob("artifacts/*.jpg")) - assert all(r.output.pages for r in res) + assert all(r.output.pages.byte_ranges for r in res) assert not any(r.errors for r in res) input_path = [r.input.path for r in res] expected_path = [ diff --git a/extract-python/tests/test_utils.py b/extract-python/tests/test_utils.py new file mode 100644 index 0000000..f05e008 --- /dev/null +++ b/extract-python/tests/test_utils.py @@ -0,0 +1,85 @@ +from io import BytesIO + +import pytest +from extract_python.utils import write_pages + + +def _read_page(doc: BytesIO, start: int, *, end: int) -> str: + doc.seek(start) + return doc.read(end - start).decode("utf-8") + + +_MD_DOC_0 = """ +# First page +content +
+# Second page +content +
+# Third page +content""" + +_MD_DOC_0_PAGE_0 = """ +# First page +content""" + +_MD_DOC_0_PAGE_1 = """ +# Second page +content""" + +_MD_DOC_0_PAGE_2 = """ +# Third page +content""" + + +@pytest.mark.parametrize( + ("pages", "page_sep", "expected_n_pages", "expected_page_contents"), + [ + ( + [_MD_DOC_0_PAGE_0, _MD_DOC_0_PAGE_1, _MD_DOC_0_PAGE_2], + '\n
\n', + 3, + [ + f'{_MD_DOC_0_PAGE_0}\n
\n', + f'{_MD_DOC_0_PAGE_1}\n
\n', + f"{_MD_DOC_0_PAGE_2}", + ], + ), + ( + [_MD_DOC_0_PAGE_0, _MD_DOC_0_PAGE_1, _MD_DOC_0_PAGE_2], + "\n\n", + 3, + [ + f"{_MD_DOC_0_PAGE_0}\n\n", + f"{_MD_DOC_0_PAGE_1}\n\n", + f"{_MD_DOC_0_PAGE_2}", + ], + ), + ( + [_MD_DOC_0_PAGE_0, _MD_DOC_0_PAGE_1, _MD_DOC_0_PAGE_2], + "", + 3, + [f"{_MD_DOC_0_PAGE_0}", f"{_MD_DOC_0_PAGE_1}", f"{_MD_DOC_0_PAGE_2}"], + ), + ], +) +def test_write_pages( + pages: list[str], + page_sep: str, + expected_n_pages: int, + expected_page_contents: list[str], +) -> None: + # Given + output = BytesIO() + # When + written_pages = write_pages(pages, page_sep, out=output) + # Then + assert written_pages.total == expected_n_pages + byte_ranges = written_pages.byte_ranges + assert len(byte_ranges) == len(expected_page_contents) + for byte_range, expected_content in zip( + byte_ranges, expected_page_contents, strict=True + ): + start, end = byte_range + page = _read_page(output, start, end=end) + assert page == expected_content