From b531afa101fd988befda034d51698ee050844665 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cle=CC=81ment=20Doumouro?= <clement.doumouro@gmail.com>
Date: Wed, 24 Jun 2026 15:21:48 +0200
Subject: [PATCH] feature(extract-core): output page byte ranges

---
 extract-core/extract_core/__init__.py      |  6 +-
 extract-core/extract_core/objects.py       | 24 ++++--
 extract-python/benches/compare.py          |  4 +-
 extract-python/extract_python/constants.py |  2 +-
 extract-python/extract_python/docling_.py  | 56 +++++++-------
 extract-python/extract_python/marker_.py   | 41 ++++-------
 extract-python/extract_python/miner_u.py   | 23 ++----
 extract-python/extract_python/utils.py     | 21 +++++-
 extract-python/tests/test_docling.py       |  2 +-
 extract-python/tests/test_marker.py        |  2 +-
 extract-python/tests/test_miner_u.py       |  2 +-
 extract-python/tests/test_utils.py         | 85 ++++++++++++++++++++++
 12 files changed, 178 insertions(+), 90 deletions(-)
 create mode 100644 extract-python/tests/test_utils.py

diff --git a/extract-core/extract_core/__init__.py b/extract-core/extract_core/__init__.py
index 1e16f3b..d5eda82 100644
--- a/extract-core/extract_core/__init__.py
+++ b/extract-core/extract_core/__init__.py
@@ -12,7 +12,8 @@
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
+    Pages,
+    Ranges,
     Result,
     Status,
     SupportedExt,
@@ -58,7 +59,8 @@
     "MinerUConfig",
     "MinerUPipelineConfig",
     "OutputFormat",
-    "PageIndexes",
+    "Pages",
     "Pipeline",
     "PipelineType",
+    "Ranges",
     "Result",
diff --git a/extract-core/extract_core/objects.py b/extract-core/extract_core/objects.py
index 7954bbc..eac8f67 100644
--- a/extract-core/extract_core/objects.py
+++ b/extract-core/extract_core/objects.py
@@ -3,6 +3,7 @@
 import traceback
 import uuid
 from abc import ABC
+from collections.abc import Sequence
 from enum import StrEnum
 from functools import cache
 from io import BytesIO
@@ -16,7 +17,7 @@
     no_enum_values_config,
     safe_copy,
 )
-from pydantic import AfterValidator, RootModel, TypeAdapter
+from pydantic import AfterValidator, Field, TypeAdapter
 from pydantic import BaseModel as _BaseModel
 
 logger = logging.getLogger(__name__)
@@ -203,18 +204,25 @@ def without_content(self) -> Self:
         return safe_copy(self, update={"content": None})
 
 
-class PageIndexes(RootModel[list[tuple[int, int]]]):
-    # Stores page end index
+Ranges = list[tuple[int, int]]
+
+
+class Pages(BaseModel):
+    total: int = 0  # number of pages
+    byte_ranges: Ranges = []  # half-open (start, end) byte offsets, one per page
+
     @classmethod
-    def from_page_end_indices(cls, lengths: list[int]) -> Self:
-        return [
-            ((lengths[p - 1] if p > 0 else 0), lengths[p]) for p in range(len(lengths))
-        ]
+    def from_pages_bytes_sizes(cls, sizes: Sequence[int]) -> Self:
+        offset, byte_ranges = 0, []
+        for size in sizes:
+            byte_ranges.append((offset, offset + size))
+            offset += size  # the next page starts where this one ends
+        return cls(total=len(sizes), byte_ranges=byte_ranges)
 
 
 class ConversionOutput(BaseModel):
     path: Path
-    pages: PageIndexes = []
+    pages: Pages = Field(default_factory=Pages)
 
 
 class MarkdownDoc(ConversionOutput):
diff --git a/extract-python/benches/compare.py b/extract-python/benches/compare.py
index 16cbd19..1b97f33 100644
--- a/extract-python/benches/compare.py
+++ b/extract-python/benches/compare.py
@@ -3,7 +3,7 @@

 import markdown2
 import pypdfium2
-from extract_core import BaseModel, OutputFormat, PageIndexes
+from extract_core import BaseModel, OutputFormat, Pages
 from extract_python.utils import chdir
 from html2image import Html2Image
 from PIL import Image, ImageDraw
@@ -140,7 +140,7 @@ def _scan_pages(
     root: Path, comparison: ComparisonItem
 ) -> list[dict[str, tuple[int, int]]]:
     all_pages = [
-        PageIndexes.model_validate_json(
+        Pages.model_validate_json(
             (root / compared / "artifacts" / "pages.json").read_text()
-        ).root
+        ).byte_ranges
         for compared in comparison.compared
diff --git a/extract-python/extract_python/constants.py b/extract-python/extract_python/constants.py
index 9a5cc9c..71c44ae 100644
--- a/extract-python/extract_python/constants.py
+++ b/extract-python/extract_python/constants.py
@@ -1,2 +1,2 @@
 ARTIFACTS = "artifacts"
-DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
+DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'
diff --git a/extract-python/extract_python/docling_.py b/extract-python/extract_python/docling_.py
index 2e50983..9d63401 100644
--- a/extract-python/extract_python/docling_.py
+++ b/extract-python/extract_python/docling_.py
@@ -23,7 +23,6 @@
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
@@ -34,7 +33,7 @@
 from pydantic_core.core_schema import SerializerFunctionWrapHandler
 
 from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
+from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
 
 logger = logging.getLogger(__name__)
 
@@ -115,39 +114,38 @@ def _to_markdown_doc(
         raise FileExistsError(f"directory {md_dir} already exists")
     # Let's avoid issue of duplicated input file names flattened top level
     md_filename = md_dir_name + OutputFormat.MARKDOWN
-    total_length = 0
-    n_pages = len(res.pages)
-
     with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
         tmp_dir = Path(td)
-        page_path = Path("page.md")
-        # We do a chdir to bypass a Docling bug which only allows to maintain relative
-        # image ref when saving the markdown to a relative path
-        with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
-            end_indices = []
-            for page_i in range(n_pages):
-                res.document.save_as_markdown(
-                    page_path,
-                    page_no=page_i + 1,
-                    image_mode=ImageRefMode.REFERENCED,
-                    artifacts_dir=Path(ARTIFACTS),
-                    **kwargs,
-                )
-                content = page_path.read_text()
-                if page_i > 0:
-                    content += "\n"
-                if page_i < n_pages - 1:
-                    content += page_sep
-                total_length += len(content)
-                end_indices.append(total_length)
-                f.write(content)
-                f.flush()
-                page_path.unlink()
+        md_path = tmp_dir / md_filename
+        current_page_path = tmp_dir / "page.md"
+        with chdir(tmp_dir):
+            # We do a chdir to bypass a Docling bug which only allows to maintain
+            # relative image ref when saving the markdown to a relative path
+            pages = _docling_pages_it(res, current_page_path, **kwargs)
+            with md_path.open("wb") as f:
+                pages = write_pages(pages, page_sep, f)
+        # Remove the scratch page file (absent for 0 pages) before the final move
+        current_page_path.unlink(missing_ok=True)
         shutil.move(tmp_dir, md_dir)
-    pages = PageIndexes.from_page_end_indices(end_indices)
     return MarkdownDoc(path=Path(md_dir_name), pages=pages)
 
 
+def _docling_pages_it(
+    res: ConversionResult, output_path: Path, **kwargs
+) -> Iterable[str]:
+    # Lazily yields each page's markdown, reusing output_path as a scratch file
+    for page_no in range(1, len(res.pages) + 1):
+        res.document.save_as_markdown(
+            output_path,
+            page_no=page_no,
+            image_mode=ImageRefMode.REFERENCED,
+            artifacts_dir=Path(ARTIFACTS),
+            **kwargs,
+        )
+        # write_pages re-encodes as UTF-8, so don't depend on the locale here
+        yield output_path.read_text(encoding="utf-8")
+
+
 class SerializableFormatOptions(DoclingFormatOption):
     # Utility class to serialize Python format options into a JSON which can be
     # correctly deserialized into a docling FormatOption
diff --git a/extract-python/extract_python/marker_.py b/extract-python/extract_python/marker_.py
index 6200c6b..162faa0 100644
--- a/extract-python/extract_python/marker_.py
+++ b/extract-python/extract_python/marker_.py
@@ -9,15 +9,14 @@
     InputDoc,
     MarkdownDoc,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
     Status,
 )
 
-from .constants import ARTIFACTS
-from .utils import path_to_artifacts_dirname, report_recoverable_errors
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
+from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
 
 if TYPE_CHECKING:
     from marker.converters.pdf import PdfConverter
@@ -63,7 +62,9 @@ async def _process_doc(
     content, _, images = text_from_rendered(rendered)
     match output_format:
         case OutputFormat.MARKDOWN:
-            output = _to_markdown_doc(doc, content, images, output_path)
+            output = _to_markdown_doc(
+                doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
+            )
         case _:
             raise NotImplementedError(f"unsupported output format {output_format}")
     input_doc = doc.without_content()
@@ -71,7 +72,12 @@ async def _process_doc(
 
 
 def _to_markdown_doc(
-    input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
+    input_doc: InputDoc,
+    content: str,
+    images: dict[str, "Image"],
+    output_path: Path,
+    *,
+    page_sep: str = DEFAULT_MD_PAGE_SEP,
 ) -> MarkdownDoc:
     from marker.renderers.markdown import MarkdownRenderer  # noqa: PLC0415
 
@@ -85,24 +91,9 @@ def _to_markdown_doc(
         im.save(artifacts_dir / im_name)
     del images
     gc.collect()
-    page_sep = MarkdownRenderer.page_separator
-    content = content.split(page_sep)
-    n_pages = len(content)
-    md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
-        OutputFormat.MARKDOWN.value
-    )
-    total_length = 0
-    end_indices = []
-    with md_path.open("w", encoding="utf-8") as f:
-        for page_i, page_content in enumerate(content):
-            content = page_content
-            if page_i > 0:
-                content += "\n"
-            if page_i < n_pages - 1:
-                content += page_sep
-            total_length += len(content)
-            end_indices.append(total_length)
-            f.write(content)
-            f.flush()
-    pages = PageIndexes.from_page_end_indices(end_indices)
+    pages = content.split(MarkdownRenderer.page_separator)
+    md_path = output_path / md_dir_name / md_dir_name
+    md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
+    with md_path.open("wb") as f:
+        pages = write_pages(pages, page_sep, f)
     return MarkdownDoc(path=Path(md_dir_name), pages=pages)
diff --git a/extract-python/extract_python/miner_u.py b/extract-python/extract_python/miner_u.py
index 73e8879..399253c 100644
--- a/extract-python/extract_python/miner_u.py
+++ b/extract-python/extract_python/miner_u.py
@@ -12,7 +12,6 @@
     MinerUBackend,
     MinerUPipelineConfig,
     OutputFormat,
-    PageIndexes,
     Pipeline,
     PipelineType,
     Result,
@@ -20,7 +19,7 @@
 )
 
 from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .utils import path_to_artifacts_dirname, reset_env
+from .utils import path_to_artifacts_dirname, reset_env, write_pages
 
 _MINER_U_CONVERSION_ERRORS = tuple()
 MDMakeFunction = Callable[[list, str, str], str | None]
@@ -148,21 +147,9 @@ def _dump_md_content(
 
     if md_make_mode is None:
         md_make_mode = MakeMode.MM_MD
-    total_length = 0
-    end_indices = []
-    with md_path.open("w") as f:
-        n_pages = len(pdf_info)
-        for page_i, page in enumerate(pdf_info):
-            content = md_make_fn([page], md_make_mode, str(im_dir))
-            if page_i > 0:
-                content += "\n"
-            if page_i < n_pages - 1:
-                content += page_sep
-            total_length += len(content)
-            end_indices.append(total_length)
-            f.write(content)
-            f.flush()
-    end_indices = PageIndexes.from_page_end_indices(end_indices)
+    pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
+    with md_path.open("wb") as f:
+        pages = write_pages(pages, page_sep, f)
     output_path = md_path.parent.relative_to(output_path)
-    output = ConversionOutput(path=output_path, pages=end_indices)
+    output = ConversionOutput(path=output_path, pages=pages)
     return output
diff --git a/extract-python/extract_python/utils.py b/extract-python/extract_python/utils.py
index 6f8ee2a..74a977c 100644
--- a/extract-python/extract_python/utils.py
+++ b/extract-python/extract_python/utils.py
@@ -5,9 +5,9 @@
 from functools import wraps
 from itertools import tee
 from pathlib import Path, PurePath
-from typing import Protocol, TypeVar
+from typing import BinaryIO, Protocol, TypeVar
 
-from extract_core import Error, InputDoc, Result, Status
+from extract_core import Error, InputDoc, Pages, Result, Status
 
 R = TypeVar("R")
 In = TypeVar("In")
@@ -73,3 +73,20 @@ def reset_env() -> Generator[None, None, None]:
     finally:
         os.environ.clear()
         os.environ.update(old_env)
+
+
+def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
+    """Write pages to out as UTF-8, page_sep after all but the last; return ranges."""
+    missing = object()  # dedicated marker: None must never mean "no page"
+    pages = iter(pages)
+    sizes = []
+    page = next(pages, missing)
+    while page is not missing:
+        following = next(pages, missing)  # look ahead to detect the last page
+        if following is not missing:
+            page += page_sep
+        data = page.encode("utf-8")
+        out.write(data)
+        sizes.append(len(data))
+        page = following
+    return Pages.from_pages_bytes_sizes(sizes)
diff --git a/extract-python/tests/test_docling.py b/extract-python/tests/test_docling.py
index 8632458..b233276 100644
--- a/extract-python/tests/test_docling.py
+++ b/extract-python/tests/test_docling.py
@@ -51,7 +51,7 @@ async def test_docling_pdf_to_markdown(
         assert (output_path / p).is_dir()
         assert (output_path / p / p.name).with_suffix(".md").exists()
         assert any((output_path / p).glob("artifacts/*.png"))
-    assert all(r.output.pages for r in res)
+    assert all(r.output.pages.byte_ranges for r in res)
     assert not any(r.errors for r in res)
     input_path = [r.input.path for r in res]
     expected_input_path = [
diff --git a/extract-python/tests/test_marker.py b/extract-python/tests/test_marker.py
index 7ca52a1..55a142f 100644
--- a/extract-python/tests/test_marker.py
+++ b/extract-python/tests/test_marker.py
@@ -45,7 +45,7 @@ async def test_marker_pdf_to_markdown(
         assert (output_path / p).is_dir()
         assert (output_path / p / p.name).with_suffix(".md").exists()
         assert any((output_path / p).glob("artifacts/*.jpeg"))
-    assert all(r.output.pages for r in res)
+    assert all(r.output.pages.byte_ranges for r in res)
     assert not any(r.errors for r in res)
     input_path = [r.input.path for r in res]
     expected_path = [
diff --git a/extract-python/tests/test_miner_u.py b/extract-python/tests/test_miner_u.py
index 908b326..2a71ce7 100644
--- a/extract-python/tests/test_miner_u.py
+++ b/extract-python/tests/test_miner_u.py
@@ -46,7 +46,7 @@ async def test_miner_u_pdf_to_markdown(
         assert (output_path / p).is_dir()
         assert (output_path / p / p.name).with_suffix(".md").exists()
         assert any((output_path / p).glob("artifacts/*.jpg"))
-    assert all(r.output.pages for r in res)
+    assert all(r.output.pages.byte_ranges for r in res)
     assert not any(r.errors for r in res)
     input_path = [r.input.path for r in res]
     expected_path = [
diff --git a/extract-python/tests/test_utils.py b/extract-python/tests/test_utils.py
new file mode 100644
index 0000000..f05e008
--- /dev/null
+++ b/extract-python/tests/test_utils.py
@@ -0,0 +1,85 @@
+from io import BytesIO
+
+import pytest
+from extract_python.utils import write_pages
+
+
+def _read_page(doc: BytesIO, start: int, *, end: int) -> str:
+    doc.seek(start)
+    return doc.read(end - start).decode("utf-8")
+
+
+_MD_DOC_0 = """
+# First page
+content
+<div style="page-break-after: always;"></div>
+# Second page
+content: café 日本語
+<div style="page-break-after: always;"></div>
+# Third page
+content"""
+
+_MD_DOC_0_PAGE_0 = """
+# First page
+content"""
+
+_MD_DOC_0_PAGE_1 = """
+# Second page
+content: café 日本語"""
+
+_MD_DOC_0_PAGE_2 = """
+# Third page
+content"""
+
+
+@pytest.mark.parametrize(
+    ("pages", "page_sep", "expected_n_pages", "expected_page_contents"),
+    [
+        (
+            [_MD_DOC_0_PAGE_0, _MD_DOC_0_PAGE_1, _MD_DOC_0_PAGE_2],
+            '\n<div style="page-break-after: always;"></div>\n',
+            3,
+            [
+                f'{_MD_DOC_0_PAGE_0}\n<div style="page-break-after: always;"></div>\n',
+                f'{_MD_DOC_0_PAGE_1}\n<div style="page-break-after: always;"></div>\n',
+                f"{_MD_DOC_0_PAGE_2}",
+            ],
+        ),
+        (
+            [_MD_DOC_0_PAGE_0, _MD_DOC_0_PAGE_1, _MD_DOC_0_PAGE_2],
+            "\n\n",
+            3,
+            [
+                f"{_MD_DOC_0_PAGE_0}\n\n",
+                f"{_MD_DOC_0_PAGE_1}\n\n",
+                f"{_MD_DOC_0_PAGE_2}",
+            ],
+        ),
+        (
+            [_MD_DOC_0_PAGE_0, _MD_DOC_0_PAGE_1, _MD_DOC_0_PAGE_2],
+            "",
+            3,
+            [f"{_MD_DOC_0_PAGE_0}", f"{_MD_DOC_0_PAGE_1}", f"{_MD_DOC_0_PAGE_2}"],
+        ),
+    ],
+)
+def test_write_pages(
+    pages: list[str],
+    page_sep: str,
+    expected_n_pages: int,
+    expected_page_contents: list[str],
+) -> None:
+    # Given
+    output = BytesIO()
+    # When
+    written_pages = write_pages(pages, page_sep, out=output)
+    # Then
+    assert written_pages.total == expected_n_pages
+    byte_ranges = written_pages.byte_ranges
+    assert len(byte_ranges) == len(expected_page_contents)
+    for byte_range, expected_content in zip(
+        byte_ranges, expected_page_contents, strict=True
+    ):
+        start, end = byte_range
+        page = _read_page(output, start, end=end)
+        assert page == expected_content