Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions extract-core/extract_core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
InputDoc,
MarkdownDoc,
OutputFormat,
PageIndexes,
Pages,
Ranges,
Result,
Status,
SupportedExt,
Expand Down Expand Up @@ -58,7 +59,8 @@
"MinerUConfig",
"MinerUPipelineConfig",
"OutputFormat",
"PageIndexes",
"Ranges",
"Pages",
"Pipeline",
"PipelineType",
"Result",
Expand Down
24 changes: 16 additions & 8 deletions extract-core/extract_core/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import traceback
import uuid
from abc import ABC
from collections.abc import Sequence
from enum import StrEnum
from functools import cache
from io import BytesIO
Expand All @@ -16,7 +17,7 @@
no_enum_values_config,
safe_copy,
)
from pydantic import AfterValidator, RootModel, TypeAdapter
from pydantic import AfterValidator, Field, TypeAdapter
from pydantic import BaseModel as _BaseModel

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -203,18 +204,25 @@ def without_content(self) -> Self:
return safe_copy(self, update={"content": None})


class PageIndexes(RootModel[list[tuple[int, int]]]):
# Stores page end index
Ranges = list[tuple[int, int]]


class Pages(BaseModel):
total: int = 0
byte_ranges: Ranges = []

@classmethod
def from_page_end_indices(cls, lengths: list[int]) -> Self:
return [
((lengths[p - 1] if p > 0 else 0), lengths[p]) for p in range(len(lengths))
]
def from_pages_bytes_sizes(cls, sizes: Sequence[int]) -> Self:
bytes_ranges = []
for p, end in enumerate(sizes):
start = 0 if p == 0 else bytes_ranges[-1][1]
bytes_ranges.append((start, start + end))
return cls(total=len(sizes), byte_ranges=bytes_ranges)


class ConversionOutput(BaseModel):
path: Path
pages: PageIndexes = []
pages: Pages = Field(default_factory=Pages)


class MarkdownDoc(ConversionOutput):
Expand Down
4 changes: 2 additions & 2 deletions extract-python/benches/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import markdown2
import pypdfium2
from extract_core import BaseModel, OutputFormat, PageIndexes
from extract_core import BaseModel, OutputFormat, PageRanges

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

important: I think `PageRanges` doesn't exist anymore. If I understood correctly, it has been replaced by `Pages`.

from extract_python.utils import chdir
from html2image import Html2Image
from PIL import Image, ImageDraw
Expand Down Expand Up @@ -140,7 +140,7 @@ def _scan_pages(
root: Path, comparison: ComparisonItem
) -> list[dict[str, tuple[int, int]]]:
all_pages = [
PageIndexes.model_validate_json(
PageRanges.model_validate_json(

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

important: same here, we need:

Pages.model_validate_json(
    (root / compared / "artifacts" / "pages.json").read_text()
).byte_ranges

(root / compared / "artifacts" / "pages.json").read_text()
).root
for compared in comparison.compared
Expand Down
2 changes: 1 addition & 1 deletion extract-python/extract_python/constants.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
ARTIFACTS = "artifacts"
DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'
56 changes: 27 additions & 29 deletions extract-python/extract_python/docling_.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
InputDoc,
MarkdownDoc,
OutputFormat,
PageIndexes,
Pipeline,
PipelineType,
Result,
Expand All @@ -34,7 +33,7 @@
from pydantic_core.core_schema import SerializerFunctionWrapHandler

from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -115,39 +114,38 @@ def _to_markdown_doc(
raise FileExistsError(f"directory {md_dir} already exists")
# Let's avoid issue of duplicated input file names flattened top level
md_filename = md_dir_name + OutputFormat.MARKDOWN
total_length = 0
n_pages = len(res.pages)

with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
tmp_dir = Path(td)
page_path = Path("page.md")
# We do a chdir to bypass a Docling bug which only allows to maintain relative
# image ref when saving the markdown to a relative path
with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
end_indices = []
for page_i in range(n_pages):
res.document.save_as_markdown(
page_path,
page_no=page_i + 1,
image_mode=ImageRefMode.REFERENCED,
artifacts_dir=Path(ARTIFACTS),
**kwargs,
)
content = page_path.read_text()
if page_i > 0:
content += "\n"
if page_i < n_pages - 1:
content += page_sep
total_length += len(content)
end_indices.append(total_length)
f.write(content)
f.flush()
page_path.unlink()
md_path = tmp_dir / md_filename
current_page_path = tmp_dir / "page.md"
with chdir(tmp_dir):
# We do a chdir to bypass a Docling bug which only allows to maintain
# relative image ref when saving the markdown to a relative path
pages = _docling_pages_it(res, current_page_path, **kwargs)
with md_path.open("wb") as f:
pages = write_pages(pages, page_sep, f)
# Clean up the tmp page file before move everything to the end destination
current_page_path.unlink()

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

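hint: this will raise `FileNotFoundError` if the temporary page file doesn't exist (for example when the document has no pages, so nothing was ever written to it). This can be adjusted: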

Suggested change
current_page_path.unlink()
current_page_path.unlink(missing_ok=True)

shutil.move(tmp_dir, md_dir)
pages = PageIndexes.from_page_end_indices(end_indices)
return MarkdownDoc(path=Path(md_dir_name), pages=pages)


def _docling_pages_it(
    res: ConversionResult, output_path: Path, **kwargs
) -> Iterable[str]:
    """Lazily yield the Markdown content of each page of ``res``, in page order.

    Each page is exported by Docling to ``output_path`` (overwritten on every
    iteration) and then read back, so only one page is rendered at a time.
    The file at ``output_path`` is left in place once iteration ends; removing
    it is up to the caller.

    Args:
        res: Docling conversion result whose ``pages`` and ``document`` are used.
        output_path: Scratch file each page is exported to before being read.
        **kwargs: Extra keyword arguments forwarded to ``save_as_markdown``.

    Yields:
        The Markdown text of each page, starting with page 1.
    """
    # Docling page numbers are 1-based.
    for page_no in range(1, len(res.pages) + 1):
        res.document.save_as_markdown(
            output_path,
            page_no=page_no,
            image_mode=ImageRefMode.REFERENCED,
            artifacts_dir=Path(ARTIFACTS),
            **kwargs,
        )
        # Decode explicitly instead of relying on the locale's default encoding.
        # NOTE(review): assumes Docling writes the export as UTF-8 - confirm.
        yield output_path.read_text(encoding="utf-8")


class SerializableFormatOptions(DoclingFormatOption):
# Utility class to serialize Python format options into a JSON which can be
# correctly deserialized into a docling FormatOption
Expand Down
41 changes: 16 additions & 25 deletions extract-python/extract_python/marker_.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@
InputDoc,
MarkdownDoc,
OutputFormat,
PageIndexes,
Pipeline,
PipelineType,
Result,
Status,
)

from .constants import ARTIFACTS
from .utils import path_to_artifacts_dirname, report_recoverable_errors
from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages

if TYPE_CHECKING:
from marker.converters.pdf import PdfConverter
Expand Down Expand Up @@ -63,15 +62,22 @@ async def _process_doc(
content, _, images = text_from_rendered(rendered)
match output_format:
case OutputFormat.MARKDOWN:
output = _to_markdown_doc(doc, content, images, output_path)
output = _to_markdown_doc(
doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
)
case _:
raise NotImplementedError(f"unsupported output format {output_format}")
input_doc = doc.without_content()
return Result(input=input_doc, status=Status.SUCCESS, output=output)


def _to_markdown_doc(
input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
input_doc: InputDoc,
content: str,
images: dict[str, "Image"],
output_path: Path,
*,
page_sep: str = DEFAULT_MD_PAGE_SEP,
) -> MarkdownDoc:
from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415

Expand All @@ -85,24 +91,9 @@ def _to_markdown_doc(
im.save(artifacts_dir / im_name)
del images
gc.collect()
page_sep = MarkdownRenderer.page_separator
content = content.split(page_sep)
n_pages = len(content)
md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
OutputFormat.MARKDOWN.value
)
total_length = 0
end_indices = []
with md_path.open("w", encoding="utf-8") as f:
for page_i, page_content in enumerate(content):
content = page_content
if page_i > 0:
content += "\n"
if page_i < n_pages - 1:
content += page_sep
total_length += len(content)
end_indices.append(total_length)
f.write(content)
f.flush()
pages = PageIndexes.from_page_end_indices(end_indices)
pages = content.split(MarkdownRenderer.page_separator)
md_path = output_path / md_dir_name / md_dir_name
md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
with md_path.open("wb") as f:
pages = write_pages(pages, page_sep, f)
return MarkdownDoc(path=Path(md_dir_name), pages=pages)
23 changes: 5 additions & 18 deletions extract-python/extract_python/miner_u.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,14 @@
MinerUBackend,
MinerUPipelineConfig,
OutputFormat,
PageIndexes,
Pipeline,
PipelineType,
Result,
Status,
)

from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
from .utils import path_to_artifacts_dirname, reset_env
from .utils import path_to_artifacts_dirname, reset_env, write_pages

_MINER_U_CONVERSION_ERRORS = tuple()
MDMakeFunction = Callable[[list, str, str], str | None]
Expand Down Expand Up @@ -148,21 +147,9 @@ def _dump_md_content(

if md_make_mode is None:
md_make_mode = MakeMode.MM_MD
total_length = 0
end_indices = []
with md_path.open("w") as f:
n_pages = len(pdf_info)
for page_i, page in enumerate(pdf_info):
content = md_make_fn([page], md_make_mode, str(im_dir))
if page_i > 0:
content += "\n"
if page_i < n_pages - 1:
content += page_sep
total_length += len(content)
end_indices.append(total_length)
f.write(content)
f.flush()
end_indices = PageIndexes.from_page_end_indices(end_indices)
pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
with md_path.open("wb") as f:
pages = write_pages(pages, page_sep, f)
output_path = md_path.parent.relative_to(output_path)
output = ConversionOutput(path=output_path, pages=end_indices)
output = ConversionOutput(path=output_path, pages=pages)
return output
21 changes: 19 additions & 2 deletions extract-python/extract_python/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
from functools import wraps
from itertools import tee
from pathlib import Path, PurePath
from typing import Protocol, TypeVar
from typing import BinaryIO, Protocol, TypeVar

from extract_core import Error, InputDoc, Result, Status
from extract_core import Error, InputDoc, Pages, Result, Status

R = TypeVar("R")
In = TypeVar("In")
Expand Down Expand Up @@ -73,3 +73,20 @@ def reset_env() -> Generator[None, None, None]:
finally:
os.environ.clear()
os.environ.update(old_env)


def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
pages = iter(pages)
next_page = None
pages_byte_sizes = []
sentinel = object()
while True:
content = next(pages, sentinel) if next_page is None else next_page

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

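hint: `write_pages` uses `next_page is None` as the "no lookahead yet" marker. After the first iteration, `next_page` always holds either a page or the sentinel, so the `next(pages, sentinel)` branch can only be re-entered if a page value is literally `None`. That is a realistic case, since the MinerU make function is typed as returning `str | None`.

You can rewrite it so that the sentinel alone drives the lookahead, something like: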

def write_pages(pages: Iterable[str | None], page_sep: str, out: BinaryIO) -> Pages:
    it = iter(pages)
    sentinel = object()
    sizes = []
    prev = next(it, sentinel)
    while prev is not sentinel:
        cur = next(it, sentinel)
        content = prev or "" # This is the trick
        if cur is not sentinel:
            content += page_sep
        sizes.append(out.write(content.encode()))
        prev = cur
    return Pages.from_pages_bytes_sizes(sizes)

if content is sentinel:
break
next_page = next(pages, sentinel)
if next_page is not sentinel:
content += page_sep
content = content.encode()
pages_byte_sizes.append(out.write(content))
return Pages.from_pages_bytes_sizes(pages_byte_sizes)
2 changes: 1 addition & 1 deletion extract-python/tests/test_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ async def test_docling_pdf_to_markdown(
assert (output_path / p).is_dir()
assert (output_path / p / p.name).with_suffix(".md").exists()
assert any((output_path / p).glob("artifacts/*.png"))
assert all(r.output.pages for r in res)
assert all(r.output.pages.byte_ranges for r in res)
assert not any(r.errors for r in res)
input_path = [r.input.path for r in res]
expected_input_path = [
Expand Down
2 changes: 1 addition & 1 deletion extract-python/tests/test_marker.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ async def test_marker_pdf_to_markdown(
assert (output_path / p).is_dir()
assert (output_path / p / p.name).with_suffix(".md").exists()
assert any((output_path / p).glob("artifacts/*.jpeg"))
assert all(r.output.pages for r in res)
assert all(r.output.pages.byte_ranges for r in res)
assert not any(r.errors for r in res)
input_path = [r.input.path for r in res]
expected_path = [
Expand Down
2 changes: 1 addition & 1 deletion extract-python/tests/test_miner_u.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ async def test_miner_u_pdf_to_markdown(
assert (output_path / p).is_dir()
assert (output_path / p / p.name).with_suffix(".md").exists()
assert any((output_path / p).glob("artifacts/*.jpg"))
assert all(r.output.pages for r in res)
assert all(r.output.pages.byte_ranges for r in res)
assert not any(r.errors for r in res)
input_path = [r.input.path for r in res]
expected_path = [
Expand Down
Loading