-
Notifications
You must be signed in to change notification settings - Fork 0
feature(extract-core): output page byte ranges #12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,7 +3,7 @@ | |
|
|
||
| import markdown2 | ||
| import pypdfium2 | ||
| from extract_core import BaseModel, OutputFormat, PageIndexes | ||
| from extract_core import BaseModel, OutputFormat, PageRanges | ||
| from extract_python.utils import chdir | ||
| from html2image import Html2Image | ||
| from PIL import Image, ImageDraw | ||
|
|
@@ -140,7 +140,7 @@ def _scan_pages( | |
| root: Path, comparison: ComparisonItem | ||
| ) -> list[dict[str, tuple[int, int]]]: | ||
| all_pages = [ | ||
| PageIndexes.model_validate_json( | ||
| PageRanges.model_validate_json( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. important: same here, we need: |
||
| (root / compared / "artifacts" / "pages.json").read_text() | ||
| ).root | ||
| for compared in comparison.compared | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,2 +1,2 @@ | ||
| ARTIFACTS = "artifacts" | ||
| DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>' | ||
| DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n' |
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -23,7 +23,6 @@ | |||||
| InputDoc, | ||||||
| MarkdownDoc, | ||||||
| OutputFormat, | ||||||
| PageIndexes, | ||||||
| Pipeline, | ||||||
| PipelineType, | ||||||
| Result, | ||||||
|
|
@@ -34,7 +33,7 @@ | |||||
| from pydantic_core.core_schema import SerializerFunctionWrapHandler | ||||||
|
|
||||||
| from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP | ||||||
| from .utils import chdir, map_and_preserve, path_to_artifacts_dirname | ||||||
| from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages | ||||||
|
|
||||||
| logger = logging.getLogger(__name__) | ||||||
|
|
||||||
|
|
@@ -115,39 +114,38 @@ def _to_markdown_doc( | |||||
| raise FileExistsError(f"directory {md_dir} already exists") | ||||||
| # Let's avoid issue of duplicated input file names flattened top level | ||||||
| md_filename = md_dir_name + OutputFormat.MARKDOWN | ||||||
| total_length = 0 | ||||||
| n_pages = len(res.pages) | ||||||
|
|
||||||
| with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: | ||||||
| tmp_dir = Path(td) | ||||||
| page_path = Path("page.md") | ||||||
| # We do a chdir to bypass a Docling bug which only allows to maintain relative | ||||||
| # image ref when saving the markdown to a relative path | ||||||
| with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir): | ||||||
| end_indices = [] | ||||||
| for page_i in range(n_pages): | ||||||
| res.document.save_as_markdown( | ||||||
| page_path, | ||||||
| page_no=page_i + 1, | ||||||
| image_mode=ImageRefMode.REFERENCED, | ||||||
| artifacts_dir=Path(ARTIFACTS), | ||||||
| **kwargs, | ||||||
| ) | ||||||
| content = page_path.read_text() | ||||||
| if page_i > 0: | ||||||
| content += "\n" | ||||||
| if page_i < n_pages - 1: | ||||||
| content += page_sep | ||||||
| total_length += len(content) | ||||||
| end_indices.append(total_length) | ||||||
| f.write(content) | ||||||
| f.flush() | ||||||
| page_path.unlink() | ||||||
| md_path = tmp_dir / md_filename | ||||||
| current_page_path = tmp_dir / "page.md" | ||||||
| with chdir(tmp_dir): | ||||||
| # We do a chdir to bypass a Docling bug which only allows to maintain | ||||||
| # relative image ref when saving the markdown to a relative path | ||||||
| pages = _docling_pages_it(res, current_page_path, **kwargs) | ||||||
| with md_path.open("wb") as f: | ||||||
| pages = write_pages(pages, page_sep, f) | ||||||
| # Clean up the tmp page file before move everything to the end destination | ||||||
| current_page_path.unlink() | ||||||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. hint: this will crash if the current page doesn't exist. This can be adjusted:
Suggested change
|
||||||
| shutil.move(tmp_dir, md_dir) | ||||||
| pages = PageIndexes.from_page_end_indices(end_indices) | ||||||
| return MarkdownDoc(path=Path(md_dir_name), pages=pages) | ||||||
|
|
||||||
|
|
||||||
| def _docling_pages_it( | ||||||
| res: ConversionResult, output_path: Path, **kwargs | ||||||
| ) -> Iterable[str]: | ||||||
| n_pages = len(res.pages) | ||||||
| for page_i in range(n_pages): | ||||||
| res.document.save_as_markdown( | ||||||
| output_path, | ||||||
| page_no=page_i + 1, | ||||||
| image_mode=ImageRefMode.REFERENCED, | ||||||
| artifacts_dir=Path(ARTIFACTS), | ||||||
| **kwargs, | ||||||
| ) | ||||||
| content = output_path.read_text() | ||||||
| yield content | ||||||
|
|
||||||
|
|
||||||
| class SerializableFormatOptions(DoclingFormatOption): | ||||||
| # Utility class to serialize Python format options into a JSON which can be | ||||||
| # correctly deserialized into a docling FormatOption | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,9 +5,9 @@ | |
| from functools import wraps | ||
| from itertools import tee | ||
| from pathlib import Path, PurePath | ||
| from typing import Protocol, TypeVar | ||
| from typing import BinaryIO, Protocol, TypeVar | ||
|
|
||
| from extract_core import Error, InputDoc, Result, Status | ||
| from extract_core import Error, InputDoc, Pages, Result, Status | ||
|
|
||
| R = TypeVar("R") | ||
| In = TypeVar("In") | ||
|
|
@@ -73,3 +73,20 @@ def reset_env() -> Generator[None, None, None]: | |
| finally: | ||
| os.environ.clear() | ||
| os.environ.update(old_env) | ||
|
|
||
|
|
||
| def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages: | ||
| pages = iter(pages) | ||
| next_page = None | ||
| pages_byte_sizes = [] | ||
| sentinel = object() | ||
| while True: | ||
| content = next(pages, sentinel) if next_page is None else next_page | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. hint: You can rewrite with a sentinel, something like:
```python
def write_pages(pages: Iterable[str | None], page_sep: str, out: BinaryIO) -> Pages:
    it = iter(pages)
    sentinel = object()
    sizes = []
    prev = next(it, sentinel)
    while prev is not sentinel:
        cur = next(it, sentinel)
        content = prev or ""  # This is the trick
        if cur is not sentinel:
            content += page_sep
        sizes.append(out.write(content.encode()))
        prev = cur
    return Pages.from_pages_bytes_sizes(sizes)
```
|
||
| if content is sentinel: | ||
| break | ||
| next_page = next(pages, sentinel) | ||
| if next_page is not sentinel: | ||
| content += page_sep | ||
| content = content.encode() | ||
| pages_byte_sizes.append(out.write(content)) | ||
| return Pages.from_pages_bytes_sizes(pages_byte_sizes) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
important: I think this `PageRanges` doesn't exist anymore. If I understood correctly, it's replaced by `Page`.