-
Notifications
You must be signed in to change notification settings - Fork 774
Multimodal PDF support #1047
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Multimodal PDF support #1047
Changes from all commits
65b7a07
fa4cb94
3dbc070
4b0d848
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
import os | ||
|
||
import pymupdf | ||
from paperqa.types import ParsedMetadata, ParsedText | ||
from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText | ||
from paperqa.utils import ImpossibleParsingError | ||
from paperqa.version import __version__ as pqa_version | ||
|
||
|
@@ -16,18 +16,61 @@ def setup_pymupdf_python_logging() -> None: | |
|
||
|
||
BLOCK_TEXT_INDEX = 4 | ||
# Attributes of pymupdf.Pixmap that contain useful metadata | ||
PYMUPDF_PIXMAP_ATTRS = { | ||
"alpha", | ||
# YAGNI on "digest" because it's not JSON serializable | ||
"height", | ||
"irect", | ||
"is_monochrome", | ||
"is_unicolor", | ||
"n", | ||
"size", | ||
"stride", | ||
"width", | ||
"x", | ||
"xres", | ||
"y", | ||
"yres", | ||
} | ||
|
||
|
||
def parse_pdf_to_pages( | ||
path: str | os.PathLike, | ||
page_size_limit: int | None = None, | ||
use_block_parsing: bool = False, | ||
parse_media: bool = True, | ||
full_page: bool = False, | ||
image_cluster_tolerance: float | tuple[float, float] = 25, | ||
image_dpi: float | None = 150, | ||
**_, | ||
) -> ParsedText: | ||
"""Parse a PDF. | ||
|
||
Args: | ||
path: Path to the PDF file to parse. | ||
page_size_limit: Sensible character limit for one page's text, | ||
used to catch bad PDF reads. | ||
use_block_parsing: Opt-in flag to parse text block-wise. | ||
parse_media: Flag to also parse media (e.g. images, tables). | ||
full_page: Set True to screenshot the entire page as one image, | ||
instead of parsing individual images or tables. | ||
image_cluster_tolerance: Tolerance (points) passed to `Page.cluster_drawings`. | ||
Can be a single value to apply to both X and Y directions, | ||
or a two-tuple to specify X and Y directions separately. | ||
The default was chosen to perform well on image extraction from LitQA2 PDFs. | ||
image_dpi: Dots per inch for images captured from the PDF. | ||
**_: Thrown away kwargs. | ||
""" | ||
x_tol, y_tol = ( | ||
image_cluster_tolerance | ||
if isinstance(image_cluster_tolerance, tuple) | ||
else (image_cluster_tolerance, image_cluster_tolerance) | ||
) | ||
|
||
with pymupdf.open(path) as file: | ||
pages: dict[str, str] = {} | ||
total_length = 0 | ||
content: dict[str, str | tuple[str, list[ParsedMedia]]] = {} | ||
total_length = count_media = 0 | ||
|
||
for i in range(file.page_count): | ||
try: | ||
|
@@ -63,13 +106,60 @@ def parse_pdf_to_pages( | |
f" long, which exceeds the {page_size_limit} char limit for the PDF" | ||
f" at path {path}." | ||
) | ||
pages[str(i + 1)] = text | ||
media: list[ParsedMedia] = [] | ||
if parse_media: | ||
if full_page: # Capture the entire page as one image | ||
pix = page.get_pixmap(dpi=image_dpi) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we add some error handling here, so that an ImpossibleParsingError is raised if a bad image is hit? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We already protect [inline code reference lost in extraction]. I haven't seen a [inline code reference lost in extraction; remainder of the reply is truncated] |
||
media.append( | ||
ParsedMedia( | ||
index=0, | ||
data=pix.tobytes(), | ||
info={"type": "screenshot"} | ||
| {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS}, | ||
) | ||
) | ||
else: | ||
# Capture drawings/figures | ||
for box_i, box in enumerate( | ||
page.cluster_drawings( | ||
drawings=page.get_drawings(), | ||
x_tolerance=x_tol, | ||
y_tolerance=y_tol, | ||
) | ||
): | ||
pix = page.get_pixmap(clip=box, dpi=image_dpi) | ||
media.append( | ||
ParsedMedia( | ||
index=box_i, | ||
data=pix.tobytes(), | ||
info={"bbox": tuple(box), "type": "drawing"} | ||
| {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS}, | ||
) | ||
) | ||
|
||
# Capture tables | ||
for table_i, table in enumerate(t for t in page.find_tables()): | ||
pix = page.get_pixmap(clip=table.bbox, dpi=image_dpi) | ||
media.append( | ||
ParsedMedia( | ||
index=table_i, | ||
data=pix.tobytes(), | ||
text=table.to_markdown().strip(), | ||
info={"bbox": tuple(table.bbox), "type": "table"} | ||
| {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS}, | ||
) | ||
) | ||
content[str(i + 1)] = text, media | ||
else: | ||
content[str(i + 1)] = text | ||
total_length += len(text) | ||
count_media += len(media) | ||
|
||
metadata = ParsedMetadata( | ||
parsing_libraries=[f"pymupdf ({pymupdf.__version__})"], | ||
parsing_libraries=[f"{pymupdf.__name__} ({pymupdf.__version__})"], | ||
paperqa_version=pqa_version, | ||
total_parsed_text_length=total_length, | ||
count_parsed_media=count_media, | ||
parse_type="pdf", | ||
) | ||
return ParsedText(content=pages, metadata=metadata) | ||
return ParsedText(content=content, metadata=metadata) |
Uh oh!
There was an error while loading. Please reload this page.