24 changes: 24 additions & 0 deletions README.md
@@ -35,6 +35,7 @@ question answering, summarization, and contradiction detection.
- [Local Embedding Models (Sentence Transformers)](#local-embedding-models-sentence-transformers)
- [Adjusting number of sources](#adjusting-number-of-sources)
- [Using Code or HTML](#using-code-or-html)
- [Multimodal Support](#multimodal-support)
- [Using External DB/Vector DB and Caching](#using-external-dbvector-db-and-caching)
- [Creating Index](#creating-index)
- [Manifest Files](#manifest-files)
@@ -726,6 +727,28 @@ session = await docs.aquery("Where is the search bar in the header defined?")
print(session)
```

### Multimodal Support

Multimodal support centers on:

- Standalone images
- Images or tables in PDFs

The `Docs` object stores media via `ParsedMedia` objects.
When chunking a document, media are not split at chunk boundaries,
so two or more chunks can correspond to the same media.
Within PaperQA, this means a `ParsedMedia` has a one-to-many relationship
with the chunks that reference it.

Depending on the source document, the same image can also appear multiple times
(e.g. each page of a PDF has a logo in the margins).
Thus, clients should treat media databases
as having a many-to-many relationship with chunks.

When creating a contextual summary for a given chunk (a `Text`),
the summary LLM is passed both the chunk's text and its associated media,
but the resulting contextual summary itself remains text-only.
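
A minimal sketch of working with parsed media via the `paper-qa-pymupdf` parser
is below (the file path, `Doc` metadata, and query are illustrative placeholders):

```python
from paperqa import Doc, Docs
from paperqa.readers import chunk_pdf
from paperqa_pymupdf import parse_pdf_to_pages

# Parse both text and media (e.g. figures, tables) from the PDF
parsed_text = parse_pdf_to_pages("my_paper.pdf", parse_media=True)

# Chunk the parsed text; media are not split at chunk boundaries,
# so each chunk carries references to the ParsedMedia on its pages
doc = Doc(docname="stub", dockey="stub", citation="Stub Citation, 2025.")
texts = chunk_pdf(parsed_text, doc=doc, chunk_chars=3000, overlap=100)
print(sum(1 for t in texts if t.media), "chunks have associated media")

# The summary LLM sees each chunk's text and media,
# but the contextual summaries it produces remain text-only
docs = Docs()
await docs.aadd_texts(texts=texts, doc=doc)
session = await docs.aquery("What does Figure 1 show?")
print(session)
```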

### Using External DB/Vector DB and Caching

You may want to cache parsed texts and embeddings in an external database or file.
@@ -895,6 +918,7 @@ will return much faster than the first query and we'll be certain the authors match
| `parsing.pdfs_use_block_parsing` | `False` | Opt-in flag for block-based PDF parsing over text-based PDF parsing. |
| `parsing.use_doc_details` | `True` | Whether to get metadata details for docs. |
| `parsing.overlap` | `250` | Characters to overlap chunks. |
| `parsing.multimodal` | `True` | Flag to parse both text and images from applicable documents. |
| `parsing.defer_embedding` | `False` | Whether to defer embedding until summarization. |
| `parsing.parse_pdf` | `paperqa_pypdf.parse_pdf_to_pages` | Function to parse PDF files. |
| `parsing.configure_pdf_parser` | No-op | Callable to configure the PDF parser within `parse_pdf`, useful for behaviors such as enabling logging. |
102 changes: 96 additions & 6 deletions packages/paper-qa-pymupdf/src/paperqa_pymupdf/reader.py
@@ -1,7 +1,7 @@
import os

import pymupdf
from paperqa.types import ParsedMetadata, ParsedText
from paperqa.types import ParsedMedia, ParsedMetadata, ParsedText
from paperqa.utils import ImpossibleParsingError
from paperqa.version import __version__ as pqa_version

@@ -16,18 +16,61 @@ def setup_pymupdf_python_logging() -> None:


BLOCK_TEXT_INDEX = 4
# Attributes of pymupdf.Pixmap that contain useful metadata
PYMUPDF_PIXMAP_ATTRS = {
"alpha",
# YAGNI on "digest" because it's not JSON serializable
"height",
"irect",
"is_monochrome",
"is_unicolor",
"n",
"size",
"stride",
"width",
"x",
"xres",
"y",
"yres",
}


def parse_pdf_to_pages(
path: str | os.PathLike,
page_size_limit: int | None = None,
use_block_parsing: bool = False,
parse_media: bool = True,
full_page: bool = False,
image_cluster_tolerance: float | tuple[float, float] = 25,
image_dpi: float | None = 150,
**_,
) -> ParsedText:
"""Parse a PDF.

Args:
path: Path to the PDF file to parse.
page_size_limit: Sensible character limit for one page's text,
used to catch bad PDF reads.
use_block_parsing: Opt-in flag to parse text block-wise.
parse_media: Flag to also parse media (e.g. images, tables).
full_page: Set True to screenshot the entire page as one image,
instead of parsing individual images or tables.
image_cluster_tolerance: Tolerance (points) passed to `Page.cluster_drawings`.
Can be a single value to apply to both X and Y directions,
or a two-tuple to specify X and Y directions separately.
The default was chosen to perform well on image extraction from LitQA2 PDFs.
image_dpi: Dots per inch for images captured from the PDF.
**_: Thrown away kwargs.
"""
x_tol, y_tol = (
image_cluster_tolerance
if isinstance(image_cluster_tolerance, tuple)
else (image_cluster_tolerance, image_cluster_tolerance)
)

with pymupdf.open(path) as file:
pages: dict[str, str] = {}
total_length = 0
content: dict[str, str | tuple[str, list[ParsedMedia]]] = {}
total_length = count_media = 0

for i in range(file.page_count):
try:
@@ -63,13 +106,60 @@ def parse_pdf_to_pages(
f" long, which exceeds the {page_size_limit} char limit for the PDF"
f" at path {path}."
)
pages[str(i + 1)] = text
media: list[ParsedMedia] = []
if parse_media:
if full_page: # Capture the entire page as one image
pix = page.get_pixmap(dpi=image_dpi)
Review comment (Collaborator): Could we add some error handling here if a bad image is hit, returning an ImpossibleParsingError?

Reply (Collaborator, Author): We already protect page = file.load_page(...) with ImpossibleParsingError; I think once a Page is loaded in and constructed, we should be good. I haven't seen a get_pixmap crash so far, so I'd like to hold off on this for the scope of this PR.

media.append(
ParsedMedia(
index=0,
data=pix.tobytes(),
info={"type": "screenshot"}
| {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
)
)
else:
# Capture drawings/figures
for box_i, box in enumerate(
page.cluster_drawings(
drawings=page.get_drawings(),
x_tolerance=x_tol,
y_tolerance=y_tol,
)
):
pix = page.get_pixmap(clip=box, dpi=image_dpi)
media.append(
ParsedMedia(
index=box_i,
data=pix.tobytes(),
info={"bbox": tuple(box), "type": "drawing"}
| {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
)
)

# Capture tables
for table_i, table in enumerate(t for t in page.find_tables()):
pix = page.get_pixmap(clip=table.bbox, dpi=image_dpi)
media.append(
ParsedMedia(
index=table_i,
data=pix.tobytes(),
text=table.to_markdown().strip(),
info={"bbox": tuple(table.bbox), "type": "table"}
| {a: getattr(pix, a) for a in PYMUPDF_PIXMAP_ATTRS},
)
)
content[str(i + 1)] = text, media
else:
content[str(i + 1)] = text
total_length += len(text)
count_media += len(media)

metadata = ParsedMetadata(
parsing_libraries=[f"pymupdf ({pymupdf.__version__})"],
parsing_libraries=[f"{pymupdf.__name__} ({pymupdf.__version__})"],
paperqa_version=pqa_version,
total_parsed_text_length=total_length,
count_parsed_media=count_media,
parse_type="pdf",
)
return ParsedText(content=pages, metadata=metadata)
return ParsedText(content=content, metadata=metadata)
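
A rough sketch of the new parsing options above (`example.pdf` is a placeholder path):

```python
from paperqa_pymupdf import parse_pdf_to_pages

# Default: drawings/figures and tables are captured alongside the text,
# so each page maps to a (text, [ParsedMedia, ...]) tuple
parsed = parse_pdf_to_pages("example.pdf", image_dpi=150)

# Full-page mode: one screenshot ParsedMedia per page instead of per-figure crops
parsed_full_page = parse_pdf_to_pages("example.pdf", full_page=True)

# Text-only mode: each page maps to a plain string, matching the prior behavior
parsed_text_only = parse_pdf_to_pages("example.pdf", parse_media=False)

for page_num, page in parsed.content.items():
    if isinstance(page, tuple):
        text, media = page
        print(page_num, f"{len(text)} chars, {len(media)} media")
```

Passing a two-tuple for `image_cluster_tolerance` (e.g. `(25, 15)`) tunes the X and Y clustering tolerances separately.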
135 changes: 126 additions & 9 deletions packages/paper-qa-pymupdf/tests/test_paperqa_pymupdf.py
@@ -1,17 +1,22 @@
import base64
import json
from pathlib import Path
from typing import cast

import pymupdf
import pytest
from paperqa.readers import PDFParserFn
from paperqa.utils import ImpossibleParsingError
from paperqa import Doc, Docs
from paperqa.readers import PDFParserFn, chunk_pdf
from paperqa.utils import ImpossibleParsingError, bytes_to_string

from paperqa_pymupdf import parse_pdf_to_pages

REPO_ROOT = Path(__file__).parents[3]
STUB_DATA_DIR = REPO_ROOT / "tests" / "stub_data"


def test_parse_pdf_to_pages() -> None:
@pytest.mark.asyncio
async def test_parse_pdf_to_pages() -> None:
assert isinstance(parse_pdf_to_pages, PDFParserFn)

filepath = STUB_DATA_DIR / "pasa.pdf"
@@ -21,19 +26,131 @@ def test_parse_pdf_to_pages() -> None:
assert (
"Abstract\n\nWe introduce PaSa, an advanced Paper Search"
"\nagent powered by large language models."
) in parsed_text.content["1"], "Block parsing failed to handle abstract"
) in parsed_text.content["1"][0], "Block parsing failed to handle abstract"

# Check Figure 1
p2_text = parsed_text.content["2"]
# Check the images in Figure 1
assert not isinstance(parsed_text.content["2"], str)
p2_text, p2_media = parsed_text.content["2"]
assert "Figure 1" in p2_text, "Expected Figure 1 title"
assert "Crawler" in p2_text, "Expected Figure 1 contents"
(p2_image,) = [m for m in p2_media if m.info["type"] == "drawing"]
assert p2_image.index == 0
assert isinstance(p2_image.data, bytes)

# Check the image is valid base64
base64_data = bytes_to_string(p2_image.data)
assert base64_data
assert base64.b64decode(base64_data, validate=True) == p2_image.data

# Check we can round-trip serialize the image
serde_p2_image = type(p2_image).model_validate_json(p2_image.model_dump_json())
assert serde_p2_image == p2_image

# Check useful attributes are present and are JSON serializable
json.dumps(p2_image.info)
for attr in ("width", "height"):
dim = p2_image.info[attr]
assert isinstance(dim, int | float)
assert dim > 0, "Edge length should be positive"

# Check Figure 1 can be used to answer questions
doc = Doc(
docname="He2025",
dockey="stub",
citation=(
'He, Yichen, et al. "PaSa: An LLM Agent for Comprehensive Academic Paper'
' Search." *arXiv*, 2025, arXiv:2501.10120v1. Accessed 2025.'
),
)
texts = chunk_pdf(parsed_text, doc=doc, chunk_chars=3000, overlap=100)
# pylint: disable=duplicate-code
fig_1_text = texts[1]
assert (
"Figure 1: Architecture of PaSa" in fig_1_text.text
), "Expecting Figure 1 for the test to work"
assert fig_1_text.media, "Expecting media to test multimodality"
fig_1_text.text = "stub" # Replace text to confirm multimodality works
docs = Docs()
assert await docs.aadd_texts(texts=[fig_1_text], doc=doc)
for query, substrings_min_counts in [
("What actions can the Crawler take?", [(("search", "expand", "stop"), 2)]),
("What actions can the Selector take?", [(("select", "drop"), 2)]),
(
"How many User Query are there, and what do they do?",
[(("two", "2"), 2), (("crawler", "selector"), 2)],
),
]:
session = await docs.aquery(query=query)
assert session.contexts, "Expected contexts to be generated"
assert all(
c.text.text == fig_1_text.text and c.text.media == fig_1_text.media
for c in session.contexts
), "Expected context to reuse Figure 1's text and media"
for substrings, min_count in cast(
list[tuple[tuple[str, ...], int]], substrings_min_counts
):
assert (
sum(x in session.answer.lower() for x in substrings) >= min_count
), f"Expected {session.answer=} to have at {substrings} present"

# Let's check the full page parsing behavior
parsed_text_full_page = parse_pdf_to_pages(filepath, full_page=True)
assert isinstance(parsed_text_full_page.content, dict)
assert "1" in parsed_text_full_page.content, "Parsed text should contain page 1"
assert "2" in parsed_text_full_page.content, "Parsed text should contain page 2"
for page_num in ("1", "2"):
page_content = parsed_text_full_page.content[page_num]
assert not isinstance(page_content, str), f"Page {page_num} should have images"
# Check each page has exactly one image
page_text, (full_page_image,) = page_content
assert page_text
assert full_page_image.index == 0, "Full page image should have index 0"
assert isinstance(full_page_image.data, bytes)
assert len(full_page_image.data) > 0, "Full page image should have data"
# Check useful attributes are present and are JSON serializable
json.dumps(full_page_image.info)
for attr in ("width", "height"):
dim = full_page_image.info[attr]
assert isinstance(dim, int | float)
assert dim > 0, "Edge length should be positive"

# Check the no-media behavior
parsed_text_no_media = parse_pdf_to_pages(filepath, parse_media=False)
assert isinstance(parsed_text_no_media.content, dict)
assert all(isinstance(c, str) for c in parsed_text_no_media.content.values())

# Check metadata
(parsing_library,) = parsed_text.metadata.parsing_libraries
assert pymupdf.__name__ in parsing_library
assert parsed_text.metadata.parse_type == "pdf"
for pt in (parsed_text, parsed_text_full_page, parsed_text_no_media):
(parsing_library,) = pt.metadata.parsing_libraries
assert pymupdf.__name__ in parsing_library
assert pt.metadata.parse_type == "pdf"

# Check commonalities across all modes
assert (
len(parsed_text.content)
== len(parsed_text_full_page.content)
== len(parsed_text_no_media.content)
), "All modes should parse the same number of pages"


def test_page_size_limit_denial() -> None:
with pytest.raises(ImpossibleParsingError, match="char limit"):
parse_pdf_to_pages(STUB_DATA_DIR / "paper.pdf", page_size_limit=10) # chars


def test_table_parsing() -> None:
filepath = STUB_DATA_DIR / "influence.pdf"
parsed_text = parse_pdf_to_pages(filepath)
assert isinstance(parsed_text.content, dict)
assert all(
t and t[0] != "\n" and t[-1] != "\n" for t in parsed_text.content.values()
), "Expected no leading/trailing newlines in parsed text"
assert "1" in parsed_text.content, "Parsed text should contain page 1"
all_tables = {
i: [m for m in pagenum_media[1] if m.info["type"] == "table"]
for i, pagenum_media in parsed_text.content.items()
if isinstance(pagenum_media, tuple)
}
assert (
sum(len(tables) for tables in all_tables.values()) >= 2
), "Expected a few tables to be parsed"
5 changes: 5 additions & 0 deletions packages/paper-qa-pypdf/pyproject.toml
@@ -33,6 +33,11 @@ name = "paper-qa-pymupdf"
readme = "README.md"
requires-python = ">=3.11"

[project.optional-dependencies]
media = [
"pypdfium2>=4.22.0", # Pin for PYPDFIUM_INFO addition
]

[tool.ruff]
extend = "../../pyproject.toml"
