From e202861e4de46a172d8ff802d990b77d619a40c3 Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Wed, 19 Mar 2025 21:47:40 +0100 Subject: [PATCH 01/10] Extract rich text from blocks, wip --- jsondoc/__init__.py | 1 + jsondoc/extract_rich_text.py | 163 +++++++++++++++++++++++++++++++++++ jsondoc/utils.py | 10 ++- 3 files changed, 172 insertions(+), 2 deletions(-) create mode 100644 jsondoc/__init__.py create mode 100644 jsondoc/extract_rich_text.py diff --git a/jsondoc/__init__.py b/jsondoc/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/jsondoc/__init__.py @@ -0,0 +1 @@ + diff --git a/jsondoc/extract_rich_text.py b/jsondoc/extract_rich_text.py new file mode 100644 index 0000000..493216a --- /dev/null +++ b/jsondoc/extract_rich_text.py @@ -0,0 +1,163 @@ +from typing import Dict, List, Optional, Union + +from pydantic import BaseModel + +from jsondoc.convert.utils import block_supports_rich_text, get_rich_text_from_block +from jsondoc.models.block.base import BlockBase +from jsondoc.models.block.types.rich_text.base import RichTextBase +from jsondoc.models.page import Page + + +class BackRef(BaseModel): + block_id: str + begin_idx: int + end_idx: int + + +class TextWithBackref(BaseModel): + text: str + backrefs: list[BackRef] + + +def extract_rich_text_from_page( + page: Page, include_annotations: bool = False +) -> Dict[str, Union[str, List[Dict]]]: + """ + Extract all rich text content from a JSON-DOC page. + + Args: + page: A JSON-DOC Page object + include_annotations: If True, includes formatting info (bold, italic, etc.) in the output + + Returns: + A dictionary containing: + - 'title': The page title text + - 'content': A list of text content from all blocks, each item is either: + - A string (if include_annotations=False) + - A dict with 'text' and 'annotations' (if include_annotations=True) + """ + result = {"title": "", "content": []} + + # Extract title + if page.properties.title and page.properties.title.title: + title_texts = [] + for rich_text in page.properties.title.title: + title_texts.append(rich_text.plain_text) + result["title"] = "".join(title_texts) + + # Process all blocks recursively + result["content"] = extract_rich_text_from_blocks( + page.children, include_annotations + ) + + return result + + +def _process_rich_text_items(rich_text_list, include_annotations, result): + """ + Helper function to process a list of rich text items and append them to the result. + + Args: + rich_text_list: List of rich text items to process + include_annotations: Whether to include formatting annotations + result: The result list to append items to + """ + if not rich_text_list: + return + + for rich_text in rich_text_list: + if include_annotations: + result.append( + { + "text": rich_text.plain_text, + "annotations": rich_text.annotations.model_dump() + if hasattr(rich_text, "annotations") + else {}, + "href": rich_text.href if hasattr(rich_text, "href") else None, + } + ) + else: + result.append(rich_text.plain_text) + + +def extract_rich_text_from_blocks( + blocks: List[BlockBase], include_annotations: bool = False +) -> List[Union[str, Dict]]: + """ + Extract rich text content from a list of blocks recursively. + + Args: + blocks: List of BlockBase objects + include_annotations: If True, includes formatting info in the output + + Returns: + List of text content, either as strings or annotation dictionaries + """ + result = [] + + for block in blocks: + # Extract rich text if the block supports it + if block_supports_rich_text(block): + try: + rich_text_list = get_rich_text_from_block(block) + _process_rich_text_items(rich_text_list, include_annotations, result) + except ValueError: + # Block doesn't support rich text (shouldn't happen due to our check) + pass + + # Extract captions from blocks that support them + if block.type == "image" and hasattr(block.image, "caption"): + _process_rich_text_items(block.image.caption, include_annotations, result) + elif block.type == "code" and hasattr(block.code, "caption"): + _process_rich_text_items(block.code.caption, include_annotations, result) + + # Process child blocks recursively + if hasattr(block, "children") and block.children: + child_content = extract_rich_text_from_blocks( + block.children, include_annotations + ) + result.extend(child_content) + + # Handle special blocks like tables that have rich text in different structure + if block.type == "table_row" and hasattr(block.table_row, "cells"): + for cell in block.table_row.cells: + if isinstance(cell, list): + _process_rich_text_items(cell, include_annotations, result) + + return result + + +def extract_plain_text_from_page(page: Page) -> str: + """ + Extract all plain text content from a JSON-DOC page and return it as a single string. + + Args: + page: A JSON-DOC Page object + + Returns: + A string containing all the text content from the page + """ + extracted = extract_rich_text_from_page(page, include_annotations=False) + + # Join title and content with appropriate separators + result = [] + if extracted["title"]: + result.append(extracted["title"]) + + if extracted["content"]: + # Filter out empty strings and join with spaces + content_text = " ".join([item for item in extracted["content"] if item]) + if content_text: + result.append(content_text) + + return "\n\n".join(result) + + +def extract_text_from_jsondoc_page_with_block_backref( + page: Page, +) -> list[TextWithBackref]: + """ + Extract rich text from jsondoc data. + """ + rich_text = extract_rich_text_from_page(page) + return rich_text diff --git a/jsondoc/utils.py b/jsondoc/utils.py index 38eefdb..8d0548c 100644 --- a/jsondoc/utils.py +++ b/jsondoc/utils.py @@ -6,6 +6,8 @@ from contextlib import contextmanager from datetime import datetime, timezone +from jsondoc.models.block.base import CreatedBy + ARBITRARY_JSON_SCHEMA_OBJECT = { "type": "object", "properties": {}, @@ -188,9 +190,13 @@ def set_field_recursive(obj: any, field_name: str, value: any) -> None: # set_field_recursive(v, field_name, value) -def set_created_by(obj: any, created_by: str) -> None: +def set_created_by(obj: any, created_by: str | CreatedBy) -> None: """ Recursively sets the 'created_by' field to the given value in the given object. """ - assert isinstance(created_by, str) + assert isinstance(created_by, (str, CreatedBy)) + + if isinstance(created_by, str): + created_by = CreatedBy(id=created_by, object="user") + set_field_recursive(obj, "created_by", created_by) From a31df579edfae094bed8d890aaa474bdaa0e3569 Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Wed, 19 Mar 2025 22:12:43 +0100 Subject: [PATCH 02/10] Add TextWithBackref --- jsondoc/extract_rich_text.py | 167 ++++++++++++++++++++++++++++++++--- 1 file changed, 157 insertions(+), 10 deletions(-) diff --git a/jsondoc/extract_rich_text.py b/jsondoc/extract_rich_text.py index 493216a..c7a7c9b 100644 --- a/jsondoc/extract_rich_text.py +++ b/jsondoc/extract_rich_text.py @@ -1,4 +1,4 @@ -from typing import Dict, List, Optional, Union +from typing import Dict, List, Union from pydantic import BaseModel @@ -19,22 +19,165 @@ class TextWithBackref(BaseModel): backrefs: list[BackRef] -def extract_rich_text_from_page( +def extract_text_with_backref_from_page( page: Page, include_annotations: bool = False -) -> Dict[str, Union[str, List[Dict]]]: +) -> TextWithBackref: + """ + Extract all rich text content from a JSON-DOC page as a single string + with backrefs tracking the block origins. + + Args: + page: A JSON-DOC Page object + include_annotations: If True, includes formatting info (not used in backref tracking) + + Returns: + TextWithBackref: Object containing concatenated text and backrefs + """ + concat_text = "" + backrefs = [] + + # Extract title + title_text = "" + if page.properties.title and page.properties.title.title: + for rich_text in page.properties.title.title: + title_text += rich_text.plain_text + + if title_text: + begin_idx = len(concat_text) + concat_text += title_text + end_idx = len(concat_text) + # Add a backref for the page title using the page's ID + backrefs.append(BackRef(block_id=page.id, begin_idx=begin_idx, end_idx=end_idx)) + # Add a newline after the title + concat_text += "\n\n" + + # Process all blocks recursively and collect their text with backrefs + blocks_with_text = _extract_blocks_with_text(page.children, include_annotations) + + # Add all blocks to the concatenated text with their respective backrefs + for block_id, block_text in blocks_with_text: + if block_text: + begin_idx = len(concat_text) + concat_text += block_text + end_idx = len(concat_text) + + backrefs.append( + BackRef(block_id=block_id, begin_idx=begin_idx, end_idx=end_idx) + ) + + # Add a space after each block + concat_text += " " + + return TextWithBackref(text=concat_text.strip(), backrefs=backrefs) + + +def _extract_blocks_with_text( + blocks: List[BlockBase], include_annotations: bool = False +) -> List[tuple[str, str]]: + """ + Extract text from blocks and return a list of (block_id, text) tuples. + + Args: + blocks: List of blocks to process + include_annotations: Whether to include annotations (not used in this implementation) + + Returns: + List of (block_id, text) tuples + """ + result = [] + + for block in blocks: + # Get text from the current block + block_text = _extract_text_from_single_block(block) + if block_text: + result.append((block.id, block_text)) + + # Process child blocks recursively + if hasattr(block, "children") and block.children: + child_results = _extract_blocks_with_text( + block.children, include_annotations + ) + result.extend(child_results) + + return result + + +def _extract_text_from_single_block(block: BlockBase) -> str: + """ + Extract text from a single block without processing its children. + + Args: + block: The block to extract text from + + Returns: + The text content of the block + """ + result = [] + + # Extract rich text if the block supports it + if block_supports_rich_text(block): + try: + rich_text_list = get_rich_text_from_block(block) + for rich_text in rich_text_list: + result.append(rich_text.plain_text) + except ValueError: + pass + + # Extract captions from blocks that support them + if block.type == "image" and hasattr(block.image, "caption"): + for caption_text in block.image.caption: + result.append(caption_text.plain_text) + elif block.type == "code" and hasattr(block.code, "caption"): + for caption_text in block.code.caption: + result.append(caption_text.plain_text) + + # Handle special blocks like tables + if block.type == "table_row" and hasattr(block.table_row, "cells"): + for cell in block.table_row.cells: + if isinstance(cell, list): + for item in cell: + if hasattr(item, "plain_text"): + result.append(item.plain_text) + + return " ".join(result) + + +def _extract_text_from_block( + block: BlockBase, include_annotations: bool = False +) -> str: + """ + Extract all text from a single block, including its children. + + Args: + block: The block to extract text from + include_annotations: Whether to include annotations (not used in this implementation) + + Returns: + A string with all text from the block + """ + # Extract text from the current block + result = [_extract_text_from_single_block(block)] + + # Process child blocks recursively + if hasattr(block, "children") and block.children: + for child in block.children: + child_text = _extract_text_from_block(child, include_annotations) + if child_text: + result.append(child_text) + + return " ".join([text for text in result if text]) + + +def extract_rich_text_from_page(page: Page, include_annotations: bool = False) -> Dict: """ Extract all rich text content from a JSON-DOC page. Args: page: A JSON-DOC Page object - include_annotations: If True, includes formatting info (bold, italic, etc.) in the output + include_annotations: If True, includes formatting info in the output Returns: - A dictionary containing: - - 'title': The page title text - - 'content': A list of text content from all blocks, each item is either: - - A string (if include_annotations=False) - - A dict with 'text' and 'annotations' (if include_annotations=True) + Dictionary containing title and content lists """ result = {"title": "", "content": []} @@ -53,7 +196,11 @@ def extract_rich_text_from_page( return result -def _process_rich_text_items(rich_text_list, include_annotations, result): +def _process_rich_text_items( + rich_text_list: list[RichTextBase], + include_annotations: bool, + result: list, +) -> None: """ Helper function to process a list of rich text items and append them to the result. From 666403879273d538c1186f4ffc918dbd398b733e Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Wed, 19 Mar 2025 22:19:13 +0100 Subject: [PATCH 03/10] Checkpoint --- .gitignore | 3 +- jsondoc/{utils.py => utils/__init__.py} | 0 .../text_with_backref.py} | 148 +----------------- 3 files changed, 5 insertions(+), 146 deletions(-) rename jsondoc/{utils.py => utils/__init__.py} (100%) rename jsondoc/{extract_rich_text.py => utils/text_with_backref.py} (51%) diff --git a/.gitignore b/.gitignore index 26fb22e..534ee83 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ __pycache__ build/ *.docx -*.pptx \ No newline at end of file +*.pptx +scratch/ \ No newline at end of file diff --git a/jsondoc/utils.py b/jsondoc/utils/__init__.py similarity index 100% rename from jsondoc/utils.py rename to jsondoc/utils/__init__.py diff --git a/jsondoc/extract_rich_text.py b/jsondoc/utils/text_with_backref.py similarity index 51% rename from jsondoc/extract_rich_text.py rename to jsondoc/utils/text_with_backref.py index c7a7c9b..d77c8bc 100644 --- a/jsondoc/extract_rich_text.py +++ b/jsondoc/utils/text_with_backref.py @@ -14,14 +14,14 @@ class BackRef(BaseModel): end_idx: int -class TextWithBackref(BaseModel): +class TextWithBackrefs(BaseModel): text: str backrefs: list[BackRef] def extract_text_with_backref_from_page( page: Page, include_annotations: bool = False -) -> TextWithBackref: +) -> TextWithBackrefs: """ Extract all rich text content from a JSON-DOC page as a single string with backrefs tracking the block origins. @@ -68,7 +68,7 @@ def extract_text_with_backref_from_page( # Add a space after each block concat_text += " " - return TextWithBackref(text=concat_text.strip(), backrefs=backrefs) + return TextWithBackrefs(text=concat_text.strip(), backrefs=backrefs) def _extract_blocks_with_text( @@ -166,145 +166,3 @@ def _extract_text_from_block( result.append(child_text) return " ".join([text for text in result if text]) - - -def extract_rich_text_from_page(page: Page, include_annotations: bool = False) -> Dict: - """ - Extract all rich text content from a JSON-DOC page. - - Args: - page: A JSON-DOC Page object - include_annotations: If True, includes formatting info in the output - - Returns: - Dictionary containing title and content lists - """ - result = {"title": "", "content": []} - - # Extract title - if page.properties.title and page.properties.title.title: - title_texts = [] - for rich_text in page.properties.title.title: - title_texts.append(rich_text.plain_text) - result["title"] = "".join(title_texts) - - # Process all blocks recursively - result["content"] = extract_rich_text_from_blocks( - page.children, include_annotations - ) - - return result - - -def _process_rich_text_items( - rich_text_list: list[RichTextBase], - include_annotations: bool, - result: list, -) -> None: - """ - Helper function to process a list of rich text items and append them to the result. - - Args: - rich_text_list: List of rich text items to process - include_annotations: Whether to include formatting annotations - result: The result list to append items to - """ - if not rich_text_list: - return - - for rich_text in rich_text_list: - if include_annotations: - result.append( - { - "text": rich_text.plain_text, - "annotations": rich_text.annotations.model_dump() - if hasattr(rich_text, "annotations") - else {}, - "href": rich_text.href if hasattr(rich_text, "href") else None, - } - ) - else: - result.append(rich_text.plain_text) - - -def extract_rich_text_from_blocks( - blocks: List[BlockBase], include_annotations: bool = False -) -> List[Union[str, Dict]]: - """ - Extract rich text content from a list of blocks recursively. - - Args: - blocks: List of BlockBase objects - include_annotations: If True, includes formatting info in the output - - Returns: - List of text content, either as strings or annotation dictionaries - """ - result = [] - - for block in blocks: - # Extract rich text if the block supports it - if block_supports_rich_text(block): - try: - rich_text_list = get_rich_text_from_block(block) - _process_rich_text_items(rich_text_list, include_annotations, result) - except ValueError: - # Block doesn't support rich text (shouldn't happen due to our check) - pass - - # Extract captions from blocks that support them - if block.type == "image" and hasattr(block.image, "caption"): - _process_rich_text_items(block.image.caption, include_annotations, result) - elif block.type == "code" and hasattr(block.code, "caption"): - _process_rich_text_items(block.code.caption, include_annotations, result) - - # Process child blocks recursively - if hasattr(block, "children") and block.children: - child_content = extract_rich_text_from_blocks( - block.children, include_annotations - ) - result.extend(child_content) - - # Handle special blocks like tables that have rich text in different structure - if block.type == "table_row" and hasattr(block.table_row, "cells"): - for cell in block.table_row.cells: - if isinstance(cell, list): - _process_rich_text_items(cell, include_annotations, result) - - return result - - -def extract_plain_text_from_page(page: Page) -> str: - """ - Extract all plain text content from a JSON-DOC page and return it as a single string. - - Args: - page: A JSON-DOC Page object - - Returns: - A string containing all the text content from the page - """ - extracted = extract_rich_text_from_page(page, include_annotations=False) - - # Join title and content with appropriate separators - result = [] - if extracted["title"]: - result.append(extracted["title"]) - - if extracted["content"]: - # Filter out empty strings and join with spaces - content_text = " ".join([item for item in extracted["content"] if item]) - if content_text: - result.append(content_text) - - return "\n\n".join(result) - - -def extract_text_from_jsondoc_page_with_block_backref( - page: Page, -) -> list[TextWithBackref]: - """ - Extract rich text from jsondoc data. - """ - rich_text = extract_rich_text_from_page(page) - return rich_text From 5c76457e23182d309b5168a968685f5bd9921eb1 Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Wed, 19 Mar 2025 22:31:51 +0100 Subject: [PATCH 04/10] Checkpoint --- jsondoc/utils/text_with_backref.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/jsondoc/utils/text_with_backref.py b/jsondoc/utils/text_with_backref.py index d77c8bc..cf7270d 100644 --- a/jsondoc/utils/text_with_backref.py +++ b/jsondoc/utils/text_with_backref.py @@ -9,6 +9,7 @@ class BackRef(BaseModel): + plain_text: str block_id: str begin_idx: int end_idx: int @@ -47,7 +48,14 @@ def extract_text_with_backref_from_page( concat_text += title_text end_idx = len(concat_text) # Add a backref for the page title using the page's ID - backrefs.append(BackRef(block_id=page.id, begin_idx=begin_idx, end_idx=end_idx)) + backrefs.append( + BackRef( + plain_text=title_text, + block_id=page.id, + begin_idx=begin_idx, + end_idx=end_idx, + ) + ) # Add a newline after the title concat_text += "\n\n" @@ -62,7 +70,12 @@ def extract_text_with_backref_from_page( end_idx = len(concat_text) backrefs.append( - BackRef(block_id=block_id, begin_idx=begin_idx, end_idx=end_idx) + BackRef( + plain_text=block_text, + block_id=block_id, + begin_idx=begin_idx, + end_idx=end_idx, + ) ) # Add a space after each block From dc77945e05a883d001fce596b54ecedbfa49a2bb Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Thu, 20 Mar 2025 09:25:49 +0100 Subject: [PATCH 05/10] Extract blocks util --- jsondoc/utils/block.py | 56 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 jsondoc/utils/block.py diff --git a/jsondoc/utils/block.py b/jsondoc/utils/block.py new file mode 100644 index 0000000..7155048 --- /dev/null +++ b/jsondoc/utils/block.py @@ -0,0 +1,56 @@ +from typing import OrderedDict + +from jsondoc.models.block.base import BlockBase +from jsondoc.models.page import Page + + +def extract_blocks( + input_obj: Page | BlockBase | list[BlockBase], +) -> dict[str, BlockBase]: + """ + Creates a mapping of block IDs to Block objects from various input types. + + Args: + input_obj: Can be either a Page object, a single Block object, or a list of Block objects + + Returns: + A dictionary mapping block IDs (strings) to their corresponding Block objects + """ + block_map: dict[str, BlockBase] = OrderedDict() + + # Handle Page input + if isinstance(input_obj, Page): + # Process all blocks in the page + for block in input_obj.children: + _process_block_and_children(block, block_map) + + # Handle single Block input + elif isinstance(input_obj, BlockBase): + _process_block_and_children(input_obj, block_map) + + # Handle list of Blocks input + elif isinstance(input_obj, list): + for block in input_obj: + if isinstance(block, BlockBase): + _process_block_and_children(block, block_map) + + return block_map + + +def _process_block_and_children( + block: BlockBase, block_map: dict[str, BlockBase] +) -> None: + """ + Helper function to process a block and its children recursively, adding them to the block map. + + Args: + block: The block to process + block_map: The dictionary mapping block IDs to Block objects + """ + # Add the current block to the map + block_map[block.id] = block + + # Process children recursively if they exist + if hasattr(block, "children") and block.children: + for child in block.children: + _process_block_and_children(child, block_map) From 58355bc7ed72f84bbb42d91c7b2f47bc5ccd0223 Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Thu, 20 Mar 2025 09:32:41 +0100 Subject: [PATCH 06/10] Checkpoint --- jsondoc/utils/text_with_backref.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/jsondoc/utils/text_with_backref.py b/jsondoc/utils/text_with_backref.py index cf7270d..2a80365 100644 --- a/jsondoc/utils/text_with_backref.py +++ b/jsondoc/utils/text_with_backref.py @@ -11,7 +11,7 @@ class BackRef(BaseModel): plain_text: str block_id: str - begin_idx: int + start_idx: int end_idx: int @@ -44,7 +44,7 @@ def extract_text_with_backref_from_page( title_text += rich_text.plain_text if title_text: - begin_idx = len(concat_text) + start_idx = len(concat_text) concat_text += title_text end_idx = len(concat_text) # Add a backref for the page title using the page's ID @@ -52,7 +52,7 @@ def extract_text_with_backref_from_page( BackRef( plain_text=title_text, block_id=page.id, - begin_idx=begin_idx, + start_idx=start_idx, end_idx=end_idx, ) ) @@ -65,7 +65,7 @@ def extract_text_with_backref_from_page( # Add all blocks to the concatenated text with their respective backrefs for block_id, block_text in blocks_with_text: if block_text: - begin_idx = len(concat_text) + start_idx = len(concat_text) concat_text += block_text end_idx = len(concat_text) @@ -73,7 +73,7 @@ def extract_text_with_backref_from_page( BackRef( plain_text=block_text, block_id=block_id, - begin_idx=begin_idx, + start_idx=start_idx, end_idx=end_idx, ) ) From 1ea814c559193c55ef43db4dae6f44d59b7715b0 Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Thu, 20 Mar 2025 10:07:47 +0100 Subject: [PATCH 07/10] get_intersecting_backrefs --- jsondoc/utils/text_with_backref.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/jsondoc/utils/text_with_backref.py b/jsondoc/utils/text_with_backref.py index 2a80365..a4a337c 100644 --- a/jsondoc/utils/text_with_backref.py +++ b/jsondoc/utils/text_with_backref.py @@ -19,6 +19,26 @@ class TextWithBackrefs(BaseModel): text: str backrefs: list[BackRef] + def get_intersecting_backrefs(self, start_idx: int, end_idx: int) -> list[BackRef]: + """ + Returns all backrefs that intersect with the given text range. + + A backref intersects if any part of it overlaps with the range defined by start_idx and end_idx. + This happens when the backref starts before the end of the range AND ends after the start of the range. + + Args: + start_idx: The starting index of the text range + end_idx: The ending index of the text range (exclusive) + + Returns: + A list of BackRef objects that intersect with the given range + """ + return [ + backref + for backref in self.backrefs + if backref.start_idx < end_idx and backref.end_idx > start_idx + ] + def extract_text_with_backref_from_page( page: Page, include_annotations: bool = False From f0b0d99c857dae131fd941e51b34cc095b058cdd Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Thu, 20 Mar 2025 11:04:52 +0100 Subject: [PATCH 08/10] Handle unnecessary whitespace in html conversion --- jsondoc/convert/utils.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/jsondoc/convert/utils.py b/jsondoc/convert/utils.py index 7056d7c..ab7c681 100644 --- a/jsondoc/convert/utils.py +++ b/jsondoc/convert/utils.py @@ -649,14 +649,27 @@ def _final_block_transformation(obj: BlockBase | str | RichTextBase): ensure_table_cell_count(obj) elif isinstance(obj, str): text_ = all_whitespace_re.sub(" ", obj) + if not text_.strip(): + # Skip empty strings + return None return create_paragraph_block(text=text_) elif isinstance(obj, RichTextBase): + # if not obj.plain_text.strip(): + # # Skip empty rich text objects + # return None new_obj_ = create_paragraph_block() new_obj_.paragraph.rich_text = [obj] return new_obj_ elif isinstance(obj, PlaceholderBlockBase): # Make sure no placeholder blocks are left behind return None + # elif isinstance(obj, tuple(BLOCKS_WITH_RICH_TEXT)): + # # Check for blocks that support rich text + # rich_text = get_rich_text_from_block(obj) + # if rich_text is not None: + # # If the block has no rich text or only empty rich text, skip it + # if not rich_text or all(not rt.plain_text.strip() for rt in rich_text): + # return None return obj From 7e9ba4cb47a7aeedfcf23003ced281c9ff8ed5ef Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Thu, 20 Mar 2025 22:58:16 +0100 Subject: [PATCH 09/10] Fix colspan issue --- jsondoc/convert/html.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/jsondoc/convert/html.py b/jsondoc/convert/html.py index c212320..082af56 100644 --- a/jsondoc/convert/html.py +++ b/jsondoc/convert/html.py @@ -841,10 +841,15 @@ def convert_td(self, el, convert_as_inline): paragraph_block.rich_text will be extracted to form table_row.cells. """ # Get colspan - colspan = el.get("colspan", 1) + colspan = el.get("colspan", "1") # Get rowspan # rowspan = el.get("rowspan", 1) # We need to come up with a much different way to handle rowspan + if not isinstance(colspan, int): + try: + colspan = int(colspan) + except ValueError: + colspan = 1 next_objects = [] if colspan > 1: From e5d27b7ef5a61f0a30d4686fb68c1b9a2416ae51 Mon Sep 17 00:00:00 2001 From: Onur Solmaz <2453968+osolmaz@users.noreply.github.com> Date: Fri, 21 Mar 2025 16:17:08 +0100 Subject: [PATCH 10/10] Add TypeID generation for block and page ids, refactor HtmlToJsonDocConverter options to use Pydantic --- jsondoc/convert/html.py | 120 +++++++++++++++++++++++--------------- jsondoc/convert/utils.py | 41 ++++++++----- jsondoc/utils/__init__.py | 15 ++++- pyproject.toml | 1 + uv.lock | 23 ++++++++ 5 files changed, 138 insertions(+), 62 deletions(-) diff --git a/jsondoc/convert/html.py b/jsondoc/convert/html.py index 082af56..3548dd8 100644 --- a/jsondoc/convert/html.py +++ b/jsondoc/convert/html.py @@ -1,6 +1,6 @@ import re from types import NoneType -from typing import List, Union +from typing import Callable, List, Union from bs4 import BeautifulSoup, Comment, Doctype, NavigableString from pydantic import BaseModel @@ -45,7 +45,7 @@ from jsondoc.models.page import Page from jsondoc.models.shared_definitions import Annotations from jsondoc.rules import is_block_child_allowed -from jsondoc.utils import generate_id, get_current_time +from jsondoc.utils import generate_block_id, get_current_time line_beginning_re = re.compile(r"^", re.MULTILINE) whitespace_re = re.compile(r"[\t ]+") @@ -307,7 +307,9 @@ def reconcile_to_rich_text( def reconcile_to_block( - block: BlockBase, children: List[CHILDREN_TYPE] + block: BlockBase, + children: List[CHILDREN_TYPE], + typeid: bool = False, ) -> List[CHILDREN_TYPE]: """ Given a block and a list of children, @@ -350,7 +352,7 @@ def reconcile_to_block( # Get corresponding field from the block block_field = getattr(block, block_type) init_kwargs = { - "id": generate_id(), + "id": generate_block_id(typeid=typeid), "created_time": child.created_time, block_type: type(block_field)(), } @@ -383,26 +385,20 @@ def reconcile_to_block( class HtmlToJsonDocConverter(object): - class DefaultOptions: - autolinks = True - code_language = "" - code_language_callback = None - convert = None - default_title = False - keep_inline_images_in = [] - strip = None - force_page = False - - class Options(DefaultOptions): - pass + class Options(BaseModel): + autolinks: bool = True + code_language: str = "" + code_language_callback: Callable | None = None + convert: Callable | None = None + default_title: bool = False + keep_inline_images_in: list[str] = [] + strip: str | None = None + force_page: bool = False + typeid: bool = False def __init__(self, **options): - # Create an options dictionary. Use DefaultOptions as a base so that - # it doesn't have to be extended. - self.options = _todict(self.DefaultOptions) - self.options.update(_todict(self.Options)) - self.options.update(options) - if self.options["strip"] is not None and self.options["convert"] is not None: + self.options = self.Options(**options) + if self.options.strip is not None and self.options.convert is not None: raise ValueError( "You may specify either tags to strip or tags to convert, but not both." ) @@ -417,7 +413,7 @@ def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase is_page = self._is_soup_page(soup) ret = None - if is_page or self.options["force_page"]: + if is_page or self.options.force_page: title = self._get_html_title(soup) # Ensure that children is a list if not isinstance(children, list): @@ -427,6 +423,7 @@ def convert_soup(self, soup: BeautifulSoup) -> Page | BlockBase | List[BlockBase ret = create_page( title=title, children=children, + typeid=self.options.typeid, ) else: ret = children @@ -526,7 +523,11 @@ def is_nested_node(el): if current_level_object is None: objects = children_objects elif isinstance(current_level_object, BlockBase): - objects = reconcile_to_block(current_level_object, children_objects) + objects = reconcile_to_block( + current_level_object, + children_objects, + typeid=self.options.typeid, + ) elif isinstance(current_level_object, RichTextBase): objects = reconcile_to_rich_text(current_level_object, children_objects) else: @@ -615,8 +616,8 @@ def process_text(self, el): def should_convert_tag(self, tag): tag = tag.lower() - strip = self.options["strip"] - convert = self.options["convert"] + strip = self.options.strip + convert = self.options.convert if strip is not None: return tag not in strip elif convert is not None: @@ -629,7 +630,7 @@ def convert_a(self, el, convert_as_inline): return ConvertOutput(main_object=create_rich_text(url=href)) convert_b = abstract_inline_conversion( - lambda self: Annotations(bold=True) # 2 * self.options["strong_em_symbol"] + lambda self: Annotations(bold=True) # 2 * self.options.strong_em_symbol ) def convert_blockquote(self, el, convert_as_inline): @@ -646,7 +647,11 @@ def convert_blockquote(self, el, convert_as_inline): return ConvertOutput(main_object=create_rich_text()) # TODO: If text has newlines, split them and add 2, 3, ... lines as children - return ConvertOutput(main_object=create_quote_block()) + return ConvertOutput( + main_object=create_quote_block( + typeid=self.options.typeid, + ) + ) def convert_br(self, el, convert_as_inline): if convert_as_inline: @@ -683,40 +688,48 @@ def convert_h1(self, el, convert_as_inline): if convert_as_inline: return ConvertOutput(main_object=create_rich_text()) - return ConvertOutput(main_object=create_h1_block()) + return ConvertOutput(main_object=create_h1_block(typeid=self.options.typeid)) def convert_h2(self, el, convert_as_inline): if convert_as_inline: return ConvertOutput(main_object=create_rich_text()) - return ConvertOutput(main_object=create_h2_block()) + return ConvertOutput(main_object=create_h2_block(typeid=self.options.typeid)) def convert_h3(self, el, convert_as_inline): if convert_as_inline: return ConvertOutput(main_object=create_rich_text()) - return ConvertOutput(main_object=create_h3_block()) + return ConvertOutput(main_object=create_h3_block(typeid=self.options.typeid)) def convert_h4(self, el, convert_as_inline): if convert_as_inline: return ConvertOutput(main_object=create_rich_text()) - return ConvertOutput(main_object=create_paragraph_block()) + return ConvertOutput( + main_object=create_paragraph_block(typeid=self.options.typeid) + ) def convert_h5(self, el, convert_as_inline): if convert_as_inline: return ConvertOutput(main_object=create_rich_text()) - return ConvertOutput(main_object=create_paragraph_block()) + return ConvertOutput( + main_object=create_paragraph_block(typeid=self.options.typeid) + ) def convert_h6(self, el, convert_as_inline): if convert_as_inline: return ConvertOutput(main_object=create_rich_text()) - return ConvertOutput(main_object=create_paragraph_block()) + return ConvertOutput( + main_object=create_paragraph_block(typeid=self.options.typeid) + ) def convert_hr(self, el, convert_as_inline): - return ConvertOutput(main_object=create_divider_block()) + return ConvertOutput( + main_object=create_divider_block(typeid=self.options.typeid) + ) convert_i = convert_em @@ -730,13 +743,14 @@ def convert_img(self, el, convert_as_inline): # title_part = ' "%s"' % title.replace('"', r"\"") if title else "" if ( convert_as_inline - and el.parent.name not in self.options["keep_inline_images_in"] + and el.parent.name not in self.options.keep_inline_images_in ): return alt return ConvertOutput( main_object=create_image_block( url=src, + typeid=self.options.typeid, # alt is not supported in JSON-DOC yet # caption=alt, ) @@ -755,15 +769,21 @@ def convert_list(self, el, convert_as_inline): def convert_li(self, el, convert_as_inline): parent = el.parent if parent is not None and parent.name == "ol": - return ConvertOutput(main_object=create_numbered_list_item_block()) + return ConvertOutput( + main_object=create_numbered_list_item_block(typeid=self.options.typeid) + ) else: - return ConvertOutput(main_object=create_bullet_list_item_block()) + return ConvertOutput( + main_object=create_bullet_list_item_block(typeid=self.options.typeid) + ) def convert_p(self, el, convert_as_inline): if convert_as_inline: return ConvertOutput(main_object=create_rich_text()) - return ConvertOutput(main_object=create_paragraph_block()) + return ConvertOutput( + main_object=create_paragraph_block(typeid=self.options.typeid) + ) def convert_pre(self, el, convert_as_inline): text = el.get_text() @@ -771,12 +791,16 @@ def convert_pre(self, el, convert_as_inline): if not text: return None - code_language = self.options["code_language"] + code_language = self.options.code_language - if self.options["code_language_callback"]: - code_language = self.options["code_language_callback"](el) or code_language + if self.options.code_language_callback: + code_language = self.options.code_language_callback(el) or code_language - return ConvertOutput(main_object=create_code_block(language=code_language)) + return ConvertOutput( + main_object=create_code_block( + language=code_language, typeid=self.options.typeid + ) + ) def convert_script(self, el, convert_as_inline): return None @@ -793,19 +817,19 @@ def convert_style(self, el, convert_as_inline): # Notion does not have an alternative for sub and sup tags convert_sub = abstract_inline_conversion( lambda self: Annotations() - # self.options["sub_symbol"], + # self.options.sub_symbol, ) convert_sup = abstract_inline_conversion( lambda self: Annotations() - # self.options["sup_symbol"], + # self.options.sup_symbol, ) def convert_table(self, el, convert_as_inline): has_column_header = html_table_has_header_row(el) return ConvertOutput( main_object=create_table_block( - has_column_header=has_column_header, + has_column_header=has_column_header, typeid=self.options.typeid ) ) @@ -868,7 +892,9 @@ def convert_tr(self, el, convert_as_inline): """ Table row """ - return ConvertOutput(main_object=create_table_row_block()) + return ConvertOutput( + main_object=create_table_row_block(typeid=self.options.typeid) + ) def html_to_jsondoc(html: str | bytes, **options) -> Page | BlockBase | List[BlockBase]: diff --git a/jsondoc/convert/utils.py b/jsondoc/convert/utils.py index ab7c681..a074e89 100644 --- a/jsondoc/convert/utils.py +++ b/jsondoc/convert/utils.py @@ -45,7 +45,7 @@ from jsondoc.models.page import CreatedBy, LastEditedBy, Page, Parent, Properties, Title from jsondoc.models.shared_definitions import Annotations from jsondoc.rules import is_block_child_allowed -from jsondoc.utils import generate_id, get_current_time +from jsondoc.utils import generate_block_id, generate_page_id, get_current_time all_whitespace_re = re.compile(r"[\s]+") @@ -136,10 +136,11 @@ def create_paragraph_block( id: str | None = None, created_time=None, metadata: dict | None = None, + typeid: bool = False, **kwargs, ) -> ParagraphBlock: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -160,10 +161,11 @@ def create_bullet_list_item_block( text: str | None = None, id: str | None = None, created_time=None, + typeid: bool = False, **kwargs, ) -> BulletedListItemBlock: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -183,10 +185,11 @@ def create_numbered_list_item_block( text: str | None = None, id: str | None = None, created_time=None, + typeid: bool = False, **kwargs, ) -> NumberedListItemBlock: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -207,10 +210,11 @@ def create_code_block( language: str | None = None, id: str | None = None, created_time=None, + typeid: bool = False, **kwargs, ) -> CodeBlock: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -238,9 +242,10 @@ def create_code_block( def create_divider_block( id: str | None = None, created_time=None, + typeid: bool = False, ) -> DividerBlock: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -256,10 +261,11 @@ def create_h1_block( text: str | None = None, id: str | None = None, created_time=None, + typeid: bool = False, **kwargs, ) -> Heading1Block: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -280,10 +286,11 @@ def create_h2_block( text: str | None = None, id: str | None = None, created_time=None, + typeid: bool = False, **kwargs, ) -> Heading2Block: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -304,10 +311,11 @@ def create_h3_block( text: str | None = None, id: str | None = None, created_time=None, + typeid: bool = False, **kwargs, ) -> Heading3Block: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -329,9 +337,10 @@ def create_image_block( caption: str | None = None, id: str | None = None, created_time=None, + typeid: bool = False, ) -> ImageBlock: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -355,10 +364,11 @@ def create_quote_block( text: str | None = None, id: str | None = None, created_time=None, + typeid: bool = False, **kwargs, ) -> QuoteBlock: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -379,9 +389,10 @@ def create_table_row_block( cells: List[List[RichTextBase]] = [], id: str | None = None, created_time=None, + typeid: bool = False, ) -> TableRowBlock: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -401,9 +412,10 @@ def create_table_block( table_width: int | None = None, has_column_header: bool = False, has_row_header: bool = False, + typeid: bool = False, ) -> TableBlock: if id is None: - id = generate_id() + id = generate_block_id(typeid=typeid) if created_time is None: created_time = get_current_time() @@ -430,6 +442,7 @@ def create_page( title: str | List[RichTextBase] | None = None, archived: bool | None = None, in_trash: bool | None = None, + typeid: bool = False, # parent: str | None = None, # icon # TBD ) -> Page: @@ -437,7 +450,7 @@ def create_page( Creates a page with the given blocks """ if id is None: - id = generate_id() + id = generate_page_id(typeid=typeid) if created_time is None: created_time = get_current_time() diff --git a/jsondoc/utils/__init__.py b/jsondoc/utils/__init__.py index 8d0548c..e1ca43c 100644 --- a/jsondoc/utils/__init__.py +++ b/jsondoc/utils/__init__.py @@ -6,6 +6,8 @@ from contextlib import contextmanager from datetime import datetime, timezone +from typeid import TypeID + from jsondoc.models.block.base import CreatedBy ARBITRARY_JSON_SCHEMA_OBJECT = { @@ -14,8 +16,19 @@ "additionalProperties": True, } +TYPEID_BLOCK_ID_PREFIX = "bk" +TYPEID_PAGE_ID_PREFIX = "pg" + + +def generate_block_id(typeid: bool = False) -> str: + if typeid: + return str(TypeID(prefix=TYPEID_BLOCK_ID_PREFIX)) + return str(uuid.uuid4()) + -def generate_id() -> str: +def generate_page_id(typeid: bool = False) -> str: + if typeid: + return str(TypeID(prefix=TYPEID_PAGE_ID_PREFIX)) return str(uuid.uuid4()) diff --git a/pyproject.toml b/pyproject.toml index eb542d5..aa17bcd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ dependencies = [ "jsonschema>=4.23.0,<5", "pypandoc>=1.15", "beautifulsoup4>=4.13.3", + "typeid-python>=0.3.2", ] [project.scripts] diff --git a/uv.lock b/uv.lock index 1346717..1f5d494 100644 --- a/uv.lock +++ b/uv.lock @@ -748,6 +748,7 @@ dependencies = [ { name = "jsonschema" }, { name = "pydantic" }, { name = "pypandoc" }, + { name = "typeid-python" }, ] [package.dev-dependencies] @@ -767,6 +768,7 @@ requires-dist = [ { name = "jsonschema", specifier = ">=4.23.0,<5" }, { name = "pydantic", specifier = ">=2.7.2,<3" }, { name = "pypandoc", specifier = ">=1.15" }, + { name = "typeid-python", specifier = ">=0.3.2" }, ] [package.metadata.requires-dev] @@ -998,6 +1000,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359 }, ] +[[package]] +name = "typeid-python" +version = "0.3.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "uuid6" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8d/09/b9b747c4f5f47e32fcb49d2a61c1235838df22bd02445507f60744bb6759/typeid_python-0.3.2.tar.gz", hash = "sha256:07d176af35ba75a10721ffd73f70e9582bc2705d3b4cb3d8df956e3221eaf2a6", size = 6934 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/24/f5f6373f168c362c861c89fc7f7b3750968784ab90b0162bdc6cf77ad0bf/typeid_python-0.3.2-py3-none-any.whl", hash = "sha256:d4fc91e12152df9f7a468655c5fbd1824fb1b706a19ffdce0e7fcef4520ed139", size = 7229 }, +] + [[package]] name = "typing-extensions" version = "4.12.2" @@ -1016,6 +1030,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c8/19/4ec628951a74043532ca2cf5d97b7b14863931476d117c471e8e2b1eb39f/urllib3-2.3.0-py3-none-any.whl", hash = "sha256:1cee9ad369867bfdbbb48b7dd50374c0967a0bb7710050facf0dd6911440e3df", size = 128369 }, ] +[[package]] +name = "uuid6" +version = "2024.7.10" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/2c/56/2560a9f1ccab9e12b1b3478a3c870796cf4d8ee5652bb19b61751cced14a/uuid6-2024.7.10.tar.gz", hash = "sha256:2d29d7f63f593caaeea0e0d0dd0ad8129c9c663b29e19bdf882e864bedf18fb0", size = 8705 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d3/3e/4ae6af487ce5781ed71d5fe10aca72e7cbc4d4f45afc31b120287082a8dd/uuid6-2024.7.10-py3-none-any.whl", hash = "sha256:93432c00ba403751f722829ad21759ff9db051dea140bf81493271e8e4dd18b7", size = 6376 }, +] + [[package]] name = "virtualenv" version = "20.29.3"