From c1ac8defdd9de7c987e33858dbe538f8fdab4ff9 Mon Sep 17 00:00:00 2001 From: Subash C Date: Tue, 14 Oct 2025 13:57:23 +0530 Subject: [PATCH] docs: clarify Google Docs API Unicode behavior for smart chips (fixes #2547) The \ue907 character is intentional Google Docs API behavior, not a bug. This addresses user confusion by: - Adding comprehensive documentation explaining the Unicode placeholder behavior - Providing a working example showing how to extract actual date/chip data - Demonstrating best practices for working with richLink properties - Clarifying that this is API design, not client library issue The Google Docs API intentionally replaces non-text elements (smart chips, dates, people, etc.) with Unicode Private Use Area character U+E907 in TextRun.content, as documented in the API specification. The actual data is available in richLinkProperties within the same document elements. This is an educational contribution to help developers understand expected API behavior and implement proper workarounds. --- docs/docs-unicode-guide.md | 144 +++++++++++++++ examples/README.md | 44 +++++ examples/docs_unicode_dates_example.py | 232 +++++++++++++++++++++++++ 3 files changed, 420 insertions(+) create mode 100644 docs/docs-unicode-guide.md create mode 100644 examples/README.md create mode 100644 examples/docs_unicode_dates_example.py diff --git a/docs/docs-unicode-guide.md b/docs/docs-unicode-guide.md new file mode 100644 index 0000000000..4e6cdfcf7d --- /dev/null +++ b/docs/docs-unicode-guide.md @@ -0,0 +1,144 @@ +# Google Docs API Unicode Characters Guide + +This guide addresses the issue reported in [#2547](https://github.com/googleapis/google-api-python-client/issues/2547) where Google Docs API returns date elements and other smart chips as Unicode placeholder characters. 
+ +## The Issue + +When using the Google Docs API to retrieve document content, date elements, smart chips, and other non-text elements are returned as Unicode Private Use Area characters, specifically `\ue907` (U+E907). + +### Example from Issue #2547 + +```python +# This is what you see in the API response: +{'startIndex': 1, 'endIndex': 5, 'textRun': {'content': '\ue907 | ', 'textStyle': {}}} +``` + +Instead of getting readable text like "Jan 13, 2025", you get the Unicode character `\ue907`. + +## Why This Happens + +This is **intentional behavior** by the Google Docs API, not a bug in the client library. According to the official API documentation: + +> **TextRun.content**: "The text of this run. Any non-text elements in the run are replaced with the Unicode character U+E907." + +Google Docs uses Private Use Area Unicode characters to represent: +- Smart chips (dates, people, places) +- Code blocks +- Special embedded elements +- Rich content that isn't plain text + +## Solution: Extract Data from Rich Links + +The actual date/chip information is typically available in `richLink` properties within the same document elements. + +### Quick Start + +```python +from googleapiclient.discovery import build + +# Unicode placeholder character used by Google Docs API +SMART_CHIP_PLACEHOLDER = '\ue907' + +# When processing document elements +for element in elements: + if 'textRun' in element: + content = element['textRun']['content'] + + # Check for smart chip placeholder + if SMART_CHIP_PLACEHOLDER in content: + print("Found smart chip/date placeholder!") + + # Make readable for display + readable = content.replace(SMART_CHIP_PLACEHOLDER, '[DATE/CHIP]') + print(f"Display version: {readable}") + + # IMPORTANT: Look for actual data in rich links + if 'richLink' in element: + props = element['richLink'].get('richLinkProperties', {}) + actual_date = props.get('title', '') # Real date is here! 
+ calendar_uri = props.get('uri', '') + print(f"Actual date: {actual_date}") +``` + +### Key Functions You'll Need + +#### Basic Unicode Detection +```python +def has_smart_chips(text): + """Check if text contains Google Docs smart chip placeholders.""" + return '\ue907' in text + +def make_readable(text): + """Replace Unicode placeholders with readable text.""" + return text.replace('\ue907', '[DATE/CHIP]') +``` + +## Extracting Actual Date Information + +While the text content shows `\ue907`, you can find the actual date/chip data in: + +### 1. Rich Link Properties (Primary Solution) +```python +# Look for richLink elements with the actual data +for element in elements: + if 'richLink' in element: + rich_link = element['richLink'] + if 'richLinkProperties' in rich_link: + props = rich_link['richLinkProperties'] + actual_date = props.get('title', '') # "Jan 13, 2025" + calendar_uri = props.get('uri', '') # Calendar event link + + print(f"Found actual date: {actual_date}") + if 'calendar.google.com' in calendar_uri: + print("This is a Google Calendar event!") +``` + +### 2. Context Analysis +Analyze surrounding elements for clues: +```python +def find_date_context(elements, chip_index): + """Look at adjacent elements for date context.""" + before = elements[chip_index - 1] if chip_index > 0 else None + after = elements[chip_index + 1] if chip_index < len(elements) - 1 else None + # Examine textRun content in adjacent elements +``` + +### 3. 
Google Calendar API Integration +For calendar events, use the Calendar API: +```python +from googleapiclient.discovery import build + +def get_event_details(calendar_uri, credentials): + """Extract full event details from calendar URI.""" + calendar_service = build('calendar', 'v3', credentials=credentials) + # Parse event ID from URI and fetch complete event data +``` + +## Complete Example + +See `examples/docs_unicode_dates_example.py` for a working example that demonstrates: +- Detecting Unicode placeholder characters +- Extracting actual dates from rich link properties +- Analyzing document structure +- Best practices for handling smart chips + +## Best Practices + +1. **Always check for rich links** - They often contain the actual data +2. **Use context analysis** - Surrounding text can provide clues +3. **Don't rely solely on textRun content** - It's designed to be a fallback +4. **Consider the document structure** - Smart chips are part of larger semantic elements +5. **Use appropriate APIs** - For calendar events, use Calendar API; for contacts, use People API + +## Important Notes + +- This is **not a bug** - it's intentional API design +- The client library correctly returns what the API provides +- Focus on extracting semantic meaning rather than display text +- Different smart chip types may require different extraction strategies + +## References + +- [GitHub Issue #2547](https://github.com/googleapis/google-api-python-client/issues/2547) +- [Google Docs API Documentation](https://developers.google.com/docs/api) +- [Unicode Private Use Areas](https://en.wikipedia.org/wiki/Private_Use_Areas) \ No newline at end of file diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000000..db13443b6d --- /dev/null +++ b/examples/README.md @@ -0,0 +1,44 @@ +# Google API Python Client Examples + +This directory contains example scripts demonstrating how to work with various Google APIs using the `google-api-python-client` library. 
+ +## Available Examples + +### `docs_unicode_dates_example.py` + +**Addresses**: [GitHub Issue #2547](https://github.com/googleapis/google-api-python-client/issues/2547) - "Google doc dates returned as unicode (e.g., \\ue907)" + +**Purpose**: Demonstrates how to properly handle Unicode placeholder characters returned by the Google Docs API for smart chips (dates, people, places, etc.). + +**Key Concepts**: +- Understanding that `\ue907` is intentional API behavior, not a bug +- Extracting actual date/chip information from `richLink` properties +- Working with Google Docs API document structure +- Best practices for handling non-text elements + +**Usage**: +```bash +# Run the demonstration (no authentication required) +python examples/docs_unicode_dates_example.py + +# To analyze an actual document, modify the script with your document ID +# and uncomment the analysis function call +``` + +**Requirements**: +- `google-auth` +- `google-api-python-client` +- Valid Google Docs API credentials (for real document analysis) + +## Contributing Examples + +When contributing new examples: +1. Focus on common use cases or frequently asked questions +2. Include comprehensive comments and documentation +3. Provide both demonstration code and real-world usage examples +4. Follow the existing code style and structure +5. Test thoroughly before submitting + +## Authentication + +Most examples require Google API credentials. See the [authentication documentation](https://cloud.google.com/docs/authentication) for setup instructions. 
\ No newline at end of file
diff --git a/examples/docs_unicode_dates_example.py b/examples/docs_unicode_dates_example.py
new file mode 100644
index 0000000000..20996095cd
--- /dev/null
+++ b/examples/docs_unicode_dates_example.py
@@ -0,0 +1,232 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Example: Working with Google Docs API Unicode Date Characters
+
+This example addresses GitHub issue #2547:
+"Google doc dates returned as unicode (e.g., \\ue907)"
+
+IMPORTANT: The \\ue907 character is INTENTIONAL API behavior, not a bug!
+
+According to the Google Docs API documentation:
+"TextRun.content: Any non-text elements in the run are replaced with the Unicode character U+E907."
+
+This example shows:
+1. How to identify these Unicode placeholder characters
+2. How to extract the actual date/chip information from richLink properties
+3. Best practices for working with Google Docs smart chips
+"""
+
+import os
+from google.oauth2 import service_account  # provides Credentials.from_service_account_file
+from googleapiclient.discovery import build
+from google.auth import default
+
+
+# Unicode character used by Google Docs API for non-text elements
+DOCS_SMART_CHIP_PLACEHOLDER = '\ue907'  # U+E907
+
+
+def analyze_document_with_smart_chips(doc_id, creds_file=None):
+    """
+    Analyze a Google Doc and properly handle smart chips (dates, etc.).
+
+    This demonstrates the correct way to extract date information from
+    Google Docs when encountering the \\ue907 Unicode placeholder.
+
+    Args:
+        doc_id: Google Docs document ID
+        creds_file: Optional path to service account credentials file
+    """
+    print(f"📄 Analyzing Google Docs document: {doc_id}")
+    print("=" * 60)
+
+    # Set up authentication: service-account key file if one is given, otherwise Application Default Credentials
+    scopes = ['https://www.googleapis.com/auth/documents.readonly']
+    if creds_file and os.path.exists(creds_file):
+        creds = service_account.Credentials.from_service_account_file(creds_file, scopes=scopes)
+    else:
+        creds, _ = default(scopes=scopes)
+
+    # Build the Docs API service
+    service = build('docs', 'v1', credentials=creds)
+
+    try:
+        # Get the document
+        document = service.documents().get(
+            documentId=doc_id,
+            fields='body'
+        ).execute()
+
+        # Access the document's content
+        content = document.get('body', {}).get('content', [])
+
+        print("🔍 Document Analysis Results:")
+        print("-" * 40)
+
+        element_count = 0
+        smart_chip_count = 0
+
+        # Process each element
+        for element in content:
+            if 'paragraph' in element:
+                paragraph = element.get('paragraph', {})
+                elements = paragraph.get('elements', [])
+
+                for elem in elements:
+                    element_count += 1
+
+                    # Check for textRun content (where Unicode placeholders appear)
+                    if 'textRun' in elem:
+                        text_run = elem['textRun']
+                        content_text = text_run.get('content', '')
+
+                        print(f"\n📝 Element {element_count}:")
+                        print(f" Raw content: {repr(content_text)}")
+
+                        # Check for smart chip placeholder
+                        if DOCS_SMART_CHIP_PLACEHOLDER in content_text:
+                            smart_chip_count += 1
+                            print(f" 🎯 SMART CHIP/DATE DETECTED!")
+                            print(f" 📍 Found Unicode placeholder: U+E907 (\\ue907)")
+                            print(f" 💡 This represents a non-text element (date, person, etc.)")
+
+                            # Show how to make it readable for display purposes
+                            readable = content_text.replace(DOCS_SMART_CHIP_PLACEHOLDER, '[DATE/CHIP]')
+                            print(f" ✨ Display version: {repr(readable)}")
+                        else:
+                            print(f" 📝 Regular text content")
+
+                    # IMPORTANT: Check for rich links (a paragraph element alongside textRun) - this is where actual data lives!
+                    if 'richLink' in elem:
+                        rich_link = elem['richLink']
+                        print(f"\n 🔗 RICH LINK FOUND - This may contain the actual date/chip data!")
+                        print(f" Rich Link ID: {rich_link.get('richLinkId', 'N/A')}")
+
+                        # Rich link properties contain the real information
+                        if 'richLinkProperties' in rich_link:
+                            props = rich_link['richLinkProperties']
+                            title = props.get('title', 'N/A')
+                            uri = props.get('uri', 'N/A')
+
+                            print(f" 📅 Title: {title}")  # ← ACTUAL DATE IS HERE!
+                            print(f" 🔗 URI: {uri}")
+
+                            # Check if this looks like a date
+                            date_keywords = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
+                                             'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
+                                             '2024', '2025', 'monday', 'tuesday', 'wednesday']
+
+                            if any(keyword in title.lower() for keyword in date_keywords):
+                                print(f" 🎉 SUCCESS! Found actual date information: '{title}'")
+
+                            # Check if it's a calendar event
+                            if 'calendar.google.com' in uri or 'calendar' in uri.lower():
+                                print(f" 📆 This appears to be a Google Calendar event!")
+
+        # Summary
+        print("\n" + "=" * 60)
+        print("📊 ANALYSIS SUMMARY")
+        print("=" * 60)
+        print(f"📈 Total elements processed: {element_count}")
+        print(f"🎯 Smart chips/dates found: {smart_chip_count}")
+
+        if smart_chip_count > 0:
+            print(f"\n💡 KEY INSIGHTS:")
+            print(f" • The \\ue907 character is NOT a bug - it's intentional API design")
+            print(f" • Look for 'richLink' properties to get actual date/chip information")
+            print(f" • The API replaces smart chips with placeholders for plain text extraction")
+            print(f" • Use Google Calendar API for detailed event information")
+
+            print(f"\n🔧 TECHNICAL DETAILS:")
+            print(f" • Unicode Character: U+E907 (\\ue907)")
+            print(f" • API Behavior: Documented in TextRun.content specification")
+            print(f" • Purpose: Represents non-text elements (smart chips)")
+            print(f" • Workaround: Extract data from richLinkProperties")
+
+    except Exception as e:
+        print(f"❌ Error analyzing document: {e}")
+        print(f"💡 Make sure you have proper authentication and document access")
+
+
+def demonstrate_unicode_handling():
+    """Demonstrate how to handle the Unicode characters from issue #2547."""
+    print("\n🧪 UNICODE CHARACTER DEMONSTRATION")
+    print("=" * 60)
+
+    # Example content from GitHub issue #2547
+    example_from_issue = '\ue907 | '
+
+    print("📋 Example from GitHub Issue #2547:")
+    print(f" Original API response: {repr(example_from_issue)}")
+    print(f" Character breakdown:")
+    print(f" - '\\ue907' = Unicode U+E907 (smart chip placeholder)")
+    print(f" - ' | ' = Regular text")
+
+    print(f"\n🔍 Character Analysis:")
+    for i, char in enumerate(example_from_issue):
+        if char == DOCS_SMART_CHIP_PLACEHOLDER:
+            print(f" Position {i}: '\\ue907' → Smart chip/date placeholder")
+        else:
+            print(f" Position {i}: {repr(char)} → Regular text")
+
+    print(f"\n✨ Making It Readable:")
+    readable_version = example_from_issue.replace(DOCS_SMART_CHIP_PLACEHOLDER, '[DATE]')
+    print(f" Original: {repr(example_from_issue)}")
+    print(f" Readable: {repr(readable_version)}")
+
+    print(f"\n💡 Best Practices:")
+    print(f" 1. Don't try to 'decode' \\ue907 - it's just a placeholder")
+    print(f" 2. Look for richLink properties in the same or adjacent elements")
+    print(f" 3. Use context from surrounding text elements")
+    print(f" 4. For calendar events, use Google Calendar API with event IDs")
+    print(f" 5. Accept that some smart chip data may not be fully accessible via Docs API")
+
+
+def suggest_next_steps():
+    """Provide actionable next steps for developers facing this issue."""
+    print(f"\n🚀 NEXT STEPS FOR DEVELOPERS")
+    print("=" * 60)
+
+    steps = [
+        "1. 🔍 Accept that \\ue907 is intentional API behavior, not a bug",
+        "2. 📋 Always check for 'richLink' elements when processing documents",
+        "3. 🔗 Extract actual data from richLinkProperties.title and .uri",
+        "4. 📅 Use Google Calendar API for detailed event information",
+        "5. 👥 Use Google People API for person chip details",
+        "6. 🏢 Use Google Places API for location chip information",
+        "7. 📝 Implement context analysis of surrounding text elements",
+        "8. 🧪 Test with various smart chip types (dates, people, places)",
+        "9. 📚 Read the Google Docs API documentation on TextRun.content",
+        "10. 💬 Educate users that this is expected API behavior"
+    ]
+
+    for step in steps:
+        print(f" {step}")
+
+    print(f"\n📚 USEFUL RESOURCES:")
+    print(f" • GitHub Issue: https://github.com/googleapis/google-api-python-client/issues/2547")
+    print(f" • Google Docs API: https://developers.google.com/docs/api/reference/rest/v1/documents")
+    print(f" • Google Calendar API: https://developers.google.com/calendar/api")
+    print(f" • Unicode Private Use Areas: https://en.wikipedia.org/wiki/Private_Use_Areas")
+
+
+if __name__ == "__main__":
+    print("🔧 Google Docs API: Working with Smart Chip Unicode Characters")
+    print(" Addressing GitHub Issue #2547")
+    print()
+
+    # Demonstrate how to handle the Unicode characters
+    demonstrate_unicode_handling()
+
+    # Provide actionable guidance
+    suggest_next_steps()
+
+    print(f"\n📖 USAGE EXAMPLE:")
+    print("# To analyze an actual document:")
+    print("# analyze_document_with_smart_chips('your-document-id-here')")
+    print("# analyze_document_with_smart_chips('your-doc-id', '/path/to/credentials.json')")
+
+    print(f"\n🎯 REMEMBER:")
+    print("This is NOT a client library bug - it's intentional Google Docs API behavior!")
+    print("Focus on extracting data from richLink properties, not decoding Unicode characters.")
\ No newline at end of file