code-anyway · astaff · Aug 21, 2024 · Aug 21, 2024 · Aug 22, 2024 · Aug 22, 2024
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -4,5 +4,6 @@
     ],
     "python.testing.unittestEnabled": false,
     "python.testing.pytestEnabled": true,
-    "python.analysis.typeCheckingMode": "basic"
+    "python.analysis.typeCheckingMode": "basic",
+    "makefile.configureOnOpen": false
 }
diff --git a/examples/audio_to_paper.sh b/examples/audio_to_paper.sh
@@ -132,7 +132,10 @@ echo "Generating Documents..."
     sed -E 's/\[\[([0-9]+)\]\]\([^)]+\)//g' |
     sed -E 's/\[([0-9]+)\]//g' |
     tee \
-        >(pandoc -o "$(echo "$TITLE" | sed 's/[^a-zA-Z0-9]/_/g')-no-refs.pdf" --from markdown --pdf-engine=xelatex) >/dev/null
+        >(pandoc -o "$(echo "$TITLE" | sed 's/[^a-zA-Z0-9]/_/g')-no-refs.pdf" --from markdown --pdf-engine=xelatex) \
+        >(pandoc -o "$(echo "$TITLE" | sed 's/[^a-zA-Z0-9]/_/g')-no-refs.docx" --from markdown) \
+        >(cat >"$(echo "$TITLE" | sed 's/[^a-zA-Z0-9]/_/g')-no-refs.md") \
+        >/dev/null
 
 # With References
 (
@@ -149,33 +152,6 @@ echo "Generating Documents..."
     tee \
         >(pandoc -o "$(echo "$TITLE" | sed 's/[^a-zA-Z0-9]/_/g')-refs.pdf" --from markdown+header_attributes --pdf-engine=xelatex) >/dev/null
 
-# Check if images are requested
-if [ "$IMAGES" = true ]; then
-    echo "Extracting images..."
-    IMAGE_OUTPUT=$(plato --images "$URL" --lang "$LANG" | sed '/^$/d' | sed -e :a -e '/^\n*$/{$d;N;ba' -e '}')
-
-    if [ ! -z "$IMAGE_OUTPUT" ]; then
-        # Create a temporary directory for images
-        TMP_IMG_DIR=$(mktemp -d)
-
-        # Save image paths to temporary files
-        echo "$IMAGE_OUTPUT" | while read -r img_path; do
-            cp ".platogram-cache/$img_path" "$TMP_IMG_DIR/"
-        done
-
-        # Create zip file with images
-        ZIP_FILE="$(echo "$TITLE" | sed 's/[^a-zA-Z0-9]/_/g')-images.zip"
-        zip -j "$ZIP_FILE" "$TMP_IMG_DIR"/*
-
-        # Clean up temporary directory
-        rm -rf "$TMP_IMG_DIR"
-
-        echo "Images saved to $ZIP_FILE"
-    else
-        echo "No images found or extracted."
-    fi
-fi
-
 wait
 
 if [ "$VERBOSE" = true ]; then

diff --git a/platogram/cli.py b/platogram/cli.py
@@ -77,17 +77,18 @@ def process_url(
     with tqdm(total=4, desc=f"Processing {url}", file=sys.stderr) as pbar:
         transcript = plato.extract_transcript(url, asr, lang=lang)
         pbar.update(1)
-        pbar.set_description("Indexing content")
-        content = plato.index(transcript, llm, lang=lang)
-        pbar.update(1)
+        images = []
         if extract_images:
             pbar.set_description("Extracting images")
             images_dir = library.home / id
             images_dir.mkdir(exist_ok=True)
-            timestamps_ms = [event.time_ms for event in content.transcript]
+            timestamps_ms = [event.time_ms for event in transcript]
             images = ingest.extract_images(url, images_dir, timestamps_ms)
-            content.images = [str(image.relative_to(library.home)) for image in images]
+            images = {i: image for i, image in enumerate(images)}
             pbar.update(1)
+        pbar.set_description("Indexing content")
+        content = plato.index(transcript, images, llm, lang=lang)
+        pbar.update(1)
         pbar.set_description("Saving content")
         library.put(id, content)
         pbar.update(1)
@@ -242,17 +243,20 @@ def get_chapter(passage_marker: int) -> int | None:
             passages = ""
             if args.chapters:
                 current_chapter = None
-                for passage in content.passages:
-                    passage_markers = [int(m) for m in re.findall(r"【(\d+)】", passage)]
-                    chapter_marker = get_chapter(passage_markers[0]) if passage_markers else None
-                    if chapter_marker is not None and chapter_marker != current_chapter:
-                        passages += f"### {content.chapters[chapter_marker]}\n\n"
-                        current_chapter = chapter_marker
-                    passages += f"{passage.strip()}\n\n"
+                for text in content.text:
+                    for passage in text.split("\n\n"):
+                        passage_markers = [int(m) for m in re.findall(r"【(\d+)】", passage)]
+                        chapter_marker = get_chapter(passage_markers[0]) if passage_markers else None
+                        if chapter_marker is not None and chapter_marker != current_chapter:
+                            passages += f"### {content.chapters[chapter_marker]}\n\n"
+                            current_chapter = chapter_marker
+                        passages += f"{passage.strip()}\n\n"
+                        if content.figures:
+                            for image_marker in set(passage_markers) & set(content.figures):
+                                figure_html = f"![{content.figures[image_marker][0]}]({content.figures[image_marker][1]}){{ width=80% }}\n\n"
+                                passages += figure_html
             else:
-                passages = "\n\n".join(
-                    passage.strip() for passage in content.passages
-                )
+                passages = content.text
 
             result += f"""{passages}\n\n\n\n"""
 

diff --git a/platogram/ingest.py b/platogram/ingest.py
@@ -1,19 +1,18 @@
 import logging
 import mimetypes
+import subprocess
 from functools import lru_cache
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
 import requests  # type: ignore
 from yt_dlp import YoutubeDL  # type: ignore
-import subprocess
 
-from platogram.parsers import parse_subtitles, parse_waffly
 from platogram.asr import ASRModel
+from platogram.parsers import parse_subtitles, parse_waffly
 from platogram.types import SpeechEvent
 from platogram.utils import get_sha256_hash
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -164,33 +163,35 @@ def extract_images(
 
     image_paths = []
     try:
-        for timestamp_ms in timestamps_ms:
-            timestamp_s = timestamp_ms / 1000
-            image_path = Path(output_dir) / f"image_{timestamp_ms:09d}.png"
-
-            subprocess.run(
-                [
-                    "ffmpeg",
-                    "-ss",
-                    f"{timestamp_s:.3f}",
-                    "-i",
-                    str(video_path),
-                    "-frames:v",
-                    "1",
-                    "-q:v",
-                    "2",
-                    "-f",
-                    "image2",
-                    str(image_path),
-                ],
-                check=True,
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
-            )
-
-            image_paths.append(image_path)
+        # Convert timestamps to seconds and create the select filter
+        timestamps_s = sorted(list(set(ts // 1000 for ts in timestamps_ms)))
+        select_filter = '+'.join([f"eq(t,{t})" for t in timestamps_s])
+
+        # Construct the FFmpeg command
+        ffmpeg_command = [
+            "ffmpeg",
+            "-i", str(video_path),
+            "-vf", f"select='{select_filter}',setpts=N/FRAME_RATE/TB",
+            "-vsync", "0",
+            "-q:v", "2",
+            "-f", "image2",
+            str(output_dir / "image_%d.png")
+        ]
+
+        # Run FFmpeg command to extract all images at once
+        subprocess.run(
+            ffmpeg_command,
+            check=True,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+
+        # Map each requested timestamp to the frame expected for its whole second (assumes image_%d numbering is 1-based, in sorted-second order)
+        image_paths = [output_dir / f"image_{timestamps_s.index(ts // 1000) + 1}.png" for ts in timestamps_ms]
+
+        if not image_paths:
+            raise RuntimeError("No images were extracted from the video.")
     finally:
-        # Delete the downloaded video file
         if video_path:
             video_path.unlink()
 

diff --git a/platogram/llm/__init__.py b/platogram/llm/__init__.py
@@ -1,5 +1,6 @@
 from typing import Protocol, Literal, Generator, Sequence
 from platogram.types import Content, User, Assistant
+from pathlib import Path
 
 
 class LanguageModel(Protocol):
@@ -10,8 +11,12 @@ def get_meta(
     ) -> tuple[str, str]: ...
 
     def get_chapters(
-        self, passages: list[str], max_tokens: int = 4096, temperature: float = 0.5, lang: str | None = None
+        self, passages: list[str], context: dict[int, str], chapter_size_words: int, temperature: float = 0.5, lang: str | None = None
     ) -> dict[int, str]: ...
+
+    def expand_chapter_text(self, chapter_text: str, images: dict[int, Path]) -> str: ...
+
+    def get_figures(self, chapter_text: str, images: list[Path]) -> dict[int, str]: ...
 
     def get_paragraphs(
         self,