59 changes: 32 additions & 27 deletions PromptClip_multimodal.ipynb
@@ -87,17 +87,17 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()\n",
" transcript_text = video.get_transcript()\n",
"except Exception:\n",
" video.index_spoken_words()\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()"
" transcript_text = video.get_transcript()"
]
},
{
@@ -156,6 +156,18 @@
"The `multimodal_promper` will return sentences which are visual description from matched chunks. Review these to ensure they align with your needs. You can then use these segments to create your clip, focusing on the content that's most relevant to your use case."
]
},
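For intuition, here is a minimal sketch of the kind of pairing a multimodal prompter performs: align each scene's visual description with the words spoken over the same interval, then let the LLM judge the combined chunk. This is only an illustration; the real logic lives in `video_prompter.py`, and the field names below are assumptions about the transcript/scene shapes.

```python
# Illustrative sketch only -- "start", "end", "text" and "description"
# are assumed field names, not confirmed VideoDB schema.
def pair_scenes_with_speech(transcript, scenes):
    chunks = []
    for scene in scenes:
        spoken = " ".join(
            w["text"] for w in transcript
            if w["start"] >= scene["start"] and w["end"] <= scene["end"]
        )
        chunks.append(f"VISUAL: {scene['description']}\nSPOKEN: {spoken}")
    return chunks
```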
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llm_agent import LLM, LLMType, Models\n",
"\n",
"# Change the model/provider here if you want to experiment.\n",
"llm = LLM(llm_type=LLMType.OPENAI, model=Models.GPT_4O)"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -169,20 +181,14 @@
"# If we create a clip using only the spoken index data, we won't know where the infographics are.\n",
"# If we create a clip using only the visual index data, we may include additional infographics that aren't actually about the rules but might appear to be rule explanations based on visual information.\n",
"# By creating a clip using combined data, we achieve a much more precise intersection where the infographics are present, and the rules are being explained.\n",
"result = multimodal_prompter(transcript, scenes, user_prompt)"
"result = multimodal_prompter(transcript, scenes, user_prompt, llm=llm)"
]
},
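Before building the clip, it helps to eyeball a few matches. Assuming each result item carries timestamps alongside its text, as the tuple unpacking in `PromptClip_spoken.ipynb` suggests after this change, a quick check might look like:

```python
# Optional sanity check -- assumes result items are (start, end, text) tuples.
for start, end, text in result[:5]:
    print(f"[{start:.1f}-{end:.1f}s] {text}")
```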
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate The Clip\n",
"To generate a clip, first we'll call `get_result_timestamps` from `video_prompter.py` it uses VideoDB's `keyword search` feature. We already leveraged the power of the LLM (Large Language Model) to identify relevant sentences. We'll use the search results to create a programmable video stream. Here's how you can approach this process:\n",
"\n",
"We have the descriptions in the `results` variable. We input these keywords into VideoDB's keyword search feature. This function will search through the indexed scenes of your videos to find matches.\n",
"\n",
"The search will return a SearchResult object, which contains detailed information about the found segments, including their timestamps, the text of the scene description.\n",
"\n",
"**Create a Programmable Video Stream with Timeline:** With the specific segments identified, you can now use `build_video_timeline` from `video_prompter.py` to get the Timeline to create a new programmable video stream. The Timeline tool allows you to stitch together video segments based on the timestamps provided in the SearchResult object. You can arrange, cut, or combine these segments to craft a fresh video stream that focuses on your topic of interest."
]
},
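A minimal sketch of what `build_video_timeline` plausibly does with the prompter results, assuming (start, end, text) items; the actual implementation lives in `video_prompter.py`:

```python
from videodb.timeline import VideoAsset

# Sketch, not the real implementation: stitch each matched segment
# into the timeline inline and report the total duration.
def build_video_timeline_sketch(video, results, timeline):
    duration = 0.0
    for start, end, _text in results:
        timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end))
        duration += end - start
    return timeline, duration
```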
@@ -195,11 +201,10 @@
"from videodb import play_stream\n",
"from videodb.timeline import Timeline\n",
"\n",
"from video_prompter import get_result_timestamps, build_video_timeline\n",
"from video_prompter import build_video_timeline\n",
"\n",
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"timeline, duration = build_video_timeline(video, result, timeline)\n",
"stream = timeline.generate_stream()\n",
"play_stream(stream)"
]
@@ -222,14 +227,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from video_prompter import text_prompter\n",
"\n",
"user_prompt = \"find the scene explaining the cricket rules using infographics, remember rules with infographics.\"\n",
"text_result = text_prompter(transcript_text, user_prompt)"
"text_result = text_prompter(transcript, user_prompt, llm=llm)"
]
},
{
@@ -239,8 +244,8 @@
"outputs": [],
"source": [
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, text_result, index_type=\"spoken_word\")\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"\n",
"timeline, duration = build_video_timeline(video, text_result, timeline)\n",
"stream = timeline.generate_stream()\n",
"play_stream(stream)"
]
@@ -263,14 +268,14 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from video_prompter import scene_prompter\n",
"\n",
"user_prompt = \"find the scene explaining the cricket rules using infographics, remember rules with infographics.\"\n",
"scene_result = scene_prompter(scenes, user_prompt)"
"scene_result = scene_prompter(scenes, user_prompt, llm=llm)"
]
},
{
@@ -280,8 +285,8 @@
"outputs": [],
"source": [
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, scene_result, scene_index_id=scene_index_id)\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"\n",
"timeline, duration = build_video_timeline(video, scene_result, timeline)\n",
"stream = timeline.generate_stream()\n",
"play_stream(stream)"
]
@@ -341,11 +346,11 @@
"source": [
"try:\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()\n",
" transcript_text = video.get_transcript()\n",
"except Exception:\n",
" video.index_spoken_words()\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()"
" transcript_text = video.get_transcript()"
]
},
{
@@ -373,7 +378,7 @@
"from video_prompter import multimodal_prompter\n",
"\n",
"user_prompt = \"\"\n",
"result = multimodal_prompter(transcript, scenes, user_prompt)"
"result = multimodal_prompter(transcript, scenes, user_prompt, llm=llm)"
]
},
{
@@ -385,11 +390,11 @@
"from videodb import play_stream\n",
"from videodb.timeline import Timeline\n",
"\n",
"from video_prompter import get_result_timestamps, build_video_timeline\n",
"from video_prompter import build_video_timeline\n",
"\n",
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"\n",
"timeline, duration = build_video_timeline(video, result, timeline)\n",
"stream = timeline.generate_stream()\n",
"play_stream(stream)"
]
99 changes: 41 additions & 58 deletions PromptClip_spoken.ipynb
@@ -78,15 +78,15 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" transcript_text = video.get_transcript_text()\n",
" transcript_text = video.get_transcript()\n",
"except Exception:\n",
" video.index_spoken_words()\n",
" transcript_text = video.get_transcript_text()"
" transcript_text = video.get_transcript()"
]
},
{
@@ -102,6 +102,18 @@
"The `text_prompter` will return sentences or segments from the video that match your prompt. Review these to ensure they align with your needs. You can then use these segments to create your clip, focusing on the content that's most relevant to your use case."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llm_agent import LLM, LLMType, Models\n",
"\n",
"# Change the model/provider here if you want to experiment.\n",
"llm = LLM(llm_type=LLMType.OPENAI, model=Models.GPT_4O)"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -114,7 +126,7 @@
"\n",
"# Choose a prompt to create create clip. \n",
"user_prompt = \"find sentences where a deal is discussed\"\n",
"result = text_prompter(transcript_text, user_prompt)\n",
"result = text_prompter(transcript_text, user_prompt, llm=llm)\n",
"print(f\"Found {len(result)} segments in the video.\")"
]
},
@@ -124,12 +136,6 @@
"source": [
"### Generate the Clip\n",
"\n",
"To generate a clip, we'll use **VideoDB**'s `keyword search` feature. We already leveraged the power of the LLM (Large Language Model) to identify relevant sentences. We'll use the search results to create a `programmable video stream`. Here's how you can approach this process:\n",
"\n",
"We have the keywords in the `results` variable. Input these keywords into VideoDB's keyword search feature. This function will search through the indexed spoken content of your videos to find matches. \n",
"\n",
"The search will return a SearchResult object, which contains detailed information about the found segments, including their timestamps, the text of the spoken content, and possibly other metadata.\n",
"\n",
"**Create a Programmable Video Stream with Timeline**: With the specific segments identified, you can now use the Timeline to create a new programmable video stream. The Timeline tool allows you to stitch together video segments based on the timestamps provided in the SearchResult object. You can arrange, cut, or combine these segments to craft a fresh video stream that focuses on your topic of interest."
]
},
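For reference, the Timeline primitives this notebook relies on look like this (a minimal sketch reusing the notebook's `conn` and `video`; the start/end values are illustrative):

```python
from videodb import play_stream
from videodb.timeline import Timeline, VideoAsset

timeline = Timeline(conn)
# Stitch two illustrative segments back to back.
timeline.add_inline(VideoAsset(asset_id=video.id, start=10.0, end=25.0))
timeline.add_inline(VideoAsset(asset_id=video.id, start=42.0, end=55.0))
play_stream(timeline.generate_stream())
```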
@@ -141,11 +147,11 @@
"source": [
"from videodb import play_stream\n",
"from videodb.timeline import Timeline\n",
"from video_prompter import get_result_timestamps, build_video_timeline\n",
"from video_prompter import build_video_timeline\n",
"\n",
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, result, index_type=\"spoken_word\")\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"\n",
"timeline, duration = build_video_timeline(video, result, timeline)\n",
"stream = timeline.generate_stream()\n",
"print(stream)\n",
"play_stream(stream)"
@@ -214,7 +220,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -258,10 +264,10 @@
" rank and give score to each result\n",
" \"\"\"\n",
" res_score = []\n",
" for text in res:\n",
" res_score.append((text, ranking_prompt_llm(text,prompt)))\n",
" for start, end, text in res:\n",
" res_score.append((start, end, text, ranking_prompt_llm(text,prompt)))\n",
" \n",
" res_score_sorted = sorted(res_score, key=lambda x: x[1], reverse=True)\n",
" res_score_sorted = sorted(res_score, key=lambda x: x[-1], reverse=True)\n",
" return res_score_sorted[0: floor(len(res_score_sorted)*score_percentage)]"
]
},
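A usage sketch for the ranking helper above. The enclosing function's name and signature are not shown in this hunk, so `rank_results(results, prompt, score_percentage)` is an assumption based on the docstring and on how `ranked_results` is unpacked below:

```python
# Hypothetical call -- keep the top-scoring half of the segments.
ranked_results = rank_results(result, user_prompt, score_percentage=0.5)

# Each entry is (start, end, text, score), per the append/sort above.
for start, end, text, score in ranked_results[:3]:
    print(f"score={score} [{start}-{end}] {text}")
```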
@@ -287,7 +293,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -296,19 +302,9 @@
"from videodb.timeline import Timeline, VideoAsset, AudioAsset\n",
"\n",
"timeline = Timeline(conn)\n",
"for sentences, score in ranked_results:\n",
" search_res = video.search(sentences, search_type=SearchType.keyword)\n",
" matched_segments = search_res.get_shots()\n",
"for start, end, sentence, score in ranked_results:\n",
" \n",
" # No exact match found\n",
" if len(matched_segments) == 0:\n",
" continue\n",
"\n",
" # Get the first matched video segment\n",
" video_shot = matched_segments[0]\n",
"\n",
" # Create a new Video Asset and add it to a timeline.\n",
" timeline.add_inline(VideoAsset(asset_id=video.id, start=video_shot.start, end=video_shot.end))"
" timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end))"
]
},
{
@@ -362,28 +358,15 @@
"source": [
"timeline = Timeline(conn)\n",
"dur_so_far = 0\n",
"for clip_sentences, score in ranked_results:\n",
" try:\n",
" search_res = video.search(clip_sentences, search_type=SearchType.keyword)\n",
" matched_segments = search_res.get_shots()\n",
" \n",
" # No exact match found\n",
" if len(matched_segments) == 0:\n",
" continue\n",
" \n",
" #video segment\n",
" video_shot = matched_segments[0]\n",
" \n",
" # Create a new Video Asset and add it to a timeline.\n",
" timeline.add_inline(VideoAsset(asset_id=video.id, start=video_shot.start, end=video_shot.end))\n",
" chunk_dur = (video_shot.end - video_shot.start)\n",
" dur_so_far += chunk_dur \n",
" if chunk_dur < 2:\n",
" print(\"Skipping since chunk duration is less then the overlay audio.\")\n",
" continue\n",
" timeline.add_overlay(dur_so_far-2, background)\n",
" except Exception as e:\n",
" print(f\"Error: skipping the segment {str(e)}\")"
"for start, end, clip_sentences, score in ranked_results:\n",
"\n",
" timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end))\n",
" chunk_dur = (end - start)\n",
" dur_so_far += chunk_dur \n",
" if chunk_dur < 2:\n",
" print(\"Skipping since chunk duration is less then the overlay audio.\")\n",
" continue\n",
" timeline.add_overlay(dur_so_far-2, background)"
]
},
{
@@ -443,11 +426,11 @@
"source": [
"try:\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()\n",
" transcript_text = video.get_transcript()\n",
"except Exception:\n",
" video.index_spoken_words()\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()"
" transcript_text = video.get_transcript()"
]
},
{
@@ -459,7 +442,7 @@
"from video_prompter import text_prompter\n",
"\n",
"user_prompt = \"\"\n",
"result = text_prompter(transcript_text, user_prompt)\n",
"result = text_prompter(transcript_text, user_prompt, llm=llm)\n",
"print(f\"Found {len(result)} segments in the video.\")"
]
},
@@ -471,11 +454,11 @@
"source": [
"from videodb import play_stream\n",
"from videodb.timeline import Timeline\n",
"from video_prompter import get_result_timestamps, build_video_timeline\n",
"from video_prompter import build_video_timeline\n",
"\n",
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, result, index_type=\"spoken_word\")\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"\n",
"timeline, duration = build_video_timeline(video, result, timeline)\n",
"stream = timeline.generate_stream()\n",
"print(stream)\n",
"play_stream(stream)"