59 changes: 32 additions & 27 deletions PromptClip_multimodal.ipynb
@@ -87,17 +87,17 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()\n",
" transcript_text = video.get_transcript()\n",
"except Exception:\n",
" video.index_spoken_words()\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()"
" transcript_text = video.get_transcript()"
]
},
{
@@ -156,6 +156,18 @@
"The `multimodal_promper` will return sentences which are visual description from matched chunks. Review these to ensure they align with your needs. You can then use these segments to create your clip, focusing on the content that's most relevant to your use case."
]
},
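For intuition, here is a minimal sketch of the kind of pairing a multimodal prompter performs: align each scene's visual description with the words spoken over the same interval, then let the LLM judge the combined chunk. This is only an illustration; the real logic lives in `video_prompter.py`, and the field names below are assumptions about the transcript/scene shapes.

```python
# Illustrative sketch only -- "start", "end", "text" and "description"
# are assumed field names, not confirmed VideoDB schema.
def pair_scenes_with_speech(transcript, scenes):
    chunks = []
    for scene in scenes:
        spoken = " ".join(
            w["text"] for w in transcript
            if w["start"] >= scene["start"] and w["end"] <= scene["end"]
        )
        chunks.append(f"VISUAL: {scene['description']}\nSPOKEN: {spoken}")
    return chunks
```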
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llm_agent import LLM, LLMType, Models\n",
"\n",
"# Change the model/provider here if you want to experiment.\n",
"llm = LLM(llm_type=LLMType.OPENAI, model=Models.GPT_4O)"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -169,20 +181,14 @@
"# If we create a clip using only the spoken index data, we won't know where the infographics are.\n",
"# If we create a clip using only the visual index data, we may include additional infographics that aren't actually about the rules but might appear to be rule explanations based on visual information.\n",
"# By creating a clip using combined data, we achieve a much more precise intersection where the infographics are present, and the rules are being explained.\n",
"result = multimodal_prompter(transcript, scenes, user_prompt)"
"result = multimodal_prompter(transcript, scenes, user_prompt, llm=llm)"
]
},
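Before building the clip, it helps to eyeball a few matches. Assuming each result item carries timestamps alongside its text, as the tuple unpacking in `PromptClip_spoken.ipynb` suggests after this change, a quick check might look like:

```python
# Optional sanity check -- assumes result items are (start, end, text) tuples.
for start, end, text in result[:5]:
    print(f"[{start:.1f}-{end:.1f}s] {text}")
```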
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Generate The Clip\n",
"To generate a clip, first we'll call `get_result_timestamps` from `video_prompter.py` it uses VideoDB's `keyword search` feature. We already leveraged the power of the LLM (Large Language Model) to identify relevant sentences. We'll use the search results to create a programmable video stream. Here's how you can approach this process:\n",
"\n",
"We have the descriptions in the `results` variable. We input these keywords into VideoDB's keyword search feature. This function will search through the indexed scenes of your videos to find matches.\n",
"\n",
"The search will return a SearchResult object, which contains detailed information about the found segments, including their timestamps, the text of the scene description.\n",
"\n",
"**Create a Programmable Video Stream with Timeline:** With the specific segments identified, you can now use `build_video_timeline` from `video_prompter.py` to get the Timeline to create a new programmable video stream. The Timeline tool allows you to stitch together video segments based on the timestamps provided in the SearchResult object. You can arrange, cut, or combine these segments to craft a fresh video stream that focuses on your topic of interest."
]
},
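A minimal sketch of what `build_video_timeline` plausibly does with the prompter results, assuming (start, end, text) items; the actual implementation lives in `video_prompter.py`:

```python
from videodb.timeline import VideoAsset

# Sketch, not the real implementation: stitch each matched segment
# into the timeline inline and report the total duration.
def build_video_timeline_sketch(video, results, timeline):
    duration = 0.0
    for start, end, _text in results:
        timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end))
        duration += end - start
    return timeline, duration
```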
@@ -195,11 +201,10 @@
"from videodb import play_stream\n",
"from videodb.timeline import Timeline\n",
"\n",
"from video_prompter import get_result_timestamps, build_video_timeline\n",
"from video_prompter import build_video_timeline\n",
"\n",
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"timeline, duration = build_video_timeline(video, result, timeline)\n",
"stream = timeline.generate_stream()\n",
"play_stream(stream)"
]
@@ -222,14 +227,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from video_prompter import text_prompter\n",
"\n",
"user_prompt = \"find the scene explaining the cricket rules using infographics, remember rules with infographics.\"\n",
"text_result = text_prompter(transcript_text, user_prompt)"
"text_result = text_prompter(transcript, user_prompt, llm=llm)"
]
},
{
@@ -239,8 +244,8 @@
"outputs": [],
"source": [
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, text_result, index_type=\"spoken_word\")\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"\n",
"timeline, duration = build_video_timeline(video, text_result, timeline)\n",
"stream = timeline.generate_stream()\n",
"play_stream(stream)"
]
@@ -263,14 +268,14 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from video_prompter import scene_prompter\n",
"\n",
"user_prompt = \"find the scene explaining the cricket rules using infographics, remember rules with infographics.\"\n",
"scene_result = scene_prompter(scenes, user_prompt)"
"scene_result = scene_prompter(scenes, user_prompt, llm=llm)"
]
},
{
@@ -280,8 +285,8 @@
"outputs": [],
"source": [
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, scene_result, scene_index_id=scene_index_id)\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"\n",
"timeline, duration = build_video_timeline(video, scene_result, timeline)\n",
"stream = timeline.generate_stream()\n",
"play_stream(stream)"
]
@@ -341,11 +346,11 @@
"source": [
"try:\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()\n",
" transcript_text = video.get_transcript()\n",
"except Exception:\n",
" video.index_spoken_words()\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()"
" transcript_text = video.get_transcript()"
]
},
{
@@ -373,7 +378,7 @@
"from video_prompter import multimodal_prompter\n",
"\n",
"user_prompt = \"\"\n",
"result = multimodal_prompter(transcript, scenes, user_prompt)"
"result = multimodal_prompter(transcript, scenes, user_prompt, llm=llm)"
]
},
{
@@ -385,11 +390,11 @@
"from videodb import play_stream\n",
"from videodb.timeline import Timeline\n",
"\n",
"from video_prompter import get_result_timestamps, build_video_timeline\n",
"from video_prompter import build_video_timeline\n",
"\n",
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"\n",
"timeline, duration = build_video_timeline(video, result, timeline)\n",
"stream = timeline.generate_stream()\n",
"play_stream(stream)"
]
99 changes: 41 additions & 58 deletions PromptClip_spoken.ipynb
@@ -78,15 +78,15 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" transcript_text = video.get_transcript_text()\n",
" transcript_text = video.get_transcript()\n",
"except Exception:\n",
" video.index_spoken_words()\n",
" transcript_text = video.get_transcript_text()"
" transcript_text = video.get_transcript()"
]
},
{
@@ -102,6 +102,18 @@
"The `text_prompter` will return sentences or segments from the video that match your prompt. Review these to ensure they align with your needs. You can then use these segments to create your clip, focusing on the content that's most relevant to your use case."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from llm_agent import LLM, LLMType, Models\n",
"\n",
"# Change the model/provider here if you want to experiment.\n",
"llm = LLM(llm_type=LLMType.OPENAI, model=Models.GPT_4O)"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -114,7 +126,7 @@
"\n",
"# Choose a prompt to create create clip. \n",
"user_prompt = \"find sentences where a deal is discussed\"\n",
"result = text_prompter(transcript_text, user_prompt)\n",
"result = text_prompter(transcript_text, user_prompt, llm=llm)\n",
"print(f\"Found {len(result)} segments in the video.\")"
]
},
@@ -124,12 +136,6 @@
"source": [
"### Generate the Clip\n",
"\n",
"To generate a clip, we'll use **VideoDB**'s `keyword search` feature. We already leveraged the power of the LLM (Large Language Model) to identify relevant sentences. We'll use the search results to create a `programmable video stream`. Here's how you can approach this process:\n",
"\n",
"We have the keywords in the `results` variable. Input these keywords into VideoDB's keyword search feature. This function will search through the indexed spoken content of your videos to find matches. \n",
"\n",
"The search will return a SearchResult object, which contains detailed information about the found segments, including their timestamps, the text of the spoken content, and possibly other metadata.\n",
"\n",
"**Create a Programmable Video Stream with Timeline**: With the specific segments identified, you can now use the Timeline to create a new programmable video stream. The Timeline tool allows you to stitch together video segments based on the timestamps provided in the SearchResult object. You can arrange, cut, or combine these segments to craft a fresh video stream that focuses on your topic of interest."
]
},
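For reference, the Timeline primitives this notebook relies on look like this (a minimal sketch reusing the notebook's `conn` and `video`; the start/end values are illustrative):

```python
from videodb import play_stream
from videodb.timeline import Timeline, VideoAsset

timeline = Timeline(conn)
# Stitch two illustrative segments back to back.
timeline.add_inline(VideoAsset(asset_id=video.id, start=10.0, end=25.0))
timeline.add_inline(VideoAsset(asset_id=video.id, start=42.0, end=55.0))
play_stream(timeline.generate_stream())
```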
@@ -141,11 +147,11 @@
"source": [
"from videodb import play_stream\n",
"from videodb.timeline import Timeline\n",
"from video_prompter import get_result_timestamps, build_video_timeline\n",
"from video_prompter import build_video_timeline\n",
"\n",
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, result, index_type=\"spoken_word\")\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"\n",
"timeline, duration = build_video_timeline(video, result, timeline)\n",
"stream = timeline.generate_stream()\n",
"print(stream)\n",
"play_stream(stream)"
@@ -214,7 +220,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -258,10 +264,10 @@
" rank and give score to each result\n",
" \"\"\"\n",
" res_score = []\n",
" for text in res:\n",
" res_score.append((text, ranking_prompt_llm(text,prompt)))\n",
" for start, end, text in res:\n",
" res_score.append((start, end, text, ranking_prompt_llm(text,prompt)))\n",
" \n",
" res_score_sorted = sorted(res_score, key=lambda x: x[1], reverse=True)\n",
" res_score_sorted = sorted(res_score, key=lambda x: x[-1], reverse=True)\n",
" return res_score_sorted[0: floor(len(res_score_sorted)*score_percentage)]"
]
},
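A usage sketch for the ranking helper above. The enclosing function's name and signature are not shown in this hunk, so `rank_results(results, prompt, score_percentage)` is an assumption based on the docstring and on how `ranked_results` is unpacked below:

```python
# Hypothetical call -- keep the top-scoring half of the segments.
ranked_results = rank_results(result, user_prompt, score_percentage=0.5)

# Each entry is (start, end, text, score), per the append/sort above.
for start, end, text, score in ranked_results[:3]:
    print(f"score={score} [{start}-{end}] {text}")
```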
@@ -287,7 +293,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -296,19 +302,9 @@
"from videodb.timeline import Timeline, VideoAsset, AudioAsset\n",
"\n",
"timeline = Timeline(conn)\n",
"for sentences, score in ranked_results:\n",
" search_res = video.search(sentences, search_type=SearchType.keyword)\n",
" matched_segments = search_res.get_shots()\n",
"for start, end, sentence, score in ranked_results:\n",
" \n",
" # No exact match found\n",
" if len(matched_segments) == 0:\n",
" continue\n",
"\n",
" # Get the first matched video segment\n",
" video_shot = matched_segments[0]\n",
"\n",
" # Create a new Video Asset and add it to a timeline.\n",
" timeline.add_inline(VideoAsset(asset_id=video.id, start=video_shot.start, end=video_shot.end))"
" timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end))"
]
},
{
@@ -362,28 +358,15 @@
"source": [
"timeline = Timeline(conn)\n",
"dur_so_far = 0\n",
"for clip_sentences, score in ranked_results:\n",
" try:\n",
" search_res = video.search(clip_sentences, search_type=SearchType.keyword)\n",
" matched_segments = search_res.get_shots()\n",
" \n",
" # No exact match found\n",
" if len(matched_segments) == 0:\n",
" continue\n",
" \n",
" #video segment\n",
" video_shot = matched_segments[0]\n",
" \n",
" # Create a new Video Asset and add it to a timeline.\n",
" timeline.add_inline(VideoAsset(asset_id=video.id, start=video_shot.start, end=video_shot.end))\n",
" chunk_dur = (video_shot.end - video_shot.start)\n",
" dur_so_far += chunk_dur \n",
" if chunk_dur < 2:\n",
" print(\"Skipping since chunk duration is less then the overlay audio.\")\n",
" continue\n",
" timeline.add_overlay(dur_so_far-2, background)\n",
" except Exception as e:\n",
" print(f\"Error: skipping the segment {str(e)}\")"
"for start, end, clip_sentences, score in ranked_results:\n",
"\n",
" timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end))\n",
" chunk_dur = (end - start)\n",
" dur_so_far += chunk_dur \n",
" if chunk_dur < 2:\n",
" print(\"Skipping since chunk duration is less then the overlay audio.\")\n",
" continue\n",
" timeline.add_overlay(dur_so_far-2, background)"
]
},
{
@@ -443,11 +426,11 @@
"source": [
"try:\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()\n",
" transcript_text = video.get_transcript()\n",
"except Exception:\n",
" video.index_spoken_words()\n",
" transcript = video.get_transcript()\n",
" transcript_text = video.get_transcript_text()"
" transcript_text = video.get_transcript()"
]
},
{
@@ -459,7 +442,7 @@
"from video_prompter import text_prompter\n",
"\n",
"user_prompt = \"\"\n",
"result = text_prompter(transcript_text, user_prompt)\n",
"result = text_prompter(transcript_text, user_prompt, llm=llm)\n",
"print(f\"Found {len(result)} segments in the video.\")"
]
},
@@ -471,11 +454,11 @@
"source": [
"from videodb import play_stream\n",
"from videodb.timeline import Timeline\n",
"from video_prompter import get_result_timestamps, build_video_timeline\n",
"from video_prompter import build_video_timeline\n",
"\n",
"timeline = Timeline(conn)\n",
"result_timestamps = get_result_timestamps(video, result, index_type=\"spoken_word\")\n",
"timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n",
"\n",
"timeline, duration = build_video_timeline(video, result, timeline)\n",
"stream = timeline.generate_stream()\n",
"print(stream)\n",
"play_stream(stream)"