diff --git a/PromptClip_multimodal.ipynb b/PromptClip_multimodal.ipynb index bb599df..e7a3099 100644 --- a/PromptClip_multimodal.ipynb +++ b/PromptClip_multimodal.ipynb @@ -87,17 +87,17 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "try:\n", " transcript = video.get_transcript()\n", - " transcript_text = video.get_transcript_text()\n", + " transcript_text = video.get_transcript()\n", "except Exception:\n", " video.index_spoken_words()\n", " transcript = video.get_transcript()\n", - " transcript_text = video.get_transcript_text()" + " transcript_text = video.get_transcript()" ] }, { @@ -156,6 +156,18 @@ "The `multimodal_promper` will return sentences which are visual description from matched chunks. Review these to ensure they align with your needs. You can then use these segments to create your clip, focusing on the content that's most relevant to your use case." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llm_agent import LLM, LLMType, Models\n", + "\n", + "# Change the model/provider here if you want to experiment.\n", + "llm = LLM(llm_type=LLMType.OPENAI, model=Models.GPT4o)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -169,7 +181,7 @@ "# If we create a clip using only the spoken index data, we won't know where the infographics are.\n", "# If we create a clip using only the visual index data, we may include additional infographics that aren't actually about the rules but might appear to be rule explanations based on visual information.\n", "# By creating a clip using combined data, we achieve a much more precise intersection where the infographics are present, and the rules are being explained.\n", - "result = multimodal_prompter(transcript, scenes, user_prompt)" + "result = multimodal_prompter(transcript, scenes, user_prompt, llm=llm)" ] }, { @@ -177,12 +189,6 @@ "metadata": {}, "source": [ "### Generate The Clip\n", - "To generate a clip, first we'll call `get_result_timestamps` from `video_prompter.py` it uses VideoDB's `keyword search` feature. We already leveraged the power of the LLM (Large Language Model) to identify relevant sentences. We'll use the search results to create a programmable video stream. Here's how you can approach this process:\n", - "\n", - "We have the descriptions in the `results` variable. We input these keywords into VideoDB's keyword search feature. This function will search through the indexed scenes of your videos to find matches.\n", - "\n", - "The search will return a SearchResult object, which contains detailed information about the found segments, including their timestamps, the text of the scene description.\n", - "\n", "**Create a Programmable Video Stream with Timeline:** With the specific segments identified, you can now use `build_video_timeline` from `video_prompter.py` to get the Timeline to create a new programmable video stream. The Timeline tool allows you to stitch together video segments based on the timestamps provided in the SearchResult object. You can arrange, cut, or combine these segments to craft a fresh video stream that focuses on your topic of interest." 
] }, @@ -195,11 +201,10 @@ "from videodb import play_stream\n", "from videodb.timeline import Timeline\n", "\n", - "from video_prompter import get_result_timestamps, build_video_timeline\n", + "from video_prompter import build_video_timeline\n", "\n", "timeline = Timeline(conn)\n", - "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", - "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", + "timeline, duration = build_video_timeline(video, result, timeline)\n", "stream = timeline.generate_stream()\n", "play_stream(stream)" ] @@ -222,14 +227,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from video_prompter import text_prompter\n", "\n", "user_prompt = \"find the scene explaining the cricket rules using infographics, remember rules with infographics.\"\n", - "text_result = text_prompter(transcript_text, user_prompt)" + "text_result = text_prompter(transcript, user_prompt, llm=llm)" ] }, { @@ -239,8 +244,8 @@ "outputs": [], "source": [ "timeline = Timeline(conn)\n", - "result_timestamps = get_result_timestamps(video, text_result, index_type=\"spoken_word\")\n", - "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", + "\n", + "timeline, duration = build_video_timeline(video, text_result, timeline)\n", "stream = timeline.generate_stream()\n", "play_stream(stream)" ] @@ -263,14 +268,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from video_prompter import scene_prompter\n", "\n", "user_prompt = \"find the scene explaining the cricket rules using infographics, remember rules with infographics.\"\n", - "scene_result = scene_prompter(scenes, user_prompt)" + "scene_result = scene_prompter(scenes, user_prompt, llm=llm)" ] }, { @@ -280,8 +285,8 @@ "outputs": [], "source": [ "timeline = Timeline(conn)\n", - "result_timestamps = get_result_timestamps(video, scene_result, scene_index_id=scene_index_id)\n", - "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", + "\n", + "timeline, duration = build_video_timeline(video, scene_result, timeline)\n", "stream = timeline.generate_stream()\n", "play_stream(stream)" ] @@ -341,11 +346,11 @@ "source": [ "try:\n", " transcript = video.get_transcript()\n", - " transcript_text = video.get_transcript_text()\n", + " transcript_text = video.get_transcript()\n", "except Exception:\n", " video.index_spoken_words()\n", " transcript = video.get_transcript()\n", - " transcript_text = video.get_transcript_text()" + " transcript_text = video.get_transcript()" ] }, { @@ -373,7 +378,7 @@ "from video_prompter import multimodal_prompter\n", "\n", "user_prompt = \"\"\n", - "result = multimodal_prompter(transcript, scenes, user_prompt)" + "result = multimodal_prompter(transcript, scenes, user_prompt, llm=llm)" ] }, { @@ -385,11 +390,11 @@ "from videodb import play_stream\n", "from videodb.timeline import Timeline\n", "\n", - "from video_prompter import get_result_timestamps, build_video_timeline\n", + "from video_prompter import build_video_timeline\n", "\n", "timeline = Timeline(conn)\n", - "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", - "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", + "\n", + "timeline, duration = build_video_timeline(video, result, timeline)\n", "stream = timeline.generate_stream()\n", 
"play_stream(stream)" ] diff --git a/PromptClip_spoken.ipynb b/PromptClip_spoken.ipynb index 5a5adc3..ee46ac0 100644 --- a/PromptClip_spoken.ipynb +++ b/PromptClip_spoken.ipynb @@ -78,15 +78,15 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "try:\n", - " transcript_text = video.get_transcript_text()\n", + " transcript_text = video.get_transcript()\n", "except Exception:\n", " video.index_spoken_words()\n", - " transcript_text = video.get_transcript_text()" + " transcript_text = video.get_transcript()" ] }, { @@ -102,6 +102,18 @@ "The `text_prompter` will return sentences or segments from the video that match your prompt. Review these to ensure they align with your needs. You can then use these segments to create your clip, focusing on the content that's most relevant to your use case." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llm_agent import LLM, LLMType, Models\n", + "\n", + "# Change the model/provider here if you want to experiment.\n", + "llm = LLM(llm_type=LLMType.OPENAI, model=Models.GPT_4O)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -114,7 +126,7 @@ "\n", "# Choose a prompt to create create clip. \n", "user_prompt = \"find sentences where a deal is discussed\"\n", - "result = text_prompter(transcript_text, user_prompt)\n", + "result = text_prompter(transcript_text, user_prompt, llm=llm)\n", "print(f\"Found {len(result)} segments in the video.\")" ] }, @@ -124,12 +136,6 @@ "source": [ "### Generate the Clip\n", "\n", - "To generate a clip, we'll use **VideoDB**'s `keyword search` feature. We already leveraged the power of the LLM (Large Language Model) to identify relevant sentences. We'll use the search results to create a `programmable video stream`. Here's how you can approach this process:\n", - "\n", - "We have the keywords in the `results` variable. Input these keywords into VideoDB's keyword search feature. This function will search through the indexed spoken content of your videos to find matches. \n", - "\n", - "The search will return a SearchResult object, which contains detailed information about the found segments, including their timestamps, the text of the spoken content, and possibly other metadata.\n", - "\n", "**Create a Programmable Video Stream with Timeline**: With the specific segments identified, you can now use the Timeline to create a new programmable video stream. The Timeline tool allows you to stitch together video segments based on the timestamps provided in the SearchResult object. You can arrange, cut, or combine these segments to craft a fresh video stream that focuses on your topic of interest." 
] }, @@ -141,11 +147,11 @@ "source": [ "from videodb import play_stream\n", "from videodb.timeline import Timeline\n", - "from video_prompter import get_result_timestamps, build_video_timeline\n", + "from video_prompter import build_video_timeline\n", "\n", "timeline = Timeline(conn)\n", - "result_timestamps = get_result_timestamps(video, result, index_type=\"spoken_word\")\n", - "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", + "\n", + "timeline, duration = build_video_timeline(video, result, timeline)\n", "stream = timeline.generate_stream()\n", "print(stream)\n", "play_stream(stream)" @@ -214,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -258,10 +264,10 @@ " rank and give score to each result\n", " \"\"\"\n", " res_score = []\n", - " for text in res:\n", - " res_score.append((text, ranking_prompt_llm(text,prompt)))\n", + " for start, end, text in res:\n", + " res_score.append((start, end, text, ranking_prompt_llm(text,prompt)))\n", " \n", - " res_score_sorted = sorted(res_score, key=lambda x: x[1], reverse=True)\n", + " res_score_sorted = sorted(res_score, key=lambda x: x[-1], reverse=True)\n", " return res_score_sorted[0: floor(len(res_score_sorted)*score_percentage)]" ] }, @@ -287,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -296,19 +302,9 @@ "from videodb.timeline import Timeline, VideoAsset, AudioAsset\n", "\n", "timeline = Timeline(conn)\n", - "for sentences, score in ranked_results:\n", - " search_res = video.search(sentences, search_type=SearchType.keyword)\n", - " matched_segments = search_res.get_shots()\n", + "for start, end, sentence, score in ranked_results:\n", " \n", - " # No exact match found\n", - " if len(matched_segments) == 0:\n", - " continue\n", - "\n", - " # Get the first matched video segment\n", - " video_shot = matched_segments[0]\n", - "\n", - " # Create a new Video Asset and add it to a timeline.\n", - " timeline.add_inline(VideoAsset(asset_id=video.id, start=video_shot.start, end=video_shot.end))" + " timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end))" ] }, { @@ -362,28 +358,15 @@ "source": [ "timeline = Timeline(conn)\n", "dur_so_far = 0\n", - "for clip_sentences, score in ranked_results:\n", - " try:\n", - " search_res = video.search(clip_sentences, search_type=SearchType.keyword)\n", - " matched_segments = search_res.get_shots()\n", - " \n", - " # No exact match found\n", - " if len(matched_segments) == 0:\n", - " continue\n", - " \n", - " #video segment\n", - " video_shot = matched_segments[0]\n", - " \n", - " # Create a new Video Asset and add it to a timeline.\n", - " timeline.add_inline(VideoAsset(asset_id=video.id, start=video_shot.start, end=video_shot.end))\n", - " chunk_dur = (video_shot.end - video_shot.start)\n", - " dur_so_far += chunk_dur \n", - " if chunk_dur < 2:\n", - " print(\"Skipping since chunk duration is less then the overlay audio.\")\n", - " continue\n", - " timeline.add_overlay(dur_so_far-2, background)\n", - " except Exception as e:\n", - " print(f\"Error: skipping the segment {str(e)}\")" + "for start, end, clip_sentences, score in ranked_results:\n", + "\n", + " timeline.add_inline(VideoAsset(asset_id=video.id, start=start, end=end))\n", + " chunk_dur = (end - start)\n", + " dur_so_far += chunk_dur \n", + " if chunk_dur < 2:\n", + " print(\"Skipping since chunk duration is less than the overlay 
audio.\")\n", + " continue\n", + " timeline.add_overlay(dur_so_far-2, background)" ] }, { @@ -443,11 +426,11 @@ "source": [ "try:\n", " transcript = video.get_transcript()\n", - " transcript_text = video.get_transcript_text()\n", + " transcript_text = video.get_transcript()\n", "except Exception:\n", " video.index_spoken_words()\n", " transcript = video.get_transcript()\n", - " transcript_text = video.get_transcript_text()" + " transcript_text = video.get_transcript()" ] }, { @@ -459,7 +442,7 @@ "from video_prompter import text_prompter\n", "\n", "user_prompt = \"\"\n", - "result = text_prompter(transcript_text, user_prompt)\n", + "result = text_prompter(transcript_text, user_prompt, llm=llm)\n", "print(f\"Found {len(result)} segments in the video.\")" ] }, @@ -471,11 +454,11 @@ "source": [ "from videodb import play_stream\n", "from videodb.timeline import Timeline\n", - "from video_prompter import get_result_timestamps, build_video_timeline\n", + "from video_prompter import build_video_timeline\n", "\n", "timeline = Timeline(conn)\n", - "result_timestamps = get_result_timestamps(video, result, index_type=\"spoken_word\")\n", - "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", + "\n", + "timeline, duration = build_video_timeline(video, result, timeline)\n", "stream = timeline.generate_stream()\n", "print(stream)\n", "play_stream(stream)" diff --git a/PromptClip_visual.ipynb b/PromptClip_visual.ipynb index 177ebf3..342ee2d 100644 --- a/PromptClip_visual.ipynb +++ b/PromptClip_visual.ipynb @@ -143,6 +143,18 @@ "The `scene_prompter` will return sentences or segments from the video that match your prompt. Review these to ensure they align with your needs. You can then use these segments to create your clip, focusing on the content that's most relevant to your use case." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from llm_agent import LLM, LLMType, Models\n", + "\n", + "# Change the model/provider here if you want to experiment.\n", + "llm = LLM(llm_type=LLMType.OPENAI, model=Models.GPT_4O)" + ] + }, { "cell_type": "code", "execution_count": null, @@ -153,7 +165,7 @@ "\n", "# This prompt is for finding the iconic copying in examination scene of Mr. Bean\n", "user_prompt = \"find the moment where mr.bean is attempting to cheat peeking over at the answer sheet of man beside him, find it with high accuracy.\"\n", - "result = scene_prompter(scenes, user_prompt)" + "result = scene_prompter(scenes, user_prompt, llm=llm)" ] }, { @@ -161,11 +173,6 @@ "metadata": {}, "source": [ "### Generate The Clip\n", - "To generate a clip, first we'll call `get_result_timestamps` from `video_prompter.py` it uses VideoDB's `keyword search` feature. We already leveraged the power of the LLM (Large Language Model) to identify relevant sentences. We'll use the search results to create a programmable video stream. Here's how you can approach this process:\n", - "\n", - "We have the descriptions in the `results` variable. We input these keywords into VideoDB's keyword search feature. 
This function will search through the indexed scenes of your videos to find matches.\n", - "\n", - "The search will return a SearchResult object, which contains detailed information about the found segments, including their timestamps, the text of the scene description.\n", "\n", "**Create a Programmable Video Stream with Timeline**: With the specific segments identified, you can now use `build_video_timeline` from `video_prompter.py` to get the Timeline to create a new programmable video stream. The Timeline tool allows you to stitch together video segments based on the timestamps provided in the SearchResult object. You can arrange, cut, or combine these segments to craft a fresh video stream that focuses on your topic of interest." ] @@ -178,17 +185,15 @@ "source": [ "from videodb import play_stream\n", "from videodb.timeline import Timeline\n", - "from video_prompter import get_result_timestamps, build_video_timeline\n", + "from video_prompter import build_video_timeline\n", "\n", "timeline = Timeline(conn)\n", "\n", - "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", - "print(f\"We have got {len(result_timestamps)} segments matching with the user prompt.\")\n", "# Since we are only interested in one segment for the meme, we've hardcoded the timestamp to filter it out.\n", "# In an actual scenario, you can inspect all the segments and select the ones you're interested in.\n", "# Alternatively, you can skip the filtering if you want a clip of all the segments involving cheating.\n", "meme_start_time = 370.4\n", - "meme_result = [next((item for item in result_timestamps if item[0] == meme_start_time), None)]\n", + "meme_result = [next((item for item in result if item[0] == meme_start_time), None)]\n", "if meme_result:\n", " print(\"Selecting the segment with the meme.\")\n", "else:\n", @@ -323,7 +328,7 @@ "outputs": [], "source": [ "user_prompt = \"find all the car gags with high accuracy\"\n", - "result = scene_prompter(scenes, user_prompt)" + "result = scene_prompter(scenes, user_prompt, llm=llm)" ] }, { @@ -333,8 +338,8 @@ "outputs": [], "source": [ "timeline = Timeline(conn)\n", - "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", - "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", + "\n", + "timeline, duration = build_video_timeline(video, result, timeline)\n", "stream = timeline.generate_stream()\n", "play_stream(stream)" ] @@ -413,7 +418,7 @@ "from video_prompter import scene_prompter\n", "\n", "user_prompt = \"\"\n", - "result = scene_prompter(scenes, user_prompt)" + "result = scene_prompter(scenes, user_prompt, llm=llm)" ] }, { @@ -423,8 +428,8 @@ "outputs": [], "source": [ "timeline = Timeline(conn)\n", - "result_timestamps = get_result_timestamps(video, result, scene_index_id=scene_index_id)\n", - "timeline, duration = build_video_timeline(video, result_timestamps, timeline)\n", + "\n", + "timeline, duration = build_video_timeline(video, result, timeline)\n", "stream = timeline.generate_stream()\n", "play_stream(stream)" ] diff --git a/llm_agent.py b/llm_agent.py index 0ce5126..7d7ea0a 100644 --- a/llm_agent.py +++ b/llm_agent.py @@ -1,20 +1,16 @@ """ Extend this codebase to add any LLM """ - import json import os import requests +from openai import OpenAI from anthropic import Anthropic, HUMAN_PROMPT, AI_PROMPT import google.generativeai as genai from dotenv import load_dotenv load_dotenv() -OPENAI_KEY = os.getenv("OPENAI_API_KEY") -CLAUDE_KEY = 
os.getenv("ANTHROPIC_KEY") -GEMINI_KEY = os.getenv("GEMINI_API_KEY") - class LLMType: OPENAI = "openAI" @@ -23,33 +19,35 @@ class LLMType: class Models: - GPT3 = "gpt-3.5-turbo-16k" - GPT4 = "gpt-4" GPT4o = "gpt-4o" - GPT4o_new = "gpt-4o-2024-08-06" - CLAUDE_INSTANT = "claude-instant-1.1" - CLAUDE2 = "claude-2" - GEMINI_1_5_FLASH = "gemini-1.5-flash" - GEMINI_1_5_PRO = "gemini-1.5-pro" - OA_MODELS_WITH_RESPONSE_TYPE_SUPPORT = [GPT4o, GPT4o_new] + GPT4o_mini = "gpt-4o-mini" + GPTo3 = "o3" + GPTo3_mini = "o3-mini" + + CLAUDE4_SONNET = "claude-sonnet-4-20250514" + + GEMINI_2_5_FLASH = "gemini-2.5-flash" + GEMINI_2_5_PRO = "gemini-2.5-pro" class LLM: - def __init__(self, llm_type=LLMType.OPENAI, model=Models.GPT4): + def __init__(self, llm_type=LLMType.OPENAI, model=Models.GPT4o): + self.type = llm_type self.model = model self.openai_key = os.getenv("OPENAI_API_KEY") - self.claude_key = os.getenv("ANTHROPIC_KEY") - self.gemini_key = os.getenv("GEMINI_KEY") + self.claude_key = os.getenv("ANTHROPIC_API_KEY") + self.gemini_key = os.getenv("GEMINI_API_KEY") + + def chat(self, message, temperature=0.6, functions=None): - def chat(self, message, functions=None): if self.type == LLMType.OPENAI: message = [self._to_gpt_msg(message)] - return self._call_openai(message, functions) + return self._call_openai(message, temperature, functions) elif self.type == LLMType.CLAUDE: - return self._call_claude(message) + return self._call_claude(message, temperature) elif self.type == LLMType.GEMINI: - return self._call_gemini(message) + return self._call_gemini(message, temperature) else: raise ValueError("Unsupported LLM type.") @@ -62,37 +60,41 @@ def _to_gpt_msg(self, data): context_msg = "" context_msg += str(data) - return {"role": "system", "content": context_msg} + return {"role": "user", "content": context_msg} - def _call_openai(self, message, functions=None): - url = "https://api.openai.com/v1/chat/completions" - # print(f'call openAI with message {message}') - headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {self.openai_key}", - } - data = { + def _call_openai(self, message, temperature=0.6, functions=None): + + if not self.openai_key: + raise ValueError("OPENAI_API_KEY environment variable is not set.") + client = OpenAI(api_key=self.openai_key) + + # Build the base payload + payload = { "model": self.model, "messages": message, - "temperature": 0.6, + "response_format": {"type": "json_object"}, } - if self.model in Models.OA_MODELS_WITH_RESPONSE_TYPE_SUPPORT: - data["response_format"] = {"type": "json_object"} + + if "o3" not in self.model: + payload.update({"temperature": temperature}) + if functions: - data.update( - { - "functions": functions, - "function_call": "auto", - } - ) + payload.update({ + "tools": functions, + "tool_choice": "auto", + }) - response = requests.post(url, headers=headers, data=json.dumps(data)) try: - return response.json() + response = client.chat.completions.create(**payload) + content = response.choices[0].message.content + return json.loads(content) except json.JSONDecodeError: return {"error": "Failed to decode JSON response."} + except Exception as e: + return {"error": str(e)} + + def _call_claude(self, message, temperature=0.6): - def _call_claude(self, message): anthropic = Anthropic(api_key=self.claude_key) prompt = f"{HUMAN_PROMPT} {message} {AI_PROMPT}" try: @@ -100,25 +102,45 @@ def _call_claude(self, message): model=self.model, max_tokens_to_sample=80000, prompt=prompt, + temperature=temperature, ) return {"response": 
completion.completion} except ( Exception - ) as e: # Consider a more specific exception based on the Anthropic SDK + ) as e: + print(f'call claude with error: {e}') return {"error": str(e)} - def _call_gemini(self, message): - genai.configure(api_key=GEMINI_KEY) - model = genai.GenerativeModel(self.model) + def _call_gemini(self, message, temperature=0.6): + + if not self.gemini_key: + raise ValueError("GEMINI_API_KEY environment variable is not set.") + + # Google provides an OpenAI-compatible endpoint for Gemini models (currently in beta) + client = OpenAI( + api_key=self.gemini_key, + base_url="https://generativelanguage.googleapis.com/v1beta/openai/", + ) + + # Ensure the message is wrapped in the expected list-of-dicts format + if isinstance(message, str): + messages = [self._to_gpt_msg(message)] + else: + messages = message # assume already formatted + + payload = { + "model": self.model, + "messages": messages, + "response_format": {"type": "json_object"}, + "temperature": temperature, + } + try: - response = model.generate_content(message) - response_text = response.text.replace("```json", "").replace("```", "") - response_json = json.loads(response_text) - return response_json.get("sentences") + response = client.chat.completions.create(**payload) + content = response.choices[0].message.content + + return json.loads(content) + except json.JSONDecodeError: + return {"error": "Failed to decode JSON response."} except Exception as e: return {"error": str(e)} - - def get_word_limit(self): - if self.type == LLMType.CLAUDE: - return 10000 - return 2000 diff --git a/requirements.txt b/requirements.txt index 753ff71..993184f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,28 +1,7 @@ -annotated-types==0.6.0 -anthropic==0.15.0 -anyio==4.2.0 -backoff==2.2.1 -certifi==2024.2.2 -charset-normalizer==3.3.2 -distro==1.9.0 -exceptiongroup==1.2.0 -filelock==3.13.1 -fsspec==2024.2.0 +anthropic==0.66.0 google-generativeai==0.5.4 -h11==0.14.0 -httpcore==1.0.2 -httpx==0.26.0 -huggingface-hub==0.20.3 -idna==3.6 -packaging==23.2 -pydantic==2.6.1 -pydantic_core==2.16.2 -python-dotenv==1.0.1 -PyYAML==6.0.1 -requests==2.31.0 -sniffio==1.3.0 -tokenizers==0.15.1 -tqdm==4.66.1 -typing_extensions==4.9.0 -urllib3==2.2.0 +python-dotenv>=1.0 +pydantic>=2.8.0 +requests>=2.31 videodb==0.2.3 +openai==1.105.0 \ No newline at end of file diff --git a/video_prompter.py b/video_prompter.py index 412fa5b..f597f48 100644 --- a/video_prompter.py +++ b/video_prompter.py @@ -2,7 +2,7 @@ import concurrent.futures -from llm_agent import LLM, LLMType +from llm_agent import LLM from videodb import connect from videodb import SearchType, IndexType from videodb.timeline import VideoAsset @@ -41,67 +41,10 @@ def chunk_docs(docs, chunk_size): :param chunk_size: :return: """ + chunks = [] for i in range(0, len(docs), chunk_size): - yield docs[i : i + chunk_size] # Yield the current chunk - - -def get_result_timestamps( - video, - result, - index_type="scene", - scene_index_id=None, - sort="time", - run_concurrent=True, -): - """ - This function takes the result from scene_prompter and performs a keyword search on the video. - By default, the function sorts the results by time. - It returns a list of (start, end, description) for the matched segments. 
- """ - result_timestamps = [] - - def search_description(description): - # keyword search on each result description - if index_type == "scene": - search_res = video.search( - description, - index_type=IndexType.scene, - search_type=SearchType.keyword, - scene_index_id=scene_index_id, - ) - else: - search_res = video.search( - description, - index_type=IndexType.spoken_word, - search_type=SearchType.keyword, - ) - matched_segments = search_res.get_shots() - if len(matched_segments) == 0: - return None # No match found - - video_shot = matched_segments[0] - return (video_shot.start, video_shot.end, video_shot.text) - - if run_concurrent: - with concurrent.futures.ThreadPoolExecutor() as executor: - future_to_desc = { - executor.submit(search_description, desc): desc for desc in result - } - for future in concurrent.futures.as_completed(future_to_desc): - res = future.result() - if res: - result_timestamps.append(res) - else: - for description in result: - res = search_description(description) - if res: - result_timestamps.append(res) - - # Sorting the results if needed - if sort == "time": - result_timestamps.sort(key=lambda x: x[0]) - - return result_timestamps + chunks.append(docs[i : i + chunk_size]) + return chunks # Creating and returning timeline of given result timestamps @@ -162,43 +105,18 @@ def get_multimodal_docs(transcript, scenes, club_on="scene"): return docs -def send_msg_openai(chunk_prompt, llm=LLM()): - response = llm.chat(message=chunk_prompt) - output = json.loads(response["choices"][0]["message"]["content"]) - sentences = output.get("sentences") - return sentences - - -def send_msg_claude(chunk_prompt, llm): - response = llm.chat(message=chunk_prompt) - # TODO : add claude reposnse parser - return response +def send_msg_llm(chunk_prompt, llm): + output = llm.chat(message=chunk_prompt) + return output - -def send_msg_gemini(chunk_prompt, llm): - response = llm.chat(message=chunk_prompt) - # TODO : add claude reposnse parser - return response - - -def text_prompter(transcript_text, prompt, llm=None): - chunk_size = 10000 +def text_prompter(transcript, prompt, llm=None): + chunk_size = 2000 # sentence tokenizer - chunks = chunk_docs(transcript_text, chunk_size=chunk_size) - # print(f"Length of the sentence chunk are {len(chunks)}") + chunks = chunk_docs(transcript, chunk_size=chunk_size) if llm is None: llm = LLM() - # 400 sentence at a time - if llm.type == LLMType.OPENAI: - llm_caller_fn = send_msg_openai - elif llm.type == LLMType.GEMINI: - llm_caller_fn = send_msg_gemini - else: - # claude for now - llm_caller_fn = send_msg_claude - matches = [] prompts = [] i = 0 @@ -212,7 +130,7 @@ def text_prompter(transcript_text, prompt, llm=None): - If the matched sentences are not too far, merge them into one sentence. - Strictly make each result minimum 20 words long. If the match is smaller, adjust the boundries and add more context around the sentences. - - **Output Format**: Return a JSON list of strings named 'sentences' that containes the output sentences, make sure they are exact substrings. + - **Output Format**: Return a JSON list of strings named 'relevant_timestamps' that containes the start and end timestamps along with the text of the relevant sentences. - **User Prompts**: User prompts may include requests like 'find funny moments' or 'find moments for social media'. Interpret these prompts by identifying keywords or themes in the transcript that match the intent of the prompt. 
""" @@ -228,8 +146,12 @@ def text_prompter(transcript_text, prompt, llm=None): Ensure the final output strictly adheres to the JSON format specified without including additional text or explanations. \ If there is no match return empty list without additional text. Use the following structure for your response: { - "sentences": [ - {}, + "relevant_timestamps": [ + { + "start": __, + "end": __, + "text": __ + }, ... ] } @@ -240,38 +162,30 @@ def text_prompter(transcript_text, prompt, llm=None): # make a parallel call to all chunks with prompts with concurrent.futures.ThreadPoolExecutor() as executor: future_to_index = { - executor.submit(llm_caller_fn, prompt, llm): prompt for prompt in prompts + executor.submit(send_msg_llm, prompt, llm): prompt for prompt in prompts } for future in concurrent.futures.as_completed(future_to_index): try: - matches.extend(future.result()) + results = future.result() + matches.extend([(timestamp["start"], timestamp["end"], timestamp["text"]) for timestamp in results["relevant_timestamps"]]) except Exception as e: print(f"Chunk failed to work with LLM {str(e)}") return matches -def scene_prompter(transcript_text, prompt, llm=None, run_concurrent=True): +def scene_prompter(scenes, prompt, llm=None, run_concurrent=True): chunk_size = 100 - chunks = chunk_docs(transcript_text, chunk_size=chunk_size) + chunks = chunk_docs(scenes, chunk_size=chunk_size) - llm_caller_fn = send_msg_openai if llm is None: llm = LLM() - # TODO: llm should have caller function - # 400 sentence at a time - if llm.type == LLMType.OPENAI: - llm_caller_fn = send_msg_openai - else: - # claude for now - llm_caller_fn = send_msg_claude - matches = [] prompts = [] i = 0 for chunk in chunks: - descriptions = [scene["description"] for scene in chunk] + descriptions = {idx : (scene["start"], scene["end"], scene["description"]) for idx, scene in enumerate(chunk)} chunk_prompt = """ You are a video editor who uses AI. Given a user prompt and AI-generated scene descriptions of a video, analyze the descriptions to identify segments relevant to the user prompt for creating clips. @@ -283,58 +197,60 @@ def scene_prompter(transcript_text, prompt, llm=None, run_concurrent=True): - User Prompts: Interpret prompts like 'find exciting moments' or 'identify key plot points' by matching keywords or themes in the scene descriptions to the intent of the prompt. """ + descriptions_str = json.dumps([{idx : description} for idx, (start, end, description) in descriptions.items()]) chunk_prompt += f""" - Descriptions: {json.dumps(descriptions)} + Descriptions: {descriptions_str} User Prompt: {prompt} """ chunk_prompt += """ - **Output Format**: Return a JSON list of strings named 'result' that containes the fileds `sentence` Ensure the final output - strictly adheres to the JSON format specified without including additional text or explanations. \ + **Output Format**: Return a JSON list of relevant ids with a field 'relevant_ids' + Ensure the final output strictly adheres to the JSON format specified without including additional text or explanations. \ If there is no match return empty list without additional text. 
Use the following structure for your response: - {"sentences": []} + {"relevant_ids": []} """ - prompts.append(chunk_prompt) + + prompts.append((chunk_prompt, descriptions)) i += 1 if run_concurrent: with concurrent.futures.ThreadPoolExecutor() as executor: future_to_index = { - executor.submit(llm_caller_fn, prompt, llm): prompt - for prompt in prompts + executor.submit(send_msg_llm, prompt, llm): descriptions + for prompt, descriptions in prompts } for future in concurrent.futures.as_completed(future_to_index): try: - matches.extend(future.result()) + results = future.result() + descriptions = future_to_index[future] + matches.extend(descriptions[int(res_idx)] for res_idx in results["relevant_ids"]) except Exception as e: print(f"Chunk failed to work with LLM {str(e)}") else: - for prompt in prompts: + for prompt, descriptions in prompts: try: - res = llm_caller_fn(prompt) - matches.extend(res) + res = send_msg_llm(prompt, llm) + matches.extend(descriptions[int(res_idx)] for res_idx in res["relevant_ids"]) except Exception as e: print(f"Chunk failed to work with LLM {str(e)}") return matches -def multimodal_prompter(transcript, scene_index, prompt, llm=None, run_concurrent=True): - docs = get_multimodal_docs(transcript, scene_index) - chunk_size = 80 +def multimodal_prompter(transcript, scenes, prompt, llm=None, run_concurrent=True): + docs = get_multimodal_docs(transcript, scenes) + chunk_size = 100 chunks = chunk_docs(docs, chunk_size=chunk_size) if llm is None: llm = LLM() - if llm.type == LLMType.OPENAI: - llm_caller_fn = send_msg_openai - else: - llm_caller_fn = send_msg_claude - matches = [] prompts = [] i = 0 + for chunk in chunks: + chunk_lookup = {idx: (doc["start"], doc["end"], (doc["spoken"], doc["visual"])) for idx, doc in enumerate(chunk)} + chunk = {idx: (spoken, visual) for idx, (start, end, (spoken, visual)) in chunk_lookup.items()} chunk_prompt = f""" You are given visual and spoken information of the video of each second, and a transcipt of what's being spoken along with timestamp. Your task is to evaluate the data for relevance to the specified user prompt. @@ -347,31 +263,33 @@ def multimodal_prompter(transcript, scene_index, prompt, llm=None, run_concurren """ chunk_prompt += """ - **Output Format**: Return a JSON list of strings named 'result' that containes the fileds `sentence`. - sentence is from the visual section of the input. + **Output Format**: Return a JSON list of relevant ids (list(str)) in a field named 'relevant_ids'. + Ensure the final output strictly adheres to the JSON format specified without including additional text or explanations. If there is no match return empty list without additional text. 
Use the following structure for your response: - {"sentences": []} + {"relevant_ids": []} """ - prompts.append(chunk_prompt) + prompts.append((chunk_prompt, chunk_lookup)) i += 1 if run_concurrent: with concurrent.futures.ThreadPoolExecutor() as executor: future_to_index = { - executor.submit(llm_caller_fn, prompt, llm): prompt - for prompt in prompts + executor.submit(send_msg_llm, prompt, llm): chunk_lookup + for prompt, chunk_lookup in prompts } for future in concurrent.futures.as_completed(future_to_index): try: - matches.extend(future.result()) + results = future.result() + chunk_lookup = future_to_index[future] + matches.extend(chunk_lookup[int(res_idx)] for res_idx in results["relevant_ids"]) except Exception as e: print(f"Chunk failed to work with LLM {str(e)}") else: - for prompt in prompts: + for prompt, chunk_lookup in prompts: try: - res = llm_caller_fn(prompt) - matches.extend(res) + res = send_msg_llm(prompt, llm) + matches.extend(chunk_lookup[int(res_idx)] for res_idx in res["relevant_ids"]) except Exception as e: import traceback