11from __future__ import annotations
22
3+ import os
34import subprocess
5+ import tempfile
46import time
57from pathlib import Path
68from typing import Literal
@@ -84,6 +86,67 @@ def field_to_cli_arg(flag: str, value) -> list[str] | None:
8486
# Closed set of response format names.
# NOTE(review): presumably the values accepted for the server's
# "response_format" request field - confirm against the server API.
ResponseFormat = Literal[
    "json",
    "verbose_json",
    "srt",
    "vtt",
    "text",
    "tsv",
]
8688
# File suffixes (lowercase, leading dot included) that are treated as video
# containers by is_video_file().
VIDEO_EXT = {
    ".mp4", ".mkv", ".mov", ".avi",
    ".webm", ".flv", ".wmv", ".m4v",
}
99+
100+
def is_video_file(path: Path) -> bool:
    """Return True when the lowercased suffix of *path* is in ``VIDEO_EXT``.

    Only the file name is inspected; the file is never opened.
    """
    extension = path.suffix.lower()
    return extension in VIDEO_EXT
103+
104+
def convert_video_to_audio(path: Path) -> Path:
    """Extract the audio track of a video file into a temporary WAV file.

    ``ffmpeg`` is invoked to drop the video stream (``-vn``) and write
    16-bit PCM, 16 kHz, mono audio to a freshly created temporary ``.wav``
    file.

    Args:
        path: Video file to read.

    Returns:
        Path of the temporary WAV file. This function does not delete it on
        success; the caller is responsible for removing it when done.

    Raises:
        RuntimeError: If the ``ffmpeg`` executable cannot be found, or if
            ffmpeg exits with a non-zero status (its stderr output is
            appended to the message when there is any).
    """
    # mkstemp is used only to reserve a unique file name; ffmpeg reopens the
    # path itself (-y lets it overwrite the empty placeholder), so the
    # descriptor is closed immediately.
    fd, temp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    audio_path = Path(temp_path)

    command = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel",
        "warning",
        "-y",
        "-i",
        str(path),
        "-vn",
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        str(audio_path),
    ]

    converted = False
    try:
        subprocess.run(
            command,
            check=True,
            # ffmpeg reads interactive commands from stdin by default; give
            # it an empty stdin so it can neither consume the parent's input
            # nor stall when run without a usable terminal.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )
        converted = True
    except FileNotFoundError as exc:
        raise RuntimeError(
            "ffmpeg is required to convert video files to audio but was not found"
        ) from exc
    except subprocess.CalledProcessError as exc:
        detail = exc.stderr.decode(errors="ignore").strip() if exc.stderr else ""
        raise RuntimeError(
            "ffmpeg failed to convert video file to audio"
            + (f": {detail}" if detail else "")
        ) from exc
    finally:
        # Remove the placeholder on *any* failure (including exceptions not
        # translated above, e.g. PermissionError or KeyboardInterrupt) so a
        # failed conversion never leaks a temporary file.
        if not converted:
            audio_path.unlink(missing_ok=True)

    return audio_path
149+
87150
88151def generate_start_server_command (
89152 server_opts : WhisperCppServerOptions ,
@@ -239,19 +302,30 @@ def inference(
239302 ) -> InferenceJSONVerbose :
240303 self ._wait_until_ready ()
241304 url = self ._get_url (self ._server_options .inference_path )
242- with file .open ("rb" ) as file_handle :
243- response = requests .post (
244- url ,
245- files = {"file" : (file .name , file_handle )},
246- data = {
247- "temperature" : str (temperature ),
248- "temperature_inc" : str (temperature_inc ),
249- "response_format" : "verbose_json" ,
250- },
251- )
252- response .raise_for_status ()
253- response_json = response .json ()
254- return InferenceJSONVerbose (** response_json )
305+
306+ upload_path = file
307+ temp_audio_path : Path | None = None
308+ if is_video_file (file ):
309+ temp_audio_path = convert_video_to_audio (file )
310+ upload_path = temp_audio_path
311+
312+ try :
313+ with upload_path .open ("rb" ) as file_handle :
314+ response = requests .post (
315+ url ,
316+ files = {"file" : (upload_path .name , file_handle )},
317+ data = {
318+ "temperature" : str (temperature ),
319+ "temperature_inc" : str (temperature_inc ),
320+ "response_format" : "verbose_json" ,
321+ },
322+ )
323+ response .raise_for_status ()
324+ response_json = response .json ()
325+ return InferenceJSONVerbose (** response_json )
326+ finally :
327+ if temp_audio_path is not None :
328+ temp_audio_path .unlink (missing_ok = True )
255329
256330 def load (self , model : Path ) -> requests .Response :
257331 self ._wait_until_ready ()
@@ -262,3 +336,6 @@ def load(self, model: Path) -> requests.Response:
262336 )
263337 response .raise_for_status ()
264338 return response
339+
340+
# Names exported by a star-import of this module.
__all__ = [
    "WhisperCppServer",
    "WhisperCppServerOptions",
    "VoiceActivityDetectionOptions",
]
0 commit comments