Skip to content

Commit 9910015

Browse files
committed
convert_video_to_audio
1 parent ac9e44b commit 9910015

File tree

1 file changed

+90
-13
lines changed

1 file changed

+90
-13
lines changed

whispercpppy/server.py

Lines changed: 90 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from __future__ import annotations
22

3+
import os
34
import subprocess
5+
import tempfile
46
import time
57
from pathlib import Path
68
from typing import Literal
@@ -84,6 +86,67 @@ def field_to_cli_arg(flag: str, value) -> list[str] | None:
8486

8587
# Closed set of response-format names accepted as a ``ResponseFormat``.
ResponseFormat = Literal[
    "json",
    "verbose_json",
    "srt",
    "vtt",
    "text",
    "tsv",
]
8688

89+
# Lower-case file suffixes (leading dot included) that mark a file as video.
VIDEO_EXT = {".mp4", ".mkv", ".mov", ".avi", ".webm", ".flv", ".wmv", ".m4v"}
99+
100+
101+
def is_video_file(path: Path) -> bool:
    """Report whether *path* carries one of the suffixes in ``VIDEO_EXT``.

    Only the final suffix is examined, and it is lower-cased first so the
    check is case-insensitive.
    """
    extension = path.suffix.lower()
    return extension in VIDEO_EXT
103+
104+
105+
def convert_video_to_audio(path: Path) -> Path:
    """Extract the audio track of *path* into a temporary WAV file via ffmpeg.

    The output is written as 16-bit PCM, mono, 16 kHz
    (``-acodec pcm_s16le -ac 1 -ar 16000``) with the video stream dropped
    (``-vn``). The returned file is not deleted here; the caller is
    responsible for removing it once it is no longer needed.

    Args:
        path: Media file handed to ffmpeg as its input.

    Returns:
        Path of the newly created ``.wav`` file.

    Raises:
        RuntimeError: If the ``ffmpeg`` executable cannot be found, or if it
            exits with a non-zero status (its stderr output, when there is
            any, is appended to the message).
    """
    # mkstemp reserves a unique name; only the name is needed because ffmpeg
    # reopens the file itself ("-y" lets it overwrite the empty placeholder).
    fd, temp_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    audio_path = Path(temp_path)

    command = [
        "ffmpeg",
        "-hide_banner",
        "-loglevel",
        "warning",
        "-y",
        "-i",
        str(path),
        "-vn",
        "-acodec",
        "pcm_s16le",
        "-ar",
        "16000",
        "-ac",
        "1",
        str(audio_path),
    ]

    try:
        try:
            subprocess.run(
                command,
                check=True,
                # ffmpeg listens for interactive commands on stdin by default;
                # detach it so it never consumes or blocks on the parent's stdin.
                stdin=subprocess.DEVNULL,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
            )
        except FileNotFoundError as exc:
            raise RuntimeError(
                "ffmpeg is required to convert video files to audio but was not found"
            ) from exc
        except subprocess.CalledProcessError as exc:
            # Strip before testing so whitespace-only stderr adds no suffix.
            detail = exc.stderr.decode(errors="ignore").strip() if exc.stderr else ""
            raise RuntimeError(
                "ffmpeg failed to convert video file to audio"
                + (f": {detail}" if detail else "")
            ) from exc
    except BaseException:
        # Remove the placeholder on *any* failure (including errors that are
        # not translated above, e.g. PermissionError or KeyboardInterrupt) so
        # no temporary file is leaked; the original exception still propagates.
        audio_path.unlink(missing_ok=True)
        raise

    return audio_path
149+
87150

88151
def generate_start_server_command(
89152
server_opts: WhisperCppServerOptions,
@@ -239,19 +302,30 @@ def inference(
239302
) -> InferenceJSONVerbose:
240303
self._wait_until_ready()
241304
url = self._get_url(self._server_options.inference_path)
242-
with file.open("rb") as file_handle:
243-
response = requests.post(
244-
url,
245-
files={"file": (file.name, file_handle)},
246-
data={
247-
"temperature": str(temperature),
248-
"temperature_inc": str(temperature_inc),
249-
"response_format": "verbose_json",
250-
},
251-
)
252-
response.raise_for_status()
253-
response_json = response.json()
254-
return InferenceJSONVerbose(**response_json)
305+
306+
upload_path = file
307+
temp_audio_path: Path | None = None
308+
if is_video_file(file):
309+
temp_audio_path = convert_video_to_audio(file)
310+
upload_path = temp_audio_path
311+
312+
try:
313+
with upload_path.open("rb") as file_handle:
314+
response = requests.post(
315+
url,
316+
files={"file": (upload_path.name, file_handle)},
317+
data={
318+
"temperature": str(temperature),
319+
"temperature_inc": str(temperature_inc),
320+
"response_format": "verbose_json",
321+
},
322+
)
323+
response.raise_for_status()
324+
response_json = response.json()
325+
return InferenceJSONVerbose(**response_json)
326+
finally:
327+
if temp_audio_path is not None:
328+
temp_audio_path.unlink(missing_ok=True)
255329

256330
def load(self, model: Path) -> requests.Response:
257331
self._wait_until_ready()
@@ -262,3 +336,6 @@ def load(self, model: Path) -> requests.Response:
262336
)
263337
response.raise_for_status()
264338
return response
339+
340+
341+
# Names exported by a star import of this module.
__all__ = [
    "WhisperCppServer",
    "WhisperCppServerOptions",
    "VoiceActivityDetectionOptions",
]

0 commit comments

Comments
 (0)