Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file not shown.
Empty file.
83 changes: 83 additions & 0 deletions audio_processor/audio_processor/audio_processor_node.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import rclpy
from rclpy.node import Node
from std_msgs.msg import String
import ffmpeg
import numpy as np
from pydub import AudioSegment
from openvino.runtime import Core

class AudioProcessorNode(Node):
Copy link

Copilot AI Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The class is missing a docstring. Add a docstring to describe the purpose of this audio processing node and its functionality.

Suggested change
class AudioProcessorNode(Node):
class AudioProcessorNode(Node):
"""
A ROS2 node for audio processing that loads an OpenVINO speech-to-text model,
processes audio files (WAV or MP4), performs inference, and publishes the
transcribed text to a ROS2 topic.
Functionality includes:
- Extracting audio from MP4 or reading WAV files.
- Preprocessing audio for model input.
- Running inference using OpenVINO.
- Postprocessing model output to text.
- Publishing results to the 'stt_output' topic.
"""

Copilot uses AI. Check for mistakes.
def __init__(self):
super().__init__('audio_processor_node')
self.publisher_ = self.create_publisher(String, 'stt_output', 10)
self.ie = Core()
# Load the converted OpenVINO model
# self.model = self.ie.read_model(model='wav2vec2-base/wav2vec2-base.xml')
self.model = self.ie.read_model(model='/root/ros2_ws/audio_processor/audio_processor/wav2vec2-base/wav2vec2-base.xml')
Comment on lines +12 to +16
Copy link

Copilot AI Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hard-coded absolute path makes the code non-portable. Consider using a relative path or making this configurable through a ROS parameter.

Suggested change
self.publisher_ = self.create_publisher(String, 'stt_output', 10)
self.ie = Core()
# Load the converted OpenVINO model
# self.model = self.ie.read_model(model='wav2vec2-base/wav2vec2-base.xml')
self.model = self.ie.read_model(model='/root/ros2_ws/audio_processor/audio_processor/wav2vec2-base/wav2vec2-base.xml')
self.declare_parameter('model_path', 'wav2vec2-base/wav2vec2-base.xml')
model_path = self.get_parameter('model_path').get_parameter_value().string_value
self.publisher_ = self.create_publisher(String, 'stt_output', 10)
self.ie = Core()
self.model = self.ie.read_model(model=model_path)

Copilot uses AI. Check for mistakes.
self.compiled_model = self.ie.compile_model(model=self.model, device_name='CPU')
self.input_layer = self.compiled_model.input(0)
self.output_layer = self.compiled_model.output(0)

def process_audio_file(self, file_path):
if file_path.endswith('.mp4'):
audio_data = self.extract_audio_from_mp4(file_path)
elif file_path.endswith('.wav'):
audio_data = self.read_wav_file(file_path)
else:
self.get_logger().error('Unsupported file format')
return

self.process_audio(audio_data)

def extract_audio_from_mp4(self, file_path):
audio_output = 'temp_audio.wav'
ffmpeg.input(file_path).output(audio_output, ac=1, ar='16000').run(overwrite_output=True)
return self.read_wav_file(audio_output)

def read_wav_file(self, file_path):
audio = AudioSegment.from_wav(file_path)
samples = np.array(audio.get_array_of_samples())
return samples

def process_audio(self, audio_data):
# Preprocess audio_data as needed for your model
input_data = self.preprocess_audio(audio_data)
result = self.compiled_model([input_data])[self.output_layer]
text_output = self.postprocess_result(result)
self.publish_text(text_output)

def preprocess_audio(self, audio_data):
# Normalize audio data
audio_data = audio_data / np.max(np.abs(audio_data))
Copy link

Copilot AI Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Division by zero will occur if audio_data contains only zeros. Add a check to prevent division by zero.

Suggested change
audio_data = audio_data / np.max(np.abs(audio_data))
max_val = np.max(np.abs(audio_data))
if max_val == 0:
self.get_logger().warning("Audio data contains only zeros; skipping normalization to avoid division by zero.")
else:
audio_data = audio_data / max_val

Copilot uses AI. Check for mistakes.

# Resample or trim/pad the audio data to 16000 samples
target_length = 16000
if len(audio_data) > target_length:
audio_data = audio_data[:target_length] # Trim
else:
audio_data = np.pad(audio_data, (0, max(0, target_length - len(audio_data))), 'constant') # Pad

return np.expand_dims(audio_data, axis=0) # Add batch dimension

def postprocess_result(self, result):
# Implement postprocessing logic to convert model output to text
return "example text"
Copy link

Copilot AI Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The postprocess_result method returns a placeholder string instead of implementing actual text conversion logic. This should be implemented to properly decode the model output.

Copilot uses AI. Check for mistakes.

def publish_text(self, text):
msg = String()
msg.data = text
self.publisher_.publish(msg)
self.get_logger().info(f'Published: {text}')

def main(args=None):
rclpy.init(args=args)
node = AudioProcessorNode()
# Example: Process an audio file
node.process_audio_file('/root/ros2_ws/audio_processor/audio_processor/1089-134686-0001.wav')
Copy link

Copilot AI Oct 7, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hard-coded absolute path in main function makes the code non-portable. Consider making this configurable or removing the hard-coded test call.

Copilot uses AI. Check for mistakes.
# node.process_audio_file('1089-134686-0001.wav')
rclpy.spin(node)
node.destroy_node()
rclpy.shutdown()

if __name__ == '__main__':
main()
Loading