"""Bots package whisper agent (bots/whisper) Lightweight agent descriptor; heavy model loading must be done by a controller when the agent is actually used. """ from typing import Dict, Any, Optional, Callable, Awaitable import librosa from logger import logger from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq from aiortc import MediaStreamTrack # Import shared models for chat functionality import sys import os sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from shared.models import ChatMessageModel AGENT_NAME = "whisper" AGENT_DESCRIPTION = "Speech recognition agent (Whisper) - processes incoming audio" def agent_info() -> Dict[str, str]: return {"name": AGENT_NAME, "description": AGENT_DESCRIPTION} def create_agent_tracks(session_name: str) -> dict[str, MediaStreamTrack]: """Whisper is not a media source - return no local tracks.""" return {} async def handle_chat_message(chat_message: ChatMessageModel, send_message_func: Callable[[str], Awaitable[None]]) -> Optional[str]: """Handle incoming chat messages and optionally return a response. Args: chat_message: The received chat message send_message_func: Function to send messages back to the lobby Returns: Optional response message to send back to the lobby """ logger.info(f"Whisper bot received chat message from {chat_message.sender_name}: {chat_message.message}") # Simple echo bot behavior for demonstration if chat_message.message.lower().startswith("whisper:"): command = chat_message.message[8:].strip() # Remove "whisper:" prefix if command.lower() == "hello": return f"Hello {chat_message.sender_name}! I'm the Whisper speech recognition bot." elif command.lower() == "help": return "I can process speech and respond to simple commands. Try 'whisper: hello' or 'whisper: status'" elif command.lower() == "status": return "Whisper bot is running and ready to process audio and chat messages." else: return f"I heard you say: {command}. Try 'whisper: help' for available commands." # Don't respond to other messages return None def do_work(): model_ids = { "Distil-Whisper": [ "distil-whisper/distil-large-v2", "distil-whisper/distil-medium.en", "distil-whisper/distil-small.en" ], "Whisper": [ "openai/whisper-large-v3", "openai/whisper-large-v2", "openai/whisper-large", "openai/whisper-medium", "openai/whisper-small", "openai/whisper-base", "openai/whisper-tiny", "openai/whisper-medium.en", "openai/whisper-small.en", "openai/whisper-base.en", "openai/whisper-tiny.en", ] } model_type = model_ids["Distil-Whisper"] logger.info(model_type) model_id = model_type[0] processor: Any = AutoProcessor.from_pretrained(pretrained_model_name_or_path=model_id) # type: ignore pt_model: Any = AutoModelForSpeechSeq2Seq.from_pretrained(pretrained_model_name_or_path=model_id) # type: ignore pt_model.eval() # type: ignore def extract_input_features(audio_array: Any, sampling_rate: int) -> Any: """Extract input features from audio array and sampling rate.""" processor_output = processor( # type: ignore audio_array, sampling_rate=sampling_rate, return_tensors="pt", ) input_features: Any = processor_output.input_features # type: ignore return input_features # type: ignore def load_audio_file(file_path: str) -> tuple[Any, int]: """Load audio file from disk and return audio array and sampling rate.""" # Whisper models expect 16kHz sample rate target_sample_rate = 16000 try: # Load audio file using librosa and resample to target rate audio_array, original_sampling_rate = librosa.load(file_path, sr=None) # type: ignore logger.info(f"Loaded audio file: {file_path}, duration: {len(audio_array)/original_sampling_rate:.2f}s, original sample rate: {original_sampling_rate}Hz") # type: ignore # Resample if necessary if original_sampling_rate != target_sample_rate: audio_array = librosa.resample(audio_array, orig_sr=original_sampling_rate, target_sr=target_sample_rate) # type: ignore logger.info(f"Resampled audio from {original_sampling_rate}Hz to {target_sample_rate}Hz") return audio_array, target_sample_rate # type: ignore except Exception as e: logger.error(f"Error loading audio file {file_path}: {e}") raise # Example usage - replace with your audio file path audio_file_path = "/voicebot/F_0818_15y11m_1.wav" # Load audio from file instead of dataset try: audio_array, sampling_rate = load_audio_file(audio_file_path) input_features = extract_input_features(audio_array, sampling_rate) predicted_ids = pt_model.generate(input_features) # type: ignore transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) # type: ignore print(f"Audio file: {audio_file_path}") print(f"Transcription: {transcription[0]}") except FileNotFoundError: logger.error(f"Audio file not found: {audio_file_path}") print("Please update the audio_file_path variable with a valid path to your wav file") except Exception as e: logger.error(f"Error processing audio: {e}") print(f"Error: {e}")