Multi-user transcription

James Ketr 2025-09-05 16:50:19 -07:00
parent d6791a5233
commit 9089edaeea
6 changed files with 788 additions and 107 deletions

View File

@@ -792,8 +792,44 @@ const MediaAgent = (props: MediaAgentProps) => {
for (const candidate of pendingCandidates) {
try {
await pc.addIceCandidate(new RTCIceCandidate(candidate));
console.log(`media-agent - sessionDescription:${peer_name} - Queued ICE candidate added`);
if (!candidate.candidate) {
// End-of-candidates signal
await pc.addIceCandidate(undefined);
console.log(`media-agent - sessionDescription:${peer_name} - Queued end-of-candidates added`);
} else {
// Coerce and sanitize the incoming candidate before handing to the browser
let candStr: string | null = candidate.candidate ?? null;
if (typeof candStr === "string") {
candStr = candStr.trim();
// Strip leading 'a=' if present (sometimes sent from SDP parsing)
if (candStr.startsWith("a=candidate:")) {
candStr = candStr.replace(/^a=/, "");
}
// Ensure the string starts with the expected keyword
if (!candStr.startsWith("candidate:")) {
candStr = `candidate:${candStr}`;
}
}
const candidateInit: RTCIceCandidateInit = {
candidate: candStr ?? "",
sdpMid: candidate.sdpMid ?? undefined,
sdpMLineIndex:
typeof candidate.sdpMLineIndex === "number"
? candidate.sdpMLineIndex
: undefined,
};
try {
await pc.addIceCandidate(candidateInit);
console.log(`media-agent - sessionDescription:${peer_name} - Queued ICE candidate added`);
} catch (err) {
console.error(
`media-agent - sessionDescription:${peer_name} - Failed to add queued ICE candidate:`,
{ candidateInit, err }
);
}
}
} catch (err) {
console.error(`media-agent - sessionDescription:${peer_name} - Failed to add queued ICE candidate:`, err);
}
@@ -899,10 +935,42 @@ const MediaAgent = (props: MediaAgentProps) => {
}
// Add the ICE candidate
peer.connection
.addIceCandidate(new RTCIceCandidate(candidate))
.then(() => console.log(`media-agent - iceCandidate::${peer_name} - ICE candidate added for ${peer.peer_name}`))
.catch((err) => console.error(`media-agent - iceCandidate::${peer_name} - Failed to add ICE candidate:`, err));
if (!candidate.candidate) {
// End-of-candidates signal
peer.connection
.addIceCandidate(undefined)
.then(() =>
console.log(`media-agent - iceCandidate::${peer_name} - End-of-candidates added for ${peer.peer_name}`)
)
.catch((err) =>
console.error(`media-agent - iceCandidate::${peer_name} - Failed to add end-of-candidates:`, err)
);
} else {
// Sanitize and coerce incoming candidate
let candStr: string | null = candidate.candidate ?? null;
if (typeof candStr === "string") {
candStr = candStr.trim();
if (candStr.startsWith("a=candidate:")) {
candStr = candStr.replace(/^a=/, "");
}
if (!candStr.startsWith("candidate:")) {
candStr = `candidate:${candStr}`;
}
}
const candidateInit: RTCIceCandidateInit = {
candidate: candStr ?? "",
sdpMid: candidate.sdpMid ?? undefined,
sdpMLineIndex: typeof candidate.sdpMLineIndex === "number" ? candidate.sdpMLineIndex : undefined,
};
peer.connection
.addIceCandidate(candidateInit)
.then(() =>
console.log(`media-agent - iceCandidate::${peer_name} - ICE candidate added for ${peer.peer_name}`)
)
.catch((err) => console.error(`media-agent - iceCandidate::${peer_name} - Failed to add ICE candidate:`, { candidateInit, err }));
}
},
[peers]
);

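The branches above reduce to a single normalization rule plus the end-of-candidates case. A minimal sketch of that rule, written in Python to match the signaling-client side of this commit; the helper name and the None-for-end-of-candidates convention are illustrative, not part of the diff:

from typing import Optional


def normalize_candidate(raw: Optional[str]) -> Optional[str]:
    """Return a 'candidate:...' string, or None to signal end-of-candidates."""
    if not raw:
        return None  # an empty or missing candidate string marks end-of-candidates
    cand = raw.strip()
    if cand.startswith("a=candidate:"):
        cand = cand[len("a="):]  # drop the SDP attribute prefix
    if not cand.startswith("candidate:"):
        cand = f"candidate:{cand}"  # ensure the expected keyword
    return cand


# Example: 'a=candidate:1 1 UDP 2122252543 192.0.2.1 54321 typ host'
# normalizes to 'candidate:1 1 UDP 2122252543 192.0.2.1 54321 typ host'
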
View File

@@ -25,7 +25,10 @@ import sys
import os
from voicebot.models import Peer
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
from shared.models import ChatMessageModel
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
@@ -33,20 +36,25 @@ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
# Type definitions
AudioArray = npt.NDArray[np.float32]
class AudioQueueItem(BaseModel):
"""Audio data with timestamp for processing queue."""
audio: AudioArray
timestamp: float
class Config:
arbitrary_types_allowed = True
class TranscriptionHistoryItem(BaseModel):
"""Transcription history item with metadata."""
message: str
timestamp: float
is_final: bool
AGENT_NAME = "whisper"
AGENT_DESCRIPTION = "Real-time speech transcription (Whisper) - converts speech to text"
sample_rate = 16000 # Whisper expects 16kHz
@@ -55,7 +63,7 @@ model_ids = {
"Distil-Whisper": [
"distil-whisper/distil-large-v2",
"distil-whisper/distil-medium.en",
"distil-whisper/distil-small.en"
"distil-whisper/distil-small.en",
],
"Whisper": [
"openai/whisper-large-v3",
@@ -69,15 +77,24 @@ model_ids = {
"openai/whisper-small.en",
"openai/whisper-base.en",
"openai/whisper-tiny.en",
]
],
}
# Global whisper model and transcription handler
_model_type = model_ids["Distil-Whisper"]
_model_id = _model_type[0]
logger.info(f"Loading Whisper model: {_model_id}")
_processor: Any = AutoProcessor.from_pretrained(pretrained_model_name_or_path=_model_id) # type: ignore
_pt_model: Any = AutoModelForSpeechSeq2Seq.from_pretrained(pretrained_model_name_or_path=_model_id) # type: ignore
logger.info("Whisper processor loaded successfully")
_pt_model: Any = AutoModelForSpeechSeq2Seq.from_pretrained(
pretrained_model_name_or_path=_model_id
) # type: ignore
_pt_model.eval() # type: ignore
_audio_processor: Optional['AudioProcessor'] = None
logger.info("Whisper model loaded and set to evaluation mode")
_audio_processors: Dict[str, "AudioProcessor"] = {} # Per-peer audio processors
_send_chat_func: Optional[Callable[[str], Awaitable[None]]] = None
def extract_input_features(audio_array: Any, sampling_rate: int) -> Any:
@@ -90,17 +107,25 @@ def extract_input_features(audio_array: Any, sampling_rate: int) -> Any:
input_features: Any = processor_output.input_features # type: ignore
return input_features # type: ignore
class AudioProcessor:
"""Handles audio stream processing and transcription with sentence chunking."""
def __init__(self, send_chat_func: Callable[[str], Awaitable[None]]):
class AudioProcessor:
"""Handles audio stream processing and transcription with sentence chunking for a specific peer."""
def __init__(
self, peer_name: str, send_chat_func: Callable[[str], Awaitable[None]]
):
self.peer_name = peer_name
self.send_chat_func = send_chat_func
self.sample_rate = 16000 # Whisper expects 16kHz
self.samples_per_frame = 480 # Common WebRTC frame size at 16kHz (30ms)
# Audio buffering
self.audio_buffer: Deque[AudioArray] = deque(maxlen=1000) # ~30 seconds at 30ms frames
self.phrase_timeout = 3.0 # seconds of silence before considering phrase complete
self.audio_buffer: Deque[AudioArray] = deque(
maxlen=1000
) # ~30 seconds at 30ms frames
self.phrase_timeout = (
3.0 # seconds of silence before considering phrase complete
)
self.last_activity_time = time.time()
# Transcription state
@@ -110,14 +135,19 @@ class AudioProcessor:
# Background processing
self.processing_queue: Queue[AudioQueueItem] = Queue()
self.is_running = True
self.processor_thread = threading.Thread(target=self._processing_loop, daemon=True)
self.processor_thread = threading.Thread(
target=self._processing_loop, daemon=True
)
self.processor_thread.start()
logger.info("AudioProcessor initialized for real-time transcription")
logger.info(
f"AudioProcessor initialized for {self.peer_name} - sample_rate: {self.sample_rate}Hz, frame_size: {self.samples_per_frame}, phrase_timeout: {self.phrase_timeout}s"
)
def add_audio_data(self, audio_data: AudioArray):
"""Add new audio data to the processing buffer."""
if not self.is_running:
logger.debug("AudioProcessor not running, ignoring audio data")
return
# Resample if needed (WebRTC might provide different sample rates)
@@ -125,40 +155,93 @@ class AudioProcessor:
self.audio_buffer.append(audio_data)
self.last_activity_time = time.time()
# Calculate audio metrics to detect silence
audio_rms = np.sqrt(np.mean(audio_data**2))
audio_peak = np.max(np.abs(audio_data))
# Log audio buffer status (reduced verbosity)
buffer_duration_ms = len(self.audio_buffer) * 30 # assuming 30ms frames
# Only log if we have meaningful audio or every 50 frames
if audio_rms > 0.001 or len(self.audio_buffer) % 50 == 0:
logger.info(
f"Added audio chunk: {len(audio_data)} samples, buffer size: {len(self.audio_buffer)} frames ({buffer_duration_ms}ms), RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
)
else:
logger.debug(
f"Added silent audio chunk: {len(audio_data)} samples, buffer size: {len(self.audio_buffer)} frames"
)
# Check if we should process accumulated audio
if len(self.audio_buffer) >= 10: # Process every ~300ms (10 * 30ms frames)
self._queue_for_processing()
# Check if we have any meaningful audio in the buffer
combined_audio = np.concatenate(list(self.audio_buffer))
combined_rms = np.sqrt(np.mean(combined_audio**2))
if combined_rms > 0.001: # Only process if not silence
logger.info(
f"Buffer threshold reached with meaningful audio (RMS: {combined_rms:.4f}), queuing for processing"
)
self._queue_for_processing()
else:
logger.debug(
f"Buffer threshold reached but audio is silent (RMS: {combined_rms:.4f}), clearing buffer"
)
self.audio_buffer.clear() # Clear silent audio
def _queue_for_processing(self):
"""Queue current audio buffer for transcription processing."""
if not self.audio_buffer:
logger.debug("No audio in buffer to queue for processing")
return
# Combine recent audio frames
combined_audio = np.concatenate(list(self.audio_buffer))
self.audio_buffer.clear()
# Calculate audio metrics
audio_duration_sec = len(combined_audio) / self.sample_rate
audio_rms = np.sqrt(np.mean(combined_audio**2))
audio_peak = np.max(np.abs(combined_audio))
# Skip completely silent audio
if audio_rms < 0.001 and audio_peak < 0.001:
logger.debug(
f"Skipping silent audio chunk: RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
)
return
logger.info(
f"Queuing audio chunk: {len(combined_audio)} samples, {audio_duration_sec:.2f}s duration, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
)
# Add to processing queue
try:
queue_item = AudioQueueItem(
audio=combined_audio,
timestamp=time.time()
)
queue_item = AudioQueueItem(audio=combined_audio, timestamp=time.time())
self.processing_queue.put_nowait(queue_item)
except Exception:
logger.info(
f"Added to processing queue, queue size: {self.processing_queue.qsize()}"
)
except Exception as e:
# Queue full, skip this chunk
logger.debug("Audio processing queue full, dropping audio chunk")
logger.warning(f"Audio processing queue full, dropping audio chunk: {e}")
def _processing_loop(self):
"""Background thread that processes audio chunks for transcription."""
global _whisper_model
logger.info("ASR processing loop started")
while self.is_running:
try:
# Get audio chunk to process (blocking with timeout)
try:
audio_data = self.processing_queue.get(timeout=1.0)
logger.debug(
f"Retrieved audio chunk from queue, remaining queue size: {self.processing_queue.qsize()}"
)
except Empty:
logger.debug("Processing queue timeout, checking for more audio...")
continue
audio_array = audio_data.audio
@@ -168,69 +251,159 @@ class AudioProcessor:
time_since_last = chunk_timestamp - self.last_activity_time
phrase_complete = time_since_last > self.phrase_timeout
logger.debug(
f"Processing audio chunk: {len(audio_array)} samples, time since last: {time_since_last:.2f}s, phrase_complete: {phrase_complete}"
)
if phrase_complete and len(self.current_phrase_audio) > 0:
# Process the completed phrase
phrase_duration = len(self.current_phrase_audio) / self.sample_rate
phrase_rms = np.sqrt(np.mean(self.current_phrase_audio**2))
logger.info(
f"Processing completed phrase: {phrase_duration:.2f}s duration, {len(self.current_phrase_audio)} samples, RMS: {phrase_rms:.4f}"
)
try:
loop = asyncio.get_event_loop()
asyncio.run_coroutine_threadsafe(
self._transcribe_and_send(self.current_phrase_audio.copy(), is_final=True),
loop
self._transcribe_and_send(
self.current_phrase_audio.copy(), is_final=True
),
loop,
)
except RuntimeError:
except RuntimeError as e:
# No event loop running, skip this transcription
logger.warning(
f"No event loop available for final transcription: {e}"
)
pass
self.current_phrase_audio = np.array([], dtype=np.float32)
# Add new audio to current phrase
self.current_phrase_audio = np.concatenate([self.current_phrase_audio, audio_array])
old_phrase_length = len(self.current_phrase_audio)
self.current_phrase_audio = np.concatenate(
[self.current_phrase_audio, audio_array]
)
current_phrase_duration = (
len(self.current_phrase_audio) / self.sample_rate
)
# Also do streaming transcription for immediate feedback
if len(self.current_phrase_audio) > self.sample_rate * 2: # At least 2 seconds
logger.debug(
f"Updated current phrase: {old_phrase_length} -> {len(self.current_phrase_audio)} samples ({current_phrase_duration:.2f}s)"
)
# Lower the threshold for streaming transcription to catch shorter phrases
min_transcription_duration = 1.0 # Reduced from 2.0 seconds
if (
len(self.current_phrase_audio)
> self.sample_rate * min_transcription_duration
): # At least 1 second
phrase_rms = np.sqrt(np.mean(self.current_phrase_audio**2))
logger.info(
f"Current phrase >= {min_transcription_duration}s (RMS: {phrase_rms:.4f}), attempting streaming transcription"
)
try:
loop = asyncio.get_event_loop()
asyncio.run_coroutine_threadsafe(
self._transcribe_and_send(self.current_phrase_audio.copy(), is_final=False),
loop
self._transcribe_and_send(
self.current_phrase_audio.copy(), is_final=False
),
loop,
)
except RuntimeError:
except RuntimeError as e:
# No event loop running, skip this transcription
logger.warning(
f"No event loop available for streaming transcription: {e}"
)
pass
except Exception as e:
logger.error(f"Error in audio processing loop: {e}", exc_info=True)
logger.info("ASR processing loop ended")
async def _transcribe_and_send(self, audio_array: AudioArray, is_final: bool):
"""Transcribe audio and send result as chat message."""
global sample_rate
transcription_start_time = time.time()
transcription_type = "final" if is_final else "streaming"
try:
if len(audio_array) < self.sample_rate * 0.5: # Skip very short audio
audio_duration_sec = len(audio_array) / self.sample_rate
# Reduce minimum audio duration threshold
min_duration = 0.3 # Reduced from 0.5 seconds
if len(audio_array) < self.sample_rate * min_duration:
logger.debug(
f"Skipping {transcription_type} transcription: audio too short ({audio_duration_sec:.2f}s < {min_duration}s)"
)
return
# Calculate audio quality metrics
audio_rms = np.sqrt(np.mean(audio_array**2))
audio_peak = np.max(np.abs(audio_array))
# More lenient silence detection
if audio_rms < 0.0005: # Very quiet threshold
logger.debug(
f"Skipping {transcription_type} transcription: audio too quiet (RMS: {audio_rms:.6f})"
)
return
logger.info(
f"Starting {transcription_type} transcription: {audio_duration_sec:.2f}s audio, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
)
# Ensure audio is in the right format for Whisper
audio_array = audio_array.astype(np.float32)
# Transcribe with Whisper
feature_extraction_start = time.time()
input_features = extract_input_features(audio_array, sample_rate)
feature_extraction_time = time.time() - feature_extraction_start
model_inference_start = time.time()
predicted_ids = _pt_model.generate(input_features) # type: ignore
transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True) # type: ignore
model_inference_time = time.time() - model_inference_start
decoding_start = time.time()
transcription = _processor.batch_decode(
predicted_ids, skip_special_tokens=True
) # type: ignore
decoding_time = time.time() - decoding_start
text = transcription.strip()
total_transcription_time = time.time() - transcription_start_time
if text and len(text) > 1: # Only send meaningful transcriptions
prefix = "🎤 " if is_final else "🎤 [partial] "
logger.debug(
f"ASR timing - Feature extraction: {feature_extraction_time:.3f}s, Model inference: {model_inference_time:.3f}s, Decoding: {decoding_time:.3f}s, Total: {total_transcription_time:.3f}s"
)
text = (
transcription[0].strip()
if transcription and len(transcription) > 0
else ""
)
if text and len(text) > 0: # Accept any non-empty text
prefix = (
f"🎤 {self.peer_name}: "
if is_final
else f"🎤 {self.peer_name} [partial]: "
)
message = f"{prefix}{text}"
# Avoid sending duplicate messages
if is_final or message not in [h.message for h in self.transcription_history[-3:]]:
if is_final or message not in [
h.message for h in self.transcription_history[-3:]
]:
await self.send_chat_func(message)
# Keep history for deduplication
history_item = TranscriptionHistoryItem(
message=message,
timestamp=time.time(),
is_final=is_final
message=message, timestamp=time.time(), is_final=is_final
)
self.transcription_history.append(history_item)
@@ -238,66 +411,152 @@ class AudioProcessor:
if len(self.transcription_history) > 10:
self.transcription_history.pop(0)
logger.info(f"Transcribed ({'final' if is_final else 'partial'}): {text}")
logger.info(
f"✅ Transcribed ({transcription_type}) for {self.peer_name}: '{text}' (processing time: {total_transcription_time:.3f}s, audio duration: {audio_duration_sec:.2f}s)"
)
else:
logger.debug(
f"Skipping duplicate {transcription_type} transcription: '{text}'"
)
else:
logger.info(
f"❌ No text from {transcription_type} transcription for {self.peer_name} (empty result from model)"
)
except Exception as e:
logger.error(f"Error in transcription: {e}", exc_info=True)
logger.error(
f"Error in {transcription_type} transcription: {e}", exc_info=True
)
def shutdown(self):
"""Shutdown the audio processor."""
logger.info(f"Shutting down AudioProcessor for {self.peer_name}...")
self.is_running = False
if self.processor_thread.is_alive():
logger.debug(
f"Waiting for processor thread for {self.peer_name} to finish..."
)
self.processor_thread.join(timeout=2.0)
if self.processor_thread.is_alive():
logger.warning(
f"Processor thread for {self.peer_name} did not shut down cleanly within timeout"
)
else:
logger.info(
f"Processor thread for {self.peer_name} shut down successfully"
)
logger.info(f"AudioProcessor for {self.peer_name} shutdown complete")
async def handle_track_received(peer: Peer, track: MediaStreamTrack):
"""Handle incoming audio tracks from WebRTC peers."""
global _audio_processor
global _audio_processors, _send_chat_func
if track.kind != "audio":
logger.info(f"Ignoring non-audio track: {track.kind}")
logger.info(f"Ignoring non-audio track from {peer.peer_name}: {track.kind}")
return
logger.info(f"Received audio track from {peer.peer_name}, starting transcription")
# Create or get audio processor for this peer
if peer.peer_name not in _audio_processors:
if _send_chat_func is None:
logger.error(
f"Cannot create AudioProcessor for {peer.peer_name}: no send_chat_func available"
)
return
logger.info(f"Creating new AudioProcessor for {peer.peer_name}")
_audio_processors[peer.peer_name] = AudioProcessor(
peer_name=peer.peer_name, send_chat_func=_send_chat_func
)
audio_processor = _audio_processors[peer.peer_name]
logger.info(
f"Received audio track from {peer.peer_name}, starting transcription (processor available: {audio_processor is not None})"
)
try:
while True:
# Receive audio frame
frame = await track.recv()
if isinstance(frame, AudioFrame):
logger.info(f"Received audio frame: {frame.sample_rate}Hz, {frame.format.name}, {frame.layout.name}")
frame_info = (
f"{frame.sample_rate}Hz, {frame.format.name}, {frame.layout.name}"
)
logger.debug(
f"Received audio frame from {peer.peer_name}: {frame_info}"
)
# Convert AudioFrame to numpy array
audio_data = frame.to_ndarray()
original_shape = audio_data.shape
original_dtype = audio_data.dtype
logger.debug(
f"Audio frame data: shape={original_shape}, dtype={original_dtype}"
)
# Handle different audio formats
if audio_data.ndim == 2: # Stereo -> mono
audio_data = np.mean(audio_data, axis=1)
logger.debug(
f"Converted stereo to mono: {original_shape} -> {audio_data.shape}"
)
# Convert to float32 and normalize
if audio_data.dtype == np.int16:
audio_data = audio_data.astype(np.float32) / 32768.0
logger.debug("Normalized int16 audio to float32")
elif audio_data.dtype == np.int32:
audio_data = audio_data.astype(np.float32) / 2147483648.0
logger.debug("Normalized int32 audio to float32")
# Resample to 16kHz if needed
if frame.sample_rate != sample_rate:
audio_data = librosa.resample( # type: ignore
audio_data,
orig_sr=frame.sample_rate,
target_sr=sample_rate
original_length = len(audio_data)
audio_data = librosa.resample( # type: ignore
audio_data, orig_sr=frame.sample_rate, target_sr=sample_rate
)
logger.debug(
f"Resampled audio: {frame.sample_rate}Hz -> {sample_rate}Hz, {original_length} -> {len(audio_data)} samples"
)
# Ensure audio_data is AudioArray (float32)
audio_data_float32 = cast(AudioArray, audio_data.astype(np.float32))
# Calculate audio quality metrics for this frame
frame_rms = np.sqrt(np.mean(audio_data_float32**2))
frame_peak = np.max(np.abs(audio_data_float32))
# Only log full frame details every 20 frames to reduce noise
frame_count = getattr(peer, "_whisper_frame_count", 0) + 1
setattr(peer, "_whisper_frame_count", frame_count)
if frame_count % 20 == 0:
logger.info(
f"Audio frame #{frame_count} from {peer.peer_name}: {frame_info}, {len(audio_data_float32)} samples, RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
)
else:
logger.debug(
f"Audio frame #{frame_count}: RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
)
# Send to audio processor
if _audio_processor:
_audio_processor.add_audio_data(audio_data_float32)
if audio_processor:
audio_processor.add_audio_data(audio_data_float32)
else:
logger.warning(
f"No audio processor available to handle audio data for {peer.peer_name}"
)
else:
logger.warning(f"Received non-audio frame on audio track from {peer.peer_name}")
logger.warning(
f"Received non-audio frame on audio track from {peer.peer_name}: type={type(frame)}"
)
except Exception as e:
logger.error(f"Error processing audio track from {peer.peer_name}: {e}", exc_info=True)
logger.error(
f"Error processing audio track from {peer.peer_name}: {e}", exc_info=True
)
def agent_info() -> Dict[str, str]:
@@ -309,7 +568,9 @@ def create_agent_tracks(session_name: str) -> dict[str, MediaStreamTrack]:
return {}
async def handle_chat_message(chat_message: ChatMessageModel, send_message_func: Callable[[str], Awaitable[None]]) -> Optional[str]:
async def handle_chat_message(
chat_message: ChatMessageModel, send_message_func: Callable[[str], Awaitable[None]]
) -> Optional[str]:
"""Handle incoming chat messages and optionally return a response."""
pass
@@ -318,16 +579,41 @@ async def on_track_received(peer: Peer, track: MediaStreamTrack):
"""Callback when a new track is received from a peer."""
await handle_track_received(peer, track)
# Export functions for the orchestrator to discover
def get_track_handler():
"""Return the track handler function for the orchestrator to use."""
return on_track_received
def bind_send_chat_function(send_chat_func: Callable[[str], Awaitable[None]]):
"""Bind the send chat function to the audio processor."""
global _send_chat_func, _audio_processor
"""Bind the send chat function to be used for all audio processors."""
global _send_chat_func, _audio_processors
logger.info("Binding send chat function to whisper agent")
_send_chat_func = send_chat_func
if _audio_processor:
_audio_processor.send_chat_func = send_chat_func
# Update existing audio processors
for peer_name, processor in _audio_processors.items():
logger.debug(
f"Updating AudioProcessor for {peer_name} with new send chat function"
)
processor.send_chat_func = send_chat_func
def cleanup_peer_processor(peer_name: str):
"""Clean up audio processor for a disconnected peer."""
global _audio_processors
if peer_name in _audio_processors:
logger.info(f"Cleaning up AudioProcessor for disconnected peer: {peer_name}")
processor = _audio_processors[peer_name]
processor.shutdown()
del _audio_processors[peer_name]
logger.info(f"AudioProcessor for {peer_name} cleaned up successfully")
else:
_audio_processor = AudioProcessor(send_chat_func=send_chat_func)
logger.debug(f"No AudioProcessor found for peer {peer_name} during cleanup")
def get_active_processors() -> Dict[str, "AudioProcessor"]:
"""Get currently active audio processors (for debugging)."""
return _audio_processors.copy()

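AudioProcessor's buffering decisions come down to two numbers: an RMS gate (0.001) that decides whether ~300 ms of buffered frames is worth queueing, and a phrase timeout (3.0 s of inactivity) that marks a phrase as complete. A standalone sketch of that decision with the thresholds used above; the function itself is illustrative and not part of the commit:

import time

import numpy as np


def buffer_decision(frames: list[np.ndarray], last_activity: float,
                    rms_threshold: float = 0.001,
                    phrase_timeout: float = 3.0) -> tuple[bool, bool]:
    """Return (queue_now, phrase_complete) for a buffer of 30 ms float32 frames."""
    phrase_complete = (time.time() - last_activity) > phrase_timeout
    if not frames:
        return False, phrase_complete
    combined = np.concatenate(frames)
    rms = float(np.sqrt(np.mean(combined ** 2)))
    queue_now = len(frames) >= 10 and rms > rms_threshold  # ~300 ms of non-silent audio
    return queue_now, phrase_complete
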
View File

@@ -0,0 +1,148 @@
#!/usr/bin/env python3
"""
Force transcription debug - processes any accumulated audio immediately.
Run this to force the whisper agent to attempt transcription of the current audio buffer.
"""
import sys
import os
import asyncio
import numpy as np
# Add the voicebot directory to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
def force_transcription():
"""Force transcription of any accumulated audio."""
try:
from bots.whisper import _audio_processors
if not _audio_processors:
print(
"❌ No audio processors found. Whisper agent may not be running or no peers connected."
)
return
print(f"🔍 Found {len(_audio_processors)} active audio processors:")
for peer_name, audio_processor in _audio_processors.items():
print(f"\n👤 {peer_name}:")
print(f" - Running: {audio_processor.is_running}")
print(f" - Buffer size: {len(audio_processor.audio_buffer)} frames")
print(f" - Queue size: {audio_processor.processing_queue.qsize()}")
print(
f" - Current phrase length: {len(audio_processor.current_phrase_audio)} samples"
)
# Force processing of current buffer
if len(audio_processor.audio_buffer) > 0:
print(
f"🔄 Forcing processing of {len(audio_processor.audio_buffer)} buffered frames for {peer_name}..."
)
audio_processor._queue_for_processing()
else:
print(f"📭 No audio in buffer to process for {peer_name}")
# If we have a current phrase, try to transcribe it
if len(audio_processor.current_phrase_audio) > 0:
phrase_duration = (
len(audio_processor.current_phrase_audio)
/ audio_processor.sample_rate
)
phrase_rms = np.sqrt(np.mean(audio_processor.current_phrase_audio**2))
print(
f"🎤 Current phrase for {peer_name}: {phrase_duration:.2f}s, RMS: {phrase_rms:.6f}"
)
if phrase_duration > 0.3: # Minimum duration
print(
f"🚀 Forcing transcription of current phrase for {peer_name}..."
)
# Create an event loop if none exists
try:
loop = asyncio.get_event_loop()
except RuntimeError:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
# Force transcription
async def force_transcribe():
await audio_processor._transcribe_and_send(
audio_processor.current_phrase_audio.copy(), is_final=True
)
loop.run_until_complete(force_transcribe())
print(f"✅ Forced transcription completed for {peer_name}")
else:
print(
f"⏱️ Current phrase too short for {peer_name} ({phrase_duration:.2f}s < 0.3s)"
)
else:
print(f"🤐 No current phrase to transcribe for {peer_name}")
except ImportError:
print(
"❌ Could not import whisper components. Make sure the whisper agent is loaded."
)
except Exception as e:
print(f"❌ Error: {e}")
def show_audio_stats():
"""Show detailed audio statistics."""
try:
from bots.whisper import _audio_processors
if not _audio_processors:
print("❌ No audio processors found")
return
print(
f"\n📊 Detailed Audio Statistics for {len(_audio_processors)} processors:"
)
for peer_name, audio_processor in _audio_processors.items():
print(f"\n👤 {peer_name}:")
print(f"Sample rate: {audio_processor.sample_rate}Hz")
print(f"Samples per frame: {audio_processor.samples_per_frame}")
print(f"Phrase timeout: {audio_processor.phrase_timeout}s")
print(f"Buffer max length: {audio_processor.audio_buffer.maxlen}")
print(f"Current buffer size: {len(audio_processor.audio_buffer)}")
print(f"Processing queue size: {audio_processor.processing_queue.qsize()}")
if len(audio_processor.current_phrase_audio) > 0:
phrase_duration = (
len(audio_processor.current_phrase_audio)
/ audio_processor.sample_rate
)
phrase_rms = np.sqrt(np.mean(audio_processor.current_phrase_audio**2))
phrase_peak = np.max(np.abs(audio_processor.current_phrase_audio))
print(" Current phrase:")
print(f" Duration: {phrase_duration:.2f}s")
print(f" Samples: {len(audio_processor.current_phrase_audio)}")
print(f" RMS: {phrase_rms:.6f}")
print(f" Peak: {phrase_peak:.6f}")
if len(audio_processor.audio_buffer) > 0:
combined = np.concatenate(list(audio_processor.audio_buffer))
buffer_duration = len(combined) / audio_processor.sample_rate
buffer_rms = np.sqrt(np.mean(combined**2))
buffer_peak = np.max(np.abs(combined))
print(" Buffer contents:")
print(f" Duration: {buffer_duration:.2f}s")
print(f" Samples: {len(combined)}")
print(f" RMS: {buffer_rms:.6f}")
print(f" Peak: {buffer_peak:.6f}")
except Exception as e:
print(f"❌ Error getting stats: {e}")
if __name__ == "__main__":
if len(sys.argv) > 1 and sys.argv[1] == "stats":
show_audio_stats()
else:
force_transcription()
show_audio_stats()

View File

@@ -0,0 +1,53 @@
#!/usr/bin/env python3
"""
Helper script to adjust whisper ASR logging levels for debugging.
Run this to see more detailed ASR logging.
"""
import logging
import sys
import os
# Add the voicebot directory to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
from logger import logger
def set_debug_logging():
"""Set logger to DEBUG level for detailed ASR logging."""
logger.setLevel(logging.DEBUG)
# Also set the root logger
logging.getLogger().setLevel(logging.DEBUG)
# Create a more detailed formatter if needed
formatter = logging.Formatter(
"%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s"
)
# Update all handlers
for handler in logger.handlers:
handler.setLevel(logging.DEBUG)
handler.setFormatter(formatter)
logger.info("Debug logging enabled for Whisper ASR")
def set_info_logging():
"""Set logger back to INFO level."""
logger.setLevel(logging.INFO)
logging.getLogger().setLevel(logging.INFO)
# Update all handlers
for handler in logger.handlers:
handler.setLevel(logging.INFO)
logger.info("Info logging enabled for Whisper ASR")
if __name__ == "__main__":
if len(sys.argv) > 1 and sys.argv[1] == "info":
set_info_logging()
else:
set_debug_logging()

View File

@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""
Debug script to test Whisper transcription with synthetic audio.
This helps identify whether the issue is in the audio processing or in the transcription pipeline.
"""
import numpy as np
import time
import sys
import os
# Add the voicebot directory to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
try:
from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate
except ImportError as e:
print(f"Error importing whisper components: {e}")
print("Make sure you're running this from the voicebot directory")
sys.exit(1)
def generate_test_audio(
duration_seconds: float = 2.0, frequency: float = 440.0
) -> np.ndarray:
"""Generate a synthetic sine wave for testing."""
samples = int(duration_seconds * sample_rate)
t = np.linspace(0, duration_seconds, samples, False)
# Generate a sine wave with some amplitude modulation to simulate speech-like patterns
amplitude = 0.1 * (
1 + 0.5 * np.sin(2 * np.pi * 2 * t)
) # Amplitude modulation at 2Hz
audio = amplitude * np.sin(2 * np.pi * frequency * t)
return audio.astype(np.float32)
def test_transcription_pipeline():
"""Test the Whisper transcription pipeline with synthetic audio."""
print("Testing Whisper transcription pipeline...")
# Test 1: Complete silence
print("\n=== Test 1: Complete Silence ===")
silent_audio = np.zeros(int(sample_rate * 2.0), dtype=np.float32)
test_audio_transcription(silent_audio, "Silent audio")
# Test 2: Very quiet noise
print("\n=== Test 2: Very Quiet Noise ===")
quiet_noise = np.random.normal(0, 0.001, int(sample_rate * 2.0)).astype(np.float32)
test_audio_transcription(quiet_noise, "Quiet noise")
# Test 3: Sine wave (should produce some output)
print("\n=== Test 3: Sine Wave ===")
sine_audio = generate_test_audio(2.0, 440.0)
test_audio_transcription(sine_audio, "Sine wave")
# Test 4: Multiple frequency sine wave
print("\n=== Test 4: Complex Sine Wave ===")
complex_audio = (
generate_test_audio(2.0, 220.0)
+ generate_test_audio(2.0, 440.0)
+ generate_test_audio(2.0, 880.0)
) / 3.0
test_audio_transcription(complex_audio, "Complex sine wave")
def test_audio_transcription(audio_array: np.ndarray, description: str):
"""Test transcription of a specific audio array."""
try:
# Calculate metrics
duration = len(audio_array) / sample_rate
rms = np.sqrt(np.mean(audio_array**2))
peak = np.max(np.abs(audio_array))
print(f"Testing {description}:")
print(f" Duration: {duration:.2f}s")
print(f" Samples: {len(audio_array)}")
print(f" RMS: {rms:.6f}")
print(f" Peak: {peak:.6f}")
# Test feature extraction
start_time = time.time()
input_features = extract_input_features(audio_array, sample_rate)
feature_time = time.time() - start_time
print(f" Feature extraction: {feature_time:.3f}s")
# Test model inference
start_time = time.time()
predicted_ids = _pt_model.generate(input_features)
inference_time = time.time() - start_time
print(f" Model inference: {inference_time:.3f}s")
# Test decoding
start_time = time.time()
transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
decoding_time = time.time() - start_time
print(f" Decoding: {decoding_time:.3f}s")
# Show result
text = (
transcription[0].strip() if transcription and len(transcription) > 0 else ""
)
print(f" Result: '{text}'" if text else " Result: (empty)")
print(f" Result length: {len(text)}")
except Exception as e:
print(f" ERROR: {e}")
if __name__ == "__main__":
test_transcription_pipeline()

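The pipeline test above calls the model directly; the buffering path can be exercised the same way by feeding synthetic frames into a per-peer AudioProcessor with a stub chat callback. A rough sketch, assuming bots.whisper is importable as in the other debug scripts and making no claim about what text, if any, the model emits for noise:

import asyncio

import numpy as np

from bots.whisper import AudioProcessor


async def exercise_processor() -> None:
    async def capture_chat(message: str) -> None:  # stand-in for the orchestrator's send function
        print(f"chat: {message}")

    processor = AudioProcessor(peer_name="test-peer", send_chat_func=capture_chat)
    try:
        # ~2 s of 30 ms frames of low-level noise; pure silence would be cleared by the RMS gate
        frame = np.random.normal(0, 0.01, 480).astype(np.float32)
        for _ in range(64):
            processor.add_audio_data(frame)
        await asyncio.sleep(2.0)  # give the background thread time to drain the queue
        print(
            f"buffer: {len(processor.audio_buffer)} frames, "
            f"queue: {processor.processing_queue.qsize()}, "
            f"phrase: {len(processor.current_phrase_audio)} samples"
        )
    finally:
        processor.shutdown()


if __name__ == "__main__":
    asyncio.run(exercise_processor())
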
View File

@@ -778,6 +778,14 @@ class WebRTCSignalingClient:
f"ICE candidate outgoing for {peer_name}: type={cand_type} protocol={protocol} sdp={raw}"
)
# Ensure candidate has the proper SDP format
if raw and not raw.startswith("candidate:"):
raw = f"candidate:{raw}"
# Clean up any extra spaces
if raw:
raw = raw.replace("candidate: ", "candidate:")
candidate_model = ICECandidateDictModel(
candidate=raw,
sdpMid=getattr(candidate, "sdpMid", None),
@@ -965,6 +973,14 @@ class WebRTCSignalingClient:
elif line.startswith("a=candidate:"):
candidate_sdp = line[2:] # Remove 'a=' prefix
# Ensure candidate has the proper SDP format
if candidate_sdp and not candidate_sdp.startswith("candidate:"):
candidate_sdp = f"candidate:{candidate_sdp}"
# Clean up any extra spaces
if candidate_sdp:
candidate_sdp = candidate_sdp.replace("candidate: ", "candidate:")
# Only send if we have valid MID and media index
if current_section_mid is not None and current_media_index >= 0:
candidate_model = ICECandidateDictModel(
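The second hunk's SDP walk pairs each trailing a=candidate line with the media section it appeared under before sending it. A condensed, self-contained sketch of that bookkeeping; the m=/a=mid: tracking here is inferred rather than shown in this hunk, and plain dicts stand in for ICECandidateDictModel:

from typing import Dict, List, Optional


def extract_trailing_candidates(sdp: str) -> List[Dict[str, object]]:
    """Collect a=candidate lines from an SDP blob, tagged with their MID and m-line index."""
    candidates: List[Dict[str, object]] = []
    current_media_index = -1
    current_mid: Optional[str] = None
    for line in sdp.splitlines():
        line = line.strip()
        if line.startswith("m="):
            current_media_index += 1  # entering a new media section
            current_mid = None
        elif line.startswith("a=mid:"):
            current_mid = line[len("a=mid:"):]
        elif line.startswith("a=candidate:"):
            candidate_sdp = line[2:]  # remove the 'a=' prefix
            if not candidate_sdp.startswith("candidate:"):
                candidate_sdp = f"candidate:{candidate_sdp}"
            candidate_sdp = candidate_sdp.replace("candidate: ", "candidate:")
            # Only keep candidates with a valid MID and media index
            if current_mid is not None and current_media_index >= 0:
                candidates.append({
                    "candidate": candidate_sdp,
                    "sdpMid": current_mid,
                    "sdpMLineIndex": current_media_index,
                })
    return candidates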