James Ketr 2025-09-07 21:56:08 -07:00
parent 9089edaeea
commit 795e9b1d67
3 changed files with 84 additions and 30 deletions

View File

@@ -688,7 +688,23 @@ const MediaAgent = (props: MediaAgentProps) => {
       );
       if (media && localUserHasMedia) {
         media.getTracks().forEach((t) => {
-          console.log(`media-agent - addPeer:${peer.peer_name} Adding track:`, t.kind, t.enabled);
+          console.log(`media-agent - addPeer:${peer.peer_name} Adding track:`, {
+            kind: t.kind,
+            enabled: t.enabled,
+            muted: t.muted,
+            readyState: t.readyState,
+            label: t.label,
+            id: t.id,
+          });
+          // Enable tracks for bots that need audio/video input (whisper, synthetic media, etc.)
+          if (peer.peer_name.includes("-bot")) {
+            if (t.kind === "audio" || t.kind === "video") {
+              t.enabled = true;
+              console.log(`media-agent - addPeer:${peer.peer_name} Force enabled ${t.kind} track for bot`);
+            }
+          }
           connection.addTrack(t, media);
         });
       } else if (!localUserHasMedia) {
@@ -824,10 +840,11 @@ const MediaAgent = (props: MediaAgentProps) => {
           await pc.addIceCandidate(candidateInit);
           console.log(`media-agent - sessionDescription:${peer_name} - Queued ICE candidate added`);
         } catch (err) {
-          console.error(
-            `media-agent - sessionDescription:${peer_name} - Failed to add queued ICE candidate:`,
-            { candidateInit, err }
-          );
+          console.error(`media-agent - sessionDescription:${peer_name} - Failed to add queued ICE candidate:`, {
+            candidateInit,
+            rawCandidate: candidate,
+            err,
+          });
         }
       }
     } catch (err) {
@@ -969,7 +986,13 @@ const MediaAgent = (props: MediaAgentProps) => {
         .then(() =>
           console.log(`media-agent - iceCandidate::${peer_name} - ICE candidate added for ${peer.peer_name}`)
         )
-        .catch((err) => console.error(`media-agent - iceCandidate::${peer_name} - Failed to add ICE candidate:`, { candidateInit, err }));
+        .catch((err) =>
+          console.error(`media-agent - iceCandidate::${peer_name} - Failed to add ICE candidate:`, {
+            candidateInit,
+            rawCandidate: candidate,
+            err,
+          })
+        );
       }
     },
     [peers]
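
Note on the two error-logging hunks above: addIceCandidate fails when it runs before the remote description has been applied, which is why candidates that arrive early are buffered and replayed after setRemoteDescription (the "Queued ICE candidate added" log). A minimal sketch of the same buffer-and-drain pattern on the Python side, assuming aiortc; CandidateBuffer and its method names are illustrative, not code from this repo:

    from aiortc import RTCPeerConnection, RTCSessionDescription
    from aiortc.sdp import candidate_from_sdp

    class CandidateBuffer:
        # Illustrative sketch: buffer remote ICE candidates that arrive
        # before the remote description is set, then drain them.
        def __init__(self, pc: RTCPeerConnection) -> None:
            self.pc = pc
            self.pending = []

        async def on_remote_candidate(self, candidate_sdp: str, sdp_mid: str, mline_index: int) -> None:
            # candidate_sdp is the "candidate:..." string relayed by signaling;
            # aiortc's parser wants the part after the "candidate:" prefix.
            candidate = candidate_from_sdp(candidate_sdp.split(":", 1)[1])
            candidate.sdpMid = sdp_mid
            candidate.sdpMLineIndex = mline_index
            if self.pc.remoteDescription is None:
                self.pending.append(candidate)  # too early: queue for later
                return
            await self.pc.addIceCandidate(candidate)

        async def on_remote_description(self, sdp: str, sdp_type: str) -> None:
            await self.pc.setRemoteDescription(RTCSessionDescription(sdp=sdp, type=sdp_type))
            for candidate in self.pending:  # replay everything queued above
                await self.pc.addIceCandidate(candidate)
            self.pending.clear()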
@@ -1102,9 +1125,16 @@ const MediaAgent = (props: MediaAgentProps) => {
     const videoTracks = media.getVideoTracks();
     if (audioTracks.length > 0) {
-      tracks.push(audioTracks[0]);
+      const audioTrack = audioTracks[0];
+      tracks.push(audioTrack);
       hasRealAudio = true;
-      console.log("media-agent - Using real audio");
+      console.log("media-agent - Using real audio:", {
+        enabled: audioTrack.enabled,
+        muted: audioTrack.muted,
+        readyState: audioTrack.readyState,
+        label: audioTrack.label,
+        id: audioTrack.id,
+      });
     }
     if (videoTracks.length > 0) {
@@ -1130,7 +1160,13 @@ const MediaAgent = (props: MediaAgentProps) => {
     }
     const finalMedia = new MediaStream(tracks);
-    console.log(`media-agent - Media setup complete`);
+    console.log(`media-agent - Media setup complete:`, {
+      totalTracks: finalMedia.getTracks().length,
+      audioTracks: finalMedia.getAudioTracks().length,
+      videoTracks: finalMedia.getVideoTracks().length,
+      hasRealAudio,
+      hasRealVideo,
+    });
     return finalMedia;
   }, [session.name]);

View File

@@ -480,20 +480,13 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
             # Receive audio frame
             frame = await track.recv()
             if isinstance(frame, AudioFrame):
-                frame_info = (
-                    f"{frame.sample_rate}Hz, {frame.format.name}, {frame.layout.name}"
-                )
-                logger.debug(
-                    f"Received audio frame from {peer.peer_name}: {frame_info}"
-                )
                 # Convert AudioFrame to numpy array
                 audio_data = frame.to_ndarray()
                 original_shape = audio_data.shape
                 original_dtype = audio_data.dtype
                 logger.debug(
-                    f"Audio frame data: shape={original_shape}, dtype={original_dtype}"
+                    f"Audio frame data: shape={original_shape}, dtype={original_dtype}, samples={frame.samples if hasattr(frame, 'samples') else 'unknown'}"
                 )
                 # Handle different audio formats
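
For context on the conversion step: aiortc's track.recv() yields PyAV AudioFrame objects, and to_ndarray() returns an array whose dtype and shape depend on the frame's format and layout (s16 gives int16; planar layouts put channels on the first axis). A sketch, under those assumptions, of the normalization to float32 in [-1.0, 1.0) that the handler's format handling amounts to; channel de-interleaving is deliberately omitted:

    import numpy as np
    from av import AudioFrame  # PyAV; aiortc's track.recv() returns these for audio

    def frame_to_float32(frame: AudioFrame) -> np.ndarray:
        # Sketch only: the repo's actual format handling is more involved.
        data = frame.to_ndarray()      # dtype/shape depend on frame.format/layout
        samples = data.reshape(-1).astype(np.float32)
        if data.dtype == np.int16:
            samples /= 32768.0         # scale s16 to [-1.0, 1.0)
        return samples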
@@ -528,17 +521,46 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
                 frame_rms = np.sqrt(np.mean(audio_data_float32**2))
                 frame_peak = np.max(np.abs(audio_data_float32))
-                # Only log full frame details every 20 frames to reduce noise
+                # Track frame count and audio state
                 frame_count = getattr(peer, "_whisper_frame_count", 0) + 1
                 setattr(peer, "_whisper_frame_count", frame_count)
-                if frame_count % 20 == 0:
+                # Track if we've seen audio before (to detect start of speech)
+                had_audio = getattr(peer, "_whisper_had_audio", False)
+                # Define thresholds for "real audio" detection
+                audio_threshold = 0.001  # RMS threshold for detecting speech
+                has_audio = frame_rms > audio_threshold
+                # Log important audio events
+                if has_audio and not had_audio:
+                    # Started receiving audio
                     frame_info = f"{frame.sample_rate}Hz, {frame.format.name}, {frame.layout.name}"
                     logger.info(
-                        f"Audio frame #{frame_count} from {peer.peer_name}: {frame_info}, {len(audio_data_float32)} samples, RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
+                        f"🎤 AUDIO DETECTED from {peer.peer_name}! Frame #{frame_count}: {frame_info}, RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
                     )
-                else:
+                    setattr(peer, "_whisper_had_audio", True)
+                    setattr(peer, "_whisper_last_audio_frame", frame_count)
+                elif not has_audio and had_audio:
+                    # Stopped receiving audio
+                    last_audio_frame = getattr(peer, "_whisper_last_audio_frame", 0)
+                    logger.info(
+                        f"🔇 Audio stopped from {peer.peer_name} at frame #{frame_count} (last audio was frame #{last_audio_frame})"
+                    )
+                    setattr(peer, "_whisper_had_audio", False)
+                elif has_audio:
+                    # Continue receiving audio - update last audio frame but don't spam logs
+                    setattr(peer, "_whisper_last_audio_frame", frame_count)
+                    # Only log every 100 frames when continuously receiving audio
+                    if frame_count % 100 == 0:
+                        logger.info(
+                            f"🎤 Audio continuing from {peer.peer_name}: Frame #{frame_count}, RMS: {frame_rms:.4f}"
+                        )
+                # Log connection info much less frequently (every 200 frames when silent)
+                if not has_audio and frame_count % 200 == 0:
                     logger.debug(
-                        f"Audio frame #{frame_count}: RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
+                        f"Connection active from {peer.peer_name}: Frame #{frame_count} (silent, RMS: {frame_rms:.6f})"
                     )
                 # Send to audio processor
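
The event-driven logging above is in effect a two-state voice-activity detector with a single RMS threshold (0.001, from the diff) and per-peer state stashed on the Peer object. Extracted into a standalone class for clarity, the same state machine might look like this sketch; the class and its names are illustrative, not the repo's code:

    import numpy as np

    class AudioActivityTracker:
        # Sketch of the start/stop detection in handle_track_received.
        def __init__(self, threshold: float = 0.001) -> None:
            self.threshold = threshold  # RMS threshold from the diff
            self.had_audio = False
            self.frame_count = 0
            self.last_audio_frame = 0

        def update(self, samples: np.ndarray) -> str | None:
            # Returns "started"/"stopped" on transitions, else None.
            self.frame_count += 1
            rms = float(np.sqrt(np.mean(samples.astype(np.float32) ** 2)))
            has_audio = rms > self.threshold
            if has_audio:
                self.last_audio_frame = self.frame_count
            if has_audio and not self.had_audio:
                self.had_audio = True
                return "started"
            if not has_audio and self.had_audio:
                self.had_audio = False
                return "stopped"
            return None

    # Silence sits below the threshold; even a quiet tone clears it easily.
    silence = np.zeros(960, dtype=np.float32)  # one 20 ms frame at 48 kHz
    tone = (0.1 * np.sin(2 * np.pi * 440 * np.arange(960) / 48000)).astype(np.float32)
    tracker = AudioActivityTracker()
    assert tracker.update(silence) is None
    assert tracker.update(tone) == "started"
    assert tracker.update(silence) == "stopped"

This also ties back to the first file: a track the sender leaves disabled is transmitted as silence under WebRTC's enabled/muted semantics, so without the bot force-enable it would never cross this threshold.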

View File

@@ -795,7 +795,7 @@ class WebRTCSignalingClient:
             peer_id=peer_id, peer_name=peer_name, candidate=candidate_model
         )
         logger.info(
-            f"on_ice_candidate: Sending relayICECandidate for {peer_name}: {candidate_model}"
+            f"on_ice_candidate: Sending relayICECandidate for {peer_name}: candidate='{candidate_model.candidate}' sdpMid={candidate_model.sdpMid} sdpMLineIndex={candidate_model.sdpMLineIndex}"
         )
         asyncio.ensure_future(
             self._send_message("relayICECandidate", payload_model.model_dump())
@@ -971,11 +971,7 @@ class WebRTCSignalingClient:
                 current_section_mid = str(current_media_index)
             elif line.startswith("a=candidate:"):
-                candidate_sdp = line[2:]  # Remove 'a=' prefix
-                # Ensure candidate has the proper SDP format
-                if candidate_sdp and not candidate_sdp.startswith("candidate:"):
-                    candidate_sdp = f"candidate:{candidate_sdp}"
+                candidate_sdp = line[2:]  # Remove 'a=' prefix, keeping "candidate:..."
                 # Clean up any extra spaces
                 if candidate_sdp:
@@ -995,7 +991,7 @@ class WebRTCSignalingClient:
                 )
                 logger.debug(
-                    f"_extract_and_send_candidates: Sending ICE candidate for {peer_name} (mid={current_section_mid}, idx={current_media_index}): {candidate_sdp[:60]}..."
+                    f"_extract_and_send_candidates: Sending ICE candidate for {peer_name} (mid={current_section_mid}, idx={current_media_index}): candidate='{candidate_sdp}'"
                 )
                 await self._send_message(
                     "relayICECandidate", payload_candidate.model_dump()