From 795e9b1d6707f6ccbd0c2d43598a72e30c1e5c88 Mon Sep 17 00:00:00 2001
From: James Ketrenos
Date: Sun, 7 Sep 2025 21:56:08 -0700
Subject: [PATCH] Snapshot

---
 client/src/MediaControl.tsx  | 54 ++++++++++++++++++++++++++++++------
 voicebot/bots/whisper.py     | 50 +++++++++++++++++++++++----------
 voicebot/webrtc_signaling.py | 10 ++-----
 3 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/client/src/MediaControl.tsx b/client/src/MediaControl.tsx
index eb12fb6..038befe 100644
--- a/client/src/MediaControl.tsx
+++ b/client/src/MediaControl.tsx
@@ -688,7 +688,23 @@ const MediaAgent = (props: MediaAgentProps) => {
       );
       if (media && localUserHasMedia) {
         media.getTracks().forEach((t) => {
-          console.log(`media-agent - addPeer:${peer.peer_name} Adding track:`, t.kind, t.enabled);
+          console.log(`media-agent - addPeer:${peer.peer_name} Adding track:`, {
+            kind: t.kind,
+            enabled: t.enabled,
+            muted: t.muted,
+            readyState: t.readyState,
+            label: t.label,
+            id: t.id,
+          });
+
+          // Enable tracks for bots that need audio/video input (whisper, synthetic media, etc.)
+          if (peer.peer_name.includes("-bot")) {
+            if (t.kind === "audio" || t.kind === "video") {
+              t.enabled = true;
+              console.log(`media-agent - addPeer:${peer.peer_name} Force enabled ${t.kind} track for bot`);
+            }
+          }
+
           connection.addTrack(t, media);
         });
       } else if (!localUserHasMedia) {
@@ -824,10 +840,11 @@
           await pc.addIceCandidate(candidateInit);
           console.log(`media-agent - sessionDescription:${peer_name} - Queued ICE candidate added`);
         } catch (err) {
-          console.error(
-            `media-agent - sessionDescription:${peer_name} - Failed to add queued ICE candidate:`,
-            { candidateInit, err }
-          );
+          console.error(`media-agent - sessionDescription:${peer_name} - Failed to add queued ICE candidate:`, {
+            candidateInit,
+            rawCandidate: candidate,
+            err,
+          });
         }
       }
     } catch (err) {
@@ -969,7 +986,13 @@
         .then(() =>
           console.log(`media-agent - iceCandidate::${peer_name} - ICE candidate added for ${peer.peer_name}`)
         )
-        .catch((err) => console.error(`media-agent - iceCandidate::${peer_name} - Failed to add ICE candidate:`, { candidateInit, err }));
+        .catch((err) =>
+          console.error(`media-agent - iceCandidate::${peer_name} - Failed to add ICE candidate:`, {
+            candidateInit,
+            rawCandidate: candidate,
+            err,
+          })
+        );
     }
   },
   [peers]
@@ -1102,9 +1125,16 @@
     const videoTracks = media.getVideoTracks();
 
     if (audioTracks.length > 0) {
-      tracks.push(audioTracks[0]);
+      const audioTrack = audioTracks[0];
+      tracks.push(audioTrack);
       hasRealAudio = true;
-      console.log("media-agent - Using real audio");
+      console.log("media-agent - Using real audio:", {
+        enabled: audioTrack.enabled,
+        muted: audioTrack.muted,
+        readyState: audioTrack.readyState,
+        label: audioTrack.label,
+        id: audioTrack.id,
+      });
     }
 
     if (videoTracks.length > 0) {
@@ -1130,7 +1160,13 @@
     }
 
     const finalMedia = new MediaStream(tracks);
-    console.log(`media-agent - Media setup complete`);
+    console.log(`media-agent - Media setup complete:`, {
+      totalTracks: finalMedia.getTracks().length,
+      audioTracks: finalMedia.getAudioTracks().length,
+      videoTracks: finalMedia.getVideoTracks().length,
+      hasRealAudio,
+      hasRealVideo,
+    });
     return finalMedia;
   }, [session.name]);
 
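[Note, not part of the applied patch] The force-enable above matters because a
disabled outgoing track is still negotiated and still delivers frames to the
bot; they just decode to all-zero samples, which the whisper bot sees as
silence rather than a missing track. A minimal aiortc-style sketch of how that
shows up on the receiving side (illustrative only; probe_track is not a name
from this codebase):

    import numpy as np
    from aiortc.mediastreams import MediaStreamTrack
    from av import AudioFrame

    async def probe_track(track: MediaStreamTrack) -> None:
        # A disabled sender still produces decodable frames; they are zeros,
        # so the receiver measures RMS ~ 0.0 instead of getting no frames.
        frame = await track.recv()
        if isinstance(frame, AudioFrame):
            data = frame.to_ndarray().astype(np.float32)
            if frame.format.name == "s16":
                data /= 32768.0  # normalize the int16 range, as whisper.py does
            rms = float(np.sqrt(np.mean(data ** 2)))
            print(f"rms={rms:.6f} -> {'silence' if rms <= 0.001 else 'audio'}")
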
diff --git a/voicebot/bots/whisper.py b/voicebot/bots/whisper.py
index 9246882..de876a7 100644
--- a/voicebot/bots/whisper.py
+++ b/voicebot/bots/whisper.py
@@ -480,20 +480,13 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
             # Receive audio frame
             frame = await track.recv()
             if isinstance(frame, AudioFrame):
-                frame_info = (
-                    f"{frame.sample_rate}Hz, {frame.format.name}, {frame.layout.name}"
-                )
-                logger.debug(
-                    f"Received audio frame from {peer.peer_name}: {frame_info}"
-                )
-
                 # Convert AudioFrame to numpy array
                 audio_data = frame.to_ndarray()
                 original_shape = audio_data.shape
                 original_dtype = audio_data.dtype
 
                 logger.debug(
-                    f"Audio frame data: shape={original_shape}, dtype={original_dtype}"
+                    f"Audio frame data: shape={original_shape}, dtype={original_dtype}, samples={frame.samples if hasattr(frame, 'samples') else 'unknown'}"
                 )
 
                 # Handle different audio formats
@@ -528,17 +521,46 @@
                 frame_rms = np.sqrt(np.mean(audio_data_float32**2))
                 frame_peak = np.max(np.abs(audio_data_float32))
 
-                # Only log full frame details every 20 frames to reduce noise
+                # Track frame count and audio state
                 frame_count = getattr(peer, "_whisper_frame_count", 0) + 1
                 setattr(peer, "_whisper_frame_count", frame_count)
-
-                if frame_count % 20 == 0:
+
+                # Track if we've seen audio before (to detect start of speech)
+                had_audio = getattr(peer, "_whisper_had_audio", False)
+
+                # Define thresholds for "real audio" detection
+                audio_threshold = 0.001  # RMS threshold for detecting speech
+                has_audio = frame_rms > audio_threshold
+
+                # Log important audio events
+                if has_audio and not had_audio:
+                    # Started receiving audio
+                    frame_info = f"{frame.sample_rate}Hz, {frame.format.name}, {frame.layout.name}"
                     logger.info(
-                        f"Audio frame #{frame_count} from {peer.peer_name}: {frame_info}, {len(audio_data_float32)} samples, RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
+                        f"🎤 AUDIO DETECTED from {peer.peer_name}! Frame #{frame_count}: {frame_info}, RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
                     )
-                else:
+                    setattr(peer, "_whisper_had_audio", True)
+                    setattr(peer, "_whisper_last_audio_frame", frame_count)
+                elif not has_audio and had_audio:
+                    # Stopped receiving audio
+                    last_audio_frame = getattr(peer, "_whisper_last_audio_frame", 0)
+                    logger.info(
+                        f"🔇 Audio stopped from {peer.peer_name} at frame #{frame_count} (last audio was frame #{last_audio_frame})"
+                    )
+                    setattr(peer, "_whisper_had_audio", False)
+                elif has_audio:
+                    # Continue receiving audio - update last audio frame but don't spam logs
+                    setattr(peer, "_whisper_last_audio_frame", frame_count)
+                    # Only log every 100 frames when continuously receiving audio
+                    if frame_count % 100 == 0:
+                        logger.info(
+                            f"🎤 Audio continuing from {peer.peer_name}: Frame #{frame_count}, RMS: {frame_rms:.4f}"
+                        )
+
+                # Log connection info much less frequently (every 200 frames when silent)
+                if not has_audio and frame_count % 200 == 0:
                     logger.debug(
-                        f"Audio frame #{frame_count}: RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
+                        f"Connection active from {peer.peer_name}: Frame #{frame_count} (silent, RMS: {frame_rms:.6f})"
                     )
 
                 # Send to audio processor
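[Note, not part of the applied patch] The hunk above keeps the speech on/off
state via getattr/setattr on the Peer object. The same transition logic,
pulled out as a standalone class so the thresholding is easy to test in
isolation (a sketch; SpeechGate is an illustrative name, not from the repo):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class SpeechGate:
        """Log-once gate: report silence->speech and speech->silence
        transitions instead of logging every frame."""
        threshold: float = 0.001  # RMS treated as "real audio", as above
        speaking: bool = False
        frame_count: int = 0
        last_audio_frame: int = 0

        def update(self, rms: float) -> Optional[str]:
            # Returns "started" or "stopped" on a transition, else None.
            self.frame_count += 1
            if rms > self.threshold:
                self.last_audio_frame = self.frame_count
                if not self.speaking:
                    self.speaking = True
                    return "started"
            elif self.speaking:
                self.speaking = False
                return "stopped"
            return None

Per-frame use would be `event = gate.update(frame_rms)`, logging only when
`event` is not None, which is why the patched logs stay quiet during long
stretches of continuous speech or silence.
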
diff --git a/voicebot/webrtc_signaling.py b/voicebot/webrtc_signaling.py
index 80e747f..e830c3d 100644
--- a/voicebot/webrtc_signaling.py
+++ b/voicebot/webrtc_signaling.py
@@ -795,7 +795,7 @@ class WebRTCSignalingClient:
                 peer_id=peer_id, peer_name=peer_name, candidate=candidate_model
             )
             logger.info(
-                f"on_ice_candidate: Sending relayICECandidate for {peer_name}: {candidate_model}"
+                f"on_ice_candidate: Sending relayICECandidate for {peer_name}: candidate='{candidate_model.candidate}' sdpMid={candidate_model.sdpMid} sdpMLineIndex={candidate_model.sdpMLineIndex}"
             )
             asyncio.ensure_future(
                 self._send_message("relayICECandidate", payload_model.model_dump())
@@ -971,11 +971,7 @@
                     current_section_mid = str(current_media_index)
 
                 elif line.startswith("a=candidate:"):
-                    candidate_sdp = line[2:]  # Remove 'a=' prefix
-
-                    # Ensure candidate has the proper SDP format
-                    if candidate_sdp and not candidate_sdp.startswith("candidate:"):
-                        candidate_sdp = f"candidate:{candidate_sdp}"
+                    candidate_sdp = line[2:]  # Remove 'a=' prefix, keeping "candidate:..."
 
                     # Clean up any extra spaces
                     if candidate_sdp:
@@ -995,7 +991,7 @@
                     )
 
                     logger.debug(
-                        f"_extract_and_send_candidates: Sending ICE candidate for {peer_name} (mid={current_section_mid}, idx={current_media_index}): {candidate_sdp[:60]}..."
+                        f"_extract_and_send_candidates: Sending ICE candidate for {peer_name} (mid={current_section_mid}, idx={current_media_index}): candidate='{candidate_sdp}'"
                    )
                    await self._send_message(
                        "relayICECandidate", payload_candidate.model_dump()
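
[Note, not part of the applied patch] The `_extract_and_send_candidates`
change works because slicing the two-character "a=" prefix off an
"a=candidate:" line already leaves the "candidate:"-prefixed string, so the
removed re-prefixing guard could never fire. A self-contained check with an
illustrative candidate line (example values, not from a real session):

    # An SDP attribute line as it appears in an offer/answer.
    line = "a=candidate:842163049 1 udp 2122260223 192.168.1.10 54321 typ host"

    if line.startswith("a=candidate:"):
        candidate_sdp = line[2:]  # drop "a=", keeping the "candidate:" prefix
        assert candidate_sdp.startswith("candidate:")  # the removed guard was dead code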