From 795e9b1d6707f6ccbd0c2d43598a72e30c1e5c88 Mon Sep 17 00:00:00 2001
From: James Ketrenos
Date: Sun, 7 Sep 2025 21:56:08 -0700
Subject: [PATCH] Snapshot

---
 client/src/MediaControl.tsx  | 54 ++++++++++++++++++++++++++++++------
 voicebot/bots/whisper.py     | 50 +++++++++++++++++++++++----------
 voicebot/webrtc_signaling.py | 10 ++-----
 3 files changed, 84 insertions(+), 30 deletions(-)

diff --git a/client/src/MediaControl.tsx b/client/src/MediaControl.tsx
index eb12fb6..038befe 100644
--- a/client/src/MediaControl.tsx
+++ b/client/src/MediaControl.tsx
@@ -688,7 +688,23 @@ const MediaAgent = (props: MediaAgentProps) => {
       );
       if (media && localUserHasMedia) {
         media.getTracks().forEach((t) => {
-          console.log(`media-agent - addPeer:${peer.peer_name} Adding track:`, t.kind, t.enabled);
+          console.log(`media-agent - addPeer:${peer.peer_name} Adding track:`, {
+            kind: t.kind,
+            enabled: t.enabled,
+            muted: t.muted,
+            readyState: t.readyState,
+            label: t.label,
+            id: t.id,
+          });
+
+          // Enable tracks for bots that need audio/video input (whisper, synthetic media, etc.)
+          if (peer.peer_name.includes("-bot")) {
+            if (t.kind === "audio" || t.kind === "video") {
+              t.enabled = true;
+              console.log(`media-agent - addPeer:${peer.peer_name} Force enabled ${t.kind} track for bot`);
+            }
+          }
+
           connection.addTrack(t, media);
         });
       } else if (!localUserHasMedia) {
@@ -824,10 +840,11 @@
           await pc.addIceCandidate(candidateInit);
           console.log(`media-agent - sessionDescription:${peer_name} - Queued ICE candidate added`);
         } catch (err) {
-          console.error(
-            `media-agent - sessionDescription:${peer_name} - Failed to add queued ICE candidate:`,
-            { candidateInit, err }
-          );
+          console.error(`media-agent - sessionDescription:${peer_name} - Failed to add queued ICE candidate:`, {
+            candidateInit,
+            rawCandidate: candidate,
+            err,
+          });
         }
       }
     } catch (err) {
@@ -969,7 +986,13 @@
         .then(() =>
           console.log(`media-agent - iceCandidate::${peer_name} - ICE candidate added for ${peer.peer_name}`)
         )
-        .catch((err) => console.error(`media-agent - iceCandidate::${peer_name} - Failed to add ICE candidate:`, { candidateInit, err }));
+        .catch((err) =>
+          console.error(`media-agent - iceCandidate::${peer_name} - Failed to add ICE candidate:`, {
+            candidateInit,
+            rawCandidate: candidate,
+            err,
+          })
+        );
     }
   },
   [peers]
@@ -1102,9 +1125,16 @@
     const videoTracks = media.getVideoTracks();
 
     if (audioTracks.length > 0) {
-      tracks.push(audioTracks[0]);
+      const audioTrack = audioTracks[0];
+      tracks.push(audioTrack);
       hasRealAudio = true;
-      console.log("media-agent - Using real audio");
+      console.log("media-agent - Using real audio:", {
+        enabled: audioTrack.enabled,
+        muted: audioTrack.muted,
+        readyState: audioTrack.readyState,
+        label: audioTrack.label,
+        id: audioTrack.id,
+      });
     }
 
     if (videoTracks.length > 0) {
@@ -1130,7 +1160,13 @@
     }
 
     const finalMedia = new MediaStream(tracks);
-    console.log(`media-agent - Media setup complete`);
+    console.log(`media-agent - Media setup complete:`, {
+      totalTracks: finalMedia.getTracks().length,
+      audioTracks: finalMedia.getAudioTracks().length,
+      videoTracks: finalMedia.getVideoTracks().length,
+      hasRealAudio,
+      hasRealVideo,
+    });
     return finalMedia;
   }, [session.name]);
 
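[Note, not part of the applied patch] The force-enable above matters because a
disabled outgoing track is still negotiated and still delivers frames to the
bot; they just decode to all-zero samples, which the whisper bot sees as
silence rather than a missing track. A minimal aiortc-style sketch of how that
shows up on the receiving side (illustrative only; probe_track is not a name
from this codebase):

    import numpy as np
    from aiortc.mediastreams import MediaStreamTrack
    from av import AudioFrame

    async def probe_track(track: MediaStreamTrack) -> None:
        # A disabled sender still produces decodable frames; they are zeros,
        # so the receiver measures RMS ~ 0.0 instead of getting no frames.
        frame = await track.recv()
        if isinstance(frame, AudioFrame):
            data = frame.to_ndarray().astype(np.float32)
            if frame.format.name == "s16":
                data /= 32768.0  # normalize the int16 range, as whisper.py does
            rms = float(np.sqrt(np.mean(data ** 2)))
            print(f"rms={rms:.6f} -> {'silence' if rms <= 0.001 else 'audio'}")
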
diff --git a/voicebot/bots/whisper.py b/voicebot/bots/whisper.py
index 9246882..de876a7 100644
--- a/voicebot/bots/whisper.py
+++ b/voicebot/bots/whisper.py
@@ -480,20 +480,13 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack):
             # Receive audio frame
             frame = await track.recv()
             if isinstance(frame, AudioFrame):
-                frame_info = (
-                    f"{frame.sample_rate}Hz, {frame.format.name}, {frame.layout.name}"
-                )
-                logger.debug(
-                    f"Received audio frame from {peer.peer_name}: {frame_info}"
-                )
-
                 # Convert AudioFrame to numpy array
                 audio_data = frame.to_ndarray()
                 original_shape = audio_data.shape
                 original_dtype = audio_data.dtype
 
                 logger.debug(
-                    f"Audio frame data: shape={original_shape}, dtype={original_dtype}"
+                    f"Audio frame data: shape={original_shape}, dtype={original_dtype}, samples={frame.samples if hasattr(frame, 'samples') else 'unknown'}"
                 )
 
                 # Handle different audio formats
@@ -528,17 +521,46 @@
                 frame_rms = np.sqrt(np.mean(audio_data_float32**2))
                 frame_peak = np.max(np.abs(audio_data_float32))
 
-                # Only log full frame details every 20 frames to reduce noise
+                # Track frame count and audio state
                 frame_count = getattr(peer, "_whisper_frame_count", 0) + 1
                 setattr(peer, "_whisper_frame_count", frame_count)
-
-                if frame_count % 20 == 0:
+
+                # Track if we've seen audio before (to detect start of speech)
+                had_audio = getattr(peer, "_whisper_had_audio", False)
+
+                # Define thresholds for "real audio" detection
+                audio_threshold = 0.001  # RMS threshold for detecting speech
+                has_audio = frame_rms > audio_threshold
+
+                # Log important audio events
+                if has_audio and not had_audio:
+                    # Started receiving audio
+                    frame_info = f"{frame.sample_rate}Hz, {frame.format.name}, {frame.layout.name}"
                     logger.info(
-                        f"Audio frame #{frame_count} from {peer.peer_name}: {frame_info}, {len(audio_data_float32)} samples, RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
+                        f"🎤 AUDIO DETECTED from {peer.peer_name}! Frame #{frame_count}: {frame_info}, RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
                     )
-                else:
+                    setattr(peer, "_whisper_had_audio", True)
+                    setattr(peer, "_whisper_last_audio_frame", frame_count)
+                elif not has_audio and had_audio:
+                    # Stopped receiving audio
+                    last_audio_frame = getattr(peer, "_whisper_last_audio_frame", 0)
+                    logger.info(
+                        f"🔇 Audio stopped from {peer.peer_name} at frame #{frame_count} (last audio was frame #{last_audio_frame})"
+                    )
+                    setattr(peer, "_whisper_had_audio", False)
+                elif has_audio:
+                    # Continue receiving audio - update last audio frame but don't spam logs
+                    setattr(peer, "_whisper_last_audio_frame", frame_count)
+                    # Only log every 100 frames when continuously receiving audio
+                    if frame_count % 100 == 0:
+                        logger.info(
+                            f"🎤 Audio continuing from {peer.peer_name}: Frame #{frame_count}, RMS: {frame_rms:.4f}"
+                        )
+
+                # Log connection info much less frequently (every 200 frames when silent)
+                if not has_audio and frame_count % 200 == 0:
                     logger.debug(
-                        f"Audio frame #{frame_count}: RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
+                        f"Connection active from {peer.peer_name}: Frame #{frame_count} (silent, RMS: {frame_rms:.6f})"
                     )
 
                 # Send to audio processor
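[Note, not part of the applied patch] The hunk above keeps the speech on/off
state via getattr/setattr on the Peer object. The same transition logic,
pulled out as a standalone class so the thresholding is easy to test in
isolation (a sketch; SpeechGate is an illustrative name, not from the repo):

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class SpeechGate:
        """Log-once gate: report silence->speech and speech->silence
        transitions instead of logging every frame."""
        threshold: float = 0.001  # RMS treated as "real audio", as above
        speaking: bool = False
        frame_count: int = 0
        last_audio_frame: int = 0

        def update(self, rms: float) -> Optional[str]:
            # Returns "started" or "stopped" on a transition, else None.
            self.frame_count += 1
            if rms > self.threshold:
                self.last_audio_frame = self.frame_count
                if not self.speaking:
                    self.speaking = True
                    return "started"
            elif self.speaking:
                self.speaking = False
                return "stopped"
            return None

Per-frame use would be `event = gate.update(frame_rms)`, logging only when
`event` is not None, which is why the patched logs stay quiet during long
stretches of continuous speech or silence.
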
diff --git a/voicebot/webrtc_signaling.py b/voicebot/webrtc_signaling.py
index 80e747f..e830c3d 100644
--- a/voicebot/webrtc_signaling.py
+++ b/voicebot/webrtc_signaling.py
@@ -795,7 +795,7 @@ class WebRTCSignalingClient:
                 peer_id=peer_id, peer_name=peer_name, candidate=candidate_model
             )
             logger.info(
-                f"on_ice_candidate: Sending relayICECandidate for {peer_name}: {candidate_model}"
+                f"on_ice_candidate: Sending relayICECandidate for {peer_name}: candidate='{candidate_model.candidate}' sdpMid={candidate_model.sdpMid} sdpMLineIndex={candidate_model.sdpMLineIndex}"
             )
             asyncio.ensure_future(
                 self._send_message("relayICECandidate", payload_model.model_dump())
@@ -971,11 +971,7 @@
                     current_section_mid = str(current_media_index)
 
                 elif line.startswith("a=candidate:"):
-                    candidate_sdp = line[2:]  # Remove 'a=' prefix
-
-                    # Ensure candidate has the proper SDP format
-                    if candidate_sdp and not candidate_sdp.startswith("candidate:"):
-                        candidate_sdp = f"candidate:{candidate_sdp}"
+                    candidate_sdp = line[2:]  # Remove 'a=' prefix, keeping "candidate:..."
 
                     # Clean up any extra spaces
                     if candidate_sdp:
@@ -995,7 +991,7 @@
                     )
 
                     logger.debug(
-                        f"_extract_and_send_candidates: Sending ICE candidate for {peer_name} (mid={current_section_mid}, idx={current_media_index}): {candidate_sdp[:60]}..."
+                        f"_extract_and_send_candidates: Sending ICE candidate for {peer_name} (mid={current_section_mid}, idx={current_media_index}): candidate='{candidate_sdp}'"
                    )
                    await self._send_message(
                        "relayICECandidate", payload_candidate.model_dump()
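
[Note, not part of the applied patch] The `_extract_and_send_candidates`
change works because slicing the two-character "a=" prefix off an
"a=candidate:" line already leaves the "candidate:"-prefixed string, so the
removed re-prefixing guard could never fire. A self-contained check with an
illustrative candidate line (example values, not from a real session):

    # An SDP attribute line as it appears in an offer/answer.
    line = "a=candidate:842163049 1 udp 2122260223 192.168.1.10 54321 typ host"

    if line.startswith("a=candidate:"):
        candidate_sdp = line[2:]  # drop "a=", keeping the "candidate:" prefix
        assert candidate_sdp.startswith("candidate:")  # the removed guard was dead code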