Multi-user transcription
This commit is contained in: parent d6791a5233, commit 9089edaeea
@@ -792,8 +792,44 @@ const MediaAgent = (props: MediaAgentProps) => {
      for (const candidate of pendingCandidates) {
        try {
          if (!candidate.candidate) {
            // End-of-candidates signal
            await pc.addIceCandidate(undefined);
            console.log(`media-agent - sessionDescription:${peer_name} - Queued end-of-candidates added`);
          } else {
            // Coerce and sanitize the incoming candidate before handing to the browser
            let candStr: string | null = candidate.candidate ?? null;
            if (typeof candStr === "string") {
              candStr = candStr.trim();
              // Strip leading 'a=' if present (sometimes sent from SDP parsing)
              if (candStr.startsWith("a=candidate:")) {
                candStr = candStr.replace(/^a=/, "");
              }
              // Ensure the string starts with the expected keyword
              if (!candStr.startsWith("candidate:")) {
                candStr = `candidate:${candStr}`;
              }
            }

            const candidateInit: RTCIceCandidateInit = {
              candidate: candStr ?? "",
              sdpMid: candidate.sdpMid ?? undefined,
              sdpMLineIndex:
                typeof candidate.sdpMLineIndex === "number"
                  ? candidate.sdpMLineIndex
                  : undefined,
            };

            try {
              await pc.addIceCandidate(candidateInit);
              console.log(`media-agent - sessionDescription:${peer_name} - Queued ICE candidate added`);
            } catch (err) {
              console.error(
                `media-agent - sessionDescription:${peer_name} - Failed to add queued ICE candidate:`,
                { candidateInit, err }
              );
            }
          }
        } catch (err) {
          console.error(`media-agent - sessionDescription:${peer_name} - Failed to add queued ICE candidate:`, err);
        }
      }
@@ -899,10 +935,42 @@ const MediaAgent = (props: MediaAgentProps) => {
      }

      // Add the ICE candidate
      if (!candidate.candidate) {
        // End-of-candidates signal
        peer.connection
          .addIceCandidate(undefined)
          .then(() =>
            console.log(`media-agent - iceCandidate::${peer_name} - End-of-candidates added for ${peer.peer_name}`)
          )
          .catch((err) =>
            console.error(`media-agent - iceCandidate::${peer_name} - Failed to add end-of-candidates:`, err)
          );
      } else {
        // Sanitize and coerce incoming candidate
        let candStr: string | null = candidate.candidate ?? null;
        if (typeof candStr === "string") {
          candStr = candStr.trim();
          if (candStr.startsWith("a=candidate:")) {
            candStr = candStr.replace(/^a=/, "");
          }
          if (!candStr.startsWith("candidate:")) {
            candStr = `candidate:${candStr}`;
          }
        }

        const candidateInit: RTCIceCandidateInit = {
          candidate: candStr ?? "",
          sdpMid: candidate.sdpMid ?? undefined,
          sdpMLineIndex: typeof candidate.sdpMLineIndex === "number" ? candidate.sdpMLineIndex : undefined,
        };

        peer.connection
          .addIceCandidate(candidateInit)
          .then(() =>
            console.log(`media-agent - iceCandidate::${peer_name} - ICE candidate added for ${peer.peer_name}`)
          )
          .catch((err) => console.error(`media-agent - iceCandidate::${peer_name} - Failed to add ICE candidate:`, { candidateInit, err }));
      }
    },
    [peers]
  );
@@ -25,7 +25,10 @@ import sys
import os

from voicebot.models import Peer

sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
from shared.models import ChatMessageModel

from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
@@ -33,20 +36,25 @@ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
# Type definitions
AudioArray = npt.NDArray[np.float32]


class AudioQueueItem(BaseModel):
    """Audio data with timestamp for processing queue."""

    audio: AudioArray
    timestamp: float

    class Config:
        arbitrary_types_allowed = True


class TranscriptionHistoryItem(BaseModel):
    """Transcription history item with metadata."""

    message: str
    timestamp: float
    is_final: bool


AGENT_NAME = "whisper"
AGENT_DESCRIPTION = "Real-time speech transcription (Whisper) - converts speech to text"
sample_rate = 16000  # Whisper expects 16kHz
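A note on the Config.arbitrary_types_allowed flag above: pydantic does not validate raw numpy arrays out of the box, so the queue-item model opts out of strict validation for the audio field. A minimal sketch of constructing such an item (import path taken from the debug scripts added later in this commit; the 480-sample frame size mirrors the constants in this diff):

    import time
    import numpy as np
    from bots.whisper import AudioQueueItem

    frame = np.zeros(480, dtype=np.float32)  # one 30 ms frame at 16 kHz
    item = AudioQueueItem(audio=frame, timestamp=time.time())
    # The array is stored as-is; arbitrary_types_allowed lets it through validation.
    assert item.audio.dtype == np.float32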
@@ -55,7 +63,7 @@ model_ids = {
    "Distil-Whisper": [
        "distil-whisper/distil-large-v2",
        "distil-whisper/distil-medium.en",
        "distil-whisper/distil-small.en",
    ],
    "Whisper": [
        "openai/whisper-large-v3",
@@ -69,15 +77,24 @@ model_ids = {
        "openai/whisper-small.en",
        "openai/whisper-base.en",
        "openai/whisper-tiny.en",
    ],
}
# Global whisper model and transcription handler
_model_type = model_ids["Distil-Whisper"]
_model_id = _model_type[0]

logger.info(f"Loading Whisper model: {_model_id}")
_processor: Any = AutoProcessor.from_pretrained(pretrained_model_name_or_path=_model_id)  # type: ignore
logger.info("Whisper processor loaded successfully")

_pt_model: Any = AutoModelForSpeechSeq2Seq.from_pretrained(
    pretrained_model_name_or_path=_model_id
)  # type: ignore
_pt_model.eval()  # type: ignore
logger.info("Whisper model loaded and set to evaluation mode")

_audio_processors: Dict[str, "AudioProcessor"] = {}  # Per-peer audio processors
_send_chat_func: Optional[Callable[[str], Awaitable[None]]] = None


def extract_input_features(audio_array: Any, sampling_rate: int) -> Any:
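For orientation, the module-level objects loaded above are used as a three-step pipeline (feature extraction, generation, decoding). The calls below mirror the ones that appear later in this diff and in voicebot/test_whisper_pipeline.py; the bots.whisper import path is the one used by those debug scripts:

    import numpy as np
    from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate

    audio = np.zeros(sample_rate * 2, dtype=np.float32)  # two seconds of silence at 16 kHz
    features = extract_input_features(audio, sample_rate)
    predicted_ids = _pt_model.generate(features)
    text = _processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
    print(f"transcription: '{text}'")  # usually empty for silence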
@@ -90,214 +107,456 @@ def extract_input_features(audio_array: Any, sampling_rate: int) -> Any:
    input_features: Any = processor_output.input_features  # type: ignore
    return input_features  # type: ignore


class AudioProcessor:
    """Handles audio stream processing and transcription with sentence chunking for a specific peer."""

    def __init__(
        self, peer_name: str, send_chat_func: Callable[[str], Awaitable[None]]
    ):
        self.peer_name = peer_name
        self.send_chat_func = send_chat_func
        self.sample_rate = 16000  # Whisper expects 16kHz
        self.samples_per_frame = 480  # Common WebRTC frame size at 16kHz (30ms)

        # Audio buffering
        self.audio_buffer: Deque[AudioArray] = deque(
            maxlen=1000
        )  # ~30 seconds at 30ms frames
        self.phrase_timeout = (
            3.0  # seconds of silence before considering phrase complete
        )
        self.last_activity_time = time.time()

        # Transcription state
        self.current_phrase_audio: AudioArray = np.array([], dtype=np.float32)
        self.transcription_history: list[TranscriptionHistoryItem] = []

        # Background processing
        self.processing_queue: Queue[AudioQueueItem] = Queue()
        self.is_running = True
        self.processor_thread = threading.Thread(
            target=self._processing_loop, daemon=True
        )
        self.processor_thread.start()

        logger.info(
            f"AudioProcessor initialized for {self.peer_name} - sample_rate: {self.sample_rate}Hz, frame_size: {self.samples_per_frame}, phrase_timeout: {self.phrase_timeout}s"
        )

    def add_audio_data(self, audio_data: AudioArray):
        """Add new audio data to the processing buffer."""
        if not self.is_running:
            logger.debug("AudioProcessor not running, ignoring audio data")
            return

        # Resample if needed (WebRTC might provide different sample rates)
        if len(audio_data) > 0:
            self.audio_buffer.append(audio_data)
            self.last_activity_time = time.time()

            # Calculate audio metrics to detect silence
            audio_rms = np.sqrt(np.mean(audio_data**2))
            audio_peak = np.max(np.abs(audio_data))

            # Log audio buffer status (reduced verbosity)
            buffer_duration_ms = len(self.audio_buffer) * 30  # assuming 30ms frames

            # Only log if we have meaningful audio or every 50 frames
            if audio_rms > 0.001 or len(self.audio_buffer) % 50 == 0:
                logger.info(
                    f"Added audio chunk: {len(audio_data)} samples, buffer size: {len(self.audio_buffer)} frames ({buffer_duration_ms}ms), RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
                )
            else:
                logger.debug(
                    f"Added silent audio chunk: {len(audio_data)} samples, buffer size: {len(self.audio_buffer)} frames"
                )

        # Check if we should process accumulated audio
        if len(self.audio_buffer) >= 10:  # Process every ~300ms (10 * 30ms frames)
            # Check if we have any meaningful audio in the buffer
            combined_audio = np.concatenate(list(self.audio_buffer))
            combined_rms = np.sqrt(np.mean(combined_audio**2))

            if combined_rms > 0.001:  # Only process if not silence
                logger.info(
                    f"Buffer threshold reached with meaningful audio (RMS: {combined_rms:.4f}), queuing for processing"
                )
                self._queue_for_processing()
            else:
                logger.debug(
                    f"Buffer threshold reached but audio is silent (RMS: {combined_rms:.4f}), clearing buffer"
                )
                self.audio_buffer.clear()  # Clear silent audio

    def _queue_for_processing(self):
        """Queue current audio buffer for transcription processing."""
        if not self.audio_buffer:
            logger.debug("No audio in buffer to queue for processing")
            return

        # Combine recent audio frames
        combined_audio = np.concatenate(list(self.audio_buffer))
        self.audio_buffer.clear()

        # Calculate audio metrics
        audio_duration_sec = len(combined_audio) / self.sample_rate
        audio_rms = np.sqrt(np.mean(combined_audio**2))
        audio_peak = np.max(np.abs(combined_audio))

        # Skip completely silent audio
        if audio_rms < 0.001 and audio_peak < 0.001:
            logger.debug(
                f"Skipping silent audio chunk: RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
            )
            return

        logger.info(
            f"Queuing audio chunk: {len(combined_audio)} samples, {audio_duration_sec:.2f}s duration, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
        )

        # Add to processing queue
        try:
            queue_item = AudioQueueItem(audio=combined_audio, timestamp=time.time())
            self.processing_queue.put_nowait(queue_item)
            logger.info(
                f"Added to processing queue, queue size: {self.processing_queue.qsize()}"
            )
        except Exception as e:
            # Queue full, skip this chunk
            logger.warning(f"Audio processing queue full, dropping audio chunk: {e}")

    def _processing_loop(self):
        """Background thread that processes audio chunks for transcription."""
        global _whisper_model

        logger.info("ASR processing loop started")

        while self.is_running:
            try:
                # Get audio chunk to process (blocking with timeout)
                try:
                    audio_data = self.processing_queue.get(timeout=1.0)
                    logger.debug(
                        f"Retrieved audio chunk from queue, remaining queue size: {self.processing_queue.qsize()}"
                    )
                except Empty:
                    logger.debug("Processing queue timeout, checking for more audio...")
                    continue

                audio_array = audio_data.audio
                chunk_timestamp = audio_data.timestamp

                # Check if this is a new phrase (gap in audio)
                time_since_last = chunk_timestamp - self.last_activity_time
                phrase_complete = time_since_last > self.phrase_timeout

                logger.debug(
                    f"Processing audio chunk: {len(audio_array)} samples, time since last: {time_since_last:.2f}s, phrase_complete: {phrase_complete}"
                )

                if phrase_complete and len(self.current_phrase_audio) > 0:
                    # Process the completed phrase
                    phrase_duration = len(self.current_phrase_audio) / self.sample_rate
                    phrase_rms = np.sqrt(np.mean(self.current_phrase_audio**2))

                    logger.info(
                        f"Processing completed phrase: {phrase_duration:.2f}s duration, {len(self.current_phrase_audio)} samples, RMS: {phrase_rms:.4f}"
                    )

                    try:
                        loop = asyncio.get_event_loop()
                        asyncio.run_coroutine_threadsafe(
                            self._transcribe_and_send(
                                self.current_phrase_audio.copy(), is_final=True
                            ),
                            loop,
                        )
                    except RuntimeError as e:
                        # No event loop running, skip this transcription
                        logger.warning(
                            f"No event loop available for final transcription: {e}"
                        )
                        pass
                    self.current_phrase_audio = np.array([], dtype=np.float32)

                # Add new audio to current phrase
                old_phrase_length = len(self.current_phrase_audio)
                self.current_phrase_audio = np.concatenate(
                    [self.current_phrase_audio, audio_array]
                )
                current_phrase_duration = (
                    len(self.current_phrase_audio) / self.sample_rate
                )

                logger.debug(
                    f"Updated current phrase: {old_phrase_length} -> {len(self.current_phrase_audio)} samples ({current_phrase_duration:.2f}s)"
                )

                # Lower the threshold for streaming transcription to catch shorter phrases
                min_transcription_duration = 1.0  # Reduced from 2.0 seconds

                if (
                    len(self.current_phrase_audio)
                    > self.sample_rate * min_transcription_duration
                ):  # At least 1 second
                    phrase_rms = np.sqrt(np.mean(self.current_phrase_audio**2))
                    logger.info(
                        f"Current phrase >= {min_transcription_duration}s (RMS: {phrase_rms:.4f}), attempting streaming transcription"
                    )
                    try:
                        loop = asyncio.get_event_loop()
                        asyncio.run_coroutine_threadsafe(
                            self._transcribe_and_send(
                                self.current_phrase_audio.copy(), is_final=False
                            ),
                            loop,
                        )
                    except RuntimeError as e:
                        # No event loop running, skip this transcription
                        logger.warning(
                            f"No event loop available for streaming transcription: {e}"
                        )
                        pass

            except Exception as e:
                logger.error(f"Error in audio processing loop: {e}", exc_info=True)

        logger.info("ASR processing loop ended")

    async def _transcribe_and_send(self, audio_array: AudioArray, is_final: bool):
        """Transcribe audio and send result as chat message."""
        global sample_rate

        transcription_start_time = time.time()
        transcription_type = "final" if is_final else "streaming"

        try:
            audio_duration_sec = len(audio_array) / self.sample_rate

            # Reduce minimum audio duration threshold
            min_duration = 0.3  # Reduced from 0.5 seconds
            if len(audio_array) < self.sample_rate * min_duration:
                logger.debug(
                    f"Skipping {transcription_type} transcription: audio too short ({audio_duration_sec:.2f}s < {min_duration}s)"
                )
                return

            # Calculate audio quality metrics
            audio_rms = np.sqrt(np.mean(audio_array**2))
            audio_peak = np.max(np.abs(audio_array))

            # More lenient silence detection
            if audio_rms < 0.0005:  # Very quiet threshold
                logger.debug(
                    f"Skipping {transcription_type} transcription: audio too quiet (RMS: {audio_rms:.6f})"
                )
                return

            logger.info(
                f"Starting {transcription_type} transcription: {audio_duration_sec:.2f}s audio, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
            )

            # Ensure audio is in the right format for Whisper
            audio_array = audio_array.astype(np.float32)

            # Transcribe with Whisper
            feature_extraction_start = time.time()
            input_features = extract_input_features(audio_array, sample_rate)
            feature_extraction_time = time.time() - feature_extraction_start

            model_inference_start = time.time()
            predicted_ids = _pt_model.generate(input_features)  # type: ignore
            model_inference_time = time.time() - model_inference_start

            decoding_start = time.time()
            transcription = _processor.batch_decode(
                predicted_ids, skip_special_tokens=True
            )  # type: ignore
            decoding_time = time.time() - decoding_start

            total_transcription_time = time.time() - transcription_start_time

            logger.debug(
                f"ASR timing - Feature extraction: {feature_extraction_time:.3f}s, Model inference: {model_inference_time:.3f}s, Decoding: {decoding_time:.3f}s, Total: {total_transcription_time:.3f}s"
            )

            text = (
                transcription[0].strip()
                if transcription and len(transcription) > 0
                else ""
            )

            if text and len(text) > 0:  # Accept any non-empty text
                prefix = (
                    f"🎤 {self.peer_name}: "
                    if is_final
                    else f"🎤 {self.peer_name} [partial]: "
                )
                message = f"{prefix}{text}"

                # Avoid sending duplicate messages
                if is_final or message not in [
                    h.message for h in self.transcription_history[-3:]
                ]:
                    await self.send_chat_func(message)

                    # Keep history for deduplication
                    history_item = TranscriptionHistoryItem(
                        message=message, timestamp=time.time(), is_final=is_final
                    )
                    self.transcription_history.append(history_item)

                    # Limit history size
                    if len(self.transcription_history) > 10:
                        self.transcription_history.pop(0)

                    logger.info(
                        f"✅ Transcribed ({transcription_type}) for {self.peer_name}: '{text}' (processing time: {total_transcription_time:.3f}s, audio duration: {audio_duration_sec:.2f}s)"
                    )
                else:
                    logger.debug(
                        f"Skipping duplicate {transcription_type} transcription: '{text}'"
                    )
            else:
                logger.info(
                    f"❌ No text from {transcription_type} transcription for {self.peer_name} (empty result from model)"
                )

        except Exception as e:
            logger.error(
                f"Error in {transcription_type} transcription: {e}", exc_info=True
            )

    def shutdown(self):
        """Shutdown the audio processor."""
        logger.info(f"Shutting down AudioProcessor for {self.peer_name}...")
        self.is_running = False
        if self.processor_thread.is_alive():
            logger.debug(
                f"Waiting for processor thread for {self.peer_name} to finish..."
            )
            self.processor_thread.join(timeout=2.0)
            if self.processor_thread.is_alive():
                logger.warning(
                    f"Processor thread for {self.peer_name} did not shut down cleanly within timeout"
                )
            else:
                logger.info(
                    f"Processor thread for {self.peer_name} shut down successfully"
                )
        logger.info(f"AudioProcessor for {self.peer_name} shutdown complete")


async def handle_track_received(peer: Peer, track: MediaStreamTrack):
    """Handle incoming audio tracks from WebRTC peers."""
    global _audio_processors, _send_chat_func

    if track.kind != "audio":
        logger.info(f"Ignoring non-audio track from {peer.peer_name}: {track.kind}")
        return

    # Create or get audio processor for this peer
    if peer.peer_name not in _audio_processors:
        if _send_chat_func is None:
            logger.error(
                f"Cannot create AudioProcessor for {peer.peer_name}: no send_chat_func available"
            )
            return

        logger.info(f"Creating new AudioProcessor for {peer.peer_name}")
        _audio_processors[peer.peer_name] = AudioProcessor(
            peer_name=peer.peer_name, send_chat_func=_send_chat_func
        )

    audio_processor = _audio_processors[peer.peer_name]

    logger.info(
        f"Received audio track from {peer.peer_name}, starting transcription (processor available: {audio_processor is not None})"
    )

    try:
        while True:
            # Receive audio frame
            frame = await track.recv()
            if isinstance(frame, AudioFrame):
                frame_info = (
                    f"{frame.sample_rate}Hz, {frame.format.name}, {frame.layout.name}"
                )
                logger.debug(
                    f"Received audio frame from {peer.peer_name}: {frame_info}"
                )

                # Convert AudioFrame to numpy array
                audio_data = frame.to_ndarray()
                original_shape = audio_data.shape
                original_dtype = audio_data.dtype

                logger.debug(
                    f"Audio frame data: shape={original_shape}, dtype={original_dtype}"
                )

                # Handle different audio formats
                if audio_data.ndim == 2:  # Stereo -> mono
                    audio_data = np.mean(audio_data, axis=1)
                    logger.debug(
                        f"Converted stereo to mono: {original_shape} -> {audio_data.shape}"
                    )

                # Convert to float32 and normalize
                if audio_data.dtype == np.int16:
                    audio_data = audio_data.astype(np.float32) / 32768.0
                    logger.debug("Normalized int16 audio to float32")
                elif audio_data.dtype == np.int32:
                    audio_data = audio_data.astype(np.float32) / 2147483648.0
                    logger.debug("Normalized int32 audio to float32")

                # Resample to 16kHz if needed
                if frame.sample_rate != sample_rate:
                    original_length = len(audio_data)
                    audio_data = librosa.resample(  # type: ignore
                        audio_data, orig_sr=frame.sample_rate, target_sr=sample_rate
                    )
                    logger.debug(
                        f"Resampled audio: {frame.sample_rate}Hz -> {sample_rate}Hz, {original_length} -> {len(audio_data)} samples"
                    )

                # Ensure audio_data is AudioArray (float32)
                audio_data_float32 = cast(AudioArray, audio_data.astype(np.float32))

                # Calculate audio quality metrics for this frame
                frame_rms = np.sqrt(np.mean(audio_data_float32**2))
                frame_peak = np.max(np.abs(audio_data_float32))

                # Only log full frame details every 20 frames to reduce noise
                frame_count = getattr(peer, "_whisper_frame_count", 0) + 1
                setattr(peer, "_whisper_frame_count", frame_count)

                if frame_count % 20 == 0:
                    logger.info(
                        f"Audio frame #{frame_count} from {peer.peer_name}: {frame_info}, {len(audio_data_float32)} samples, RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
                    )
                else:
                    logger.debug(
                        f"Audio frame #{frame_count}: RMS: {frame_rms:.4f}, Peak: {frame_peak:.4f}"
                    )

                # Send to audio processor
                if audio_processor:
                    audio_processor.add_audio_data(audio_data_float32)
                else:
                    logger.warning(
                        f"No audio processor available to handle audio data for {peer.peer_name}"
                    )
            else:
                logger.warning(
                    f"Received non-audio frame on audio track from {peer.peer_name}: type={type(frame)}"
                )

    except Exception as e:
        logger.error(
            f"Error processing audio track from {peer.peer_name}: {e}", exc_info=True
        )


def agent_info() -> Dict[str, str]:
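The buffering and transcription paths above all gate on the same RMS/peak heuristic (roughly 0.001 RMS for "meaningful" audio when buffering, 0.0005 at transcription time). A standalone sketch of that gate, with thresholds copied from the diff; the helper name is illustrative, since the diff inlines this check rather than factoring it out:

    import numpy as np

    def is_meaningful_audio(chunk: np.ndarray, rms_threshold: float = 0.001) -> bool:
        # Return True when a float32 audio chunk is loud enough to be worth transcribing.
        if chunk.size == 0:
            return False
        rms = float(np.sqrt(np.mean(chunk**2)))
        peak = float(np.max(np.abs(chunk)))
        return rms >= rms_threshold or peak >= rms_threshold

    # Example: a quiet noise floor is dropped, a modest 440 Hz tone passes.
    noise = np.random.normal(0.0, 0.0001, 4800).astype(np.float32)
    tone = (0.05 * np.sin(2 * np.pi * 440 * np.linspace(0, 0.3, 4800))).astype(np.float32)
    print(is_meaningful_audio(noise), is_meaningful_audio(tone))  # typically False, True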
@@ -309,7 +568,9 @@ def create_agent_tracks(session_name: str) -> dict[str, MediaStreamTrack]:
    return {}


async def handle_chat_message(
    chat_message: ChatMessageModel, send_message_func: Callable[[str], Awaitable[None]]
) -> Optional[str]:
    """Handle incoming chat messages and optionally return a response."""
    pass
@@ -318,16 +579,41 @@ async def on_track_received(peer: Peer, track: MediaStreamTrack):
    """Callback when a new track is received from a peer."""
    await handle_track_received(peer, track)


# Export functions for the orchestrator to discover
def get_track_handler():
    """Return the track handler function for the orchestrator to use."""
    return on_track_received


def bind_send_chat_function(send_chat_func: Callable[[str], Awaitable[None]]):
    """Bind the send chat function to be used for all audio processors."""
    global _send_chat_func, _audio_processors
    logger.info("Binding send chat function to whisper agent")
    _send_chat_func = send_chat_func

    # Update existing audio processors
    for peer_name, processor in _audio_processors.items():
        logger.debug(
            f"Updating AudioProcessor for {peer_name} with new send chat function"
        )
        processor.send_chat_func = send_chat_func


def cleanup_peer_processor(peer_name: str):
    """Clean up audio processor for a disconnected peer."""
    global _audio_processors

    if peer_name in _audio_processors:
        logger.info(f"Cleaning up AudioProcessor for disconnected peer: {peer_name}")
        processor = _audio_processors[peer_name]
        processor.shutdown()
        del _audio_processors[peer_name]
        logger.info(f"AudioProcessor for {peer_name} cleaned up successfully")
    else:
        logger.debug(f"No AudioProcessor found for peer {peer_name} during cleanup")


def get_active_processors() -> Dict[str, "AudioProcessor"]:
    """Get currently active audio processors (for debugging)."""
    return _audio_processors.copy()
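cleanup_peer_processor is meant to be invoked when a peer leaves so the per-peer processing thread is joined. A minimal sketch of that wiring on the orchestrator side; the hook name on_peer_disconnected is illustrative, since the orchestrator is not part of this diff:

    from bots.whisper import cleanup_peer_processor, get_active_processors

    async def on_peer_disconnected(peer_name: str) -> None:
        # Hypothetical disconnect hook: tear down the per-peer AudioProcessor.
        cleanup_peer_processor(peer_name)  # joins the processing thread (2 s timeout)
        print(f"Active whisper processors: {sorted(get_active_processors())}")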
voicebot/force_transcription.py (new file, 148 lines)
@@ -0,0 +1,148 @@
#!/usr/bin/env python3
"""
Force transcription debug - processes any accumulated audio immediately.
Run this to force the whisper agent to attempt transcription of current audio buffer.
"""

import sys
import os
import asyncio
import numpy as np

# Add the voicebot directory to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))


def force_transcription():
    """Force transcription of any accumulated audio."""
    try:
        from bots.whisper import _audio_processors

        if not _audio_processors:
            print(
                "❌ No audio processors found. Whisper agent may not be running or no peers connected."
            )
            return

        print(f"🔍 Found {len(_audio_processors)} active audio processors:")

        for peer_name, audio_processor in _audio_processors.items():
            print(f"\n👤 {peer_name}:")
            print(f" - Running: {audio_processor.is_running}")
            print(f" - Buffer size: {len(audio_processor.audio_buffer)} frames")
            print(f" - Queue size: {audio_processor.processing_queue.qsize()}")
            print(
                f" - Current phrase length: {len(audio_processor.current_phrase_audio)} samples"
            )

            # Force processing of current buffer
            if len(audio_processor.audio_buffer) > 0:
                print(
                    f"🔄 Forcing processing of {len(audio_processor.audio_buffer)} buffered frames for {peer_name}..."
                )
                audio_processor._queue_for_processing()
            else:
                print(f"📭 No audio in buffer to process for {peer_name}")

            # If we have a current phrase, try to transcribe it
            if len(audio_processor.current_phrase_audio) > 0:
                phrase_duration = (
                    len(audio_processor.current_phrase_audio)
                    / audio_processor.sample_rate
                )
                phrase_rms = np.sqrt(np.mean(audio_processor.current_phrase_audio**2))
                print(
                    f"🎤 Current phrase for {peer_name}: {phrase_duration:.2f}s, RMS: {phrase_rms:.6f}"
                )

                if phrase_duration > 0.3:  # Minimum duration
                    print(
                        f"🚀 Forcing transcription of current phrase for {peer_name}..."
                    )

                    # Create an event loop if none exists
                    try:
                        loop = asyncio.get_event_loop()
                    except RuntimeError:
                        loop = asyncio.new_event_loop()
                        asyncio.set_event_loop(loop)

                    # Force transcription
                    async def force_transcribe():
                        await audio_processor._transcribe_and_send(
                            audio_processor.current_phrase_audio.copy(), is_final=True
                        )

                    loop.run_until_complete(force_transcribe())
                    print(f"✅ Forced transcription completed for {peer_name}")
                else:
                    print(
                        f"⏱️ Current phrase too short for {peer_name} ({phrase_duration:.2f}s < 0.3s)"
                    )
            else:
                print(f"🤐 No current phrase to transcribe for {peer_name}")

    except ImportError:
        print(
            "❌ Could not import whisper components. Make sure the whisper agent is loaded."
        )
    except Exception as e:
        print(f"❌ Error: {e}")


def show_audio_stats():
    """Show detailed audio statistics."""
    try:
        from bots.whisper import _audio_processors

        if not _audio_processors:
            print("❌ No audio processors found")
            return

        print(
            f"\n📊 Detailed Audio Statistics for {len(_audio_processors)} processors:"
        )

        for peer_name, audio_processor in _audio_processors.items():
            print(f"\n👤 {peer_name}:")
            print(f"Sample rate: {audio_processor.sample_rate}Hz")
            print(f"Samples per frame: {audio_processor.samples_per_frame}")
            print(f"Phrase timeout: {audio_processor.phrase_timeout}s")
            print(f"Buffer max length: {audio_processor.audio_buffer.maxlen}")
            print(f"Current buffer size: {len(audio_processor.audio_buffer)}")
            print(f"Processing queue size: {audio_processor.processing_queue.qsize()}")

            if len(audio_processor.current_phrase_audio) > 0:
                phrase_duration = (
                    len(audio_processor.current_phrase_audio)
                    / audio_processor.sample_rate
                )
                phrase_rms = np.sqrt(np.mean(audio_processor.current_phrase_audio**2))
                phrase_peak = np.max(np.abs(audio_processor.current_phrase_audio))
                print(" Current phrase:")
                print(f" Duration: {phrase_duration:.2f}s")
                print(f" Samples: {len(audio_processor.current_phrase_audio)}")
                print(f" RMS: {phrase_rms:.6f}")
                print(f" Peak: {phrase_peak:.6f}")

            if len(audio_processor.audio_buffer) > 0:
                combined = np.concatenate(list(audio_processor.audio_buffer))
                buffer_duration = len(combined) / audio_processor.sample_rate
                buffer_rms = np.sqrt(np.mean(combined**2))
                buffer_peak = np.max(np.abs(combined))
                print(" Buffer contents:")
                print(f" Duration: {buffer_duration:.2f}s")
                print(f" Samples: {len(combined)}")
                print(f" RMS: {buffer_rms:.6f}")
                print(f" Peak: {buffer_peak:.6f}")

    except Exception as e:
        print(f"❌ Error getting stats: {e}")


if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == "stats":
        show_audio_stats()
    else:
        force_transcription()
        show_audio_stats()
voicebot/set_whisper_debug.py (new file, 53 lines)
@@ -0,0 +1,53 @@
#!/usr/bin/env python3
"""
Helper script to adjust whisper ASR logging levels for debugging.
Run this to see more detailed ASR logging.
"""

import logging
import sys
import os

# Add the voicebot directory to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from logger import logger


def set_debug_logging():
    """Set logger to DEBUG level for detailed ASR logging."""
    logger.setLevel(logging.DEBUG)

    # Also set the root logger
    logging.getLogger().setLevel(logging.DEBUG)

    # Create a more detailed formatter if needed
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s"
    )

    # Update all handlers
    for handler in logger.handlers:
        handler.setLevel(logging.DEBUG)
        handler.setFormatter(formatter)

    logger.info("Debug logging enabled for Whisper ASR")


def set_info_logging():
    """Set logger back to INFO level."""
    logger.setLevel(logging.INFO)
    logging.getLogger().setLevel(logging.INFO)

    # Update all handlers
    for handler in logger.handlers:
        handler.setLevel(logging.INFO)

    logger.info("Info logging enabled for Whisper ASR")


if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == "info":
        set_info_logging()
    else:
        set_debug_logging()
voicebot/test_whisper_pipeline.py (new file, 110 lines)
@@ -0,0 +1,110 @@
#!/usr/bin/env python3
"""
Debug script to test Whisper transcription with synthetic audio.
This helps identify if the issue is with audio processing or the transcription pipeline.
"""

import numpy as np
import time
import sys
import os

# Add the voicebot directory to the path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

try:
    from bots.whisper import extract_input_features, _pt_model, _processor, sample_rate
except ImportError as e:
    print(f"Error importing whisper components: {e}")
    print("Make sure you're running this from the voicebot directory")
    sys.exit(1)


def generate_test_audio(
    duration_seconds: float = 2.0, frequency: float = 440.0
) -> np.ndarray:
    """Generate a synthetic sine wave for testing."""
    samples = int(duration_seconds * sample_rate)
    t = np.linspace(0, duration_seconds, samples, False)
    # Generate a sine wave with some amplitude modulation to simulate speech-like patterns
    amplitude = 0.1 * (
        1 + 0.5 * np.sin(2 * np.pi * 2 * t)
    )  # Amplitude modulation at 2Hz
    audio = amplitude * np.sin(2 * np.pi * frequency * t)
    return audio.astype(np.float32)


def test_transcription_pipeline():
    """Test the Whisper transcription pipeline with synthetic audio."""
    print("Testing Whisper transcription pipeline...")

    # Test 1: Complete silence
    print("\n=== Test 1: Complete Silence ===")
    silent_audio = np.zeros(int(sample_rate * 2.0), dtype=np.float32)
    test_audio_transcription(silent_audio, "Silent audio")

    # Test 2: Very quiet noise
    print("\n=== Test 2: Very Quiet Noise ===")
    quiet_noise = np.random.normal(0, 0.001, int(sample_rate * 2.0)).astype(np.float32)
    test_audio_transcription(quiet_noise, "Quiet noise")

    # Test 3: Sine wave (should produce some output)
    print("\n=== Test 3: Sine Wave ===")
    sine_audio = generate_test_audio(2.0, 440.0)
    test_audio_transcription(sine_audio, "Sine wave")

    # Test 4: Multiple frequency sine wave
    print("\n=== Test 4: Complex Sine Wave ===")
    complex_audio = (
        generate_test_audio(2.0, 220.0)
        + generate_test_audio(2.0, 440.0)
        + generate_test_audio(2.0, 880.0)
    ) / 3.0
    test_audio_transcription(complex_audio, "Complex sine wave")


def test_audio_transcription(audio_array: np.ndarray, description: str):
    """Test transcription of a specific audio array."""
    try:
        # Calculate metrics
        duration = len(audio_array) / sample_rate
        rms = np.sqrt(np.mean(audio_array**2))
        peak = np.max(np.abs(audio_array))

        print(f"Testing {description}:")
        print(f" Duration: {duration:.2f}s")
        print(f" Samples: {len(audio_array)}")
        print(f" RMS: {rms:.6f}")
        print(f" Peak: {peak:.6f}")

        # Test feature extraction
        start_time = time.time()
        input_features = extract_input_features(audio_array, sample_rate)
        feature_time = time.time() - start_time
        print(f" Feature extraction: {feature_time:.3f}s")

        # Test model inference
        start_time = time.time()
        predicted_ids = _pt_model.generate(input_features)
        inference_time = time.time() - start_time
        print(f" Model inference: {inference_time:.3f}s")

        # Test decoding
        start_time = time.time()
        transcription = _processor.batch_decode(predicted_ids, skip_special_tokens=True)
        decoding_time = time.time() - start_time
        print(f" Decoding: {decoding_time:.3f}s")

        # Show result
        text = (
            transcription[0].strip() if transcription and len(transcription) > 0 else ""
        )
        print(f" Result: '{text}'" if text else " Result: (empty)")
        print(f" Result length: {len(text)}")

    except Exception as e:
        print(f" ERROR: {e}")


if __name__ == "__main__":
    test_transcription_pipeline()
@@ -778,6 +778,14 @@ class WebRTCSignalingClient:
                f"ICE candidate outgoing for {peer_name}: type={cand_type} protocol={protocol} sdp={raw}"
            )

            # Ensure candidate has the proper SDP format
            if raw and not raw.startswith("candidate:"):
                raw = f"candidate:{raw}"

            # Clean up any extra spaces
            if raw:
                raw = raw.replace("candidate: ", "candidate:")

            candidate_model = ICECandidateDictModel(
                candidate=raw,
                sdpMid=getattr(candidate, "sdpMid", None),
@@ -965,6 +973,14 @@ class WebRTCSignalingClient:
                elif line.startswith("a=candidate:"):
                    candidate_sdp = line[2:]  # Remove 'a=' prefix

                    # Ensure candidate has the proper SDP format
                    if candidate_sdp and not candidate_sdp.startswith("candidate:"):
                        candidate_sdp = f"candidate:{candidate_sdp}"

                    # Clean up any extra spaces
                    if candidate_sdp:
                        candidate_sdp = candidate_sdp.replace("candidate: ", "candidate:")

                    # Only send if we have valid MID and media index
                    if current_section_mid is not None and current_media_index >= 0:
                        candidate_model = ICECandidateDictModel(
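Both signaling-side hunks above (and the TypeScript hunks earlier in this commit) apply the same normalization to candidate strings before sending or parsing them. A small standalone version of that logic; the helper name is illustrative, since the diff inlines these checks:

    def normalize_candidate_sdp(raw: str | None) -> str | None:
        """Mirror the candidate normalization applied in the hunks above (illustrative)."""
        if not raw:
            return raw
        cand = raw.strip()
        if cand.startswith("a=candidate:"):
            cand = cand[len("a="):]  # strip the SDP attribute prefix
        if not cand.startswith("candidate:"):
            cand = f"candidate:{cand}"  # ensure the expected keyword
        return cand.replace("candidate: ", "candidate:")  # drop a stray space after the colon

    # Example:
    print(normalize_candidate_sdp("a=candidate:1 1 udp 2122260223 192.0.2.1 54400 typ host"))
    # -> candidate:1 1 udp 2122260223 192.0.2.1 54400 typ host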