Transcription almost working

James Ketr 2025-09-07 23:10:55 -07:00
parent 0691dbf97f
commit df88374999


@@ -112,6 +112,8 @@ def extract_input_features(audio_array: Any, sampling_rate: int) -> Any:
 class AudioProcessor:
     """Handles audio stream processing and transcription with sentence chunking for a specific peer."""
 
+    main_loop: Optional[asyncio.AbstractEventLoop]
+
     def __init__(
         self, peer_name: str, send_chat_func: Callable[[str], Awaitable[None]]
     ):
@@ -133,6 +135,15 @@ class AudioProcessor:
         self.current_phrase_audio: AudioArray = np.array([], dtype=np.float32)
         self.transcription_history: list[TranscriptionHistoryItem] = []
 
+        # Capture the main thread's event loop for background processing
+        try:
+            self.main_loop = asyncio.get_running_loop()
+            logger.debug(f"Captured main event loop for {self.peer_name}")
+        except RuntimeError:
+            # No event loop running, we'll need to create one
+            self.main_loop = None
+            logger.warning(f"No event loop running when initializing AudioProcessor for {self.peer_name}")
+
         # Background processing
         self.processing_queue: Queue[AudioQueueItem] = Queue()
         self.is_running = True
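A note on the capture above: asyncio.get_running_loop() only succeeds inside a running coroutine or callback, so main_loop is populated only when the AudioProcessor is constructed from async code. A minimal self-contained sketch of the capture-and-hand-off pattern, assuming that construction context (class and function names here are illustrative, not from this codebase):

import asyncio
import threading
from typing import Optional

class LoopAwareWorker:
    """Illustrative: capture the constructing thread's running loop so a
    background thread can schedule coroutines back onto it."""

    def __init__(self) -> None:
        try:
            # Succeeds only when constructed from async context
            self.main_loop: Optional[asyncio.AbstractEventLoop] = asyncio.get_running_loop()
        except RuntimeError:
            self.main_loop = None  # constructed from sync code; nothing to capture

    def schedule(self, coro) -> None:
        # Called from a worker thread: hand the coroutine to the main loop
        if self.main_loop and not self.main_loop.is_closed():
            asyncio.run_coroutine_threadsafe(coro, self.main_loop)

async def main() -> None:
    worker = LoopAwareWorker()  # a loop is running here, so capture succeeds

    async def announce() -> None:
        print("ran on the main loop")

    threading.Thread(target=worker.schedule, args=(announce(),)).start()
    await asyncio.sleep(0.1)  # give the scheduled coroutine time to run

asyncio.run(main())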
@@ -153,8 +164,9 @@ class AudioProcessor:
         # Resample if needed (WebRTC might provide different sample rates)
         if len(audio_data) > 0:
+            audio_received_time = time.time()
             self.audio_buffer.append(audio_data)
-            self.last_activity_time = time.time()
+            self.last_activity_time = audio_received_time
 
             # Calculate audio metrics to detect silence
             audio_rms = np.sqrt(np.mean(audio_data**2))
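The silence gate used throughout this diff is an RMS threshold of 0.001 on float32 samples scaled to [-1.0, 1.0]. A small sketch of the metric computation; the synthetic chunk below is illustrative, not from the codebase:

import numpy as np

SILENCE_RMS_THRESHOLD = 0.001  # same gate value used throughout the diff

def audio_metrics(audio_data: np.ndarray) -> tuple[float, float]:
    """Return (rms, peak) for a float32 PCM buffer scaled to [-1.0, 1.0]."""
    rms = float(np.sqrt(np.mean(audio_data ** 2)))
    peak = float(np.max(np.abs(audio_data)))
    return rms, peak

chunk = (0.01 * np.random.randn(480)).astype(np.float32)  # ~10 ms at 48 kHz
rms, peak = audio_metrics(chunk)
print(f"RMS={rms:.4f} Peak={peak:.4f} silent={rms <= SILENCE_RMS_THRESHOLD}")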
@@ -166,7 +178,7 @@ class AudioProcessor:
             # Only log if we have meaningful audio or every 50 frames
             if audio_rms > 0.001 or len(self.audio_buffer) % 50 == 0:
                 logger.info(
-                    f"Added audio chunk: {len(audio_data)} samples, buffer size: {len(self.audio_buffer)} frames ({buffer_duration_ms}ms), RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
+                    f"📥 AUDIO BUFFER ADD at {audio_received_time:.3f}: {len(audio_data)} samples, buffer size: {len(self.audio_buffer)} frames ({buffer_duration_ms}ms), RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f} (peer: {self.peer_name})"
                 )
             else:
                 logger.debug(
@@ -180,8 +192,9 @@ class AudioProcessor:
             combined_rms = np.sqrt(np.mean(combined_audio**2))
             if combined_rms > 0.001:  # Only process if not silence
+                buffer_queue_time = time.time()
                 logger.info(
-                    f"Buffer threshold reached with meaningful audio (RMS: {combined_rms:.4f}), queuing for processing"
+                    f"🚀 BUFFER QUEUING at {buffer_queue_time:.3f}: Buffer threshold reached with meaningful audio (RMS: {combined_rms:.4f}), queuing for processing (peer: {self.peer_name})"
                 )
                 self._queue_for_processing()
             else:
@@ -213,7 +226,7 @@ class AudioProcessor:
             return
 
         logger.info(
-            f"Queuing audio chunk: {len(combined_audio)} samples, {audio_duration_sec:.2f}s duration, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
+            f"📦 AUDIO CHUNK QUEUED at {time.time():.3f}: {len(combined_audio)} samples, {audio_duration_sec:.2f}s duration, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f} (peer: {self.peer_name})"
         )
 
         # Add to processing queue
@@ -221,7 +234,7 @@ class AudioProcessor:
             queue_item = AudioQueueItem(audio=combined_audio, timestamp=time.time())
             self.processing_queue.put_nowait(queue_item)
             logger.info(
-                f"Added to processing queue, queue size: {self.processing_queue.qsize()}"
+                f"📋 PROCESSING QUEUE ADD at {time.time():.3f}: Added to processing queue, queue size: {self.processing_queue.qsize()} (peer: {self.peer_name})"
             )
         except Exception as e:
             # Queue full, skip this chunk
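Aside on the except branch above: Queue.put_nowait raises queue.Full specifically, and since the queue is created as Queue() (unbounded, maxsize=0) that drop path is currently unreachable; a bounded queue would exercise it. A hedged sketch of the drop-on-full pattern, with illustrative names:

from queue import Full, Queue

audio_queue: Queue = Queue(maxsize=8)  # bounded, so a stalled consumer can't grow memory

def enqueue_or_drop(item) -> bool:
    """Illustrative: try to enqueue without blocking the audio callback."""
    try:
        audio_queue.put_nowait(item)
        return True
    except Full:
        # put_nowait raises queue.Full, not a generic Exception,
        # and only when the queue is bounded
        return False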
@@ -238,8 +251,9 @@ class AudioProcessor:
                 # Get audio chunk to process (blocking with timeout)
                 try:
                     audio_data = self.processing_queue.get(timeout=1.0)
-                    logger.debug(
-                        f"Retrieved audio chunk from queue, remaining queue size: {self.processing_queue.qsize()}"
+                    processing_start_time = time.time()
+                    logger.info(
+                        f"🔄 PROCESSING STARTED at {processing_start_time:.3f}: Retrieved audio chunk from queue, remaining queue size: {self.processing_queue.qsize()} (peer: {self.peer_name})"
                     )
                 except Empty:
                     logger.debug("Processing queue timeout, checking for more audio...")
@@ -265,18 +279,16 @@ class AudioProcessor:
                         f"Processing completed phrase: {phrase_duration:.2f}s duration, {len(self.current_phrase_audio)} samples, RMS: {phrase_rms:.4f}"
                     )
-                    try:
-                        loop = asyncio.get_event_loop()
+                    if self.main_loop and not self.main_loop.is_closed():
                         asyncio.run_coroutine_threadsafe(
                             self._transcribe_and_send(
                                 self.current_phrase_audio.copy(), is_final=True
                             ),
-                            loop,
+                            self.main_loop,
                         )
-                    except RuntimeError as e:
-                        # No event loop running, skip this transcription
+                    else:
                         logger.warning(
-                            f"No event loop available for final transcription: {e}"
+                            f"No event loop available for final transcription (peer: {self.peer_name})"
                         )
                         pass
                     self.current_phrase_audio = np.array([], dtype=np.float32)
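This hunk is the core fix of the commit: asyncio.get_event_loop() raises RuntimeError when called from a worker thread with no loop set, so the old try/except effectively skipped every transcription scheduled off-thread; the new code hands the coroutine to the loop captured in __init__. run_coroutine_threadsafe also returns a concurrent.futures.Future, which the diff ignores (fire-and-forget). A hedged sketch of using that future to surface errors in the worker; the helper name is illustrative:

import asyncio
import concurrent.futures

def deliver(loop: asyncio.AbstractEventLoop, coro) -> None:
    """Illustrative helper, runs in a worker thread: schedule coro on the
    main loop and surface any exception it raises."""
    future = asyncio.run_coroutine_threadsafe(coro, loop)
    try:
        # Optional: blocks this worker thread until the coroutine finishes
        future.result(timeout=5.0)
    except concurrent.futures.TimeoutError:
        future.cancel()  # give up rather than wedge the worker
    except Exception as exc:
        # Exceptions raised inside the coroutine propagate through the future
        print(f"scheduled task failed: {exc}")

Blocking on the result trades pipelining for visibility; as written, failures in the scheduled coroutine would only surface in the main loop's exception handler.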
@@ -305,20 +317,17 @@ class AudioProcessor:
                         logger.info(
                             f"Current phrase >= {min_transcription_duration}s (RMS: {phrase_rms:.4f}), attempting streaming transcription"
                         )
-                        try:
-                            loop = asyncio.get_event_loop()
+                        if self.main_loop and not self.main_loop.is_closed():
                             asyncio.run_coroutine_threadsafe(
                                 self._transcribe_and_send(
                                     self.current_phrase_audio.copy(), is_final=False
                                 ),
-                                loop,
+                                self.main_loop,
                             )
-                        except RuntimeError as e:
-                            # No event loop running, skip this transcription
+                        else:
                             logger.warning(
-                                f"No event loop available for streaming transcription: {e}"
+                                f"No event loop available for streaming transcription (peer: {self.peer_name})"
                             )
-                        pass
 
             except Exception as e:
                 logger.error(f"Error in audio processing loop: {e}", exc_info=True)
@@ -355,7 +364,7 @@ class AudioProcessor:
             return
 
         logger.info(
-            f"Starting {transcription_type} transcription: {audio_duration_sec:.2f}s audio, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
+            f"🎬 TRANSCRIPTION STARTED ({transcription_type}) at {time.time():.3f}: {audio_duration_sec:.2f}s audio, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f} (peer: {self.peer_name})"
         )
 
         # Ensure audio is in the right format for Whisper
@@ -389,18 +398,30 @@ class AudioProcessor:
             )
 
             if text and len(text) > 0:  # Accept any non-empty text
-                prefix = (
-                    f"🎤 {self.peer_name}: "
-                    if is_final
-                    else f"🎤 {self.peer_name} [partial]: "
-                )
-                message = f"{prefix}{text}"
-
-                # Avoid sending duplicate messages
-                if is_final or message not in [
-                    h.message for h in self.transcription_history[-3:]
+                # Calculate timing information for the message
+                chat_send_start = time.time()
+                total_transcription_time = chat_send_start - transcription_start_time
+
+                # Create message with timing information included
+                status_marker = "🎤" if is_final else "🎤"
+                type_marker = "" if is_final else " [partial]"
+                timing_info = f" (⏱️ {total_transcription_time:.2f}s from start: {transcription_start_time:.3f})"
+                prefix = f"{status_marker} {self.peer_name}{type_marker}: "
+                message = f"{prefix}{text}{timing_info}"
+
+                # Avoid sending duplicate messages (check text only, not timing)
+                text_only_message = f"{prefix}{text}"
+                if is_final or text_only_message not in [
+                    h.message.split(' (⏱️')[0] for h in self.transcription_history[-3:]
                 ]:
                     await self.send_chat_func(message)
+                    chat_send_time = time.time() - chat_send_start
+                    message_sent_time = time.time()
+                    logger.info(
+                        f"💬 CHAT MESSAGE SENT at {message_sent_time:.3f}: '{text}' (transcription started: {transcription_start_time:.3f}, chat send took: {chat_send_time:.3f}s, peer: {self.peer_name})"
+                    )
 
                     # Keep history for deduplication
                     history_item = TranscriptionHistoryItem(
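Because the chat message now embeds a timing suffix, the dedup check above compares only the text before the " (⏱️" marker against the last three history entries. A minimal standalone sketch of that comparison; names and sample strings are illustrative, not from this codebase:

TIMING_MARKER = " (⏱️"

def strip_timing(message: str) -> str:
    """Drop the timing suffix appended to chat messages, if any."""
    return message.split(TIMING_MARKER)[0]

def is_duplicate(candidate: str, history: list[str], window: int = 3) -> bool:
    return strip_timing(candidate) in (strip_timing(m) for m in history[-window:])

history = ["🎤 alice [partial]: hello there (⏱️ 0.42s from start: 1725.123)"]
print(is_duplicate("🎤 alice [partial]: hello there (⏱️ 0.55s from start: 1725.500)", history))  # True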
@@ -415,6 +436,12 @@ class AudioProcessor:
                     logger.info(
                         f"✅ Transcribed ({transcription_type}) for {self.peer_name}: '{text}' (processing time: {total_transcription_time:.3f}s, audio duration: {audio_duration_sec:.2f}s)"
                     )
+
+                    # Log end-to-end pipeline timing
+                    total_pipeline_time = message_sent_time - transcription_start_time
+                    logger.info(
+                        f"⏱️ PIPELINE TIMING ({transcription_type}): Total={total_pipeline_time:.3f}s (Transcription={total_transcription_time:.3f}s, Chat Send={chat_send_time:.3f}s, peer: {self.peer_name}) | 🕐 Start: {transcription_start_time:.3f}, End: {message_sent_time:.3f}"
+                    )
                 else:
                     logger.debug(
                         f"Skipping duplicate {transcription_type} transcription: '{text}'"