Transcription almost working
This commit is contained in:
parent
0691dbf97f
commit
df88374999
@ -112,6 +112,8 @@ def extract_input_features(audio_array: Any, sampling_rate: int) -> Any:
|
|||||||
class AudioProcessor:
|
class AudioProcessor:
|
||||||
"""Handles audio stream processing and transcription with sentence chunking for a specific peer."""
|
"""Handles audio stream processing and transcription with sentence chunking for a specific peer."""
|
||||||
|
|
||||||
|
main_loop: Optional[asyncio.AbstractEventLoop]
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, peer_name: str, send_chat_func: Callable[[str], Awaitable[None]]
|
self, peer_name: str, send_chat_func: Callable[[str], Awaitable[None]]
|
||||||
):
|
):
|
||||||
@ -133,6 +135,15 @@ class AudioProcessor:
|
|||||||
self.current_phrase_audio: AudioArray = np.array([], dtype=np.float32)
|
self.current_phrase_audio: AudioArray = np.array([], dtype=np.float32)
|
||||||
self.transcription_history: list[TranscriptionHistoryItem] = []
|
self.transcription_history: list[TranscriptionHistoryItem] = []
|
||||||
|
|
||||||
|
# Capture the main thread's event loop for background processing
|
||||||
|
try:
|
||||||
|
self.main_loop = asyncio.get_running_loop()
|
||||||
|
logger.debug(f"Captured main event loop for {self.peer_name}")
|
||||||
|
except RuntimeError:
|
||||||
|
# No event loop running, we'll need to create one
|
||||||
|
self.main_loop = None
|
||||||
|
logger.warning(f"No event loop running when initializing AudioProcessor for {self.peer_name}")
|
||||||
|
|
||||||
# Background processing
|
# Background processing
|
||||||
self.processing_queue: Queue[AudioQueueItem] = Queue()
|
self.processing_queue: Queue[AudioQueueItem] = Queue()
|
||||||
self.is_running = True
|
self.is_running = True
|
||||||
@ -153,8 +164,9 @@ class AudioProcessor:
|
|||||||
|
|
||||||
# Resample if needed (WebRTC might provide different sample rates)
|
# Resample if needed (WebRTC might provide different sample rates)
|
||||||
if len(audio_data) > 0:
|
if len(audio_data) > 0:
|
||||||
|
audio_received_time = time.time()
|
||||||
self.audio_buffer.append(audio_data)
|
self.audio_buffer.append(audio_data)
|
||||||
self.last_activity_time = time.time()
|
self.last_activity_time = audio_received_time
|
||||||
|
|
||||||
# Calculate audio metrics to detect silence
|
# Calculate audio metrics to detect silence
|
||||||
audio_rms = np.sqrt(np.mean(audio_data**2))
|
audio_rms = np.sqrt(np.mean(audio_data**2))
|
||||||
@ -166,7 +178,7 @@ class AudioProcessor:
|
|||||||
# Only log if we have meaningful audio or every 50 frames
|
# Only log if we have meaningful audio or every 50 frames
|
||||||
if audio_rms > 0.001 or len(self.audio_buffer) % 50 == 0:
|
if audio_rms > 0.001 or len(self.audio_buffer) % 50 == 0:
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Added audio chunk: {len(audio_data)} samples, buffer size: {len(self.audio_buffer)} frames ({buffer_duration_ms}ms), RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
|
f"📥 AUDIO BUFFER ADD at {audio_received_time:.3f}: {len(audio_data)} samples, buffer size: {len(self.audio_buffer)} frames ({buffer_duration_ms}ms), RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f} (peer: {self.peer_name})"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
@ -180,8 +192,9 @@ class AudioProcessor:
|
|||||||
combined_rms = np.sqrt(np.mean(combined_audio**2))
|
combined_rms = np.sqrt(np.mean(combined_audio**2))
|
||||||
|
|
||||||
if combined_rms > 0.001: # Only process if not silence
|
if combined_rms > 0.001: # Only process if not silence
|
||||||
|
buffer_queue_time = time.time()
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Buffer threshold reached with meaningful audio (RMS: {combined_rms:.4f}), queuing for processing"
|
f"🚀 BUFFER QUEUING at {buffer_queue_time:.3f}: Buffer threshold reached with meaningful audio (RMS: {combined_rms:.4f}), queuing for processing (peer: {self.peer_name})"
|
||||||
)
|
)
|
||||||
self._queue_for_processing()
|
self._queue_for_processing()
|
||||||
else:
|
else:
|
||||||
@ -213,7 +226,7 @@ class AudioProcessor:
|
|||||||
return
|
return
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Queuing audio chunk: {len(combined_audio)} samples, {audio_duration_sec:.2f}s duration, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
|
f"📦 AUDIO CHUNK QUEUED at {time.time():.3f}: {len(combined_audio)} samples, {audio_duration_sec:.2f}s duration, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f} (peer: {self.peer_name})"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Add to processing queue
|
# Add to processing queue
|
||||||
@ -221,7 +234,7 @@ class AudioProcessor:
|
|||||||
queue_item = AudioQueueItem(audio=combined_audio, timestamp=time.time())
|
queue_item = AudioQueueItem(audio=combined_audio, timestamp=time.time())
|
||||||
self.processing_queue.put_nowait(queue_item)
|
self.processing_queue.put_nowait(queue_item)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Added to processing queue, queue size: {self.processing_queue.qsize()}"
|
f"📋 PROCESSING QUEUE ADD at {time.time():.3f}: Added to processing queue, queue size: {self.processing_queue.qsize()} (peer: {self.peer_name})"
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Queue full, skip this chunk
|
# Queue full, skip this chunk
|
||||||
@ -238,8 +251,9 @@ class AudioProcessor:
|
|||||||
# Get audio chunk to process (blocking with timeout)
|
# Get audio chunk to process (blocking with timeout)
|
||||||
try:
|
try:
|
||||||
audio_data = self.processing_queue.get(timeout=1.0)
|
audio_data = self.processing_queue.get(timeout=1.0)
|
||||||
logger.debug(
|
processing_start_time = time.time()
|
||||||
f"Retrieved audio chunk from queue, remaining queue size: {self.processing_queue.qsize()}"
|
logger.info(
|
||||||
|
f"🔄 PROCESSING STARTED at {processing_start_time:.3f}: Retrieved audio chunk from queue, remaining queue size: {self.processing_queue.qsize()} (peer: {self.peer_name})"
|
||||||
)
|
)
|
||||||
except Empty:
|
except Empty:
|
||||||
logger.debug("Processing queue timeout, checking for more audio...")
|
logger.debug("Processing queue timeout, checking for more audio...")
|
||||||
@ -265,18 +279,16 @@ class AudioProcessor:
|
|||||||
f"Processing completed phrase: {phrase_duration:.2f}s duration, {len(self.current_phrase_audio)} samples, RMS: {phrase_rms:.4f}"
|
f"Processing completed phrase: {phrase_duration:.2f}s duration, {len(self.current_phrase_audio)} samples, RMS: {phrase_rms:.4f}"
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
if self.main_loop and not self.main_loop.is_closed():
|
||||||
loop = asyncio.get_event_loop()
|
|
||||||
asyncio.run_coroutine_threadsafe(
|
asyncio.run_coroutine_threadsafe(
|
||||||
self._transcribe_and_send(
|
self._transcribe_and_send(
|
||||||
self.current_phrase_audio.copy(), is_final=True
|
self.current_phrase_audio.copy(), is_final=True
|
||||||
),
|
),
|
||||||
loop,
|
self.main_loop,
|
||||||
)
|
)
|
||||||
except RuntimeError as e:
|
else:
|
||||||
# No event loop running, skip this transcription
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"No event loop available for final transcription: {e}"
|
f"No event loop available for final transcription (peer: {self.peer_name})"
|
||||||
)
|
)
|
||||||
pass
|
pass
|
||||||
self.current_phrase_audio = np.array([], dtype=np.float32)
|
self.current_phrase_audio = np.array([], dtype=np.float32)
|
||||||
@ -305,20 +317,17 @@ class AudioProcessor:
|
|||||||
logger.info(
|
logger.info(
|
||||||
f"Current phrase >= {min_transcription_duration}s (RMS: {phrase_rms:.4f}), attempting streaming transcription"
|
f"Current phrase >= {min_transcription_duration}s (RMS: {phrase_rms:.4f}), attempting streaming transcription"
|
||||||
)
|
)
|
||||||
try:
|
if self.main_loop and not self.main_loop.is_closed():
|
||||||
loop = asyncio.get_event_loop()
|
|
||||||
asyncio.run_coroutine_threadsafe(
|
asyncio.run_coroutine_threadsafe(
|
||||||
self._transcribe_and_send(
|
self._transcribe_and_send(
|
||||||
self.current_phrase_audio.copy(), is_final=False
|
self.current_phrase_audio.copy(), is_final=False
|
||||||
),
|
),
|
||||||
loop,
|
self.main_loop,
|
||||||
)
|
)
|
||||||
except RuntimeError as e:
|
else:
|
||||||
# No event loop running, skip this transcription
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"No event loop available for streaming transcription: {e}"
|
f"No event loop available for streaming transcription (peer: {self.peer_name})"
|
||||||
)
|
)
|
||||||
pass
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error in audio processing loop: {e}", exc_info=True)
|
logger.error(f"Error in audio processing loop: {e}", exc_info=True)
|
||||||
@ -355,7 +364,7 @@ class AudioProcessor:
|
|||||||
return
|
return
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Starting {transcription_type} transcription: {audio_duration_sec:.2f}s audio, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f}"
|
f"🎬 TRANSCRIPTION STARTED ({transcription_type}) at {time.time():.3f}: {audio_duration_sec:.2f}s audio, RMS: {audio_rms:.4f}, Peak: {audio_peak:.4f} (peer: {self.peer_name})"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Ensure audio is in the right format for Whisper
|
# Ensure audio is in the right format for Whisper
|
||||||
@ -389,18 +398,30 @@ class AudioProcessor:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if text and len(text) > 0: # Accept any non-empty text
|
if text and len(text) > 0: # Accept any non-empty text
|
||||||
prefix = (
|
# Calculate timing information for the message
|
||||||
f"🎤 {self.peer_name}: "
|
chat_send_start = time.time()
|
||||||
if is_final
|
total_transcription_time = chat_send_start - transcription_start_time
|
||||||
else f"🎤 {self.peer_name} [partial]: "
|
|
||||||
)
|
|
||||||
message = f"{prefix}{text}"
|
|
||||||
|
|
||||||
# Avoid sending duplicate messages
|
# Create message with timing information included
|
||||||
if is_final or message not in [
|
status_marker = "🎤" if is_final else "🎤"
|
||||||
h.message for h in self.transcription_history[-3:]
|
type_marker = "" if is_final else " [partial]"
|
||||||
|
timing_info = f" (⏱️ {total_transcription_time:.2f}s from start: {transcription_start_time:.3f})"
|
||||||
|
|
||||||
|
prefix = f"{status_marker} {self.peer_name}{type_marker}: "
|
||||||
|
message = f"{prefix}{text}{timing_info}"
|
||||||
|
|
||||||
|
# Avoid sending duplicate messages (check text only, not timing)
|
||||||
|
text_only_message = f"{prefix}{text}"
|
||||||
|
if is_final or text_only_message not in [
|
||||||
|
h.message.split(' (⏱️')[0] for h in self.transcription_history[-3:]
|
||||||
]:
|
]:
|
||||||
await self.send_chat_func(message)
|
await self.send_chat_func(message)
|
||||||
|
chat_send_time = time.time() - chat_send_start
|
||||||
|
message_sent_time = time.time()
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"💬 CHAT MESSAGE SENT at {message_sent_time:.3f}: '{text}' (transcription started: {transcription_start_time:.3f}, chat send took: {chat_send_time:.3f}s, peer: {self.peer_name})"
|
||||||
|
)
|
||||||
|
|
||||||
# Keep history for deduplication
|
# Keep history for deduplication
|
||||||
history_item = TranscriptionHistoryItem(
|
history_item = TranscriptionHistoryItem(
|
||||||
@ -415,6 +436,12 @@ class AudioProcessor:
|
|||||||
logger.info(
|
logger.info(
|
||||||
f"✅ Transcribed ({transcription_type}) for {self.peer_name}: '{text}' (processing time: {total_transcription_time:.3f}s, audio duration: {audio_duration_sec:.2f}s)"
|
f"✅ Transcribed ({transcription_type}) for {self.peer_name}: '{text}' (processing time: {total_transcription_time:.3f}s, audio duration: {audio_duration_sec:.2f}s)"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Log end-to-end pipeline timing
|
||||||
|
total_pipeline_time = message_sent_time - transcription_start_time
|
||||||
|
logger.info(
|
||||||
|
f"⏱️ PIPELINE TIMING ({transcription_type}): Total={total_pipeline_time:.3f}s (Transcription={total_transcription_time:.3f}s, Chat Send={chat_send_time:.3f}s, peer: {self.peer_name}) | 🕐 Start: {transcription_start_time:.3f}, End: {message_sent_time:.3f}"
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"Skipping duplicate {transcription_type} transcription: '{text}'"
|
f"Skipping duplicate {transcription_type} transcription: '{text}'"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user