Appears to be working
parent bfcfa899ea
commit 25fc91a50c
@@ -656,13 +656,22 @@ _whisper_model: Optional[OpenVINOWhisperModel] = None
 _audio_processors: Dict[str, "OptimizedAudioProcessor"] = {}
 _send_chat_func: Optional[Callable[[str], Awaitable[None]]] = None
 
+# Model loading status for video display
+_model_loading_status: str = "Not loaded"
+_model_loading_progress: float = 0.0
+
 def _ensure_model_loaded(device: str = _device) -> OpenVINOWhisperModel:
     """Ensure the global model is loaded."""
-    global _whisper_model
+    global _whisper_model, _model_loading_status, _model_loading_progress
     if _whisper_model is None:
         setup_intel_arc_environment()
         logger.info(f"Loading OpenVINO Whisper model: {_model_id}")
+        _model_loading_status = "Loading model..."
+        _model_loading_progress = 0.1
+
         _whisper_model = OpenVINOWhisperModel(model_id=_model_id, config=_ov_config, device=device)
+        _model_loading_status = "Model loaded successfully"
+        _model_loading_progress = 1.0
         logger.info("OpenVINO Whisper model loaded successfully")
     return _whisper_model
 
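The status globals exist so the video track can render load progress without holding a reference to the loader itself. A minimal sketch of the cache-and-report pattern, with DummyModel standing in for OpenVINOWhisperModel (nothing here is part of the actual module):

from typing import Optional

class DummyModel:
    """Stand-in for OpenVINOWhisperModel."""

_status: str = "Not loaded"
_progress: float = 0.0
_model: Optional[DummyModel] = None

def load_model() -> DummyModel:
    global _model, _status, _progress
    if _model is None:                        # load once, reuse afterwards
        _status, _progress = "Loading model...", 0.1
        _model = DummyModel()                 # the expensive load goes here
        _status, _progress = "Model loaded successfully", 1.0
    return _model

load_model()
print(_status, _progress)                     # -> Model loaded successfully 1.0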
@@ -1012,7 +1021,7 @@ class WaveformVideoTrack(MediaStreamTrack):
 
     kind = "video"
 
-    def __init__(self, session_name: str, width: int = 640, height: int = 240, fps: int = 15) -> None:
+    def __init__(self, session_name: str, width: int = 640, height: int = 480, fps: int = 15) -> None:
        super().__init__()
        self.session_name = session_name
        self.width = int(width)
@@ -1039,6 +1048,22 @@ class WaveformVideoTrack(MediaStreamTrack):
 
         frame_array: npt.NDArray[np.uint8] = np.zeros((self.height, self.width, 3), dtype=np.uint8)
 
+        # Display model loading status prominently
+        status_text = _model_loading_status
+        progress = _model_loading_progress
+
+        # Draw status background
+        cv2.rectangle(frame_array, (0, 0), (self.width, 60), (0, 0, 0), -1)
+
+        # Draw progress bar if loading
+        if progress < 1.0 and "Ready" not in status_text:
+            bar_width = int(progress * (self.width - 40))
+            cv2.rectangle(frame_array, (20, 40), (20 + bar_width, 50), (0, 255, 0), -1)
+            cv2.rectangle(frame_array, (20, 40), (self.width - 20, 50), (255, 255, 255), 1)
+
+        # Draw status text
+        cv2.putText(frame_array, f"Status: {status_text}", (10, 25), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)
+
         # Select the most active processor (highest RMS) and draw its waveform
         best_proc = None
         best_rms = 0.0
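The overlay logic above reads naturally as a standalone helper. A sketch assuming only OpenCV and NumPy, reusing the hunk's 60 px band height and 20 px bar margins (draw_status is an illustrative name, not from the module):

import cv2
import numpy as np

def draw_status(frame: np.ndarray, status: str, progress: float) -> None:
    h, w = frame.shape[:2]
    cv2.rectangle(frame, (0, 0), (w, 60), (0, 0, 0), -1)        # filled black band
    if progress < 1.0 and "Ready" not in status:
        bar_width = int(progress * (w - 40))
        cv2.rectangle(frame, (20, 40), (20 + bar_width, 50), (0, 255, 0), -1)  # fill
        cv2.rectangle(frame, (20, 40), (w - 20, 50), (255, 255, 255), 1)       # outline
    cv2.putText(frame, f"Status: {status}", (10, 25),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 255, 255), 2)

frame = np.zeros((480, 640, 3), dtype=np.uint8)
draw_status(frame, "Loading model...", 0.4)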
@@ -1088,16 +1113,16 @@ class WaveformVideoTrack(MediaStreamTrack):
             points: list[tuple[int, int]] = []
             for x in range(self.width):
                 v = float(norm[x]) if x < norm.size and not np.isnan(norm[x]) else 0.0
-                y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 1))
-                points.append((x, max(0, min(self.height - 1, y))))
+                y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 70)) + 60  # Offset below status bar
+                points.append((x, max(60, min(self.height - 1, y))))
 
             if len(points) > 1:
                 pts_np = np.array(points, dtype=np.int32)
                 cv2.polylines(frame_array, [pts_np], isClosed=False, color=(0, 200, 80), thickness=2)
 
-                cv2.putText(frame_array, f"Waveform: {pname}", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
+                cv2.putText(frame_array, f"Waveform: {pname}", (10, self.height - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
             else:
-                cv2.putText(frame_array, "No audio", (10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (200, 200, 200), 1)
+                cv2.putText(frame_array, "No audio", (10, self.height - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (200, 200, 200), 1)
 
         frame = VideoFrame.from_ndarray(frame_array, format="bgr24")
         frame.pts = pts
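The new y mapping squeezes the waveform into the region below the status band: a sample v in [-1, 1] now lands in rows 60 through height - 10 instead of spanning the full frame. A worked check of the arithmetic, using the new 480 px default height:

height = 480  # new default frame height

def y_for(v: float) -> int:
    y = int((1.0 - ((v + 1.0) / 2.0)) * (height - 70)) + 60
    return max(60, min(height - 1, y))

assert y_for(1.0) == 60       # loudest positive sample: top of the band
assert y_for(-1.0) == 470     # loudest negative sample: near the bottom
assert y_for(0.0) == 265      # silence: band midline, not frame midline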
@@ -1112,21 +1137,86 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
         logger.info(f"Ignoring non-audio track from {peer.peer_name}: {track.kind}")
         return
 
-    # Create audio processor
     if peer.peer_name not in _audio_processors:
         if _send_chat_func is None:
             logger.error(f"Cannot create processor for {peer.peer_name}: no send_chat_func")
             return
 
+        # Start background task to load model and create processor
+        async def init_processor():
+            global _model_loading_status, _model_loading_progress
+            # Load model asynchronously to avoid blocking frame reading
+            _model_loading_status = "Initializing model loading..."
+            _model_loading_progress = 0.0
+
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(None, _ensure_model_loaded)
+
+            _model_loading_status = "Model loaded, creating processor..."
+            _model_loading_progress = 0.8
+
             logger.info(f"Creating OptimizedAudioProcessor for {peer.peer_name}")
+            if _send_chat_func is None:
+                logger.error(f"No send_chat_func available for {peer.peer_name}")
+                _model_loading_status = "Error: No send function available"
+                return
             _audio_processors[peer.peer_name] = OptimizedAudioProcessor(
                 peer_name=peer.peer_name,
                 send_chat_func=_send_chat_func
             )
 
             audio_processor = _audio_processors[peer.peer_name]
+            _model_loading_status = "Ready for transcription"
+            _model_loading_progress = 1.0
             logger.info(f"Starting OpenVINO audio processing for {peer.peer_name}")
 
+            # Now start processing frames
+            frame_count = 0
+            while True:
+                try:
+                    frame = await track.recv()
+                    frame_count += 1
+
+                    if frame_count % 100 == 0:
+                        logger.debug(f"Received {frame_count} frames from {peer.peer_name}")
+
+                except MediaStreamError as e:
+                    logger.info(f"Audio stream ended for {peer.peer_name}: {e}")
+                    break
+                except Exception as e:
+                    logger.error(f"Error receiving frame from {peer.peer_name}: {e}")
+                    break
+
+                if isinstance(frame, AudioFrame):
+                    try:
+                        # Convert frame to numpy array
+                        audio_data = frame.to_ndarray()
+
+                        # Handle audio format conversion
+                        audio_data = _process_audio_frame(audio_data, frame)
+
+                        # Resample if needed
+                        if frame.sample_rate != SAMPLE_RATE:
+                            audio_data = _resample_audio(audio_data, frame.sample_rate, SAMPLE_RATE)
+
+                        # Convert to float32
+                        audio_data_float32 = audio_data.astype(np.float32)
+                        audio_data = normalize_audio(audio_data)
+
+                        # Process with optimized processor
+                        audio_processor.add_audio_data(audio_data_float32)
+
+                    except Exception as e:
+                        logger.error(f"Error processing audio frame for {peer.peer_name}: {e}")
+                        continue
+
+        asyncio.create_task(init_processor())
+        return  # Exit early, processing is handled in background
+
+    # If processor already exists, just continue processing
+    audio_processor = _audio_processors[peer.peer_name]
+    logger.info(f"Continuing OpenVINO audio processing for {peer.peer_name}")
+
     try:
         frame_count = 0
         while True:
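The key move in this hunk is deferring the blocking model load to a thread pool via run_in_executor, so the event loop keeps servicing incoming frames while OpenVINO compiles. A self-contained sketch of that pattern, where blocking_load() merely stands in for _ensure_model_loaded():

import asyncio
import time

def blocking_load() -> str:
    time.sleep(0.5)                 # stands in for the slow model compile
    return "model"

async def init_processor() -> None:
    loop = asyncio.get_running_loop()
    model = await loop.run_in_executor(None, blocking_load)  # off the event loop
    print(f"loaded: {model}")       # processor creation would follow here

async def main() -> None:
    task = asyncio.create_task(init_processor())  # fire-and-forget, as in the hunk
    print("event loop stays free while the model loads")
    await task                      # kept here only so the demo exits cleanly

asyncio.run(main())

The sketch uses asyncio.get_running_loop(); inside a running coroutine the hunk's get_event_loop() resolves to the same loop. Note also that asyncio's documentation recommends keeping a reference to tasks created with create_task so they are not garbage-collected mid-flight.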
@@ -1239,7 +1329,7 @@ def create_agent_tracks(session_name: str) -> Dict[str, MediaStreamTrack]:
            await asyncio.sleep(1 / self.fps)
            return frame
    try:
-        video_track = WaveformVideoTrack(session_name=session_name, width=640, height=240, fps=15)
+        video_track = WaveformVideoTrack(session_name=session_name, width=640, height=480, fps=15)
        audio_track = SilentAudioTrack()
        return {"video": video_track, "audio": audio_track}
    except Exception as e:
@@ -1304,5 +1394,7 @@ def get_model_info() -> Dict[str, Any]:
         "quantization_enabled": _ov_config.enable_quantization,
         "is_quantized": ov_model.is_quantized,
         "sample_rate": SAMPLE_RATE,
-        "chunk_duration_ms": CHUNK_DURATION_MS
+        "chunk_duration_ms": CHUNK_DURATION_MS,
+        "loading_status": _model_loading_status,
+        "loading_progress": _model_loading_progress
     }
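With the two new keys, a caller can poll get_model_info() for a one-line progress readout. A hypothetical example using literal values in place of the real call, since the dict shape is all that matters here:

info = {"loading_status": "Ready for transcription", "loading_progress": 1.0}
print(f"{info['loading_status']} ({info['loading_progress']:.0%})")  # -> Ready for transcription (100%)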