Tmp

commit 51b1ef7bc2 (parent 394d3f349c)
@@ -724,8 +724,7 @@ _send_chat_func: Optional[Callable[[str], Awaitable[None]]] = None
 _model_loading_status: str = "Not loaded"
 _model_loading_progress: float = 0.0
 
-# Raw audio buffer for immediate graphing
-_raw_audio_buffer: Dict[str, npt.NDArray[np.float32]] = {}
+# Raw audio buffer for immediate graphing (now handled by WaveformVideoTrack.buffer)
 
 
 def _ensure_model_loaded(device: str = _device) -> OpenVINOWhisperModel:
@@ -837,6 +836,10 @@ class OptimizedAudioProcessor:
         self.send_chat_func = send_chat_func
         self.sample_rate = SAMPLE_RATE
 
+        # Initialize visualization buffer if not already done
+        if self.peer_name not in WaveformVideoTrack.buffer:
+            WaveformVideoTrack.buffer[self.peer_name] = np.array([], dtype=np.float32)
+
         # Optimized buffering parameters
         self.chunk_size = int(self.sample_rate * CHUNK_DURATION_MS / 1000)
         self.buffer_size = self.chunk_size * 50
@@ -952,6 +955,12 @@ class OptimizedAudioProcessor:
             self.silence_frames = 0
             self.last_activity_time = time.time()
             self._add_to_circular_buffer(audio_data)
+            # Update visualization buffer
+            WaveformVideoTrack.buffer[self.peer_name] = np.concatenate([WaveformVideoTrack.buffer[self.peer_name], audio_data])
+            # Limit buffer size to last 10 seconds
+            max_samples = SAMPLE_RATE * 10
+            if len(WaveformVideoTrack.buffer[self.peer_name]) > max_samples:
+                WaveformVideoTrack.buffer[self.peer_name] = WaveformVideoTrack.buffer[self.peer_name][-max_samples:]
         elif (
             len(self.current_phrase_audio) > 0
             and self.silence_frames < self.max_trailing_silence_frames
@@ -959,6 +968,12 @@ class OptimizedAudioProcessor:
             logger.info(f"Trailing silence accepted for {self.peer_name}")
             self.silence_frames += 1
             self._add_to_circular_buffer(audio_data)
+            # Update visualization buffer
+            WaveformVideoTrack.buffer[self.peer_name] = np.concatenate([WaveformVideoTrack.buffer[self.peer_name], audio_data])
+            # Limit buffer size to last 10 seconds
+            max_samples = SAMPLE_RATE * 10
+            if len(WaveformVideoTrack.buffer[self.peer_name]) > max_samples:
+                WaveformVideoTrack.buffer[self.peer_name] = WaveformVideoTrack.buffer[self.peer_name][-max_samples:]
         else:
             if (self.silence_frames % 10 == 0) and (self.silence_frames > 0):
                 logger.info(
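Note: the six-line visualization update above is duplicated verbatim between the voiced branch and the trailing-silence branch. A minimal sketch of how the append-and-trim could be factored into one helper; the name _append_for_viz is hypothetical, and the sample rate of 16000 is an assumed value for the module's SAMPLE_RATE constant:

    import numpy as np
    import numpy.typing as npt

    def _append_for_viz(
        buffer: dict[str, npt.NDArray[np.float32]],
        peer_name: str,
        audio_data: npt.NDArray[np.float32],
        max_seconds: int = 10,
        sample_rate: int = 16000,  # assumed value of SAMPLE_RATE
    ) -> None:
        """Append samples for waveform rendering, keeping only the trailing window."""
        joined = np.concatenate(
            [buffer.get(peer_name, np.array([], dtype=np.float32)), audio_data]
        )
        # A negative-index slice keeps at most the most recent max_samples samples.
        buffer[peer_name] = joined[-sample_rate * max_seconds:]

Both branches would then reduce to _append_for_viz(WaveformVideoTrack.buffer, self.peer_name, audio_data).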
@@ -1380,6 +1395,9 @@ class WaveformVideoTrack(MediaStreamTrack):
 
     kind = "video"
 
+    # Shared buffer for audio data
+    buffer: Dict[str, npt.NDArray[np.float32]] = {}
+
     def __init__(
         self, session_name: str, width: int = 640, height: int = 480, fps: int = 15
     ) -> None:
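Because buffer is a class attribute, every WaveformVideoTrack instance and the audio-side code share one dict: processors publish samples under their peer name, and the render loop reads them back via self.__class__.buffer. A small sketch of those semantics (the peer name "alice" is illustrative only); dict.setdefault is equivalent to the "if key not in dict" guards this commit adds:

    import numpy as np

    # Writer side (audio path): publish samples under the peer's key.
    buf = WaveformVideoTrack.buffer
    buf.setdefault("alice", np.array([], dtype=np.float32))
    buf["alice"] = np.concatenate([buf["alice"], np.zeros(160, dtype=np.float32)])

    # Reader side (render path): instances see the same mapping, which is why
    # the drawing code below iterates self.__class__.buffer.items().
    assert WaveformVideoTrack.buffer is buf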
@@ -1443,7 +1461,7 @@ class WaveformVideoTrack(MediaStreamTrack):
             current_time, cv2.FONT_HERSHEY_SIMPLEX, 1.0, 2
         )
         clock_x = self.width - text_width - 10  # 10px margin from right edge
-        clock_y = self.height - 10  # 10px margin from bottom
+        clock_y = self.height - 30  # Move to 450 for height=480
         cv2.putText(
             frame_array,
             current_time,
@@ -1458,10 +1476,12 @@ class WaveformVideoTrack(MediaStreamTrack):
         best_proc = None
         best_rms = 0.0
         try:
-            for pname, arr in _raw_audio_buffer.items():
+            for pname, arr in self.__class__.buffer.items():
                 try:
-                    if arr is None or len(arr) == 0:
-                        continue
-                    rms = float(np.sqrt(np.mean(arr**2)))
+                    # Allow empty arrays to be selected
+                    if len(arr) == 0:
+                        rms = 0.0
+                    else:
+                        rms = float(np.sqrt(np.mean(arr**2)))
                     if rms > best_rms:
                         best_rms = rms
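The loop above decides which peer's waveform to draw by picking the buffer with the highest RMS energy. The selection rule in isolation (the function name loudest_peer is hypothetical):

    from typing import Dict, Optional, Tuple

    import numpy as np
    import numpy.typing as npt

    def loudest_peer(
        buffers: Dict[str, npt.NDArray[np.float32]],
    ) -> Tuple[Optional[str], float]:
        """Return (peer_name, rms) for the buffer with the highest RMS energy."""
        best_name: Optional[str] = None
        best_rms = 0.0
        for name, arr in buffers.items():
            # Empty buffers score 0.0, as in the hunk's new if/else.
            rms = 0.0 if len(arr) == 0 else float(np.sqrt(np.mean(arr ** 2)))
            if rms > best_rms:
                best_rms = rms
                best_name = name
        return best_name, best_rms

One subtlety in the hunk as shown: despite the "Allow empty arrays to be selected" comment, the strict rms > best_rms test against an initial 0.0 means an empty (or all-zero) buffer still never wins the comparison.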
@@ -1507,7 +1527,7 @@ class WaveformVideoTrack(MediaStreamTrack):
             points: list[tuple[int, int]] = []
             for x in range(self.width):
                 v = float(norm[x]) if x < norm.size and not np.isnan(norm[x]) else 0.0
-                y = int((1.0 - ((v + 1.0) / 2.0)) * self.height)
+                y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 80)) + 80
                 points.append((x, y))
 
             if len(points) > 1:
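The new y mapping compresses the trace into the band below y = 80, keeping the top of the frame clear for the status text and clock: a sample v in [-1, 1] maps to y in [80, height]. A quick endpoint check, assuming the track's default height of 480:

    # y(v) = (1 - (v + 1) / 2) * (height - 80) + 80, with height = 480
    height = 480

    def y_of(v: float) -> int:
        return int((1.0 - ((v + 1.0) / 2.0)) * (height - 80)) + 80

    assert y_of(1.0) == 80    # full-scale positive sample touches the reserved band
    assert y_of(0.0) == 280   # the zero line sits mid-way down the drawable region
    assert y_of(-1.0) == 480  # full-scale negative sample reaches the bottom edge

Row 480 is one past the last row of a 480-pixel frame, but cv2.polylines clips to the image, so this should be harmless.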
@@ -1516,8 +1536,8 @@ class WaveformVideoTrack(MediaStreamTrack):
                     frame_array,
                     [pts_np],
                     isClosed=False,
-                    color=(0, 200, 80),
-                    thickness=2,
+                    color=(255, 255, 255),
+                    thickness=4,
                 )
 
             cv2.putText(
@@ -1530,13 +1550,35 @@ class WaveformVideoTrack(MediaStreamTrack):
                 2,
             )
         else:
+            # Draw a test sine wave to verify drawing works
+            import math
+            test_points: list[tuple[int, int]] = []
+            for x in range(self.width):
+                # Create a sine wave with some amplitude
+                v = 0.5 * math.sin(2 * math.pi * x / self.width * 4)  # 4 cycles across width
+                y = int((1.0 - ((v + 1.0) / 2.0)) * (self.height - 80)) + 80
+                test_points.append((x, y))
+
+            if len(test_points) > 1:
+                pts_np = np.array(test_points, dtype=np.int32)
+                cv2.polylines(
+                    frame_array,
+                    [pts_np],
+                    isClosed=False,
+                    color=(255, 255, 255),
+                    thickness=4,
+                )
+
+            # Show buffer status
+            buffer_keys = len(self.__class__.buffer)
+            total_samples = sum(len(arr) for arr in self.__class__.buffer.values())
             cv2.putText(
                 frame_array,
-                "No audio",
-                (10, self.height - 15),
+                f"Test waveform - Buffer: {buffer_keys} keys, {total_samples} samples",
+                (10, 400),
                 cv2.FONT_HERSHEY_SIMPLEX,
-                1.2,
-                (200, 200, 200),
+                0.8,
+                (255, 255, 255),
                 2,
             )
 
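When no buffer qualifies, the track now renders a synthetic four-cycle sine plus a buffer-status readout instead of the static "No audio" label, which makes it obvious at a glance whether a blank feed is a rendering failure or an audio-path failure. The test pattern reuses the live waveform's y mapping; a hypothetical factoring of it (not in the commit):

    import math

    def test_wave_points(
        width: int, height: int, cycles: int = 4, amp: float = 0.5
    ) -> list[tuple[int, int]]:
        """Synthetic sine for a render self-test, mirroring the else-branch math."""
        points: list[tuple[int, int]] = []
        for x in range(width):
            v = amp * math.sin(2 * math.pi * x / width * cycles)
            y = int((1.0 - ((v + 1.0) / 2.0)) * (height - 80)) + 80
            points.append((x, y))
        return points

A side note: the "import math" sits inside the per-frame draw path; Python caches modules so re-import is only a dict lookup, but hoisting it to module scope would be the idiomatic placement.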
@@ -1548,15 +1590,15 @@
 
 async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
     """Handle incoming audio tracks from WebRTC peers."""
-    global _audio_processors, _send_chat_func, _raw_audio_buffer
+    global _audio_processors, _send_chat_func
 
     if track.kind != "audio":
         logger.info(f"Ignoring non-audio track from {peer.peer_name}: {track.kind}")
         return
 
     # Initialize raw audio buffer for immediate graphing
-    if peer.peer_name not in _raw_audio_buffer:
-        _raw_audio_buffer[peer.peer_name] = np.array([], dtype=np.float32)
+    if peer.peer_name not in WaveformVideoTrack.buffer:
+        WaveformVideoTrack.buffer[peer.peer_name] = np.array([], dtype=np.float32)
 
     if peer.peer_name not in _audio_processors:
         if _send_chat_func is None:
@@ -1686,13 +1728,6 @@ async def handle_track_received(peer: Peer, track: MediaStreamTrack) -> None:
 
                 logger.debug(f"Processed audio frame {frame_count} from {peer.peer_name}: {len(audio_data_float32)} samples")
 
-                # Update raw buffer for graphing
-                _raw_audio_buffer[peer.peer_name] = np.concatenate([_raw_audio_buffer[peer.peer_name], audio_data_float32])
-                # Limit buffer size to last 10 seconds
-                max_samples = SAMPLE_RATE * 10
-                if len(_raw_audio_buffer[peer.peer_name]) > max_samples:
-                    _raw_audio_buffer[peer.peer_name] = _raw_audio_buffer[peer.peer_name][-max_samples:]
-
                 # Process with optimized processor if available
                 audio_processor.add_audio_data(audio_data_float32)
 
@@ -1834,7 +1869,7 @@ def bind_send_chat_function(send_chat_func: Callable[[str], Awaitable[None]]) ->
 
 def cleanup_peer_processor(peer_name: str) -> None:
     """Clean up processor for disconnected peer."""
-    global _audio_processors, _raw_audio_buffer
+    global _audio_processors
 
     if peer_name in _audio_processors:
         logger.info(f"Cleaning up processor for {peer_name}")
@@ -1843,8 +1878,8 @@ def cleanup_peer_processor(peer_name: str) -> None:
         del _audio_processors[peer_name]
         logger.info(f"Processor cleanup complete for {peer_name}")
 
-    if peer_name in _raw_audio_buffer:
-        del _raw_audio_buffer[peer_name]
+    if peer_name in WaveformVideoTrack.buffer:
+        del WaveformVideoTrack.buffer[peer_name]
 
 
 def get_active_processors() -> Dict[str, OptimizedAudioProcessor]: