Updated deployed instance with visualization

commit 1dd5ae115d
parent 1ad2638277

Dockerfile (11 changes)
@@ -257,7 +257,6 @@ FROM llm-base AS backstory
 
 COPY /src/requirements.txt /opt/backstory/src/requirements.txt
 RUN pip install -r /opt/backstory/src/requirements.txt
-COPY /src/ /opt/backstory/src/
 
 SHELL [ "/bin/bash", "-c" ]
 
@@ -303,6 +302,8 @@ ENV SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
 ENV SYCL_CACHE_PERSISTENT=1
 ENV PATH=/opt/backstory:$PATH
 
+COPY /src/ /opt/backstory/src/
+
 ENTRYPOINT [ "/entrypoint.sh" ]
 
 FROM ubuntu:oracular AS ollama
@@ -406,6 +407,14 @@ RUN { \
     && chmod +x /fetch-models.sh
 
 ENV PYTHONUNBUFFERED=1
+# Enable ext_intel_free_memory
+ENV ZES_ENABLE_SYSMAN=1
+# Use all GPUs
+ENV OLLAMA_NUM_GPU=999
+# Use immediate command lists
+ENV SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1
+# Use persistent cache
+ENV SYCL_CACHE_PERSISTENT=1
 
 VOLUME [" /root/.ollama" ]
 

@@ -4,6 +4,7 @@ services:
       context: .
       dockerfile: Dockerfile
       target: backstory
+    container_name: backstory
     image: backstory
     restart: "no"
     env_file:
@@ -70,6 +71,7 @@ services:
       dockerfile: Dockerfile
       target: ollama
     image: ollama
+    container_name: ollama
     restart: "always"
     env_file:
       - .env
@@ -133,6 +135,7 @@ services:
       dockerfile: Dockerfile
       target: miniircd
     image: miniircd
+    container_name: miniircd
     restart: "no"
     env_file:
       - .env

@@ -36,7 +36,7 @@ import { Message, MessageList } from './Message';
 import { MessageData } from './MessageMeta';
 import { SeverityType } from './Snack';
 import { ContextStatus } from './ContextStatus';
-import { VectorVisualizer, ResultData } from './VectorVisualizer';
+import { VectorVisualizer } from './VectorVisualizer';
 
 
 import './App.css';
@@ -343,7 +343,6 @@ const App = () => {
   const [resume, setResume] = useState<MessageData | undefined>(undefined);
   const [facts, setFacts] = useState<MessageData | undefined>(undefined);
   const timerRef = useRef<any>(null);
-  const [result, setResult] = useState<ResultData | undefined>(undefined);
 
   const startCountdown = (seconds: number) => {
     if (timerRef.current) clearInterval(timerRef.current);
@@ -422,31 +421,6 @@ const App = () => {
     });
   }, [systemInfo, setSystemInfo, connectionBase, setSnack, sessionId])
 
-  // Get the collection to visualize
-  useEffect(() => {
-    if (result !== undefined || sessionId === undefined) {
-      return;
-    }
-    const fetchCollection = async () => {
-      try {
-        const response = await fetch(connectionBase + `/api/tsne/${sessionId}`, {
-          method: 'PUT',
-          headers: {
-            'Content-Type': 'application/json',
-          },
-          body: JSON.stringify({ dimensions: 3 }),
-        });
-        const data = await response.json();
-        setResult(data);
-      } catch (error) {
-        console.error('Error obtaining collection information:', error);
-        setSnack("Unable to obtain collection information.", "error");
-      };
-    };
-
-    fetchCollection();
-  }, [result, setResult, connectionBase, setSnack, sessionId])
-
   // Get the About markdown
   useEffect(() => {
     if (about !== "") {
@@ -1101,7 +1075,7 @@ const App = () => {
           onChange={handleTabChange} aria-label="Backstory navigation">
           <Tab label="Backstory" icon={<Avatar sx={{ width: 24, height: 24 }} variant="rounded" alt="Backstory logo" src="/logo192.png" />} iconPosition="start" />
           <Tab label="Resume Builder"/>
-          <Tab label="Visualizer" />
+          <Tab label="Context Visualizer" />
           <Tab label="About"/>
         </Tabs>
       </Box>}
@@ -1215,7 +1189,7 @@ const App = () => {
       <CustomTabPanel tab={tab} index={2}>
         <Box className="ChatBox">
           <Box className="Conversation">
-            {result !== undefined && <VectorVisualizer {...{ result, connectionBase, sessionId }} />}
+            <VectorVisualizer {...{ connectionBase, sessionId, setSnack }} />
           </Box>
         </Box>
       </CustomTabPanel>

@@ -1,13 +1,17 @@
 import React, { useEffect, useState } from 'react';
 import Box from '@mui/material/Box';
+import Card from '@mui/material/Card';
+import Typography from '@mui/material/Typography';
 import Plot from 'react-plotly.js';
 import TextField from '@mui/material/TextField';
 import Tooltip from '@mui/material/Tooltip';
 import Button from '@mui/material/Button';
 import SendIcon from '@mui/icons-material/Send';
 
+import { SeverityType } from './Snack';
+
 interface Metadata {
-  type?: string;
+  doc_type?: string;
   [key: string]: any;
 }
 
@@ -18,6 +22,7 @@ interface ResultData {
 }
 
 interface PlotData {
+  data: {
     x: number[];
     y: number[];
     z?: number[];
@@ -25,12 +30,15 @@ interface PlotData {
     text: string[];
     sizes: number[];
     symbols: string[];
+    doc_types: string[];
+  };
+  layout: Partial<Plotly.Layout>;
 }
 
 interface VectorVisualizerProps {
-  result: ResultData;
   connectionBase: string;
   sessionId?: string;
+  setSnack: (message: string, severity: SeverityType) => void;
 }
 
 interface ChromaResult {
@@ -43,10 +51,82 @@ interface ChromaResult {
   vector_embedding?: number[];
 }
 
-const VectorVisualizer: React.FC<VectorVisualizerProps> = ({ result, connectionBase, sessionId }) => {
+const normalizeDimension = (arr: number[]): number[] => {
+  const min = Math.min(...arr);
+  const max = Math.max(...arr);
+  const range = max - min;
+  if (range === 0) return arr.map(() => 0.5); // flat dimension
+  return arr.map(v => (v - min) / range);
+};
+
+const getTextColorForBackground = (bgColor: string): string => {
+  const r = parseInt(bgColor.slice(1, 3), 16);
+  const g = parseInt(bgColor.slice(3, 5), 16);
+  const b = parseInt(bgColor.slice(5, 7), 16);
+  const luminance = 0.299 * r + 0.587 * g + 0.114 * b;
+  return luminance > 186 ? '#2E2E2E' : '#FFFFFF'; // Charcoal or white from your theme
+};
+
+const emojiMap: Record<string, string> = {
+  query: '🔍',
+  resume: '📄',
+  projects: '📁',
+  news: '📰',
+};
+
+const colorMap: Record<string, string> = {
+  query: '#D4A017', // Golden Ochre — strong highlight
+  resume: '#4A7A7D', // Dusty Teal — secondary theme color
+  projects: '#1A2536', // Midnight Blue — rich and deep
+  news: '#D3CDBF', // Warm Gray — soft and neutral
+};
+
+const sizeMap: Record<string, number> = {
+  'query': 10,
+};
+
+const symbolMap: Record<string, string> = {
+  'query': 'circle',
+};
+
+const VectorVisualizer: React.FC<VectorVisualizerProps> = ({ setSnack, connectionBase, sessionId }) => {
   const [plotData, setPlotData] = useState<PlotData | null>(null);
   const [query, setQuery] = useState<string>('');
   const [queryEmbedding, setQueryEmbedding] = useState<ChromaResult | undefined>(undefined);
+  const [result, setResult] = useState<ResultData | undefined>(undefined);
+  const [tooltip, setTooltip] = useState<{
+    visible: boolean,
+    // x: number,
+    // y: number,
+    content: string,
+    background: string,
+    color: string,
+  } | null>(null);
+
+  // Get the collection to visualize
+  useEffect(() => {
+    if (result !== undefined || sessionId === undefined) {
+      return;
+    }
+    const fetchCollection = async () => {
+      try {
+        const response = await fetch(connectionBase + `/api/umap/${sessionId}`, {
+          method: 'PUT',
+          headers: {
+            'Content-Type': 'application/json',
+          },
+          body: JSON.stringify({ dimensions: 3 }),
+        });
+        const data = await response.json();
+        setResult(data);
+      } catch (error) {
+        console.error('Error obtaining collection information:', error);
+        setSnack("Unable to obtain collection information.", "error");
+      };
+    };
+
+    fetchCollection();
+  }, [result, setResult, connectionBase, setSnack, sessionId])
+
   useEffect(() => {
     if (!result || !result.embeddings) return;
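
The normalizeDimension helper added above rescales each plot axis independently into [0, 1] so the three UMAP axes share a comparable range. A minimal sketch of the same idea in Python (illustrative only; the component itself is TypeScript):

    def normalize_dimension(values):
        # Rescale one axis to [0, 1]; a flat axis maps to 0.5, as in the TSX helper.
        lo, hi = min(values), max(values)
        if hi == lo:
            return [0.5] * len(values)  # flat dimension
        return [(v - lo) / (hi - lo) for v in values]

    # Each axis is normalized on its own:
    xs = normalize_dimension([2.0, 4.0, 6.0])  # [0.0, 0.5, 1.0]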
@@ -57,7 +137,7 @@ const VectorVisualizer: React.FC<VectorVisualizerProps> = ({ result, connectionB
     const metadatas = [...result.metadatas || []];
 
     if (queryEmbedding !== undefined && queryEmbedding.vector_embedding !== undefined) {
-      metadatas.unshift({ type: 'query' });
+      metadatas.unshift({ doc_type: 'query' });
       documents.unshift(queryEmbedding.query || '');
       vectors.unshift(queryEmbedding.vector_embedding);
     }
@@ -67,19 +147,8 @@ const VectorVisualizer: React.FC<VectorVisualizerProps> = ({ result, connectionB
       console.error('Vectors are neither 2D nor 3D');
       return;
     }
-    console.log('Vectors:', vectors);
-    // Placeholder color assignment
-    const colorMap: Record<string, string> = {
-      'query': '#00ff00',
-    };
-    const sizeMap: Record<string, number> = {
-      'query': 10,
-    };
-    const symbolMap: Record<string, string> = {
-      'query': 'circle',
-    };
 
-    const doc_types = metadatas.map(m => m.type || 'unknown');
+    const doc_types = metadatas.map(m => m.doc_type || 'unknown');
     const sizes = doc_types.map(type => {
       if (!sizeMap[type]) {
         sizeMap[type] = 5;
@@ -98,31 +167,51 @@ const VectorVisualizer: React.FC<VectorVisualizerProps> = ({ result, connectionB
       }
       return colorMap[type];
     });
-    const x = vectors.map((v: number[]) => v[0]);
-    const y = vectors.map((v: number[]) => v[1]);
-    const text = documents.map((doc, i) => `Type: ${doc_types[i]}<br>Text: ${doc.slice(0, 100)}...`);
+    const customdata = metadatas.map((m, index) => {
+      return { doc: documents[index], type: m.doc_type || 'unknown' };
+    });
+    const x = normalizeDimension(vectors.map((v: number[]) => v[0]));
+    const y = normalizeDimension(vectors.map((v: number[]) => v[1]));
+    const z = is3D ? normalizeDimension(vectors.map((v: number[]) => v[2])) : undefined
+
+    const layout: Partial<Plotly.Layout> = {
+      autosize: true,
+      paper_bgcolor: '#FFFFFF', // white
+      plot_bgcolor: '#FFFFFF', // white plot background
+      font: {
+        family: 'Roboto, sans-serif',
+        color: '#2E2E2E', // charcoal black
+      },
+      hovermode: 'closest',
+      scene: {
+        bgcolor: '#FFFFFF', // 3D plot background
+        zaxis: { title: 'Z', gridcolor: '#cccccc', zerolinecolor: '#aaaaaa' },
+      },
+      xaxis: { title: 'X', gridcolor: '#cccccc', zerolinecolor: '#aaaaaa' },
+      yaxis: { title: 'Y', gridcolor: '#cccccc', zerolinecolor: '#aaaaaa' },
+      margin: { r: 20, b: 10, l: 10, t: 40 },
+    };
+
+    const data: any = {
+      x: x,
+      y: y,
+      mode: 'markers',
+      marker: {
+        size: sizes,
+        symbol: symbols,
+        color: colors,
+        opacity: 0.8,
+      },
+      customdata: customdata,
+      type: z?.length ? 'scatter3d' : 'scatter',
+    };
+
     if (is3D) {
-      const z = vectors.map((v: number[]) => v[2]);
-      setPlotData({
-        x: x,
-        y: y,
-        z: z,
-        colors: colors,
-        sizes: sizes,
-        symbols: symbols,
-        text: text
-      });
-    } else {
-      setPlotData({
-        x: x,
-        y: y,
-        colors: colors,
-        sizes: sizes,
-        symbols: symbols,
-        text: text
-      });
+      data.z = z;
     }
 
+    setPlotData({ data, layout });
+
   }, [result, queryEmbedding]);
 
   const handleKeyPress = (event: any) => {
@@ -134,7 +223,7 @@ const VectorVisualizer: React.FC<VectorVisualizerProps> = ({ result, connectionB
   const sendQuery = async (query: string) => {
     if (!query.trim()) return;
     setQuery('');
+    try {
       const response = await fetch(`${connectionBase}/api/similarity/${sessionId}`, {
         method: 'PUT',
         headers: {
@@ -147,6 +236,10 @@ const VectorVisualizer: React.FC<VectorVisualizerProps> = ({ result, connectionB
       const chroma: ChromaResult = await response.json();
       console.log('Chroma:', chroma);
       setQueryEmbedding(chroma);
+    } catch (error) {
+      console.error('Error obtaining query similarity information:', error);
+      setSnack("Unable to obtain query similarity information.", "error");
+    };
   };
 
   if (!plotData || sessionId === undefined) return (
@@ -157,45 +250,65 @@ const VectorVisualizer: React.FC<VectorVisualizerProps> = ({ result, connectionB
 
   return (
     <>
+      <Card sx={{ display: 'flex', flexDirection: 'column', justifyContent: 'center', alignItems: 'center', mb: 1, pt: 0 }}>
+        <Typography variant="h6" sx={{ p: 1, pt: 0 }}>
+          Similarity Visualization via Uniform Manifold Approximation and Projection (UMAP)
+        </Typography>
+      </Card>
       <Box sx={{ display: 'flex', flexGrow: 1, justifyContent: 'center', alignItems: 'center' }}>
         <Plot
-          data={[
-            {
-              x: plotData.x,
-              y: plotData.y,
-              z: plotData.z,
-              mode: 'markers',
-              marker: {
-                size: plotData.sizes,
-                symbol: plotData.symbols,
-                color: plotData.colors,
-                opacity: 0.8,
-              },
-              text: plotData.text,
-              hoverinfo: 'text',
-              type: plotData.z?.length ? 'scatter3d' : 'scatter',
-            },
-          ]}
-          useResizeHandler={true}
-          config={{ responsive: true }}
-          style={{ width: '100%', height: '100%' }}
-          layout={{
-            autosize: true,
-            title: 'Vector Store Visualization',
-            xaxis: { title: 'x' },
-            yaxis: { title: 'y' },
-            zaxis: { title: 'z' },
-            margin: { r: 20, b: 10, l: 10, t: 40 },
+          onHover={(event: any) => {
+            const point = event.points[0];
+            console.log('Point:', point);
+            const type = point.customdata.type;
+            const text = point.customdata.doc;
+            const emoji = emojiMap[type] || '❓';
+            setTooltip({
+              visible: true,
+              background: point['marker.color'],
+              color: getTextColorForBackground(point['marker.color']),
+              content: `${emoji} ${type.toUpperCase()}\n${text}`,
+            });
           }}
 
+          data={[plotData.data]}
+          useResizeHandler={true}
+          config={{
+            responsive: true,
+            displayModeBar: false,
+            displaylogo: false,
+            showSendToCloud: false,
+            staticPlot: false,
+          }}
+          style={{ width: '100%', height: '100%' }}
+          layout={plotData.layout}
         />
       </Box>
+      <Card sx={{
+        display: 'flex',
+        flexDirection: 'column',
+        flexGrow: 1,
+        mt: 1,
+        p: 0.5,
+        color: tooltip?.color || '#2E2E2E',
+        background: tooltip?.background || '#FFFFFF',
+        whiteSpace: 'pre-line',
+        zIndex: 1000,
+        overflow: 'auto',
+        maxHeight: '20vh',
+        minHeight: '20vh',
+      }}
+      >
+        <Typography variant="body2" sx={{ p: 1, pt: 0 }}>
+          {tooltip?.content}
+        </Typography>
+      </Card>
       { queryEmbedding !== undefined &&
-        <Box sx={{ display: 'flex', flexDirection: 'column', p: 1 }}>
-          <Box sx={{ fontSize: '0.8rem', mb: 1 }}>
+        <Card sx={{ display: 'flex', flexDirection: 'column', justifyContent: 'center', alignItems: 'center', mt: 1, pb: 0 }}>
+          <Typography variant="h6" sx={{ p: 1, pt: 0 }}>
             Query: {queryEmbedding.query}
-          </Box>
-        </Box>
+          </Typography>
+        </Card>
       }
 
       <Box className="Query" sx={{ display: "flex", flexDirection: "row", p: 1 }}>
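
getTextColorForBackground, used for the hover card above, picks dark or light text from the marker color using the classic Rec. 601 luma weights, treating luminance above 186 as a light background. The same rule in Python (illustrative only):

    def text_color_for_background(bg: str) -> str:
        # bg is '#RRGGBB'; choose text color by perceived luminance (Rec. 601 weights).
        r, g, b = (int(bg[i:i + 2], 16) for i in (1, 3, 5))
        luminance = 0.299 * r + 0.587 * g + 0.114 * b
        return '#2E2E2E' if luminance > 186 else '#FFFFFF'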

@@ -48,7 +48,6 @@ Deprecated==1.2.18
 diffusers==0.33.1
 dill==0.3.8
 distro==1.9.0
-dpcpp-cpp-rt==2025.0.4
 durationpy==0.9
 einops==0.8.1
 emoji==2.14.1
@@ -104,15 +103,6 @@ impi-devel==2021.14.1
 impi-rt==2021.14.1
 importlib_metadata==8.6.1
 importlib_resources==6.5.2
-intel-cmplr-lib-rt==2025.0.2
-intel-cmplr-lib-ur==2025.0.2
-intel-cmplr-lic-rt==2025.0.2
-intel-opencl-rt==2025.0.4
-intel-openmp==2025.0.4
-intel-pti==0.10.0
-intel-sycl-rt==2025.0.2
-intel_extension_for_pytorch==2.6.10+xpu
-ipex-llm @ file:///opt/wheels/ipex_llm-2.2.0.dev0-py3-none-any.whl#sha256=5023ff4dc9799838486b4d160d5f3dcd5f6d3bb9ac8a2c6cabaf90034b540ba3
 ipykernel==6.29.5
 ipython==9.1.0
 ipython_pygments_lexers==1.1.1
@@ -160,8 +150,6 @@ matplotlib==3.10.1
 matplotlib-inline==0.1.7
 mdurl==0.1.2
 mistune==3.1.3
-mkl==2025.0.1
-mkl-dpcpp==2025.0.1
 mmh3==5.1.0
 modal==0.74.4
 monotonic==1.6

@@ -25,9 +25,9 @@ try_import('requests')
 try_import('bs4', 'beautifulsoup4')
 try_import('fastapi')
 try_import('uvicorn')
-try_import('sklearn')
 try_import('numpy')
 try_import('umap')
+try_import('sklearn')
 
 import ollama
 import requests
@@ -37,8 +37,8 @@ from fastapi.responses import JSONResponse, StreamingResponse, FileResponse, Red
 from fastapi.middleware.cors import CORSMiddleware
 import uvicorn
 import numpy as np
-#from sklearn.manifold import TSNE
 import umap
+from sklearn.preprocessing import MinMaxScaler
 
 from utils import (
     rag as Rag,
@@ -382,7 +382,6 @@ class WebServer:
 
         @self.app.on_event("startup")
         async def startup_event():
-
             # Start the file watcher
             self.observer, self.file_watcher = Rag.start_file_watcher(
                 llm=client,
@@ -475,8 +474,8 @@ class WebServer:
         # "document_count": file_watcher.collection.count()
         # }
 
-        @self.app.put('/api/tsne/{context_id}')
-        async def put_tsne(context_id: str, request: Request):
+        @self.app.put('/api/umap/{context_id}')
+        async def put_umap(context_id: str, request: Request):
             if not self.file_watcher:
                 return
 
@@ -495,7 +494,7 @@ class WebServer:
             try:
                 result = self.file_watcher.collection.get(include=['embeddings', 'documents', 'metadatas'])
                 vectors = np.array(result['embeddings'])
-                umap_model = umap.UMAP(n_components=dimensions, random_state=42)
+                umap_model = umap.UMAP(n_components=dimensions, random_state=42) #, n_neighbors=15, min_dist=0.1)
                 embedding = umap_model.fit_transform(vectors)
                 context['umap_model'] = umap_model
                 result['embeddings'] = embedding.tolist()
@@ -531,9 +530,8 @@ class WebServer:
                 if not chroma_results:
                     return JSONResponse({"error": "No results found"}, status_code=404)
                 chroma_embedding = chroma_results["query_embedding"]
-                normalized = (chroma_embedding - chroma_embedding.min()) / (chroma_embedding.max() - chroma_embedding.min())
-                vector_embedding = context["umap_model"].transform([normalized])[0].tolist()
-                return JSONResponse({ **chroma_results, "query": query, "vector_embedding": vector_embedding })
+                umap_embedding = context["umap_model"].transform([chroma_embedding])[0].tolist()
+                return JSONResponse({ **chroma_results, "query": query, "vector_embedding": umap_embedding })
 
             except Exception as e:
                 logging.error(e)
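
With the rename from /api/tsne to /api/umap, the server fits a UMAP model over the raw collection embeddings, caches it in the session context, and the similarity endpoint now projects the raw query embedding through that same fitted model; the earlier min-max rescaling of the query before transform was dropped, so query and corpus share one coordinate space. A rough sketch of the flow, assuming the umap-learn API (collection_embeddings and query_embedding are placeholders):

    import numpy as np
    import umap

    # Fit once over the stored document embeddings (raw, unrescaled).
    vectors = np.array(collection_embeddings)
    umap_model = umap.UMAP(n_components=3, random_state=42)
    projected = umap_model.fit_transform(vectors)  # one 3-D point per document

    # Later: project a query embedding into the same space. Transforming the
    # raw embedding keeps it consistent with the vectors used in fit above.
    query_point = umap_model.transform([query_embedding])[0]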

src/utils/rag.py (134 changes)
@@ -35,7 +35,7 @@ __all__ = [
 
 class ChromaDBFileWatcher(FileSystemEventHandler):
     def __init__(self, llm, watch_directory, loop, persist_directory=None, collection_name="documents",
-                 chunk_size=1000, chunk_overlap=200, recreate=False):
+                 chunk_size=500, chunk_overlap=200, recreate=False):
         self.llm = llm
         self.watch_directory = watch_directory
         self.persist_directory = persist_directory or defines.persist_directory
@@ -47,8 +47,11 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         # Path for storing file hash state
         self.hash_state_path = os.path.join(self.persist_directory, f"{collection_name}_hash_state.json")
 
+        # Flag to track if this is a new collection
+        self.is_new_collection = False
+
         # Initialize ChromaDB collection
-        self.collection = self._get_vector_collection(recreate=recreate)
+        self._collection = self._get_vector_collection(recreate=recreate)
 
         # Setup text splitter
         self.text_splitter = CharacterTextSplitter(
@@ -61,17 +64,10 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         self.update_lock = asyncio.Lock()
         self.processing_files = set()
 
-        # Always scan for new/changed files at startup
-        self._update_file_hashes()
-
     @property
     def collection(self):
         return self._collection
 
-    @collection.setter
-    def collection(self, value):
-        self._collection = value
-
     def _save_hash_state(self):
         """Save the current file hash state to disk."""
         try:
@@ -98,12 +94,20 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
 
         return {}
 
-    def _update_file_hashes(self):
-        """Update file hashes by checking for new or modified files."""
+    async def scan_directory(self, process_all=False):
+        """
+        Scan directory for new, modified, or deleted files and update collection.
+
+        Args:
+            process_all: If True, process all files regardless of hash status
+        """
         # Check for new or modified files
         file_paths = glob.glob(os.path.join(self.watch_directory, "**/*"), recursive=True)
         files_checked = 0
-        files_changed = 0
+        files_processed = 0
+        files_to_process = []
 
+        logging.info(f"Starting directory scan. Found {len(file_paths)} total paths.")
+
         for file_path in file_paths:
             if os.path.isfile(file_path):
@@ -112,13 +116,13 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
                 if not current_hash:
                     continue
 
-                # If file is new or changed
-                if file_path not in self.file_hashes or self.file_hashes[file_path] != current_hash:
+                # If file is new, changed, or we're processing all files
+                if process_all or file_path not in self.file_hashes or self.file_hashes[file_path] != current_hash:
                     self.file_hashes[file_path] = current_hash
-                    files_changed += 1
-                    # Schedule an update for this file
-                    asyncio.run_coroutine_threadsafe(self.process_file_update(file_path), self.loop)
-                    logging.info(f"File changed: {file_path}")
+                    files_to_process.append(file_path)
+                    logging.info(f"File {'found' if process_all else 'changed'}: {file_path}")
+        logging.info(f"Found {len(files_to_process)} files to process after scanning {files_checked} files")
 
         # Check for deleted files
         deleted_files = []
@@ -127,17 +131,29 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
                 deleted_files.append(file_path)
                 # Schedule removal
                 asyncio.run_coroutine_threadsafe(self.remove_file_from_collection(file_path), self.loop)
+                # Don't block on result, just let it run
                 logging.info(f"File deleted: {file_path}")
 
         # Remove deleted files from hash state
         for file_path in deleted_files:
             del self.file_hashes[file_path]
 
-        logging.info(f"Checked {files_checked} files: {files_changed} new/changed, {len(deleted_files)} deleted")
+        # Process all discovered files using asyncio.gather with the existing loop
+        if files_to_process:
+            logging.info(f"Starting to process {len(files_to_process)} files")
+
+            for file_path in files_to_process:
+                async with self.update_lock:
+                    await self._update_document_in_collection(file_path)
+        else:
+            logging.info("No files to process")
+
         # Save the updated state
         self._save_hash_state()
 
+        logging.info(f"Scan complete: Checked {files_checked} files, processed {files_processed}, removed {len(deleted_files)}")
+        return files_processed
+
     async def process_file_update(self, file_path):
         """Process a file update event."""
         # Skip if already being processed
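
The reworked scan_directory no longer fires process_file_update coroutines at the loop for each changed file; it collects the paths and awaits each update itself under update_lock, so collection writes happen one at a time. A minimal sketch of that pattern, with hypothetical names:

    import asyncio

    async def process_sequentially(paths, lock: asyncio.Lock, update_one):
        # Lock-guarded, sequential updates: no two files mutate the collection at once.
        for path in paths:
            async with lock:
                await update_one(path)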
@@ -204,12 +220,23 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
             settings=chromadb.Settings(anonymized_telemetry=False)
         )
 
-        # Check if the collection exists and delete it if recreate is True
-        if recreate and os.path.exists(self.persist_directory):
-            try:
-                chroma_client.delete_collection(name=self.collection_name)
-            except Exception as e:
-                logging.error(f"Failed to delete existing collection: {e}")
+        # Check if the collection exists
+        try:
+            chroma_client.get_collection(self.collection_name)
+            collection_exists = True
+        except:
+            collection_exists = False
+
+        # If collection doesn't exist, mark it as new
+        if not collection_exists:
+            self.is_new_collection = True
+            logging.info(f"Creating new collection: {self.collection_name}")
+
+        # Delete if recreate is True
+        if recreate and collection_exists:
+            chroma_client.delete_collection(name=self.collection_name)
+            self.is_new_collection = True
+            logging.info(f"Recreating collection: {self.collection_name}")
 
         return chroma_client.get_or_create_collection(
             name=self.collection_name,
@@ -246,14 +273,17 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         """Split documents into chunks using the text splitter."""
         return self.text_splitter.split_documents(docs)
 
-    def get_embedding(self, text):
+    def get_embedding(self, text, normalize=True):
         """Generate embeddings using Ollama."""
         response = self.llm.embeddings(
             model=defines.model,
             prompt=text,
-            options={"num_ctx": defines.max_context}
+            options={"num_ctx": self.chunk_size * 3} # No need waste ctx space
         )
-        return self._normalize_embeddings(response["embedding"])
+        if normalize:
+            normalized = self._normalize_embeddings(response["embedding"])
+            return normalized
+        return response["embedding"]
 
     def add_embeddings_to_collection(self, chunks):
         """Add embeddings for chunks to the collection."""
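
get_embedding now caps the embedding context at three times the chunk size and can skip normalization. _normalize_embeddings is not part of this diff; the sketch below assumes a simple min-max rescale, purely for illustration:

    def normalize_embedding(vec):
        # Assumption: _normalize_embeddings rescales roughly like this min-max.
        lo, hi = min(vec), max(vec)
        return [0.0] * len(vec) if hi == lo else [(v - lo) / (hi - lo) for v in vec]

    def get_embedding(llm, model, text, num_ctx, normalize=True):
        # llm is an ollama client; embeddings() is its legacy embedding call.
        response = llm.embeddings(model=model, prompt=text, options={"num_ctx": num_ctx})
        return normalize_embedding(response["embedding"]) if normalize else response["embedding"]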
@@ -293,18 +323,6 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
             "metadatas": results["metadatas"][0],
         }
 
-    def _initialize_file_hashes(self):
-        """Initialize the hash dictionary for all files in the directory."""
-        file_paths = glob.glob(os.path.join(self.watch_directory, "**/*"), recursive=True)
-        for file_path in file_paths:
-            if os.path.isfile(file_path):
-                hash = self._get_file_hash(file_path)
-                if hash:
-                    self.file_hashes[file_path] = hash
-
-        # Save the initialized hash state
-        self._save_hash_state()
-
     def _get_file_hash(self, file_path):
         """Calculate MD5 hash of a file."""
         try:
@@ -358,6 +376,7 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
     async def _update_document_in_collection(self, file_path):
         """Update a document in the ChromaDB collection."""
         try:
+            logging.info(f"Updating document in collection: {file_path}")
             # Remove existing entries for this file
             existing_results = self.collection.get(where={"path": file_path})
             if existing_results and 'ids' in existing_results and existing_results['ids']:
@@ -387,25 +406,27 @@ class ChromaDBFileWatcher(FileSystemEventHandler):
         except Exception as e:
             logging.error(f"Error updating document in collection: {e}")
 
-    def initialize_collection(self):
+    async def initialize_collection(self):
         """Initialize the collection with all documents from the watch directory."""
-        documents = self.load_text_files()
-        logging.info(f"Documents loaded: {len(documents)}")
-
-        chunks = self.create_chunks_from_documents(documents)
-        self.add_embeddings_to_collection(chunks)
-
-        logging.info(f"Vectorstore created with {self.collection.count()} documents")
-
-        # Display document types
-        doc_types = set(chunk.metadata['doc_type'] for chunk in chunks)
-        logging.info(f"Document types: {doc_types}")
+        # Process all files regardless of hash state
+        num_processed = await self.scan_directory(process_all=True)
+
+        logging.info(f"Vectorstore initialized with {self.collection.count()} documents")
+
+        # Show stats
+        try:
+            all_metadata = self.collection.get()['metadatas']
+            if all_metadata:
+                doc_types = set(m.get('doc_type', 'unknown') for m in all_metadata)
+                logging.info(f"Document types: {doc_types}")
+        except Exception as e:
+            logging.error(f"Error getting document types: {e}")
 
-        return len(chunks)
+        return num_processed
 
 # Function to start the file watcher
 def start_file_watcher(llm, watch_directory, persist_directory=None,
-                       collection_name="documents", recreate=False):
+                       collection_name="documents", initialize=False, recreate=False):
     """
     Start watching a directory for file changes.
 
@@ -414,6 +435,7 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
         watch_directory: Directory to watch for changes
         persist_directory: Directory to persist ChromaDB and hash state
         collection_name: Name of the ChromaDB collection
+        initialize: Whether to forcibly initialize the collection with all documents
         recreate: Whether to recreate the collection (will delete existing)
     """
     loop = asyncio.get_event_loop()
@@ -427,9 +449,17 @@ def start_file_watcher(llm, watch_directory, persist_directory=None,
         recreate=recreate
     )
 
-    # Initialize collection if it does not exist
-    if not os.path.exists(file_watcher.hash_state_path):
-        file_watcher.initialize_collection()
+    # Process all files if:
+    # 1. initialize=True was passed (explicit request to initialize)
+    # 2. This is a new collection (doesn't exist yet)
+    # 3. There's no hash state (first run)
+    if initialize or file_watcher.is_new_collection or not file_watcher.file_hashes:
+        logging.info("Initializing collection with all documents")
+        asyncio.run_coroutine_threadsafe(file_watcher.initialize_collection(), loop)
+    else:
+        # Only process new/changed files
+        logging.info("Scanning for new/changed documents")
+        asyncio.run_coroutine_threadsafe(file_watcher.scan_directory(), loop)
 
     # Start observer
     observer = Observer()
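
start_file_watcher now schedules the initial scan with asyncio.run_coroutine_threadsafe, which assumes the captured loop runs (or will run) on another thread; the coroutine executes once that loop is live. A condensed sketch of the decision logic, with a hypothetical watcher object standing in:

    import asyncio

    def schedule_initial_scan(watcher, loop, initialize=False):
        # Full rebuild when explicitly requested, when the collection is brand
        # new, or when there is no hash state yet; otherwise only changed files.
        if initialize or watcher.is_new_collection or not watcher.file_hashes:
            asyncio.run_coroutine_threadsafe(watcher.initialize_collection(), loop)
        else:
            asyncio.run_coroutine_threadsafe(watcher.scan_directory(), loop)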