from typing import List, Dict, Any

import tiktoken

from . import defines


def get_encoding(model=defines.model):
    """Get the tokenizer for counting tokens."""
    try:
        # cl100k_base is the default encoding used by many embedding models
        return tiktoken.get_encoding("cl100k_base")
    except Exception:
        # Fall back to the encoding registered for the configured model
        return tiktoken.encoding_for_model(model)


def count_tokens(text: str) -> int:
    """Count the number of tokens in a text string."""
    encoding = get_encoding()
    return len(encoding.encode(text))


def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:
    """
    Split a text into chunks based on token count, with overlap between chunks.

    Args:
        text: The text to split into chunks
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between consecutive chunks

    Returns:
        List of text chunks
    """
    if not text or max_tokens <= 0:
        return []

    # Clamp the overlap so the window always advances: an overlap >= max_tokens
    # would otherwise make the loop below spin forever on long inputs.
    overlap = min(max(overlap, 0), max_tokens - 1)

    encoding = get_encoding()
    tokens = encoding.encode(text)
    chunks = []

    i = 0
    while i < len(tokens):
        # Decode the current window of tokens back into text
        chunk_end = min(i + max_tokens, len(tokens))
        chunks.append(encoding.decode(tokens[i:chunk_end]))

        # Stop once the final token has been emitted
        if chunk_end == len(tokens):
            break
        # Advance the window, keeping `overlap` tokens of shared context
        i += max_tokens - overlap

    return chunks
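

# Worked example of the stride arithmetic (illustrative numbers, not from the
# original source): with max_tokens=512 and overlap=50 the window advances
# 512 - 50 = 462 tokens per iteration, so a 1,500-token text produces chunks
# covering tokens [0:512], [462:974], [924:1436], and [1386:1500].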


def chunk_document(document: Dict[str, Any],
                   text_key: str = "text",
                   max_tokens: int = 512,
                   overlap: int = 50) -> List[Dict[str, Any]]:
    """
    Chunk a document dictionary into multiple chunks.

    Args:
        document: Document dictionary with metadata and text
        text_key: The key in the document that contains the text to chunk
        max_tokens: Maximum number of tokens per chunk
        overlap: Number of tokens to overlap between chunks

    Returns:
        List of document dictionaries, each with chunked text and preserved metadata
    """
    if text_key not in document:
        raise KeyError(f"{text_key} not in document")

    # Prefix the text with the title (when present) so every chunk keeps that context
    if "title" in document:
        text = f"{document['title']}: {document[text_key]}"
    else:
        text = document[text_key]
    chunks = chunk_text(text, max_tokens, overlap)

    # Create one document per chunk, preserving the original metadata
    chunked_docs = []
    for i, chunk in enumerate(chunks):
        # Shallow-copy the original fields, then swap in the chunk text
        doc_chunk = document.copy()
        doc_chunk[text_key] = chunk
        # Record this chunk's position and the total number of chunks
        doc_chunk["chunk_id"] = i
        doc_chunk["chunk_total"] = len(chunks)
        chunked_docs.append(doc_chunk)

    return chunked_docs
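

# Minimal usage sketch (not part of the original module; the sample document
# below is illustrative). Assumes the sibling `defines` module provides
# `defines.model` and that tiktoken is installed; run this file as a module
# (python -m ...) so the relative import resolves.
if __name__ == "__main__":
    sample = {"title": "Example", "text": "word " * 2000}
    pieces = chunk_document(sample, max_tokens=128, overlap=16)
    print(f"{count_tokens(sample['text'])} tokens -> {len(pieces)} chunks")
    for doc in pieces[:2]:
        print(doc["chunk_id"], doc["chunk_total"], count_tokens(doc["text"]))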