# ai-control-systems/rag/index.py

#!/usr/bin/env python3
"""
RAG indexer: load URLs → chunk text → embed via Ollama → store in FAISS.
Usage: python rag/index.py [--rebuild]
"""

import os
import re
import sys
import pickle
import hashlib
import time
from pathlib import Path

import requests
import numpy as np
import faiss
from bs4 import BeautifulSoup

# Base URL of the Ollama server; overridable through the OLLAMA_HOST
# environment variable. get_embeddings() appends "/api/embed" to it.
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://192.168.0.47:11434")
# Model name sent in the "model" field of every embedding request;
# overridable through the EMBED_MODEL environment variable.
EMBED_MODEL = os.getenv("EMBED_MODEL", "bge-m3")
# Default chunk size for chunk_text(), measured in characters (len()).
CHUNK_SIZE = 500
# Default character budget of trailing words repeated at the start of the
# next chunk by chunk_text().
CHUNK_OVERLAP = 100
# Number of chunks sent to the embedding endpoint per request in main().
BATCH_SIZE = 32

# Two directory levels above this file (this file's directory's parent).
BASE_DIR = Path(__file__).resolve().parent.parent
# Directory that receives the index and its companion pickle files.
STORE_DIR = BASE_DIR / "rag" / "store"
# Text file listing the source URLs, read by load_sources().
SOURCES_FILE = BASE_DIR / "data" / "sources.txt"
# Serialized FAISS index written by main().
INDEX_PATH = STORE_DIR / "faiss.index"
# Pickled list of chunk strings, in the order they were added to the index.
CHUNKS_PATH = STORE_DIR / "chunks.pkl"
# Pickled list of {"url", "chunk_idx"} dicts, parallel to the chunk list.
META_PATH = STORE_DIR / "meta.pkl"


def load_sources(path: Path) -> list[str]:
    """Read source URLs from a text file, one entry per line.

    Each line is stripped of surrounding whitespace. Blank lines and lines
    whose first non-blank character is ``#`` are skipped; every other line
    is returned verbatim (after stripping), in file order.

    Args:
        path: Location of the sources file.

    Returns:
        The non-empty, non-comment lines of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (URLs may contain non-ASCII characters).
    with open(path, encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith("#")]


def fetch_page(url: str) -> str:
    """Download a page and return its visible text.

    The page is fetched with a 30-second timeout and a browser-like
    User-Agent header. Script, style, navigation, footer, header, aside and
    noscript elements are removed before the text is extracted; the text
    nodes are stripped and joined with newlines, then runs of three or more
    newlines are collapsed to two and runs of spaces/tabs to a single space.

    Args:
        url: Address of the page to download.

    Returns:
        The cleaned text content of the page.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    resp = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()

    # requests falls back to ISO-8859-1 for text/* responses whose
    # Content-Type header carries no charset, which garbles UTF-8 pages that
    # declare their encoding only in a <meta> tag. Hand BeautifulSoup the raw
    # bytes so it can detect the encoding itself, and only force the header's
    # charset when the server actually sent one.
    content_type = resp.headers.get("Content-Type", "")
    declared = resp.encoding if "charset" in content_type.lower() else None
    soup = BeautifulSoup(resp.content, "html.parser", from_encoding=declared)

    # Drop elements that carry no article content.
    for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
        tag.decompose()

    text = soup.get_text(separator="\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text


def chunk_text(text: str, size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split text into sentence-aligned chunks with a word-level overlap.

    The text is split after ``.``, ``!`` or ``?`` followed by whitespace.
    Sentences are accumulated until adding the next one would push the
    combined character count (joining space not counted) above ``size``;
    the accumulated text is then emitted as a chunk. The next chunk starts
    with as many whole trailing words of the emitted chunk as fit into
    ``overlap`` characters (each word counted with one separator), followed
    by the sentence that triggered the flush.

    A sentence is never split, so a chunk can be longer than ``size``.

    Args:
        text: Text to split.
        size: Character threshold that triggers emitting a chunk.
        overlap: Character budget for the words carried into the next chunk.

    Returns:
        The stripped chunks in order; empty when the text yields only
        whitespace.
    """
    pieces = []
    buffer = ""
    for sentence in re.split(r"(?<=[.!?])\s+", text):
        # Nothing accumulated yet: the sentence starts the buffer as-is.
        if not buffer:
            buffer = sentence
            continue

        # Still within the threshold: keep accumulating.
        if len(buffer) + len(sentence) <= size:
            buffer = buffer + " " + sentence
            continue

        pieces.append(buffer.strip())

        # Walk the flushed buffer backwards, collecting whole words until
        # the overlap budget would be exceeded.
        tail = []
        used = 0
        for word in reversed(buffer.split()):
            used += len(word) + 1
            if used > overlap:
                break
            tail.append(word)
        tail.reverse()

        buffer = " ".join(tail) + " " + sentence

    remainder = buffer.strip()
    if remainder:
        pieces.append(remainder)
    return pieces


def get_embeddings(texts: list[str]) -> np.ndarray:
    """Request embeddings for a batch of texts from the Ollama server.

    Posts the texts as the ``input`` field, together with ``EMBED_MODEL``,
    to ``{OLLAMA_HOST}/api/embed`` with a 120-second timeout.

    Args:
        texts: Strings to embed in a single request.

    Returns:
        The ``"embeddings"`` field of the JSON response converted to a
        float32 array (presumably one row per input text — confirm against
        the Ollama API).

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
        KeyError: If the response has no ``"embeddings"`` field.
    """
    endpoint = f"{OLLAMA_HOST}/api/embed"
    payload = {"model": EMBED_MODEL, "input": texts}
    response = requests.post(endpoint, json=payload, timeout=120)
    response.raise_for_status()
    vectors = response.json()["embeddings"]
    return np.array(vectors, dtype=np.float32)


def main():
    """Build the FAISS store from the URLs listed in ``SOURCES_FILE``.

    Steps: load the URL list, fetch and chunk every page (pages that fail
    are reported and skipped), embed the chunks in batches of
    ``BATCH_SIZE``, L2-normalize the vectors, add them to an inner-product
    index, and write the index, the chunk list and the per-chunk metadata
    to ``STORE_DIR``.

    Unless ``--rebuild`` is present in ``sys.argv``, nothing is done when
    all three store files already exist.

    Raises:
        RuntimeError: If the number of embeddings returned does not match
            the number of chunks.
    """
    rebuild = "--rebuild" in sys.argv

    # The three artifacts are written together below, so the store only
    # counts as complete when every one of them is present.
    store_complete = all(p.exists() for p in (INDEX_PATH, CHUNKS_PATH, META_PATH))
    if not rebuild and store_complete:
        print("Index already exists. Use --rebuild to reindex.")
        return

    urls = load_sources(SOURCES_FILE)
    print(f"Loaded {len(urls)} source URLs")

    all_chunks = []
    all_meta = []

    for i, url in enumerate(urls):
        print(f"[{i+1}/{len(urls)}] Fetching {url}...", end=" ", flush=True)
        try:
            text = fetch_page(url)
            chunks = chunk_text(text)
        except Exception as e:
            # Deliberate best-effort: one bad page must not abort the run.
            print(f"ERROR: {e}")
            continue
        print(f"{len(chunks)} chunks")
        all_chunks.extend(chunks)
        all_meta.extend({"url": url, "chunk_idx": j} for j in range(len(chunks)))

    print(f"\nTotal chunks: {len(all_chunks)}")
    if not all_chunks:
        print("No chunks to index!")
        return

    print(f"Generating embeddings ({EMBED_MODEL})...")
    total_batches = (len(all_chunks) - 1) // BATCH_SIZE + 1
    all_embeddings = []
    for start in range(0, len(all_chunks), BATCH_SIZE):
        batch = all_chunks[start : start + BATCH_SIZE]
        print(f"  Batch {start//BATCH_SIZE + 1}/{total_batches}...", flush=True)
        all_embeddings.append(get_embeddings(batch))

    embeddings = np.vstack(all_embeddings)
    # Index row i must correspond to all_chunks[i]; refuse to persist a
    # store in which vectors and chunks are out of step.
    if embeddings.shape[0] != len(all_chunks):
        raise RuntimeError(
            f"Expected {len(all_chunks)} embeddings, got {embeddings.shape[0]}"
        )

    # With unit-length vectors the inner product equals cosine similarity.
    faiss.normalize_L2(embeddings)
    dim = embeddings.shape[1]
    print(f"Embedding dimension: {dim}")

    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)

    STORE_DIR.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(INDEX_PATH))
    with open(CHUNKS_PATH, "wb") as f:
        pickle.dump(all_chunks, f)
    with open(META_PATH, "wb") as f:
        pickle.dump(all_meta, f)

    print(f"Saved index ({index.ntotal} vectors) to {STORE_DIR}")


# Run the indexer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()