#!/usr/bin/env python3
"""
RAG indexer: load URLs → chunk text → embed via Ollama → store in FAISS.

Usage: python rag/index.py [--rebuild]
"""
import os
import pickle
import re
import sys
from pathlib import Path

import faiss
import numpy as np
import requests
from bs4 import BeautifulSoup

OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://192.168.0.47:11434")
EMBED_MODEL = os.getenv("EMBED_MODEL", "bge-m3")
CHUNK_SIZE = 500      # target chunk length in characters
CHUNK_OVERLAP = 100   # characters of trailing context carried into the next chunk
BATCH_SIZE = 32       # chunks per embedding request

BASE_DIR = Path(__file__).resolve().parent.parent
STORE_DIR = BASE_DIR / "rag" / "store"
SOURCES_FILE = BASE_DIR / "data" / "sources.txt"
INDEX_PATH = STORE_DIR / "faiss.index"
CHUNKS_PATH = STORE_DIR / "chunks.pkl"
META_PATH = STORE_DIR / "meta.pkl"


def load_sources(path: Path) -> list[str]:
    """Read one URL per line, skipping blanks and # comments."""
    urls = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith("#"):
                urls.append(line)
    return urls


def fetch_page(url: str) -> str:
    """Fetch a page and return its visible text with page chrome stripped."""
    resp = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    # Drop non-content elements before extracting text.
    for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)  # collapse runs of blank lines
    text = re.sub(r"[ \t]+", " ", text)     # collapse horizontal whitespace
    return text


def chunk_text(text: str, size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split text into ~size-character chunks on sentence boundaries,
    carrying roughly `overlap` characters of trailing words into the next chunk."""
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks = []
    current = ""
    for sent in sentences:
        if len(current) + len(sent) > size and current:
            chunks.append(current.strip())
            # Seed the next chunk with the last ~overlap characters' worth of
            # whole words, so context spans chunk boundaries.
            words = current.split()
            overlap_words = []
            char_count = 0
            for w in reversed(words):
                if char_count + len(w) + 1 > overlap:
                    break
                overlap_words.insert(0, w)
                char_count += len(w) + 1
            current = " ".join(overlap_words) + " " + sent
        else:
            current += (" " + sent) if current else sent
    if current.strip():
        chunks.append(current.strip())
    return chunks


def get_embeddings(texts: list[str]) -> np.ndarray:
    """Embed a batch of texts via Ollama's /api/embed endpoint."""
    resp = requests.post(
        f"{OLLAMA_HOST}/api/embed",
        json={"model": EMBED_MODEL, "input": texts},
        timeout=120,
    )
    resp.raise_for_status()
    data = resp.json()
    return np.array(data["embeddings"], dtype=np.float32)
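
# Illustrative smoke test (an addition, not part of the original pipeline):
# embeds two short strings and checks that /api/embed returned one vector per
# input. Useful for verifying OLLAMA_HOST/EMBED_MODEL before a long indexing run.
def _embedding_smoke_test() -> None:
    vecs = get_embeddings(["hello world", "goodbye world"])
    assert vecs.shape[0] == 2, "expected one embedding per input"
    print(f"embed ok: {vecs.shape[0]} vectors of dimension {vecs.shape[1]}")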

def main():
    rebuild = "--rebuild" in sys.argv
    if not rebuild and INDEX_PATH.exists() and CHUNKS_PATH.exists():
        print("Index already exists. Use --rebuild to reindex.")
        return

    urls = load_sources(SOURCES_FILE)
    print(f"Loaded {len(urls)} source URLs")

    # Fetch and chunk every source; a failed URL is skipped, not fatal.
    all_chunks = []
    all_meta = []
    for i, url in enumerate(urls):
        print(f"[{i+1}/{len(urls)}] Fetching {url}...", end=" ", flush=True)
        try:
            text = fetch_page(url)
            chunks = chunk_text(text)
            print(f"{len(chunks)} chunks")
            for j, chunk in enumerate(chunks):
                all_chunks.append(chunk)
                all_meta.append({"url": url, "chunk_idx": j})
        except Exception as e:
            print(f"ERROR: {e}")
            continue

    print(f"\nTotal chunks: {len(all_chunks)}")
    if not all_chunks:
        print("No chunks to index!")
        return

    # Embed in batches to keep request payloads small.
    print(f"Generating embeddings ({EMBED_MODEL})...")
    all_embeddings = []
    for start in range(0, len(all_chunks), BATCH_SIZE):
        batch = all_chunks[start : start + BATCH_SIZE]
        print(f"  Batch {start//BATCH_SIZE + 1}/{(len(all_chunks)-1)//BATCH_SIZE + 1}...", flush=True)
        embs = get_embeddings(batch)
        all_embeddings.append(embs)
    embeddings = np.vstack(all_embeddings)

    # L2-normalize so inner-product search (IndexFlatIP) ranks by cosine similarity.
    faiss.normalize_L2(embeddings)
    dim = embeddings.shape[1]
    print(f"Embedding dimension: {dim}")
    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)

    # Persist the index alongside the chunk texts and their metadata.
    STORE_DIR.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(INDEX_PATH))
    with open(CHUNKS_PATH, "wb") as f:
        pickle.dump(all_chunks, f)
    with open(META_PATH, "wb") as f:
        pickle.dump(all_meta, f)
    print(f"Saved index ({index.ntotal} vectors) to {STORE_DIR}")


if __name__ == "__main__":
    main()
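
# Retrieval sketch (illustrative; this script only builds the index, and
# _search is a hypothetical helper, not a CLI feature). A query must go
# through the same embedding model and the same L2 normalization as the
# indexed chunks, so that IndexFlatIP's inner products are cosine similarities.
def _search(query: str, k: int = 5) -> list[dict]:
    index = faiss.read_index(str(INDEX_PATH))
    with open(CHUNKS_PATH, "rb") as f:
        chunks = pickle.load(f)
    with open(META_PATH, "rb") as f:
        meta = pickle.load(f)
    q = get_embeddings([query])
    faiss.normalize_L2(q)
    scores, ids = index.search(q, k)
    return [
        {"score": float(s), "url": meta[i]["url"], "text": chunks[i]}
        for s, i in zip(scores[0], ids[0])
        if i != -1  # FAISS pads with -1 when fewer than k results exist
    ]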