Initial commit: RAG system for control theory Q&A
Ollama + FAISS retrieval-augmented generation system that indexes Wikipedia articles on automatic control theory and answers questions in Russian.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
rag/index.py (new file, 148 lines)
@@ -0,0 +1,148 @@
#!/usr/bin/env python3
"""
RAG indexer: load URLs → chunk text → embed via Ollama → store in FAISS.

Usage: python rag/index.py [--rebuild]
"""

import os
import re
import sys
import pickle
from pathlib import Path

import requests
import numpy as np
import faiss
from bs4 import BeautifulSoup

OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://192.168.0.47:11434")
EMBED_MODEL = os.getenv("EMBED_MODEL", "bge-m3")
CHUNK_SIZE = 500      # chunk size in characters, not tokens
CHUNK_OVERLAP = 100   # characters of word-boundary overlap between chunks
BATCH_SIZE = 32       # chunks per embedding request

BASE_DIR = Path(__file__).resolve().parent.parent
STORE_DIR = BASE_DIR / "rag" / "store"
SOURCES_FILE = BASE_DIR / "data" / "sources.txt"
INDEX_PATH = STORE_DIR / "faiss.index"
CHUNKS_PATH = STORE_DIR / "chunks.pkl"
META_PATH = STORE_DIR / "meta.pkl"


def load_sources(path: Path) -> list[str]:
    """Read one URL per line; skip blanks and '#' comment lines."""
    urls = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith("#"):
                urls.append(line)
    return urls
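
# Assumed data/sources.txt layout (illustrative; the file itself is not shown
# in this diff). load_sources() only requires one URL per line, with "#" for
# comments, e.g.:
#
#   # Wikipedia, automatic control theory (Russian)
#   https://ru.wikipedia.org/wiki/Теория_автоматического_управления
#   https://ru.wikipedia.org/wiki/ПИД-регулятор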

def fetch_page(url: str) -> str:
    resp = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    # Drop non-content elements before extracting text.
    for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n", strip=True)
    text = re.sub(r"\n{3,}", "\n\n", text)   # collapse runs of blank lines
    text = re.sub(r"[ \t]+", " ", text)      # collapse runs of spaces/tabs
    return text
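
# Illustrative behavior: fetch_page("https://ru.wikipedia.org/wiki/ПИД-регулятор")
# yields the article's visible text with scripts and navigation removed and
# whitespace collapsed. Since only whole tags are dropped, some MediaWiki
# interface strings (e.g. inline edit links) may survive into the chunks.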

def chunk_text(text: str, size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Greedy sentence packing: fill a chunk up to ~size characters, then start
    the next chunk with the last ~overlap characters (whole words) of it."""
    sentences = re.split(r"(?<=[.!?])\s+", text)
    chunks = []
    current = ""
    for sent in sentences:
        if len(current) + len(sent) > size and current:
            chunks.append(current.strip())
            # Carry the tail words of the finished chunk over as overlap.
            words = current.split()
            overlap_words = []
            char_count = 0
            for w in reversed(words):
                if char_count + len(w) + 1 > overlap:
                    break
                overlap_words.insert(0, w)
                char_count += len(w) + 1
            current = " ".join(overlap_words) + " " + sent
        else:
            current += " " + sent if current else sent
    if current.strip():
        chunks.append(current.strip())
    return chunks
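
# Worked example (illustrative): with size=40 and overlap=10,
#   chunk_text("One two three. Four five six. Seven eight nine ten.", 40, 10)
# returns
#   ["One two three. Four five six.",
#    "five six. Seven eight nine ten."]
# i.e. the second chunk starts with the <=10-character word tail of the first.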

def get_embeddings(texts: list[str]) -> np.ndarray:
    resp = requests.post(
        f"{OLLAMA_HOST}/api/embed",
        json={"model": EMBED_MODEL, "input": texts},
        timeout=120,
    )
    resp.raise_for_status()
    data = resp.json()
    return np.array(data["embeddings"], dtype=np.float32)
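
# Assumed wire format for Ollama's batch embedding endpoint (POST /api/embed,
# available in newer Ollama releases):
#   request:  {"model": "bge-m3", "input": ["chunk one", "chunk two"]}
#   response: {"embeddings": [[...], [...]], ...}
# bge-m3 produces 1024-dimensional vectors, so the index below is built with
# dim == 1024 unless EMBED_MODEL is overridden.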

def main():
    rebuild = "--rebuild" in sys.argv

    if not rebuild and INDEX_PATH.exists() and CHUNKS_PATH.exists():
        print("Index already exists. Use --rebuild to reindex.")
        return

    urls = load_sources(SOURCES_FILE)
    print(f"Loaded {len(urls)} source URLs")

    all_chunks = []
    all_meta = []

    for i, url in enumerate(urls):
        print(f"[{i+1}/{len(urls)}] Fetching {url}...", end=" ", flush=True)
        try:
            text = fetch_page(url)
            chunks = chunk_text(text)
            print(f"{len(chunks)} chunks")
            for j, chunk in enumerate(chunks):
                all_chunks.append(chunk)
                all_meta.append({"url": url, "chunk_idx": j})
        except Exception as e:
            print(f"ERROR: {e}")
            continue

    print(f"\nTotal chunks: {len(all_chunks)}")
    if not all_chunks:
        print("No chunks to index!")
        return

    print(f"Generating embeddings ({EMBED_MODEL})...")
    all_embeddings = []
    for start in range(0, len(all_chunks), BATCH_SIZE):
        batch = all_chunks[start : start + BATCH_SIZE]
        print(f"  Batch {start // BATCH_SIZE + 1}/{(len(all_chunks) - 1) // BATCH_SIZE + 1}...", flush=True)
        embs = get_embeddings(batch)
        all_embeddings.append(embs)

    embeddings = np.vstack(all_embeddings)
    # L2-normalize so that inner-product search (IndexFlatIP) ranks by
    # cosine similarity.
    faiss.normalize_L2(embeddings)
    dim = embeddings.shape[1]
    print(f"Embedding dimension: {dim}")

    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)

    STORE_DIR.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(INDEX_PATH))
    with open(CHUNKS_PATH, "wb") as f:
        pickle.dump(all_chunks, f)
    with open(META_PATH, "wb") as f:
        pickle.dump(all_meta, f)

    print(f"Saved index ({index.ntotal} vectors) to {STORE_DIR}")


if __name__ == "__main__":
    main()
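
The commit message also promises Russian-language answers, but only the indexer is in this diff. A minimal sketch of the answering side, under stated assumptions: the script name rag/query.py, TOP_K, the qwen2.5 chat model, and the prompts are all hypothetical; only the /api/embed and /api/chat calls and the store paths mirror what this commit actually defines. Run from the repository root so the relative paths resolve.

#!/usr/bin/env python3
# rag/query.py (hypothetical sketch): embed a question, search FAISS, ask Ollama.
import pickle
import sys

import faiss
import numpy as np
import requests

OLLAMA_HOST = "http://192.168.0.47:11434"
TOP_K = 5  # assumed retrieval depth

index = faiss.read_index("rag/store/faiss.index")
with open("rag/store/chunks.pkl", "rb") as f:
    chunks = pickle.load(f)

question = sys.argv[1]
# Embed the question with the same model and normalization as the index.
resp = requests.post(f"{OLLAMA_HOST}/api/embed",
                     json={"model": "bge-m3", "input": [question]}, timeout=120)
q = np.array(resp.json()["embeddings"], dtype=np.float32)
faiss.normalize_L2(q)
scores, ids = index.search(q, TOP_K)
context = "\n\n".join(chunks[i] for i in ids[0])

# Ask a chat model to answer in Russian using the retrieved context.
chat = requests.post(f"{OLLAMA_HOST}/api/chat", json={
    "model": "qwen2.5",  # assumed chat model
    "stream": False,
    "messages": [
        {"role": "system", "content": "Отвечай на русском, опираясь на контекст."},
        {"role": "user", "content": f"Контекст:\n{context}\n\nВопрос: {question}"},
    ],
}, timeout=300)
print(chat.json()["message"]["content"])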