From f105ab62774c8e3561835f8994898a6c99b87a20 Mon Sep 17 00:00:00 2001 From: root Date: Fri, 24 Apr 2026 00:57:53 -0400 Subject: [PATCH] Initial commit: RAG system for control theory Q&A Ollama + FAISS based retrieval-augmented generation system that indexes Wikipedia articles on automatic control theory and answers questions in Russian. Co-Authored-By: Claude Opus 4.7 --- .gitignore | 5 ++ README.md | 46 +++++++++++++++ data/sources.txt | 40 +++++++++++++ rag/index.py | 148 +++++++++++++++++++++++++++++++++++++++++++++++ rag/query.py | 137 +++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 4 ++ 6 files changed, 380 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 data/sources.txt create mode 100644 rag/index.py create mode 100644 rag/query.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..dc4e385 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +__pycache__/ +*.pyc +.venv/ +rag/store/ +output/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..e482cea --- /dev/null +++ b/README.md @@ -0,0 +1,46 @@ +# RAG: Теория автоматического управления + +Retrieval-Augmented Generation система для ответов на вопросы по теории автоматического управления и электротехнике. + +## Архитектура + +``` +data/sources.txt → rag/index.py → rag/store/ + ├── faiss.index + ├── chunks.pkl + └── meta.pkl + +rag/query.py "вопрос" → FAISS поиск → Ollama generate → ответ +``` + +## Компоненты + +- **Ollama** — локальный LLM-сервер (192.168.0.47:11434) + - Эмбеддинги: `bge-m3` + - Генерация: `qwen3.5:9b` +- **FAISS** — векторный индекс для поиска похожих фрагментов +- **BeautifulSoup** — парсинг веб-страниц + +## Источники + +Wikipedia-статьи (RU/EN) по темам: передаточные функции, обратная связь, устойчивость, PID-регулятор, Боде, Найквист, корневой годограф, RLC-цепи, импеданс, резонанс. 
+ +## Запуск + +```bash +source .venv/bin/activate + +# Индексация (уже выполнена) +python rag/index.py [--rebuild] + +# Запрос +python rag/query.py "Что такое PID-регулятор?" +``` + +## Переменные окружения + +| Переменная | По умолчанию | Описание | +|-------------------|----------------------------|-----------------------| +| `OLLAMA_HOST` | `http://192.168.0.47:11434`| Адрес Ollama | +| `EMBED_MODEL` | `bge-m3` | Модель эмбеддингов | +| `GENERATE_MODEL` | `qwen3.5:9b` | Модель генерации | diff --git a/data/sources.txt b/data/sources.txt new file mode 100644 index 0000000..395c05b --- /dev/null +++ b/data/sources.txt @@ -0,0 +1,40 @@ +# Теория управления — источники для RAG-индексации +# Строки с # — комментарии, пустые строки игнорируются + +# Передаточные функции и основы +https://ru.wikipedia.org/wiki/%D0%9F%D0%B5%D1%80%D0%B5%D0%B4%D0%B0%D1%82%D0%BE%D1%87%D0%BD%D0%B0%D1%8F_%D1%84%D1%83%D0%BD%D0%BA%D1%86%D0%B8%D1%8F +https://en.wikipedia.org/wiki/Transfer_function + +# Обратная связь +https://ru.wikipedia.org/wiki/%D0%9E%D0%B1%D1%80%D0%B0%D1%82%D0%BD%D0%B0%D1%8F_%D1%81%D0%B2%D1%8F%D0%B7%D1%8C +https://en.wikipedia.org/wiki/Feedback + +# Устойчивость систем управления +https://ru.wikipedia.org/wiki/%D0%A3%D1%81%D1%82%D0%BE%D0%B9%D1%87%D0%B8%D0%B2%D0%BE%D1%81%D1%82%D1%8C_%D1%81%D0%B8%D1%81%D1%82%D0%B5%D0%BC_%D1%83%D0%BF%D1%80%D0%B0%D0%B2%D0%BB%D0%B5%D0%BD%D0%B8%D1%8F + +# PID-регулятор +https://ru.wikipedia.org/wiki/PID-%D0%BA%D0%BE%D0%BD%D1%82%D1%80%D0%BE%D0%BB%D0%BB%D0%B5%D1%80 +https://en.wikipedia.org/wiki/PID_controller + +# АЧХ, ФЧХ, метод Боде +https://en.wikipedia.org/wiki/Bode_plot + +# Годограф Найквиста +https://en.wikipedia.org/wiki/Nyquist_stability_criterion + +# Корневой годограф +https://en.wikipedia.org/wiki/Root_locus_analysis + +# Импульсная переходная функция +https://en.wikipedia.org/wiki/Impulse_response + +# RLC-цепи +https://ru.wikipedia.org/wiki/RLC-%D1%86%D0%B5%D0%BF%D1%8C +https://en.wikipedia.org/wiki/RLC_circuit + +# Импеданс 
+https://ru.wikipedia.org/wiki/%D0%98%D0%BC%D0%BF%D0%B5%D0%B4%D0%B0%D0%BD%D1%81
+https://en.wikipedia.org/wiki/Electrical_impedance
+
+# Резонанс
+https://en.wikipedia.org/wiki/Resonance
diff --git a/rag/index.py b/rag/index.py
new file mode 100644
index 0000000..fc1d124
--- /dev/null
+++ b/rag/index.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""
+RAG indexer: load URLs → chunk text → embed via Ollama → store in FAISS.
+Usage: python rag/index.py [--rebuild]
+"""
+
+import os
+import re
+import sys
+import pickle
+import hashlib
+import time
+from pathlib import Path
+
+import requests
+import numpy as np
+import faiss
+from bs4 import BeautifulSoup
+
+OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://192.168.0.47:11434")
+EMBED_MODEL = os.getenv("EMBED_MODEL", "bge-m3")
+CHUNK_SIZE = 500
+CHUNK_OVERLAP = 100
+BATCH_SIZE = 32
+
+BASE_DIR = Path(__file__).resolve().parent.parent
+STORE_DIR = BASE_DIR / "rag" / "store"
+SOURCES_FILE = BASE_DIR / "data" / "sources.txt"
+INDEX_PATH = STORE_DIR / "faiss.index"
+CHUNKS_PATH = STORE_DIR / "chunks.pkl"
+META_PATH = STORE_DIR / "meta.pkl"
+
+
+def load_sources(path: Path) -> list[str]:
+    """Return the URLs listed in *path*, skipping blank lines and # comments."""
+    # sources.txt carries Cyrillic comments, so decode as UTF-8 explicitly
+    # instead of relying on the platform's locale encoding.
+    with open(path, encoding="utf-8") as f:
+        stripped = (line.strip() for line in f)
+        return [line for line in stripped if line and not line.startswith("#")]
+
+
+def fetch_page(url: str) -> str:
+    """Download *url* and return its visible text with whitespace normalized.
+
+    Raises requests.RequestException on network failure or a non-2xx response.
+    """
+    resp = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
+    resp.raise_for_status()
+    soup = BeautifulSoup(resp.text, "html.parser")
+    # Drop scripts, styling and page chrome so only article text is indexed.
+    for tag in soup(["script", "style", "nav", "footer", "header", "aside", "noscript"]):
+        tag.decompose()
+    text = soup.get_text(separator="\n", strip=True)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = re.sub(r"[ \t]+", " ", text)
+    return text
+
+
+def _split_long(sentence: str, size: int) -> list[str]:
+    """Break an over-long sentence into word-aligned pieces of at most *size* chars.
+
+    A single word longer than *size* is kept whole.
+    """
+    pieces = []
+    current = ""
+    for word in sentence.split():
+        if current and len(current) + 1 + len(word) > size:
+            pieces.append(current)
+            current = word
+        else:
+            current = f"{current} {word}" if current else word
+    if current:
+        pieces.append(current)
+    return pieces
+
+
+def chunk_text(text: str, size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
+    """Split *text* into sentence-aligned chunks of roughly *size* characters.
+
+    Each new chunk starts with up to *overlap* characters of whole words taken
+    from the end of the previous chunk. Returns [] for empty text.
+    """
+    sentences = []
+    for sent in re.split(r"(?<=[.!?])\s+", text):
+        # Lists and tables have no sentence punctuation and would otherwise
+        # become a single chunk far larger than *size*.
+        sentences.extend(_split_long(sent, size) if len(sent) > size else [sent])
+
+    chunks = []
+    current = ""
+    for sent in sentences:
+        if len(current) + len(sent) > size and current:
+            chunks.append(current.strip())
+            # Walk backwards over the finished chunk to collect the overlap tail.
+            overlap_words = []
+            char_count = 0
+            for w in reversed(current.split()):
+                if char_count + len(w) + 1 > overlap:
+                    break
+                overlap_words.append(w)
+                char_count += len(w) + 1
+            overlap_words.reverse()
+            current = " ".join(overlap_words) + " " + sent
+        else:
+            current += " " + sent if current else sent
+    if current.strip():
+        chunks.append(current.strip())
+    return chunks
+
+
+def get_embeddings(texts: list[str]) -> np.ndarray:
+    """Embed *texts* with Ollama and return a float32 array, one row per text.
+
+    Raises requests.HTTPError on a failed request and RuntimeError when the
+    server returns a different number of vectors than texts were sent.
+    """
+    resp = requests.post(
+        f"{OLLAMA_HOST}/api/embed",
+        json={"model": EMBED_MODEL, "input": texts},
+        timeout=120,
+    )
+    resp.raise_for_status()
+    vectors = resp.json()["embeddings"]
+    # A short reply would silently misalign vectors and chunks in the index.
+    if len(vectors) != len(texts):
+        raise RuntimeError(f"Expected {len(texts)} embeddings, got {len(vectors)}")
+    return np.array(vectors, dtype=np.float32)
+
+
+def main():
+    """Build the FAISS index and the chunk/metadata pickles from sources.txt."""
+    rebuild = "--rebuild" in sys.argv
+
+    # query.py needs all three files, so a partial store counts as missing.
+    store_files = (INDEX_PATH, CHUNKS_PATH, META_PATH)
+    if not rebuild and all(p.exists() for p in store_files):
+        print("Index already exists. Use --rebuild to reindex.")
+        return
+
+    urls = load_sources(SOURCES_FILE)
+    print(f"Loaded {len(urls)} source URLs")
+
+    all_chunks = []
+    all_meta = []
+
+    for i, url in enumerate(urls, 1):
+        print(f"[{i}/{len(urls)}] Fetching {url}...", end=" ", flush=True)
+        try:
+            text = fetch_page(url)
+        except Exception as e:
+            # Best effort: one unreachable or unparsable page must not abort the run.
+            print(f"ERROR: {e}")
+            continue
+        chunks = chunk_text(text)
+        print(f"{len(chunks)} chunks")
+        all_chunks.extend(chunks)
+        all_meta.extend({"url": url, "chunk_idx": j} for j in range(len(chunks)))
+
+    print(f"\nTotal chunks: {len(all_chunks)}")
+    if not all_chunks:
+        print("No chunks to index!")
+        return
+
+    print(f"Generating embeddings ({EMBED_MODEL})...")
+    n_batches = (len(all_chunks) - 1) // BATCH_SIZE + 1
+    all_embeddings = []
+    for n, start in enumerate(range(0, len(all_chunks), BATCH_SIZE), 1):
+        print(f" Batch {n}/{n_batches}...", flush=True)
+        all_embeddings.append(get_embeddings(all_chunks[start : start + BATCH_SIZE]))
+
+    embeddings = np.vstack(all_embeddings)
+    # Unit-length vectors make the inner-product index rank by cosine similarity.
+    faiss.normalize_L2(embeddings)
+    dim = embeddings.shape[1]
+    print(f"Embedding dimension: {dim}")
+
+    index = faiss.IndexFlatIP(dim)
+    index.add(embeddings)
+
+    STORE_DIR.mkdir(parents=True, exist_ok=True)
+    faiss.write_index(index, str(INDEX_PATH))
+    with open(CHUNKS_PATH, "wb") as f:
+        pickle.dump(all_chunks, f)
+    with open(META_PATH, "wb") as f:
+        pickle.dump(all_meta, f)
+
+    print(f"Saved index ({index.ntotal} vectors) to {STORE_DIR}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/rag/query.py b/rag/query.py
new file mode 100644
index 0000000..67ebb65
--- /dev/null
+++ b/rag/query.py
@@ -0,0 +1,149 @@
+#!/usr/bin/env python3
+"""
+RAG query: search indexed documents and generate answer via Ollama.
+Usage: python rag/query.py "your question"
+"""
+
+import os
+import sys
+import pickle
+from pathlib import Path
+
+import requests
+import numpy as np
+import faiss
+
+OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://192.168.0.47:11434")
+EMBED_MODEL = os.getenv("EMBED_MODEL", "bge-m3")
+GENERATE_MODEL = os.getenv("GENERATE_MODEL", "qwen3.5:9b")
+TOP_K = 5
+
+STORE_DIR = Path(__file__).resolve().parent / "store"
+INDEX_PATH = STORE_DIR / "faiss.index"
+CHUNKS_PATH = STORE_DIR / "chunks.pkl"
+META_PATH = STORE_DIR / "meta.pkl"
+
+SYSTEM_PROMPT = """Ты — эксперт по теории автоматического управления и электротехнике.
+Отвечай на вопросы, опираясь ТОЛЬКО на предоставленный контекст.
+Если в контексте нет информации для ответа, скажи об этом.
+Отвечай на русском языке, точно и по существу.
+Указывай источники, из которых взята информация."""
+
+
+def load_index():
+    """Load the FAISS index with its chunk texts and per-chunk metadata.
+
+    Exits with status 1 when the store is missing, incomplete or inconsistent.
+    """
+    if not all(p.exists() for p in (INDEX_PATH, CHUNKS_PATH, META_PATH)):
+        print("Index not found! Run: python rag/index.py")
+        sys.exit(1)
+    index = faiss.read_index(str(INDEX_PATH))
+    # NOTE: pickle can execute code on load; only open stores written by rag/index.py.
+    with open(CHUNKS_PATH, "rb") as f:
+        chunks = pickle.load(f)
+    with open(META_PATH, "rb") as f:
+        meta = pickle.load(f)
+    # Vector i must describe chunks[i] and meta[i]; otherwise search() mislabels results.
+    if not index.ntotal == len(chunks) == len(meta):
+        print("Index is inconsistent! Run: python rag/index.py --rebuild")
+        sys.exit(1)
+    return index, chunks, meta
+
+
+def get_embedding(text: str) -> np.ndarray:
+    """Embed *text* with Ollama and return a float32 array with a single row."""
+    resp = requests.post(
+        f"{OLLAMA_HOST}/api/embed",
+        json={"model": EMBED_MODEL, "input": [text]},
+        timeout=60,
+    )
+    resp.raise_for_status()
+    return np.array(resp.json()["embeddings"], dtype=np.float32)
+
+
+def search(query: str, index, chunks, meta, k: int = TOP_K):
+    """Return up to *k* best-matching chunks for *query*, best first.
+
+    Each result is a dict with "chunk", "meta" and "score" keys.
+    """
+    # FAISS rejects k == 0, and asking for more than ntotal only yields -1 padding.
+    k = min(k, index.ntotal)
+    if k <= 0:
+        return []
+    q_emb = get_embedding(query)
+    faiss.normalize_L2(q_emb)
+    scores, indices = index.search(q_emb, k)
+    return [
+        {"chunk": chunks[idx], "meta": meta[idx], "score": float(score)}
+        for score, idx in zip(scores[0], indices[0])
+        if idx >= 0
+    ]
+
+
+def generate(query: str, context_chunks: list[dict]) -> str:
+    """Ask the generation model to answer *query* using only *context_chunks*."""
+    context_parts = [
+        f"[Источник {i}] ({r['meta']['url']})\n{r['chunk']}"
+        for i, r in enumerate(context_chunks, 1)
+    ]
+    context = "\n\n---\n\n".join(context_parts)
+
+    prompt = f"""Контекст из документов:
+
+{context}
+
+---
+
+Вопрос: {query}
+
+Ответ:"""
+
+    resp = requests.post(
+        f"{OLLAMA_HOST}/api/generate",
+        json={
+            "model": GENERATE_MODEL,
+            "system": SYSTEM_PROMPT,
+            "prompt": prompt,
+            "stream": False,
+            "think": False,
+            "options": {"temperature": 0.3, "num_predict": 2048},
+        },
+        timeout=300,
+    )
+    resp.raise_for_status()
+    return resp.json()["response"]
+
+
+def main():
+    """Answer the question given on the command line and list the sources used."""
+    if len(sys.argv) < 2:
+        print("Usage: python rag/query.py \"ваш вопрос\"")
+        sys.exit(1)
+
+    query = " ".join(sys.argv[1:])
+    print(f"Query: {query}\n")
+
+    index, chunks, meta = load_index()
+    print(f"Index: {index.ntotal} vectors")
+
+    results = search(query, index, chunks, meta)
+    print(f"Top-{len(results)} results:\n")
+    for i, r in enumerate(results, 1):
+        print(f" [{i}] score={r['score']:.4f} {r['meta']['url']}")
+        print(f" {r['chunk'][:120]}...\n")
+
+    print("Generating answer...\n")
+    answer = generate(query, results)
+    print("=" * 60)
+    print(answer)
+    print("=" * 60)
+
+    print("\nSources:")
+    # dict.fromkeys de-duplicates the URLs while keeping retrieval order.
+    for url in dict.fromkeys(r["meta"]["url"] for r in results):
+        print(f" - {url}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..df26336
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+requests
+beautifulsoup4
+faiss-cpu
+numpy