Initial commit: RAG system for control theory Q&A
Ollama + FAISS based retrieval-augmented generation system that indexes Wikipedia articles on automatic control theory and answers questions in Russian. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
5
.gitignore
vendored
Normal file
5
.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.venv/
|
||||||
|
rag/store/
|
||||||
|
output/
|
||||||
46
README.md
Normal file
46
README.md
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
# RAG: Теория автоматического управления
|
||||||
|
|
||||||
|
Retrieval-Augmented Generation система для ответов на вопросы по теории автоматического управления и электротехнике.
|
||||||
|
|
||||||
|
## Архитектура
|
||||||
|
|
||||||
|
```
|
||||||
|
data/sources.txt → rag/index.py → rag/store/
|
||||||
|
├── faiss.index
|
||||||
|
├── chunks.pkl
|
||||||
|
└── meta.pkl
|
||||||
|
|
||||||
|
rag/query.py "вопрос" → FAISS поиск → Ollama generate → ответ
|
||||||
|
```
|
||||||
|
|
||||||
|
## Компоненты
|
||||||
|
|
||||||
|
- **Ollama** — локальный LLM-сервер (192.168.0.47:11434)
|
||||||
|
- Эмбеддинги: `bge-m3`
|
||||||
|
- Генерация: `qwen3.5:9b`
|
||||||
|
- **FAISS** — векторный индекс для поиска похожих фрагментов
|
||||||
|
- **BeautifulSoup** — парсинг веб-страниц
|
||||||
|
|
||||||
|
## Источники
|
||||||
|
|
||||||
|
Wikipedia-статьи (RU/EN) по темам: передаточные функции, обратная связь, устойчивость, PID-регулятор, Боде, Найквист, корневой годограф, RLC-цепи, импеданс, резонанс.
|
||||||
|
|
||||||
|
## Запуск
|
||||||
|
|
||||||
|
```bash
|
||||||
|
source .venv/bin/activate
|
||||||
|
|
||||||
|
# Индексация (нужна при первом запуске: каталог rag/store/ не хранится в репозитории; --rebuild пересоздаёт индекс)
|
||||||
|
python rag/index.py [--rebuild]
|
||||||
|
|
||||||
|
# Запрос
|
||||||
|
python rag/query.py "Что такое PID-регулятор?"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Переменные окружения
|
||||||
|
|
||||||
|
| Переменная | По умолчанию | Описание |
|
||||||
|
|-------------------|----------------------------|-----------------------|
|
||||||
|
| `OLLAMA_HOST` | `http://192.168.0.47:11434`| Адрес Ollama |
|
||||||
|
| `EMBED_MODEL` | `bge-m3` | Модель эмбеддингов |
|
||||||
|
| `GENERATE_MODEL` | `qwen3.5:9b` | Модель генерации |
|
||||||
40
data/sources.txt
Normal file
40
data/sources.txt
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
# Теория управления — источники для RAG-индексации
|
||||||
|
# Строки с # — комментарии, пустые строки игнорируются
|
||||||
|
|
||||||
|
# Передаточные функции и основы
|
||||||
|
https://ru.wikipedia.org/wiki/%D0%9F%D0%B5%D1%80%D0%B5%D0%B4%D0%B0%D1%82%D0%BE%D1%87%D0%BD%D0%B0%D1%8F_%D1%84%D1%83%D0%BD%D0%BA%D1%86%D0%B8%D1%8F
|
||||||
|
https://en.wikipedia.org/wiki/Transfer_function
|
||||||
|
|
||||||
|
# Обратная связь
|
||||||
|
https://ru.wikipedia.org/wiki/%D0%9E%D0%B1%D1%80%D0%B0%D1%82%D0%BD%D0%B0%D1%8F_%D1%81%D0%B2%D1%8F%D0%B7%D1%8C
|
||||||
|
https://en.wikipedia.org/wiki/Feedback
|
||||||
|
|
||||||
|
# Устойчивость систем управления
|
||||||
|
https://ru.wikipedia.org/wiki/%D0%A3%D1%81%D1%82%D0%BE%D0%B9%D1%87%D0%B8%D0%B2%D0%BE%D1%81%D1%82%D1%8C_%D1%81%D0%B8%D1%81%D1%82%D0%B5%D0%BC_%D1%83%D0%BF%D1%80%D0%B0%D0%B2%D0%BB%D0%B5%D0%BD%D0%B8%D1%8F
|
||||||
|
|
||||||
|
# PID-регулятор
|
||||||
|
https://ru.wikipedia.org/wiki/PID-%D0%BA%D0%BE%D0%BD%D1%82%D1%80%D0%BE%D0%BB%D0%BB%D0%B5%D1%80
|
||||||
|
https://en.wikipedia.org/wiki/PID_controller
|
||||||
|
|
||||||
|
# АЧХ, ФЧХ, метод Боде
|
||||||
|
https://en.wikipedia.org/wiki/Bode_plot
|
||||||
|
|
||||||
|
# Годограф Найквиста
|
||||||
|
https://en.wikipedia.org/wiki/Nyquist_stability_criterion
|
||||||
|
|
||||||
|
# Корневой годограф
|
||||||
|
https://en.wikipedia.org/wiki/Root_locus_analysis
|
||||||
|
|
||||||
|
# Импульсная переходная функция
|
||||||
|
https://en.wikipedia.org/wiki/Impulse_response
|
||||||
|
|
||||||
|
# RLC-цепи
|
||||||
|
https://ru.wikipedia.org/wiki/RLC-%D1%86%D0%B5%D0%BF%D1%8C
|
||||||
|
https://en.wikipedia.org/wiki/RLC_circuit
|
||||||
|
|
||||||
|
# Импеданс
|
||||||
|
https://ru.wikipedia.org/wiki/%D0%98%D0%BC%D0%BF%D0%B5%D0%B4%D0%B0%D0%BD%D1%81
|
||||||
|
https://en.wikipedia.org/wiki/Electrical_impedance
|
||||||
|
|
||||||
|
# Резонанс
|
||||||
|
https://en.wikipedia.org/wiki/Resonance
|
||||||
148
rag/index.py
Normal file
148
rag/index.py
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
RAG indexer: load URLs → chunk text → embed via Ollama → store in FAISS.
|
||||||
|
Usage: python rag/index.py [--rebuild]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import pickle
|
||||||
|
import hashlib
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import numpy as np
|
||||||
|
import faiss
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://192.168.0.47:11434")
|
||||||
|
EMBED_MODEL = os.getenv("EMBED_MODEL", "bge-m3")
|
||||||
|
CHUNK_SIZE = 500
|
||||||
|
CHUNK_OVERLAP = 100
|
||||||
|
BATCH_SIZE = 32
|
||||||
|
|
||||||
|
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||||
|
STORE_DIR = BASE_DIR / "rag" / "store"
|
||||||
|
SOURCES_FILE = BASE_DIR / "data" / "sources.txt"
|
||||||
|
INDEX_PATH = STORE_DIR / "faiss.index"
|
||||||
|
CHUNKS_PATH = STORE_DIR / "chunks.pkl"
|
||||||
|
META_PATH = STORE_DIR / "meta.pkl"
|
||||||
|
|
||||||
|
|
||||||
|
def load_sources(path: Path) -> list[str]:
    """Read the list of source URLs from a plain-text file.

    Blank lines and lines whose first non-blank character is ``#`` are
    skipped; every remaining line is returned with surrounding whitespace
    removed.

    Args:
        path: Location of the sources file.

    Returns:
        The URLs in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The file carries non-ASCII comments, so the encoding is pinned instead
    # of relying on the platform default. "utf-8-sig" additionally drops a
    # leading BOM, which would otherwise hide the "#" of a first-line comment
    # and make that comment look like a URL.
    with open(path, encoding="utf-8-sig") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line and not line.startswith("#")]
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_page(url: str) -> str:
    """Download *url* and return the page text with whitespace tidied up.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        The text extracted by BeautifulSoup after the listed non-content
        elements are removed, with runs of three or more newlines reduced to
        two and runs of spaces/tabs reduced to one space.

    Raises:
        Whatever ``requests`` raises for connection problems, timeouts
        (30 s) or a failing ``raise_for_status()``.
    """
    response = requests.get(url, timeout=30, headers={"User-Agent": "Mozilla/5.0"})
    response.raise_for_status()

    document = BeautifulSoup(response.text, "html.parser")

    # Remove scripts, styling and page chrome before extracting text so they
    # do not end up in the indexed chunks.
    boilerplate = ["script", "style", "nav", "footer", "header", "aside", "noscript"]
    for element in document(boilerplate):
        element.decompose()

    raw = document.get_text(separator="\n", strip=True)
    without_gaps = re.sub(r"\n{3,}", "\n\n", raw)
    return re.sub(r"[ \t]+", " ", without_gaps)
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_text(text: str, size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list[str]:
    """Split *text* into sentence-aligned chunks with a word-level overlap.

    Sentences (split after ``.``, ``!`` or ``?`` followed by whitespace) are
    accumulated until adding the next one would push the buffer past *size*
    characters. The buffer is then emitted, and the next buffer starts with
    the trailing words of the emitted one (at most *overlap* characters,
    counting one separator per word) followed by the new sentence.

    Args:
        text: Text to split.
        size: Character budget checked before a sentence is appended.
        overlap: Character budget for the words carried into the next chunk.

    Returns:
        The stripped, non-empty chunks in order. A single sentence longer
        than *size* is not split further.
    """
    pieces: list[str] = []
    buffer = ""

    for sentence in re.split(r"(?<=[.!?])\s+", text):
        if not buffer:
            # Nothing accumulated yet: the sentence starts a new buffer
            # regardless of its length.
            buffer = sentence
            continue

        if len(buffer) + len(sentence) <= size:
            buffer = f"{buffer} {sentence}"
            continue

        # Budget exceeded: emit the buffer, then seed the next one with its tail.
        pieces.append(buffer.strip())

        tail: list[str] = []
        used = 0
        for word in reversed(buffer.split()):
            cost = len(word) + 1  # the word plus one separating space
            if used + cost > overlap:
                break
            tail.append(word)
            used += cost
        tail.reverse()  # collected back-to-front; restore reading order

        buffer = " ".join(tail) + " " + sentence

    remainder = buffer.strip()
    if remainder:
        pieces.append(remainder)
    return pieces
|
||||||
|
|
||||||
|
|
||||||
|
def get_embeddings(texts: list[str]) -> np.ndarray:
    """Embed a batch of texts through the Ollama ``/api/embed`` endpoint.

    Args:
        texts: Strings to embed in a single request.

    Returns:
        A ``float32`` array built from the ``"embeddings"`` field of the
        response (presumably one row per input text — confirm against the
        Ollama API).

    Raises:
        Whatever ``requests`` raises for connection problems, timeouts
        (120 s) or a failing ``raise_for_status()``; ``KeyError`` if the
        response has no ``"embeddings"`` field.
    """
    payload = {"model": EMBED_MODEL, "input": texts}
    response = requests.post(f"{OLLAMA_HOST}/api/embed", json=payload, timeout=120)
    response.raise_for_status()
    vectors = response.json()["embeddings"]
    return np.array(vectors, dtype=np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Build the FAISS index from the URLs listed in the sources file.

    Steps: read the URLs, fetch and chunk every page, embed the chunks in
    batches of ``BATCH_SIZE``, L2-normalise the vectors, add them to an
    inner-product index and write the index, the chunk texts and the chunk
    metadata into ``STORE_DIR``.

    Nothing is done when all three store files already exist, unless
    ``--rebuild`` is present on the command line. Pages that fail to download
    or parse are reported and skipped; embedding errors are not caught and
    abort the run.
    """
    rebuild = "--rebuild" in sys.argv

    # The store consists of three files that are written together below. If
    # any one is missing the store is unusable, so it is rebuilt instead of
    # being reported as existing.
    store_complete = all(p.exists() for p in (INDEX_PATH, CHUNKS_PATH, META_PATH))
    if not rebuild and store_complete:
        print("Index already exists. Use --rebuild to reindex.")
        return

    urls = load_sources(SOURCES_FILE)
    print(f"Loaded {len(urls)} source URLs")

    all_chunks = []
    all_meta = []

    for i, url in enumerate(urls, 1):
        print(f"[{i}/{len(urls)}] Fetching {url}...", end=" ", flush=True)
        try:
            chunks = chunk_text(fetch_page(url))
        except Exception as e:
            # Deliberate best effort: one unreachable or malformed page must
            # not abort indexing of the remaining sources.
            print(f"ERROR: {e}")
            continue
        print(f"{len(chunks)} chunks")
        all_chunks.extend(chunks)
        # meta[i] describes chunks[i]; the two lists must stay aligned.
        all_meta.extend({"url": url, "chunk_idx": j} for j in range(len(chunks)))

    print(f"\nTotal chunks: {len(all_chunks)}")
    if not all_chunks:
        print("No chunks to index!")
        return

    print(f"Generating embeddings ({EMBED_MODEL})...")
    total_batches = (len(all_chunks) - 1) // BATCH_SIZE + 1
    all_embeddings = []
    for batch_no, start in enumerate(range(0, len(all_chunks), BATCH_SIZE), 1):
        batch = all_chunks[start : start + BATCH_SIZE]
        print(f" Batch {batch_no}/{total_batches}...", flush=True)
        all_embeddings.append(get_embeddings(batch))

    embeddings = np.vstack(all_embeddings)
    # Normalising in place makes the inner product used by IndexFlatIP equal
    # to cosine similarity.
    faiss.normalize_L2(embeddings)
    dim = embeddings.shape[1]
    print(f"Embedding dimension: {dim}")

    index = faiss.IndexFlatIP(dim)
    index.add(embeddings)

    STORE_DIR.mkdir(parents=True, exist_ok=True)
    faiss.write_index(index, str(INDEX_PATH))
    with open(CHUNKS_PATH, "wb") as f:
        pickle.dump(all_chunks, f)
    with open(META_PATH, "wb") as f:
        pickle.dump(all_meta, f)

    print(f"Saved index ({index.ntotal} vectors) to {STORE_DIR}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
137
rag/query.py
Normal file
137
rag/query.py
Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
RAG query: search indexed documents and generate answer via Ollama.
|
||||||
|
Usage: python rag/query.py "ваш вопрос"
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import pickle
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import numpy as np
|
||||||
|
import faiss
|
||||||
|
|
||||||
|
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://192.168.0.47:11434")
|
||||||
|
EMBED_MODEL = os.getenv("EMBED_MODEL", "bge-m3")
|
||||||
|
GENERATE_MODEL = os.getenv("GENERATE_MODEL", "qwen3.5:9b")
|
||||||
|
TOP_K = 5
|
||||||
|
|
||||||
|
STORE_DIR = Path(__file__).resolve().parent / "store"
|
||||||
|
INDEX_PATH = STORE_DIR / "faiss.index"
|
||||||
|
CHUNKS_PATH = STORE_DIR / "chunks.pkl"
|
||||||
|
META_PATH = STORE_DIR / "meta.pkl"
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = """Ты — эксперт по теории автоматического управления и электротехнике.
|
||||||
|
Отвечай на вопросы, опираясь ТОЛЬКО на предоставленный контекст.
|
||||||
|
Если в контексте нет информации для ответа, скажи об этом.
|
||||||
|
Отвечай на русском языке, точно и по существу.
|
||||||
|
Указывай источники, из которых взята информация."""
|
||||||
|
|
||||||
|
|
||||||
|
def load_index():
    """Load the FAISS index together with its chunk texts and metadata.

    Returns:
        A tuple ``(index, chunks, meta)``: the FAISS index read from
        ``INDEX_PATH`` and the objects unpickled from ``CHUNKS_PATH`` and
        ``META_PATH``.

    Exits the process with status 1 (after printing a hint) when the index
    file is absent or when the store is only partially present.
    """
    if not INDEX_PATH.exists():
        print("Index not found! Run: python rag/index.py")
        sys.exit(1)

    # The pickles are opened unconditionally below; check them up front so a
    # partially written or partially deleted store yields an actionable
    # message instead of a FileNotFoundError traceback.
    missing = [p.name for p in (CHUNKS_PATH, META_PATH) if not p.exists()]
    if missing:
        print(
            f"Index store is incomplete (missing: {', '.join(missing)}). "
            "Run: python rag/index.py --rebuild"
        )
        sys.exit(1)

    index = faiss.read_index(str(INDEX_PATH))
    # NOTE(security): pickle.load can execute arbitrary code. These files are
    # expected to be produced locally by rag/index.py; do not point STORE_DIR
    # at data from an untrusted source.
    with open(CHUNKS_PATH, "rb") as f:
        chunks = pickle.load(f)
    with open(META_PATH, "rb") as f:
        meta = pickle.load(f)
    return index, chunks, meta
|
||||||
|
|
||||||
|
|
||||||
|
def get_embedding(text: str) -> np.ndarray:
    """Embed a single text through the Ollama ``/api/embed`` endpoint.

    The text is sent as a one-element list and the whole ``"embeddings"``
    field of the response is converted to a ``float32`` array, so the result
    is presumably two-dimensional with a single row — confirm against the
    Ollama API.

    Raises:
        Whatever ``requests`` raises for connection problems, timeouts
        (60 s) or a failing ``raise_for_status()``; ``KeyError`` if the
        response has no ``"embeddings"`` field.
    """
    payload = {"model": EMBED_MODEL, "input": [text]}
    response = requests.post(f"{OLLAMA_HOST}/api/embed", json=payload, timeout=60)
    response.raise_for_status()
    vectors = response.json()["embeddings"]
    return np.array(vectors, dtype=np.float32)
|
||||||
|
|
||||||
|
|
||||||
|
def search(query: str, index, chunks, meta, k: int = TOP_K):
    """Return the chunks most similar to *query*.

    Args:
        query: Question text to embed and look up.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(scores, indices)``; here the loaded FAISS index.
        chunks: Chunk texts, addressed by the positions the index returns.
        meta: Per-chunk metadata, addressed by the same positions.
        k: Number of neighbours to request.

    Returns:
        A list of ``{"chunk", "meta", "score"}`` dicts in the order the
        index returned them, omitting entries with a negative position.
    """
    query_vec = get_embedding(query)
    faiss.normalize_L2(query_vec)  # in place, before the similarity search
    scores, indices = index.search(query_vec, k)

    # Negative positions are skipped; FAISS is understood to use -1 as a
    # placeholder when fewer than k neighbours are available.
    return [
        {"chunk": chunks[pos], "meta": meta[pos], "score": float(sim)}
        for sim, pos in zip(scores[0], indices[0])
        if pos >= 0
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def generate(query: str, context_chunks: list[dict]) -> str:
    """Ask the generation model to answer *query* from the retrieved chunks.

    Args:
        query: The user's question.
        context_chunks: Search results; each item must provide
            ``["meta"]["url"]`` and ``["chunk"]``.

    Returns:
        The ``"response"`` field of Ollama's non-streaming ``/api/generate``
        reply.

    Raises:
        Whatever ``requests`` raises for connection problems, timeouts
        (300 s) or a failing ``raise_for_status()``.
    """
    # Number the sources from 1 so the model can cite them by label.
    context = "\n\n---\n\n".join(
        f"[Источник {number}] ({item['meta']['url']})\n{item['chunk']}"
        for number, item in enumerate(context_chunks, 1)
    )

    prompt = f"""Контекст из документов:

{context}

---

Вопрос: {query}

Ответ:"""

    payload = {
        "model": GENERATE_MODEL,
        "system": SYSTEM_PROMPT,
        "prompt": prompt,
        "stream": False,
        "think": False,
        "options": {"temperature": 0.3, "num_predict": 2048},
    }
    response = requests.post(f"{OLLAMA_HOST}/api/generate", json=payload, timeout=300)
    response.raise_for_status()
    return response.json()["response"]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Command-line entry point: retrieve context for a question and answer it.

    All command-line arguments are joined with spaces into one query. The
    function prints the retrieved chunks with their scores, then the
    generated answer, then the distinct source URLs in retrieval order.
    Exits with status 1 when no question is given.
    """
    words = sys.argv[1:]
    if not words:
        print('Usage: python rag/query.py "ваш вопрос"')
        sys.exit(1)

    query = " ".join(words)
    print(f"Query: {query}\n")

    index, chunks, meta = load_index()
    print(f"Index: {index.ntotal} vectors")

    results = search(query, index, chunks, meta)
    print(f"Top-{len(results)} results:\n")
    for rank, hit in enumerate(results, 1):
        score = hit["score"]
        url = hit["meta"]["url"]
        preview = hit["chunk"][:120]  # short excerpt, not the full chunk
        print(f" [{rank}] score={score:.4f} {url}")
        print(f" {preview}...\n")

    print("Generating answer...\n")
    answer = generate(query, results)
    separator = "=" * 60
    print(separator)
    print(answer)
    print(separator)

    print("\nSources:")
    # dict.fromkeys removes duplicate URLs while keeping first-seen order.
    for url in dict.fromkeys(hit["meta"]["url"] for hit in results):
        print(f" - {url}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
4
requirements.txt
Normal file
4
requirements.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
requests
|
||||||
|
beautifulsoup4
|
||||||
|
faiss-cpu
|
||||||
|
numpy
|
||||||
Reference in New Issue
Block a user