# Hugging Face Spaces app (status: Running)
import json
import os
import pickle
import re
import time
import warnings
from datetime import datetime
from itertools import islice

import numpy as np
import streamlit as st
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

warnings.filterwarnings('ignore')
# ===================================================================
# 1. SETTINGS
# ===================================================================
# Report accelerator availability once at start-up.
_cuda_available = torch.cuda.is_available()
print(f"CUDA доступна: {_cuda_available}")
if _cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# --- Custom neural network: data source and training hyper-parameters ---
MODEL_NAME = "DeepPavlov/rubert-base-cased"  # used for tokenization
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
SCIENCE_DATASET = "RafaelUI/ru_science"
ARTICLE_LIMIT = 200  # number of articles used for training
MAX_LENGTH = 256
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 1e-4

# --- Branding shown in the UI ---
AI_NAME = "OpenAirAI"
COMPANY_NAME = "OpenRussianAI"
CREATORS = ["Грибков Евгений", "RootLinux21"]
WEBSITE = "https://sites.google.com/view/opruai/home"
HUGGINGFACE = "https://huggingface.co/OpenRussianAI"
CREATION_DATE = "2026"

# Page configuration is issued before any other Streamlit call in this script.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="🧠",
    page_title=f"{AI_NAME} - Своя нейросеть",
)
# ===================================================================
# 2. CUSTOM NEURAL NETWORK IN PYTORCH
# ===================================================================
class SimpleTransformer(nn.Module):
    """Small Transformer language model built from scratch in PyTorch.

    The network consists of a token embedding plus a learned positional
    table, ``num_layers`` blocks of (self-attention -> residual -> LayerNorm
    -> feed-forward -> residual -> LayerNorm) and a linear projection to
    vocabulary logits.  Each block owns a single ``LayerNorm`` that is applied
    after both sub-layers; this is kept as-is so saved ``state_dict`` files
    stay loadable.

    Args:
        vocab_size: Number of token ids the embedding/output layers cover.
        embed_dim: Width of the token vectors.
        num_heads: Attention heads per block.
        num_layers: Number of attention/feed-forward blocks.
        max_length: Longest sequence the positional table can represent.
    """

    def __init__(self, vocab_size, embed_dim=256, num_heads=8, num_layers=4, max_length=512):
        super().__init__()
        self.embed_dim = embed_dim
        self.max_length = max_length
        # 1. Embedding layer (token ids -> vectors) and learned positions.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoding = nn.Parameter(torch.randn(1, max_length, embed_dim))
        # 2. Self-attention layers.
        self.attention_layers = nn.ModuleList([
            nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
            for _ in range(num_layers)
        ])
        # 3. Position-wise feed-forward layers.
        self.ffn_layers = nn.ModuleList([
            nn.Sequential(
                nn.Linear(embed_dim, embed_dim * 4),
                nn.ReLU(),
                nn.Linear(embed_dim * 4, embed_dim)
            )
            for _ in range(num_layers)
        ])
        # 4. Layer normalisation (one module per block, used twice in it).
        self.norm_layers = nn.ModuleList([
            nn.LayerNorm(embed_dim)
            for _ in range(num_layers)
        ])
        # 5. Output projection to vocabulary logits (used for generation).
        self.output_layer = nn.Linear(embed_dim, vocab_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, attention_mask=None, causal=True):
        """Compute next-token logits for a batch of token id sequences.

        Args:
            input_ids: Integer tensor of shape ``[batch, seq_len]``.
            attention_mask: Optional ``[batch, seq_len]`` tensor, non-zero for
                real tokens and zero for padding.  ``None`` means "no
                padding".  Padding is expected at the end of each sequence;
                with a causal mask, leading padding would leave some rows
                with nothing to attend to.
            causal: When true (default) position ``i`` may only attend to
                positions ``<= i``, which is what next-token training and
                sampling require.  Pass ``False`` for bidirectional attention.

        Returns:
            Float tensor of shape ``[batch, seq_len, vocab_size]``.

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_length``.
        """
        seq_len = input_ids.size(1)
        if seq_len > self.max_length:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_length {self.max_length}"
            )

        # 1. Token embeddings plus positional encoding.
        x = self.embedding(input_ids)  # [batch, seq_len, embed_dim]
        x = x + self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x)

        # MultiheadAttention expects True where a key must be ignored.
        key_padding_mask = None
        if attention_mask is not None:
            key_padding_mask = ~attention_mask.bool()

        # Upper-triangular mask (True = blocked) hides future positions.
        future_mask = None
        if causal:
            future_mask = torch.triu(
                torch.ones(seq_len, seq_len, dtype=torch.bool, device=input_ids.device),
                diagonal=1,
            )

        # 2. Attention / feed-forward blocks.
        for attn, ffn, norm in zip(
            self.attention_layers, self.ffn_layers, self.norm_layers
        ):
            attn_output, _ = attn(
                x, x, x,
                key_padding_mask=key_padding_mask,
                attn_mask=future_mask,
            )
            x = norm(x + attn_output)
            x = norm(x + ffn(x))

        # 3. Project to vocabulary logits.
        return self.output_layer(x)
# ===================================================================
# 3. TRAINING DATASET
# ===================================================================
class ScienceDataset(Dataset):
    """Dataset that tokenizes one "<title>. <text>" string per article.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
    (the labels are the input ids themselves), all squeezed to drop the
    batch dimension the tokenizer adds.
    """

    def __init__(self, articles, tokenizer, max_length=256):
        self.articles = articles
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        record = self.articles[idx]
        joined = "{}. {}".format(record['title'], record['text'])
        # Tokenize to a fixed length so items can be stacked into batches.
        tokenized = self.tokenizer(
            joined,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        token_ids = tokenized['input_ids']
        mask = tokenized['attention_mask']
        sample = {}
        sample['input_ids'] = token_ids.squeeze()
        sample['attention_mask'] = mask.squeeze()
        sample['labels'] = token_ids.squeeze()  # training targets
        return sample
# ===================================================================
# 4. MODEL TRAINING
# ===================================================================
def train_model(model, dataloader, epochs=10, lr=1e-4):
    """Train the language model and report progress through Streamlit.

    The model is trained to predict token ``t + 1`` from the logits at
    position ``t``: logits and labels are shifted against each other by one
    step, and targets that fall on padding (attention mask == 0) are
    excluded from the loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning ``[batch, seq_len, vocab]`` logits.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask`` and
            ``labels`` tensors of shape ``[batch, seq_len]``.
        epochs: Number of passes over ``dataloader``.
        lr: AdamW learning rate.

    Returns:
        ``(model, losses)`` where ``losses`` holds the mean batch loss of
        each epoch.

    Raises:
        ValueError: If ``dataloader`` contains no batches.
    """
    num_batches = len(dataloader)
    if num_batches == 0:
        raise ValueError("dataloader is empty: nothing to train on")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    model.train()  # make sure dropout is active even if the model was in eval mode
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    ignore_index = -100
    criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)

    # One progress bar / status line for the whole run instead of one per epoch.
    progress_bar = st.progress(0)
    status_text = st.empty()

    losses = []
    for epoch in range(epochs):
        total_loss = 0
        for i, batch in enumerate(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)

            # Next-token objective: position t is scored against token t + 1,
            # and targets that are padding are ignored.
            shifted_logits = logits[:, :-1, :]
            targets = labels[:, 1:].masked_fill(
                attention_mask[:, 1:] == 0, ignore_index
            )
            loss = criterion(
                shifted_logits.reshape(-1, shifted_logits.size(-1)),
                targets.reshape(-1)
            )

            # Backward
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            total_loss += loss.item()

            # Update progress
            progress = (i + 1) / num_batches
            progress_bar.progress(min(1.0, (epoch + progress) / epochs))
            status_text.text(
                f"Эпоха {epoch+1}/{epochs}, "
                f"Батч {i+1}/{num_batches}, "
                f"Loss: {loss.item():.4f}"
            )

        avg_loss = total_loss / num_batches
        losses.append(avg_loss)
        st.write(f"✅ Эпоха {epoch+1}: Средний Loss = {avg_loss:.4f}")
    return model, losses
# ===================================================================
# 5. ANSWER GENERATION
# ===================================================================
def generate_answer(model, tokenizer, query, max_length=150):
    """Sample a continuation of ``query`` from the model, token by token.

    Args:
        model: Callable as ``model(input_ids, attention_mask)`` returning
            ``[1, seq_len, vocab]`` logits; must provide ``eval()``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors, with a ``decode`` method.
        query: Prompt text.
        max_length: Maximum number of new tokens to sample.

    Returns:
        The decoded newly generated tokens, or a fixed apology string when
        nothing (or only special tokens) was produced.
    """
    # Run on whatever device the model lives on; fall back to the default
    # choice for objects that expose no parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Tokenizers without an EOS token (eos_token_id is None) would never stop
    # generation; fall back to the separator token when one is defined.
    stop_id = getattr(tokenizer, 'eos_token_id', None)
    if stop_id is None:
        stop_id = getattr(tokenizer, 'sep_token_id', None)

    # Longest sequence the model accepts, when it advertises one.
    context_limit = getattr(model, 'max_length', None)

    model.eval()
    with torch.no_grad():
        # Tokenize WITHOUT padding so the last position is a real token.
        encoding = tokenizer(
            query,
            truncation=True,
            max_length=100,
            return_tensors='pt'
        )
        generated = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        # If the tokenizer closed the prompt with the stop token, drop it so
        # the model continues the text instead of starting after its end.
        if (
            stop_id is not None
            and generated.size(1) > 1
            and generated[0, -1].item() == stop_id
        ):
            generated = generated[:, :-1]
            attention_mask = attention_mask[:, :-1]
        prompt_len = generated.size(1)

        for _ in range(max_length):
            if context_limit is not None and generated.size(1) >= context_limit:
                break
            logits = model(generated, attention_mask)
            # Distribution over the token following the last position.
            next_token_probs = F.softmax(logits[:, -1, :], dim=-1)
            next_token = torch.multinomial(next_token_probs, num_samples=1)
            if stop_id is not None and next_token.item() == stop_id:
                break
            # Grow the sequence and its mask together so shapes stay aligned.
            generated = torch.cat([generated, next_token], dim=1)
            attention_mask = torch.cat(
                [attention_mask, attention_mask.new_ones((attention_mask.size(0), 1))],
                dim=1,
            )

    # Decode only the newly sampled tokens; this removes the prompt exactly,
    # independent of how the tokenizer re-spaces or re-cases decoded text.
    response = tokenizer.decode(generated[0, prompt_len:], skip_special_tokens=True).strip()
    return response if response else "Извините, нейросеть не сгенерировала ответ."
# ===================================================================
# 6. DATA LOADING
# ===================================================================
def load_science_articles():
    """Return up to ``ARTICLE_LIMIT`` articles, using a local pickle cache.

    On the first run the articles are streamed from ``SCIENCE_DATASET`` and
    written to ``science_articles.pkl``; later runs read that file instead.

    Returns:
        A list of dicts with ``id``, ``title`` (at most 200 characters),
        ``text`` (at most 1000 characters) and ``source`` keys.  An empty
        list is returned, after showing the error in the UI, when the
        download fails.
    """
    articles_file = "science_articles.pkl"
    if os.path.exists(articles_file):
        # NOTE(security): unpickling runs arbitrary code from the file. This is
        # acceptable only while the cache is written exclusively by this app.
        try:
            with open(articles_file, 'rb') as f:
                cached = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError):
            cached = None  # unreadable or corrupt cache: rebuild it below
        if cached:
            return cached

    with st.spinner("📚 Загружаю научные статьи..."):
        try:
            dataset = load_dataset(SCIENCE_DATASET, split="train", streaming=True)
            articles = []
            for i, row in enumerate(islice(dataset, ARTICLE_LIMIT)):
                # Fields may be missing, None or non-string; coerce before
                # slicing so a single odd row cannot abort the whole load.
                text = row.get('content') or row.get('text') or str(row)
                title = row.get('title') or f"Статья {i}"
                articles.append({
                    "id": i,
                    "title": str(title)[:200],
                    "text": str(text)[:1000],
                    "source": "ru_science"
                })
            # Do not cache an empty result, otherwise the next run would
            # keep returning "no data" without retrying the download.
            if articles:
                with open(articles_file, 'wb') as f:
                    pickle.dump(articles, f)
            return articles
        except Exception as e:
            # UI boundary: surface the problem to the user and degrade to "no data".
            st.error(f"Ошибка: {e}")
            return []
def load_embedder():
    """Load the sentence-embedding model named by ``EMBEDDING_MODEL``.

    Returns:
        A ``SentenceTransformer`` instance, or ``None`` when loading fails
        (best-effort: the caller is expected to cope without an embedder).
    """
    try:
        return SentenceTransformer(EMBEDDING_MODEL)
    except Exception:
        # Deliberately best-effort, but do not swallow KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` would.
        return None
# ===================================================================
# 7. USER INTERFACE
# ===================================================================
# Load the article corpus used for training.
articles = load_science_articles()

st.title(f"🧠 {AI_NAME} - Своя нейросеть на PyTorch")
st.markdown(f"**{AI_NAME}** от **{COMPANY_NAME}** | Обучаем на **{SCIENCE_DATASET}**")

# Model / data summary
st.info(f"📊 **Данные:** {len(articles)} статей | **Размер модели:** 256 эмбеддингов | **Слои:** 4 Transformer")

# Training button
if st.button("🚀 Обучить нейросеть с нуля"):
    if not articles:
        st.error("Нет данных для обучения!")
    else:
        # Load the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # Only install a padding token when the tokenizer lacks one.
        # BERT-style tokenizers ship with [PAD] and define no EOS token, so an
        # unconditional ``pad_token = eos_token`` would erase the padding
        # token and make ``padding='max_length'`` fail.
        if tokenizer.pad_token is None:
            if tokenizer.eos_token is not None:
                tokenizer.pad_token = tokenizer.eos_token
            else:
                tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        # Build the dataset
        dataset = ScienceDataset(articles, tokenizer, MAX_LENGTH)
        dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

        # Build the model (vocabulary size read after any token was added)
        vocab_size = len(tokenizer)
        model = SimpleTransformer(
            vocab_size=vocab_size,
            embed_dim=256,
            num_heads=8,
            num_layers=4,
            max_length=MAX_LENGTH
        )

        # Train
        st.write("🧠 **Начинаем обучение...**")
        trained_model, losses = train_model(
            model,
            dataloader,
            epochs=EPOCHS,
            lr=LEARNING_RATE
        )

        # Save the weights and keep the model for this browser session
        torch.save(trained_model.state_dict(), "openairai_model.pth")
        st.success(f"✅ Модель сохранена! (потери: {losses[-1]:.4f})")
        st.session_state.model = trained_model
        st.session_state.tokenizer = tokenizer

# Question answering is available once a model is stored in the session
if "model" in st.session_state:
    model = st.session_state.model
    tokenizer = st.session_state.tokenizer
    st.success("✅ Модель загружена и готова к использованию!")

    # Question input
    query = st.text_input("🔍 Задайте вопрос нейросети:", placeholder="Например: Что такое наука?")
    if query:
        with st.spinner("🧠 Нейросеть думает..."):
            response = generate_answer(model, tokenizer, query)
        st.markdown(f"**🤖 Ответ:** {response}")
else:
    st.warning("⚠️ Модель ещё не обучена. Нажмите кнопку выше для обучения.")

    # Explain how the app works while no model is available
    with st.expander("📖 Как это работает?"):
        st.markdown("""
**Своя нейросеть на PyTorch:**
1. **Архитектура:** Transformer (4 слоя внимания)
2. **Размер:** 256 эмбеддингов
3. **Обучение:** на научных статьях
4. **Генерация:** пошаговая
**Преимущества:**
- Полный контроль над моделью
- Можно дообучать на любых данных
- Не зависит от сторонних API
- Бесплатно
**Недостатки:**
- Требует GPU для быстрого обучения
- Меньше, чем большие модели
- Нужно много данных
""")

# --- FOOTER ---
st.divider()
st.caption(f"🧠 {AI_NAME} от {COMPANY_NAME} | Создан в {CREATION_DATE} | Своя нейросеть на PyTorch")