# VersionRAG / src /streamlit_app.py
# shahbazdev0's picture
# Update src/streamlit_app.py
# 028477b verified
# app.py - Main Streamlit Application
import hashlib
import html
import json
import os
import time
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Tuple

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
# Import custom modules
from version_rag import VersionRAG, BaselineRAG
from graph_manager import GraphManager
from evaluation import Evaluator, VersionQADataset
from utils import DocumentProcessor, ChangeDetector, PersistentStorage
# Page configuration: browser tab title/icon plus a wide layout with the
# sidebar opened by default.
_PAGE_CONFIG = {
    "page_title": "VersionRAG - Version-Aware RAG System",
    "page_icon": "πŸ“š",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_CONFIG)
# Initialize session state
def init_session_state(state=None):
    """Seed per-session defaults without overwriting values already present.

    Args:
        state: Mutable mapping to populate. Defaults to ``st.session_state``;
            any dict-like object may be supplied instead, which makes the
            function usable outside a running Streamlit script.

    Keys seeded: ``user_id`` (fresh UUID4 string), the three system handles
    (``version_rag``, ``baseline_rag``, ``graph_manager``), ``uploaded_files``,
    ``chat_history``, ``evaluation_results``, ``feedback_data`` and
    ``persistent_storage``.
    """
    if state is None:
        state = st.session_state

    # Factories rather than shared instances: each session gets its own
    # list/dict, and the UUID is only generated when the key is missing.
    default_factories = {
        'user_id': lambda: str(uuid.uuid4()),
        'version_rag': lambda: None,
        'baseline_rag': lambda: None,
        'graph_manager': lambda: None,
        'uploaded_files': dict,
        'chat_history': list,
        'evaluation_results': lambda: None,
        'feedback_data': list,
        'persistent_storage': lambda: None,
    }
    for key, make_default in default_factories.items():
        if key not in state:
            state[key] = make_default()
init_session_state()
# Custom CSS: header banner, metric cards, diff highlighting (used by the
# Version Explorer tab), version tags and wider spacing between tab labels.
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 2.5rem;
font-weight: bold;
color: #1f77b4;
text-align: center;
padding: 1rem 0;
}
.metric-card {
background-color: #f0f2f6;
padding: 1rem;
border-radius: 0.5rem;
margin: 0.5rem 0;
}
.diff-added {
background-color: #d4edda;
padding: 0.2rem 0.5rem;
border-radius: 0.3rem;
}
.diff-removed {
background-color: #f8d7da;
padding: 0.2rem 0.5rem;
border-radius: 0.3rem;
}
.version-tag {
background-color: #e7f3ff;
color: #0366d6;
padding: 0.2rem 0.5rem;
border-radius: 0.3rem;
font-weight: bold;
}
.stTabs [data-baseweb="tab-list"] {
gap: 2rem;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Sidebar: session info, model/retrieval settings, system initialization and
# a summary of the documents processed so far.
LLM_MODEL_CHOICES = ["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo-preview"]
EMBEDDING_MODEL_CHOICES = [
    "text-embedding-3-small",
    "text-embedding-3-large",
    "text-embedding-ada-002",
]

with st.sidebar:
    st.markdown("### πŸ” User Session")
    st.info(f"User ID: {st.session_state.user_id[:8]}...")

    st.markdown("### βš™οΈ Settings")

    # API key: pre-filled from the environment and written back to it.
    # NOTE(review): os.environ is process-wide, so a key typed by one user is
    # visible to every session served by this process — confirm this is
    # acceptable for the deployment.
    api_key = st.text_input(
        "OpenAI API Key",
        type="password",
        value=os.getenv("OPENAI_API_KEY", ""),
    )
    if api_key:
        os.environ["OPENAI_API_KEY"] = api_key

    model_name = st.selectbox("LLM Model", LLM_MODEL_CHOICES, index=0)
    embedding_model = st.selectbox(
        "Embedding Model", EMBEDDING_MODEL_CHOICES, index=0
    )

    st.markdown("### 🎯 Retrieval Parameters")
    top_k = st.slider("Top K Results", 1, 10, 5)
    # NOTE(review): this value is not read anywhere else in the visible file.
    similarity_threshold = st.slider("Similarity Threshold", 0.0, 1.0, 0.7)

    # (Re)build both RAG systems and the graph manager for the current user.
    if st.button("πŸš€ Initialize Systems", type="primary"):
        with st.spinner("Initializing VersionRAG and Baseline systems..."):
            try:
                current_user = st.session_state.user_id
                rag_settings = {
                    "user_id": current_user,
                    "model_name": model_name,
                    "embedding_model": embedding_model,
                }
                st.session_state.version_rag = VersionRAG(**rag_settings)
                st.session_state.baseline_rag = BaselineRAG(**rag_settings)
                st.session_state.graph_manager = GraphManager(
                    user_id=current_user
                )
                st.success("βœ… Systems initialized successfully!")
            except Exception as e:
                # UI boundary: surface any construction failure to the user.
                st.error(f"❌ Initialization error: {str(e)}")

    # Knowledge base status: one expander per processed file.
    if st.session_state.uploaded_files:
        st.markdown("### πŸ“š Knowledge Base")
        for filename, info in st.session_state.uploaded_files.items():
            with st.expander(f"πŸ“„ {filename}"):
                st.write(f"**Version:** {info['version']}")
                st.write(f"**Uploaded:** {info['timestamp']}")
                st.write(f"**Hash:** {info['hash'][:12]}...")
# Main content: page banner followed by the six top-level tabs.
st.markdown(
    '<div class="main-header">πŸ“š VersionRAG: Version-Aware RAG System</div>',
    unsafe_allow_html=True,
)

# Tab order here must match the tab1..tab6 unpacking below.
TAB_LABELS = [
    "πŸ“€ Document Upload",
    "πŸ’¬ Query Interface",
    "πŸ“Š Evaluation",
    "πŸ” Version Explorer",
    "πŸ“ˆ Analytics",
    "πŸ‘₯ Multi-User Management",
]
tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(TAB_LABELS)
# Tab 1: Document Upload
# Lets the user attach version/domain/topic metadata to each uploaded file and
# index it into VersionRAG, the baseline RAG and the version graph.
DOMAIN_CHOICES = ["Software", "Healthcare", "Finance", "Industrial", "Other"]

with tab1:
    st.header("Document Upload & Indexing")
    col1, col2 = st.columns([2, 1])

    with col1:
        uploaded_files = st.file_uploader(
            "Upload versioned documents (PDF, TXT)",
            type=["pdf", "txt"],
            accept_multiple_files=True,
        )
        if uploaded_files:
            st.markdown("### πŸ“‹ File Metadata")
            for idx, file in enumerate(uploaded_files):
                with st.expander(f"πŸ“„ {file.name}", expanded=True):
                    col_a, col_b = st.columns(2)
                    with col_a:
                        version = st.text_input(
                            "Version",
                            key=f"version_{idx}",
                            value="1.0.0",
                        )
                    with col_b:
                        domain = st.selectbox(
                            "Domain",
                            DOMAIN_CHOICES,
                            key=f"domain_{idx}",
                        )
                    topic = st.text_input(
                        "Topic/Module",
                        key=f"topic_{idx}",
                        value=file.name.split('.')[0],
                    )

                    if not st.button(f"Process {file.name}", key=f"process_{idx}"):
                        continue
                    if not st.session_state.version_rag:
                        st.error("Please initialize systems first!")
                        continue

                    with st.spinner(f"Processing {file.name}..."):
                        try:
                            # getvalue() returns the full payload regardless
                            # of the buffer's current read position, so
                            # processing the same upload twice can never hash
                            # an empty byte string (read() would after EOF).
                            content = file.getvalue()
                            if file.type == "application/pdf":
                                text = DocumentProcessor.extract_text_from_pdf(content)
                            else:
                                text = content.decode('utf-8')

                            # Content hash drives the skip / re-index decision.
                            file_hash = hashlib.sha256(content).hexdigest()

                            previous = st.session_state.uploaded_files.get(file.name)
                            if previous is not None:
                                if previous['hash'] == file_hash:
                                    st.info("File unchanged, skipping indexing.")
                                    continue
                                st.info("File changed, re-indexing with diff analysis...")
                                # Diff against the previously stored text and
                                # record the changes on the graph.
                                changes = ChangeDetector.compute_diff(previous['text'], text)
                                st.session_state.graph_manager.add_version_with_changes(
                                    document_name=topic,
                                    version=version,
                                    changes=changes,
                                )

                            # Index into VersionRAG with full metadata.
                            st.session_state.version_rag.add_documents(
                                texts=[text],
                                metadatas=[{
                                    'filename': file.name,
                                    'version': version,
                                    'domain': domain,
                                    'topic': topic,
                                    'hash': file_hash,
                                    'timestamp': datetime.now().isoformat(),
                                }],
                            )
                            # Baseline only receives filename and version.
                            st.session_state.baseline_rag.add_documents(
                                texts=[text],
                                metadatas=[{
                                    'filename': file.name,
                                    'version': version,
                                }],
                            )
                            # Register the version node on the graph.
                            st.session_state.graph_manager.add_document_version(
                                document_name=topic,
                                version=version,
                                content=text,
                                metadata={
                                    'domain': domain,
                                    'filename': file.name,
                                },
                            )
                            # Remember the file (including its text, needed
                            # for the next diff) in session state.
                            st.session_state.uploaded_files[file.name] = {
                                'version': version,
                                'domain': domain,
                                'topic': topic,
                                'hash': file_hash,
                                'text': text,
                                'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                            }
                            st.success(f"βœ… Successfully processed {file.name}")
                        except Exception as e:
                            # UI boundary: report the failure for this file
                            # and keep the rest of the page usable.
                            st.error(f"❌ Error processing {file.name}: {str(e)}")

    with col2:
        st.markdown("### πŸ“Š Upload Statistics")
        if st.session_state.uploaded_files:
            file_records = list(st.session_state.uploaded_files.values())
            st.metric('Total Files', len(file_records))
            st.metric('Domains', len({rec['domain'] for rec in file_records}))
            st.metric('Total Versions', len({rec['version'] for rec in file_records}))

            # Domain distribution
            domain_counts = {}
            for rec in file_records:
                domain_counts[rec['domain']] = domain_counts.get(rec['domain'], 0) + 1
            fig = px.pie(
                values=list(domain_counts.values()),
                names=list(domain_counts.keys()),
                title="Documents by Domain",
            )
            st.plotly_chart(fig, use_container_width=True)
# Tab 2: Query Interface
# The most recent search is kept in st.session_state['last_query_result'] and
# rendered from there. Streamlit reruns the whole script on every widget
# interaction and a button is only True on the run triggered by its click, so
# widgets nested inside `if st.button("Search")` (the rating slider and the
# feedback button) would otherwise wipe the results and never record feedback.


def _render_sources(sources, show_metadata=True):
    """Show retrieved sources in a collapsed expander.

    Args:
        sources: Iterable of dict-like source records.
        show_metadata: When True also print version, filename and similarity;
            the baseline column only shows the content excerpt.
    """
    with st.expander("πŸ“š Sources"):
        for idx, source in enumerate(sources):
            st.markdown(f"**Source {idx+1}**")
            if show_metadata:
                st.markdown(f"- Version: `{source.get('version', 'N/A')}`")
                st.markdown(f"- File: `{source.get('filename', 'N/A')}`")
                st.markdown(f"- Similarity: {source.get('similarity', 0):.3f}")
            st.markdown(f"```\n{source.get('content', '')[:200]}...\n```")


def _render_response(title, result, elapsed, show_metadata=True):
    """Render one answer panel: heading, latency, answer text and sources."""
    st.markdown(title)
    st.markdown(f"**Response Time:** {elapsed:.3f}s")
    st.markdown("---")
    st.markdown(result['answer'])
    if 'sources' in result:
        _render_sources(result['sources'], show_metadata=show_metadata)


with tab2:
    st.header("Interactive Query Interface")
    if not st.session_state.version_rag:
        st.warning("⚠️ Please initialize the systems first from the sidebar!")
    else:
        # Query type selection
        query_type = st.radio(
            "Query Type",
            ["Content Retrieval", "Version Inquiry", "Change Retrieval"],
            horizontal=True,
        )

        # Query input
        col1, col2 = st.columns([3, 1])
        with col1:
            query = st.text_input(
                "Enter your query",
                placeholder="e.g., What is the assert module in Node.js v20.0?",
            )
        with col2:
            compare_mode = st.checkbox("Compare with Baseline", value=True)

        # Version filter only applies to content retrieval.
        if query_type == "Content Retrieval":
            version_filter = st.text_input(
                "Version Filter (optional)",
                placeholder="e.g., 1.2.0",
            )
        else:
            version_filter = None

        if st.button("πŸ” Search", type="primary"):
            if not query:
                st.warning("Please enter a query!")
            else:
                baseline_result = None
                baseline_time = None
                with st.spinner("Searching..."):
                    start_time = time.time()
                    if query_type == "Content Retrieval":
                        vrag_result = st.session_state.version_rag.query(
                            query=query,
                            version_filter=version_filter,
                            top_k=top_k,
                        )
                    elif query_type == "Version Inquiry":
                        vrag_result = st.session_state.version_rag.version_inquiry(
                            query=query
                        )
                    else:  # Change Retrieval
                        vrag_result = st.session_state.version_rag.change_retrieval(
                            query=query
                        )
                    vrag_time = time.time() - start_time

                    # Baseline query (if comparison enabled)
                    if compare_mode:
                        start_time = time.time()
                        baseline_result = st.session_state.baseline_rag.query(
                            query=query,
                            top_k=top_k,
                        )
                        baseline_time = time.time() - start_time

                # Persist so results and the feedback form survive reruns.
                # user_id is stored so a result from a previous session id is
                # not shown after the user id changes.
                st.session_state['last_query_result'] = {
                    'user_id': st.session_state.user_id,
                    'query': query,
                    'query_type': query_type,
                    'compare_mode': compare_mode,
                    'vrag_result': vrag_result,
                    'vrag_time': vrag_time,
                    'baseline_result': baseline_result,
                    'baseline_time': baseline_time,
                }

                # Add to chat history
                st.session_state.chat_history.append({
                    'query': query,
                    'query_type': query_type,
                    'vrag_answer': vrag_result['answer'],
                    'vrag_time': vrag_time,
                    'baseline_answer': baseline_result['answer'] if compare_mode else None,
                    'baseline_time': baseline_time if compare_mode else None,
                    'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                })

        last = st.session_state.get('last_query_result')
        if last and last['user_id'] == st.session_state.user_id:
            # Display results
            if last['compare_mode']:
                col1, col2 = st.columns(2)
                with col1:
                    _render_response(
                        "### πŸš€ VersionRAG Response",
                        last['vrag_result'],
                        last['vrag_time'],
                    )
                with col2:
                    _render_response(
                        "### πŸ“Š Baseline RAG Response",
                        last['baseline_result'],
                        last['baseline_time'],
                        show_metadata=False,
                    )
            else:
                _render_response(
                    "### πŸš€ VersionRAG Response",
                    last['vrag_result'],
                    last['vrag_time'],
                )

            # Feedback refers to the stored result, not the live input boxes,
            # so editing the query field cannot mislabel the rating.
            st.markdown("### πŸ“ Feedback")
            col1, col2, col3 = st.columns([1, 1, 2])
            with col1:
                rating = st.slider("Rate this answer", 1, 5, 3)
            with col2:
                if st.button("Submit Feedback"):
                    st.session_state.feedback_data.append({
                        'query': last['query'],
                        'query_type': last['query_type'],
                        'rating': rating,
                        'timestamp': datetime.now().isoformat(),
                        'response_time': last['vrag_time'],
                    })
                    st.success("Thank you for your feedback!")

        # Chat history: five most recent queries, newest first.
        if st.session_state.chat_history:
            st.markdown("### πŸ’­ Query History")
            for idx, chat in enumerate(reversed(st.session_state.chat_history[-5:])):
                with st.expander(f"{chat['timestamp']} - {chat['query'][:50]}..."):
                    st.markdown(f"**Query Type:** {chat['query_type']}")
                    st.markdown(f"**VersionRAG Answer:** {chat['vrag_answer'][:200]}...")
                    st.markdown(f"**Response Time:** {chat['vrag_time']:.3f}s")
# Tab 3: Evaluation
# Runs VersionRAG and the baseline over a QA dataset and reports the metrics.


def _render_metric_column(title, metrics):
    """Render the headline metrics of one system under the given heading."""
    st.markdown(title)
    st.metric("Accuracy", f"{metrics['accuracy']:.2%}")
    st.metric("Hit@5", f"{metrics['hit_at_5']:.2%}")
    st.metric("MRR", f"{metrics['mrr']:.3f}")
    st.metric("VSA", f"{metrics['vsa']:.2%}")
    st.metric("Avg Latency", f"{metrics['avg_latency']:.3f}s")


with tab3:
    st.header("System Evaluation")
    if not st.session_state.version_rag:
        st.warning("⚠️ Please initialize the systems first!")
    else:
        st.markdown("""
This section evaluates VersionRAG against the baseline system using the Mini-VersionQA dataset.
Metrics include Hit@k, MRR, Accuracy, and Version-Sensitive Accuracy (VSA).
""")

        # Evaluation dataset configuration
        st.markdown("### πŸ“‹ Evaluation Dataset Configuration")
        # Always bound, so the run button cannot hit a NameError when
        # "custom dataset" is ticked but no file has been uploaded yet.
        qa_data = None
        use_custom_dataset = st.checkbox("Use custom evaluation dataset")
        if use_custom_dataset:
            uploaded_qa_file = st.file_uploader(
                "Upload QA Dataset (JSON)",
                type=["json"],
            )
            if uploaded_qa_file:
                try:
                    qa_data = json.load(uploaded_qa_file)
                except (json.JSONDecodeError, UnicodeDecodeError) as e:
                    # A malformed upload should not crash the whole page.
                    st.error(f"Could not parse QA dataset: {e}")
                else:
                    st.success(f"Loaded {len(qa_data)} questions")
        else:
            st.info("Using default Mini-VersionQA dataset")

        if st.button("πŸš€ Run Evaluation", type="primary"):
            if use_custom_dataset and not qa_data:
                st.warning("Please upload a valid QA dataset (JSON) before running the evaluation.")
            else:
                with st.spinner("Running evaluation..."):
                    try:
                        # Initialize evaluator
                        evaluator = Evaluator(
                            version_rag=st.session_state.version_rag,
                            baseline_rag=st.session_state.baseline_rag,
                        )
                        # Create or load dataset
                        if qa_data:
                            dataset = VersionQADataset.from_dict(qa_data)
                        else:
                            dataset = VersionQADataset.create_mini_versionqa()

                        # Run evaluation
                        results = evaluator.evaluate(dataset)
                        st.session_state.evaluation_results = results
                        vrag_metrics = results['versionrag']
                        baseline_metrics = results['baseline']

                        # Display results
                        st.markdown("### πŸ“Š Evaluation Results")
                        col1, col2 = st.columns(2)
                        with col1:
                            _render_metric_column("#### πŸš€ VersionRAG", vrag_metrics)
                        with col2:
                            _render_metric_column("#### πŸ“Š Baseline RAG", baseline_metrics)

                        # Performance improvement, in points (difference x 100).
                        st.markdown("### πŸ“ˆ Performance Improvement")
                        metric_keys = {
                            'Accuracy': 'accuracy',
                            'Hit@5': 'hit_at_5',
                            'MRR': 'mrr',
                            'VSA': 'vsa',
                        }
                        improvement = {
                            label: (vrag_metrics[key] - baseline_metrics[key]) * 100
                            for label, key in metric_keys.items()
                        }
                        fig = go.Figure(data=[
                            go.Bar(
                                name='Improvement',
                                x=list(improvement.keys()),
                                y=list(improvement.values()),
                                marker_color='lightblue',
                            )
                        ])
                        fig.add_hline(
                            y=25,
                            line_dash="dash",
                            line_color="red",
                            annotation_text="Target: 25 points",
                        )
                        fig.update_layout(
                            title="VersionRAG vs Baseline - Performance Improvement (percentage points)",
                            yaxis_title="Improvement (%)",
                            showlegend=False,
                        )
                        st.plotly_chart(fig, use_container_width=True)

                        # Query type breakdown
                        st.markdown("### πŸ” Performance by Query Type")
                        query_types = ['Content Retrieval', 'Version Inquiry', 'Change Retrieval']
                        type_keys = ['content_retrieval', 'version_inquiry', 'change_retrieval']
                        vrag_scores = [vrag_metrics['by_type'][key] for key in type_keys]
                        baseline_scores = [baseline_metrics['by_type'][key] for key in type_keys]
                        fig = go.Figure(data=[
                            go.Bar(name='VersionRAG', x=query_types, y=vrag_scores),
                            go.Bar(name='Baseline', x=query_types, y=baseline_scores),
                        ])
                        fig.update_layout(
                            title="Accuracy by Query Type",
                            yaxis_title="Accuracy (%)",
                            barmode='group',
                        )
                        st.plotly_chart(fig, use_container_width=True)

                        # Success criteria check
                        # NOTE(review): the headline metrics above are
                        # formatted with ':.2%' (i.e. fractions), while the
                        # by_type scores are compared against 85/90/60 —
                        # confirm Evaluator returns by_type on a 0-100 scale.
                        st.markdown("### βœ… Success Criteria")
                        criteria = {
                            'VSA Improvement β‰₯ 25 points': improvement['VSA'] >= 25,
                            'Content Retrieval β‰₯ 85%': vrag_scores[0] >= 85,
                            'Version Inquiry β‰₯ 90%': vrag_scores[1] >= 90,
                            'Change Retrieval β‰₯ 60%': vrag_scores[2] >= 60,
                        }
                        for criterion, passed in criteria.items():
                            if passed:
                                st.success(f"βœ… {criterion}")
                            else:
                                st.error(f"❌ {criterion}")
                    except Exception as e:
                        # UI boundary: report evaluation failures in-page.
                        st.error(f"Evaluation error: {str(e)}")
# Tab 4: Version Explorer
# Browse the versions stored for a document and compare two of them.


def _version_sort_key(version):
    """Sort key that orders dotted versions numerically.

    Plain string sorting puts "1.10.0" before "1.2.0". Each dot-separated
    piece is tagged so numeric pieces compare as integers and are never
    compared directly with non-numeric ones (which would raise TypeError).
    """
    key = []
    for piece in str(version).split('.'):
        if piece.isdecimal():
            key.append((0, int(piece), ''))
        else:
            key.append((1, 0, piece))
    return key


with tab4:
    st.header("Version Explorer")
    if not st.session_state.graph_manager:
        st.warning("⚠️ Please initialize the systems first!")
    else:
        # Document selection
        documents = st.session_state.graph_manager.get_all_documents()
        if not documents:
            st.info("No documents uploaded yet. Please upload documents in the 'Document Upload' tab.")
        else:
            selected_doc = st.selectbox("Select Document", documents)
            if selected_doc:
                # Get versions for selected document
                versions = st.session_state.graph_manager.get_document_versions(selected_doc)
                st.markdown(f"### πŸ“š {selected_doc}")
                st.markdown(f"**Total Versions:** {len(versions)}")

                # Timeline and comparison only make sense with 2+ versions.
                if len(versions) > 1:
                    # Sorted once and reused by the timeline and both pickers.
                    ordered_versions = sorted(versions, key=_version_sort_key)

                    st.markdown("### πŸ“… Version Timeline")
                    timeline_data = []
                    for v in ordered_versions:
                        version_info = st.session_state.graph_manager.get_version_info(
                            selected_doc, v
                        )
                        timeline_data.append({
                            'Version': v,
                            'Date': version_info.get('timestamp', 'N/A'),
                        })
                    df = pd.DataFrame(timeline_data)
                    st.dataframe(df, use_container_width=True)

                    # Version comparison
                    st.markdown("### πŸ”„ Version Comparison")
                    col1, col2 = st.columns(2)
                    with col1:
                        version1 = st.selectbox("Version 1", ordered_versions, index=0)
                    with col2:
                        version2 = st.selectbox(
                            "Version 2",
                            ordered_versions,
                            index=min(1, len(versions) - 1),
                        )

                    if version1 and version2 and version1 != version2:
                        if st.button("Compare Versions"):
                            with st.spinner("Computing differences..."):
                                changes = st.session_state.graph_manager.get_changes_between_versions(
                                    selected_doc, version1, version2
                                )

                            st.markdown("### πŸ“ Changes Detected")
                            # Diff lines come from user-uploaded documents and
                            # are rendered with unsafe_allow_html=True, so they
                            # must be HTML-escaped to prevent markup injection.
                            if changes['additions']:
                                st.markdown("#### βž• Additions")
                                for add in changes['additions']:
                                    st.markdown(
                                        f'<div class="diff-added">{html.escape(str(add))}</div>',
                                        unsafe_allow_html=True,
                                    )
                            if changes['deletions']:
                                st.markdown("#### βž– Deletions")
                                for delete in changes['deletions']:
                                    st.markdown(
                                        f'<div class="diff-removed">{html.escape(str(delete))}</div>',
                                        unsafe_allow_html=True,
                                    )
                            if changes['modifications']:
                                st.markdown("#### πŸ”„ Modifications")
                                for mod in changes['modifications']:
                                    st.markdown(f"- {mod}")

                            # Visualize changes
                            st.markdown("### πŸ“Š Change Statistics")
                            change_stats = {
                                'Additions': len(changes['additions']),
                                'Deletions': len(changes['deletions']),
                                'Modifications': len(changes['modifications']),
                            }
                            fig = px.bar(
                                x=list(change_stats.keys()),
                                y=list(change_stats.values()),
                                title=f"Changes from {version1} to {version2}",
                                labels={'x': 'Change Type', 'y': 'Count'},
                            )
                            st.plotly_chart(fig, use_container_width=True)
# Tab 5: Analytics
# Aggregates over this session's chat history and feedback entries.
with tab5:
    st.header("System Analytics")

    history = st.session_state.chat_history
    feedback_entries = st.session_state.feedback_data

    # System statistics
    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Queries", len(history))
    with col2:
        if not feedback_entries:
            st.metric("Avg Rating", "N/A")
        else:
            avg_rating = sum(entry['rating'] for entry in feedback_entries) / len(feedback_entries)
            st.metric("Avg Rating", f"{avg_rating:.2f} / 5")
    with col3:
        if not history:
            st.metric("Avg Response Time", "N/A")
        else:
            avg_response_time = sum(item['vrag_time'] for item in history) / len(history)
            st.metric("Avg Response Time", f"{avg_response_time:.3f}s")
    with col4:
        st.metric("Total Documents", len(st.session_state.uploaded_files))

    # Query type distribution
    if history:
        st.markdown("### πŸ“Š Query Type Distribution")
        query_type_counts = {}
        for item in history:
            qtype = item['query_type']
            query_type_counts[qtype] = query_type_counts.get(qtype, 0) + 1
        fig = px.pie(
            values=list(query_type_counts.values()),
            names=list(query_type_counts.keys()),
            title="Distribution of Query Types",
        )
        st.plotly_chart(fig, use_container_width=True)

    # Response time trend (needs at least two points to draw a line)
    if len(history) > 1:
        st.markdown("### ⏱️ Response Time Trend")
        times = [item['vrag_time'] for item in history]
        trend_trace = go.Scatter(
            y=times,
            mode='lines+markers',
            name='Response Time',
        )
        fig = go.Figure(data=trend_trace)
        fig.update_layout(
            title="Response Time Over Queries",
            xaxis_title="Query Number",
            yaxis_title="Response Time (s)",
        )
        st.plotly_chart(fig, use_container_width=True)

    # Feedback analysis
    if feedback_entries:
        st.markdown("### πŸ“ User Feedback Analysis")
        # Rating distribution
        rating_counts = {}
        for entry in feedback_entries:
            score = entry['rating']
            rating_counts[score] = rating_counts.get(score, 0) + 1
        fig = go.Figure(data=[
            go.Bar(x=list(rating_counts.keys()), y=list(rating_counts.values()))
        ])
        fig.update_layout(
            title="Rating Distribution",
            xaxis_title="Rating",
            yaxis_title="Count",
        )
        st.plotly_chart(fig, use_container_width=True)

    # Export analytics: the download button is only offered after the export
    # button is pressed and only when there is data to export.
    st.markdown("### πŸ’Ύ Export Data")
    col1, col2 = st.columns(2)
    with col1:
        if st.button("Export Chat History"):
            if history:
                csv = pd.DataFrame(history).to_csv(index=False)
                st.download_button(
                    "Download CSV",
                    csv,
                    "chat_history.csv",
                    "text/csv",
                )
    with col2:
        if st.button("Export Feedback Data"):
            if feedback_entries:
                csv = pd.DataFrame(feedback_entries).to_csv(index=False)
                st.download_button(
                    "Download CSV",
                    csv,
                    "feedback_data.csv",
                    "text/csv",
                )
# Tab 6: Multi-User Management
# Session overview, knowledge-base listing, session reset/export, UX metrics.
with tab6:
    st.header("Multi-User Management")
    st.markdown("""
This section demonstrates VersionRAG's multi-user capabilities with logical data separation
and persistent knowledge base management.
""")

    # User session info
    st.markdown("### πŸ‘€ Current Session")
    col1, col2, col3 = st.columns(3)
    with col1:
        st.info(f"**User ID:** {st.session_state.user_id[:16]}...")
    with col2:
        st.info(f"**Documents:** {len(st.session_state.uploaded_files)}")
    with col3:
        st.info(f"**Queries:** {len(st.session_state.chat_history)}")

    # Data isolation demonstration
    st.markdown("### πŸ”’ Data Isolation")
    st.markdown("""
Each user's knowledge base is logically separated using `tenant_id` metadata in ChromaDB.
This ensures:
- No data leakage between users
- Independent query results
- Isolated document management
""")

    # Knowledge base status
    st.markdown("### πŸ“š Knowledge Base Status")
    if st.session_state.uploaded_files:
        kb_data = [
            {
                'File': filename,
                'Version': info['version'],
                'Domain': info['domain'],
                'Topic': info['topic'],
                'Uploaded': info['timestamp'],
                'Hash': info['hash'][:12] + "...",
            }
            for filename, info in st.session_state.uploaded_files.items()
        ]
        df = pd.DataFrame(kb_data)
        st.dataframe(df, use_container_width=True)
        # Persistent storage info
        st.success("""
βœ… **Persistent Storage Active**
- All documents are stored with file hash tracking
- Unchanged files skip re-indexing
- Automatic diff-based updates for modified files
""")
    else:
        st.info("No documents in knowledge base. Upload documents to get started.")

    # Session management
    st.markdown("### πŸ”„ Session Management")
    col1, col2 = st.columns(2)
    with col1:
        # The confirmation checkbox must live outside the button branch: a
        # button is only True on the single run triggered by its click, so a
        # checkbox created inside that branch disappears (and the reset never
        # runs) as soon as it is ticked.
        confirm_reset = st.checkbox("Confirm session reset")
        if st.button("πŸ†• Create New Session"):
            if confirm_reset:
                st.session_state.user_id = str(uuid.uuid4())
                st.session_state.version_rag = None
                st.session_state.baseline_rag = None
                st.session_state.graph_manager = None
                st.session_state.uploaded_files = {}
                st.session_state.chat_history = []
                # Also drop data collected under the previous user id so it
                # is not exported or charted under the new one.
                st.session_state.feedback_data = []
                st.session_state.evaluation_results = None
                st.success("New session created!")
                st.rerun()
            else:
                st.warning("Tick 'Confirm session reset' first to create a new session.")
    with col2:
        if st.button("πŸ’Ύ Export Session Data"):
            session_data = {
                'user_id': st.session_state.user_id,
                'uploaded_files': st.session_state.uploaded_files,
                'chat_history': st.session_state.chat_history,
                'feedback_data': st.session_state.feedback_data,
                'timestamp': datetime.now().isoformat(),
            }
            json_str = json.dumps(session_data, indent=2)
            st.download_button(
                "Download Session JSON",
                json_str,
                f"session_{st.session_state.user_id[:8]}.json",
                "application/json",
            )

    # UX Metrics
    st.markdown("### πŸ“Š UX Metrics")
    col1, col2, col3 = st.columns(3)
    with col1:
        # Placeholder: reuploads (same name, different hash) are not tracked
        # anywhere in this script, so the metric is always 0.
        reupload_count = 0
        st.metric("Reupload Count", reupload_count,
                  help="Number of times files were reuploaded")
    with col2:
        if st.session_state.chat_history:
            avg_response = (
                sum(c['vrag_time'] for c in st.session_state.chat_history)
                / len(st.session_state.chat_history)
            )
            st.metric("Avg Response Time", f"{avg_response:.3f}s")
        else:
            st.metric("Avg Response Time", "N/A")
    with col3:
        cross_contamination = 0  # This would be detected in production
        st.metric("Cross-User Contamination", cross_contamination,
                  help="Number of cross-user data leakage incidents")
# Footer: horizontal rule followed by a centered attribution block.
_FOOTER_HTML = """
<div style='text-align: center; color: #666;'>
<p>VersionRAG - Version-Aware Retrieval-Augmented Generation System</p>
<p>Built with Streamlit, LangChain, and ChromaDB</p>
</div>
"""
st.markdown("---")
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)