Spaces:

can-org
/

Testing-AI-Contain

Sleeping

App Files Files Community

Sangyog10 committed on Aug 4, 2025

Commit

29fbb51

1 Parent(s): 72b7684

set up rag pipeline for chatbot

Browse files

Files changed (9) hide show

.gitignore +3 -0
README.md +3 -1
app.py +3 -0
features/rag_chatbot/__init__.py +0 -0
features/rag_chatbot/controller.py +182 -0
features/rag_chatbot/document_handler.py +37 -0
features/rag_chatbot/rag_pipeline.py +216 -0
features/rag_chatbot/routes.py +111 -0
requirements.txt +8 -0

.gitignore CHANGED Viewed

@@ -66,3 +66,6 @@ notebooks
 np_text_model/classifier/sentencepiece.bpe.model
 np_text_model/classifier/tokenizer.json

 np_text_model/classifier/sentencepiece.bpe.model
 np_text_model/classifier/tokenizer.json
+# vector database
+chroma_data
+chroma_database

README.md CHANGED Viewed

@@ -119,7 +119,9 @@ AI-Checker/
 2. **Run the API**
    ```bash
-   uvicorn app:app --reload
    ```
 3. **Build Docker (optional)**

 2. **Run the API**
    ```bash
+   chroma run --path ./chroma_database ## to run chromadb locally
+   uvicorn app:app --reload --port 8001 ## fastapi (run after chromadb)
    ```
 3. **Build Docker (optional)**

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from features.nepali_text_classifier.routes import (
 )
 from features.image_classifier.routes import router as image_classifier_router
 from features.image_edit_detector.routes import router as image_edit_detector_router
 from fastapi.staticfiles import StaticFiles
 from config import ACCESS_RATE
@@ -41,6 +42,8 @@ app.include_router(text_classifier_router, prefix="/text")
 app.include_router(nepali_text_classifier_router, prefix="/NP")
 app.include_router(image_classifier_router, prefix="/AI-image")
 app.include_router(image_edit_detector_router, prefix="/detect")
 @app.get("/")

 )
 from features.image_classifier.routes import router as image_classifier_router
 from features.image_edit_detector.routes import router as image_edit_detector_router
+from features.rag_chatbot.routes import router as rag_router
 from fastapi.staticfiles import StaticFiles
 from config import ACCESS_RATE
 app.include_router(nepali_text_classifier_router, prefix="/NP")
 app.include_router(image_classifier_router, prefix="/AI-image")
 app.include_router(image_edit_detector_router, prefix="/detect")
+app.include_router(rag_router, prefix="/rag")
 @app.get("/")

features/rag_chatbot/__init__.py ADDED Viewed

File without changes

features/rag_chatbot/controller.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import os
+import asyncio
+import logging
+from io import BytesIO
+from typing import Dict, Any
+from fastapi import HTTPException, UploadFile, status, Depends
+from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from .rag_pipeline import route_and_process_query, add_document_to_rag, check_system_health
+from .document_handler import extract_text_from_file
+# Configure logging
+# NOTE: basicConfig is called at import time of this module.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Bearer-token extractor; verify_token uses it as a FastAPI dependency.
+security = HTTPBearer()
+# Supported file types
+# MIME types accepted by handle_document_upload (PDF, DOCX, plain text).
+SUPPORTED_CONTENT_TYPES = {
+    "application/pdf",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "text/plain"
+}
+# Upload size cap in bytes; handle_document_upload compares it against the
+# length of the fully read file. Keep any limit advertised in API docs in sync.
+MAX_FILE_SIZE = 100 * 1024 * 1024  # 100MB
+async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
+    """Verify Bearer token from Authorization header."""
+    token = credentials.credentials
+    expected_token = os.getenv("MY_SECRET_TOKEN")
+    if not expected_token:
+        logger.error("MY_SECRET_TOKEN not configured")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Server configuration error"
+        )
+    if token != expected_token:
+        logger.warning(f"Invalid token attempt: {token[:10]}...")
+        raise HTTPException(
+            status_code=status.HTTP_403_FORBIDDEN,
+            detail="Invalid or expired token"
+        )
+    return token
+async def handle_rag_query(query: str) -> Dict[str, Any]:
+    """Handle an incoming query by routing it and getting the appropriate answer."""
+    # Input validation
+    if not query or not query.strip():
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Query cannot be empty"
+        )
+    if len(query) > 1000:  # Reasonable limit
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="Query too long. Please limit to 1000 characters."
+        )
+    try:
+        logger.info(f"Processing query: {query[:50]}...")
+        # Process query in thread pool
+        response = await asyncio.to_thread(route_and_process_query, query)
+        logger.info(f"Query processed successfully. Route: {response.get('route', 'Unknown')}")
+        return response
+    except Exception as e:
+        logger.error(f"Error processing query: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Error processing your query. Please try again."
+        )
+async def handle_document_upload(file: UploadFile) -> Dict[str, str]:
+    """Handle uploading a document to the RAG's vector store."""
+    # File validation
+    if not file.filename:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail="No file provided"
+        )
+    if file.content_type not in SUPPORTED_CONTENT_TYPES:
+        raise HTTPException(
+            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
+            detail=f"Unsupported file type: {file.content_type}. "
+                   f"Supported types: {', '.join(SUPPORTED_CONTENT_TYPES)}"
+        )
+    # Check file size
+    contents = await file.read()
+    if len(contents) > MAX_FILE_SIZE:
+        raise HTTPException(
+            status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
+            detail=f"File too large. Maximum size: {MAX_FILE_SIZE / (1024*1024):.1f}MB"
+        )
+    # Reset file pointer
+    await file.seek(0)
+    try:
+        logger.info(f"Processing file upload: {file.filename}")
+        # Extract text from file
+        text = await extract_text_from_file(file)
+        if not text or not text.strip():
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="The file appears to be empty or could not be read."
+            )
+        if len(text) < 50:  # Too short to be meaningful
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="The extracted text is too short to be meaningful."
+            )
+        # Add to RAG system
+        success = await asyncio.to_thread(
+            add_document_to_rag,
+            text,
+            {
+                "source": file.filename,
+                "content_type": file.content_type,
+                "size": len(contents)
+            }
+        )
+        if not success:
+            raise HTTPException(
+                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+                detail="Failed to add document to the knowledge base"
+            )
+        logger.info(f"Successfully processed file: {file.filename}")
+        return {
+            "message": f"Successfully uploaded and processed '{file.filename}'. "
+                      f"It is now available for querying.",
+            "filename": file.filename,
+            "text_length": len(text),
+            "content_type": file.content_type
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error processing file {file.filename}: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Error processing the file. Please try again."
+        )
+async def handle_health_check() -> Dict[str, Any]:
+    """Handle health check requests."""
+    try:
+        health_status = await asyncio.to_thread(check_system_health)
+        if health_status["status"] == "unhealthy":
+            raise HTTPException(
+                status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+                detail="Service is currently unhealthy"
+            )
+        return health_status
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Health check failed: {e}")
+        raise HTTPException(
+            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
+            detail="Health check failed"
+        )

features/rag_chatbot/document_handler.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from io import BytesIO
+from fastapi import UploadFile, HTTPException
+import PyPDF2
+import docx
+async def extract_text_from_file(file: UploadFile) -> str:
+    """Extracts text from various file types."""
+    content = await file.read()
+    file_stream = BytesIO(content)
+    if file.content_type == "application/pdf":
+        return extract_text_from_pdf(file_stream)
+    elif file.content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+        return extract_text_from_docx(file_stream)
+    elif file.content_type == "text/plain":
+        return file_stream.read().decode("utf-8")
+    else:
+        raise HTTPException(
+            status_code=415,
+            detail="Unsupported file type. Please upload a .pdf, .docx, or .txt file."
+        )
+def extract_text_from_pdf(file_stream: BytesIO) -> str:
+    """Extracts text from a PDF file."""
+    reader = PyPDF2.PdfReader(file_stream)
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text() or ""
+    return text
+def extract_text_from_docx(file_stream: BytesIO) -> str:
+    """Extracts text from a DOCX file."""
+    doc = docx.Document(file_stream)
+    text = ""
+    for para in doc.paragraphs:
+        text += para.text + "\n"
+    return text

features/rag_chatbot/rag_pipeline.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import os
+import chromadb
+from dotenv import load_dotenv
+from langchain_core.documents import Document
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_openai import OpenAIEmbeddings, OpenAI
+from langchain.chains.question_answering import load_qa_chain
+from langchain_community.vectorstores import Chroma
+from langchain.chains import LLMChain
+from langchain.prompts import PromptTemplate
+# Load variables from a .env file into the environment before reading settings.
+load_dotenv()
+# Host of the ChromaDB server; defaults to localhost when CHROMA_HOST is unset.
+CHROMA_HOST = os.getenv("CHROMA_HOST", "localhost")
+# Name of the Chroma collection that holds the uploaded document chunks.
+COLLECTION_NAME = "company_docs_collection"
+# Pipeline components; they stay None until initialize_pipelines() populates them.
+vector_store = None
+company_qa_chain = None
+query_router_chain = None
+cybersecurity_chain = None
+# NOTE(review): the LLM client is constructed at import time, outside any
+# try/except; presumably this raises when OpenAI credentials are missing and
+# would make the module import fail. Confirm.
+llm = OpenAI(temperature=0)
+def initialize_pipelines():
+    """Initializes all required models, chains, and the vector store."""
+    global vector_store, company_qa_chain, query_router_chain, cybersecurity_chain, llm
+    try:
+        embeddings = OpenAIEmbeddings()
+        # Initialize ChromaDB client
+        try:
+            chroma_client = chromadb.HttpClient(host=CHROMA_HOST, port=8000)
+            chroma_client.heartbeat() # Heartbeat check to confirm the connection
+            print("Successfully connected to ChromaDB.")
+        except Exception as e:
+            print(f"FATAL: Could not connect to ChromaDB at {CHROMA_HOST}:8000. Please ensure the ChromaDB server is running.")
+            print(f"Error details: {e}")
+            raise ConnectionError("Failed to connect to ChromaDB.") from e
+        # Initialize vector store
+        vector_store = Chroma(
+            client=chroma_client,
+            collection_name=COLLECTION_NAME,
+            embedding_function=embeddings,
+        )
+        # Query Router Chain
+        router_template = """
+        You are a query classifier. Classify the following query into one of these categories:
+        - COMPANY: Questions about company policies, procedures, documents, or internal information
+        - CYBERSECURITY: Questions about cybersecurity, security threats, best practices, or vulnerabilities
+        - OFF_TOPIC: Questions that don't fit the above categories
+        Query: {query}
+        Respond with only the category name (COMPANY, CYBERSECURITY, or OFF_TOPIC):
+        """
+        router_prompt = PromptTemplate(
+            input_variables=["query"],
+            template=router_template
+        )
+        query_router_chain = LLMChain(
+            llm=llm,
+            prompt=router_prompt
+        )
+        # Company QA Chain
+        company_qa_chain = load_qa_chain(llm, chain_type="stuff")
+        # Cybersecurity Chain
+        cybersecurity_template = """
+        You are a cybersecurity expert. Answer the following cybersecurity question based on your knowledge:
+        Question: {question}
+        Provide a comprehensive and accurate answer about cybersecurity:
+        """
+        cybersecurity_prompt = PromptTemplate(
+            input_variables=["question"],
+            template=cybersecurity_template
+        )
+        cybersecurity_chain = LLMChain(
+            llm=llm,
+            prompt=cybersecurity_prompt
+        )
+        print("All pipelines initialized successfully!")
+    except Exception as e:
+        print(f"Error initializing pipelines: {e}")
+        raise
+def add_document_to_rag(text: str, metadata: dict):
+    """Splits a document and adds it to the ChromaDB index."""
+    global vector_store
+    if not vector_store:
+        initialize_pipelines()
+    try:
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=200
+        )
+        docs = text_splitter.create_documents([text], metadatas=[metadata])
+        if not docs:
+            print("Document was empty after splitting, not adding to ChromaDB.")
+            return False
+        print(f"Adding {len(docs)} document chunks to ChromaDB...")
+        vector_store.add_documents(docs)
+        print("Successfully added documents.")
+        return True
+    except Exception as e:
+        print(f"Error adding document to RAG: {e}")
+        return False
+def route_and_process_query(query: str):
+    """Routes the query and processes it using the appropriate pipeline."""
+    global query_router_chain, vector_store, company_qa_chain, cybersecurity_chain
+    if not all([query_router_chain, vector_store, company_qa_chain, cybersecurity_chain]):
+        initialize_pipelines()
+    try:
+        # 1. Classify the query
+        route_result = query_router_chain.run(query)
+        route = route_result.strip().upper()
+        print(f"Query routed to: {route}")
+        # 2. Route to appropriate logic
+        if "CYBERSECURITY" in route:
+            answer = cybersecurity_chain.run(question=query)
+            return {
+                "answer": answer,
+                "source": "Cybersecurity Knowledge Base",
+                "route": "CYBERSECURITY"
+            }
+        elif "COMPANY" in route:
+            # Perform similarity search on ChromaDB
+            docs = vector_store.similarity_search(query, k=3)
+            print(f"Found {len(docs)} relevant documents.")
+            print(f"Documents: {[doc.metadata.get('source', 'Unknown') for doc in docs]}")
+            if not docs:
+                return {
+                    "answer": "I could not find any relevant information to answer your question.",
+                    "source": "Company Documents",
+                    "route": "COMPANY"
+                }
+            # Run the QA chain
+            answer = company_qa_chain.run(input_documents=docs, question=query)
+            sources = list(set([doc.metadata.get("source", "Unknown") for doc in docs]))
+            return {
+                "answer": answer,
+                "source": "Company Documents",
+                "documents": sources,
+                "route": "COMPANY"
+            }
+        else:  # OFF_TOPIC
+            return {
+                "answer": "I am a specialized assistant of CyberAlertNepal. I cannot answer questions outside of cybersecurity topics.",
+                "source": "N/A",
+                "route": "OFF_TOPIC"
+            }
+    except Exception as e:
+        print(f"Error processing query: {e}")
+        return {
+            "answer": "I encountered an error while processing your query. Please try again.",
+            "source": "Error",
+            "error": str(e)
+        }
+def check_system_health():
+    """Check if all components are properly initialized."""
+    try:
+        # Test ChromaDB connection
+        if vector_store:
+            vector_store._client.heartbeat()
+        # Test if all chains are initialized
+        components = {
+            "vector_store": vector_store is not None,
+            "company_qa_chain": company_qa_chain is not None,
+            "query_router_chain": query_router_chain is not None,
+            "cybersecurity_chain": cybersecurity_chain is not None
+        }
+        return {
+            "status": "healthy" if all(components.values()) else "unhealthy",
+            "components": components
+        }
+    except Exception as e:
+        return {
+            "status": "unhealthy",
+            "error": str(e)
+        }
+# Initialize pipelines on module import
+try:
+    initialize_pipelines()
+except Exception as e:
+    print(f"Failed to initialize pipelines on startup: {e}")

features/rag_chatbot/routes.py ADDED Viewed

	@@ -0,0 +1,111 @@

+from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Request
+from fastapi.security import HTTPBearer
+from pydantic import BaseModel, Field
+from slowapi.util import get_remote_address
+from slowapi import Limiter
+from typing import Optional
+from config import ACCESS_RATE
+from .controller import (
+    handle_rag_query,
+    handle_document_upload,
+    handle_health_check,
+    verify_token,
+)
+limiter = Limiter(key_func=get_remote_address)
+router = APIRouter(prefix="/rag", tags=["RAG Chatbot"])
+security = HTTPBearer()
+class QueryInput(BaseModel):
+    """Request body for the question endpoint."""
+    # Question text; validation requires 1 to 1000 characters.
+    query: str = Field(..., min_length=1, max_length=1000, description="The question to ask")
+class QueryResponse(BaseModel):
+    """Response body for the question endpoint."""
+    # Answer text.
+    answer: str
+    # Label describing where the answer came from.
+    source: str
+    # Routing category chosen for the query, when one is reported.
+    route: Optional[str] = None
+    # Sources of the documents used for the answer, when reported.
+    documents: Optional[list] = None
+    # Error text, when the pipeline reports a failure.
+    error: Optional[str] = None
+class UploadResponse(BaseModel):
+    """Response body for the upload endpoint."""
+    # Human-readable confirmation message.
+    message: str
+    # Name of the uploaded file.
+    filename: str
+    # Length of the extracted text (an integer count).
+    text_length: int
+    # Content type of the uploaded file.
+    content_type: str
+class HealthResponse(BaseModel):
+    """Response body for the health endpoint."""
+    # Overall status string.
+    status: str
+    # Per-component details, when reported.
+    components: Optional[dict] = None
+    # Error text, when the health check itself failed.
+    error: Optional[str] = None
+@router.post("/question", response_model=QueryResponse)
+@limiter.limit(ACCESS_RATE)
+async def ask_question(
+    request: Request,
+    data: QueryInput,
+    token: str = Depends(verify_token)
+) -> QueryResponse:
+    """
+    Ask a question to the RAG chatbot.
+    The chatbot can answer:
+    - Company-related questions (based on uploaded documents)
+    - Cybersecurity questions (from knowledge base)
+    """
+    response = await handle_rag_query(data.query)
+    return QueryResponse(**response)
+@router.post("/upload", response_model=UploadResponse)
+@limiter.limit(ACCESS_RATE)
+async def upload_document(
+    request: Request,
+    file: UploadFile = File(..., description="Document file (PDF, DOCX, or TXT)"),
+    token: str = Depends(verify_token)
+) -> UploadResponse:
+    """
+    Upload a document to the company knowledge base.
+    Supported formats:
+    - PDF (.pdf)
+    - Word documents (.docx)
+    - Plain text (.txt)
+    Maximum file size: 10MB
+    """
+    response = await handle_document_upload(file)
+    return UploadResponse(**response)
+@router.get("/health", response_model=HealthResponse)
+@limiter.limit(ACCESS_RATE)
+async def health_check(request: Request) -> HealthResponse:
+    """
+    Check the health status of the RAG system.
+    Returns the status of all components:
+    - ChromaDB connection
+    - Vector store
+    - AI chains
+    """
+    response = await handle_health_check()
+    return HealthResponse(**response)
+@router.get("/info")
+@limiter.limit(ACCESS_RATE)
+async def get_system_info(request: Request):
+    """Get information about the RAG system capabilities."""
+    return {
+        "name": "RAG Chatbot",
+        "version": "1.0.0",
+        "description": "A specialized chatbot for cybersecurity and company-related questions",
+        "capabilities": [
+            "Company document Q&A (based on uploaded documents)",
+            "Cybersecurity knowledge and best practices",
+            "Document upload and processing (PDF, DOCX, TXT)"
+        ],
+        "supported_file_types": [
+            "application/pdf",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "text/plain"
+        ],
+        "max_file_size_mb": 10,
+        "max_query_length": 1000
+    }

requirements.txt CHANGED Viewed

@@ -18,3 +18,11 @@ scipy
 fitz
 frontend
 tools

 fitz
 frontend
 tools
+langchain
+langchain-community
+langchain-openai
+faiss-cpu
+PyPDF2
+tiktoken
+chromadb
+langchain_chroma