Sangyog10 committed on
Commit
29fbb51
·
1 Parent(s): 72b7684

set up rag pipeline for chatbot

Browse files
.gitignore CHANGED
@@ -66,3 +66,6 @@ notebooks
66
  np_text_model/classifier/sentencepiece.bpe.model
67
  np_text_model/classifier/tokenizer.json
68
 
 
 
 
 
66
  np_text_model/classifier/sentencepiece.bpe.model
67
  np_text_model/classifier/tokenizer.json
68
 
69
+ # vector database
70
+ chroma_data
71
+ chroma_database
README.md CHANGED
@@ -119,7 +119,9 @@ AI-Checker/
119
  2. **Run the API**
120
 
121
  ```bash
122
- uvicorn app:app --reload
 
 
123
  ```
124
 
125
  3. **Build Docker (optional)**
 
119
  2. **Run the API**
120
 
121
  ```bash
122
+ chroma run --path ./chroma_database ## to run chromadb locally
123
+ uvicorn app:app --reload --port 8001 ## fastapi (run after chromadb)
124
+
125
  ```
126
 
127
  3. **Build Docker (optional)**
app.py CHANGED
@@ -11,6 +11,7 @@ from features.nepali_text_classifier.routes import (
11
  )
12
  from features.image_classifier.routes import router as image_classifier_router
13
  from features.image_edit_detector.routes import router as image_edit_detector_router
 
14
  from fastapi.staticfiles import StaticFiles
15
 
16
  from config import ACCESS_RATE
@@ -41,6 +42,8 @@ app.include_router(text_classifier_router, prefix="/text")
41
  app.include_router(nepali_text_classifier_router, prefix="/NP")
42
  app.include_router(image_classifier_router, prefix="/AI-image")
43
  app.include_router(image_edit_detector_router, prefix="/detect")
 
 
44
 
45
 
46
  @app.get("/")
 
11
  )
12
  from features.image_classifier.routes import router as image_classifier_router
13
  from features.image_edit_detector.routes import router as image_edit_detector_router
14
+ from features.rag_chatbot.routes import router as rag_router
15
  from fastapi.staticfiles import StaticFiles
16
 
17
  from config import ACCESS_RATE
 
42
  app.include_router(nepali_text_classifier_router, prefix="/NP")
43
  app.include_router(image_classifier_router, prefix="/AI-image")
44
  app.include_router(image_edit_detector_router, prefix="/detect")
45
+ app.include_router(rag_router, prefix="/rag")
46
+
47
 
48
 
49
  @app.get("/")
features/rag_chatbot/__init__.py ADDED
File without changes
features/rag_chatbot/controller.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ import logging
4
+ from io import BytesIO
5
+ from typing import Dict, Any
6
+
7
+ from fastapi import HTTPException, UploadFile, status, Depends
8
+ from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
9
+
10
+ from .rag_pipeline import route_and_process_query, add_document_to_rag, check_system_health
11
+ from .document_handler import extract_text_from_file
12
+
13
# Configure logging
# NOTE(review): this runs at import time and configures the root logger for
# the whole process, not just this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Bearer-token scheme; verify_token depends on it to obtain the credentials
# from the Authorization header.
security = HTTPBearer()

# Supported file types (MIME content types accepted by handle_document_upload)
SUPPORTED_CONTENT_TYPES = {
    "application/pdf",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "text/plain"
}

# Upper bound, in bytes, on the size of an uploaded document.
MAX_FILE_SIZE = 100 * 1024 * 1024  # 100MB
27
+
28
async def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """Verify the Bearer token from the Authorization header.

    Args:
        credentials: Credentials extracted by the ``HTTPBearer`` dependency.

    Returns:
        The presented token string when it matches ``MY_SECRET_TOKEN``.

    Raises:
        HTTPException: 500 when ``MY_SECRET_TOKEN`` is not configured,
            403 when the presented token does not match it.
    """
    import hmac  # local import keeps this block self-contained

    token = credentials.credentials
    expected_token = os.getenv("MY_SECRET_TOKEN")

    if not expected_token:
        logger.error("MY_SECRET_TOKEN not configured")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Server configuration error"
        )

    # Constant-time comparison avoids leaking how many leading characters
    # matched. Compare bytes so non-ASCII input cannot raise TypeError.
    tokens_match = hmac.compare_digest(
        token.encode("utf-8"), expected_token.encode("utf-8")
    )
    if not tokens_match:
        # Never write any part of a presented credential to the logs: a
        # near-miss attempt would expose a prefix of the real secret.
        logger.warning("Invalid token attempt")
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Invalid or expired token"
        )
    return token
47
+
48
async def handle_rag_query(query: str) -> Dict[str, Any]:
    """Validate a user query and run it through the RAG pipeline.

    Args:
        query: The user's question.

    Returns:
        The dictionary produced by ``route_and_process_query``.

    Raises:
        HTTPException: 400 when the query is empty/blank or longer than
            1000 characters; 500 when the pipeline raises.
    """
    # Input validation
    if not query or not query.strip():
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Query cannot be empty"
        )

    if len(query) > 1000:  # Reasonable limit
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Query too long. Please limit to 1000 characters."
        )

    try:
        logger.info(f"Processing query: {query[:50]}...")

        # The pipeline call is synchronous; run it in a worker thread so the
        # event loop is not blocked.
        response = await asyncio.to_thread(route_and_process_query, query)

        logger.info(f"Query processed successfully. Route: {response.get('route', 'Unknown')}")
        return response

    except Exception as e:
        # logger.exception records the traceback; the client only receives a
        # generic message.
        logger.exception(f"Error processing query: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Error processing your query. Please try again."
        ) from e
79
+
80
async def handle_document_upload(file: UploadFile) -> Dict[str, Any]:
    """Validate an uploaded document, extract its text and index it.

    Args:
        file: The uploaded file (PDF, DOCX or plain text).

    Returns:
        A dictionary with ``message``, ``filename``, ``text_length`` (int)
        and ``content_type``.

    Raises:
        HTTPException: 400 for a missing filename or empty/too-short text,
            415 for an unsupported content type, 413 when the file exceeds
            ``MAX_FILE_SIZE``, 500 when extraction or indexing fails.
    """
    # File validation
    if not file.filename:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="No file provided"
        )

    if file.content_type not in SUPPORTED_CONTENT_TYPES:
        # sorted() keeps the message stable; iterating a set directly yields
        # an arbitrary order.
        raise HTTPException(
            status_code=status.HTTP_415_UNSUPPORTED_MEDIA_TYPE,
            detail=f"Unsupported file type: {file.content_type}. "
                   f"Supported types: {', '.join(sorted(SUPPORTED_CONTENT_TYPES))}"
        )

    too_large = HTTPException(
        status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
        detail=f"File too large. Maximum size: {MAX_FILE_SIZE / (1024*1024):.1f}MB"
    )

    # When the framework already knows the size, reject oversized uploads
    # before loading them into memory. getattr: the attribute may be absent
    # on older UploadFile implementations.
    declared_size = getattr(file, "size", None)
    if declared_size is not None and declared_size > MAX_FILE_SIZE:
        raise too_large

    # Check file size
    contents = await file.read()
    if len(contents) > MAX_FILE_SIZE:
        raise too_large

    # Reset file pointer so the extractor can read the file from the start
    await file.seek(0)

    try:
        logger.info(f"Processing file upload: {file.filename}")

        # Extract text from file
        text = await extract_text_from_file(file)

        if not text or not text.strip():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="The file appears to be empty or could not be read."
            )

        if len(text) < 50:  # Too short to be meaningful
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="The extracted text is too short to be meaningful."
            )

        # Add to RAG system (synchronous call, so run it in a worker thread)
        success = await asyncio.to_thread(
            add_document_to_rag,
            text,
            {
                "source": file.filename,
                "content_type": file.content_type,
                "size": len(contents)
            }
        )

        if not success:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail="Failed to add document to the knowledge base"
            )

        logger.info(f"Successfully processed file: {file.filename}")

        return {
            "message": f"Successfully uploaded and processed '{file.filename}'. "
                       f"It is now available for querying.",
            "filename": file.filename,
            "text_length": len(text),
            "content_type": file.content_type
        }

    except HTTPException:
        # Deliberate client/server errors raised above pass through untouched.
        raise
    except Exception as e:
        logger.exception(f"Error processing file {file.filename}: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Error processing the file. Please try again."
        ) from e
161
+
162
async def handle_health_check() -> Dict[str, Any]:
    """Run the pipeline health probe and translate failures into HTTP 503.

    Returns:
        The status dictionary from ``check_system_health`` when its
        ``status`` is anything other than ``"unhealthy"``.

    Raises:
        HTTPException: 503 when the probe reports ``"unhealthy"`` or when
            the probe itself raises.
    """
    try:
        report = await asyncio.to_thread(check_system_health)
        is_unhealthy = report["status"] == "unhealthy"
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Health check failed: {exc}")
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Health check failed"
        )

    if is_unhealthy:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Service is currently unhealthy"
        )
    return report
features/rag_chatbot/document_handler.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from io import BytesIO
2
+ from fastapi import UploadFile, HTTPException
3
+ import PyPDF2
4
+ import docx
5
+
6
async def extract_text_from_file(file: UploadFile) -> str:
    """Extract plain text from an uploaded PDF, DOCX or text file.

    Dispatches on ``file.content_type``.

    Args:
        file: The uploaded file; it is read in full from its current position.

    Returns:
        The extracted text.

    Raises:
        HTTPException: 415 for an unsupported content type; 400 when a
            ``text/plain`` upload is not valid UTF-8.
    """
    content = await file.read()

    if file.content_type == "application/pdf":
        return extract_text_from_pdf(BytesIO(content))
    elif file.content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        return extract_text_from_docx(BytesIO(content))
    elif file.content_type == "text/plain":
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError as exc:
            # A badly encoded upload is a client error, not a server fault.
            raise HTTPException(
                status_code=400,
                detail="The text file could not be decoded as UTF-8."
            ) from exc
    else:
        raise HTTPException(
            status_code=415,
            detail="Unsupported file type. Please upload a .pdf, .docx, or .txt file."
        )
22
+
23
def extract_text_from_pdf(file_stream: BytesIO) -> str:
    """Return the concatenated text of every page in a PDF stream.

    Pages for which extraction yields nothing contribute an empty string.
    """
    pdf = PyPDF2.PdfReader(file_stream)
    return "".join(page.extract_text() or "" for page in pdf.pages)
30
+
31
def extract_text_from_docx(file_stream: BytesIO) -> str:
    """Return the text of a DOCX stream, one paragraph per line.

    Every paragraph, including the last, is followed by a newline.
    """
    document = docx.Document(file_stream)
    return "".join(f"{paragraph.text}\n" for paragraph in document.paragraphs)
features/rag_chatbot/rag_pipeline.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import chromadb
3
+ from dotenv import load_dotenv
4
+ from langchain_core.documents import Document
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from langchain_openai import OpenAIEmbeddings, OpenAI
7
+ from langchain.chains.question_answering import load_qa_chain
8
+ from langchain_community.vectorstores import Chroma
9
+ from langchain.chains import LLMChain
10
+ from langchain.prompts import PromptTemplate
11
+
12
# Load variables from a .env file into the process environment, if present.
load_dotenv()

# Host of the ChromaDB HTTP server; defaults to a local instance.
CHROMA_HOST = os.getenv("CHROMA_HOST", "localhost")
# Name of the Chroma collection that stores the uploaded document chunks.
COLLECTION_NAME = "company_docs_collection"

# Pipeline components. They stay None until initialize_pipelines() succeeds.
vector_store = None
company_qa_chain = None
query_router_chain = None
cybersecurity_chain = None
# Shared LLM used by all chains; temperature=0 is passed to the constructor.
# NOTE(review): this is constructed at import time — presumably it needs the
# OpenAI credentials to be available already; confirm.
llm = OpenAI(temperature=0)
22
+
23
def initialize_pipelines():
    """Initialize the vector store and all LLM chains.

    Connects to the ChromaDB HTTP server at ``CHROMA_HOST`` on the port given
    by the ``CHROMA_PORT`` environment variable (default 8000), then builds
    the query-router, company-QA and cybersecurity chains. The module globals
    are assigned only after every component has been built, so a failure
    part-way through never leaves a half-initialized state behind.

    Raises:
        ConnectionError: If the ChromaDB server cannot be reached.
        Exception: Any other initialization error is printed and re-raised.
    """
    global vector_store, company_qa_chain, query_router_chain, cybersecurity_chain

    try:
        embeddings = OpenAIEmbeddings()

        # Port is configurable; 8000 remains the default.
        chroma_port = int(os.getenv("CHROMA_PORT", "8000"))

        # Initialize ChromaDB client
        try:
            chroma_client = chromadb.HttpClient(host=CHROMA_HOST, port=chroma_port)
            chroma_client.heartbeat()  # Heartbeat check to confirm the connection
            print("Successfully connected to ChromaDB.")
        except Exception as e:
            print(f"FATAL: Could not connect to ChromaDB at {CHROMA_HOST}:{chroma_port}. Please ensure the ChromaDB server is running.")
            print(f"Error details: {e}")
            raise ConnectionError("Failed to connect to ChromaDB.") from e

        # Initialize vector store
        new_vector_store = Chroma(
            client=chroma_client,
            collection_name=COLLECTION_NAME,
            embedding_function=embeddings,
        )

        # Query Router Chain
        router_template = """
        You are a query classifier. Classify the following query into one of these categories:
        - COMPANY: Questions about company policies, procedures, documents, or internal information
        - CYBERSECURITY: Questions about cybersecurity, security threats, best practices, or vulnerabilities
        - OFF_TOPIC: Questions that don't fit the above categories

        Query: {query}

        Respond with only the category name (COMPANY, CYBERSECURITY, or OFF_TOPIC):
        """

        router_prompt = PromptTemplate(
            input_variables=["query"],
            template=router_template
        )

        new_router_chain = LLMChain(
            llm=llm,
            prompt=router_prompt
        )

        # Company QA Chain
        new_company_chain = load_qa_chain(llm, chain_type="stuff")

        # Cybersecurity Chain
        cybersecurity_template = """
        You are a cybersecurity expert. Answer the following cybersecurity question based on your knowledge:

        Question: {question}

        Provide a comprehensive and accurate answer about cybersecurity:
        """

        cybersecurity_prompt = PromptTemplate(
            input_variables=["question"],
            template=cybersecurity_template
        )

        new_cybersecurity_chain = LLMChain(
            llm=llm,
            prompt=cybersecurity_prompt
        )

        # Publish all components together now that every step has succeeded.
        vector_store = new_vector_store
        query_router_chain = new_router_chain
        company_qa_chain = new_company_chain
        cybersecurity_chain = new_cybersecurity_chain

        print("All pipelines initialized successfully!")

    except Exception as e:
        print(f"Error initializing pipelines: {e}")
        raise
96
+
97
def add_document_to_rag(text: str, metadata: dict):
    """Split a document into chunks and add them to the ChromaDB index.

    Args:
        text: Full document text.
        metadata: Metadata attached to the chunks created from ``text``.

    Returns:
        True when the chunks were added; False when splitting produced no
        chunks or when adding them failed.

    Raises:
        Exception: Whatever ``initialize_pipelines`` raises when the vector
            store has to be initialized first and that fails.
    """
    global vector_store

    # Compare against None explicitly. Truthiness is the wrong test here: if
    # the store object defines __len__ (the Chroma wrapper may report its
    # document count that way), an empty collection would look "missing" and
    # trigger a full re-initialization on every call.
    if vector_store is None:
        initialize_pipelines()

    try:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200
        )
        docs = text_splitter.create_documents([text], metadatas=[metadata])

        if not docs:
            print("Document was empty after splitting, not adding to ChromaDB.")
            return False

        print(f"Adding {len(docs)} document chunks to ChromaDB...")
        vector_store.add_documents(docs)
        print("Successfully added documents.")
        return True

    except Exception as e:
        # Best-effort contract: failures are reported through the return value.
        print(f"Error adding document to RAG: {e}")
        return False
123
+
124
def route_and_process_query(query: str):
    """Classify a query and answer it with the matching pipeline.

    The router chain labels the query. A label containing ``CYBERSECURITY``
    is answered by the cybersecurity chain; one containing ``COMPANY`` is
    answered from the three most similar stored chunks; anything else gets a
    fixed off-topic reply.

    Args:
        query: The user's question.

    Returns:
        A dictionary with ``answer`` and ``source``, plus ``route`` on
        success, ``documents`` for company answers, and ``error`` when
        processing failed.

    Raises:
        Exception: Whatever ``initialize_pipelines`` raises when a component
            is missing and initialization fails.
    """
    global query_router_chain, vector_store, company_qa_chain, cybersecurity_chain

    # Test for None rather than truthiness: a store object that defines
    # __len__ (the Chroma wrapper may) is falsy while its collection is empty,
    # which would force a re-initialization on every query.
    required = (query_router_chain, vector_store, company_qa_chain, cybersecurity_chain)
    if any(component is None for component in required):
        initialize_pipelines()

    try:
        # 1. Classify the query
        route_result = query_router_chain.run(query)
        route = route_result.strip().upper()

        print(f"Query routed to: {route}")

        # 2. Route to appropriate logic
        if "CYBERSECURITY" in route:
            answer = cybersecurity_chain.run(question=query)
            return {
                "answer": answer,
                "source": "Cybersecurity Knowledge Base",
                "route": "CYBERSECURITY"
            }

        elif "COMPANY" in route:
            # Perform similarity search on ChromaDB
            docs = vector_store.similarity_search(query, k=3)
            print(f"Found {len(docs)} relevant documents.")
            print(f"Documents: {[doc.metadata.get('source', 'Unknown') for doc in docs]}")

            if not docs:
                return {
                    "answer": "I could not find any relevant information to answer your question.",
                    "source": "Company Documents",
                    "route": "COMPANY"
                }

            # Run the QA chain
            answer = company_qa_chain.run(input_documents=docs, question=query)
            # De-duplicate while keeping retrieval order; a set would return
            # the sources in arbitrary order.
            sources = list(dict.fromkeys(
                doc.metadata.get("source", "Unknown") for doc in docs
            ))

            return {
                "answer": answer,
                "source": "Company Documents",
                "documents": sources,
                "route": "COMPANY"
            }

        else:  # OFF_TOPIC
            return {
                "answer": "I am a specialized assistant of CyberAlertNepal. I cannot answer questions outside of cybersecurity topics.",
                "source": "N/A",
                "route": "OFF_TOPIC"
            }

    except Exception as e:
        print(f"Error processing query: {e}")
        return {
            "answer": "I encountered an error while processing your query. Please try again.",
            "source": "Error",
            "error": str(e)
        }
185
+
186
def check_system_health():
    """Report whether every pipeline component is initialized and reachable.

    Returns:
        ``{"status": ..., "components": {...}}`` where status is "healthy"
        only when all four components are set, or
        ``{"status": "unhealthy", "error": ...}`` when the probe raises
        (for example when the ChromaDB heartbeat fails).
    """
    try:
        # Test ChromaDB connection. Use an explicit None check: truthiness
        # could skip the heartbeat for a store that is falsy while empty.
        if vector_store is not None:
            vector_store._client.heartbeat()

        # Test if all chains are initialized
        components = {
            "vector_store": vector_store is not None,
            "company_qa_chain": company_qa_chain is not None,
            "query_router_chain": query_router_chain is not None,
            "cybersecurity_chain": cybersecurity_chain is not None
        }

        return {
            "status": "healthy" if all(components.values()) else "unhealthy",
            "components": components
        }

    except Exception as e:
        return {
            "status": "unhealthy",
            "error": str(e)
        }
211
+
212
# Initialize pipelines on module import
# Best-effort: a failure is only printed so the import still succeeds.
# add_document_to_rag and route_and_process_query call initialize_pipelines()
# again when they find components missing.
try:
    initialize_pipelines()
except Exception as e:
    print(f"Failed to initialize pipelines on startup: {e}")
features/rag_chatbot/routes.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Request
2
+ from fastapi.security import HTTPBearer
3
+ from pydantic import BaseModel, Field
4
+ from slowapi.util import get_remote_address
5
+ from slowapi import Limiter
6
+ from typing import Optional
7
+ from config import ACCESS_RATE
8
+ from .controller import (
9
+ handle_rag_query,
10
+ handle_document_upload,
11
+ handle_health_check,
12
+ verify_token,
13
+ )
14
+
15
# Rate limiter keyed by the client's remote address.
limiter = Limiter(key_func=get_remote_address)
# No prefix here: app.py mounts this router with prefix="/rag". Declaring the
# prefix in both places would expose the endpoints under /rag/rag/...
router = APIRouter(tags=["RAG Chatbot"])
security = HTTPBearer()
18
+
19
# Request body for the POST /question endpoint.
class QueryInput(BaseModel):
    # Required; constrained to between 1 and 1000 characters.
    query: str = Field(..., min_length=1, max_length=1000, description="The question to ask")
21
+
22
# Response body for the POST /question endpoint.
class QueryResponse(BaseModel):
    # Answer text returned to the user.
    answer: str
    # Label describing where the answer came from.
    source: str
    # Route label chosen for the query; optional.
    route: Optional[str] = None
    # Optional list of contributing documents.
    documents: Optional[list] = None
    # Optional error description.
    error: Optional[str] = None
28
+
29
# Response body for the POST /upload endpoint.
class UploadResponse(BaseModel):
    # Human-readable confirmation message.
    message: str
    # Name of the uploaded file.
    filename: str
    # Length of the extracted text (an integer count).
    text_length: int
    # Content type of the uploaded file.
    content_type: str
34
+
35
# Response body for the GET /health endpoint.
class HealthResponse(BaseModel):
    # Overall status string.
    status: str
    # Optional per-component details.
    components: Optional[dict] = None
    # Optional error description.
    error: Optional[str] = None
39
+
40
@router.post("/question", response_model=QueryResponse)
@limiter.limit(ACCESS_RATE)
async def ask_question(
    request: Request,
    data: QueryInput,
    token: str = Depends(verify_token)
) -> QueryResponse:
    """
    Ask a question to the RAG chatbot.

    The chatbot can answer:
    - Company-related questions (based on uploaded documents)
    - Cybersecurity questions (from knowledge base)
    """
    # `token` is unused in the body; the dependency only enforces authentication.
    # `request` is unused here too — presumably the limiter decorator needs it.
    answer_payload = await handle_rag_query(data.query)
    return QueryResponse(**answer_payload)
56
+
57
@router.post("/upload", response_model=UploadResponse)
@limiter.limit(ACCESS_RATE)
async def upload_document(
    request: Request,
    file: UploadFile = File(..., description="Document file (PDF, DOCX, or TXT)"),
    token: str = Depends(verify_token)
) -> UploadResponse:
    """
    Upload a document to the company knowledge base.

    Supported formats:
    - PDF (.pdf)
    - Word documents (.docx)
    - Plain text (.txt)

    Maximum file size: 100MB
    """
    # The size stated above mirrors the limit the controller enforces
    # (MAX_FILE_SIZE = 100 * 1024 * 1024 bytes).
    response = await handle_document_upload(file)
    return UploadResponse(**response)
76
+
77
@router.get("/health", response_model=HealthResponse)
@limiter.limit(ACCESS_RATE)
async def health_check(request: Request) -> HealthResponse:
    """
    Check the health status of the RAG system.

    Returns the status of all components:
    - ChromaDB connection
    - Vector store
    - AI chains
    """
    # No token dependency: this endpoint is reachable without authentication.
    health_payload = await handle_health_check()
    return HealthResponse(**health_payload)
90
+
91
@router.get("/info")
@limiter.limit(ACCESS_RATE)
async def get_system_info(request: Request):
    """Get information about the RAG system capabilities."""
    # Report the limits the controller actually enforces instead of repeating
    # them as literals, which had drifted (10 here vs. 100MB enforced).
    from .controller import MAX_FILE_SIZE, SUPPORTED_CONTENT_TYPES

    return {
        "name": "RAG Chatbot",
        "version": "1.0.0",
        "description": "A specialized chatbot for cybersecurity and company-related questions",
        "capabilities": [
            "Company document Q&A (based on uploaded documents)",
            "Cybersecurity knowledge and best practices",
            "Document upload and processing (PDF, DOCX, TXT)"
        ],
        # sorted() gives a stable order for the set of accepted MIME types.
        "supported_file_types": sorted(SUPPORTED_CONTENT_TYPES),
        "max_file_size_mb": MAX_FILE_SIZE // (1024 * 1024),
        "max_query_length": 1000
    }
requirements.txt CHANGED
@@ -18,3 +18,11 @@ scipy
18
  fitz
19
  frontend
20
  tools
 
 
 
 
 
 
 
 
 
18
  fitz
19
  frontend
20
  tools
21
+ langchain
22
+ langchain-community
23
+ langchain-openai
24
+ faiss-cpu
25
+ PyPDF2
26
+ tiktoken
27
+ chromadb
28
+ langchain_chroma