PDF-Chatbot / pdf_utils.py
Muzenda-K
Fresh initial commit
40ab55e
raw
history blame contribute delete
394 Bytes
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import PyPDF2
def extract_text(file_stream):
    """Return the text of every page in a PDF as one newline-joined string.

    Args:
        file_stream: Source handed unchanged to ``PyPDF2.PdfReader``
            (presumably an open binary stream or a path -- confirm
            against callers).

    Returns:
        The result of ``extract_text()`` for each page, in page order,
        separated by a single newline character.
    """
    pdf = PyPDF2.PdfReader(file_stream)
    page_texts = [pg.extract_text() for pg in pdf.pages]
    return "\n".join(page_texts)
def chunk_text(text, chunk_size=1000, overlap=200):
    """Split *text* into fixed-size, overlapping character windows.

    Each window starts ``chunk_size - overlap`` characters after the
    previous one, so consecutive chunks share ``overlap`` characters.
    The last chunk(s) may be shorter than ``chunk_size`` because slicing
    stops at the end of the text.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order of appearance. Empty input yields ``[]``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # overlap == chunk_size would make the stride zero (range() rejects it),
    # overlap > chunk_size would make it negative (range() is then empty and
    # all text is silently dropped), and a negative overlap would leave gaps
    # of unchunked text between windows.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    stride = chunk_size - overlap
    return [
        text[start:start + chunk_size]
        for start in range(0, len(text), stride)
    ]