Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # In[1]: | |
| import PyPDF2 | |
| def extract_text(file_stream): | |
| reader = PyPDF2.PdfReader(file_stream) | |
| return "\n".join(page.extract_text() for page in reader.pages) | |
| def chunk_text(text, chunk_size=1000, overlap=200): | |
| chunks = [] | |
| for i in range(0, len(text), chunk_size - overlap): | |
| chunks.append(text[i:i+chunk_size]) | |
| return chunks | |