File size: 1,131 Bytes
5f36465
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import fitz  # PyMuPDF
import uuid
import pandas as pd
import os

def chunk_text(text, chunk_size=500, overlap=100):
    """Split *text* into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the previous one. The final
    chunk may contain fewer than ``chunk_size`` words.

    Args:
        text: The string to split; words are delimited by any whitespace.
        chunk_size: Maximum number of words per chunk. Must be positive.
        overlap: Number of words shared between consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks, each a string of words joined by single spaces.
        Empty or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    # A step of zero makes range() fail with an opaque message, and a
    # negative step would silently produce no chunks at all.
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    # Every start index is < len(words), so each slice holds at least one
    # word and no empty chunk can be produced.
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), step)
    ]

def extract_chunks_from_pdfs(pdf_paths, output_csv_path, max_text_chars=1000):
    """Extract per-page text chunks from PDFs and save them to a CSV file.

    Each PDF is opened with PyMuPDF, the text of every page is split with
    ``chunk_text`` (default chunking parameters), and one row is recorded
    per chunk.

    Args:
        pdf_paths: Iterable of paths to the PDF files to process.
        output_csv_path: Destination path of the CSV file.
        max_text_chars: Maximum number of characters kept in each row's
            ``text`` value; longer chunks are truncated.

    Returns:
        A ``pandas.DataFrame`` with the columns ``chunk_id`` (random UUID4
        string), ``source_doc`` (base name of the PDF), ``page`` (1-based
        page number), ``chunk_index`` (0-based position of the chunk within
        its page) and ``text``. The same table is written to
        ``output_csv_path`` without the index column.
    """
    columns = ["chunk_id", "source_doc", "page", "chunk_index", "text"]
    all_chunks = []

    for file_path in pdf_paths:
        source_doc = os.path.basename(file_path)
        # The context manager closes the document even if a page fails.
        with fitz.open(file_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                for idx, chunk in enumerate(chunk_text(page.get_text())):
                    all_chunks.append({
                        "chunk_id": str(uuid.uuid4()),
                        "source_doc": source_doc,
                        "page": page_num,
                        "chunk_index": idx,
                        # Truncate here rather than on the DataFrame, so the
                        # cap is applied before the frame exists.
                        "text": chunk[:max_text_chars],
                    })

    # Explicit columns keep the schema stable when no chunks were produced.
    df = pd.DataFrame(all_chunks, columns=columns)
    df.to_csv(output_csv_path, index=False)
    return df