Spaces:

PlotweaverAI
/

plotweaver-audiobook

Running

App Files Files Community

Toadoum committed 6 days ago

Commit

e05701a

verified ·

1 Parent(s): 3cd33f6

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -6

app.py CHANGED Viewed

@@ -46,9 +46,98 @@ def extract_text_from_pdf(file_path: str) -> str:
     return text.strip()
 def extract_text_from_docx(file_path: str) -> str:
-    """Extract text from DOCX."""
-    doc = Document(file_path)
-    return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
 def extract_text(file_path: str) -> str:
     """Extract text from uploaded file."""
@@ -56,13 +145,15 @@ def extract_text(file_path: str) -> str:
     if ext == ".pdf":
         return extract_text_from_pdf(file_path)
-    elif ext in [".docx", ".doc"]:
         return extract_text_from_docx(file_path)
     elif ext == ".txt":
-        with open(file_path, "r", encoding="utf-8") as f:
             return f.read()
     else:
-        raise ValueError(f"Unsupported format: {ext}")
 # ============================================
 # LAZY MODEL LOADING

     return text.strip()
 def extract_text_from_docx(file_path: str) -> str:
     """Extract text from a DOCX file, falling back through three methods.

     1. Stream ``word/document.xml`` out of the ZIP container and rebuild the
        text paragraph by paragraph.
     2. Use python-docx (``Document``).
     3. Scan the raw bytes for ``<w:t>`` elements; this can only succeed when
        the file holds uncompressed XML rather than a regular ZIP package.

     Args:
         file_path: Path to the .docx file.

     Returns:
         The document text. Methods 1 and 2 return the non-empty paragraphs
         joined by single newlines; method 3 joins the recovered fragments
         with single spaces.

     Raises:
         ValueError: If none of the methods yields any text.
     """
     import re
     import zipfile
     import xml.etree.ElementTree as ET

     # Method 1: Direct XML extraction (most reliable)
     try:
         paragraphs = []
         current = []  # text fragments of the paragraph being assembled
         with zipfile.ZipFile(file_path, 'r') as archive:
             if 'word/document.xml' in archive.namelist():
                 with archive.open('word/document.xml') as xml_stream:
                     # "end" events arrive in document order, so every run is
                     # seen before the paragraph that contains it is closed.
                     for _, elem in ET.iterparse(xml_stream, events=('end',)):
                         # Compare the local name only; the tag is normally
                         # namespace-qualified ("{...}t").
                         local_name = elem.tag.rsplit('}', 1)[-1]
                         if local_name == 't':
                             if elem.text:
                                 current.append(elem.text)
                         elif local_name in ('br', 'cr'):
                             # Explicit line break inside a run.
                             current.append('\n')
                         elif local_name == 'p':
                             paragraphs.append(''.join(current))
                             current = []
         # Keep any text that was not enclosed in a paragraph element.
         paragraphs.append(''.join(current))
         text = '\n'.join(p for p in paragraphs if p.strip())
         if text.strip():
             return text
     except Exception as e:
         # Deliberately broad: any failure here just means "try the next method".
         print(f"XML extraction failed: {e}")

     # Method 2: Try python-docx
     try:
         doc = Document(file_path)
         text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
         if text.strip():
             return text
     except Exception as e:
         print(f"python-docx failed: {e}")

     # Method 3: Read as binary and extract readable text
     try:
         with open(file_path, 'rb') as f:
             content = f.read()
     except OSError as e:
         print(f"File read failed: {e}")
     else:
         text_parts = []
         # Find text between <w:t> tags
         for match in re.findall(rb'<w:t[^>]*>([^<]+)</w:t>', content):
             try:
                 text_parts.append(match.decode('utf-8'))
             except UnicodeDecodeError:
                 # Skip fragments that are not valid UTF-8 instead of aborting.
                 continue
         if text_parts:
             return ' '.join(text_parts)

     raise ValueError("Could not extract text from this DOCX file. The file may be corrupted or in an unsupported format. Please try:\n1. Open in Word and Save As a new .docx\n2. Convert to PDF\n3. Copy text to a .txt file")
def extract_text_from_doc(file_path: str) -> str:
    """Extract text from a legacy binary .doc file using external converters.

    Runs ``antiword`` and then ``catdoc`` (whichever is installed) and returns
    the output of the first one that exits successfully with non-empty text.

    Args:
        file_path: Path to the .doc file.

    Returns:
        The converter's standard output with surrounding whitespace stripped.

    Raises:
        ValueError: If neither tool is available or neither produced text.
    """
    import subprocess

    for tool in ('antiword', 'catdoc'):
        try:
            result = subprocess.run(
                [tool, file_path],
                capture_output=True,
                text=True,
                # Undecodable bytes in the tool output must not abort extraction.
                errors='replace',
                timeout=30,
            )
        except (OSError, subprocess.TimeoutExpired):
            # OSError covers a missing binary (FileNotFoundError) as well as a
            # non-executable one (PermissionError); either way try the next tool.
            continue
        if result.returncode == 0 and result.stdout.strip():
            return result.stdout.strip()

    raise ValueError("Cannot read .doc files on this server. Please convert to .docx, .pdf, or .txt format.")
 def extract_text(file_path: str) -> str:
     """Extract text from uploaded file."""
     if ext == ".pdf":
         return extract_text_from_pdf(file_path)
+    elif ext == ".docx":
         return extract_text_from_docx(file_path)
+    elif ext == ".doc":
+        return extract_text_from_doc(file_path)
     elif ext == ".txt":
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
             return f.read()
     else:
+        raise ValueError(f"Unsupported format: {ext}. Please use PDF, DOCX, or TXT.")
 # ============================================
 # LAZY MODEL LOADING