Toadoum committed on
Commit
e05701a
·
verified ·
1 Parent(s): 3cd33f6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -6
app.py CHANGED
@@ -46,9 +46,98 @@ def extract_text_from_pdf(file_path: str) -> str:
46
  return text.strip()
47
 
48
  def extract_text_from_docx(file_path: str) -> str:
49
- """Extract text from DOCX."""
50
- doc = Document(file_path)
51
- return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
  def extract_text(file_path: str) -> str:
54
  """Extract text from uploaded file."""
@@ -56,13 +145,15 @@ def extract_text(file_path: str) -> str:
56
 
57
  if ext == ".pdf":
58
  return extract_text_from_pdf(file_path)
59
- elif ext in [".docx", ".doc"]:
60
  return extract_text_from_docx(file_path)
 
 
61
  elif ext == ".txt":
62
- with open(file_path, "r", encoding="utf-8") as f:
63
  return f.read()
64
  else:
65
- raise ValueError(f"Unsupported format: {ext}")
66
 
67
  # ============================================
68
  # LAZY MODEL LOADING
 
46
  return text.strip()
47
 
48
  def extract_text_from_docx(file_path: str) -> str:
49
+ """Extract text from DOCX with multiple fallback methods."""
50
+ import zipfile
51
+ import xml.etree.ElementTree as ET
52
+
53
+ # Method 1: Direct XML extraction (most reliable)
54
+ try:
55
+ with zipfile.ZipFile(file_path, 'r') as z:
56
+ if 'word/document.xml' in z.namelist():
57
+ xml_content = z.read('word/document.xml')
58
+ tree = ET.fromstring(xml_content)
59
+
60
+ # Extract all text nodes
61
+ texts = []
62
+ for elem in tree.iter():
63
+ if elem.tag.endswith('}t') or elem.tag == 't':
64
+ if elem.text:
65
+ texts.append(elem.text)
66
+
67
+ text = ''.join(texts)
68
+ if text.strip():
69
+ # Add paragraph breaks
70
+ return text.replace(' ', '\n\n')
71
+ except Exception as e:
72
+ print(f"XML extraction failed: {e}")
73
+
74
+ # Method 2: Try python-docx
75
+ try:
76
+ doc = Document(file_path)
77
+ text = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
78
+ if text.strip():
79
+ return text
80
+ except Exception as e:
81
+ print(f"python-docx failed: {e}")
82
+
83
+ # Method 3: Read as binary and extract readable text
84
+ try:
85
+ with open(file_path, 'rb') as f:
86
+ content = f.read()
87
+
88
+ # Try to decode text portions
89
+ text_parts = []
90
+ try:
91
+ # Look for XML text content
92
+ import re
93
+ # Find text between <w:t> tags
94
+ matches = re.findall(b'<w:t[^>]*>([^<]+)</w:t>', content)
95
+ for match in matches:
96
+ try:
97
+ text_parts.append(match.decode('utf-8'))
98
+ except:
99
+ pass
100
+
101
+ if text_parts:
102
+ return ' '.join(text_parts)
103
+ except Exception as e:
104
+ print(f"Binary extraction failed: {e}")
105
+ except Exception as e:
106
+ print(f"File read failed: {e}")
107
+
108
+ raise ValueError("Could not extract text from this DOCX file. The file may be corrupted or in an unsupported format. Please try:\n1. Open in Word and Save As a new .docx\n2. Convert to PDF\n3. Copy text to a .txt file")
109
+
110
def extract_text_from_doc(file_path: str) -> str:
    """Extract text from an old-format .doc file using external converters.

    Tries the ``antiword`` and then the ``catdoc`` command-line tools. The
    first tool that exits with status 0 and prints non-blank output wins.

    Args:
        file_path: Path to the .doc file.

    Returns:
        The stripped standard output of the first tool that succeeds.

    Raises:
        ValueError: If neither tool can be run or neither produces text.
    """
    import subprocess

    for tool in ("antiword", "catdoc"):
        try:
            result = subprocess.run(
                [tool, file_path],
                capture_output=True,
                text=True,
                # Converter output may not match the locale encoding; keep
                # what can be decoded instead of raising.
                errors="replace",
                timeout=30,
            )
        except (OSError, subprocess.TimeoutExpired):
            # OSError covers a missing tool (FileNotFoundError) as well as
            # one that exists but cannot be executed (PermissionError).
            continue

        output = result.stdout.strip()
        if result.returncode == 0 and output:
            return output

    raise ValueError("Cannot read .doc files on this server. Please convert to .docx, .pdf, or .txt format.")
141
 
142
  def extract_text(file_path: str) -> str:
143
  """Extract text from uploaded file."""
 
145
 
146
  if ext == ".pdf":
147
  return extract_text_from_pdf(file_path)
148
+ elif ext == ".docx":
149
  return extract_text_from_docx(file_path)
150
+ elif ext == ".doc":
151
+ return extract_text_from_doc(file_path)
152
  elif ext == ".txt":
153
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
154
  return f.read()
155
  else:
156
+ raise ValueError(f"Unsupported format: {ext}. Please use PDF, DOCX, or TXT.")
157
 
158
  # ============================================
159
  # LAZY MODEL LOADING