heymenn committed on
Commit
8d410d8
·
1 Parent(s): 6f9eaf4

add docx download to ETSI files

Browse files
Files changed (4) hide show
  1. app.py +23 -0
  2. classes.py +117 -1
  3. static/script.js +64 -9
  4. templates/index.html +7 -0
app.py CHANGED
@@ -347,6 +347,29 @@ def find_document_batch(request: BatchDocRequest):
347
  search_time=time.time()-start_time
348
  )
349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  @app.post('/search', response_model=KeywordResponse, tags=["Content Search"], summary="Search specifications by keywords", responses={
351
  200: {
352
  "description": "Search completed successfully"
 
347
  search_time=time.time()-start_time
348
  )
349
 
350
@app.post("/find/docx", tags=["Document Retrieval"], summary="Download an ETSI specification as DOCX",
          responses={
              200: {"description": "DOCX file streamed directly"},
              400: {"description": "DOCX download only supported for ETSI specifications"},
              404: {"description": "Specification not found or DOCX not available"},
          })
def find_document_docx(request: DocRequest):
    """Return an ETSI specification as a DOCX attachment.

    The request's ``doc_id`` must match ``valid_etsi_spec_format``; otherwise
    a 400 is raised. The download itself is delegated to
    ``etsi_spec_finder.search_document_docx``, which returns either a local
    file path or a human-readable failure message.

    Raises:
        HTTPException: 400 when ``doc_id`` is not an ETSI specification
            number, 404 when the finder reports a failure or the returned
            file does not exist.
    """
    document = request.doc_id
    if not valid_etsi_spec_format.match(document):
        raise HTTPException(status_code=400, detail="DOCX download is only supported for ETSI specifications (e.g. '102 221')")

    result = etsi_spec_finder.search_document_docx(document, request.version)

    # The finder signals failure by returning a message instead of a path.
    # Compare the extension case-insensitively: a path ending in ".DOCX" is
    # still a successful download, not an error message.
    if not result.lower().endswith(".docx"):
        raise HTTPException(status_code=404, detail=result)

    # Never hand FileResponse a path that is not actually on disk; that
    # would surface as a 500 instead of the documented 404.
    if not os.path.isfile(result):
        raise HTTPException(status_code=404, detail=f"Specification {document}: DOCX not available")

    # NOTE(review): the downloaded file is not removed after it is served,
    # so temporary files accumulate; consider a cleanup step.
    return FileResponse(
        result,
        filename=os.path.basename(result),
        media_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
371
+
372
+
373
  @app.post('/search', response_model=KeywordResponse, tags=["Content Search"], summary="Search specifications by keywords", responses={
374
  200: {
375
  "description": "Search completed successfully"
classes.py CHANGED
@@ -148,4 +148,120 @@ class ETSISpecFinder:
148
  if f.endswith(".pdf"):
149
  return url2 + release + "/" + f
150
 
151
- return f"Specification {doc_id} not found"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  if f.endswith(".pdf"):
149
  return url2 + release + "/" + f
150
 
151
+ return f"Specification {doc_id} not found"
152
+
153
def _get_wki_id(self, doc_id: str, version: str = None) -> str:
    """Return the ETSI portal ``wki_id`` for a spec version, or None if not found.

    Args:
        doc_id: Specification number as used in the search query
            (interpolated into ``"ETSI TS <doc_id> v<version>"``).
        version: Version string to look up. When empty or None, the version
            is derived from the folder name of the PDF URL returned by
            ``self.search_document(doc_id)``.

    Returns:
        The ``wki_id`` of the first search hit as a string, or None when the
        version cannot be determined or neither the TS nor the TR query
        yields a result.
    """
    if version:
        version_str = version
    else:
        # Derive the version from the PDF URL.
        pdf_url = self.search_document(doc_id)
        if "not found" in pdf_url.lower():
            return None
        # URL path: .../18.04.00_60/ts_...p.pdf -> the version folder is the
        # second-to-last segment, e.g. "18.04.00_60".
        parts = pdf_url.rstrip("/").split("/")
        try:
            # Both the folder lookup and the parsing are guarded: a URL with
            # fewer than two segments or a folder that is not "a.b.c_*"
            # means "version unknown", not a crash.
            major, minor, patch = parts[-2].split("_")[0].split(".")[:3]
            # int() strips the zero padding: "18.04.00" -> "18.4.0".
            version_str = f"{int(major)}.{int(minor)}.{int(patch)}"
        except (ValueError, IndexError):
            return None

    # The spec may be published as a Technical Specification or a Technical
    # Report; try both prefixes in that order.
    for spec_type in ["TS", "TR"]:
        params = {
            "option": "com_standardssearch",
            "view": "data",
            "format": "json",
            "page": "1",
            "search": f"ETSI {spec_type} {doc_id} v{version_str}",
            "etsiNumber": "1",
            "published": "1",
        }
        try:
            # NOTE(review): verify=False disables TLS certificate checking.
            resp = requests.get("https://www.etsi.org/", params=params,
                                headers=self.headers, verify=False, timeout=15)
            data = resp.json()
            if data and isinstance(data, list):
                return str(data[0]["wki_id"])
        except Exception as e:
            # Deliberately best-effort: any failure for one prefix is logged
            # and the next prefix is tried.
            print(f"Error getting wki_id for {doc_id}: {e}")
    return None
190
+
191
def _authenticate_eol(self, wki_id: str) -> requests.Session:
    """Build a ``requests.Session`` and log it in to the ETSI EOL portal.

    The session first requests the login-redirection page for ``wki_id`` to
    collect cookies, then posts the credentials read from the ``EOL_USER``
    and ``EOL_PASSWORD`` environment variables as JSON. The login response
    is not inspected; the session is returned either way.
    """
    portal_session = requests.Session()
    portal_session.headers.update({"User-Agent": self.headers["User-Agent"]})

    redirect_url = (
        "https://portal.etsi.org/LoginRedirection.aspx"
        f"?ReturnUrl=%2fwebapp%2fprotect%2fNTaccount.asp%3fWki_Id%3d{wki_id}"
        f"&Wki_Id={wki_id}"
    )
    # Seed DNN session cookies before attempting the login.
    portal_session.get(redirect_url, verify=False, timeout=15)

    # Credentials come from the environment; unset variables are sent as null.
    credentials = {
        "username": os.environ.get("EOL_USER"),
        "password": os.environ.get("EOL_PASSWORD"),
    }
    login_headers = {
        "Content-Type": "application/json; charset=UTF-8",
        "Referer": redirect_url,
    }
    # Authenticate via the EOL JSON login endpoint.
    portal_session.post(
        "https://portal.etsi.org/ETSIPages/LoginEOL.ashx",
        data=json.dumps(credentials),
        headers=login_headers,
        verify=False,
        allow_redirects=False,
        timeout=15,
    )
    return portal_session
216
+
217
def search_document_docx(self, doc_id: str, version: str = None) -> str:
    """Download an ETSI spec as DOCX and return the local file path.

    Steps: resolve the portal ``wki_id``, authenticate a session, follow the
    portal's meta-refresh and redirect chain to the page listing the files,
    extract the first ``.docx`` link and save it in the temporary directory.

    Args:
        doc_id: Specification number, e.g. ``"102 221"``.
        version: Optional version; passed through to ``_get_wki_id``.

    Returns:
        The path of the saved file, always ending in lower-case ``.docx``,
        or a message starting with ``"Specification <doc_id>"`` describing
        why the download failed. Callers distinguish the two by the
        ``.docx`` suffix.
    """
    # Function-scope imports: these names are only needed here.
    import tempfile
    from urllib.parse import urljoin, urlparse

    wki_id = self._get_wki_id(doc_id, version)
    if not wki_id:
        return f"Specification {doc_id} not found"

    session = self._authenticate_eol(wki_id)

    # NTaccount.asp -> parse the next URL from the meta-refresh tag.
    r = session.get(
        f"https://portal.etsi.org/webapp/protect/NTaccount.asp?Wki_Id={wki_id}",
        verify=False, timeout=15,
    )
    meta_match = re.search(r'URL=([^"\'\s>]+)', r.text)
    if not meta_match:
        return f"Specification {doc_id}: authentication failed"

    meta_url = meta_match.group(1)
    if not meta_url.startswith("http"):
        meta_url = f"https://portal.etsi.org/webapp/protect/{meta_url}"

    # CheckIdentifier -> expected to answer 302 pointing at copy_file.
    r2 = session.get(meta_url, allow_redirects=False, verify=False, timeout=15)
    copy_location = r2.headers.get("Location")
    if r2.status_code != 302 or not copy_location:
        return f"Specification {doc_id}: download chain failed"

    # urljoin keeps the previous result for root-relative locations and also
    # copes with absolute URLs, which plain concatenation would corrupt.
    copy_url = urljoin("https://portal.etsi.org/", copy_location)
    r3 = session.get(copy_url, allow_redirects=False, verify=False, timeout=15)

    # copy_file may answer with a second redirect.
    if r3.status_code == 302:
        final_location = r3.headers.get("Location")
        if not final_location:
            return f"Specification {doc_id}: download chain failed"
        final_url = urljoin("https://portal.etsi.org/webapp/ewp/", final_location)
        r4 = session.get(final_url, verify=False, timeout=15)
    else:
        r4 = r3

    # Extract the DOCX link (extension matched case-insensitively).
    docx_urls = re.findall(r'href=["\']([^"\']*\.docx)["\']', r4.text, re.IGNORECASE)
    if not docx_urls:
        return f"Specification {doc_id}: DOCX not available"

    # The href may be relative to the listing page.
    docx_url = urljoin(r4.url, docx_urls[0])

    dl = session.get(docx_url, headers={"Referer": r4.url}, verify=False, timeout=60)
    # A DOCX is a ZIP container and therefore starts with "PK". Rejecting
    # anything else keeps an HTML error or login page from being saved and
    # served as a document.
    if dl.status_code != 200 or not dl.content.startswith(b"PK"):
        return f"Specification {doc_id}: DOCX not available"

    # The file name comes from remote HTML: keep only the final path
    # component (no directories, no query string) and force a lower-case
    # ".docx" suffix so callers' suffix check succeeds.
    stem, ext = os.path.splitext(os.path.basename(urlparse(docx_url).path))
    if not stem or ext.lower() != ".docx":
        stem = re.sub(r"[^\w.-]+", "_", doc_id) or "specification"
    tmp_path = os.path.join(tempfile.gettempdir(), f"{stem}.docx")
    with open(tmp_path, "wb") as f:
        f.write(dl.content)

    return tmp_path
static/script.js CHANGED
@@ -49,10 +49,14 @@ const GPP_SPEC_RE = /^\d{2}\.\d{3}/;
49
 
50
  function toggleVersionField() {
51
  const docId = document.getElementById('doc-id').value.trim();
52
- const group = document.getElementById('single-version-group');
53
- const isSpec = ETSI_SPEC_RE.test(docId) || GPP_SPEC_RE.test(docId);
54
- group.style.display = isSpec ? 'block' : 'none';
55
- if (!isSpec) document.getElementById('doc-version').value = '';
 
 
 
 
56
  }
57
 
58
  // Keyboard shortcuts management
@@ -73,15 +77,23 @@ function setupKeyboardHandlers() {
73
  // Search functions
74
  async function searchSingle() {
75
  const docId = document.getElementById('doc-id').value.trim();
76
-
77
  if (!docId) {
78
  showError('Please enter a document ID');
79
  return;
80
  }
81
-
 
 
 
 
 
 
 
 
82
  showLoading();
83
  updateHeaderStats('Searching...');
84
-
85
  try {
86
  const version = document.getElementById('doc-version').value.trim() || null;
87
  const body = { doc_id: docId };
@@ -94,9 +106,9 @@ async function searchSingle() {
94
  },
95
  body: JSON.stringify(body)
96
  });
97
-
98
  const data = await response.json();
99
-
100
  if (response.ok) {
101
  displaySingleResult(data);
102
  updateHeaderStats(`Found in ${data.search_time.toFixed(3)}s`);
@@ -113,6 +125,49 @@ async function searchSingle() {
113
  }
114
  }
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  async function searchBatch() {
117
  const batchText = document.getElementById('batch-ids').value.trim();
118
 
 
49
 
50
  function toggleVersionField() {
51
  const docId = document.getElementById('doc-id').value.trim();
52
+ const versionGroup = document.getElementById('single-version-group');
53
+ const formatGroup = document.getElementById('single-format-group');
54
+ const isSpec = ETSI_SPEC_RE.test(docId) || GPP_SPEC_RE.test(docId);
55
+ const isEtsiSpec = ETSI_SPEC_RE.test(docId);
56
+ versionGroup.style.display = isSpec ? 'block' : 'none';
57
+ formatGroup.style.display = isEtsiSpec ? 'block' : 'none';
58
+ if (!isSpec) document.getElementById('doc-version').value = '';
59
+ if (!isEtsiSpec) document.getElementById('doc-format').value = 'pdf';
60
  }
61
 
62
  // Keyboard shortcuts management
 
77
  // Search functions
78
  async function searchSingle() {
79
  const docId = document.getElementById('doc-id').value.trim();
80
+
81
  if (!docId) {
82
  showError('Please enter a document ID');
83
  return;
84
  }
85
+
86
+ const format = document.getElementById('doc-format').value || 'pdf';
87
+ const isDocx = ETSI_SPEC_RE.test(docId) && format === 'docx';
88
+
89
+ if (isDocx) {
90
+ downloadDocx(docId);
91
+ return;
92
+ }
93
+
94
  showLoading();
95
  updateHeaderStats('Searching...');
96
+
97
  try {
98
  const version = document.getElementById('doc-version').value.trim() || null;
99
  const body = { doc_id: docId };
 
106
  },
107
  body: JSON.stringify(body)
108
  });
109
+
110
  const data = await response.json();
111
+
112
  if (response.ok) {
113
  displaySingleResult(data);
114
  updateHeaderStats(`Found in ${data.search_time.toFixed(3)}s`);
 
125
  }
126
  }
127
 
128
// Request the DOCX version of an ETSI specification from POST /find/docx and
// hand the response to the browser as a file download.
// docId: specification number typed by the user; the optional version is
// read from the #doc-version input.
async function downloadDocx(docId) {
  showLoading();
  updateHeaderStats('Downloading DOCX...');

  try {
    const version = document.getElementById('doc-version').value.trim() || null;
    const body = { doc_id: docId };
    if (version) body.version = version;

    const response = await fetch('/find/docx', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(body)
    });

    if (!response.ok) {
      // The error body is normally JSON with a `detail` field, but it may be
      // non-JSON or carry a structured (non-string) detail. Fall back to a
      // generic message instead of misreporting a connection problem.
      let message = `DOCX download failed (HTTP ${response.status})`;
      try {
        const data = await response.json();
        if (typeof data.detail === 'string') {
          message = data.detail;
        } else if (data.detail) {
          message = JSON.stringify(data.detail);
        }
      } catch (parseError) {
        // Keep the generic message.
      }
      showError(message);
      updateHeaderStats('Error');
      return;
    }

    const blob = await response.blob();
    const disposition = response.headers.get('Content-Disposition') || '';

    // Prefer the encoded `filename*=UTF-8''...` form when present, then the
    // plain `filename=` form, then a name derived from the document ID.
    let filename = `${docId.replace(/ /g, '_')}.docx`;
    const encodedMatch = disposition.match(/filename\*\s*=\s*UTF-8''([^;\n]+)/i);
    const plainMatch = disposition.match(/filename="?([^";\n]+)"?/);
    if (encodedMatch) {
      try {
        filename = decodeURIComponent(encodedMatch[1]);
      } catch (decodeError) {
        if (plainMatch) filename = plainMatch[1];
      }
    } else if (plainMatch) {
      filename = plainMatch[1];
    }

    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = filename;
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    // Revoke on a later tick so the browser has started the download before
    // the object URL disappears.
    setTimeout(() => URL.revokeObjectURL(url), 0);
    updateHeaderStats('DOCX downloaded');
  } catch (error) {
    showError('Error connecting to server');
    updateHeaderStats('Error');
    console.error('Error:', error);
  } finally {
    hideLoading();
  }
}
170
+
171
  async function searchBatch() {
172
  const batchText = document.getElementById('batch-ids').value.trim();
173
 
templates/index.html CHANGED
@@ -43,6 +43,13 @@
43
  <label for="doc-version">Version <span class="label-hint">(leave empty for latest)</span></label>
44
  <input type="text" id="doc-version" placeholder="e.g., 17.6.0">
45
  </div>
 
 
 
 
 
 
 
46
  </div>
47
 
48
  <!-- Batch Search -->
 
43
  <label for="doc-version">Version <span class="label-hint">(leave empty for latest)</span></label>
44
  <input type="text" id="doc-version" placeholder="e.g., 17.6.0">
45
  </div>
46
+ <div class="form-group version-group" id="single-format-group" style="display:none;">
47
+ <label for="doc-format">Format <span class="label-hint">(ETSI specs only)</span></label>
48
+ <select id="doc-format">
49
+ <option value="pdf">PDF (public)</option>
50
+ <option value="docx">DOCX (requires ETSI credentials)</option>
51
+ </select>
52
+ </div>
53
  </div>
54
 
55
  <!-- Batch Search -->