AyoubChLin committed on
Commit
efddb2f
·
1 Parent(s): 50231a8

feat: update classifier model configuration and remove external dependencies

Browse files
.env.example CHANGED
@@ -5,8 +5,7 @@ DEBUG=false
5
  STATIC_DIR=static
6
  UPLOAD_SUBDIR=uploads
7
 
8
- CLASSIFIER_SPACE=https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/
9
- CLASSIFIER_API_NAME=/predict
10
  HUGGINGFACE_TOKEN=
11
 
12
  LANGUAGE_DETECTOR_URL=https://team-language-detector-languagedetector.hf.space/run/predict
 
5
  STATIC_DIR=static
6
  UPLOAD_SUBDIR=uploads
7
 
8
+ CLASSIFIER_MODEL=AyoubChLin/distilbert_cnn_news
 
9
  HUGGINGFACE_TOKEN=
10
 
11
  LANGUAGE_DETECTOR_URL=https://team-language-detector-languagedetector.hf.space/run/predict
.qwen/settings.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(pip install *)",
5
+ "Bash(pip3 install *)"
6
+ ]
7
+ },
8
+ "$version": 3
9
+ }
.qwen/settings.json.orig ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(pip install *)"
5
+ ]
6
+ }
7
+ }
Dockerfile CHANGED
@@ -12,6 +12,8 @@ RUN apt-get update \
12
  COPY requirements.txt .
13
  RUN pip install --no-cache-dir -r requirements.txt
14
 
 
 
15
  COPY . .
16
 
17
  EXPOSE 4002
 
12
  COPY requirements.txt .
13
  RUN pip install --no-cache-dir -r requirements.txt
14
 
15
+ RUN huggingface-cli login --token ${HUGGINGFACE_TOKEN} 2>/dev/null || true
16
+
17
  COPY . .
18
 
19
  EXPOSE 4002
README.md CHANGED
@@ -39,7 +39,7 @@ cp .env.example .env
39
  ```
40
 
41
  Key vars:
42
- - `CLASSIFIER_SPACE`
43
  - `HUGGINGFACE_TOKEN`
44
  - `LANGUAGE_DETECTOR_URL`
45
  - `DEFAULT_LABELS_CSV`
@@ -63,3 +63,4 @@ pytest -q
63
  ## Notes
64
  - OCR requires `tesseract-ocr` (installed in Dockerfile).
65
  - Supported extraction formats in this refactor: `.pdf`, `.docx`, `.xlsx`, image formats, and plain text files.
 
 
39
  ```
40
 
41
  Key vars:
42
+ - `CLASSIFIER_MODEL`
43
  - `HUGGINGFACE_TOKEN`
44
  - `LANGUAGE_DETECTOR_URL`
45
  - `DEFAULT_LABELS_CSV`
 
63
  ## Notes
64
  - OCR requires `tesseract-ocr` (installed in Dockerfile).
65
  - Supported extraction formats in this refactor: `.pdf`, `.docx`, `.xlsx`, image formats, and plain text files.
66
+ - The classifier model is loaded directly from Hugging Face Hub (no external Space dependency).
app/core/config.py CHANGED
@@ -15,8 +15,7 @@ class Settings(BaseSettings):
15
  static_dir: Path = Path("static")
16
  upload_subdir: str = "uploads"
17
 
18
- classifier_space: str = "https://ayoubchlin-ayoubchlin-stable-bart-mnli-cnn.hf.space/"
19
- classifier_api_name: str = "/predict"
20
  huggingface_token: str | None = None
21
 
22
  language_detector_url: str = "https://team-language-detector-languagedetector.hf.space/run/predict"
 
15
  static_dir: Path = Path("static")
16
  upload_subdir: str = "uploads"
17
 
18
+ classifier_model: str = "AyoubChLin/distilbert_cnn_news"
 
19
  huggingface_token: str | None = None
20
 
21
  language_detector_url: str = "https://team-language-detector-languagedetector.hf.space/run/predict"
app/services/classifier_service.py CHANGED
@@ -1,8 +1,6 @@
1
- import json
2
- from pathlib import Path
3
  from typing import Any
4
 
5
- from gradio_client import Client
6
 
7
  from app.core.config import settings
8
  from app.core.exceptions import ClassificationError
@@ -10,72 +8,30 @@ from app.core.exceptions import ClassificationError
10
 
11
  class ClassifierService:
12
  def __init__(self) -> None:
13
- self._client: Client | None = None
14
-
15
- def _get_client(self) -> Client:
16
- if self._client is not None:
17
- return self._client
18
-
19
- client_kwargs: dict[str, Any] = {}
20
- if settings.huggingface_token:
21
- client_kwargs["hf_token"] = settings.huggingface_token
22
-
23
- try:
24
- self._client = Client(settings.classifier_space, **client_kwargs)
25
- except Exception as exc:
26
- raise ClassificationError("Unable to initialize classifier client") from exc
27
-
28
- return self._client
29
-
30
- @staticmethod
31
- def _extract_label(payload: Any) -> str | None:
32
- if isinstance(payload, dict):
33
- value = payload.get("label")
34
- if isinstance(value, str) and value.strip():
35
- return value.strip()
36
- return None
37
-
38
- if isinstance(payload, list):
39
- for item in payload:
40
- label = ClassifierService._extract_label(item)
41
- if label:
42
- return label
43
-
44
- return None
45
 
46
  def classify(self, text: str, labels: list[str]) -> str:
47
  if not labels:
48
  raise ClassificationError("No labels configured")
49
 
50
- labels_text = ", ".join(labels)
51
-
52
  try:
53
- result = self._get_client().predict(
54
- text,
55
- labels_text,
56
- api_name=settings.classifier_api_name,
57
- )
58
  except Exception as exc:
59
- raise ClassificationError("Classifier request failed") from exc
60
-
61
- if isinstance(result, str):
62
- candidate_path = Path(result)
63
- if candidate_path.exists():
64
- try:
65
- parsed = json.loads(candidate_path.read_text(encoding="utf-8"))
66
- except Exception as exc:
67
- raise ClassificationError("Classifier output file is not valid JSON") from exc
68
- label = self._extract_label(parsed)
69
- if label:
70
- return label
71
-
72
- stripped = result.strip()
73
- if stripped:
74
- return stripped
75
 
76
- label = self._extract_label(result)
77
- if label:
78
- return label
79
 
80
  raise ClassificationError("Classifier did not return a valid label")
81
 
 
 
 
1
  from typing import Any
2
 
3
+ from transformers import pipeline
4
 
5
  from app.core.config import settings
6
  from app.core.exceptions import ClassificationError
 
8
 
9
  class ClassifierService:
10
  def __init__(self) -> None:
11
+ self._pipeline: Any | None = None
12
+
13
+ def _get_pipeline(self) -> Any:
14
+ if self._pipeline is None:
15
+ try:
16
+ self._pipeline = pipeline(
17
+ "zero-shot-classification",
18
+ model=settings.classifier_model,
19
+ )
20
+ except Exception as exc:
21
+ raise ClassificationError("Unable to initialize classifier pipeline") from exc
22
+ return self._pipeline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  def classify(self, text: str, labels: list[str]) -> str:
25
  if not labels:
26
  raise ClassificationError("No labels configured")
27
 
 
 
28
  try:
29
+ result = self._get_pipeline()(text, labels, multi_label=False)
 
 
 
 
30
  except Exception as exc:
31
+ raise ClassificationError("Classifier prediction failed") from exc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
+ if isinstance(result, dict) and "labels" in result and result["labels"]:
34
+ return result["labels"][0]
 
35
 
36
  raise ClassificationError("Classifier did not return a valid label")
37
 
requirements.txt CHANGED
@@ -3,9 +3,13 @@ uvicorn[standard]==0.34.0
3
  pydantic==2.10.6
4
  pydantic-settings==2.7.1
5
  requests==2.32.3
6
- gradio_client==1.7.0
7
  python-multipart==0.0.20
8
 
 
 
 
 
 
9
  pytesseract==0.3.13
10
  Pillow==11.1.0
11
  pypdf==5.4.0
 
3
  pydantic==2.10.6
4
  pydantic-settings==2.7.1
5
  requests==2.32.3
 
6
  python-multipart==0.0.20
7
 
8
+ transformers==4.46.0
9
+ torch==2.5.1
10
+ accelerate==1.1.1
11
+ sentencepiece==0.2.0
12
+
13
  pytesseract==0.3.13
14
  Pillow==11.1.0
15
  pypdf==5.4.0