# Base image: the slim variant of the official CPython 3.12 image.
FROM python:3.12-slim

# Every relative path used later (COPY destinations, gunicorn's --chdir)
# is resolved from this directory.
WORKDIR /app

# System packages: a C/C++ toolchain.
# NOTE(review): presumably required to compile native extensions pulled in by
# requirements.txt -- confirm it is still needed; it adds noticeable image size.
# update + install + list cleanup stay in ONE layer so the apt index is never
# stale and never persisted in the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies.
# Only the manifest is copied here so this layer stays cached until
# requirements.txt itself changes (application source is copied much later).
COPY requirements.txt .

# torch is installed first from the CPU-only wheel index, then the remaining
# requirements are installed from the default index.
# NOTE(review): if requirements.txt pins a different torch version, the second
# command may replace the CPU build installed by the first -- verify.
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu \
    && pip install --no-cache-dir -r requirements.txt

# Hugging Face cache location. HF_HOME is set before any model download so the
# files fetched at build time land in /opt/huggingface and are found again at
# run time through the same variable.
ENV HF_HOME=/opt/huggingface

# The cache root is made world-writable.
# NOTE(review): presumably so a non-root runtime user can write to the cache --
# confirm. The chmod is not recursive and runs before the downloads, so
# sub-directories created by later build steps keep their default permissions.
RUN mkdir -p /opt/huggingface \
    && chmod 777 /opt/huggingface

# Pre-download the summarization model (tokenizer, config, weights) from
# 'bayan10/summarization-model' at build time so it is already in the
# Hugging Face cache when the container starts.
# The continuation lines are intentionally NOT indented: Docker joins them
# verbatim into a single `python -c` program, and leading whitespace before the
# first statement would raise an IndentationError.
RUN python -c "\
from transformers import MBartForConditionalGeneration, AutoTokenizer, AutoConfig; \
import torch; \
repo = 'bayan10/summarization-model'; \
print('Downloading summarization tokenizer...'); \
AutoTokenizer.from_pretrained(repo); \
print('Downloading summarization config...'); \
AutoConfig.from_pretrained(repo); \
print('Downloading summarization model (float16)...'); \
MBartForConditionalGeneration.from_pretrained(repo, torch_dtype=torch.float16); \
print('Summarization model cached!'); \
"

# Pre-download everything the spelling-correction step loads:
#   - the 'last_model.pt' checkpoint from 'bayan10/AraSpell-Model'
#   - the 'aubmindlab/bert-base-arabertv02' tokenizer
#   - an encoder-decoder built from that AraBERT checkpoint on both sides
#   - the AraBERT masked-LM head (the log message names ContextualCorrector
#     as its consumer)
# The continuation lines are intentionally NOT indented: Docker joins them
# verbatim into a single `python -c` program, and leading whitespace before the
# first statement would raise an IndentationError.
RUN python -c "\
from huggingface_hub import hf_hub_download; \
from transformers import AutoTokenizer, EncoderDecoderModel, AutoModelForMaskedLM; \
print('Downloading AraSpell checkpoint...'); \
hf_hub_download(repo_id='bayan10/AraSpell-Model', filename='last_model.pt'); \
print('Downloading AraBERT tokenizer...'); \
AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02'); \
print('Downloading AraBERT encoder-decoder...'); \
EncoderDecoderModel.from_encoder_decoder_pretrained('aubmindlab/bert-base-arabertv02', 'aubmindlab/bert-base-arabertv02'); \
print('Downloading AraBERT MLM (for ContextualCorrector)...'); \
AutoModelForMaskedLM.from_pretrained('aubmindlab/bert-base-arabertv02'); \
print('Spelling model + MLM cached!'); \
"

# camel_tools data directory. The variable is set before `camel_data` runs so
# the data installed at build time and the lookup at run time use the same path.
ENV CAMELTOOLS_DATA=/opt/camel_tools

# Create the directory, make it world-writable, then install the "light" data
# set with the camel_data CLI (which must already be on PATH at this point;
# presumably installed via requirements.txt -- confirm).
# NOTE(review): as with the Hugging Face cache, the chmod is not recursive and
# precedes the download, so it only affects the top-level directory.
RUN mkdir -p /opt/camel_tools \
    && chmod 777 /opt/camel_tools \
    && camel_data -i light

# Pre-download the punctuation model (tokenizer + encoder-decoder weights)
# from 'bayan10/PuncAra-v1' into the Hugging Face cache at build time.
# The continuation lines are intentionally NOT indented: Docker joins them
# verbatim into a single `python -c` program, and leading whitespace before the
# first statement would raise an IndentationError.
RUN python -c "\
from transformers import EncoderDecoderModel, AutoTokenizer; \
repo = 'bayan10/PuncAra-v1'; \
print('Downloading PuncAra-v1 tokenizer...'); \
AutoTokenizer.from_pretrained(repo); \
print('Downloading PuncAra-v1 model...'); \
EncoderDecoderModel.from_pretrained(repo); \
print('PuncAra-v1 cached!'); \
"

# Pre-download the dialect-to-MSA seq2seq model (tokenizer + weights) from
# 'bayan10/dialect-to-msa-model' into the Hugging Face cache at build time.
# The continuation lines are intentionally NOT indented: Docker joins them
# verbatim into a single `python -c` program, and leading whitespace before the
# first statement would raise an IndentationError.
RUN python -c "\
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM; \
import torch; \
repo = 'bayan10/dialect-to-msa-model'; \
print('Downloading dialect tokenizer...'); \
AutoTokenizer.from_pretrained(repo); \
print('Downloading dialect model (float16)...'); \
AutoModelForSeq2SeqLM.from_pretrained(repo, torch_dtype=torch.float16); \
print('Dialect model cached!'); \
"

# Application code and data. Copied after all dependency and model layers so
# that a source change does not invalidate the expensive downloads above.
COPY src/ ./src/
COPY quran.py ./
COPY quran_master.db ./

# Copies any file in the build context whose name starts with ".env".
# NOTE(review): this bakes environment files -- and any credentials inside
# them -- into an image layer that anyone with pull access can read. Prefer
# passing configuration at run time (environment variables / platform secrets)
# and excluding .env files via .dockerignore. Left in place because the
# application may read these files at start-up -- confirm before removing.
COPY .env* ./

# Runtime configuration, grouped in a single ENV instruction.
#   PORT             - same number the CMD binds to and EXPOSE documents
#   DEBUG            - set to the literal string "False"; how it is parsed is
#                      up to the application (not visible here)
#   PYTHONUNBUFFERED - any non-empty value makes Python write stdout/stderr
#                      unbuffered, so logs show up immediately
ENV PORT=7860 \
    DEBUG=False \
    PYTHONUNBUFFERED=1

# Documents the port the server listens on. EXPOSE is metadata only: it does
# not publish the port; that is done when the container is run.
EXPOSE 7860

# Start gunicorn in exec (JSON-array) form so it runs as PID 1 and receives
# stop signals directly.
#   --chdir src       switch into ./src (relative to WORKDIR) before loading
#   app:app           WSGI callable `app` from module `app`
#   --bind            listen on all interfaces, port 7860 (hard-coded: exec
#                     form performs no $PORT expansion)
#   --timeout 300     allow a worker 300 s before gunicorn restarts it
#   --workers 1       a single worker process
# NOTE(review): the long timeout and single worker presumably accommodate slow
# model inference and avoid loading the models more than once -- confirm.
CMD ["gunicorn", "--chdir", "src", "app:app", "--bind", "0.0.0.0:7860", "--timeout", "300", "--workers", "1"]