| | from enum import Enum |
| | import time |
| | import traceback |
| | import uuid |
| | from werkzeug.exceptions import InternalServerError |
| | from config.logger import CustomLogger,request_id_var |
| | import torch |
| | from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer |
| | import os |
| | import sys |
| | import nltk |
| | from nltk.tokenize.punkt import PunktSentenceTokenizer |
| |
|
# Module start-up: resolve the application root, pick a torch device and load
# the gibberish classifier and its tokenizer once, at import time.
#
# The logger and the per-request error store are created BEFORE the try block
# so that the except handler below and Gibberish.scan can always use them,
# even when model loading fails part-way through.
log = CustomLogger()
log_dict = {}

try:
    # Frozen builds (e.g. PyInstaller bundles) expose their unpack directory
    # as sys._MEIPASS; otherwise walk two directories up from this file.
    if getattr(sys, 'frozen', False):
        application_path = sys._MEIPASS
    else:
        application_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..')
    log.info(f"application_path : {application_path}")
    log.info("before loading model")

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    log.info(f"device : {device}")
    # Integer device index (0 = first GPU, -1 = CPU). Not used in the visible
    # code; kept as a module-level name in case other modules import it.
    gpu = 0 if torch.cuda.is_available() else -1

    # Tokenizer / batching options read by Gibberish.scan when it builds the
    # text-classification pipeline.
    pipeline_kwargs = {"return_token_type_ids": False, "max_length": 512, "truncation": True, "batch_size": 1}
    gibberishModel = AutoModelForSequenceClassification.from_pretrained(os.path.join(application_path, "models/gibberish")).to(device)
    gibberishTokenizer = AutoTokenizer.from_pretrained(os.path.join(application_path, "models/gibberish"))

    request_id_var.set("Startup")
    log.info("model loaded")

except Exception as e:
    # Best-effort start-up: the failure is logged and the module still
    # imports. The model names stay undefined, so scan will fail per request.
    log.error(f"Exception: {e}")
    log.error(f"Exception: {str(traceback.extract_tb(e.__traceback__)[0].lineno),e}")
| |
|
| |
|
| |
|
class MatchType(Enum):
    """How a prompt is split into classifier inputs."""

    SENTENCE = "sentence"
    FULL = "full"

    def get_inputs(self, prompt: str) -> list[str]:
        """Return the pieces of *prompt* to classify for this match type.

        ``SENTENCE`` splits the prompt with a freshly constructed
        ``PunktSentenceTokenizer``; every other member returns the whole
        prompt as a single-element list.
        """
        # Guard clause: anything other than sentence matching is passed whole.
        if self is not MatchType.SENTENCE:
            return [prompt]

        sentence_splitter = PunktSentenceTokenizer()
        return sentence_splitter.sentences_from_text(text=prompt)
| | |
| |
|
class Gibberish:
    """Gibberish detection using the module-level classifier and tokenizer."""

    def scan(self, payload):
        """Classify ``payload['text']`` and report a gibberish score.

        Args:
            payload: Mapping with ``'text'`` (the string to classify) and
                ``'labels'`` (the classifier labels that count as gibberish).

        Returns:
            A dict with ``"result"`` (a list holding one
            ``{"gibberish_label", "gibberish_score"}`` dict per classifier
            output) and ``"time_taken"`` (elapsed seconds, rounded to three
            decimals, as a string ending in ``"s"``).

        Raises:
            InternalServerError: if any step inside the try block fails; the
                original exception is attached as the cause.
        """
        log.info("inside gibberish_check")
        # A fresh id per call keys this request's entry in log_dict.
        request_id = uuid.uuid4().hex
        request_id_var.set(request_id)
        log_dict[request_id] = []
        try:
            st = time.time()
            text = payload['text']
            gibberish_labels = payload['labels']
            # The tokenizer options (truncation, max_length, ...) and
            # batch_size are pipeline arguments. Passing them as model_kwargs
            # has no effect on an already-instantiated model, so long inputs
            # would not be truncated.
            nlp = pipeline(
                task="text-classification",
                model=gibberishModel,
                tokenizer=gibberishTokenizer,
                device=device,
                **pipeline_kwargs,
            )
            # Only whole-text matching is used here.
            results_all = nlp(MatchType.FULL.get_inputs(text))
            log.debug(f"Gibberish detection finished :{results_all}")

            res = []
            for result in results_all:
                # When the predicted label is not one of the gibberish
                # labels, the complement of its score is reported instead.
                score = round(
                    result["score"] if result["label"] in gibberish_labels else 1 - result["score"],
                    2,
                )
                # A new dict per result, so entries in res never alias.
                res.append({
                    'gibberish_label': result["label"],
                    'gibberish_score': score,
                })

            del nlp
            er = log_dict[request_id]
            logobj = {"_id": request_id, "error": er}
            if er:
                log.debug(str(logobj))
            del log_dict[request_id]
            return {"result": res, "time_taken": str(round(time.time() - st, 3)) + "s"}

        except Exception as e:
            log.error("Error occured in gibberish_check")
            log.error(f"Exception: {str(traceback.extract_tb(e.__traceback__)[0].lineno),e}")
            # NOTE(review): the entry is left in log_dict on failure, as in
            # the original; presumably a caller reads it. If nothing does,
            # this grows by one entry per failed request - confirm and pop.
            log_dict[request_id].append({"Line number": str(traceback.extract_tb(e.__traceback__)[0].lineno), "Error": str(e),
                                         "Error Module": "Failed at gibberish_check call"})
            raise InternalServerError() from e