| | --- |
| | license: mit |
| | language: |
| | - ru |
| | --- |
| | |
| | RUPunct_medium — средняя модель из семейства RUPunct, обеспечивающая баланс между производительностью и качеством. |
| | |
| | Код инференса: |
| | ```py |
| | from transformers import pipeline |
| | from transformers import AutoTokenizer |
| | |
# Model identifier; the same value is handed to both the tokenizer loader
# and the pipeline below.
pt = "RUPunct/RUPunct_medium"

tk = AutoTokenizer.from_pretrained(
    pt,
    strip_accents=False,
    add_prefix_space=True,
)

# Token-classification pipeline. Each aggregated prediction is consumed later
# through its "word" and "entity_group" fields.
classifier = pipeline(
    "ner",
    model=pt,
    tokenizer=tk,
    aggregation_strategy="first",
)
| | |
| | |
# Case part of a label -> transformation applied to the token text.
_CASE_TRANSFORMS = {
    "LOWER": lambda text: text,
    "UPPER": str.capitalize,
    "UPPER_TOTAL": str.upper,
}

# Punctuation part of a label -> text appended after the token.
_PUNCTUATION_SUFFIXES = {
    "O": "",
    "PERIOD": ".",
    "COMMA": ",",
    "QUESTION": "?",
    # The dash is set off from the word by a space for every case variant;
    # previously only the UPPER/UPPER_TOTAL labels inserted that space.
    "TIRE": " —",
    "DVOETOCHIE": ":",
    "VOSKL": "!",
    "PERIODCOMMA": ";",
    "DEFIS": "-",
    "MNOGOTOCHIE": "...",
    "QUESTIONVOSKL": "?!",
}


def process_token(token, label):
    """Apply the casing and punctuation encoded in a label to a token.

    A label has the form ``<CASE>_<MARK>``. ``<CASE>`` is ``LOWER`` (token
    left as is), ``UPPER`` (token capitalized) or ``UPPER_TOTAL`` (whole
    token upper-cased). ``<MARK>`` names the punctuation appended after the
    token; ``O`` appends nothing.

    Args:
        token: The word text to transform.
        label: The predicted label, e.g. ``"UPPER_COMMA"``.

    Returns:
        The transformed token. If the label is not recognized, the token is
        returned unchanged rather than ``None``, so callers can always
        concatenate the result.
    """
    # Split on the LAST underscore so that "UPPER_TOTAL_PERIOD" yields
    # ("UPPER_TOTAL", "PERIOD") and "LOWER_O" yields ("LOWER", "O").
    case, _, mark = label.rpartition("_")
    recase = _CASE_TRANSFORMS.get(case)
    suffix = _PUNCTUATION_SUFFIXES.get(mark)
    if recase is None or suffix is None:
        return token
    return recase(token) + suffix
| | |
# Interactive loop: read a line, run the classifier on it, and print the text
# with the predicted casing and punctuation applied to each word.
while True:
    try:
        input_text = input(":> ")
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C: leave the loop instead of dumping a traceback.
        print()
        break
    if not input_text.strip():
        # Nothing to punctuate; prompt again.
        continue
    preds = classifier(input_text)
    # Join with single spaces so the result has no leading space.
    output = " ".join(
        process_token(item["word"].strip(), item["entity_group"])
        for item in preds
    )
    print(">>>", output)
| | ``` |