| | |
| | import json |
| | from fastapi import APIRouter, Response |
| | from fastapi.responses import JSONResponse |
| | from pythainlp.tokenize import ( |
| | word_tokenize as py_word_tokenize, |
| | subword_tokenize as py_subword_tokenize, |
| | sent_tokenize as py_sent_tokenize |
| | ) |
| | from enum import Enum |
| | from typing import List, Optional |
| | from pydantic import BaseModel |
| |
|
| | router = APIRouter() |
| |
|
| |
|
class SentTokenizeEngine(str, Enum):
    """Engine names offered for sentence tokenization.

    Mixing in ``str`` makes every member compare equal to its plain
    string value.
    """

    whitespace = "whitespace"
    # "+" cannot appear in an identifier, so the member name uses "_".
    whitespace_newline = "whitespace+newline"
    crfcut = "crfcut"
| |
|
| |
|
class WordTokenizeEngine(str, Enum):
    """Engine names offered for word tokenization.

    Mixing in ``str`` makes every member compare equal to its plain
    string value.
    """

    newmm = "newmm"
    longest = "longest"
    tltk = "tltk"
| |
|
| |
|
class SubwordTokenizeEngine(str, Enum):
    """Engine names offered for subword tokenization.

    Mixing in ``str`` makes every member compare equal to its plain
    string value.
    """

    tcc = "tcc"
    etcc = "etcc"
    ssg = "ssg"
    tltk = "tltk"
| |
|
class WordTokenizeResponse(BaseModel):
    """Response schema holding a list of word tokens."""

    # Tokens as strings; defaults to an empty list.
    words: List[str] = []
| |
|
class SubwordTokenizeResponse(BaseModel):
    """Response schema holding a list of subword tokens."""

    # Tokens as strings; defaults to an empty list.
    subwords: List[str] = []
| |
|
class SentTokenizeResponse(BaseModel):
    """Response schema holding a list of sentences.

    This model must not be named ``SentTokenizeEngine``: that name belongs
    to the engine enum, and reusing it here would rebind the name so the
    ``engine`` parameter of ``sent_tokenize`` would be annotated with this
    model instead of the enum.
    """

    # Sentences as strings; defaults to an empty list.
    sents: List[str] = []


@router.post('/word_tokenize', response_model=WordTokenizeResponse)
def word_tokenize(text: str, engine: WordTokenizeEngine = "newmm"):
    """
    Word tokenize or word segmentation for Thai language

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Word tokenize engine (default is newmm)
    """
    # The media type is passed explicitly so the Content-Type header
    # declares the UTF-8 charset.
    return JSONResponse(
        {"words": py_word_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )


@router.post('/subword_tokenize', response_model=SubwordTokenizeResponse)
def subword_tokenize(text: str, engine: SubwordTokenizeEngine = "tcc"):
    """
    Subword tokenize or subword segmentation for Thai language

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Subword tokenize engine (default is tcc)
    """
    # The media type is passed explicitly so the Content-Type header
    # declares the UTF-8 charset.
    return JSONResponse(
        {"subwords": py_subword_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )


@router.post('/sent_tokenize', response_model=SentTokenizeResponse)
def sent_tokenize(text: str, engine: SentTokenizeEngine = "crfcut"):
    """
    Thai sentence segmentation

    ## Input

    - **text**: Text to tokenize.
    - **engine**: Sentence tokenize engine (default is crfcut)
    """
    # The media type is passed explicitly so the Content-Type header
    # declares the UTF-8 charset.
    return JSONResponse(
        {"sents": py_sent_tokenize(text=text, engine=engine)},
        media_type="application/json; charset=utf-8",
    )
| |
|