# Smoke test for a SentencePiece model: load it, encode and decode one sample
# sentence, then show how individual entity names are split into pieces.

# NOTE(review): os, json and tqdm are not used in the visible part of this
# script; they are kept in case code elsewhere in the file relies on them.
import os
import json

import sentencepiece as spm
from tqdm import tqdm

# The model is loaded from "<MODEL_PREFIX>.model".
MODEL_PREFIX = "icefire_spm"

sp = spm.SentencePieceProcessor(model_file=f"{MODEL_PREFIX}.model")

# Encode the same sentence twice: once with the default output type (printed
# below as IDs) and once as string pieces, then decode the IDs back to text.
sample = "Daenerys Targaryen rides Drogon to Winterfell."
ids = sp.encode(sample)
pieces = sp.encode(sample, out_type=str)

print("\nTest encode:")
print(f" Text : {sample}")
print(f" IDs : {ids}")
print(f" Pieces: {pieces}")
print(f" Decode: {sp.decode(ids)}")

# Report how many pieces each entity name is split into. The loop variables
# stay at module level, so after the loop `entity` and `e_pieces` hold the
# values for the last entity, exactly as the unrolled version left them.
for entity in ("Winterfell", "Long Claw"):
    e_pieces = sp.encode(entity, out_type=str)
    print(f"\nEntity '{entity}' → {len(e_pieces)} piece(s): {e_pieces}")