| | import os |
| | import sentencepiece as spm |
| | from transformers import AutoTokenizer, PreTrainedTokenizerFast |
| |
|
class TokenizerSetup:
    """Train, load, save, and smoke-test a SentencePiece or Hugging Face tokenizer."""

    # Accepted SentencePiece algorithms; anything else falls back to "bpe".
    VALID_TYPES = ("bpe", "unigram", "char", "word")

    def __init__(self, model_path="tokenizer", model_type="bpe", vocab_size=32000, hf_model=None):
        """Initialize tokenizer setup for custom or pretrained use.

        Args:
            model_path: Path prefix for SentencePiece artifacts (`<prefix>.model` / `.vocab`).
            model_type: SentencePiece algorithm: "bpe", "unigram", "char", or "word".
            vocab_size: Target vocabulary size when training.
            hf_model: Optional Hugging Face model id; when set, `load_tokenizer`
                fetches a pretrained tokenizer instead of the local SentencePiece one.
        """
        self.model_path = model_path
        self.model_type = model_type.lower()
        self.vocab_size = vocab_size
        self.hf_model = hf_model
        self.tokenizer = None

        if self.model_type not in self.VALID_TYPES:
            print(f"⚠️ Invalid model_type '{self.model_type}'. Choose from {list(self.VALID_TYPES)}")
            self.model_type = "bpe"

    def train_sentencepiece(self, input_file):
        """Train a SentencePiece tokenizer with specified settings."""
        if not os.path.exists(input_file):
            print(f"⚠️ Input file {input_file} not found! Provide a valid text corpus.")
            return

        try:
            # Fix: declare the special-token surfaces via the *_piece options.
            # The old code additionally passed them through
            # --user_defined_symbols, which SentencePiece rejects (user symbols
            # may not duplicate the reserved pad/unk/bos/eos pieces), so
            # training aborted before producing a model.
            spm.SentencePieceTrainer.Train(
                f"--input={input_file} "
                f"--model_prefix={self.model_path} "
                f"--vocab_size={self.vocab_size} "
                f"--model_type={self.model_type} "
                f"--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3 "
                f"--pad_piece=<pad> --unk_piece=<unk> --bos_piece=<bos> --eos_piece=<eos>"
            )
            print(f"✅ Trained SentencePiece tokenizer. Saved as {self.model_path}.model")
        except Exception as e:
            print(f"⚠️ Error training SentencePiece: {e}")

    def load_tokenizer(self):
        """Load either a SentencePiece or Hugging Face tokenizer into self.tokenizer."""
        try:
            if self.hf_model:
                self.tokenizer = AutoTokenizer.from_pretrained(self.hf_model)
                print(f"✅ Loaded Hugging Face tokenizer from {self.hf_model}")
            else:
                sp_model = f"{self.model_path}.model"
                if not os.path.exists(sp_model):
                    print(f"⚠️ {sp_model} not found! Train it first.")
                    return

                sp = spm.SentencePieceProcessor(model_file=sp_model)
                # NOTE(review): PreTrainedTokenizerFast expects a
                # `tokenizers.Tokenizer` as `tokenizer_object`, not a
                # SentencePieceProcessor — as written this raises and falls
                # into the except handler below. The proper fix needs the
                # `tokenizers` package (or a slow tokenizer built from
                # vocab_file); confirm the intended wrapper before changing.
                self.tokenizer = PreTrainedTokenizerFast(
                    tokenizer_object=sp,
                    pad_token="<pad>",
                    unk_token="<unk>",
                    bos_token="<bos>",
                    eos_token="<eos>",
                )
                print(f"✅ Loaded SentencePiece tokenizer from {sp_model}")
        except Exception as e:
            print(f"⚠️ Error loading tokenizer: {e}")

    def save_tokenizer(self, save_dir="tokenizer/"):
        """Save tokenizer files (and raw SentencePiece artifacts) to a directory."""
        if not self.tokenizer:
            print("⚠️ No tokenizer loaded to save!")
            return

        try:
            os.makedirs(save_dir, exist_ok=True)
            self.tokenizer.save_pretrained(save_dir)
            if not self.hf_model:
                # Copy the raw SentencePiece artifacts alongside the HF files.
                # Fix: shutil.copy replaces os.system("cp ..."), which was
                # shell-dependent (broken on Windows) and unsafe for paths
                # containing spaces.
                for ext in (".model", ".vocab"):
                    src = f"{self.model_path}{ext}"
                    if os.path.exists(src):
                        shutil.copy(src, save_dir)
            print(f"✅ Tokenizer saved to {save_dir}")
        except Exception as e:
            print(f"⚠️ Error saving tokenizer: {e}")

    def tokenize_text(self, text, return_tensors=True):
        """Tokenize text and show both token IDs and the decoded round-trip.

        Args:
            text: Input string to tokenize.
            return_tensors: When True, return PyTorch tensors; otherwise plain lists.

        Returns:
            The tokenizer's encoding, or None on failure.
        """
        if not self.tokenizer:
            print("⚠️ No tokenizer initialized! Load or train one first.")
            return None

        try:
            tokens = self.tokenizer(text, return_tensors="pt" if return_tensors else None)
            # Fix: always pull input_ids. The old code decoded the entire
            # BatchEncoding in the non-tensor path, which raised inside decode.
            ids = tokens["input_ids"]
            # With tensors, input_ids is shaped (1, seq_len); decode row 0.
            decoded = self.tokenizer.decode(ids[0] if return_tensors else ids, skip_special_tokens=True)
            print(f"🔹 Token IDs: {ids}")
            print(f"🔹 Decoded: {decoded}")
            return tokens
        except Exception as e:
            print(f"⚠️ Error tokenizing text: {e}")
            return None
| |
|
| | if __name__ == "__main__": |
| | |
| | tokenizer_setup = TokenizerSetup( |
| | model_path="tokenizer", |
| | model_type="bpe", |
| | vocab_size=32000, |
| | hf_model=None |
| | ) |
| |
|
| | |
| | input_file = "../datasets/eclipse_corpuz_1.1.txt" |
| | if not os.path.exists(f"{tokenizer_setup.model_path}.model"): |
| | tokenizer_setup.train_sentencepiece(input_file) |
| |
|
| | |
| | tokenizer_setup.load_tokenizer() |
| |
|
| | |
| | tokenizer_setup.save_tokenizer("../finetuned_charm15/") |
| |
|
| | |
| | sample_text = "Charm 15 is an AI model optimized for deep learning and security." |
| | tokenizer_setup.tokenize_text(sample_text) |