| | import sentencepiece as spm
|
| | import os
|
| | import json
|
| |
|
| |
|
class MTPTokenizer:
    """SentencePiece BPE tokenizer for instruction-response JSONL corpora.

    Thin wrapper around ``sentencepiece.SentencePieceProcessor`` providing
    training from a JSONL corpus (with ``instruction`` / ``context`` /
    ``response`` fields), loading, encoding/decoding, and special-token IDs.
    """

    def __init__(self, model_path=None):
        """Initialize the tokenizer, loading an existing model if available.

        Args:
            model_path: Optional path to a trained ``.model`` file. If the
                file exists it is loaded immediately; otherwise the tokenizer
                stays unloaded until train() or load() is called.
        """
        # Underlying SentencePieceProcessor; None until trained or loaded.
        self.sp = None
        self.model_path = model_path

        if model_path and os.path.exists(model_path):
            self.load(model_path)

    def _collect_texts(self, corpus_path):
        """Read a JSONL corpus and return all non-empty text fields.

        Extracts the ``instruction``, ``context`` (if non-blank) and
        ``response`` string fields from each JSON line; malformed lines
        are skipped rather than aborting the whole run.
        """
        texts = []
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    # Tolerate corrupt lines; the rest of the corpus is still usable.
                    continue

                if 'instruction' in data:
                    texts.append(data['instruction'].strip())
                if 'context' in data and data['context'].strip():
                    texts.append(data['context'].strip())
                if 'response' in data:
                    texts.append(data['response'].strip())

        # Drop anything that is empty after stripping.
        return [t for t in texts if t and t.strip()]

    def train(self, corpus_path, vocab_size=4000, model_prefix='mtp_tokenizer'):
        """Train a SentencePiece BPE tokenizer on a JSONL corpus.

        Args:
            corpus_path: Path to a JSONL file whose lines contain
                ``instruction``, optional ``context``, and ``response``
                string fields.
            vocab_size: Requested vocabulary size; automatically reduced
                for small corpora (SentencePiece fails when vocab_size
                exceeds what the data can support).
            model_prefix: Output prefix; writes ``<prefix>.model`` and
                ``<prefix>.vocab``, then loads the model into ``self``.

        Raises:
            ValueError: If the corpus yields no usable text.
        """
        print(f" → Procesando corpus para entrenar tokenizer...")
        texts = self._collect_texts(corpus_path)

        if not texts:
            raise ValueError("No se encontraron textos válidos en el corpus")

        # SentencePiece trains from a plain-text file, one sentence per line.
        temp_file = 'temp_corpus.txt'
        with open(temp_file, 'w', encoding='utf-8') as f:
            for text in texts:
                f.write(text + '\n')

        # Cap vocabulary for small corpora: ~0.15 tokens per character is a
        # conservative heuristic, floored at 256 so training stays feasible.
        total_chars = sum(len(text) for text in texts)
        max_vocab = min(vocab_size, max(256, int(total_chars * 0.15)))

        print(f" → Corpus stats: {len(texts)} textos, {total_chars} caracteres")
        print(f" → Vocabulario ajustado: {max_vocab} (solicitado: {vocab_size})")

        try:
            spm.SentencePieceTrainer.train(
                input=temp_file,
                model_prefix=model_prefix,
                vocab_size=max_vocab,
                model_type='bpe',
                pad_id=0,
                unk_id=1,
                bos_id=2,
                eos_id=3,
                character_coverage=1.0,
                normalization_rule_name='identity',
                num_threads=4,
                split_digits=True,
                allow_whitespace_only_pieces=False,
                byte_fallback=False,
                max_sentencepiece_length=16,
                add_dummy_prefix=False,
                remove_extra_whitespaces=False
            )
        finally:
            # BUGFIX: previously the temp file leaked if training raised.
            if os.path.exists(temp_file):
                os.remove(temp_file)

        self.model_path = f"{model_prefix}.model"
        self.load(self.model_path)

        print(f"✓ Tokenizer entrenado: {self.vocab_size()} tokens")
        print(f"✓ Modelo guardado: {self.model_path}")

    def load(self, model_path):
        """Load a trained SentencePiece model from *model_path*."""
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(model_path)
        self.model_path = model_path

    def encode(self, text):
        """Encode *text* to a list of token IDs.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        if self.sp is None:
            raise ValueError("Tokenizer not loaded. Train or load a model first.")
        return self.sp.encode_as_ids(text)

    def decode(self, ids):
        """Decode a list of token IDs back to text.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        if self.sp is None:
            raise ValueError("Tokenizer not loaded. Train or load a model first.")
        return self.sp.decode_ids(ids)

    def vocab_size(self):
        """Return the vocabulary size, or 0 if no model is loaded."""
        if self.sp is None:
            return 0
        return self.sp.get_piece_size()

    def bos_id(self):
        """Beginning-of-sentence token ID (2, as configured at training).

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        # Guard added for consistency with encode/decode (was AttributeError).
        if self.sp is None:
            raise ValueError("Tokenizer not loaded. Train or load a model first.")
        return self.sp.bos_id()

    def eos_id(self):
        """End-of-sentence token ID (3, as configured at training).

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        if self.sp is None:
            raise ValueError("Tokenizer not loaded. Train or load a model first.")
        return self.sp.eos_id()

    def pad_id(self):
        """Padding token ID (0, as configured at training).

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        if self.sp is None:
            raise ValueError("Tokenizer not loaded. Train or load a model first.")
        return self.sp.pad_id()

    def unk_id(self):
        """Unknown token ID (1, as configured at training).

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        if self.sp is None:
            raise ValueError("Tokenizer not loaded. Train or load a model first.")
        return self.sp.unk_id()