Compression-Lens / core /segmentation.py
Jellyfish042's picture
Checkpoint before tooltip palette update
350392a
raw
history blame contribute delete
593 Bytes
"""
Fallback segmentation utilities.
Used for offline tests or snapshot generation when model tokenizers
are unavailable.
"""
from typing import Dict, List
def fallback_token_info(text: str) -> Dict[str, List]:
    """Build placeholder token info whose boundaries follow UTF-8 code points.

    Each character of ``text`` is encoded to UTF-8 on its own and its byte
    width is added to a running offset, so the boundary list starts at 0 and
    ends at the total encoded byte length of ``text``.

    Args:
        text: The string to segment.

    Returns:
        A dict with ``"common_boundaries"`` set to the cumulative byte
        offsets (one entry per character, plus the leading 0), and the
        tokenizer-specific entries (``"qwen_tokens"``, ``"rwkv_tokens"``,
        ``"byte_to_qwen"``, ``"byte_to_rwkv"``) left empty.
    """
    offsets = [0]
    for char in text:
        # The next boundary is the previous one advanced by this
        # character's encoded width (1-4 bytes in UTF-8).
        width = len(char.encode("utf-8"))
        offsets.append(offsets[-1] + width)

    info = {
        "common_boundaries": offsets,
        "qwen_tokens": [],
        "rwkv_tokens": [],
        "byte_to_qwen": {},
        "byte_to_rwkv": {},
    }
    return info