Spaces:
Running
Running
| """ | |
| Fallback segmentation utilities. | |
| Used for offline tests or snapshot generation when model tokenizers | |
| are unavailable. | |
| """ | |
| from typing import Dict, List | |
def fallback_token_info(text: str) -> Dict[str, List]:
    """Build a placeholder token-info mapping without any tokenizer.

    The only populated entry is ``"common_boundaries"``: the cumulative
    UTF-8 byte offset before the first character and after every
    character of ``text``, so it always starts at 0 and ends at the total
    encoded length. The token lists and byte-to-token maps are returned
    empty.

    Args:
        text: The string to segment.

    Returns:
        A dict with the keys ``common_boundaries``, ``qwen_tokens``,
        ``rwkv_tokens``, ``byte_to_qwen`` and ``byte_to_rwkv``.
    """
    offsets = [0]
    for char in text:
        # Each character advances the offset by its encoded width in bytes.
        offsets.append(offsets[-1] + len(char.encode("utf-8")))

    info = {"common_boundaries": offsets}
    info["qwen_tokens"] = []
    info["rwkv_tokens"] = []
    info["byte_to_qwen"] = {}
    info["byte_to_rwkv"] = {}
    return info