Instructions to use voidful/PangolinTokenizer with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use voidful/PangolinTokenizer with Transformers:
```python
# Load model directly
from transformers import AutoModel
model = AutoModel.from_pretrained("voidful/PangolinTokenizer", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "add_prefix_space": false, | |
| "backend": "tokenizers", | |
| "bos_token": "<s>", | |
| "clean_up_tokenization_spaces": false, | |
| "effective_vocab_size": 114822, | |
| "eos_token": "</s>", | |
| "errors": "replace", | |
| "fix_mistral_regex": true, | |
| "is_local": false, | |
| "local_files_only": false, | |
| "model_max_length": 131072, | |
| "model_type": "byte_level_bpe", | |
| "no_audio_codec_tokens": true, | |
| "no_dense_timestamp_tokens": true, | |
| "open_formosa": { | |
| "required_special_token_count": 157, | |
| "required_special_tokens_present": true, | |
| "required_special_tokens_single_id": true, | |
| "standard_special_tokens": { | |
| "bos_token": "<s>", | |
| "eos_token": "</s>", | |
| "pad_token": "<pad>", | |
| "unk_token": "<unk>" | |
| } | |
| }, | |
| "pad_token": "<pad>", | |
| "padding_side": "right", | |
| "rich_transcription": { | |
| "allow_non_speech_events": true, | |
| "compact_json": true, | |
| "default_format": "json_segments", | |
| "enabled": true, | |
| "include_content": true, | |
| "include_speaker": true, | |
| "include_start_end": true, | |
| "no_dense_timestamp_tokens": true, | |
| "timestamp_precision_digits": 2, | |
| "timestamp_unit": "seconds" | |
| }, | |
| "special_tokens": [ | |
| "<|pad|>", | |
| "<|bos|>", | |
| "<|eos|>", | |
| "<|unk|>", | |
| "<|system|>", | |
| "<|user_channel|>", | |
| "<|assistant_channel|>", | |
| "<|task:speech_to_text|>", | |
| "<|task:text_to_speech|>", | |
| "<|input_audio_start|>", | |
| "<|input_audio_end|>", | |
| "<|audio_ref_start|>", | |
| "<|audio_ref_end|>", | |
| "<|audio_start|>", | |
| "<|audio_end|>", | |
| "<|speech_start|>", | |
| "<|speech_end|>", | |
| "<|transcript_start|>", | |
| "<|transcript_end|>", | |
| "<|segment_start|>", | |
| "<|segment_end|>", | |
| "<|speaker|>", | |
| "<|start_time|>", | |
| "<|end_time|>", | |
| "<|duration|>", | |
| "<|content|>", | |
| "<|non_speech_event|>", | |
| "<|retrieval_result_start|>", | |
| "<|retrieval_result_end|>", | |
| "<|ocr_start|>", | |
| "<|ocr_end|>", | |
| "<|image_start|>", | |
| "<|image_end|>", | |
| "<|video_start|>", | |
| "<|video_end|>" | |
| ], | |
| "strict_no_dense_timestamp_tokens": true, | |
| "tokenizer_class": "GPT2Tokenizer", | |
| "truncation_side": "right", | |
| "unk_token": "<unk>", | |
| "vocab_size": 114688 | |
| } | |