# File size: 795 Bytes
# 03022ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
def FunCineForgeTokenizer(init_param_path, **kwargs):
    """Load a pretrained tokenizer and register the extra special tokens.

    Args:
        init_param_path: Forwarded unchanged to
            ``AutoTokenizer.from_pretrained``.
        **kwargs: Accepted for call-site compatibility; not used here.

    Returns:
        The loaded tokenizer, after ``add_special_tokens`` has been called
        on it with the eos/pad tokens and the additional markers below.
    """
    # Function-scope import: transformers is only required when this is called.
    from transformers import AutoTokenizer

    # The same literal serves as both the end-of-sequence and padding token.
    end_of_text = '<|endoftext|>'

    # NOTE(review): the list order is kept exactly as before; it presumably
    # determines the ids assigned to newly added tokens -- confirm before
    # reordering.
    extra_markers = [
        '<|im_start|>', '<|im_end|>',
        '<|startofclue|>', '<|endofclue|>', '<|endofprompt|>',
        '[breath]', '<strong>', '</strong>', '[noise]',
        '[laughter]', '[cough]', '[clucking]', '[accent]',
        '[quick_breath]',
        "<laughter>", "</laughter>",
        "[hissing]", "[sigh]", "[vocalized-noise]",
        "[lipsmack]", "[mn]", "<|endofsystem|>",
    ]

    loaded = AutoTokenizer.from_pretrained(init_param_path)
    loaded.add_special_tokens({
        'eos_token': end_of_text,
        'pad_token': end_of_text,
        'additional_special_tokens': extra_markers,
    })
    return loaded