yujiepan commited on
Commit
daba5d0
·
verified ·
1 Parent(s): c2d41f2

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
.meta.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "torch": "2.11.0+cu126",
3
+ "transformers": "5.7.0.dev0"
4
+ }
README.md ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ base_model:
4
+ - openai/privacy-filter
5
+ ---
6
+
7
+ This tiny model is intended for debugging. It is randomly initialized with a configuration adapted from [openai/privacy-filter](https://huggingface.co/openai/privacy-filter).
8
+
9
+ | File path | Size |
10
+ |------|------|
11
+ | model.safetensors | 4.1MB |
12
+
13
+
14
+ ### Example usage:
15
+
16
+ ```python
17
+ import torch
18
+ from transformers import AutoModelForTokenClassification, AutoTokenizer
19
+
20
+ model_id = "tiny-random/openai-privacy-filter"
21
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
22
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
23
+ model = AutoModelForTokenClassification.from_pretrained(
24
+ model_id,
25
+ dtype=torch.bfloat16,
26
+ ).to(device)
27
+ text = ''
28
+ for i in range(10):
29
+ text += f'Contact me at test{i}@example.com or call 555-0000-{i}. '
30
+ enc = tokenizer(text, return_tensors='pt').to(device)
31
+ with torch.no_grad():
32
+ outputs = model(**enc)
33
+ predicted_token_class_ids = outputs.logits.argmax(dim=-1)
34
+ predicted_token_classes = [model.config.id2label[token_id.item()] for token_id in predicted_token_class_ids[0]]
35
+ print(predicted_token_classes, len(predicted_token_classes))
36
+ ```
37
+
38
+ ### Code used to create this repo:
39
+
40
+ <details>
41
+ <summary>Click to expand</summary>
42
+
43
+ ```python
44
+ # Generated by AI.
45
+ import json
46
+ from pathlib import Path
47
+
48
+ import torch
49
+ from huggingface_hub import hf_hub_download
50
+ from transformers import (
51
+ AutoConfig,
52
+ AutoModelForTokenClassification,
53
+ AutoTokenizer,
54
+ set_seed,
55
+ )
56
+
57
+ source_model_id = "openai/privacy-filter"
58
+ save_folder = "/tmp/tiny-random/openai-privacy-filter"
59
+
60
+ Path(save_folder).mkdir(parents=True, exist_ok=True)
61
+ for filename in (
62
+ 'tokenizer.json',
63
+ 'tokenizer_config.json',
64
+ 'viterbi_calibration.json',
65
+ ):
66
+ hf_hub_download(
67
+ repo_id=source_model_id,
68
+ filename=filename,
69
+ repo_type='model',
70
+ local_dir=save_folder,
71
+ )
72
+
73
+ with open(
74
+ hf_hub_download(source_model_id, filename='config.json', repo_type='model'),
75
+ 'r',
76
+ encoding='utf-8',
77
+ ) as f:
78
+ config_json: dict = json.load(f)
79
+
80
+ config_json.update({
81
+ 'num_hidden_layers': 4,
82
+ 'hidden_size': 8,
83
+ 'intermediate_size': 32,
84
+ 'num_attention_heads': 8,
85
+ 'num_key_value_heads': 4,
86
+ 'head_dim': 32,
87
+ })
88
+ config_json.pop('transformers.js_config', None)
89
+
90
+ with open(f'{save_folder}/config.json', 'w', encoding='utf-8') as f:
91
+ json.dump(config_json, f, indent=2)
92
+
93
+ config = AutoConfig.from_pretrained(save_folder)
94
+ print(config)
95
+ torch.set_default_dtype(torch.bfloat16)
96
+ model = AutoModelForTokenClassification.from_config(config, trust_remote_code=True)
97
+ torch.set_default_dtype(torch.float32)
98
+
99
+ model = model.cpu()
100
+ set_seed(42)
101
+ with torch.no_grad():
102
+ for name, p in sorted(model.named_parameters()):
103
+ torch.nn.init.normal_(p, mean=0.0, std=0.8)
104
+ print(name, tuple(p.shape))
105
+ for i in range(model.config.num_hidden_layers):
106
+ model.model.layers[i].self_attn.sinks = torch.nn.Parameter(model.model.layers[i].self_attn.sinks.float())
107
+ model.save_pretrained(save_folder)
108
+ print(model)
109
+ ```
110
+
111
+ </details>
112
+
113
+ ### Printing the model:
114
+
115
+ <details><summary>Click to expand</summary>
116
+
117
+ ```text
118
+ OpenAIPrivacyFilterForTokenClassification(
119
+ (model): OpenAIPrivacyFilterModel(
120
+ (embed_tokens): Embedding(200064, 8, padding_idx=199999)
121
+ (layers): ModuleList(
122
+ (0-3): 4 x OpenAIPrivacyFilterEncoderLayer(
123
+ (self_attn): OpenAIPrivacyFilterAttention(
124
+ (q_proj): Linear(in_features=8, out_features=256, bias=True)
125
+ (k_proj): Linear(in_features=8, out_features=128, bias=True)
126
+ (v_proj): Linear(in_features=8, out_features=128, bias=True)
127
+ (o_proj): Linear(in_features=256, out_features=8, bias=True)
128
+ )
129
+ (mlp): OpenAIPrivacyFilterMLP(
130
+ (router): OpenAIPrivacyFilterTopKRouter()
131
+ (experts): OpenAIPrivacyFilterExperts()
132
+ )
133
+ (input_layernorm): OpenAIPrivacyFilterRMSNorm((8,), eps=1e-05)
134
+ (post_attention_layernorm): OpenAIPrivacyFilterRMSNorm((8,), eps=1e-05)
135
+ )
136
+ )
137
+ (norm): OpenAIPrivacyFilterRMSNorm((8,), eps=1e-05)
138
+ (rotary_emb): OpenAIPrivacyFilterRotaryEmbedding()
139
+ )
140
+ (dropout): Dropout(p=0.0, inplace=False)
141
+ (score): Linear(in_features=8, out_features=33, bias=True)
142
+ )
143
+ ```
144
+
145
+ </details>
146
+
147
+ ### Test environment:
148
+
149
+ - torch: 2.11.0+cu126
150
+ - transformers: 5.7.0.dev0
config.json ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "OpenAIPrivacyFilterForTokenClassification"
4
+ ],
5
+ "attention_bias": true,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": 0.0,
9
+ "default_n_ctx": 128000,
10
+ "dtype": "bfloat16",
11
+ "eos_token_id": 199999,
12
+ "head_dim": 32,
13
+ "hidden_act": "silu",
14
+ "hidden_size": 8,
15
+ "id2label": {
16
+ "0": "O",
17
+ "1": "B-account_number",
18
+ "2": "I-account_number",
19
+ "3": "E-account_number",
20
+ "4": "S-account_number",
21
+ "5": "B-private_address",
22
+ "6": "I-private_address",
23
+ "7": "E-private_address",
24
+ "8": "S-private_address",
25
+ "9": "B-private_date",
26
+ "10": "I-private_date",
27
+ "11": "E-private_date",
28
+ "12": "S-private_date",
29
+ "13": "B-private_email",
30
+ "14": "I-private_email",
31
+ "15": "E-private_email",
32
+ "16": "S-private_email",
33
+ "17": "B-private_person",
34
+ "18": "I-private_person",
35
+ "19": "E-private_person",
36
+ "20": "S-private_person",
37
+ "21": "B-private_phone",
38
+ "22": "I-private_phone",
39
+ "23": "E-private_phone",
40
+ "24": "S-private_phone",
41
+ "25": "B-private_url",
42
+ "26": "I-private_url",
43
+ "27": "E-private_url",
44
+ "28": "S-private_url",
45
+ "29": "B-secret",
46
+ "30": "I-secret",
47
+ "31": "E-secret",
48
+ "32": "S-secret"
49
+ },
50
+ "initial_context_length": 4096,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 32,
53
+ "label2id": {
54
+ "B-account_number": 1,
55
+ "B-private_address": 5,
56
+ "B-private_date": 9,
57
+ "B-private_email": 13,
58
+ "B-private_person": 17,
59
+ "B-private_phone": 21,
60
+ "B-private_url": 25,
61
+ "B-secret": 29,
62
+ "E-account_number": 3,
63
+ "E-private_address": 7,
64
+ "E-private_date": 11,
65
+ "E-private_email": 15,
66
+ "E-private_person": 19,
67
+ "E-private_phone": 23,
68
+ "E-private_url": 27,
69
+ "E-secret": 31,
70
+ "I-account_number": 2,
71
+ "I-private_address": 6,
72
+ "I-private_date": 10,
73
+ "I-private_email": 14,
74
+ "I-private_person": 18,
75
+ "I-private_phone": 22,
76
+ "I-private_url": 26,
77
+ "I-secret": 30,
78
+ "O": 0,
79
+ "S-account_number": 4,
80
+ "S-private_address": 8,
81
+ "S-private_date": 12,
82
+ "S-private_email": 16,
83
+ "S-private_person": 20,
84
+ "S-private_phone": 24,
85
+ "S-private_url": 28,
86
+ "S-secret": 32
87
+ },
88
+ "max_position_embeddings": 131072,
89
+ "model_type": "openai_privacy_filter",
90
+ "num_attention_heads": 8,
91
+ "num_experts_per_tok": 4,
92
+ "num_hidden_layers": 4,
93
+ "num_key_value_heads": 4,
94
+ "num_local_experts": 128,
95
+ "output_router_logits": false,
96
+ "pad_token_id": 199999,
97
+ "rms_norm_eps": 1e-05,
98
+ "rope_parameters": {
99
+ "beta_fast": 32.0,
100
+ "beta_slow": 1.0,
101
+ "factor": 32.0,
102
+ "original_max_position_embeddings": 4096,
103
+ "rope_theta": 150000.0,
104
+ "rope_type": "yarn",
105
+ "truncate": false
106
+ },
107
+ "router_aux_loss_coef": 0.001,
108
+ "sliding_window": 128,
109
+ "tie_word_embeddings": false,
110
+ "transformers_version": "5.7.0.dev0",
111
+ "use_cache": true,
112
+ "vocab_size": 200064
113
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:097171a60f58fd53356be06b72119af4452fd177efd97f431c5bcd6db116ff2b
3
+ size 4132058
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
3
+ size 27868174
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "eos_token": "<|endoftext|>",
4
+ "model_input_names": [
5
+ "input_ids",
6
+ "attention_mask"
7
+ ],
8
+ "model_max_length": 128000,
9
+ "pad_token": "<|endoftext|>",
10
+ "tokenizer_class": "TokenizersBackend"
11
+ }
viterbi_calibration.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "operating_points": {
3
+ "default": {
4
+ "biases": {
5
+ "transition_bias_background_stay": 0.0,
6
+ "transition_bias_background_to_start": 0.0,
7
+ "transition_bias_end_to_background": 0.0,
8
+ "transition_bias_end_to_start": 0.0,
9
+ "transition_bias_inside_to_continue": 0.0,
10
+ "transition_bias_inside_to_end": 0.0
11
+ }
12
+ }
13
+ }
14
+ }