Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

.gitattributes +1 -0
.meta.json +4 -0
README.md +150 -0
config.json +113 -0
model.safetensors +3 -0
tokenizer.json +3 -0
tokenizer_config.json +11 -0
viterbi_calibration.json +14 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

.meta.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "torch": "2.11.0+cu126",
+  "transformers": "5.7.0.dev0"
+}

README.md ADDED Viewed

	@@ -0,0 +1,150 @@

+---
+library_name: transformers
+base_model:
+- openai/privacy-filter
+---
+This tiny model is intended for debugging only. It is randomly initialized with a scaled-down configuration adapted from [openai/privacy-filter](https://huggingface.co/openai/privacy-filter).
+| File path | Size |
+|------|------|
+| model.safetensors | 4.1MB |
+### Example usage:
+```python
+import torch
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+model_id = "tiny-random/openai-privacy-filter"
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForTokenClassification.from_pretrained(
+    model_id,
+    dtype=torch.bfloat16,
+).to(device)
+text = ''
+for i in range(10):
+    text += f'Contact me at test{i}@example.com or call 555-0000-{i}. '
+enc = tokenizer(text, return_tensors='pt').to(device)
+with torch.no_grad():
+    outputs = model(**enc)
+predicted_token_class_ids = outputs.logits.argmax(dim=-1)
+predicted_token_classes = [model.config.id2label[token_id.item()] for token_id in predicted_token_class_ids[0]]
+print(predicted_token_classes, len(predicted_token_classes))
+```
+### Code to create this repo:
+<details>
+<summary>Click to expand</summary>
+```python
+# Generated by AI.
+import json
+from pathlib import Path
+import torch
+from huggingface_hub import hf_hub_download
+from transformers import (
+    AutoConfig,
+    AutoModelForTokenClassification,
+    AutoTokenizer,
+    set_seed,
+)
+source_model_id = "openai/privacy-filter"
+save_folder = "/tmp/tiny-random/openai-privacy-filter"
+Path(save_folder).mkdir(parents=True, exist_ok=True)
+for filename in (
+    'tokenizer.json',
+    'tokenizer_config.json',
+    'viterbi_calibration.json',
+):
+    hf_hub_download(
+        repo_id=source_model_id,
+        filename=filename,
+        repo_type='model',
+        local_dir=save_folder,
+    )
+with open(
+    hf_hub_download(source_model_id, filename='config.json', repo_type='model'),
+    'r',
+    encoding='utf-8',
+) as f:
+    config_json: dict = json.load(f)
+config_json.update({
+    'num_hidden_layers': 4,
+    'hidden_size': 8,
+    'intermediate_size': 32,
+    'num_attention_heads': 8,
+    'num_key_value_heads': 4,
+    'head_dim': 32,
+})
+config_json.pop('transformers.js_config', None)
+with open(f'{save_folder}/config.json', 'w', encoding='utf-8') as f:
+    json.dump(config_json, f, indent=2)
+config = AutoConfig.from_pretrained(save_folder)
+print(config)
+torch.set_default_dtype(torch.bfloat16)
+model = AutoModelForTokenClassification.from_config(config, trust_remote_code=True)
+torch.set_default_dtype(torch.float32)
+model = model.cpu()
+set_seed(42)
+with torch.no_grad():
+    for name, p in sorted(model.named_parameters()):
+        torch.nn.init.normal_(p, mean=0.0, std=0.8)
+        print(name, tuple(p.shape))
+for i in range(model.config.num_hidden_layers):
+    model.model.layers[i].self_attn.sinks = torch.nn.Parameter(model.model.layers[i].self_attn.sinks.float())
+model.save_pretrained(save_folder)
+print(model)
+```
+</details>
+### Printed model structure:
+<details><summary>Click to expand</summary>
+```text
+OpenAIPrivacyFilterForTokenClassification(
+  (model): OpenAIPrivacyFilterModel(
+    (embed_tokens): Embedding(200064, 8, padding_idx=199999)
+    (layers): ModuleList(
+      (0-3): 4 x OpenAIPrivacyFilterEncoderLayer(
+        (self_attn): OpenAIPrivacyFilterAttention(
+          (q_proj): Linear(in_features=8, out_features=256, bias=True)
+          (k_proj): Linear(in_features=8, out_features=128, bias=True)
+          (v_proj): Linear(in_features=8, out_features=128, bias=True)
+          (o_proj): Linear(in_features=256, out_features=8, bias=True)
+        )
+        (mlp): OpenAIPrivacyFilterMLP(
+          (router): OpenAIPrivacyFilterTopKRouter()
+          (experts): OpenAIPrivacyFilterExperts()
+        )
+        (input_layernorm): OpenAIPrivacyFilterRMSNorm((8,), eps=1e-05)
+        (post_attention_layernorm): OpenAIPrivacyFilterRMSNorm((8,), eps=1e-05)
+      )
+    )
+    (norm): OpenAIPrivacyFilterRMSNorm((8,), eps=1e-05)
+    (rotary_emb): OpenAIPrivacyFilterRotaryEmbedding()
+  )
+  (dropout): Dropout(p=0.0, inplace=False)
+  (score): Linear(in_features=8, out_features=33, bias=True)
+)
+```
+</details>
+### Test environment:
+- torch: 2.11.0+cu126
+- transformers: 5.7.0.dev0

config.json ADDED Viewed

	@@ -0,0 +1,113 @@

+{
+  "architectures": [
+    "OpenAIPrivacyFilterForTokenClassification"
+  ],
+  "attention_bias": true,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "classifier_dropout": 0.0,
+  "default_n_ctx": 128000,
+  "dtype": "bfloat16",
+  "eos_token_id": 199999,
+  "head_dim": 32,
+  "hidden_act": "silu",
+  "hidden_size": 8,
+  "id2label": {
+    "0": "O",
+    "1": "B-account_number",
+    "2": "I-account_number",
+    "3": "E-account_number",
+    "4": "S-account_number",
+    "5": "B-private_address",
+    "6": "I-private_address",
+    "7": "E-private_address",
+    "8": "S-private_address",
+    "9": "B-private_date",
+    "10": "I-private_date",
+    "11": "E-private_date",
+    "12": "S-private_date",
+    "13": "B-private_email",
+    "14": "I-private_email",
+    "15": "E-private_email",
+    "16": "S-private_email",
+    "17": "B-private_person",
+    "18": "I-private_person",
+    "19": "E-private_person",
+    "20": "S-private_person",
+    "21": "B-private_phone",
+    "22": "I-private_phone",
+    "23": "E-private_phone",
+    "24": "S-private_phone",
+    "25": "B-private_url",
+    "26": "I-private_url",
+    "27": "E-private_url",
+    "28": "S-private_url",
+    "29": "B-secret",
+    "30": "I-secret",
+    "31": "E-secret",
+    "32": "S-secret"
+  },
+  "initial_context_length": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 32,
+  "label2id": {
+    "B-account_number": 1,
+    "B-private_address": 5,
+    "B-private_date": 9,
+    "B-private_email": 13,
+    "B-private_person": 17,
+    "B-private_phone": 21,
+    "B-private_url": 25,
+    "B-secret": 29,
+    "E-account_number": 3,
+    "E-private_address": 7,
+    "E-private_date": 11,
+    "E-private_email": 15,
+    "E-private_person": 19,
+    "E-private_phone": 23,
+    "E-private_url": 27,
+    "E-secret": 31,
+    "I-account_number": 2,
+    "I-private_address": 6,
+    "I-private_date": 10,
+    "I-private_email": 14,
+    "I-private_person": 18,
+    "I-private_phone": 22,
+    "I-private_url": 26,
+    "I-secret": 30,
+    "O": 0,
+    "S-account_number": 4,
+    "S-private_address": 8,
+    "S-private_date": 12,
+    "S-private_email": 16,
+    "S-private_person": 20,
+    "S-private_phone": 24,
+    "S-private_url": 28,
+    "S-secret": 32
+  },
+  "max_position_embeddings": 131072,
+  "model_type": "openai_privacy_filter",
+  "num_attention_heads": 8,
+  "num_experts_per_tok": 4,
+  "num_hidden_layers": 4,
+  "num_key_value_heads": 4,
+  "num_local_experts": 128,
+  "output_router_logits": false,
+  "pad_token_id": 199999,
+  "rms_norm_eps": 1e-05,
+  "rope_parameters": {
+    "beta_fast": 32.0,
+    "beta_slow": 1.0,
+    "factor": 32.0,
+    "original_max_position_embeddings": 4096,
+    "rope_theta": 150000.0,
+    "rope_type": "yarn",
+    "truncate": false
+  },
+  "router_aux_loss_coef": 0.001,
+  "sliding_window": 128,
+  "tie_word_embeddings": false,
+  "transformers_version": "5.7.0.dev0",
+  "use_cache": true,
+  "vocab_size": 200064
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:097171a60f58fd53356be06b72119af4452fd177efd97f431c5bcd6db116ff2b
+size 4132058

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0614fe83cadab421296e664e1f48f4261fa8fef6e03e63bb75c20f38e37d07d3
+size 27868174

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "backend": "tokenizers",
+  "eos_token": "<|endoftext|>",
+  "model_input_names": [
+    "input_ids",
+    "attention_mask"
+  ],
+  "model_max_length": 128000,
+  "pad_token": "<|endoftext|>",
+  "tokenizer_class": "TokenizersBackend"
+}

viterbi_calibration.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "operating_points": {
+    "default": {
+      "biases": {
+        "transition_bias_background_stay": 0.0,
+        "transition_bias_background_to_start": 0.0,
+        "transition_bias_end_to_background": 0.0,
+        "transition_bias_end_to_start": 0.0,
+        "transition_bias_inside_to_continue": 0.0,
+        "transition_bias_inside_to_end": 0.0
+      }
+    }
+  }
+}