Instructions to use hf-internal-testing/tiny-layoutlm with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use hf-internal-testing/tiny-layoutlm with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("fill-mask", model="hf-internal-testing/tiny-layoutlm")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-layoutlm")
model = AutoModelForMaskedLM.from_pretrained("hf-internal-testing/tiny-layoutlm")
```
- Notebooks
- Google Colab
- Kaggle
#!/usr/bin/env python
# Build a tiny, randomly initialised LayoutLM checkpoint for basic testing.
#
# Steps, in order:
#   1. load the fast tokenizer of the original checkpoint and truncate its
#      vocabulary to the first `vocab_keep_items` ids,
#   2. load the original config and shrink the size-related fields,
#   3. instantiate a masked-LM model from that config and run one forward
#      pass as a smoke test,
#   4. save model + tokenizer (and a stub README.md if none exists) into the
#      current working directory.

import json
import os
import sys

from tokenizers import Tokenizer
from transformers import AutoTokenizer
from transformers import LayoutLMTokenizer, LayoutLMTokenizerFast, LayoutLMConfig, LayoutLMForMaskedLM

mname_orig = "microsoft/layoutlm-base-uncased"
mname_tiny = "tiny-layoutlm"

vocab_keep_items = 5000


def _truncate_tokenizer_model(model_json, keep):
    """Shrink the vocabulary of a serialized tokenizer model, in place.

    Args:
        model_json: the ``"model"`` section of a tokenizer's JSON
            serialization; must contain ``"type"`` and ``"vocab"`` (and
            ``"merges"`` for BPE).
        keep: number of leading vocabulary entries / ids to retain.

    Raises:
        ValueError: if the model type is not BPE, Unigram, WordPiece or
            WordLevel.
    """
    model_type = model_json["type"]
    vocab = model_json["vocab"]

    if model_type == "BPE":
        new_vocab = {token: idx for token, idx in vocab.items() if idx < keep}
        kept_merges = []
        for merge in model_json["merges"]:
            # A merge is either a single "a b" string or an [a, b] pair,
            # depending on how the tokenizer was serialized; accept both.
            a, b = merge.split() if isinstance(merge, str) else merge
            # Keep a merge only if both parts and the merged token survived.
            if a in new_vocab and b in new_vocab and a + b in new_vocab:
                kept_merges.append(merge)
        model_json["merges"] = kept_merges
    elif model_type == "Unigram":
        # Here the vocab is sliced as a sequence, not filtered by id.
        new_vocab = vocab[:keep]
    elif model_type in ("WordPiece", "WordLevel"):
        new_vocab = {token: idx for token, idx in vocab.items() if idx < keep}
    else:
        raise ValueError(f"don't know how to handle {model_type}")

    model_json["vocab"] = new_vocab


### Tokenizer

tokenizer = AutoTokenizer.from_pretrained(mname_orig, use_fast=True)
if not tokenizer.is_fast:
    # Explicit raise instead of `assert`, so the check survives `python -O`.
    raise ValueError("This only works for fast tokenizers.")

# Round-trip through JSON: dump the backend tokenizer, edit it, rebuild it.
tokenizer_json = json.loads(tokenizer._tokenizer.to_str())
_truncate_tokenizer_model(tokenizer_json["model"], vocab_keep_items)
tokenizer._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))

tokenizer_fast_tiny = tokenizer

### Config

config_tiny = LayoutLMConfig.from_pretrained(mname_orig)
print(config_tiny)
# remember to update this to the actual config as each model is different and then shrink the numbers
config_tiny.update(
    dict(
        vocab_size=vocab_keep_items,
        hidden_size=32,
        intermediate_size=64,
        max_position_embeddings=512,
        # NOTE(review): presumably bounds the bbox coordinate values the
        # model accepts -- confirm against the tests that use this checkpoint.
        max_2d_position_embeddings=128,
        num_attention_heads=2,
        num_hidden_layers=2,
    )
)
print("New config", config_tiny)

### Model

model_tiny = LayoutLMForMaskedLM(config_tiny)
print(f"{mname_tiny}: num of params {model_tiny.num_parameters()}")
# Align the embedding matrix with the tokenizer's actual length.
model_tiny.resize_token_embeddings(len(tokenizer_fast_tiny))

# Test
inputs = tokenizer_fast_tiny("The capital of France is [MASK].", return_tensors="pt")
outputs = model_tiny(**inputs)
print("Test with normal tokenizer:", len(outputs.logits[0]))

# Save
model_tiny.half()  # makes it smaller
model_tiny.save_pretrained(".")
tokenizer_fast_tiny.save_pretrained(".")

# Only create a README if there is none, so an existing one is not clobbered.
readme = "README.md"
if not os.path.exists(readme):
    with open(readme, "w", encoding="utf-8") as f:
        f.write(f"This is a {mname_tiny} random model to be used for basic testing.\n")

print(f"Generated {mname_tiny}")