Token Classification
Transformers
PyTorch
TensorBoard
layoutlmv3
Generated from Trainer
Eval Results (legacy)
Instructions to use jinhybr/OCR-LayoutLMv3 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use jinhybr/OCR-LayoutLMv3 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("token-classification", model="jinhybr/OCR-LayoutLMv3")

# Load model directly
from transformers import AutoProcessor, AutoModelForTokenClassification

processor = AutoProcessor.from_pretrained("jinhybr/OCR-LayoutLMv3")
model = AutoModelForTokenClassification.from_pretrained("jinhybr/OCR-LayoutLMv3")
```
- Notebooks
- Google Colab
- Kaggle
| from typing import Dict, List, Any | |
| from transformers import AutoModelForTokenClassification, AutoProcessor | |
| import torch | |
| from subprocess import run | |
# Install the Tesseract OCR engine and its Python wrapper (pytesseract).
# NOTE: these commands run at import time; check=True makes a failed install
# raise CalledProcessError instead of continuing silently.
import sys

# Argument lists (instead of shell=True strings) run the programs directly,
# with no intermediate shell and no quoting pitfalls.
run(["apt", "install", "-y", "tesseract-ocr"], check=True)
# Invoke pip through the running interpreter so pytesseract is installed into
# the environment that imports it, not whichever `pip` is first on PATH.
run([sys.executable, "-m", "pip", "install", "pytesseract"], check=True)
# helper function to unnormalize bboxes for drawing onto the image
def unnormalize_box(bbox, width, height):
    """Rescale a box from the 0-1000 normalized grid to image dimensions.

    Args:
        bbox: indexable with at least four coordinates; entries 0 and 2 are
            scaled by ``width``, entries 1 and 3 by ``height``.
        width: image width used for the horizontal entries.
        height: image height used for the vertical entries.

    Returns:
        A list with the four rescaled coordinates, in the same order.

    Raises:
        IndexError: if ``bbox`` has fewer than four entries.
    """
    # Even positions pair with the width, odd positions with the height.
    extents = (width, height, width, height)
    return [
        extent * (bbox[position] / 1000)
        for position, extent in enumerate(extents)
    ]
# Pick the compute device once at import time: CUDA when torch reports it as
# available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
class EndpointHandler:
    """Callable handler that runs token classification on a document image.

    The model and processor are loaded once in ``__init__``; each call
    processes one image and returns every token whose predicted label is
    not "O", together with its score, decoded text and pixel-space box.
    """

    def __init__(self, path=""):
        """Load the model (moved to the module-level ``device``) and processor from ``path``."""
        # load model and processor from path
        self.model = AutoModelForTokenClassification.from_pretrained(path).to(device)
        self.processor = AutoProcessor.from_pretrained(path)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, List[Any]]:
        """
        Args:
            data (:obj:`dict`):
                request payload; the deserialized image file (a PIL.Image) is
                read from the "inputs" key, falling back to ``data`` itself
                when that key is absent.

        Returns:
            :obj:`dict` with a single "predictions" key holding a list of
            ``{"label", "score", "text", "bbox"}`` dicts, one per token whose
            predicted label is not "O".
        """
        # process input -- read with .get() so the caller's payload is not mutated
        image = data.get("inputs", data)
        # The model consumes 3-channel pixel values; normalise other PIL modes
        # (e.g. RGBA, L, P) up front. Objects without a ``mode`` attribute are
        # passed through untouched.
        if getattr(image, "mode", "RGB") != "RGB":
            image = image.convert("RGB")

        # process image; truncation keeps pages with very long OCR output
        # within the tokenizer's maximum length instead of failing in the model
        encoding = self.processor(image, truncation=True, return_tensors="pt")

        # run prediction
        with torch.inference_mode():
            outputs = self.model(
                input_ids=encoding.input_ids.to(device),
                bbox=encoding.bbox.to(device),
                attention_mask=encoding.attention_mask.to(device),
                pixel_values=encoding.pixel_values.to(device),
            )
            probabilities = outputs.logits.softmax(-1)

        # post process output
        # Best class and its probability for every token in a single pass,
        # then convert to plain Python values once (assumes a batch of one,
        # as the original squeeze(0) did).
        scores, label_ids = probabilities.squeeze(0).max(dim=-1)
        token_ids = encoding.input_ids.squeeze(0).tolist()
        boxes = encoding.bbox.squeeze(0).tolist()
        id2label = self.model.config.id2label
        decode = self.processor.tokenizer.decode
        width, height = image.width, image.height

        result = []
        for score, label_id, token_id, box in zip(
            scores.tolist(), label_ids.tolist(), token_ids, boxes
        ):
            label = id2label[label_id]
            if label == "O":
                # "O"-labelled tokens are not reported
                continue
            result.append(
                {
                    "label": label,
                    "score": score,
                    "text": decode(token_id),
                    "bbox": unnormalize_box(box, width, height),
                }
            )
        return {"predictions": result}