| from constants import * |
| from transformers import AutoTokenizer |
| import torch |
| import numpy as np |
| from PIL import Image |
| from torchvision import transforms |
|
|
|
|
def get_tokenizer():
    """Build a GPT-2 tokenizer extended with pointing/coordinate tokens.

    Adds structural marker tokens plus one discrete token per coordinate
    bin, and ensures a pad token exists (GPT-2 ships without one).

    Returns:
        tuple: ``(tokenizer, vocab_size)`` where ``vocab_size`` is
        ``len(tokenizer)`` after the new tokens were added.

    Raises:
        ValueError: if the pad token ID is still unset after setup.
    """
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # Structural markers first, then the coordinate-bin vocabulary —
    # same order as the tokens are registered in the rest of the project.
    markers = [
        "<point_start>", "<point_end>", "<result_start>",
        "<result_end>", "<pointx_start>", "<pointx_end>",
        "<pointy_start>", "<pointy_end>",
    ]
    coord_bins = [f"coord_bin_{i}" for i in range(NUM_BINS)]
    tokenizer.add_tokens(markers + coord_bins)

    # GPT-2 has no pad token by default; add one so batched padding works.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    print(f"Tokenizer pad token: {tokenizer.pad_token}, ID: {tokenizer.pad_token_id}")
    print(f"Tokenizer EOS token: {tokenizer.eos_token}, ID: {tokenizer.eos_token_id}")

    # Fail fast: downstream loss masking needs a valid pad token ID.
    if tokenizer.pad_token_id is None:
        raise ValueError("Tokenizer pad token ID is not set!")

    return tokenizer, len(tokenizer)
|
|
def image_to_tensor(image, image_size=IMAGE_SIZE):
    """Convert a PIL image to a normalized (3, image_size, image_size) tensor.

    Args:
        image: input ``PIL.Image``; converted to RGB if in another mode.
        image_size: target square side length (defaults to ``IMAGE_SIZE``).

    Returns:
        ``torch.Tensor`` of shape (3, image_size, image_size), normalized
        with ``IMAGE_MEAN`` / ``IMAGE_STD``.
    """
    rgb = image if image.mode == 'RGB' else image.convert('RGB')

    pipeline = transforms.Compose([
        transforms.Resize((image_size, image_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGE_MEAN, std=IMAGE_STD),
    ])
    return pipeline(rgb)
|
|
def tensor_to_image(tensor):
    """Invert ``image_to_tensor``'s normalization and return a PIL image.

    Args:
        tensor: (3, H, W) float tensor normalized with ``IMAGE_MEAN`` /
            ``IMAGE_STD`` (as produced by ``image_to_tensor``).

    Returns:
        ``PIL.Image.Image`` in RGB mode.
    """
    # Unconditional .cpu() is a no-op for CPU tensors and, unlike the old
    # `if tensor.is_cuda` check, also handles non-CUDA accelerators
    # (e.g. MPS), where .numpy() on a device tensor would raise.
    tensor = tensor.clone().detach().cpu()

    # Undo channel-wise normalization, then clamp into valid image range.
    mean = torch.tensor(IMAGE_MEAN).view(3, 1, 1)
    std = torch.tensor(IMAGE_STD).view(3, 1, 1)
    tensor = torch.clamp(tensor * std + mean, 0, 1)

    # CHW float [0,1] -> HWC uint8 [0,255] for PIL.
    image_np = (tensor.numpy().transpose(1, 2, 0) * 255).astype(np.uint8)
    return Image.fromarray(image_np)
|
|
# Module-level singleton: built once at import time (fetches the GPT-2
# tokenizer files on first use and prints token info to stdout). All
# importers of this module share this tokenizer and vocab_size.
tokenizer, vocab_size = get_tokenizer()