# geolip-captionbert-8192 / colab_test_script.py
# AbstractPhil — "Create colab_test_script.py" (commit 2028a79, verified)
# ========================================================================================================================== #
# CLEAN TEST: AutoModel load from HuggingFace
# Run on a fresh Colab runtime with no prior state
# Paste this in Colab and it will simply run.
# Upcoming heads will add direct finetune capacity to this tiny model with exquisite potential.
# ========================================================================================================================== #
from transformers import AutoModel, AutoTokenizer
import torch
# HuggingFace repo hosting both the custom model code and its tokenizer.
REPO_ID = "AbstractPhil/geolip-captionbert-8192"
print("Loading model...")
# trust_remote_code=True executes the modeling code shipped inside the repo —
# required for custom architectures; only safe because this repo is trusted.
model = AutoModel.from_pretrained(REPO_ID, trust_remote_code=True)
# Inference mode: disables dropout / batch-norm updates for deterministic output.
model.eval()
print(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
print(f" Vocab: {tokenizer.vocab_size}")
# Encode
# Probe vocabulary: short related word pairs (singular/plural, synonym/antonym,
# arithmetic operations) used to eyeball whether the embedding space places
# semantically related tokens near each other.
texts = [
"girl",
"boy",
"woman",
"man",
"mans",
"womens",
"women",
"woman",  # NOTE(review): duplicate of entry [2] — presumably an intentional identity check (self-similarity ~1.0); confirm with the author
"adjacency",
"adjacent",
"nearby",
"near",
"away",
"aways",
"similar",
"dissimilar",
"solid",
"liquid",
"prophetic",
"predictive",
"similarity",
"differentiation",
"differential",
"addition",
"subtraction",
"division",
"multiplication"
# Longer caption-style probes, disabled to keep the pairwise printout small:
#"A cat sitting on a windowsill watching birds outside",
#"A golden retriever playing fetch on the beach at sunset",
#"A still life painting with flowers and fruit on a table",
#"An aerial photograph of a city skyline at night",
#"A child riding a bicycle through autumn leaves in a park",
#"a girl performing an action",
#"a boy performing an action",
#"a woman performing an action",
#"a man performing an action",
]
# Tokenize the probe texts as one padded batch, then run a single
# gradient-free forward pass through the encoder.
batch = tokenizer(
    texts,
    max_length=8192,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
with torch.no_grad():
    emb = model(**batch).last_hidden_state
print(f"\n Output shape: {emb.shape}")
# NOTE(review): if last_hidden_state is (batch, seq, dim) this prints a nested
# per-token norm list; looks like the model returns one vector per text — confirm.
print(f" Norms: {emb.norm(dim=-1).tolist()}")
# Pairwise similarity
print(f"\n Pairwise cosine similarity:")
# NOTE(review): emb @ emb.T is a raw dot product; it equals cosine similarity
# only when the rows of emb are unit-norm (the norms printed above should be ~1.0).
sim = emb @ emb.T
for i in range(len(texts)):
    for j in range(i + 1, len(texts)):
        # Bug fix: the two caption snippets were formatted back-to-back with no
        # separator, so they printed as one run-together string; insert "↔".
        print(f" [{i}]↔[{j}]: {sim[i,j]:.3f} ({texts[i][:40]} ↔ {texts[j][:40]})")
# Test encode convenience method (only present on some model classes).
if hasattr(model, 'encode'):
    print(f"\n Testing encode() method...")
    e = model.encode(["Hello world", "Testing the encoder"])
    print(f" Shape: {e.shape}")
    # Same caveat as above: dot product == cosine only for unit-norm embeddings.
    print(f" Cosine: {(e[0] @ e[1]).item():.3f}")
print("\n✓ All tests passed")