# ========================================================================== #
# CLEAN TEST: AutoModel load from HuggingFace
# Run on a fresh Colab runtime with no prior state.
# Paste this in Colab and it will simply run.
# Upcoming heads will add direct finetune capacity to this tiny model.
# ========================================================================== #
from transformers import AutoModel, AutoTokenizer
import torch

REPO_ID = "AbstractPhil/geolip-captionbert-8192"

print("Loading model...")
# trust_remote_code=True: the repo ships its own model class; this executes
# code from the hub, so only use with repos you trust.
model = AutoModel.from_pretrained(REPO_ID, trust_remote_code=True)
model.eval()
print(f"  Parameters: {sum(p.numel() for p in model.parameters()):,}")

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(REPO_ID, trust_remote_code=True)
print(f"  Vocab: {tokenizer.vocab_size}")

# Probe vocabulary: singular/plural pairs, near-synonyms, and antonyms so the
# similarity matrix below has interpretable structure. ("woman" appears twice
# deliberately — identical inputs should score ~1.0 as a sanity check.)
texts = [
    "girl", "boy", "woman", "man", "mans", "womens", "women", "woman",
    "adjacency", "adjacent", "nearby", "near", "away", "aways",
    "similar", "dissimilar", "solid", "liquid", "prophetic", "predictive",
    "similarity", "differentiation", "differential",
    "addition", "subtraction", "division", "multiplication",
    # Longer-caption probes, toggled off for the word-level run:
    #"A cat sitting on a windowsill watching birds outside",
    #"A golden retriever playing fetch on the beach at sunset",
    #"A still life painting with flowers and fruit on a table",
    #"An aerial photograph of a city skyline at night",
    #"A child riding a bicycle through autumn leaves in a park",
    #"a girl performing an action",
    #"a boy performing an action",
    #"a woman performing an action",
    #"a man performing an action",
]

inputs = tokenizer(texts, max_length=8192, padding=True, truncation=True,
                   return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
# NOTE(review): the code below treats this as a 2-D (batch, dim) pooled
# embedding; for a stock BERT-style model last_hidden_state would be
# (batch, seq, dim) and the matmul below would fail — confirm the remote
# model's output shape.
emb = outputs.last_hidden_state

print(f"\n  Output shape: {emb.shape}")
print(f"  Norms: {emb.norm(dim=-1).tolist()}")

# Pairwise cosine similarity.
# FIX: the original computed raw dot products (emb @ emb.T) while the label
# claimed cosine similarity; the norms printed above show the vectors are not
# guaranteed to be unit-length, so L2-normalize first to get true cosines.
print(f"\n  Pairwise cosine similarity:")
unit = torch.nn.functional.normalize(emb, dim=-1)
sim = unit @ unit.T
for i in range(len(texts)):
    for j in range(i + 1, len(texts)):
        print(f"    [{i}]↔[{j}]: {sim[i, j]:.3f} "
              f"({texts[i][:40]}↔{texts[j][:40]})")

# Exercise the encode() convenience method, if the remote code provides one.
if hasattr(model, 'encode'):
    print(f"\n  Testing encode() method...")
    e = model.encode(["Hello world", "Testing the encoder"])
    print(f"  Shape: {e.shape}")
    # NOTE(review): this dot product is a cosine only if encode() returns
    # unit-norm vectors — verify against the repo's remote code.
    print(f"  Cosine: {(e[0] @ e[1]).item():.3f}")

# FIX: original printed mojibake "āœ“" (UTF-8 check mark mis-decoded as
# Windows-1252); restored to the intended "✓".
print("\n✓ All tests passed")