| | from dataclasses import dataclass |
| | import shutil |
| | from textwrap import dedent, indent |
| | from typing import Any |
| | import numpy as np |
| | from zstandard import ZstdCompressor |
| | from pathlib import Path |
| | import io |
| | from sentence_transformers import SentenceTransformer |
| | from torch.nn import EmbeddingBag |
| | import torch |
| | from model2vec import StaticModel |
| | from tokenizers import Encoding, Tokenizer |
| |
|
# Root output folder for every exported model artifact.
# NOTE(review): main() deletes and recreates this folder on each run.
models_path = Path("models")
| |
|
| |
|
@dataclass
class ModelCard:
    """Metadata for one Hugging Face model that this script exports.

    The "matroyshka" spelling is used consistently across the project for
    the Matryoshka (nested) embedding dimensions.
    """

    # Hugging Face namespace, e.g. "sentence-transformers".
    owner: str
    # Repository name within that namespace.
    repo: str
    # Embedding widths the vectors may be truncated to.
    matroyshka_dims: list[int]
    # Free-form markdown description (may be an indented triple-quoted string).
    description: str
    # License identifier as used by choosealicense.com, e.g. "apache-2.0".
    license: str

    def name(self):
        """Fully qualified "owner/repo" model identifier."""
        return "/".join((self.owner, self.repo))

    def path(self):
        """Local directory where this model's artifacts are written."""
        return models_path.joinpath(self.owner, self.repo)

    def get_description(self):
        """Description with the common leading indentation removed."""
        text = dedent(self.description)
        return text.strip()
|
def zst_compress_file(input: Path):
    """Write a zstd-compressed sibling of *input* named "<name>.zst"."""
    compressed_path = input.with_name(f"{input.name}.zst")
    print(f"Compressing {compressed_path}")
    compressor = ZstdCompressor()
    with open(input, "rb") as source, open(compressed_path, "wb") as sink:
        compressor.copy_stream(source, sink)
|
| |
|
def save_data(path: Path, tensor: torch.Tensor):
    """Writes out the static embeddings to a .npy and .npy.zst file.

    Args:
        path: Destination .npy file; a "<name>.zst" copy is written next to it.
        tensor: The embeddings to serialize.
    """
    if tensor.dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
        # numpy has no float8 dtype, so store the raw bytes reinterpreted as
        # uint8 — consumers must view them back as the matching float8 type.
        array = tensor.detach().view(torch.uint8).numpy()
    else:
        array = tensor.detach().numpy()

    print(f"Saving {path}")
    # np.save accepts an open file object, so write straight to disk rather
    # than staging the bytes in an in-memory buffer first.
    with open(path, "wb") as outfile:
        np.save(outfile, array)

    zst_compress_file(path)
|
| |
|
def quantization_loss_mse(tensor: torch.Tensor, dtype: torch.dtype):
    """
    Compute reconstruction loss when converting embeddings to a datatype and back using
    the mean squared error, which punishes big errors more than small ones.
    """
    # Quantize, then cast back so the comparison happens in the source dtype.
    reconstructed = tensor.detach().to(dtype).to(tensor.dtype)

    squared_error = (tensor - reconstructed).pow(2)
    return squared_error.mean().item()
| |
|
| |
|
def quantization_loss_mae(tensor: torch.Tensor, dtype: torch.dtype):
    """
    Compute reconstruction loss when converting embeddings to a datatype and back using
    the mean absolute error, which is less sensitive to outliers than MSE.
    """
    # Quantize, then cast back so the comparison happens in the source dtype.
    reconstructed = tensor.detach().to(dtype).to(tensor.dtype)

    absolute_error = (tensor - reconstructed).abs()
    return absolute_error.mean().item()
| |
|
| |
|
def quantization_loss_cosine(tensor: torch.Tensor, dtype: torch.dtype):
    """
    Compute reconstruction loss when converting embeddings to a datatype and back using
    cosine similarity. This measures whether the embedding directions are preserved
    after quantization, independent of their magnitudes.
    """
    # Quantize, then cast back so the comparison happens in the source dtype.
    reconstructed = tensor.detach().to(dtype).to(tensor.dtype)

    # Compare row vectors: promote a 1-d tensor to a single row, and flatten
    # anything higher-dimensional into one vector per leading index.
    if tensor.ndim == 1:
        original_rows = tensor[None, :]
        reconstructed_rows = reconstructed[None, :]
    else:
        original_rows = tensor.reshape(tensor.shape[0], -1)
        reconstructed_rows = reconstructed.reshape(reconstructed.shape[0], -1)

    similarities = torch.nn.functional.cosine_similarity(
        original_rows, reconstructed_rows, dim=1
    )
    return similarities.mean().item()
| |
|
| |
|
def export_embeddings(model_card: ModelCard, embeddings: torch.Tensor) -> None:
    """Export the embedding table at every Matryoshka truncation and precision.

    For each width in ``model_card.matroyshka_dims``, the float32 embeddings
    are truncated to the leading columns and written as fp32, fp16, fp8 e5m2,
    and fp8 e4m3 .npy (plus .npy.zst) files.
    """
    vocab_size, dimensions = embeddings.shape

    assert (
        embeddings.dtype == torch.float32
    ), f"The embeddings {embeddings.dtype} are assumed to be float32."

    # Every exported precision as (file name prefix, storage dtype). Keeping
    # them in one table avoids four copy-pasted save_data calls per dim.
    precisions = [
        ("fp32", torch.float32),
        ("fp16", torch.float16),
        ("fp8_e5m2", torch.float8_e5m2),
        ("fp8_e4m3", torch.float8_e4m3fn),
    ]

    for dim in model_card.matroyshka_dims:
        assert (
            dim <= dimensions
        ), f"The Matroyshka dimensions {dim} were bigger than the models dimensions of {dimensions}"

        # Matryoshka-trained models concentrate information in the leading
        # components, so truncation is a plain column slice.
        truncated = embeddings[:, :dim]
        assert truncated.shape == torch.Size([vocab_size, dim])

        for prefix, dtype in precisions:
            # .to(torch.float32) on an fp32 tensor is a no-op, so the fp32
            # file is byte-identical to saving `truncated` directly.
            save_data(
                model_card.path() / f"{prefix}.d{dim}.npy",
                truncated.to(dtype=dtype),
            )
| |
|
| |
|
def normalized_mean_pooling(x: torch.Tensor) -> torch.Tensor:
    """Mean-pool the rows of *x* into a single unit-length vector."""
    return torch.nn.functional.normalize(x.mean(dim=0), dim=0)
| |
|
| |
|
def export_readme(
    model_card: ModelCard,
    embeddings: torch.Tensor,
    tokenizer: Tokenizer,
):
    """Write the model's README.md with stats, quantization loss tables, and
    tokenizer examples.

    Args:
        model_card: Metadata and output location for the model.
        embeddings: The full float32 embedding table (vocab x dimensions).
        tokenizer: The model's tokenizer, used for the pooling test phrases
            and the tokenization examples.
    """
    vocab_size, dimensions = embeddings.shape
    norms = torch.norm(embeddings, dim=1)

    # BUGFIX: these literals previously had no separating commas, so implicit
    # string concatenation collapsed all ten phrases into a single string and
    # the pooled-cosine stats were computed over one giant phrase.
    phrases = [
        "The committee approved the proposal after hours of heated discussion and several last-minute amendments.",
        "When training large neural networks, careful tuning of hyperparameters can significantly affect performance and stability.",
        "Despite the heavy rain, the concert continued as planned and the crowd stayed enthusiastic until the final encore.",
        "In ancient mythology, heroes often embarked on perilous journeys to discover hidden truths about themselves and their world.",
        "The new smartphone model features an improved camera system, faster processing, and extended battery life compared to its predecessor.",
        "He tried to explain the concept using simple analogies, but the underlying mathematics remained difficult to grasp for most listeners.",
        "After weeks of negotiations, the two countries signed a historic trade agreement aimed at reducing tariffs and boosting cooperation.",
        "She paused for a moment before answering, choosing her words carefully to avoid misunderstanding in such a delicate situation.",
        "The detective pieced together the timeline of events, realizing that the key witness had provided a contradictory statement.",
        "Remote work has changed the way teams collaborate, with online tools replacing traditional office routines and in-person meetings.",
    ]

    # Per-precision cosine similarities between unquantized and roundtripped
    # mean-pooled phrase vectors.
    cosine_similarity = {
        torch.float16: [],
        torch.float8_e4m3fn: [],
        torch.float8_e5m2: [],
    }

    for phrase in phrases:
        encoding: Encoding = tokenizer.encode(phrase)
        embedded_phrase = embeddings[torch.tensor(encoding.ids, dtype=torch.long)]

        # The unquantized pooling does not depend on the target dtype, so
        # compute it once per phrase instead of once per precision.
        pooling_unquantized = normalized_mean_pooling(embedded_phrase)

        for dtype in cosine_similarity:
            # Roundtrip through the quantized dtype, but do the pooling
            # arithmetic in float32 space.
            pooling_roundtrip = normalized_mean_pooling(
                embedded_phrase.to(dtype).to(torch.float32)
            )
            cosine = torch.dot(pooling_unquantized, pooling_roundtrip).item()
            cosine_similarity[dtype].append(cosine)

    avg_cosine_similarity = {
        dtype: sum(values) / len(values) for dtype, values in cosine_similarity.items()
    }

    # Build the tokenizer demonstration section from a variety of scripts.
    tokenizer_examples = []
    for text in [
        "This is an example of encoding",
        "The quick brown fox jumps over the lazy dog.",
        "Curaçao, naïve fiancé, jalapeño, déjà vu.",
        "Привет, как дела?",
        "Бързата кафява лисица прескача мързеливото куче.",
        "Γρήγορη καφέ αλεπού πηδάει πάνω από τον τεμπέλη σκύλο.",
        "اللغة العربية جميلة وغنية بالتاريخ.",
        "مرحبا بالعالم!",
        "Simplified: 快速的棕色狐狸跳过懒狗。",
        "Traditional: 快速的棕色狐狸跳過懶狗。",
        "素早い茶色の狐が怠け者の犬を飛び越える。",
        "コンピュータープログラミング",
        "빠른 갈색 여우가 게으른 개를 뛰어넘습니다.",
        "तेज़ भूरी लोमड़ी आलसी कुत्ते के ऊपर कूदती है।",
        "দ্রুত বাদামী শিয়াল অলস কুকুরের উপর দিয়ে লাফ দেয়।",
        "வேகமான பழுப்பு நரி சோம்பேறி நாயின் மேல் குதிக்கிறது.",
        "สุนัขจิ้งจอกสีน้ำตาลกระโดดข้ามสุนัขขี้เกียจ.",
        "ብሩክ ቡናማ ቀበሮ ሰነፍ ውሻን ተዘልሏል።",
        "Hello 世界 مرحبا 🌍",
        "123, αβγ, абв, العربية, 中文, हिन्दी.",
    ]:
        encoding = tokenizer.encode(text)
        tokens = [f"`{token}`" for token in encoding.tokens]

        tokenizer_examples.append(f"**Input:** {text}<br/>")
        tokenizer_examples.append(f"**Tokens**: {' '.join(tokens)}")
        tokenizer_examples.append("")

    tokenizer_output = "\n".join(tokenizer_examples)

    with (model_card.path() / "README.md").open("wt") as file:
        # Must match the literal indentation of the template below so dedent()
        # strips the interpolated multi-line sections uniformly too.
        prefix = " " * 16

        file.write(
            dedent(
                f"""
                # [{model_card.name()}](https://huggingface.co/{model_card.name()})

                License: [{model_card.license}](https://choosealicense.com/licenses/{model_card.license}/)

                {indent(model_card.get_description(), prefix).strip()}

                ## Model Stats

                Stats that describe the embeddings tensor shapes and value distribution.

                | item | metric | value |
                | --------------| ----------------------- | ----- |
                | vocab | size | {vocab_size:,.0f} |
                | embedding | dimensions | {dimensions:,.0f} |
                | vector length | mean | {norms.mean().item():.2f} |
                | vector length | median | {norms.median().item():.2f} |
                | vector length | stddev | {norms.std().item():.2f} |
                | values | mean | {embeddings.mean().item():.2f} |
                | values | median | {embeddings.median().item():.2f} |
                | values | stddev | {embeddings.std().item():.2f} |

                ## Mean Pooled Quantization Loss

                This test roundtrips the vectors through quantization, but performs the
                mean pooling arithmetic in float32 space. The quantized and unquantized
                mean pooled vectors are compared to each other to determine their cosine
                similarity, to show how much the meaning of the vector has changed due
                to quantization.

                | Precision | Cosine Similarity |
                | ------------- | ----------------- |
                | fp16 | {avg_cosine_similarity[torch.float16]:.5f} |
                | fp8 e4m3 | {avg_cosine_similarity[torch.float8_e4m3fn]:.5f} |
                | fp8 e5m2 | {avg_cosine_similarity[torch.float8_e5m2]:.5f} |

                ## Quantization Loss Per Vector

                While ultimately the embedding vectors will be mean pooled together, it's
                still useful to look at the loss per-vector in the embedding table to see
                which quantization strategies retain the most vector meaning.

                - **Cosine Similarity** — measures how well the *direction* of embedding vectors
                  is preserved after quantization, independent of scale. This is especially
                  relevant when embeddings are used for similarity search or retrieval.
                - **MSE (Mean Squared Error)** — emphasizes large errors by squaring the
                  differences. Useful for detecting whether any values are badly distorted.
                - **MAE (Mean Absolute Error)** — the average absolute difference between
                  original and quantized values. Easier to interpret, less sensitive to outliers.

                | Precision | Metric | Value |
                | ------------- | ------ | ----- |
                | fp16 | cosine similarity | {quantization_loss_cosine(embeddings, torch.float16):.5f} |
                | fp8 e4m3 | cosine similarity | {quantization_loss_cosine(embeddings, torch.float8_e4m3fn):.5f} |
                | fp8 e5m2 | cosine similarity | {quantization_loss_cosine(embeddings, torch.float8_e5m2):.5f} |
                | fp16 | MSE | {quantization_loss_mse(embeddings, torch.float16):.5f} |
                | fp8 e4m3 | MSE | {quantization_loss_mse(embeddings, torch.float8_e4m3fn):.5f} |
                | fp8 e5m2 | MSE | {quantization_loss_mse(embeddings, torch.float8_e5m2):.5f} |
                | fp16 | MAE | {quantization_loss_mae(embeddings, torch.float16):.5f} |
                | fp8 e4m3 | MAE | {quantization_loss_mae(embeddings, torch.float8_e4m3fn):.5f} |
                | fp8 e5m2 | MAE | {quantization_loss_mae(embeddings, torch.float8_e5m2):.5f} |

                ## Tokenizer Examples

                {indent(tokenizer_output, prefix).strip()}
                """
            ).strip()
        )
| |
|
| |
|
def export_tokenizer(model_card: ModelCard, tokenizer: Tokenizer) -> None:
    """Save the tokenizer definition as tokenizer.json plus a .zst copy."""
    output_path = model_card.path() / "tokenizer.json"
    print(f"Exporting tokenizer: {output_path}")
    # Tokenizer.save requires a string path rather than a Path object.
    tokenizer.save(str(output_path))
    zst_compress_file(output_path)
| |
|
| |
|
def export_sentence_transformers(model_card: ModelCard) -> None:
    """Extract the embeddings and tokenizer from SentenceTransformers"""

    print("Processing", model_card.name())

    model = SentenceTransformer(model_card.name(), device="cpu")
    # The first module of these static models exposes the token table as an
    # EmbeddingBag — NOTE(review): assumes model[0] has an `embedding`
    # attribute; confirm for any newly added model.
    embedding_bag: EmbeddingBag = model[0].embedding
    model_card.path().mkdir(exist_ok=True, parents=True)
    # Copy the weights with clone().detach() instead of the deprecated
    # torch.Tensor(tensor) copy-construct form, which emits a UserWarning.
    embeddings = embedding_bag.weight.clone().detach()

    export_embeddings(model_card, embeddings)
    export_tokenizer(model_card, model.tokenizer)
    export_readme(model_card, embeddings, model.tokenizer)
| |
|
| |
|
def export_model2vec(model_card: ModelCard) -> None:
    """Extract the embeddings and tokenizer from model2vec"""

    print("Processing", model_card.name())

    static_model = StaticModel.from_pretrained(model_card.name())
    model_card.path().mkdir(exist_ok=True, parents=True)

    # model2vec stores the table as a numpy array; wrap it as a torch tensor.
    embeddings = torch.from_numpy(static_model.embedding)
    export_embeddings(model_card, embeddings)
    export_tokenizer(model_card, static_model.tokenizer)
    export_readme(model_card, embeddings, static_model.tokenizer)
| |
|
| |
|
def main() -> None:
    """Export all configured models' embeddings, tokenizers, and READMEs.

    Rebuilds the models/ output folder from scratch on every run.
    """
    # Fixed prose typos in the description strings below: "I was designed" →
    # "It was designed", a doubled "the the", and "Matroyshka" → "Matryoshka"
    # (the `matroyshka_dims` field name is public API and keeps its spelling).
    sentence_transformers_models = [
        ModelCard(
            owner="sentence-transformers",
            repo="static-similarity-mrl-multilingual-v1",
            description="""
            Multi-lingual similarity embeddings that were trained with Matryoshka loss
            that allows for more effective truncation of the embedding vectors. It
            was trained on a variety of domains of multilingual datasets.

            It's a general purpose model that can be used for semantic textual similarity,
            paraphrase mining, text classification, clustering, and more
            """,
            matroyshka_dims=[32, 64, 128, 256, 512, 1024],
            license="apache-2.0",
        ),
        ModelCard(
            owner="sentence-transformers",
            repo="static-retrieval-mrl-en-v1",
            description="""
            English-only uncased similarity embeddings that were trained with Matryoshka
            loss that allows for more effective truncation of the embedding vectors. It
            was trained on a variety of domains of monolingual datasets. It was designed
            specifically for similarity retrieval.
            """,
            matroyshka_dims=[32, 64, 128, 256, 512, 1024],
            license="apache-2.0",
        ),
    ]

    model2vec_models = [
        ModelCard(
            owner="minishlab",
            repo="potion-multilingual-128M",
            matroyshka_dims=[32, 64, 128, 256],
            description="""
            A multilingual embedder. The details are a bit scant on how it's trained as
            there is no source code for it. However, it's likely a close architecture
            to the potion-retrieval-32M model, but trained on Common Crawl data.

            The 128M references the number of parameters in the embeddings:

            256 dimensions * 500,353 vocab.
            """,
            license="mit",
        ),
        ModelCard(
            owner="minishlab",
            repo="potion-retrieval-32M",
            matroyshka_dims=[32, 64, 128, 256, 512],
            description="""
            The token embeddings from a monolingual English 32M parameter model that was
            distilled from embeddings that were initialized from the multi-domain
            [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)

            The 32M references the number of parameters in the embeddings:

            512 dimension * 63,091 vocab.
            """,
            license="mit",
        ),
    ]

    # Start from a clean slate so artifacts from removed or renamed models
    # don't linger in the output folder.
    if models_path.exists():
        print(f"Removing the old models folder: {models_path}")
        shutil.rmtree(models_path)
    models_path.mkdir()

    for model_card in sentence_transformers_models:
        export_sentence_transformers(model_card)

    for model_card in model2vec_models:
        export_model2vec(model_card)
| |
|
| |
|
# Allow importing this module without triggering the export pipeline.
if __name__ == "__main__":
    main()
| |
|