| | --- |
| | language: |
| | - en |
| | --- |
| | |
| | This model was trained with [Sparsembed](https://github.com/raphaelsty/sparsembed). You can find details on how to use it in the [Sparsembed](https://github.com/raphaelsty/sparsembed) repository. |
| |
|
| | ```sh |
| | pip install sparsembed |
| | ``` |
| |
|
| | ```python |
| | from sparsembed import model, retrieve |
| | from transformers import AutoModelForMaskedLM, AutoTokenizer |
| | |
| | device = "cuda" # or "cpu" |
| | |
| | batch_size = 10 |
| | |
| | # List documents to index: |
| | documents = [ |
| | {'id': 0, |
| | 'title': 'Paris', |
| | 'url': 'https://en.wikipedia.org/wiki/Paris', |
| | 'text': 'Paris is the capital and most populous city of France.'}, |
| | {'id': 1, |
| | 'title': 'Paris', |
| | 'url': 'https://en.wikipedia.org/wiki/Paris', |
| | 'text': "Since the 17th century, Paris has been one of Europe's major centres of science, and arts."}, |
| | {'id': 2, |
| | 'title': 'Paris', |
| | 'url': 'https://en.wikipedia.org/wiki/Paris', |
| | 'text': 'The City of Paris is the centre and seat of government of the region and province of Île-de-France.' |
| | }] |
| | |
| | model = model.Splade( |
| | model=AutoModelForMaskedLM.from_pretrained("raphaelsty/distilbert-splade").to(device), |
| | tokenizer=AutoTokenizer.from_pretrained("raphaelsty/distilbert-splade"), |
| | device=device |
| | ) |
| | |
| | retriever = retrieve.SpladeRetriever( |
| | key="id", # Key identifier of each document. |
| | on=["title", "text"], # Fields to search. |
| | model=model # Splade model. |
| | ) |
| | |
| | retriever = retriever.add( |
| | documents=documents, |
| | batch_size=batch_size, |
| | k_tokens=256, # Number of activated tokens. |
| | ) |
| | |
| | retriever( |
| | ["paris", "Toulouse"], # Queries |
| | k_tokens=20, # Maximum number of activated tokens. |
| | k=100, # Number of documents to retrieve. |
| | batch_size=batch_size |
| | ) |
| | ``` |