---
language:
- pt
- en
license: mit
base_model:
- google/bert_uncased_L-4_H-256_A-4
pipeline_tag: text-ranking
---

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Identifier of the pretrained reranker checkpoint; the tokenizer and the
# two-label sequence classifier are both loaded from it.
model_id = "cnmoro/BertMini-Reranker-EnPt"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Each query/document pair is rendered into a single string with this
# template before it is tokenized.
template = "Query: {query}\nSentence: {document}"

def rank(query, documents, normalize_scores=True):
    """Score each document against the query and return them best-first.

    Every document is paired with the query through the module-level
    ``template``; the pairs are tokenized as one padded batch (truncated to
    512 tokens) and run through the two-label classifier ``model``. A
    document's score is the softmax probability of label 1.

    Args:
        query: The query text.
        documents: Iterable of candidate document strings. It is materialized
            into a list, so one-shot iterators are accepted.
        normalize_scores: When true (the default), divide every score by the
            sum of all scores so the returned scores add up to 1.

    Returns:
        A list of ``(document, score)`` tuples sorted by score, highest
        first. An empty list when ``documents`` is empty.
    """
    # The documents are walked twice (template rendering, then pairing with
    # their scores), so a generator has to be materialized up front.
    documents = list(documents)
    if not documents:
        # Nothing to rank: skip the tokenizer and the model entirely.
        return []

    texts = [template.format(query=query, document=document) for document in documents]

    inputs = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        probabilities = torch.softmax(logits, dim=1)

    # With two labels, P(label 1) equals the winning probability when label 1
    # is predicted and one minus it when label 0 is predicted, so column 1 is
    # the score in both cases.
    scores = probabilities[:, 1].tolist()

    # sorted() is stable, so documents with equal scores keep their input order.
    ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)

    if normalize_scores:
        total_score = sum(score for _, score in ranked)
        # If every probability underflowed to zero, leave the scores as they
        # are instead of dividing by zero.
        if total_score > 0:
            ranked = [(document, score / total_score) for document, score in ranked]

    return ranked

# Sample 1: a Portuguese query ranked against six Portuguese passages.
query = "O que é o Pantanal?"
documents = [
    "É um dos ecossistemas mais ricos em biodiversidade do mundo, abrigando uma grande variedade de espécies animais e vegetais.",
    "Sua beleza natural, com rios e lagos interligados, atrai turistas de todo o mundo.",
    "O Pantanal sofre com impactos ambientais, como a exploração mineral e o desmatamento.",
    "O Pantanal é uma extensa planície alagável localizada na América do Sul, principalmente no Brasil, mas também em partes da Bolívia e Paraguai.",
    "É um local com importância histórica e cultural para as populações locais.",
    "O Pantanal é um importante habitat para diversas espécies de animais, inclusive aves migratórias.",
]
rank(query=query, documents=documents)
# Returned value, best match first (scores are normalized by default):
# [('O Pantanal é uma extensa planície alagável localizada na América do Sul, principalmente no Brasil, mas também em partes da Bolívia e Paraguai.',
# 0.36703487634136817),
# ('O Pantanal é um importante habitat para diversas espécies de animais, inclusive aves migratórias.',
# 0.36591911362645174),
# ('O Pantanal sofre com impactos ambientais, como a exploração mineral e o desmatamento.',
# 0.13708830048931145),
# ('É um local com importância histórica e cultural para as populações locais.',
# 0.0718928987255767),
# ('Sua beleza natural, com rios e lagos interligados, atrai turistas de todo o mundo.',
# 0.02968024567026795),
# ('É um dos ecossistemas mais ricos em biodiversidade do mundo, abrigando uma grande variedade de espécies animais e vegetais.',
# 0.02838456514702401)]

# Sample 2: an English query ranked against five English passages.
query = "What is the speed of light?"
documents = [
    "Isaac Newton's laws of motion and gravity laid the groundwork for classical mechanics.",
    "The theory of relativity, proposed by Albert Einstein, has revolutionized our understanding of space, time, and gravity.",
    "The Earth orbits the Sun at an average distance of about 93 million miles, taking roughly 365.25 days to complete one revolution.",
    "The speed of light in a vacuum is approximately 299,792 kilometers per second (km/s), or about 186,282 miles per second.",
    "Light can be described as both a wave and a particle, a concept known as wave-particle duality.",
]
rank(query=query, documents=documents)
# Returned value, best match first (scores are normalized by default):
# [('The speed of light in a vacuum is approximately 299,792 kilometers per second (km/s), or about 186,282 miles per second.',
# 0.33902196713184685),
# ("Isaac Newton's laws of motion and gravity laid the groundwork for classical mechanics.",
# 0.2309855191720416),
# ('The Earth orbits the Sun at an average distance of about 93 million miles, taking roughly 365.25 days to complete one revolution.',
# 0.20293087063400417),
# ('Light can be described as both a wave and a particle, a concept known as wave-particle duality.',
# 0.188980879354878),
# ('The theory of relativity, proposed by Albert Einstein, has revolutionized our understanding of space, time, and gravity.',
# 0.03808076370722937)]
```