| """compareVec2VecWithAda.ipynb |
| |
| Automatically generated by Colaboratory. |
| |
| Original file is located at |
| https://colab.research.google.com/drive/1jPaNXdO0_oW6VczlWfm5RPUVpMtVQD9c |
| """ |
|
|
import pandas as pd
import numpy as np
import openai
import tensorflow as tf  # required by the custom loss below
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import load_model
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
|
|
# Load the MPNet sentence encoder; the checkpoint needs the
# 'sentence-transformers/' namespace to resolve on the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')
|
|
def cosine_similarity_loss(y_true, y_pred):
    # Negative mean of the elementwise product of L2-normalized vectors.
    # Defined here only so Keras can deserialize the model's custom loss.
    y_true = tf.nn.l2_normalize(y_true, axis=-1)
    y_pred = tf.nn.l2_normalize(y_pred, axis=-1)
    return -tf.reduce_mean(y_true * y_pred, axis=-1)
|
|
|
|
def mean_pooling(model_output, attention_mask):
    # Average the token embeddings, masking out padding so padded positions
    # do not dilute the sentence vector.
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
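
# Minimal sanity check for mean_pooling (illustrative values, not from the
# original notebook): the masked-out third position must not affect the mean.
# tokens = torch.tensor([[[1.0, 1.0], [3.0, 3.0], [9.0, 9.0]]])
# mask = torch.tensor([[1, 1, 0]])
# mean_pooling((tokens,), mask)  # -> tensor([[2., 2.]])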
|
|
# The custom loss must be passed via custom_objects, or Keras cannot
# deserialize the checkpoint.
loaded_model = load_model('mpnet2adaE75V4.h5', custom_objects={'cosine_similarity_loss': cosine_similarity_loss})
|
|
openai.api_key = "insert API key here"
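
# A safer alternative is to read the key from the environment instead of
# hardcoding it (a sketch; assumes an OPENAI_API_KEY variable is exported):
# import os
# openai.api_key = os.environ["OPENAI_API_KEY"]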
|
|
df2 = pd.read_csv('Actual_Embeddings.csv')
|
|
# The stored ada-002 embeddings are string-serialized lists; parse them back
# into numpy arrays.
df2['Actual_Embeddings'] = df2['Actual_Embeddings'].apply(eval).apply(np.array)
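
# Note: eval executes arbitrary expressions from the CSV. For plain list
# literals, ast.literal_eval is a safer drop-in (an alternative, not the
# notebook's original code):
# import ast
# df2['Actual_Embeddings'] = df2['Actual_Embeddings'].apply(ast.literal_eval).apply(np.array)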
|
|
|
|
def get_top_5_texts(query):
    # Embed the query with MPNet.
    encoded_input = tokenizer(query, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        model_output = model(**encoded_input)

    mpnetEmbeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # L2-normalize, move to numpy, and translate into ada-002 space with the
    # trained vec2vec model.
    mpnetEmbeddings = F.normalize(mpnetEmbeddings, p=2, dim=1)
    mpnetEmbeddings = mpnetEmbeddings.detach().cpu().numpy()
    mpnetEmbeddings = np.reshape(mpnetEmbeddings, (1, -1))
    query_embedding = loaded_model.predict(mpnetEmbeddings)

    similarities = [cosine_similarity(query_embedding.reshape(1, -1), emb.reshape(1, -1))[0][0] for emb in df2['Actual_Embeddings']]

    print("Converted MPNet Embedding Results:")
    top_5_idx = np.argsort(similarities)[-5:][::-1]
    for i, idx in enumerate(top_5_idx, 1):
        print(f'Text {i}')
        print(df2['combined'].iloc[idx])
        print("\n")

    # Repeat the search with a true ada-002 embedding for comparison
    # (legacy pre-1.0 openai API).
    response = openai.Embedding.create(input=query, model="text-embedding-ada-002")
    query_embedding = np.array(response['data'][0]['embedding'])
    similarities2 = [cosine_similarity(query_embedding.reshape(1, -1), emb.reshape(1, -1))[0][0] for emb in df2['Actual_Embeddings']]

    print("OpenAI Embedding Results:")
    top_5_idx2 = np.argsort(similarities2)[-5:][::-1]
    for i, idx in enumerate(top_5_idx2, 1):
        print(f'Text {i}')
        print(df2['combined'].iloc[idx])
        print("\n")
|
|
# Simple interactive loop; interrupt with Ctrl+C to quit.
while True:
    query = input("Enter your query: ")
    get_top_5_texts(query)