| import openai |
| import pandas as pd |
| import numpy as np |
| |
| from transformers import GPT2TokenizerFast |
| from tqdm.auto import tqdm |
| import os |
|
|
|
|
|
|
# Register tqdm's pandas integration so that DataFrame.progress_apply is
# available for the token-counting and vector-parsing steps further below.
tqdm.pandas()


import spacy


import en_core_web_sm


# Small English spaCy pipeline; its static word vectors are used below to
# build OpenAI-style embedding responses without calling the OpenAI API.
nlp = en_core_web_sm.load()
|
|
def custom_embedding(text, model_name="text-embedding-ada-002"):
    """Build an OpenAI-style embedding response for *text* from spaCy vectors.

    The text is tokenized with the module-level ``nlp`` pipeline and the
    embedding is the mean of the per-token vectors.

    Args:
        text: Input string to embed.
        model_name: Label echoed back in the response's ``model`` field
            (mimics the OpenAI API shape; no API call is made).

    Returns:
        A dict shaped like an OpenAI embeddings response.  When no token in
        *text* carries a vector, ``data[0]["embedding"]`` is ``None``.  The
        previous version returned a bare ``None`` in that case, which made
        callers that index ``response["data"][0]["embedding"]`` raise a
        TypeError; always returning the dict keeps them safe and matches the
        batch variant's per-text ``None`` convention.
    """
    doc = nlp(text)

    # Only tokens that actually carry a vector contribute (OOV tokens do not).
    word_embeddings = [token.vector for token in doc if token.has_vector]

    # Mean-pool the token vectors into a single text-level embedding.
    text_embedding = (
        np.mean(word_embeddings, axis=0).tolist() if word_embeddings else None
    )

    # Whitespace-split count is only an approximation of real token usage.
    n_tokens = len(text.split())
    return {
        "data": [
            {
                "embedding": text_embedding,
                "index": 0,
                "object": "embedding",
            }
        ],
        "model": model_name,
        "object": "list",
        "usage": {
            "prompt_tokens": n_tokens,
            "total_tokens": n_tokens,
        },
    }
|
|
| |
| text = "Rome" |
| response = custom_embedding(text) |
|
|
| if response["data"][0]["embedding"] is not None: |
| print(f"Custom Embedding for '{text}': {response['data'][0]['embedding']}") |
| else: |
| print(f"No embeddings found for words in '{text}'.") |
|
|
| print(response) |
|
|
|
|
| |
| |
|
|
| |
| |
|
|
# NOTE(review): the pipeline was already loaded above; this reload is
# redundant (kept, notebook-style cell independence).
nlp = en_core_web_sm.load()
|
|
def custom_embedding(text_list, model_name="text-embedding-ada-002"):
    """Build one OpenAI-style embedding response for a batch of texts.

    Redefines (shadows) the single-text variant above: each entry of
    *text_list* is mean-pooled over its spaCy token vectors.

    Args:
        text_list: Iterable of input strings.
        model_name: Label echoed back in the response's ``model`` field.

    Returns:
        An OpenAI-shaped response dict with one ``data`` entry per input
        text.  Texts with no in-vocabulary tokens get ``embedding: None``.
    """
    embeddings = []
    # nlp.pipe streams the texts through the pipeline as a batch, which is
    # faster than calling nlp() once per string; docs come back in order.
    for doc in nlp.pipe(text_list):
        word_embeddings = [token.vector for token in doc if token.has_vector]
        if word_embeddings:
            embeddings.append(np.mean(word_embeddings, axis=0).tolist())
        else:
            embeddings.append(None)

    # Whitespace token count, summed once instead of once per usage field.
    n_tokens = sum(len(text.split()) for text in text_list)
    return {
        "data": [
            {"embedding": emb, "index": idx, "object": "embedding"}
            for idx, emb in enumerate(embeddings)
        ],
        "model": model_name,
        "object": "list",
        "usage": {
            "prompt_tokens": n_tokens,
            "total_tokens": n_tokens,
        },
    }
|
|
| |
# Demo: batch-embed four sentences and inspect the response.
text = ["She is running", "Fitness is good", "I am hungry", "Basketball is healthy"]
response = custom_embedding(text)


for idx, embedding in enumerate(response["data"]):
    if embedding["embedding"] is not None:
        print(f"Custom Embedding for '{text[idx]}': {embedding['embedding']}")
    else:
        print(f"No embeddings found for words in '{text[idx]}'.")


print(response)


emb1 = response['data'][0]['embedding']
emb2 = response['data'][1]['embedding']
emb3 = response['data'][2]['embedding']
emb4 = response['data'][3]['embedding']


# NOTE(review): these dot products are computed but never stored or printed
# (notebook-style cells).  They are raw dot products, not cosine similarity
# (the vectors are not normalized), and would raise if any embedding is None.
np.dot(emb1, emb2)
np.dot(emb2, emb4)
|
|
# Load the dataset and build a single "combined" text field per row from the
# metadata columns; this is the text that gets embedded below.
df = pd.read_csv('Dronealexa.csv')
df = df.dropna()
df.info()
df.head()
df['combined'] = "Title: " + df['Title'].str.strip() + "; URL: " + df['URL'].str.strip() + "; Publication Year: " + df['Publication Year'].astype(str).str.strip() + "; Abstract: " + df['Abstract'].str.strip()
df.head()


# GPT-2 tokenizer is used only to count tokens per row.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")


# Keep rows under 8000 tokens (presumably the embedding model's context
# limit — TODO confirm).
df['n_tokens'] = df.combined.progress_apply(lambda x: len(tokenizer.encode(x)))
df = df[df.n_tokens < 8000]
# NOTE(review): dropna() and this filter leave gaps in the integer index
# without a reset_index; later label-based writes (df.loc[j, ...]) must
# account for that or rows will be misaligned.
df.info()
df.head()
|
|
|
|
| |
| |
|
|
| |
| |
|
|
# NOTE(review): third load of the same pipeline; redundant but harmless.
nlp = en_core_web_sm.load()
|
|
def get_embeddings(text, model):
    """Embed *text* with a spaCy *model*, returning an OpenAI-shaped response.

    Args:
        text: Input string.
        model: A loaded spaCy pipeline — callable on a string and exposing
            ``meta["name"]``.

    Returns:
        An OpenAI-style response dict.  ``data[0]["embedding"]`` is ``None``
        when no token has a vector.  The function always returns a dict: the
        previous version returned a bare ``None`` in that case, which crashed
        callers that index ``response["data"][0]["embedding"]`` directly.
    """
    doc = model(text)

    # Only tokens with vectors contribute to the mean-pooled embedding.
    word_embeddings = [token.vector for token in doc if token.has_vector]

    embedding = (
        np.mean(word_embeddings, axis=0).tolist() if word_embeddings else None
    )

    return {
        "data": [
            {
                "embedding": embedding,
                "index": 0,
                "object": "embedding",
            }
        ],
        "model": model.meta["name"],
        "object": "list",
        # prompt_tokens is a whitespace-split count while total_tokens counts
        # spaCy tokens — kept as in the original; the two can disagree.
        "usage": {
            "prompt_tokens": len(text.split()),
            "total_tokens": len(doc),
        },
    }
|
|
| |
| input_text = "Your input text goes here" |
| custom_model = nlp |
|
|
| |
| text_to_process = input_text |
|
|
| response = get_embeddings(text_to_process, custom_model) |
|
|
| if response["data"][0]["embedding"] is not None: |
| print(f"Custom Embedding for '{text_to_process}': {response['data'][0]['embedding']}") |
| else: |
| print(f"No embeddings found for words in '{text_to_process}'.") |
|
|
| print(response) |
|
|
from tqdm import tqdm
import ast


batch_size = 2000
model_name = 'text-embedding-ada-002'


# Embed every combined row.  Vectors are collected in document order and
# assigned as a whole column at the end: the original wrote
# df.loc[j, 'ada_vector'] with positional offsets used as index *labels*,
# but dropna() and the token filter left gaps in the index, so those writes
# could create phantom rows and misalign vectors with their rows.
all_vectors = []
texts = list(df.combined)
for i in tqdm(range(0, len(texts), batch_size)):
    # end of this batch (batching kept from the original structure)
    i_end = min(i + batch_size, len(texts))
    for text in texts[i:i_end]:
        response = get_embeddings(text, nlp)
        if response and response["data"][0]["embedding"] is not None:
            all_vectors.append(response["data"][0]["embedding"])
        else:
            # no in-vocabulary tokens for this row
            all_vectors.append(None)

# Store the Python-list repr (or 'None') so the CSV round-trip below stays
# parseable.
df['ada_vector'] = [str(v) for v in all_vectors]

df.head()
df.info()
# Persist *before* converting to numpy arrays: str(ndarray) has no commas
# and cannot be parsed back, whereas the list repr written here can.
df.to_csv('embeddings_chatbot.csv', index=False)
df = pd.read_csv('embeddings_chatbot.csv')
# ast.literal_eval instead of eval: file contents should not be executed as
# arbitrary code.  'None' parses to None, which np.array wraps harmlessly.
df['ada_vector'] = df.ada_vector.progress_apply(ast.literal_eval).progress_apply(np.array)
|
|
# Interactive search: embed the user's query with the same spaCy pipeline
# used for the corpus.
user_query = input("Enter query - ")


query_response = get_embeddings(user_query, nlp)


if query_response["data"][0]["embedding"] is not None:
    print(f"Embedding for '{user_query}': {query_response['data'][0]['embedding']}")
else:
    print(f"No embeddings found for words in '{user_query}'.")

# Reuse the response computed above: the original called get_embeddings a
# second time with identical inputs (custom_model is the same nlp object),
# doing the same work twice for the same result.
searchvector = query_response["data"][0]["embedding"]
|
|
|
|
|
|
from sklearn.metrics.pairwise import cosine_similarity


# Coerce stored vectors to numpy arrays; anything else passes through.
# NOTE(review): immediately after the read_csv above, ada_vector holds
# *strings*, which this lambda leaves unchanged — valid_rows below would
# then be all-False and every similarity NaN.  Verify the vectors are
# parsed back to lists/arrays before this point.
df['ada_vector'] = df['ada_vector'].apply(lambda x: np.array(x) if isinstance(x, (list, np.ndarray)) else x)


# Rows that actually carry a vector (embedding may be None or unparsed).
valid_rows = df['ada_vector'].apply(lambda x: isinstance(x, np.ndarray))


# Cosine similarity between each document vector and the query vector.
df.loc[valid_rows, 'similarities'] = df.loc[valid_rows, 'ada_vector'].apply(
    lambda x: cosine_similarity([x], [searchvector])[0][0]
)
|
|
| |
| |
| |
| |
| |
|
|
df.head()
# NOTE(review): this first sort's result is discarded (notebook-style cell);
# only the next line's result is kept.
df.sort_values('similarities', ascending = False)
# Top-3 documents most similar to the query.
result = df.sort_values('similarities', ascending = False).head(3)


result.head()


# Context passages handed to the prompt builder below.
xc = list(result.combined)
|
|
def construct_prompt(query, xc):
    """Assemble a grounded Q&A prompt from retrieved context passages.

    Args:
        query: The user's question.
        xc: Context passages; only the first three are used (matching the
            top-3 retrieval above).  Unlike the original hard-coded
            ``range(3)``, fewer than three passages no longer raises
            IndexError.

    Returns:
        The full prompt string: instruction header, the context passages
        (newline-terminated), then the question in "Q: ... A:" form.
    """
    context = ''
    for passage in xc[:3]:
        context += passage + "\n"
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    header += context + "\n\n Q: " + query + "\n A:"
    return header
|
|
|
|
|
|
from transformers import pipeline


# Default summarization pipeline (downloads a model on first use).
summarizer = pipeline("summarization")
# NOTE(review): Fresult (the constructed Q&A prompt) is built but never used —
# the summarizer below runs on the raw joined passages instead.  Confirm
# which input was intended.
Fresult = construct_prompt(user_query, xc)
summarizer("\n".join(xc), max_length=130, min_length=30, do_sample=False)