| | |
| |
|
| | |
| | |
| |
|
| | import numpy as np |
| | from sentence_transformers import SentenceTransformer |
| | import pickle |
| | import pandas as pd |
| | import nltk |
| | from nltk.stem import * |
| | nltk.download("punkt_tab") |
| |
|
| |
|
# Load the sentence-embedding model once at module import; the first run
# may download the weights, so announce progress on stdout.
print("Loading SentenceTransformer model...")
_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(_MODEL_NAME)
print("Model loaded.")
| |
|
def load_technologies():
    """Read the technologies database spreadsheet into a pandas DataFrame."""
    return pd.read_excel('technologies_database.xlsx')
| |
|
def tech_to_dict(technologies):
    """Parse raw technology-description strings into structured dicts.

    Each raw entry is expected to be newline-separated text of the form:
    a header line, then five "name: value" field lines (title, purpose,
    key_components, advantages, limitations), then a footer line.

    An entry is processed only when "<title>" occurs at index <= 1 of the
    raw text or not at all — NOTE(review): this reproduces the original
    filter (`not tech.find("<title>") > 1`); confirm the intended predicate
    against the real data format.

    Returns a list of dicts keyed title/purpose/key_components/advantages/
    limitations, plus "id" = the entry's index in the input list.
    """
    def _field_value(line):
        # Everything after the first ": " separator. When the separator is
        # absent, find() returns -1 and this yields line[1:], matching the
        # original slicing behaviour.
        return line[line.find(": ") + 2:]

    tech_dict = []
    for index, tech in enumerate(technologies):
        if tech.find("<title>") > 1:
            continue
        lines = tech.split("\n")
        # Drop the header and footer lines that bracket the field lines.
        body = lines[1:-1]
        if len(body) < 5:
            # Malformed record: skip it instead of raising IndexError
            # (the original crashed on entries with too few lines).
            continue
        tech_dict.append({
            "title": _field_value(body[0]),
            "purpose": _field_value(body[1]),
            "key_components": _field_value(body[2]),
            "advantages": _field_value(body[3]),
            "limitations": _field_value(body[4]),
            "id": index,
        })
    return tech_dict
| |
|
def stem(data, data_type):
    """Stem the text fields of parsed technology records.

    Parameters
    ----------
    data : list[dict] when data_type == "technologies" (records produced by
        tech_to_dict); otherwise a mapping of title -> description.
    data_type : str — selects which of the two input shapes to process.

    Returns a list of dicts with the same keys, fields run through the
    English SnowballStemmer.

    NOTE(review): SnowballStemmer.stem targets single words; applied to a
    whole phrase it effectively normalises only the trailing token —
    confirm this is the intended behaviour.
    """
    stemmer = SnowballStemmer("english")
    if data_type == "technologies":
        return [
            {
                "title": stemmer.stem(item["title"]),
                "purpose": stemmer.stem(item["purpose"]),
                "key_components": stemmer.stem(item["key_components"]),
                "advantages": stemmer.stem(item["advantages"]),
                "limitations": stemmer.stem(item["limitations"]),
                "id": item["id"],
            }
            for item in data
        ]
    # `data` is a mapping of title -> description here. The stray debug
    # print of every item has been removed.
    return [
        {"title": stemmer.stem(title), "description": stemmer.stem(description)}
        for title, description in data.items()
    ]
| |
|
def preprocess_tech_data(_df):
    """Turn the raw technologies DataFrame into stemmed, filtered records.

    Parameters
    ----------
    _df : pandas.DataFrame or None — must contain a "description" column.

    Returns a 3-tuple:
      processed_tech_wt  – stemmed technology dicts,
      _keys              – the dict keys of a processed record,
      original_tech      – the unstemmed records (for display), aligned 1:1.

    On missing/empty input every element of the tuple is an empty list.
    """
    # Bug fix: these guards previously returned only two values, which
    # broke the three-way unpacking done by the caller
    # (`global_tech, keys, original_tech = preprocess_tech_data(df)`).
    if _df is None or "description" not in _df.columns:
        return [], [], []

    technologies_list = _df["description"].to_list()
    tech_dict_raw = tech_to_dict(technologies_list)

    # Keep only records whose key fields carry real content.
    tech_dict_filtered = [
        t for t in tech_dict_raw if (
            len(t.get("title", "")) >= 5 and
            len(t.get("advantages", "")) >= 5 and
            len(t.get("key_components", "")) >= 5
        )
    ]

    if not tech_dict_filtered:
        return [], [], []

    processed_tech_wt = stem(tech_dict_filtered, "technologies")

    for t_item_wt in processed_tech_wt:
        kc = t_item_wt.get("key_components")
        if isinstance(kc, str):
            # NOTE(review): sentences are re-joined with an empty separator,
            # which strips inter-sentence whitespace — confirm intended.
            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
        else:
            t_item_wt["key_components"] = ""

    # Keep the unstemmed originals aligned with the processed list.
    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]

    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
    return processed_tech_wt, _keys, original_tech_for_display
| |
|
| |
|
# Build the module-level corpus: load the spreadsheet, normalise it, then
# pull out the purpose strings that get embedded below.
df = load_technologies()
global_tech, keys, original_tech = preprocess_tech_data(df)
global_tech_purposes = [tech_entry["purpose"] for tech_entry in global_tech]
| |
|
| | |
# Embed every purpose string in a single batch; the progress bar gives
# feedback on long runs.
print("Encoding global_tech purposes into embeddings... This might take a while for 1000 elements.")
global_tech_embeddings = model.encode(
    global_tech_purposes,
    show_progress_bar=True,
)
print("Global tech embeddings created.")
| |
|
| | |
output_filename = 'global_tech_embeddings.pkl'

# Bundle the records together with their embeddings so a single pickle
# load restores everything the API needs.
data_to_save = {
    'global_tech': global_tech,
    'global_tech_embeddings': global_tech_embeddings,
}

print(f"Saving embeddings and global_tech data to {output_filename}...")
with open(output_filename, 'wb') as f:
    pickle.dump(data_to_save, f)
print(f"Data saved successfully to {output_filename}.")

# Remind the operator how to consume the artifact from the API process.
print(f"\nTo load this file later in your API, use: \n"
      f"with open('{output_filename}', 'rb') as f:\n"
      f"    loaded_data = pickle.load(f)\n"
      f"global_tech = loaded_data['global_tech']\n"
      f"global_tech_embeddings = loaded_data['global_tech_embeddings']\n")
| |
|
| |
|