| | |
| |
|
| | |
| | |
| |
|
| | import numpy as np |
| | from sentence_transformers import SentenceTransformer |
| | import pickle |
| | import pandas as pd |
| | import nltk |
| | from nltk.stem import * |
| | nltk.download("punkt_tab") |
| |
|
| |
|
# Load the sentence-embedding model once at module import; the first run
# may download the weights, so announce progress on stdout.
print("Loading SentenceTransformer model...")
_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(_MODEL_NAME)
print("Model loaded.")
| |
|
def load_technologies():
    """Read the technologies database spreadsheet into a pandas DataFrame."""
    return pd.read_excel('technologies_database.xlsx')
| |
|
def tech_to_dict(technologies):
    """Parse raw technology-description strings into structured dicts.

    Each raw entry is expected to be newline-separated text of the form:
    a header line, then five "name: value" field lines (title, purpose,
    key_components, advantages, limitations), then a footer line.

    An entry is processed only when "<title>" occurs at index <= 1 of the
    raw text or not at all — NOTE(review): this reproduces the original
    filter (`not tech.find("<title>") > 1`); confirm the intended predicate
    against the real data format.

    Returns a list of dicts keyed title/purpose/key_components/advantages/
    limitations, plus "id" = the entry's index in the input list.
    """
    def _field_value(line):
        # Everything after the first ": " separator. When the separator is
        # absent, find() returns -1 and this yields line[1:], matching the
        # original slicing behaviour.
        return line[line.find(": ") + 2:]

    tech_dict = []
    for index, tech in enumerate(technologies):
        if tech.find("<title>") > 1:
            continue
        lines = tech.split("\n")
        # Drop the header and footer lines that bracket the field lines.
        body = lines[1:-1]
        if len(body) < 5:
            # Malformed record: skip it instead of raising IndexError
            # (the original crashed on entries with too few lines).
            continue
        tech_dict.append({
            "title": _field_value(body[0]),
            "purpose": _field_value(body[1]),
            "key_components": _field_value(body[2]),
            "advantages": _field_value(body[3]),
            "limitations": _field_value(body[4]),
            "id": index,
        })
    return tech_dict
| |
|
def stem(data, data_type):
    """Stem the text fields of parsed technology records.

    Parameters
    ----------
    data : list[dict] when data_type == "technologies" (records produced by
        tech_to_dict); otherwise a mapping of title -> description.
    data_type : str — selects which of the two input shapes to process.

    Returns a list of dicts with the same keys, fields run through the
    English SnowballStemmer.

    NOTE(review): SnowballStemmer.stem targets single words; applied to a
    whole phrase it effectively normalises only the trailing token —
    confirm this is the intended behaviour.
    """
    stemmer = SnowballStemmer("english")
    if data_type == "technologies":
        return [
            {
                "title": stemmer.stem(item["title"]),
                "purpose": stemmer.stem(item["purpose"]),
                "key_components": stemmer.stem(item["key_components"]),
                "advantages": stemmer.stem(item["advantages"]),
                "limitations": stemmer.stem(item["limitations"]),
                "id": item["id"],
            }
            for item in data
        ]
    # `data` is a mapping of title -> description here. The stray debug
    # print of every item has been removed.
    return [
        {"title": stemmer.stem(title), "description": stemmer.stem(description)}
        for title, description in data.items()
    ]
| |
|
def preprocess_tech_data(_df):
    """Turn the raw technologies DataFrame into stemmed, filtered records.

    Parameters
    ----------
    _df : pandas.DataFrame or None — must contain a "description" column.

    Returns a 3-tuple:
      processed_tech_wt  – stemmed technology dicts,
      _keys              – the dict keys of a processed record,
      original_tech      – the unstemmed records (for display), aligned 1:1.

    On missing/empty input every element of the tuple is an empty list.
    """
    # Bug fix: these guards previously returned only two values, which
    # broke the three-way unpacking done by the caller
    # (`global_tech, keys, original_tech = preprocess_tech_data(df)`).
    if _df is None or "description" not in _df.columns:
        return [], [], []

    technologies_list = _df["description"].to_list()
    tech_dict_raw = tech_to_dict(technologies_list)

    # Keep only records whose key fields carry real content.
    tech_dict_filtered = [
        t for t in tech_dict_raw if (
            len(t.get("title", "")) >= 5 and
            len(t.get("advantages", "")) >= 5 and
            len(t.get("key_components", "")) >= 5
        )
    ]

    if not tech_dict_filtered:
        return [], [], []

    processed_tech_wt = stem(tech_dict_filtered, "technologies")

    for t_item_wt in processed_tech_wt:
        kc = t_item_wt.get("key_components")
        if isinstance(kc, str):
            # NOTE(review): sentences are re-joined with an empty separator,
            # which strips inter-sentence whitespace — confirm intended.
            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
        else:
            t_item_wt["key_components"] = ""

    # Keep the unstemmed originals aligned with the processed list.
    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]

    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
    return processed_tech_wt, _keys, original_tech_for_display
| |
|
| |
|
# Build the module-level corpus: load the spreadsheet, normalise it, then
# pull out the purpose strings that get embedded below.
df = load_technologies()
global_tech, keys, original_tech = preprocess_tech_data(df)
global_tech_purposes = [tech_entry["purpose"] for tech_entry in global_tech]
| |
|
| | |
# Embed every purpose string in a single batch; the progress bar gives
# feedback on long runs.
print("Encoding global_tech purposes into embeddings... This might take a while for 1000 elements.")
global_tech_embeddings = model.encode(
    global_tech_purposes,
    show_progress_bar=True,
)
print("Global tech embeddings created.")
| |
|
| | |
output_filename = 'global_tech_embeddings.pkl'

# Bundle the records together with their embeddings so a single pickle
# load restores everything the API needs.
data_to_save = {
    'global_tech': global_tech,
    'global_tech_embeddings': global_tech_embeddings,
}

print(f"Saving embeddings and global_tech data to {output_filename}...")
with open(output_filename, 'wb') as f:
    pickle.dump(data_to_save, f)
print(f"Data saved successfully to {output_filename}.")

# Remind the operator how to consume the artifact from the API process.
print(f"\nTo load this file later in your API, use: \n"
      f"with open('{output_filename}', 'rb') as f:\n"
      f"    loaded_data = pickle.load(f)\n"
      f"global_tech = loaded_data['global_tech']\n"
      f"global_tech_embeddings = loaded_data['global_tech_embeddings']\n")
| |
|
| |
|