import pickle
import numpy as np
import pandas as pd
import nltk
from nltk.stem import SnowballStemmer
nltk.download("punkt_tab")
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()
import os
import google.generativeai as genai
import json
from google.genai import Client, types
from datasets import load_dataset


def set_prompt(problem):
    prompt = """
# ROLE

You are a meticulous senior technical analyst and constraints scout. Your task is to read a short description of a technical problem, identify the distinct constraints it implies, and return them as a JSON object whose entries together cover the whole problem.

# OBJECTIVE

Find all the constraints in this technical problem, making sure each one is premised on the problem only.
Take different technical domains into account so that the whole problem is covered.
Output each constraint in a JSON object such as: {"title of constraint 1": "description 1", "title of constraint N": "description N"}

# INSTRUCTIONS & RULES

1. **JSON Output**: Your entire response MUST be a single JSON object. Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to scan the technical problem, find each constraint, and create a separate entry for it in the output JSON.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe each constraint's issues. Do not use single keywords. These descriptions should be based on the information in the technical problem.
4. **Infer Where Necessary**: The technical problem may not contain all details. Infer plausible information based on the context.

# JSON SCHEMA & EXAMPLE

{
  "Exposing Compute Resources": "The 6G network shall provide suitable APIs to allow authorized third parties and/or UEs to retrieve availability information about computational resources inside the Service Hosting Environment (SHE) and to utilize these computational resources for running workloads on demand.",
  "Providing AI Compute": "The 6G network shall be able to provide computing resources in the Service Hosting Environment for AI services and provide AI services to UEs.",
  ...
}

---
***NOW, BEGIN THE TASK.***

# TECHNICAL PROBLEM

""" + problem
    return prompt


def load_data():
    return load_dataset("heymenn/Technologies", split="train")
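
# Quick sanity check (assumption: each record carries the text fields used in
# stem() below, plus an "embeddings" vector stripped in get_technologies_by_id):
#
#   dataset = load_data()
#   print(dataset[0]["name"], dataset[0]["purpose"])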


def stem_text(stemmer, text):
    # SnowballStemmer.stem expects a single word, so tokenize first
    # (this is what the "punkt_tab" resource downloaded above is for).
    return " ".join(stemmer.stem(token) for token in nltk.word_tokenize(text))


def stem(data, data_type):
    stemmer = SnowballStemmer("english")
    processed_data = []
    if data_type == "technologies":
        # data is the technologies dataset: one record per technology.
        for index, t_item in enumerate(data):
            processed_data.append({
                "name": stem_text(stemmer, t_item["name"]),
                "purpose": stem_text(stemmer, t_item["purpose"]),
                "problem_types_solved": stem_text(stemmer, t_item["problem_types_solved"]),
                "advantages": stem_text(stemmer, t_item["advantages"]),
                "limitations": stem_text(stemmer, t_item["limitations"]),
                "domain_tags": stem_text(stemmer, t_item["domain_tags"]),
                "id": index
            })
    else:
        # data is a constraints dict mapping title -> description.
        for title in data:
            processed_data.append({
                "title": stem_text(stemmer, title),
                "description": stem_text(stemmer, data[title])
            })
    return processed_data
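
# Illustrative usage of stem() (the constraints dict below is a hypothetical
# example matching the {"title": "description"} shape the prompt requests):
#
#   stemmed_technologies = stem(load_data(), "technologies")
#   stemmed_constraints = stem(
#       {"Exposing Compute Resources": "The 6G network shall provide suitable APIs ..."},
#       "constraints",
#   )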


def get_technologies_by_id(technologies, dataset):
    result = []
    for tech_id in technologies:
        data = dataset[tech_id]
        # Drop the stored embedding vector; only the descriptive fields are needed.
        del data["embeddings"]
        result.append(data)
    return result
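
# Illustrative usage (hypothetical ids): retrieve the full dataset records for
# a handful of matched technologies, minus their embedding vectors.
#
#   dataset = load_data()
#   matches = get_technologies_by_id([0, 3, 7], dataset)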


def save_to_pickle(result_similarities):
    # Unique constraint titles become the matrix rows; the largest technology
    # id (id2) fixes the number of columns.
    constraint_titles = sorted(set(item['constraint']['title'] for item in result_similarities))
    max_id2 = max(item['id2'] for item in result_similarities)

    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
    col_labels = list(range(1, max_id2 + 1))

    num_rows = len(constraint_titles)
    num_cols = max_id2

    # Initialize with NaN so missing (constraint, technology) pairs stay visible.
    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)

    for item in result_similarities:
        row_idx = row_label_to_index[item['constraint']['title']]
        col_idx = item['id2'] - 1  # ids are 1-based, matrix columns 0-based
        similarity_value = item['similarity'].item()
        matrix[row_idx, col_idx] = similarity_value

    print(f"Successfully created matrix with shape: {matrix.shape}")
    print(f"Number of rows (unique constraints): {num_rows}")
    print(f"Number of columns (max id2): {num_cols}")
    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
    print(matrix[:5, :5])

    output_filename = "cosine_similarity_matrix_with_labels.pkl"
    data_to_save = {
        'matrix': matrix,
        'row_labels': constraint_titles,
        'col_labels': col_labels
    }

    with open(output_filename, 'wb') as f:
        pickle.dump(data_to_save, f)

    print(f"\nMatrix and labels saved to {output_filename}")
    return output_filename
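
# A minimal sketch of reading the pickle back, assuming the structure written
# by save_to_pickle above; pandas (imported at the top) gives labeled access
# to rows (constraint titles) and columns (technology ids).
def load_similarity_matrix(filename="cosine_similarity_matrix_with_labels.pkl"):
    with open(filename, "rb") as f:
        saved = pickle.load(f)
    return pd.DataFrame(saved["matrix"],
                        index=saved["row_labels"],
                        columns=saved["col_labels"])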


def set_gemini():
    gemini_api = os.getenv("GEMINI_API")
    client = Client(api_key=gemini_api)

    # Enable Google Search grounding so the model can look up technical context.
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )

    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )

    return client, config
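

# A minimal end-to-end sketch tying the pieces above together: build the
# prompt, call Gemini with search grounding, and parse the returned JSON
# object of constraints. The model name "gemini-2.0-flash" and the sample
# problem string are assumptions, not fixed by this script.
if __name__ == "__main__":
    client, config = set_gemini()
    problem = "UEs need on-demand access to compute resources at the network edge."  # hypothetical
    response = client.models.generate_content(
        model="gemini-2.0-flash",  # assumed model id; swap in whichever model you use
        contents=set_prompt(problem),
        config=config,
    )
    # The prompt asks for bare JSON; strip any markdown code fences before parsing.
    raw = response.text.strip().removeprefix("```json").removesuffix("```").strip()
    constraints = json.loads(raw)
    print(json.dumps(constraints, indent=2))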