import os
import time
from functools import wraps
from typing import Dict, Union

import pandas as pd
import spacy
import yaml
|
|
|
|
def read_yaml_config(file_path: str) -> Dict:
    """
    Load a YAML configuration file and return its contents as a dictionary.

    Args:
        file_path (str): Path to the YAML configuration file.

    Returns:
        Dict: The parsed configuration.
    """
    with open(file_path, 'r') as config_file:
        return yaml.safe_load(config_file)
|
|
|
|
|
|
def validate_and_create_subfolders(
    model_name: str,
    parent_subfolder: str = "/Users/luis.morales/Desktop/arxiv-paper-recommender/models"
) -> None:
    """
    Ensure the standard model directory layout exists.

    Creates <parent_subfolder>/<model_name>/{data, dictionaries,
    similarities_matrix, tdidf}, including any missing parents. Unlike the
    previous version, missing subfolders are also created when the model
    folder itself already exists (before, the whole step was skipped).

    Args:
        model_name (str): Name of the model; used as the top-level folder name.
        parent_subfolder (str): Root directory holding all model folders.
            NOTE(review): the default is a machine-specific absolute path —
            callers should pass an explicit value.
    """
    model_subfolders = ["data", "dictionaries", "similarities_matrix", "tdidf"]

    model_root = os.path.join(parent_subfolder, model_name)
    for msubfolder in model_subfolders:
        # exist_ok avoids the exists()/makedirs() race and makes the call
        # idempotent; it also creates model_root itself when needed.
        os.makedirs(os.path.join(model_root, msubfolder), exist_ok=True)
| |
|
|
|
|
|
|
|
|
def execution_time(func):
    """
    Decorator that measures a function's wall-clock execution time and prints it.

    Uses time.perf_counter() (monotonic, high resolution) instead of
    time.time(), which can jump with system clock adjustments, and
    functools.wraps so the wrapped function keeps its __name__ and __doc__.

    Args:
        func: The function to wrap.

    Returns:
        The wrapped function; it returns whatever *func* returns.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed_seconds = time.perf_counter() - start_time
        print(f"Function '{func.__name__}' executed in {elapsed_seconds:.4f} seconds.")
        return result
    return wrapper
|
|
|
|
def cleanData(doc: Union[pd.Series, str], nlp=None) -> str:
    """
    Clean a document: lowercase, drop stopwords and punctuation, lemmatize.

    Fixes over the previous version:
    - The spaCy model is no longer loaded as a default argument (which ran
      spacy.load() at module import time even if this function was never
      called). It is now loaded lazily on first use and cached.
    - Removed a dead statement that built a list of lowercased token strings
      and immediately discarded it (the input is already lowercased).

    Args:
        doc: The document to clean. NOTE(review): despite the annotation,
            the body calls doc.lower() and feeds the result to nlp(), so a
            plain str is what actually works here — confirm callers.
        nlp: Optional pre-loaded spaCy Language object. When None, a shared
            'en_core_web_sm' pipeline is loaded once and reused.

    Returns:
        str: The cleaned, lemmatized document as a single space-joined string.
    """
    if nlp is None:
        # Lazy-load and cache the default pipeline on the function object
        # so repeated calls don't reload the model.
        if not hasattr(cleanData, "_default_nlp"):
            cleanData._default_nlp = spacy.load('en_core_web_sm')
        nlp = cleanData._default_nlp

    parsed = nlp(doc.lower())
    # Keep tokens that are neither stopwords nor punctuation.
    kept = [token for token in parsed if not token.is_stop and not token.is_punct]
    return " ".join(token.lemma_ for token in kept)
|
|
|
|
|
|