from collections import Counter
from typing import Dict, List, Optional, Tuple

import pandas as pd
import torch

from datasets import load_dataset, load_from_disk
from sklearn.model_selection import train_test_split

import src.data_utils.dataset_params as dataset_params

from src.data_utils.config import DatasetConfig, TextProcessorConfig
from src.data_utils.text_processor import TextProcessor


class DatasetGenerator:
    """
    Main dataset generator class.

    Provides methods to load raw data, build a vocabulary, and convert
    text datasets into tensors suitable for deep learning models.

    Args:
        dataset_name: Name of the dataset from the DatasetName enum
        config: Configuration object with preprocessing parameters
        device: Torch device to place tensors on; defaults to CUDA when
            available, otherwise CPU
    """

    def __init__(
        self,
        dataset_name: dataset_params.DatasetName,
        config: Optional[DatasetConfig] = None,
        device: Optional[torch.device] = None,
    ):
        self.dataset_params = dataset_params.get_dataset_params_by_name(dataset_name=dataset_name)
        # Default to a fresh config per instance; a DatasetConfig() default
        # argument would be evaluated once at definition time and shared
        # across all generators.
        self.config = config if config is not None else DatasetConfig()
        self.device = device if device is not None else torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        self.text_processor = TextProcessor(
            vocab=None,  # filled in later by build_vocabulary
            config=TextProcessorConfig(
                max_seq_len=self.config.max_seq_len,
                lowercase=self.config.lowercase,
                remove_punct=self.config.remove_punct,
                pad_token=self.config.pad_token,
                unk_token=self.config.unk_token,
            ),
        )
        self.vocab = None
        self.id2word = None
        self.embedding_layer = None  # placeholder; not populated by this class

    def load_raw_data(self) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Load the raw dataset from local disk or the Hugging Face Hub.

        Returns:
            Tuple of (train_df, val_df, test_df) DataFrames
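
        Example (illustrative; ``generator`` is an already-constructed
        DatasetGenerator):

            >>> train_df, val_df, test_df = generator.load_raw_data()
            >>> # val_df and test_df come from a stratified 50/50 split of
            >>> # the source test split, each subsampled to the configured size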
| | """ |
| | if self.config.load_from_disk: |
| | dataset = load_from_disk(f"{self.config.path_to_data}/{self.dataset_params.local_path}") |
| | else: |
| | dataset = load_dataset(self.dataset_params.hugging_face_name) |
| | train_df = pd.DataFrame(dataset["train"]) |
| | test_df = pd.DataFrame(dataset["test"]) |
| | val_df, test_df = train_test_split( |
| | test_df, |
| | test_size=0.5, |
| | random_state=self.config.random_state, |
| | stratify=test_df[self.dataset_params.label_col_name] |
| | ) |
| | |
| | |
| | train_df = train_df.sample(n=self.config.train_size, random_state=self.config.random_state) |
| | val_df = val_df.sample(n=self.config.val_size, random_state=self.config.random_state) |
| | test_df = test_df.sample(n=self.config.test_size, random_state=self.config.random_state) |
| | |
| | return train_df, val_df, test_df |
| |
|
| |
|
    def build_vocabulary(self, tokenized_texts: List[List[str]]) -> Tuple[Dict[str, int], Dict[int, str]]:
        """
        Build a vocabulary from tokenized texts.

        Args:
            tokenized_texts: List of tokenized texts

        Returns:
            Tuple of (word_to_id, id_to_word) mappings
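
        Example (illustrative; assumes ``min_word_freq=1`` and pad/unk
        tokens of "<PAD>"/"<UNK>", all of which are config-dependent):

            >>> gen.build_vocabulary([["the", "cat"], ["the", "dog"]])
            ({'<PAD>': 0, '<UNK>': 1, 'the': 2, 'cat': 3, 'dog': 4},
             {0: '<PAD>', 1: '<UNK>', 2: 'the', 3: 'cat', 4: 'dog'})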
| | """ |
| |
|
| | all_tokens = [token for tokens in tokenized_texts for token in tokens] |
| | word_counts = Counter(all_tokens) |
| | |
| | filtered_words = [word for word, count in word_counts.items() |
| | if count >= self.config.min_word_freq] |
| | |
| | word_to_id = {self.config.pad_token: 0, self.config.unk_token: 1} |
| | id_to_word = {0: self.config.pad_token, 1: self.config.unk_token} |
| | |
| | for idx, word in enumerate(filtered_words, start=2): |
| | word_to_id[word] = idx |
| | id_to_word[idx] = word |
| | |
| | self.text_processor.vocab = word_to_id |
| |
|
| | return word_to_id, id_to_word |
| |
|
| |
|
    def generate_dataset(self) -> Tuple[
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor],
        Tuple[torch.Tensor, torch.Tensor],
    ]:
        """
        Generate the full dataset as (features, labels) tensor pairs.

        Returns:
            Tuple containing:
                - (train_features, train_labels)
                - (val_features, val_labels)
                - (test_features, test_labels)
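
        Example (illustrative; feature shapes assume the text processor
        pads/truncates every text to ``config.max_seq_len``):

            >>> (X_tr, y_tr), (X_va, y_va), (X_te, y_te) = gen.generate_dataset()
            >>> X_tr.shape  # (config.train_size, config.max_seq_len)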
| | """ |
| | |
| | train_df, val_df, test_df = self.load_raw_data() |
| |
|
| | train_texts = train_df[self.dataset_params.content_col_name].tolist() |
| | train_tokens = [self.text_processor.preprocess_text(text) for text in train_texts] |
| | |
| | if self.config.build_vocab: |
| | self.vocab, self.id2word = self.build_vocabulary(train_tokens) |
| |
|
| | X_train = torch.stack([self.text_processor.text_to_tensor(text) for text in train_texts]) |
| | |
| | val_texts = val_df[self.dataset_params.content_col_name].tolist() |
| | X_val = torch.stack([self.text_processor.text_to_tensor(text) for text in val_texts]) |
| | |
| | test_texts = test_df[self.dataset_params.content_col_name].tolist() |
| | X_test = torch.stack([self.text_processor.text_to_tensor(text) for text in test_texts]) |
| | |
| | y_train = torch.tensor(train_df[self.dataset_params.label_col_name].values, dtype=torch.long) |
| | y_val = torch.tensor(val_df[self.dataset_params.label_col_name].values, dtype=torch.long) |
| | y_test = torch.tensor(test_df[self.dataset_params.label_col_name].values, dtype=torch.long) |
| | |
| | return (X_train, y_train), (X_val, y_val), (X_test, y_test) |
| |
|
| |
|
    def get_vocabulary(self) -> Tuple[Dict[str, int], Dict[int, str]]:
        """
        Get the vocabulary mappings.

        Returns:
            Tuple of (word_to_id, id_to_word) dictionaries
        """
        return self.vocab, self.id2word

    def get_config(self) -> DatasetConfig:
        """
        Get the current configuration.

        Returns:
            DatasetConfig object
        """
        return self.config

    def get_text_processor(self) -> TextProcessor:
        """
        Get the text processor for inference usage.

        Returns:
            TextProcessor object
        """
        return self.text_processor
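

if __name__ == "__main__":
    # Minimal usage sketch. ``DatasetName.AG_NEWS`` is an assumed enum member;
    # substitute any member actually defined in
    # src.data_utils.dataset_params.DatasetName.
    generator = DatasetGenerator(dataset_name=dataset_params.DatasetName.AG_NEWS)

    (X_train, y_train), (X_val, y_val), (X_test, y_test) = generator.generate_dataset()
    print(f"train: {tuple(X_train.shape)}, val: {tuple(X_val.shape)}, test: {tuple(X_test.shape)}")

    word_to_id, id_to_word = generator.get_vocabulary()
    if word_to_id is not None:
        print(f"vocabulary size: {len(word_to_id)}")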