| from dataclasses import dataclass |
|
|
|
|
| @dataclass |
| class DatasetConfig: |
| """ |
| Configuration class for dataset generation parameters |
| |
| Attributes: |
| embedding_dim: Dimension for embedding layer output |
| train_size: Number of samples in training set |
| val_size: Number of samples in validation set |
| test_size: Number of samples in test set |
| random_state: Random seed for reproducibility |
| min_word_freq: Minimum word frequency to include in vocabulary |
| load_from_disk: Load dataset from local dir. If false download from huggin face |
| path_to_data: Path to local dataset data |
| build_vocab: Is build vocabulary necessary |
| max_seq_len: Maximum sequence length (will be padded/truncated to this) |
| lowercase: Whether to convert text to lowercase |
| remove_punct: Whether to remove punctuation |
| pad_token: Padding token |
| unk_token: Unknown token |
| """ |
|
|
| embedding_dim: int = 64 |
| train_size: int = 10000 |
| val_size: int = 5000 |
| test_size: int = 5000 |
| random_state: int = 42 |
| min_word_freq: int = 1 |
| load_from_disk: bool = False |
| path_to_data: str = "./datasets" |
| build_vocab: bool = True |
|
|
| max_seq_len: int = 300 |
| lowercase: bool = True |
| remove_punct: bool = False |
| pad_token: str = "<PAD>" |
| unk_token: str = "<UNK>" |
|
|
|
|
| @dataclass |
| class TextProcessorConfig: |
| """ |
| Configuration class for text processor parameters (params should be equal dataset config) |
| |
| Attributes: |
| max_seq_len: Maximum sequence length (will be padded/truncated to this) |
| lowercase: Whether to convert text to lowercase |
| remove_punct: Whether to remove punctuation |
| pad_token: Padding token |
| unk_token: Unknown token |
| """ |
|
|
| max_seq_len: int = 300 |
| lowercase: bool = True |
| remove_punct: bool = False |
| pad_token: str = "<PAD>" |
| unk_token: str = "<UNK>" |
|
|