Spaces:

vectorplasticity
/

universal-model-trainer

Sleeping

App Files Files Community

vectorplasticity committed on Apr 12

Commit

16525fb

verified ·

1 Parent(s): 39f0935

Add dataset utilities

Browse files

Files changed (1) hide show

app/utils/dataset_utils.py +551 -0

app/utils/dataset_utils.py ADDED Viewed

	@@ -0,0 +1,551 @@

+"""
+Dataset Utilities - Helper functions for dataset operations
+"""
+import logging
+from typing import Dict, Any, List, Optional, Tuple
+from datasets import load_dataset, Dataset, DatasetDict
+from transformers import AutoTokenizer
+import json
+import os
+logger = logging.getLogger(__name__)
# Column names recorded for common datasets, in lookup order.
# Each dataset name is paired with the columns it is known by here; an empty
# tuple means no columns are listed for that name.
_KNOWN_DATASET_COLUMNS = (
    ("wikitext", ("text",)),
    ("squad", ("question", "context", "answers")),
    ("squad_v2", ("question", "context", "answers")),
    ("cnn_dailymail", ("article", "highlights")),
    ("xsum", ("document", "summary")),
    ("samsum", ("dialogue", "summary")),
    ("billsum", ("text", "summary")),
    ("aeslc", ("email_body", "subject_line")),
    ("conll2003", ("tokens", "ner_tags")),
    ("wnut_17", ("tokens", "ner_tags")),
    ("imdb", ("text", "label")),
    ("yelp_polarity", ("text", "label")),
    ("yelp_review_full", ("text", "label")),
    ("sst2", ("sentence", "label")),
    ("cola", ("sentence", "label")),
    ("mnli", ("premise", "hypothesis", "label")),
    ("qnli", ("question", "sentence", "label")),
    ("qqp", ("question1", "question2", "label")),
    ("mrpc", ("sentence1", "sentence2", "label")),
    ("stsb", ("sentence1", "sentence2", "label")),
    ("glue", ()),
    ("super_glue", ()),
    ("trec", ("text", "label")),
    ("ag_news", ("text", "label")),
    ("dbpedia_14", ("content", "label")),
    ("20newsgroups", ("text", "label")),
)

# Dataset column mappings for common datasets.
# Every mapping is an identity mapping: each column name maps to itself.
DATASET_COLUMN_MAPPINGS = {
    dataset_name: {column: column for column in column_names}
    for dataset_name, column_names in _KNOWN_DATASET_COLUMNS
}
# Task-specific dataset templates.
# Each task name maps to its default column names, a "format" tag and a list
# of example dataset names.  The seq2seq template leaves both column names as
# None.
TASK_DATASET_TEMPLATES = {
    "causal-lm": dict(
        text_column="text",
        format="causal",
        examples=["wikitext", "openwebtext", "the_pile", "c4", "oscar"],
    ),
    "seq2seq": dict(
        input_column=None,
        target_column=None,
        format="seq2seq",
        examples=["cnn_dailymail", "xsum", "samsum", "billsum", "aeslc"],
    ),
    "token-classification": dict(
        tokens_column="tokens",
        labels_column="ner_tags",
        format="token",
        examples=["conll2003", "wnut_17", "ontonotes5"],
    ),
    "text-classification": dict(
        text_column="text",
        label_column="label",
        format="classification",
        examples=["imdb", "yelp_polarity", "sst2", "ag_news", "dbpedia_14"],
    ),
    "question-answering": dict(
        context_column="context",
        question_column="question",
        answers_column="answers",
        format="qa",
        examples=["squad", "squad_v2", "natural_questions", "hotpotqa"],
    ),
    "reasoning": dict(
        input_column="input",
        target_column="target",
        format="causal",
        examples=["gsm8k", "strategyqa", "aqua"],
    ),
}
def get_dataset_info(dataset_name: str) -> Dict[str, Any]:
    """Get information about a dataset from HuggingFace Hub.

    Args:
        dataset_name: Repository id handed to ``HfApi.dataset_info``.

    Returns:
        On success, a dict with the keys ``id``, ``author``, ``sha``,
        ``downloads``, ``tags``, ``description``, ``card_data``, ``siblings``
        (list of file names) and ``size_bytes`` (sum of the sibling sizes that
        are reported).  Optional fields that are missing or ``None`` are
        replaced by ``0``, ``[]``, ``""`` or ``{}`` respectively.
        On any failure, ``{"error": <message>}``; the error is also logged.
    """
    try:
        # Imported lazily so that an unavailable huggingface_hub is reported
        # through the error dict instead of failing at module import time.
        from huggingface_hub import HfApi

        info = HfApi().dataset_info(dataset_name)

        # Optional attributes can be present but set to None, in which case a
        # plain getattr() default would never be used; normalise explicitly.
        downloads = getattr(info, "downloads", None)
        description = getattr(info, "description", None)
        card_data = getattr(info, "card_data", None)
        siblings = info.siblings or []

        return {
            "id": info.id,
            "author": info.author,
            "sha": info.sha,
            "downloads": 0 if downloads is None else downloads,
            "tags": info.tags or [],
            "description": "" if description is None else description,
            "card_data": {} if card_data is None else card_data,
            "siblings": [s.rfilename for s in siblings],
            # A sibling's size may be absent or None; count those as 0 bytes.
            "size_bytes": sum(getattr(s, "size", 0) or 0 for s in siblings),
        }
    except Exception as e:
        # Deliberately broad: callers receive an error dict, never an exception.
        logger.error(f"Error getting dataset info for {dataset_name}: {e}")
        return {"error": str(e)}
def load_and_validate_dataset(
    dataset_name: str,
    config: Optional[str] = None,
    split: Optional[str] = None,
    trust_remote_code: bool = False,
) -> Tuple[Optional[DatasetDict], Optional[str]]:
    """Load a dataset with ``load_dataset`` and report failures as a message.

    Args:
        dataset_name: Name or path forwarded to ``load_dataset``.
        config: Optional configuration name, forwarded as ``name`` when truthy.
        split: Optional split selector, forwarded as ``split`` when truthy.
        trust_remote_code: Always forwarded to ``load_dataset`` as given.

    Returns:
        ``(dataset, None)`` on success, ``(None, error_message)`` on failure.
        A bare ``Dataset`` result is wrapped in a ``DatasetDict`` under the
        key ``"train"``, whatever ``split`` was requested.
    """
    load_kwargs = {"trust_remote_code": trust_remote_code}
    # Falsy config/split values are simply left out of the call.
    for keyword, value in (("name", config), ("split", split)):
        if value:
            load_kwargs[keyword] = value

    try:
        loaded = load_dataset(dataset_name, **load_kwargs)
        if isinstance(loaded, Dataset):
            # If single split returned, wrap in dict
            loaded = DatasetDict({"train": loaded})
        return loaded, None
    except Exception as e:
        logger.error(f"Error loading dataset {dataset_name}: {e}")
        return None, str(e)
def get_dataset_schema(dataset: DatasetDict) -> Dict[str, Any]:
    """Describe the splits, row counts and columns of a dataset.

    Column and feature details are read from the first split only; row counts
    are reported for every split.

    Args:
        dataset: Mapping of split name to split.

    Returns:
        ``{}`` for a falsy dataset, otherwise a dict with:
        ``splits`` (list of split names), ``columns`` (per-column info dicts),
        ``num_rows`` (split name -> length) and ``features`` (column name ->
        ``str(feature)``, or ``"unknown"`` when the feature is missing/falsy).
    """
    if not dataset:
        return {}

    split_names = list(dataset.keys())
    reference_split = dataset[split_names[0]]

    column_details = {}
    feature_strings = {}
    for column in reference_split.column_names:
        feature = reference_split.features.get(column)
        details = {"name": column}
        if feature:
            if hasattr(feature, "dtype"):
                details["dtype"] = str(feature.dtype)
            else:
                # No dtype attribute: fall back to the feature's Python type.
                details["dtype"] = str(type(feature))
            if hasattr(feature, "names"):
                details["label_names"] = list(feature.names)
            details["feature_type"] = type(feature).__name__
            feature_strings[column] = str(feature)
        else:
            feature_strings[column] = "unknown"
        column_details[column] = details

    return {
        "splits": split_names,
        "columns": column_details,
        "num_rows": {name: len(split) for name, split in dataset.items()},
        "features": feature_strings,
    }
def detect_task_type(dataset_name: str, dataset: DatasetDict) -> str:
    """Detect the likely task type for a dataset based on its columns.

    Only the column names of the first split are inspected.  Rules are tried
    in order and the first match wins.

    Args:
        dataset_name: Name of the dataset.  Currently not used by the
            detection; kept so the signature stays stable for callers.
        dataset: Mapping of split name to split.

    Returns:
        ``"unknown"`` for a falsy dataset, otherwise one of
        ``"token-classification"``, ``"question-answering"``, ``"seq2seq"``,
        ``"text-classification"`` or ``"causal-lm"`` (also the default).
    """
    if not dataset:
        return "unknown"

    first_split = next(iter(dataset))
    columns = set(dataset[first_split].column_names)

    if {"tokens", "ner_tags"} <= columns:
        return "token-classification"
    if {"question", "context"} <= columns:
        return "question-answering"
    if "article" in columns or "document" in columns:
        return "seq2seq"
    if {"text", "label"} <= columns:
        return "text-classification"
    # Summarisation/dialogue markers must be tested before the generic
    # "small dataset with a text column" rule below; otherwise a dataset with
    # e.g. text/summary/title columns is mistaken for plain language modelling.
    if "dialogue" in columns or "summary" in columns:
        return "seq2seq"
    if "text" in columns and len(columns) <= 3:
        return "causal-lm"
    # input/target style datasets are treated as causal-lm, which is also the
    # default for anything that matched no rule above.
    return "causal-lm"
def get_dataset_columns_for_task(
    dataset: DatasetDict,
    task_type: str
) -> Dict[str, str]:
    """Get the appropriate column mapping for a task.

    For every role the task needs (for example ``text_column``), the first
    candidate column name that exists in the first split is selected.  Roles
    with no matching column are left out of the result.

    Args:
        dataset: Mapping of split name to split.
        task_type: One of ``"causal-lm"``, ``"seq2seq"``,
            ``"token-classification"``, ``"text-classification"`` or
            ``"question-answering"``.  Any other value yields ``{}``.

    Returns:
        Dict of role name -> column name; ``{}`` for a falsy dataset.
    """
    if not dataset:
        return {}

    first_split = next(iter(dataset))
    columns = set(dataset[first_split].column_names)

    # Role -> candidate column names, highest priority first.
    candidates_by_task = {
        "causal-lm": {
            "text_column": ("text", "content", "document", "article", "input"),
        },
        "seq2seq": {
            "input_column": ("article", "document", "text", "input", "dialogue"),
            "target_column": ("highlights", "summary", "target", "output", "subject_line"),
        },
        "token-classification": {
            "tokens_column": ("tokens", "words"),
            "labels_column": ("ner_tags", "labels", "tags"),
        },
        "text-classification": {
            "text_column": ("text", "sentence", "content", "review"),
            "label_column": ("label", "labels", "class", "category"),
        },
        "question-answering": {
            "context_column": ("context",),
            "question_column": ("question",),
            # First match wins here as well, so "answers" is preferred over
            # "answer" when a dataset happens to carry both.
            "answers_column": ("answers", "answer"),
        },
    }

    mapping = {}
    for role, candidates in candidates_by_task.get(task_type, {}).items():
        match = next((name for name in candidates if name in columns), None)
        if match is not None:
            mapping[role] = match

    # A causal-lm dataset with exactly one column has only one possible text
    # source, whatever that column is called.
    if task_type == "causal-lm" and not mapping and len(columns) == 1:
        mapping["text_column"] = next(iter(columns))

    return mapping
def prepare_dataset_for_training(
    dataset: DatasetDict,
    tokenizer: Any,
    task_type: str,
    column_mapping: Dict[str, str],
    max_length: int = 512,
    padding: str = "max_length",
    truncation: bool = True,
) -> Tuple[DatasetDict, Dict[str, Any]]:
    """Prepare dataset for training by tokenizing every split.

    Args:
        dataset: Mapping of split name to split; each split is processed with
            its ``map`` method in batched mode.
        tokenizer: Callable tokenizer.  It is called with ``padding``,
            ``truncation`` and ``max_length``; the seq2seq path additionally
            uses the ``text_target`` keyword and the token-classification path
            uses ``is_split_into_words`` and ``word_ids``.
        task_type: ``"causal-lm"``, ``"seq2seq"``, ``"token-classification"``,
            ``"text-classification"`` or ``"question-answering"``.  Batches of
            any other task type are passed through unchanged.
        column_mapping: Role name -> column name (e.g. ``{"text_column": "text"}``).
        max_length: Maximum sequence length handed to the tokenizer.
        padding: Padding strategy handed to the tokenizer.
        truncation: Truncation flag handed to the tokenizer.

    Returns:
        ``(tokenized_datasets, stats)``.  ``stats`` holds per-split
        ``original_samples``, ``processed_samples`` and ``removed_samples``;
        ``avg_length`` is present but not computed.

    Raises:
        ValueError: For seq2seq when the input or target column is not mapped.

    Notes:
        * Labels derived from token ids (causal-lm, seq2seq) use -100 on
          padded positions so padding does not contribute to the loss.
        * Question-answering labels are ``[0, 0]`` placeholders; real answer
          span positions are not computed here.
        * If a required column is missing from a batch, the causal-lm,
          token-classification and text-classification paths return the batch
          unchanged.
    """
    tokenizer_kwargs = {
        "padding": padding,
        "truncation": truncation,
        "max_length": max_length,
    }

    def labels_ignoring_padding(encoding):
        """Copy ``input_ids``, writing -100 wherever the attention mask is 0."""
        if "attention_mask" not in encoding:
            # No mask available: fall back to a plain per-row copy.
            return [list(ids) for ids in encoding["input_ids"]]
        return [
            [token if attended else -100 for token, attended in zip(ids, mask)]
            for ids, mask in zip(encoding["input_ids"], encoding["attention_mask"])
        ]

    def tokenize_causal_lm(examples):
        text_col = column_mapping.get("text_column", "text")
        if text_col not in examples:
            return examples
        outputs = tokenizer(examples[text_col], **tokenizer_kwargs)
        outputs["labels"] = labels_ignoring_padding(outputs)
        return outputs

    def tokenize_seq2seq(examples):
        input_col = column_mapping.get("input_column")
        target_col = column_mapping.get("target_column")
        if not input_col or not target_col:
            raise ValueError(f"Missing columns for seq2seq: {column_mapping}")
        model_inputs = tokenizer(examples[input_col], **tokenizer_kwargs)
        # text_target replaces the deprecated as_target_tokenizer() context.
        targets = tokenizer(text_target=examples[target_col], **tokenizer_kwargs)
        model_inputs["labels"] = labels_ignoring_padding(targets)
        return model_inputs

    def tokenize_token_classification(examples):
        tokens_col = column_mapping.get("tokens_column", "tokens")
        labels_col = column_mapping.get("labels_column", "ner_tags")
        if tokens_col not in examples or labels_col not in examples:
            return examples
        tokenized_inputs = tokenizer(
            examples[tokens_col],
            is_split_into_words=True,
            **tokenizer_kwargs,
        )
        labels = []
        for i, word_labels in enumerate(examples[labels_col]):
            previous_word_idx = None
            label_ids = []
            for word_idx in tokenized_inputs.word_ids(batch_index=i):
                # Only the first sub-token of each word carries the word's
                # label; special tokens and continuation pieces get -100.
                if word_idx is None or word_idx == previous_word_idx:
                    label_ids.append(-100)
                else:
                    label_ids.append(word_labels[word_idx])
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    def tokenize_text_classification(examples):
        text_col = column_mapping.get("text_column", "text")
        if text_col not in examples:
            return examples
        tokenized = tokenizer(examples[text_col], **tokenizer_kwargs)
        # Add labels if present
        label_col = column_mapping.get("label_column", "label")
        if label_col in examples:
            tokenized["labels"] = examples[label_col]
        return tokenized

    def tokenize_question_answering(examples):
        context_col = column_mapping.get("context_column", "context")
        question_col = column_mapping.get("question_column", "question")
        answers_col = column_mapping.get("answers_column", "answers")
        tokenized = tokenizer(
            examples[question_col],
            examples[context_col],
            **tokenizer_kwargs,
        )
        if answers_col in examples:
            # Simplified - full implementation would compute token positions
            tokenized["labels"] = [[0, 0] for _ in examples[answers_col]]
        return tokenized

    def passthrough(examples):
        return examples

    tokenize_function = {
        "causal-lm": tokenize_causal_lm,
        "seq2seq": tokenize_seq2seq,
        "token-classification": tokenize_token_classification,
        "text-classification": tokenize_text_classification,
        "question-answering": tokenize_question_answering,
    }.get(task_type, passthrough)

    stats = {
        "original_samples": {},
        "processed_samples": {},
        "avg_length": {},
        "removed_samples": {},
    }

    # Columns that survive tokenization: label/model-input columns plus every
    # column named in the mapping.
    kept_columns = ["labels", "label", "input_ids", "attention_mask"]
    kept_columns.extend(column_mapping.values())

    # Tokenize each split
    tokenized_datasets = DatasetDict()
    for split_name, split_ds in dataset.items():
        original_count = len(split_ds)
        remove_columns = [
            col for col in split_ds.column_names if col not in kept_columns
        ]
        tokenized = split_ds.map(
            tokenize_function,
            batched=True,
            remove_columns=remove_columns,
            desc=f"Tokenizing {split_name}",
        )
        tokenized_datasets[split_name] = tokenized

        processed_count = len(tokenized)
        stats["original_samples"][split_name] = original_count
        stats["processed_samples"][split_name] = processed_count
        stats["removed_samples"][split_name] = original_count - processed_count

    return tokenized_datasets, stats
def split_dataset(
    dataset: DatasetDict,
    train_split: float = 0.9,
    val_split: float = 0.1,
    seed: int = 42,
) -> DatasetDict:
    """Split a dataset into train and validation sets.

    When the dataset has a ``"train"`` split but no ``"validation"`` split, a
    validation split is carved out of ``"train"``.  All other splits (for
    example ``"test"``) are carried over untouched.

    Args:
        dataset: Mapping of split name to split.
        train_split: Not used; the training share is whatever remains after
            ``val_split`` is taken.  Kept for signature compatibility.
        val_split: Passed as ``test_size`` to ``train_test_split``.
        seed: Seed passed to ``train_test_split``.

    Returns:
        The input object itself when it already has ``"validation"`` or has no
        ``"train"`` split; otherwise a new ``DatasetDict``.
    """
    if "validation" in dataset or "train" not in dataset:
        return dataset

    parts = dataset["train"].train_test_split(
        test_size=val_split,
        seed=seed,
    )

    # Start from the existing splits so that nothing besides "train" is lost.
    result = dict(dataset)
    result["train"] = parts["train"]
    result["validation"] = parts["test"]
    return DatasetDict(result)
def sample_dataset(
    dataset: DatasetDict,
    n_samples: int,
    split: str = "train",
    seed: int = 42,
) -> DatasetDict:
    """Return a copy of the dataset with one split reduced to a random sample.

    Args:
        dataset: Mapping of split name to split.
        n_samples: Upper bound on the number of rows kept; capped at the
            length of the split.
        split: Name of the split to sample.
        seed: Seed passed to the split's ``shuffle``.

    Returns:
        The input object itself when ``split`` is not present; otherwise a new
        ``DatasetDict`` in which ``split`` is the shuffled, truncated split and
        every other split is carried over as-is.
    """
    if split not in dataset:
        return dataset

    source = dataset[split]
    keep = min(n_samples, len(source))
    subset = source.shuffle(seed=seed).select(range(keep))
    return DatasetDict({**dataset, split: subset})
def get_label_list(dataset: DatasetDict, label_column: str = "label") -> List[str]:
    """Get list of labels from dataset.

    Splits are examined in order; the first split that contains
    ``label_column`` and yields labels decides the result.

    * If the column's feature exposes ``names``, those names are returned.
    * Otherwise the labels are inferred from the column's values: for
      non-negative integer values the result is ``"0"`` .. ``str(max)``.
      Values of any other kind are ignored for the inference.

    Args:
        dataset: Mapping of split name to split.
        label_column: Name of the label column.

    Returns:
        List of label names, or ``[]`` when the dataset is falsy, the column
        is absent everywhere, or no labels could be determined.
    """
    if not dataset:
        return []

    for split_ds in dataset.values():
        if label_column not in split_ds.column_names:
            continue

        feature = split_ds.features.get(label_column)
        names = getattr(feature, "names", None)
        if names is not None:
            return list(names)

        # No declared names: try to infer number of labels from the data.
        # bool is excluded explicitly because it is a subclass of int.
        observed = {
            value
            for value in split_ds[label_column]
            if isinstance(value, int) and not isinstance(value, bool) and value >= 0
        }
        if observed:
            return [str(i) for i in range(max(observed) + 1)]

    return []
def estimate_dataset_size(dataset: DatasetDict) -> Dict[str, Any]:
    """Estimate dataset size in memory.

    The estimate is a flat 0.001 MB (about 1 KB) per sample; actual sample
    contents are not inspected.

    Args:
        dataset: Mapping of split name to split (anything supporting ``len``).

    Returns:
        Dict with ``total_samples``, ``estimated_size_mb`` (rounded to two
        decimals) and ``splits`` (split name -> number of samples).  A falsy
        dataset yields zero totals and an empty ``splits`` dict, so the result
        always has the same keys.
    """
    if not dataset:
        return {"total_samples": 0, "estimated_size_mb": 0, "splits": {}}

    split_sizes = {name: len(split) for name, split in dataset.items()}
    total_samples = sum(split_sizes.values())
    # Rough estimation: ~1KB per sample for text
    estimated_size_mb = total_samples * 0.001
    return {
        "total_samples": total_samples,
        "estimated_size_mb": round(estimated_size_mb, 2),
        "splits": split_sizes,
    }
def validate_dataset_for_task(
    dataset: DatasetDict,
    task_type: str,
    column_mapping: Dict[str, str],
) -> Tuple[bool, List[str]]:
    """Check that the mapped columns a task needs exist in the dataset.

    Only the column names of the first split are consulted.  A task type with
    no known requirements produces no issues.

    Args:
        dataset: Mapping of split name to split.
        task_type: Task identifier such as ``"causal-lm"`` or ``"seq2seq"``.
        column_mapping: Role name -> column name.

    Returns:
        ``(is_valid, issues)`` where ``issues`` has one message per required
        role that is unmapped or mapped to a column that does not exist.
    """
    if not dataset:
        return False, ["Dataset is empty or could not be loaded"]

    split_names = list(dataset.keys())
    columns = set(dataset[split_names[0]].column_names)

    # Per task: (key in column_mapping, wording used in the issue message).
    requirements = {
        "causal-lm": (
            ("text_column", "text column"),
        ),
        "seq2seq": (
            ("input_column", "input column"),
            ("target_column", "target column"),
        ),
        "token-classification": (
            ("tokens_column", "tokens column"),
            ("labels_column", "labels column"),
        ),
        "text-classification": (
            ("text_column", "text column"),
            ("label_column", "label column"),
        ),
        "question-answering": (
            ("context_column", "context_column"),
            ("question_column", "question_column"),
            ("answers_column", "answers_column"),
        ),
    }

    issues = []
    for mapping_key, wording in requirements.get(task_type, ()):
        mapped = column_mapping.get(mapping_key)
        if not (mapped and mapped in columns):
            issues.append(f"Missing {wording}. Found: {columns}")

    return not issues, issues