File size: 3,653 Bytes
714cf46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
from .supported_datasets import (
    supported_datasets,
    internal_datasets,
    possible_with_vector_reps,
    standard_data_benchmark,
    testing,
)


def list_supported_datasets(with_descriptions=True):
    """
    Print all supported datasets, optionally as a table with descriptions.

    When descriptions are requested and the optional ``dataset_descriptions``
    module can be imported, a table with name, type, task and description
    columns is printed, followed by the list of standard benchmark datasets.
    Otherwise a plain ``- name: source`` list is printed and the benchmark
    section is omitted.

    Args:
        with_descriptions (bool): Whether to include descriptions (if available)

    Returns:
        None. All output goes to stdout.
    """
    # The descriptions module is optional; fall back to the plain listing
    # when it cannot be imported.
    try:
        from .dataset_descriptions import dataset_descriptions
    except ImportError:
        dataset_descriptions = None

    print("\n=== Supported Datasets ===\n")

    if not with_descriptions or dataset_descriptions is None:
        for dataset_name in supported_datasets:
            print(f"- {dataset_name}: {supported_datasets[dataset_name]}")
        return

    # Build every row up front so the column widths are computed from exactly
    # the values that get printed, including the 'Unknown' fallbacks used for
    # datasets that have no description entry.
    rows = []
    for dataset_name in supported_datasets:
        dataset_info = dataset_descriptions.get(dataset_name, {})
        rows.append((
            dataset_name,
            dataset_info.get('type', 'Unknown'),
            dataset_info.get('task', 'Unknown'),
            dataset_info.get('description', 'No description available'),
        ))

    # default=0 keeps max() from raising ValueError when there are no rows.
    max_name_len = max((len(row[0]) for row in rows), default=0)
    max_type_len = max((len(row[1]) for row in rows), default=0)
    max_task_len = max((len(row[2]) for row in rows), default=0)

    # Print header
    print(f"{'Dataset':<{max_name_len+2}}{'Type':<{max_type_len+2}}{'Task':<{max_task_len+2}}Description")
    print("-" * (max_name_len + max_type_len + max_task_len + 50))

    # Print dataset information
    for name, dataset_type, task, description in rows:
        print(f"{name:<{max_name_len+2}}{dataset_type:<{max_type_len+2}}{task:<{max_task_len+2}}{description}")

    print("\n=== Standard Benchmark Datasets ===\n")
    for dataset_name in standard_data_benchmark:
        print(f"- {dataset_name}")


def get_dataset_info(dataset_name):
    """
    Look up the information record for a single dataset.

    The optional ``dataset_descriptions`` module is consulted first; if it
    has an entry for the dataset, that entry is returned as-is. Otherwise a
    minimal record is built from ``supported_datasets``.

    Args:
        dataset_name (str): Name of the dataset

    Returns:
        dict: The description entry, or ``{"name": ..., "source": ...}`` for
        a supported dataset without one, or None if the dataset is unknown.
    """
    try:
        from .dataset_descriptions import dataset_descriptions as catalog
        if dataset_name in catalog:
            return catalog[dataset_name]
    except ImportError:
        # Descriptions are optional; fall through to the basic record.
        pass

    if dataset_name not in supported_datasets:
        return None

    return {"name": dataset_name, "source": supported_datasets[dataset_name]}


if __name__ == "__main__":
    # Command-line entry point: list datasets and/or describe a single one.
    import sys
    import argparse

    cli = argparse.ArgumentParser(description='List and describe supported datasets')
    cli.add_argument('--list', action='store_true', help='List all supported datasets')
    cli.add_argument('--info', type=str, help='Get information about a specific dataset')
    options = cli.parse_args()

    # Invoked with no arguments at all: show usage and exit with an error code.
    invoked_bare = len(sys.argv) == 1
    if invoked_bare:
        cli.print_help()
        sys.exit(1)

    if options.list:
        list_supported_datasets()

    requested = options.info
    if requested:
        details = get_dataset_info(requested)
        if not details:
            print(f"Dataset '{requested}' not found in supported datasets.")
        else:
            print(f"\n=== Dataset: {requested} ===\n")
            for field, content in details.items():
                print(f"{field.capitalize()}: {content}")