from .supported_datasets import (
supported_datasets,
internal_datasets,
possible_with_vector_reps,
standard_data_benchmark,
testing,
)
def list_supported_datasets(with_descriptions=True):
    """
    Print all supported datasets, optionally as a described table.

    When ``with_descriptions`` is false, or the optional
    ``dataset_descriptions`` module cannot be imported, each dataset is
    printed as a ``- name: value`` bullet using the entry stored in
    ``supported_datasets``, and the function returns. Otherwise an aligned
    table with Dataset / Type / Task / Description columns is printed,
    followed by the names in ``standard_data_benchmark``.

    Datasets without an entry in ``dataset_descriptions`` are shown with
    ``Unknown`` / ``No description available`` placeholders.

    Args:
        with_descriptions (bool): Whether to include descriptions (if available)
    """
    try:
        from .dataset_descriptions import dataset_descriptions
    except ImportError:
        # Descriptions are optional; fall back to the plain listing below.
        dataset_descriptions = None

    print("\n=== Supported Datasets ===\n")

    if not with_descriptions or dataset_descriptions is None:
        for dataset_name in supported_datasets:
            print(f"- {dataset_name}: {supported_datasets[dataset_name]}")
        return

    # Resolve every row up front so the column widths are computed from
    # exactly the strings that will be printed (placeholders included).
    rows = []
    for dataset_name in supported_datasets:
        dataset_info = dataset_descriptions.get(dataset_name, {})
        rows.append((
            dataset_name,
            dataset_info.get('type', 'Unknown'),
            dataset_info.get('task', 'Unknown'),
            dataset_info.get('description', 'No description available'),
        ))

    # Seeding each max() with the header label keeps it from raising
    # ValueError when there are no rows, and keeps short columns aligned
    # with their header.
    max_name_len = max([len('Dataset')] + [len(row[0]) for row in rows])
    max_type_len = max([len('Type')] + [len(row[1]) for row in rows])
    max_task_len = max([len('Task')] + [len(row[2]) for row in rows])

    name_col = max_name_len + 2
    type_col = max_type_len + 2
    task_col = max_task_len + 2

    # Print header
    print(f"{'Dataset':<{name_col}}{'Type':<{type_col}}{'Task':<{task_col}}Description")
    print("-" * (max_name_len + max_type_len + max_task_len + 50))

    # Print dataset information
    for name, kind, task, description in rows:
        print(f"{name:<{name_col}}{kind:<{type_col}}{task:<{task_col}}{description}")

    print("\n=== Standard Benchmark Datasets ===\n")
    for dataset_name in standard_data_benchmark:
        print(f"- {dataset_name}")
def get_dataset_info(dataset_name):
    """
    Look up the information available for a single dataset.

    The optional ``dataset_descriptions`` module is consulted first; if it
    is importable and has an entry for ``dataset_name``, that entry is
    returned as-is. Otherwise, a minimal ``{"name", "source"}`` dict is
    built from ``supported_datasets``.

    Args:
        dataset_name (str): Name of the dataset
    Returns:
        dict: Dataset information or None if not found
    """
    try:
        from .dataset_descriptions import dataset_descriptions as described
    except ImportError:
        # No descriptions module available; rely on supported_datasets only.
        described = None

    if described is not None and dataset_name in described:
        return described[dataset_name]

    if dataset_name not in supported_datasets:
        return None

    return {"name": dataset_name, "source": supported_datasets[dataset_name]}
if __name__ == "__main__":
    # Command-line entry point: --list prints every dataset, --info NAME
    # prints the details of one dataset. With no arguments, show the help
    # text and exit with status 1.
    import argparse
    import sys

    cli = argparse.ArgumentParser(description='List and describe supported datasets')
    cli.add_argument('--list', action='store_true', help='List all supported datasets')
    cli.add_argument('--info', type=str, help='Get information about a specific dataset')
    options = cli.parse_args()

    if len(sys.argv) == 1:
        cli.print_help()
        sys.exit(1)

    if options.list:
        list_supported_datasets()

    if options.info:
        details = get_dataset_info(options.info)
        if not details:
            print(f"Dataset '{options.info}' not found in supported datasets.")
        else:
            print(f"\n=== Dataset: {options.info} ===\n")
            for field, content in details.items():
                print(f"{field.capitalize()}: {content}")