Spaces:

senthil2421
/

mlforge

Sleeping

mlforge / datasets /format_adapters.py

senthil2421

Refactor cloud_backend: remove local execution routes and fix missing modules

e10cda2 about 2 months ago

10.8 kB

	from pathlib import Path
	import json
	import re
	from typing import Any, List, Tuple, Iterator, Dict
	from .base_adapter import DatasetAdapter
	from models.dataset import UniversalDatasetItem, DatasetContentType, UniversalAnnotation, UniversalAnnotationType, DatasetTask
	from .annotation_parser import YOLOParser, COCOParser, VOCParser, RoboflowTXTParser, _img_dimensions

	class YOLOAdapter(DatasetAdapter):
	    """Adapter for YOLO-style datasets (a ``data.yaml`` and/or ``.txt`` label files)."""

	    # Text files that may sit next to the labels but are never label files.
	    _NON_LABEL_TXT = frozenset(
	        ("classes.txt", "obj.names", "README.txt", "LICENSE.txt", "README.roboflow.txt")
	    )
	    # A label row: an integer class id followed by four numeric fields.
	    _LABEL_ROW = re.compile(r"^\d+\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+\s+[\d\.]+")

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True when *dataset_path* looks like a YOLO dataset.

	        A ``data.yaml`` anywhere under the directory is accepted outright.
	        Otherwise the first ``.txt`` file that is not a known non-label file
	        is opened and its first non-blank line is matched against the label
	        row pattern. Unreadable or undecodable files count as "no match".
	        """
	        # rglob is lazy, so any()/next() stop at the first hit instead of
	        # walking the whole tree.
	        if any(dataset_path.rglob("data.yaml")):
	            return True

	        first_label = next(
	            (p for p in dataset_path.rglob("*.txt") if p.name not in self._NON_LABEL_TXT),
	            None,
	        )
	        if first_label is None:
	            return False

	        try:
	            with first_label.open(encoding="utf-8") as fh:
	                # Only the first non-blank line is needed; do not load the file.
	                first_row = next((ln.strip() for ln in fh if ln.strip()), "")
	        except (OSError, UnicodeDecodeError):
	            # Detection is best-effort: an unreadable candidate is not YOLO.
	            return False
	        return self._LABEL_ROW.match(first_row) is not None

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for this format (always detection)."""
	        return DatasetTask.detection

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return the class names as loaded by ``YOLOParser.load_class_map``."""
	        return YOLOParser.load_class_map(dataset_path)

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield ``(image_record, annotations)`` for every image the parser reports.

	        The image record is a dict with the keys ``id``, ``filename``,
	        ``rel_path``, ``width``, ``height``, ``split`` and ``ann_count``.
	        Annotations are passed through from ``YOLOParser.iter_dataset``.
	        """
	        class_map = self.get_class_names(dataset_path)
	        for rel_path, image_id, split, anns in YOLOParser.iter_dataset(dataset_path, dataset_id, class_map):
	            # The parser reports paths relative to the dataset root.
	            width, height = _img_dimensions(dataset_path / rel_path)
	            img_rec = {
	                "id": image_id,
	                "filename": Path(rel_path).name,
	                "rel_path": str(rel_path),
	                "width": width,
	                "height": height,
	                "split": split,
	                "ann_count": len(anns),
	            }
	            yield img_rec, anns

	class COCOAdapter(DatasetAdapter):
	    """Adapter for datasets annotated with COCO-style JSON files."""

	    # Number of leading characters inspected when sniffing a JSON file.
	    _SNIFF_CHARS = 2048

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True when any ``.json`` file under *dataset_path* looks like COCO.

	        A file matches when both ``"images"`` and ``"annotations"`` appear
	        within its first 2048 characters. Unreadable files are skipped.
	        """
	        for jf in dataset_path.rglob("*.json"):
	            try:
	                # Read only the prefix; annotation files can be very large.
	                with jf.open(encoding="utf-8", errors="replace") as fh:
	                    snippet = fh.read(self._SNIFF_CHARS)
	            except OSError:
	                continue
	            if '"images"' in snippet and '"annotations"' in snippet:
	                return True
	        return False

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for this format (always segmentation)."""
	        return DatasetTask.segmentation  # Roboflow COCO often implies segmentation

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return the class names of all annotation files, de-duplicated.

	        Names keep the order in which they are first seen across the files
	        returned by ``COCOParser.find_annotation_files``.
	        """
	        # A dict keeps insertion order and drops repeats in one pass.
	        ordered: Dict[str, None] = {}
	        for ann_file in COCOParser.find_annotation_files(dataset_path):
	            # "dummy" stands in for the dataset id; only the class list is used.
	            classes, _ = COCOParser.parse_file(ann_file, "dummy")
	            ordered.update(dict.fromkeys(classes))
	        return list(ordered)

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield ``(image_record, annotations)`` for every image in every annotation file.

	        The image record is a dict with the keys ``id``, ``filename``,
	        ``rel_path``, ``width``, ``height``, ``split`` and ``ann_count``.
	        """
	        for ann_file in COCOParser.find_annotation_files(dataset_path):
	            _, coco_results = COCOParser.parse_file(ann_file, dataset_id)
	            for rel_path, image_id, split, anns in coco_results:
	                width, height = _img_dimensions(dataset_path / rel_path)
	                img_rec = {
	                    "id": image_id,
	                    "filename": Path(rel_path).name,
	                    "rel_path": str(rel_path),
	                    "width": width,
	                    "height": height,
	                    "split": split,
	                    "ann_count": len(anns),
	                }
	                yield img_rec, anns

	class VOCAdapter(DatasetAdapter):
	    """Adapter for datasets annotated with Pascal VOC-style XML files."""

	    # Number of leading characters inspected when sniffing an XML file.
	    _SNIFF_CHARS = 512

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True when any ``.xml`` file under *dataset_path* looks like VOC.

	        A file matches when ``<annotation>`` appears within its first 512
	        characters. Unreadable files are skipped.
	        """
	        for xf in dataset_path.rglob("*.xml"):
	            try:
	                # Read only the prefix instead of loading the whole file.
	                with xf.open(encoding="utf-8", errors="replace") as fh:
	                    snippet = fh.read(self._SNIFF_CHARS)
	            except OSError:
	                continue
	            if "<annotation>" in snippet:
	                return True
	        return False

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for this format (always detection)."""
	        return DatasetTask.detection

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return the sorted, unique ``label`` values of all parsed annotations."""
	        # "dummy" stands in for the dataset id; only the labels are used.
	        labels = {
	            ann["label"]
	            for _path, _image_id, _split, _w, _h, anns in VOCParser.iter_dataset(dataset_path, "dummy")
	            for ann in anns
	        }
	        return sorted(labels)

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield ``(image_record, annotations)`` for every image the parser reports.

	        Width and height come from the parser itself, so the image file is
	        not opened here. The image record is a dict with the keys ``id``,
	        ``filename``, ``rel_path``, ``width``, ``height``, ``split`` and
	        ``ann_count``.
	        """
	        for rel_path, image_id, split, width, height, anns in VOCParser.iter_dataset(dataset_path, dataset_id):
	            img_rec = {
	                "id": image_id,
	                "filename": Path(rel_path).name,
	                "rel_path": str(rel_path),
	                "width": width,
	                "height": height,
	                "split": split,
	                "ann_count": len(anns),
	            }
	            yield img_rec, anns

	class CreateMLAdapter(DatasetAdapter):
	    """Adapter for CreateML-style JSON files (a top-level list of image entries)."""

	    # Number of leading characters inspected when sniffing a JSON file.
	    _SNIFF_CHARS = 1024
	    # Keys a bounding box must carry to be converted.
	    _BOX_KEYS = ("x", "y", "width", "height")

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True when any ``.json`` file under *dataset_path* looks like CreateML.

	        A file matches when ``"image"``, ``"annotations"`` and ``[`` all
	        appear within its first 1024 characters. Unreadable files are skipped.
	        """
	        for jf in dataset_path.rglob("*.json"):
	            try:
	                # Read only the prefix instead of loading the whole file.
	                with jf.open(encoding="utf-8", errors="replace") as fh:
	                    snippet = fh.read(self._SNIFF_CHARS)
	            except OSError:
	                continue
	            if '"image"' in snippet and '"annotations"' in snippet and "[" in snippet:
	                return True
	        return False

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for this format (always detection)."""
	        return DatasetTask.detection

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return the sorted, unique annotation labels found in all JSON lists.

	        Files that cannot be read or parsed, and entries that do not have the
	        expected dict/list structure, are skipped.
	        """
	        classes = set()
	        for jf in dataset_path.rglob("*.json"):
	            for item in self._load_entries(jf):
	                if not isinstance(item, dict):
	                    continue
	                annotations = item.get("annotations")
	                if not isinstance(annotations, list):
	                    continue
	                for ann in annotations:
	                    # Only string labels are kept so the result can be sorted.
	                    if isinstance(ann, dict) and isinstance(ann.get("label"), str):
	                        classes.add(ann["label"])
	        return sorted(classes)

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield ``(image_record, annotations)`` for every entry whose image exists.

	        The split is derived from the JSON file's path components: ``val`` or
	        ``valid`` gives ``"val"``, ``test`` gives ``"test"``, anything else
	        ``"train"``. Malformed entries are skipped individually.
	        """
	        for jf in dataset_path.rglob("*.json"):
	            entries = self._load_entries(jf)
	            if not entries:
	                continue

	            # Determine split from path
	            if "val" in jf.parts or "valid" in jf.parts:
	                split = "val"
	            elif "test" in jf.parts:
	                split = "test"
	            else:
	                split = "train"

	            for item in entries:
	                try:
	                    record = self._build_record(item, jf, dataset_id, dataset_path, split)
	                except (AttributeError, KeyError, TypeError, ValueError, OSError):
	                    # Best-effort: drop the malformed entry, keep the rest.
	                    continue
	                # The yield stays outside the try block so that closing the
	                # generator (GeneratorExit) is never swallowed.
	                if record is not None:
	                    yield record

	    def _load_entries(self, jf: Path) -> List[Any]:
	        """Return the top-level JSON list in *jf*, or ``[]`` if it is unreadable or not a list."""
	        try:
	            data = json.loads(jf.read_text(encoding="utf-8"))
	        except (OSError, ValueError):
	            # ValueError covers both invalid JSON and invalid UTF-8.
	            return []
	        return data if isinstance(data, list) else []

	    def _build_record(self, item, jf: Path, dataset_id: str, dataset_path: Path, split: str):
	        """Convert one JSON entry to ``(image_record, annotations)``, or None if it has no usable image."""
	        # uuid is imported here because the module-level import block does
	        # not provide it.
	        import uuid
	        from .annotation_parser import _make_ann

	        rel_img_path = item.get("image")
	        if not rel_img_path:
	            return None

	        # Try to find the image relative to JSON or root
	        img_path = jf.parent / rel_img_path
	        if not img_path.exists():
	            img_path = dataset_path / rel_img_path
	        if not img_path.exists():
	            return None

	        image_id = f"img-{uuid.uuid4().hex[:12]}"
	        w, h = _img_dimensions(img_path)

	        anns = []
	        for ca in item.get("annotations", []):
	            label = ca.get("label", "unknown")
	            coord = ca.get("coordinates", {})
	            # CreateML coords are usually center-based pixels: {x, y, width, height}
	            # Boxes are skipped when a key is missing or the image size is
	            # unknown, because normalisation would be impossible.
	            if w > 0 and h > 0 and all(key in coord for key in self._BOX_KEYS):
	                cx, cy, bw, bh = coord["x"], coord["y"], coord["width"], coord["height"]
	                # Convert to top-left normalized
	                nx = (cx - bw / 2) / w
	                ny = (cy - bh / 2) / h
	                nw = bw / w
	                nh = bh / h
	                anns.append(_make_ann(image_id, dataset_id, label, (nx, ny, nw, nh)))

	        img_rec = {
	            "id": image_id,
	            "filename": img_path.name,
	            "rel_path": str(img_path.relative_to(dataset_path)),
	            "width": w,
	            "height": h,
	            "split": split,
	            "ann_count": len(anns),
	        }
	        return img_rec, anns

	class NLPAdapter(DatasetAdapter):
	    """Adapter for text datasets stored as CSV/TSV files (item loading is still a stub)."""

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True when any ``.csv`` or ``.tsv`` file exists under *dataset_path*."""
	        # The patterns need the "*" wildcard: a bare ".csv" would only match
	        # a file whose entire name is ".csv".
	        return any(dataset_path.rglob("*.csv")) or any(dataset_path.rglob("*.tsv"))

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for this format (always nlp)."""
	        return DatasetTask.nlp

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return the class names; not implemented yet, so always an empty list."""
	        # Implementation for NLP class names
	        return []

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield dataset items; not implemented yet.

	        NOTE(review): the stub yields one empty record with no annotations
	        rather than nothing at all - confirm that consumers tolerate a record
	        without the usual keys.
	        """
	        # Implementation for NLP items
	        yield {}, []

	class TabularAdapter(DatasetAdapter):
	    """Placeholder adapter for tabular datasets; it never claims a dataset."""

	    def detect(self, dataset_path: Path) -> bool:
	        """Report no match for every path (detection is not implemented)."""
	        is_tabular = False  # Placeholder
	        return is_tabular

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for this format (always classification)."""
	        task = DatasetTask.classification
	        return task

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return an empty list; class discovery is not implemented."""
	        no_classes: List[str] = []
	        return no_classes

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield a single empty record with an empty annotation list."""
	        empty_record: Dict[str, Any] = {}
	        no_annotations: List[Dict[str, Any]] = []
	        yield empty_record, no_annotations

	class RoboflowClassificationAdapter(DatasetAdapter):
	    """Adapter for classification datasets: ``_annotations.txt`` or class-named folders."""

	    def detect(self, dataset_path: Path) -> bool:
	        """Return True when the directory looks like a classification dataset.

	        That is the case when an ``_annotations.txt`` exists anywhere below
	        *dataset_path*, or when a ``train``/``valid``/``test`` directory has
	        sub-directories and none of them is named ``images`` or ``labels``
	        (compared case-insensitively).
	        """
	        # Check for _annotations.txt or folder-based classification
	        if any(dataset_path.rglob("_annotations.txt")):
	            return True

	        reserved_names = ("images", "labels")
	        for split_name in ("train", "valid", "test"):
	            split_dir = dataset_path / split_name
	            # is_dir() is already False for a path that does not exist.
	            if not split_dir.is_dir():
	                continue
	            child_dirs = [entry for entry in split_dir.iterdir() if entry.is_dir()]
	            if not child_dirs:
	                continue
	            if all(child.name.lower() not in reserved_names for child in child_dirs):
	                return True
	        return False

	    def get_task(self, dataset_path: Path) -> DatasetTask:
	        """Return the task type for this format (always classification)."""
	        return DatasetTask.classification

	    def get_class_names(self, dataset_path: Path) -> List[str]:
	        """Return the sorted, unique ``label`` values of all parsed annotations."""
	        # "dummy" stands in for the dataset id; only the labels are used.
	        labels = {
	            ann["label"]
	            for _path, _image_id, _split, anns in RoboflowTXTParser.iter_dataset(dataset_path, "dummy")
	            for ann in anns
	        }
	        return sorted(labels)

	    def iter_items(self, dataset_id: str, dataset_path: Path) -> Iterator[Tuple[Dict[str, Any], List[Dict[str, Any]]]]:
	        """Yield ``(image_record, annotations)`` for every image the parser reports.

	        The image record is a dict with the keys ``id``, ``filename``,
	        ``rel_path``, ``width``, ``height``, ``split`` and ``ann_count``.
	        """
	        parsed = RoboflowTXTParser.iter_dataset(dataset_path, dataset_id)
	        for rel_path, image_id, split, anns in parsed:
	            image_file = dataset_path / rel_path
	            width, height = _img_dimensions(image_file)
	            record = {
	                "id": image_id,
	                "filename": Path(rel_path).name,
	                "rel_path": str(rel_path),
	                "width": width,
	                "height": height,
	                "split": split,
	                "ann_count": len(anns),
	            }
	            yield record, anns