# Command-line validator for a COCO-format dataset laid out for RF-DETR.
#
# The dataset root is expected to contain one directory per entry in SPLITS,
# each holding an "_annotations.coco.json" file next to the image files that
# the annotation file references.
from __future__ import annotations

import argparse
import json
from pathlib import Path

SPLITS = ("train", "valid", "test")

# Upper bound on how many image entries per split are checked on disk.
_MAX_IMAGE_CHECKS = 5000


def _load_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at *path* and return the result.

    Raises:
        OSError: if the file cannot be opened or read.
        ValueError: if the content is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
            subclasses of ``ValueError``).
    """
    with path.open("r", encoding="utf-8") as f:
        return json.load(f)


def _validate_split(split_dir: Path) -> list[str]:
    """Validate one split directory and return human-readable problems.

    Checks performed, in order:

    1. ``_annotations.coco.json`` exists in *split_dir* and parses as a JSON
       object.
    2. The keys ``images``, ``annotations`` and ``categories`` are present.
    3. ``images`` and ``categories`` are non-empty lists.
    4. For the first ``_MAX_IMAGE_CHECKS`` image entries, the file named by
       ``file_name`` exists inside *split_dir*.

    Malformed input is always reported as an error string rather than raised,
    so that one broken split does not prevent the other splits from being
    checked.

    Args:
        split_dir: Directory holding the annotation file and the images.

    Returns:
        A list of error messages; empty when the split looks valid.
    """
    ann_path = split_dir / "_annotations.coco.json"
    if not ann_path.exists():
        return [f"Missing {ann_path}"]

    try:
        data = _load_json(ann_path)
    except (OSError, ValueError) as exc:
        return [f"{ann_path}: could not be read as JSON ({exc})"]
    if not isinstance(data, dict):
        # Nothing below is meaningful without a key/value mapping.
        return [f"{ann_path}: top-level JSON value must be an object"]

    errors: list[str] = [
        f"{ann_path}: missing key '{key}'"
        for key in ("images", "annotations", "categories")
        if key not in data
    ]

    images = data.get("images", [])
    categories = data.get("categories", [])
    if not isinstance(images, list) or not images:
        errors.append(f"{ann_path}: 'images' must be a non-empty list")
    if not isinstance(categories, list) or not categories:
        errors.append(f"{ann_path}: 'categories' must be a non-empty list")

    if not isinstance(images, list):
        # Already reported above; a non-list cannot be sliced or iterated as
        # a sequence of image entries.
        return errors

    # Verify referenced image files exist.
    missing_files = 0
    checked = 0
    malformed = 0
    for img in images[:_MAX_IMAGE_CHECKS]:
        if not isinstance(img, dict):
            malformed += 1
            continue
        file_name = img.get("file_name")
        if not file_name:
            # Entries without a file name are skipped, not counted.
            continue
        if not isinstance(file_name, str):
            malformed += 1
            continue
        checked += 1
        if not (split_dir / file_name).exists():
            missing_files += 1

    if malformed:
        errors.append(
            f"{ann_path}: {malformed} image entries are malformed "
            "(expected objects with a string 'file_name')"
        )
    if missing_files:
        errors.append(
            f"{ann_path}: {missing_files}/{checked} referenced image files are missing in {split_dir}"
        )

    return errors


def main(argv: list[str] | None = None) -> int:
    """Run the validator and return a process exit code.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        ``0`` when every split validates, ``2`` when any problem was found.

    Raises:
        SystemExit: if the dataset directory does not exist, or if argparse
            rejects the arguments.
    """
    parser = argparse.ArgumentParser(description="Validate COCO dataset structure for RF-DETR.")
    parser.add_argument(
        "--dataset-dir",
        type=Path,
        required=True,
        help="Path to dataset root containing train/ valid/ test/",
    )
    args = parser.parse_args(argv)

    dataset_dir: Path = args.dataset_dir
    if not dataset_dir.exists():
        raise SystemExit(f"Dataset dir not found: {dataset_dir}")

    all_errors: list[str] = []
    for split in SPLITS:
        split_dir = dataset_dir / split
        if not split_dir.exists():
            all_errors.append(f"Missing split directory: {split_dir}")
            continue
        all_errors.extend(_validate_split(split_dir))

    if all_errors:
        print("Dataset validation: FAILED")
        for e in all_errors:
            print(f"- {e}")
        return 2

    print("Dataset validation: OK")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())