from __future__ import annotations

import argparse
import json
from pathlib import Path

SPLITS = ("train", "valid", "test")
def _load_json(path: Path) -> dict:
|
|
with path.open("r", encoding="utf-8") as f:
|
|
return json.load(f)
|
|
|
|
|
|
def _validate_split(split_dir: Path) -> list[str]:
    """Validate one COCO split directory and collect error messages.

    Checks that ``_annotations.coco.json`` exists and parses, that the
    required top-level keys are present, that ``images``/``categories``
    are non-empty lists, and that referenced image files exist on disk
    (capped at the first 5000 entries to keep validation fast).

    Args:
        split_dir: Directory expected to contain the annotation file and
            the image files it references.

    Returns:
        Human-readable error strings; empty list when the split is valid.
    """
    ann_path = split_dir / "_annotations.coco.json"
    if not ann_path.exists():
        return [f"Missing {ann_path}"]

    # A corrupt or unreadable annotation file is a validation failure,
    # not a reason to crash the whole validator.
    try:
        data = _load_json(ann_path)
    except (json.JSONDecodeError, OSError) as exc:
        return [f"{ann_path}: failed to parse ({exc})"]

    errors: list[str] = []
    for key in ("images", "annotations", "categories"):
        if key not in data:
            errors.append(f"{ann_path}: missing key '{key}'")

    images = data.get("images", [])
    categories = data.get("categories", [])

    if not isinstance(images, list) or not images:
        errors.append(f"{ann_path}: 'images' must be a non-empty list")

    if not isinstance(categories, list) or not categories:
        errors.append(f"{ann_path}: 'categories' must be a non-empty list")

    # Verify referenced image files exist. Guard the iteration: the previous
    # version sliced `images` even when it was not a list, which raised
    # TypeError instead of reporting the shape error collected above.
    missing_files = 0
    checked = 0
    if isinstance(images, list):
        for img in images[:5000]:
            if not isinstance(img, dict):
                # Malformed entry; skip — the shape problem is already reported.
                continue
            file_name = img.get("file_name")
            if not file_name:
                continue
            checked += 1
            if not (split_dir / file_name).exists():
                missing_files += 1

    if checked and missing_files:
        errors.append(
            f"{ann_path}: {missing_files}/{checked} referenced image files are missing in {split_dir}"
        )

    return errors
def main() -> int:
    """Entry point: validate every expected split under --dataset-dir.

    Returns:
        0 when the dataset passes validation, 2 when any errors were found.

    Raises:
        SystemExit: if the dataset root directory does not exist.
    """
    parser = argparse.ArgumentParser(description="Validate COCO dataset structure for RF-DETR.")
    parser.add_argument(
        "--dataset-dir",
        type=Path,
        required=True,
        help="Path to dataset root containing train/ valid/ test/",
    )
    args = parser.parse_args()

    dataset_dir: Path = args.dataset_dir
    if not dataset_dir.exists():
        raise SystemExit(f"Dataset dir not found: {dataset_dir}")

    all_errors: list[str] = []
    for split in SPLITS:
        split_dir = dataset_dir / split
        if split_dir.exists():
            all_errors.extend(_validate_split(split_dir))
        else:
            all_errors.append(f"Missing split directory: {split_dir}")

    if not all_errors:
        print("Dataset validation: OK")
        return 0

    print("Dataset validation: FAILED")
    for err in all_errors:
        print(f"- {err}")
    return 2
if __name__ == "__main__":
    # Propagate main()'s integer return value to the shell as the exit code.
    raise SystemExit(main())