# Source: saw_mill_knot_detection/validate_coco_dataset.py
# (Python, 90 lines, 2.4 KiB; repository file-viewer header kept as a comment)

from __future__ import annotations
import argparse
import json
from pathlib import Path
# Names of the split sub-directories that main() expects under the dataset root;
# each one is validated in this order.
SPLITS = ("train", "valid", "test")
def _load_json(path: Path) -> dict:
    """Read the UTF-8 encoded JSON file at ``path`` and return the decoded value.

    Args:
        path: Location of the JSON document to read.

    Returns:
        The decoded JSON content. The annotation says ``dict``, but the type of
        the top-level value is not checked here.

    Raises:
        OSError: If the file cannot be opened or read.
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)
def _validate_split(split_dir: Path, *, max_images: int = 5000) -> list[str]:
    """Validate the COCO annotation file of a single split directory.

    Checks that ``_annotations.coco.json`` exists in ``split_dir``, that it is a
    JSON object containing the keys ``images``, ``annotations`` and
    ``categories``, that ``images`` and ``categories`` are non-empty lists, and
    that the image files referenced by ``file_name`` exist in ``split_dir``.

    Malformed input (unreadable/invalid JSON, a non-object top level, a
    non-list ``images`` value, non-dict image entries) is reported through the
    returned error list instead of raising.

    Args:
        split_dir: Directory holding the annotation file and the images.
        max_images: Upper bound on how many entries of ``images`` have their
            files checked on disk (only the first ``max_images`` are looked at).

    Returns:
        Human-readable error messages; an empty list means no problem was found.
    """
    ann_path = split_dir / "_annotations.coco.json"
    if not ann_path.exists():
        return [f"Missing {ann_path}"]

    # A corrupt or unreadable annotation file is a validation failure, not a
    # crash: JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
    try:
        data = _load_json(ann_path)
    except (OSError, ValueError) as exc:
        return [f"{ann_path}: could not be read as JSON ({exc})"]

    if not isinstance(data, dict):
        return [f"{ann_path}: top-level JSON value must be an object"]

    errors: list[str] = []
    for key in ("images", "annotations", "categories"):
        if key not in data:
            errors.append(f"{ann_path}: missing key '{key}'")

    images = data.get("images", [])
    categories = data.get("categories", [])
    if not isinstance(images, list) or not images:
        errors.append(f"{ann_path}: 'images' must be a non-empty list")
        # Nothing usable to iterate; also prevents slicing a non-list below.
        images = []
    if not isinstance(categories, list) or not categories:
        errors.append(f"{ann_path}: 'categories' must be a non-empty list")

    # Verify referenced image files exist
    checked = 0
    missing_files = 0
    for img in images[:max_images]:
        file_name = img.get("file_name") if isinstance(img, dict) else None
        # Entries without a usable file name are skipped, not counted.
        if not file_name or not isinstance(file_name, str):
            continue
        checked += 1
        if not (split_dir / file_name).exists():
            missing_files += 1

    if missing_files:
        errors.append(
            f"{ann_path}: {missing_files}/{checked} referenced image files are missing in {split_dir}"
        )
    return errors
def main() -> int:
parser = argparse.ArgumentParser(description="Validate COCO dataset structure for RF-DETR.")
parser.add_argument(
"--dataset-dir",
type=Path,
required=True,
help="Path to dataset root containing train/ valid/ test/",
)
args = parser.parse_args()
dataset_dir: Path = args.dataset_dir
if not dataset_dir.exists():
raise SystemExit(f"Dataset dir not found: {dataset_dir}")
all_errors: list[str] = []
for split in SPLITS:
split_dir = dataset_dir / split
if not split_dir.exists():
all_errors.append(f"Missing split directory: {split_dir}")
continue
all_errors.extend(_validate_split(split_dir))
if all_errors:
print("Dataset validation: FAILED")
for e in all_errors:
print(f"- {e}")
return 2
print("Dataset validation: OK")
return 0
# Script entry point: run the validator and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)