Initial commit: Wood knot detection model and GUI

This commit is contained in:
2025-12-22 14:11:39 -07:00
commit aed092f09c
20307 changed files with 785367 additions and 0 deletions

248
split_coco_dataset.py Normal file
View File

@ -0,0 +1,248 @@
#!/usr/bin/env python3
"""
Split the Kaggle wood defects COCO dataset into train/valid/test splits.
Creates both COCO format and YOLO format annotations.
Usage:
python split_coco_dataset.py --input bbox_coco_dataset.json --images IMAGE/ --output dataset_split
"""
import argparse
import json
import random
import shutil
from pathlib import Path
from typing import Dict, List
def coco_to_yolo_bbox(bbox: List[float], img_width: int, img_height: int) -> List[float]:
    """
    Convert a COCO bbox to a normalized YOLO bbox.

    COCO stores boxes as [x_min, y_min, width, height] in pixels; YOLO expects
    [x_center, y_center, width, height] expressed as fractions of the image
    size. Boxes that extend past the image edges are clipped to the image so
    the returned values stay within [0, 1].

    Args:
        bbox: COCO format [x_min, y_min, width, height]
        img_width: Image width in pixels (must be positive)
        img_height: Image height in pixels (must be positive)

    Returns:
        YOLO format [x_center, y_center, width, height] normalized

    Raises:
        ValueError: If img_width or img_height is not positive.
    """
    if img_width <= 0 or img_height <= 0:
        raise ValueError(
            f"Image dimensions must be positive, got {img_width}x{img_height}"
        )

    x_min, y_min, width, height = bbox
    x_max = x_min + width
    y_max = y_min + height

    # Clip only when the box actually leaves the image, so boxes that are
    # already inside keep exactly the values the plain formula produces.
    if x_min < 0 or x_max > img_width:
        x_min = min(max(x_min, 0), img_width)
        x_max = min(max(x_max, 0), img_width)
        width = x_max - x_min
    if y_min < 0 or y_max > img_height:
        y_min = min(max(y_min, 0), img_height)
        y_max = min(max(y_max, 0), img_height)
        height = y_max - y_min

    # Center point, expressed as a fraction of the image size
    x_center = (x_min + width / 2) / img_width
    y_center = (y_min + height / 2) / img_height

    # Box size, expressed as a fraction of the image size
    norm_width = width / img_width
    norm_height = height / img_height

    return [x_center, y_center, norm_width, norm_height]
def split_coco_dataset(
    input_json: Path,
    images_dir: Path,
    output_dir: Path,
    train_split: float = 0.8,
    valid_split: float = 0.1,
    seed: int = 42
):
    """
    Split a COCO dataset into train/valid/test subsets and write them to disk.

    For every split this writes ``<output_dir>/<split>/_annotations.coco.json``,
    copies the split's images into ``<output_dir>/<split>/`` and writes one YOLO
    label file per copied image into ``<output_dir>/<split>/labels/``. Finally a
    ``data.yaml`` listing the split folders and class names is written to
    ``output_dir``. The test split receives every image left over after the
    train and valid splits are taken.

    Images listed in the JSON but missing from ``images_dir`` are reported with
    a warning; they are not copied and get no label file, but they remain in
    the split's COCO JSON.

    Args:
        input_json: Path to input COCO JSON file
        images_dir: Directory containing all images
        output_dir: Output directory for splits
        train_split: Fraction for training (default 0.8)
        valid_split: Fraction for validation (default 0.1)
        seed: Random seed for reproducibility

    Raises:
        ValueError: If a split fraction is outside [0, 1] or the two fractions
            add up to more than 1.
    """
    # Reject fractions that would produce a negative test split or overlapping
    # slices before touching the filesystem.
    for label, fraction in (("train_split", train_split), ("valid_split", valid_split)):
        if not 0.0 <= fraction <= 1.0:
            raise ValueError(f"{label} must be between 0 and 1, got {fraction}")
    # Small tolerance so float rounding in the sum cannot reject a valid pair.
    if train_split + valid_split > 1.0 + 1e-9:
        raise ValueError(
            "train_split + valid_split must not exceed 1, "
            f"got {train_split} + {valid_split}"
        )

    # Load COCO data
    with input_json.open('r', encoding='utf-8') as f:
        data = json.load(f)
    images = data['images']
    annotations = data['annotations']
    categories = data['categories']

    # Shuffle with a dedicated generator: a Random instance seeded with `seed`
    # yields the same order as seeding the module-level generator, without
    # resetting the global random state for the rest of the process.
    random.Random(seed).shuffle(images)

    # Calculate split sizes; the test split takes the remainder
    n_images = len(images)
    n_train = int(n_images * train_split)
    n_valid = int(n_images * valid_split)
    n_test = n_images - n_train - n_valid
    print(f"Total images: {n_images}")
    print(f"Train: {n_train}, Valid: {n_valid}, Test: {n_test}")

    splits = {
        'train': images[:n_train],
        'valid': images[n_train:n_train + n_valid],
        'test': images[n_train + n_valid:]
    }

    output_dir.mkdir(parents=True, exist_ok=True)

    # YOLO class indices are the 0-based positions in the categories list
    category_id_to_idx = {cat['id']: idx for idx, cat in enumerate(categories)}

    # Optional COCO top-level sections are carried into every split unchanged
    passthrough = {key: data[key] for key in ('info', 'licenses') if key in data}

    for split_name, split_images in splits.items():
        split_dir = output_dir / split_name
        # NOTE(review): images land directly in the split folder while labels
        # go to its labels/ subfolder - confirm the YOLO loader in use resolves
        # labels from this layout.
        labels_dir = split_dir / 'labels'
        labels_dir.mkdir(parents=True, exist_ok=True)

        # Keep only the annotations that belong to images of this split
        split_image_ids = {img['id'] for img in split_images}
        split_annotations = [
            ann for ann in annotations
            if ann['image_id'] in split_image_ids
        ]

        # Save COCO JSON
        split_data = {
            **passthrough,
            'images': split_images,
            'annotations': split_annotations,
            'categories': categories
        }
        json_path = split_dir / '_annotations.coco.json'
        with json_path.open('w', encoding='utf-8') as f:
            json.dump(split_data, f, indent=2)

        # Group annotations by image_id for YOLO format
        annotations_by_image: Dict[int, List] = {}
        for ann in split_annotations:
            annotations_by_image.setdefault(ann['image_id'], []).append(ann)

        # Copy images and create YOLO labels
        copied = 0
        for img in split_images:
            src_path = images_dir / img['file_name']
            if not src_path.exists():
                print(f"Warning: {src_path} not found")
                continue

            dst_path = split_dir / img['file_name']
            # file_name may contain a subdirectory; make sure it exists
            dst_path.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(src_path, dst_path)
            copied += 1

            # One line per box: class x_center y_center width height.
            # Images without annotations get an empty label file.
            label_lines = []
            for ann in annotations_by_image.get(img['id'], []):
                yolo_bbox = coco_to_yolo_bbox(
                    ann['bbox'],
                    img['width'],
                    img['height']
                )
                cat_idx = category_id_to_idx[ann['category_id']]
                label_lines.append(
                    f"{cat_idx} {yolo_bbox[0]:.6f} {yolo_bbox[1]:.6f} "
                    f"{yolo_bbox[2]:.6f} {yolo_bbox[3]:.6f}\n"
                )
            label_file = labels_dir / f"{Path(img['file_name']).stem}.txt"
            with label_file.open('w', encoding='utf-8') as f:
                f.writelines(label_lines)

        print(f"{split_name}: {len(split_images)} images, {len(split_annotations)} annotations, {copied} copied")

    # Create data.yaml for YOLO training
    data_yaml_path = output_dir / 'data.yaml'
    yaml_lines = [
        "# YOLO dataset configuration",
        f"path: {output_dir.absolute()} # dataset root dir",
        "train: train # train images (relative to 'path')",
        "val: valid # val images (relative to 'path')",
        "test: test # test images (relative to 'path')",
        "# Classes",
        "names:",
    ]
    yaml_lines.extend(f" {idx}: {cat['name']}" for idx, cat in enumerate(categories))
    with data_yaml_path.open('w', encoding='utf-8') as f:
        f.write("\n".join(yaml_lines) + "\n")

    print(f"\nDataset split complete! Saved to: {output_dir}")
    print(f"Created YOLO format labels in {output_dir}/{{train,valid,test}}/labels/")
    print(f"Created data.yaml at {data_yaml_path}")
def main():
    """Parse the command-line options and run the dataset split with them."""
    parser = argparse.ArgumentParser(description="Split COCO dataset into train/valid/test")

    # One row per option: (flag, converter, default, help text).
    # argparse runs string defaults through the converter, so the path
    # defaults reach split_coco_dataset as Path objects.
    option_table = (
        ("--input", Path, "bbox_coco_dataset.json", "Input COCO JSON file"),
        ("--images", Path, "IMAGE", "Directory containing images"),
        ("--output", Path, "dataset_split", "Output directory for splits"),
        ("--train-split", float, 0.8, "Training split fraction"),
        ("--valid-split", float, 0.1, "Validation split fraction"),
        ("--seed", int, 42, "Random seed"),
    )
    for flag, converter, fallback, description in option_table:
        parser.add_argument(flag, type=converter, default=fallback, help=description)

    options = parser.parse_args()

    split_coco_dataset(
        input_json=options.input,
        images_dir=options.images,
        output_dir=options.output,
        train_split=options.train_split,
        valid_split=options.valid_split,
        seed=options.seed,
    )


if __name__ == "__main__":
    main()