RT-DETR object detection

Code available here.

This tutorial fine-tunes RT-DETRv2 on a custom COCO-format dataset from HuggingFace. The pipeline downloads and splits the data, fine-tunes the detector with live training charts in Flyte reports, evaluates COCO mAP on a validation split, and renders a side-by-side inference demo with ground-truth and predicted bounding boxes.

Flyte highlights:

Cached dataset preparation so re-runs skip the HuggingFace download.
Live training reports with loss curves and optional periodic mAP checkpoints.
GPU evaluation and demo tasks that stream annotated images into the UI.

Define the task environments

detr_object_detection.py
                
                    
                
            
                
            
# One container image shared by both task environments. It is built from this
# script via `from_uv_script`, i.e. from the inline `# /// script` dependency
# block declared in this same file.
# NOTE(review): `pre=True` presumably permits pre-release packages — confirm
# against the flyte.Image documentation.
main_img = flyte.Image.from_uv_script(
    __file__,
    name="detr-object-detection",
    pre=True,
)

# Resource requests, named so the two environments are easy to compare.
_GPU_RESOURCES = flyte.Resources(cpu=4, memory="24Gi", gpu=1)
_CPU_RESOURCES = flyte.Resources(cpu=2, memory="6Gi")

# Environment for tasks that request a GPU.
gpu_env = flyte.TaskEnvironment(
    name="detr-object-detection-gpu",
    resources=_GPU_RESOURCES,
    image=main_img,
)

# Lighter CPU-only environment. It declares `gpu_env` as a dependency
# (presumably so both environments are deployed together — confirm).
cpu_env = flyte.TaskEnvironment(
    name="detr-object-detection-cpu",
    resources=_CPU_RESOURCES,
    image=main_img,
    depends_on=[gpu_env],
)

        
    
# /// script
# requires-python = ">=3.12"
# dependencies = [
#    "flyte>=2.4.0",
#    "torch>=2.9.0",
#    "transformers>=4.49.0",
#    "albumentations>=1.4.0",
#    "torchmetrics>=1.4.0",
#    ...
# ]
# ///

Orchestrate the pipeline

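The `pipeline` task runs four stages in order — prepare the data, fine-tune RT-DETRv2, evaluate COCO mAP, and render an inference demo — and refreshes its own report with a progress indicator before each stage. It returns the fine-tuned model directory together with a JSON summary of the metrics and demo output.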

detr_object_detection.py
                
                    
                
            
                
            
@cpu_env.task(report=True)
async def pipeline(
    model_name: str = "PekingU/rtdetr_v2_r18vd",
    dataset_repo: str = "sagecodes/union_flyte_swag_object_detection",
    annotations_path: str = "swag/train.json",
    images_subdir: str = "swag/images",
    epochs: int = 30,
    lr: float = 5e-5,
    batch_size: int = 4,
    val_fraction: float = 0.2,
    threshold: float = 0.5,
    demo_images: int = 8,
    eval_every_n_epochs: int | None = None,
) -> tuple[flyte.io.Dir, str]:
    """
    End-to-end RT-DETRv2 fine-tuning pipeline.

    Runs four stages in order, replacing this task's report with a progress
    indicator before each one and with a summary card at the end:

    1. Download COCO dataset from HuggingFace and split train/val
    2. Fine-tune RT-DETRv2 on the train split
    3. Evaluate: COCO mAP comparison (base vs fine-tuned)
    4. Inference demo: render bounding boxes on val images

    Args:
        model_name: Model identifier forwarded to ``train``; also shown in
            the report (HTML-escaped).
        dataset_repo: Dataset repository forwarded to ``prepare_data``; also
            shown in the report (HTML-escaped).
        annotations_path: Annotation file path forwarded to ``prepare_data``.
        images_subdir: Image sub-directory forwarded to ``prepare_data``.
        epochs: Forwarded to ``train``.
        lr: Forwarded to ``train``.
        batch_size: Forwarded to ``train``.
        val_fraction: Forwarded to ``prepare_data``.
        threshold: Forwarded to both ``evaluate`` and ``inference_demo``.
        demo_images: Forwarded to ``inference_demo``.
        eval_every_n_epochs: Forwarded to ``train``; ``None`` by default.

    Returns:
        A tuple of the fine-tuned model directory returned by ``train`` and a
        JSON string of the form ``{"metrics": ..., "demo": ...}`` built from
        the outputs of ``evaluate`` and ``inference_demo``.
    """
    # Local import keeps this block self-contained; only needed for escaping.
    import html

    log.info(f"Pipeline: {model_name} | dataset={dataset_repo}")

    # model_name / dataset_repo are caller-supplied task inputs, so escape
    # them once before they are interpolated into report HTML.
    safe_model = html.escape(model_name)
    safe_dataset = html.escape(dataset_repo)

    stage_titles = ("Preparing Data", "Fine-tuning", "Evaluating", "Inference Demo")
    done_icon = '<span style="color:#06d6a0;">&#10003;</span>'
    active_icon = '<span style="color:#e94560;">&#9679;</span>'
    pending_icon = '<span style="color:#adb5bd;">&#9675;</span>'

    def _pipeline_progress(step: int, label: str) -> str:
        """Render the report header with a 1-based stage indicator for ``step``."""
        markers = []
        for position, title in enumerate(stage_titles, start=1):
            if position < step:
                icon = done_icon
            elif position == step:
                icon = active_icon
            else:
                icon = pending_icon
            markers.append(f"<span style='margin:0 8px;'>{icon} {title}</span>")
        dots = "".join(markers)
        return f"""
        <h2>RT-DETRv2 Object Detection Pipeline</h2>
        <p><b>Model:</b> {safe_model} | <b>Dataset:</b> {safe_dataset}</p>
        <div class="card" style="text-align:center;">{dots}</div>
        <p>{label}</p>
        """

    # Stage 1: data preparation.
    await flyte.report.replace.aio(
        _wrap_report(_pipeline_progress(1, "Downloading and splitting dataset...")),
        do_flush=True,
    )

    data_dir = await prepare_data(
        dataset_repo=dataset_repo,
        annotations_path=annotations_path,
        images_subdir=images_subdir,
        val_fraction=val_fraction,
    )

    # Stage 2: fine-tuning.
    await flyte.report.replace.aio(
        _wrap_report(_pipeline_progress(2, "Fine-tuning model...")),
        do_flush=True,
    )

    finetuned_dir = await train(
        model_name, data_dir, epochs, lr, batch_size,
        eval_every_n_epochs=eval_every_n_epochs,
    )

    # Stage 3: evaluation. `evaluate` returns JSON text; keep the raw string
    # as well because `inference_demo` takes it verbatim.
    await flyte.report.replace.aio(
        _wrap_report(_pipeline_progress(3, "Running COCO mAP evaluation...")),
        do_flush=True,
    )

    metrics_json = await evaluate(finetuned_dir, data_dir, threshold)
    metrics = json.loads(metrics_json)

    # Stage 4: inference demo.
    await flyte.report.replace.aio(
        _wrap_report(_pipeline_progress(4, "Rendering bounding box demo...")),
        do_flush=True,
    )

    demo_json = await inference_demo(
        finetuned_dir, data_dir, threshold, demo_images,
        metrics_json=metrics_json,
    )

    # Missing metric keys fall back to 0 so the summary card still renders.
    ft_map = metrics["finetuned"].get("map", 0)
    ft_map50 = metrics["finetuned"].get("map_50", 0)

    final_html = f"""
    <h2>Pipeline Complete</h2>
    <h3>{safe_model}</h3>
    <div class="stat-grid">
      <div class="stat"><div class="value">{metrics['num_val_images']}</div><div class="label">Val Images</div></div>
      <div class="stat"><div class="value highlight">{ft_map:.3f}</div><div class="label">mAP</div></div>
      <div class="stat"><div class="value highlight">{ft_map50:.3f}</div><div class="label">mAP@50</div></div>
    </div>
    <div class="card">
      <b>Configuration:</b> {epochs} epochs | LR {lr} | Batch size {batch_size} |
      Val fraction {val_fraction} | Threshold {threshold}
    </div>
    """

    await flyte.report.replace.aio(_wrap_report(final_html), do_flush=True)

    log.info(f"Pipeline complete. Fine-tuned mAP: {ft_map:.3f}")
    return finetuned_dir, json.dumps({"metrics": metrics, "demo": json.loads(demo_json)})

Run the workflow

Change into the example directory and run the script:

        
cd v2/tutorials/detr_object_detection
uv run --script detr_object_detection.py

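Quick smoke test with one epoch and a smaller batch size (as written, `flyte run` submits to your configured backend; add the `--local` flag if you want it to execute on your own machine):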

flyte run detr_object_detection.py pipeline --epochs 1 --batch_size 2

This workflow needs a GPU. Check the train, evaluate, and inference_demo task reports for charts and annotated images.