ConvNeXt DEIM and DEIM D-FINE YAMLs for possible future trainings

This commit is contained in:
esat 2026-04-21 16:42:11 +03:00
parent 296b50bf73
commit 10eda52af9
3 changed files with 156 additions and 1 deletions

View file

@ -0,0 +1,81 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# Ultralytics RT-DETR DEIM config derived from `deim_dinov3s_sta_l4_l.yaml`.
# This variant keeps the DEIM-L hybrid encoder/decoder and loss settings, but
# swaps the semantic backbone to timm `convnext_small.dinov3_lvd1689m`.
# ConvNeXt provides native P3/P4/P5 features at strides 8/16/32, so the STA
# detail branch used for single-scale ViT backbones is removed here.
# Model docs: https://docs.ultralytics.com/models/rtdetr
# Task docs: https://docs.ultralytics.com/tasks/detect
# Parameters
nc: 80 # number of classes
backbone:
# [from, repeats, module, args]
- [-1, 1, nn.Identity, []] # 0 image anchor
# Semantic path (timm ConvNeXt-S DINOv3).
# Timm args: [out_ch, model_name, pretrained, out_indices, split]
# Selected stages map naturally to P3/P4/P5 with channels 192/384/768.
- [0, 1, Timm, [None, convnext_small.dinov3_lvd1689m, True, [1, 2, 3], True]] # 1
- [1, 1, Index, [192, 1]] # 2 semantic P3
- [1, 1, Index, [384, 2]] # 3 semantic P4
- [1, 1, Index, [768, 3]] # 4 semantic P5
# Project native ConvNeXt pyramid features to the DEIM-L hidden_dim=224.
- [2, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 5 P3 proj
- [3, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 6 P4 proj
- [4, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 7 P5 proj
head:
# DEIMv2-L HybridEncoder head: input_proj + AIFI + FPN/PAN.
# Input projections remain identity because the fused backbone outputs are
# already projected to hidden_dim=224 above.
- [7, 1, nn.Identity, []] # 8 input_proj.2
- [-1, 1, AIFI, [896, 8]]
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 10 Y5, lateral_convs.0
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [6, 1, nn.Identity, []] # 12 input_proj.1
- [[-2, -1], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 14 fpn_blocks.0
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 15 Y4, lateral_convs.1
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [5, 1, nn.Identity, []] # 17 input_proj.0
- [[-2, -1], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 19 X3, fpn_blocks.1
- [-1, 1, SCDown, [224, 3, 2, False]] # 20 downsample_convs.0
- [[-1, 15], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 22 F4, pan_blocks.0
- [-1, 1, SCDown, [224, 3, 2, False]] # 23 downsample_convs.1
- [[-1, 10], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 25 F5, pan_blocks.1
# DeimDecoder (DEIMv2-L defaults explicitly set in YAML)
# Args: [nc, hd, nq, ndp, nh, ndl, d_ffn, dropout, act, eval_idx, nd, label_noise_ratio, box_noise_scale,
# learnt_init_query, enable_cuda_acceleration, one_to_many_groups, dab_sine_embedding,
# efficient_msdeformable_attn, query_select_method, reg_max, reg_scale, layer_scale, mlp_act,
# o2m_topk_mode, use_gateway, share_bbox_head, share_score_head]
- [[19, 22, 25], 1, DeimDecoder, [nc, 224, 300, [3, 6, 3], 8, 4, 1792, 0.0, "silu", -1, 100, 0.5, 1.0, False, False, 0, False, False, "default", 32, 4.0, 1.0, "silu", "unshared", True, False, False]]
loss:
gamma: 1.5
alpha: 0.75
use_fl: False
use_vfl: False
use_mal: True
use_union_set: True
reg_max: 32
loss_gain: {class: 1, bbox: 5, giou: 2, fgl: 0.15, ddf: 1.5}
matcher:
cost_gain: {class: 2, bbox: 5, giou: 2}
use_fl: True
alpha: 0.25
gamma: 2.0
change_matcher: True
iou_order_alpha: 4.0
matcher_change_epoch: 50

View file

@ -0,0 +1,74 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# Ultralytics RT-DETR DEIM config based on `deim_dinov3s_sta_l4_l.yaml`.
# This variant keeps the same DINOv3+STA backbone and DEIMv2-L hybrid encoder,
# but swaps the final decoder to DFineDecoder.
# Model docs: https://docs.ultralytics.com/models/rtdetr
# Task docs: https://docs.ultralytics.com/tasks/detect
# Parameters
nc: 80 # number of classes
backbone:
# [from, repeats, module, args]
# DEIMv2 DINOv3+STA backbone wrapper.
# Parser args: [out_ch, name, pretrained, interaction_indexes, finetune, patch_size, use_sta, conv_inplane, hidden_dim, split]
# If pretrained=True, loader uses official DINOv3 torch.hub URL by model name.
# For access-gated links, set DEIMV2_DINOV3_URL to your granted direct checkpoint URL.
- [-1, 1, DEIMDINOv3STAs, [224, dinov3_vits16, True, [5, 8, 11], True, 16, True, 32, 224, True]] # 0
- [0, 1, Index, [224, 1]] # 1 P3
- [0, 1, Index, [224, 2]] # 2 P4
- [0, 1, Index, [224, 3]] # 3 P5
head:
# DEIMv2-L HybridEncoder head: input_proj + AIFI + FPN/PAN.
# Upstream DEIMv2-L uses hidden_dim=224, dim_feedforward=896, expansion=1.0, depth_mult=1.0.
# That maps here to RepNCSPELAN5 args [c2=224, c3=448, c4=112, n=3].
# Upstream input_proj layers are identities because in_channels == hidden_dim == 224.
- [3, 1, nn.Identity, []] # 4 input_proj.2
- [-1, 1, AIFI, [896, 8]]
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 6 Y5, lateral_convs.0
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [2, 1, nn.Identity, []] # 8 input_proj.1
- [[-2, -1], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 10 fpn_blocks.0
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 11 Y4, lateral_convs.1
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [1, 1, nn.Identity, []] # 13 input_proj.0
- [[-2, -1], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 15 X3, fpn_blocks.1
- [-1, 1, SCDown, [224, 3, 2, False]] # 16 downsample_convs.0
- [[-1, 11], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 18 F4, pan_blocks.0
- [-1, 1, SCDown, [224, 3, 2, False]] # 19 downsample_convs.1
- [[-1, 6], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 21 F5, pan_blocks.1
# DFineDecoder with the same L-sized width/depth as `deim_dinov3s_sta_l4_l.yaml`.
# Args: [nc, hd, nq, ndp, nh, ndl, d_ffn, dropout, act, eval_idx, nd, label_noise_ratio, box_noise_scale,
# learnt_init_query, enable_cuda_acceleration, one_to_many_groups, dab_sine_embedding,
# efficient_msdeformable_attn, query_select_method, reg_max, reg_scale, layer_scale, mlp_act,
# o2m_topk_mode]
- [[15, 18, 21], 1, DFineDecoder, [nc, 224, 300, [3, 6, 3], 8, 4, 1792, 0.0, "silu", -1, 100, 0.5, 1.0, False, False, 0, False, False, "default", 32, 4.0, 1.0, "silu", "unshared"]]
loss:
gamma: 1.5
alpha: 0.75
use_fl: False
use_vfl: False
use_mal: True
use_union_set: True
reg_max: 32
loss_gain: {class: 1, bbox: 5, giou: 2, fgl: 0.15, ddf: 1.5}
matcher:
cost_gain: {class: 2, bbox: 5, giou: 2}
use_fl: True
alpha: 0.25
gamma: 2.0
change_matcher: True
iou_order_alpha: 4.0
matcher_change_epoch: 50

View file

@ -75,7 +75,7 @@ class YOLO(Model):
# Continue with default YOLO initialization
super().__init__(model=model, task=task, verbose=verbose)
head_name = self.model.model[-1]._get_name() if hasattr(self.model, "model") else ""
if head_name == "DeimDecoder":
if head_name in {"DeimDecoder", "DFineDecoder"}:
from ultralytics import RTDETRDEIM
new_instance = RTDETRDEIM(self)