ConvNeXt DEIM and DEIM D-FINE YAMLs for possible future trainings

This commit is contained in:
esat 2026-04-21 16:42:11 +03:00
parent 296b50bf73
commit 10eda52af9
3 changed files with 156 additions and 1 deletions

View file

@ -0,0 +1,81 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# Ultralytics RT-DETR DEIM config derived from `deim_dinov3s_sta_l4_l.yaml`.
# This variant keeps the DEIM-L hybrid encoder/decoder and loss settings, but
# swaps the semantic backbone to timm `convnext_small.dinov3_lvd1689m`.
# ConvNeXt provides native P3/P4/P5 features at strides 8/16/32, so the STA
# detail branch used for single-scale ViT backbones is removed here.
# Model docs: https://docs.ultralytics.com/models/rtdetr
# Task docs: https://docs.ultralytics.com/tasks/detect
# Parameters
nc: 80 # number of classes
backbone:
# [from, repeats, module, args]
- [-1, 1, nn.Identity, []] # 0 image anchor
# Semantic path (timm ConvNeXt-S DINOv3).
# Timm args: [out_ch, model_name, pretrained, out_indices, split]
# Selected stages map naturally to P3/P4/P5 with channels 192/384/768.
- [0, 1, Timm, [None, convnext_small.dinov3_lvd1689m, True, [1, 2, 3], True]] # 1
- [1, 1, Index, [192, 1]] # 2 semantic P3
- [1, 1, Index, [384, 2]] # 3 semantic P4
- [1, 1, Index, [768, 3]] # 4 semantic P5
# Project native ConvNeXt pyramid features to the DEIM-L hidden_dim=224.
- [2, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 5 P3 proj
- [3, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 6 P4 proj
- [4, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 7 P5 proj
head:
# DEIMv2-L HybridEncoder head: input_proj + AIFI + FPN/PAN.
# Input projections remain identity because the fused backbone outputs are
# already projected to hidden_dim=224 above.
- [7, 1, nn.Identity, []] # 8 input_proj.2
- [-1, 1, AIFI, [896, 8]]
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 10 Y5, lateral_convs.0
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [6, 1, nn.Identity, []] # 12 input_proj.1
- [[-2, -1], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 14 fpn_blocks.0
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 15 Y4, lateral_convs.1
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [5, 1, nn.Identity, []] # 17 input_proj.0
- [[-2, -1], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 19 X3, fpn_blocks.1
- [-1, 1, SCDown, [224, 3, 2, False]] # 20 downsample_convs.0
- [[-1, 15], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 22 F4, pan_blocks.0
- [-1, 1, SCDown, [224, 3, 2, False]] # 23 downsample_convs.1
- [[-1, 10], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 25 F5, pan_blocks.1
# DeimDecoder (DEIMv2-L defaults explicitly set in YAML)
# Args: [nc, hd, nq, ndp, nh, ndl, d_ffn, dropout, act, eval_idx, nd, label_noise_ratio, box_noise_scale,
# learnt_init_query, enable_cuda_acceleration, one_to_many_groups, dab_sine_embedding,
# efficient_msdeformable_attn, query_select_method, reg_max, reg_scale, layer_scale, mlp_act,
# o2m_topk_mode, use_gateway, share_bbox_head, share_score_head]
- [[19, 22, 25], 1, DeimDecoder, [nc, 224, 300, [3, 6, 3], 8, 4, 1792, 0.0, "silu", -1, 100, 0.5, 1.0, False, False, 0, False, False, "default", 32, 4.0, 1.0, "silu", "unshared", True, False, False]]
loss:
gamma: 1.5
alpha: 0.75
use_fl: False
use_vfl: False
use_mal: True
use_union_set: True
reg_max: 32
loss_gain: {class: 1, bbox: 5, giou: 2, fgl: 0.15, ddf: 1.5}
matcher:
cost_gain: {class: 2, bbox: 5, giou: 2}
use_fl: True
alpha: 0.25
gamma: 2.0
change_matcher: True
iou_order_alpha: 4.0
matcher_change_epoch: 50

View file

@ -0,0 +1,74 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
# Ultralytics RT-DETR DEIM config based on `deim_dinov3s_sta_l4_l.yaml`.
# This variant keeps the same DINOv3+STA backbone and DEIMv2-L hybrid encoder,
# but swaps the final decoder to DFineDecoder.
# Model docs: https://docs.ultralytics.com/models/rtdetr
# Task docs: https://docs.ultralytics.com/tasks/detect
# Parameters
nc: 80 # number of classes
backbone:
# [from, repeats, module, args]
# DEIMv2 DINOv3+STA backbone wrapper.
# Parser args: [out_ch, name, pretrained, interaction_indexes, finetune, patch_size, use_sta, conv_inplane, hidden_dim, split]
# If pretrained=True, loader uses official DINOv3 torch.hub URL by model name.
# For access-gated links, set DEIMV2_DINOV3_URL to your granted direct checkpoint URL.
- [-1, 1, DEIMDINOv3STAs, [224, dinov3_vits16, True, [5, 8, 11], True, 16, True, 32, 224, True]] # 0
- [0, 1, Index, [224, 1]] # 1 P3
- [0, 1, Index, [224, 2]] # 2 P4
- [0, 1, Index, [224, 3]] # 3 P5
head:
# DEIMv2-L HybridEncoder head: input_proj + AIFI + FPN/PAN.
# Upstream DEIMv2-L uses hidden_dim=224, dim_feedforward=896, expansion=1.0, depth_mult=1.0.
# That maps here to RepNCSPELAN5 args [c2=224, c3=448, c4=112, n=3].
# Upstream input_proj layers are identities because in_channels == hidden_dim == 224.
- [3, 1, nn.Identity, []] # 4 input_proj.2
- [-1, 1, AIFI, [896, 8]]
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 6 Y5, lateral_convs.0
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [2, 1, nn.Identity, []] # 8 input_proj.1
- [[-2, -1], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 10 fpn_blocks.0
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 11 Y4, lateral_convs.1
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [1, 1, nn.Identity, []] # 13 input_proj.0
- [[-2, -1], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 15 X3, fpn_blocks.1
- [-1, 1, SCDown, [224, 3, 2, False]] # 16 downsample_convs.0
- [[-1, 11], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 18 F4, pan_blocks.0
- [-1, 1, SCDown, [224, 3, 2, False]] # 19 downsample_convs.1
- [[-1, 6], 1, Add, []]
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 21 F5, pan_blocks.1
# DFineDecoder with the same L-sized width/depth as `deim_dinov3s_sta_l4_l.yaml`.
# Args: [nc, hd, nq, ndp, nh, ndl, d_ffn, dropout, act, eval_idx, nd, label_noise_ratio, box_noise_scale,
# learnt_init_query, enable_cuda_acceleration, one_to_many_groups, dab_sine_embedding,
# efficient_msdeformable_attn, query_select_method, reg_max, reg_scale, layer_scale, mlp_act,
# o2m_topk_mode]
- [[15, 18, 21], 1, DFineDecoder, [nc, 224, 300, [3, 6, 3], 8, 4, 1792, 0.0, "silu", -1, 100, 0.5, 1.0, False, False, 0, False, False, "default", 32, 4.0, 1.0, "silu", "unshared"]]
loss:
gamma: 1.5
alpha: 0.75
use_fl: False
use_vfl: False
use_mal: True
use_union_set: True
reg_max: 32
loss_gain: {class: 1, bbox: 5, giou: 2, fgl: 0.15, ddf: 1.5}
matcher:
cost_gain: {class: 2, bbox: 5, giou: 2}
use_fl: True
alpha: 0.25
gamma: 2.0
change_matcher: True
iou_order_alpha: 4.0
matcher_change_epoch: 50

View file

@ -75,7 +75,7 @@ class YOLO(Model):
# Continue with default YOLO initialization
super().__init__(model=model, task=task, verbose=verbose)
head_name = self.model.model[-1]._get_name() if hasattr(self.model, "model") else ""
if head_name == "DeimDecoder":
if head_name in {"DeimDecoder", "DFineDecoder"}:
from ultralytics import RTDETRDEIM
new_instance = RTDETRDEIM(self)