mirror of
https://github.com/ultralytics/ultralytics
synced 2026-04-21 14:07:18 +00:00
convnext deim and deim Dfine yamls for possible future trainings
This commit is contained in:
parent
296b50bf73
commit
10eda52af9
3 changed files with 156 additions and 1 deletions
|
|
@ -0,0 +1,81 @@
|
|||
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
|
||||
|
||||
# Ultralytics RT-DETR DEIM config derived from `deim_dinov3s_sta_l4_l.yaml`.
|
||||
# This variant keeps the DEIM-L hybrid encoder/decoder and loss settings, but
|
||||
# swaps the semantic backbone to timm `convnext_small.dinov3_lvd1689m`.
|
||||
# ConvNeXt provides native P3/P4/P5 features at strides 8/16/32, so the STA
|
||||
# detail branch used for single-scale ViT backbones is removed here.
|
||||
# Model docs: https://docs.ultralytics.com/models/rtdetr
|
||||
# Task docs: https://docs.ultralytics.com/tasks/detect
|
||||
|
||||
# Parameters
|
||||
nc: 80 # number of classes
|
||||
|
||||
backbone:
|
||||
# [from, repeats, module, args]
|
||||
- [-1, 1, nn.Identity, []] # 0 image anchor
|
||||
|
||||
# Semantic path (timm ConvNeXt-S DINOv3).
|
||||
# Timm args: [out_ch, model_name, pretrained, out_indices, split]
|
||||
# Selected stages map naturally to P3/P4/P5 with channels 192/384/768.
|
||||
- [0, 1, Timm, [None, convnext_small.dinov3_lvd1689m, True, [1, 2, 3], True]] # 1
|
||||
- [1, 1, Index, [192, 1]] # 2 semantic P3
|
||||
- [1, 1, Index, [384, 2]] # 3 semantic P4
|
||||
- [1, 1, Index, [768, 3]] # 4 semantic P5
|
||||
|
||||
# Project native ConvNeXt pyramid features to the DEIM-L hidden_dim=224.
|
||||
- [2, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 5 P3 proj
|
||||
- [3, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 6 P4 proj
|
||||
- [4, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 7 P5 proj
|
||||
|
||||
head:
|
||||
# DEIMv2-L HybridEncoder head: input_proj + AIFI + FPN/PAN.
|
||||
# Input projections remain identity because the backbone outputs are
|
||||
# already projected to hidden_dim=224 above.
|
||||
- [7, 1, nn.Identity, []] # 8 input_proj.2
|
||||
- [-1, 1, AIFI, [896, 8]]
|
||||
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 10 Y5, lateral_convs.0
|
||||
|
||||
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
|
||||
- [6, 1, nn.Identity, []] # 12 input_proj.1
|
||||
- [[-2, -1], 1, Add, []]
|
||||
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 14 fpn_blocks.0
|
||||
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 15 Y4, lateral_convs.1
|
||||
|
||||
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
|
||||
- [5, 1, nn.Identity, []] # 17 input_proj.0
|
||||
- [[-2, -1], 1, Add, []]
|
||||
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 19 X3, fpn_blocks.1
|
||||
|
||||
- [-1, 1, SCDown, [224, 3, 2, False]] # 20 downsample_convs.0
|
||||
- [[-1, 15], 1, Add, []]
|
||||
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 22 F4, pan_blocks.0
|
||||
|
||||
- [-1, 1, SCDown, [224, 3, 2, False]] # 23 downsample_convs.1
|
||||
- [[-1, 10], 1, Add, []]
|
||||
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 25 F5, pan_blocks.1
|
||||
|
||||
# DeimDecoder (DEIMv2-L defaults explicitly set in YAML)
|
||||
# Args: [nc, hd, nq, ndp, nh, ndl, d_ffn, dropout, act, eval_idx, nd, label_noise_ratio, box_noise_scale,
|
||||
# learnt_init_query, enable_cuda_acceleration, one_to_many_groups, dab_sine_embedding,
|
||||
# efficient_msdeformable_attn, query_select_method, reg_max, reg_scale, layer_scale, mlp_act,
|
||||
# o2m_topk_mode, use_gateway, share_bbox_head, share_score_head]
|
||||
- [[19, 22, 25], 1, DeimDecoder, [nc, 224, 300, [3, 6, 3], 8, 4, 1792, 0.0, "silu", -1, 100, 0.5, 1.0, False, False, 0, False, False, "default", 32, 4.0, 1.0, "silu", "unshared", True, False, False]]
|
||||
|
||||
loss:
|
||||
gamma: 1.5
|
||||
alpha: 0.75
|
||||
use_fl: False
|
||||
use_vfl: False
|
||||
use_mal: True
|
||||
use_union_set: True
|
||||
reg_max: 32
|
||||
loss_gain: {class: 1, bbox: 5, giou: 2, fgl: 0.15, ddf: 1.5}
|
||||
matcher:
|
||||
cost_gain: {class: 2, bbox: 5, giou: 2}
|
||||
use_fl: True
|
||||
alpha: 0.25
|
||||
gamma: 2.0
|
||||
change_matcher: True
|
||||
iou_order_alpha: 4.0
|
||||
matcher_change_epoch: 50
|
||||
74
ultralytics/cfg/models/deim/deim_dinov3s_sta_l4_l_dfine.yaml
Normal file
74
ultralytics/cfg/models/deim/deim_dinov3s_sta_l4_l_dfine.yaml
Normal file
|
|
@ -0,0 +1,74 @@
|
|||
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
|
||||
|
||||
# Ultralytics RT-DETR DEIM config based on `deim_dinov3s_sta_l4_l.yaml`.
|
||||
# This variant keeps the same DINOv3+STA backbone and DEIMv2-L hybrid encoder,
|
||||
# but swaps the final decoder to DFineDecoder.
|
||||
# Model docs: https://docs.ultralytics.com/models/rtdetr
|
||||
# Task docs: https://docs.ultralytics.com/tasks/detect
|
||||
|
||||
# Parameters
|
||||
nc: 80 # number of classes
|
||||
|
||||
backbone:
|
||||
# [from, repeats, module, args]
|
||||
# DEIMv2 DINOv3+STA backbone wrapper.
|
||||
# Parser args: [out_ch, name, pretrained, interaction_indexes, finetune, patch_size, use_sta, conv_inplane, hidden_dim, split]
|
||||
# If pretrained=True, the loader uses the official DINOv3 torch.hub URL for the given model name.
|
||||
# For access-gated links, set DEIMV2_DINOV3_URL to the direct checkpoint URL you were granted.
|
||||
- [-1, 1, DEIMDINOv3STAs, [224, dinov3_vits16, True, [5, 8, 11], True, 16, True, 32, 224, True]] # 0
|
||||
- [0, 1, Index, [224, 1]] # 1 P3
|
||||
- [0, 1, Index, [224, 2]] # 2 P4
|
||||
- [0, 1, Index, [224, 3]] # 3 P5
|
||||
|
||||
head:
|
||||
# DEIMv2-L HybridEncoder head: input_proj + AIFI + FPN/PAN.
|
||||
# Upstream DeimV2-L uses hidden_dim=224, dim_feedforward=896, expansion=1.0, depth_mult=1.0.
|
||||
# That maps here to RepNCSPELAN5 args [c2=224, c3=448, c4=112, n=3].
|
||||
# Upstream input_proj layers are identities because in_channels == hidden_dim == 224.
|
||||
- [3, 1, nn.Identity, []] # 4 input_proj.2
|
||||
- [-1, 1, AIFI, [896, 8]]
|
||||
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 6 Y5, lateral_convs.0
|
||||
|
||||
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
|
||||
- [2, 1, nn.Identity, []] # 8 input_proj.1
|
||||
- [[-2, -1], 1, Add, []]
|
||||
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 10 fpn_blocks.0
|
||||
- [-1, 1, Conv, [224, 1, 1, None, 1, 1, False]] # 11 Y4, lateral_convs.1
|
||||
|
||||
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
|
||||
- [1, 1, nn.Identity, []] # 13 input_proj.0
|
||||
- [[-2, -1], 1, Add, []]
|
||||
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 15 X3, fpn_blocks.1
|
||||
|
||||
- [-1, 1, SCDown, [224, 3, 2, False]] # 16 downsample_convs.0
|
||||
- [[-1, 11], 1, Add, []]
|
||||
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 18 F4, pan_blocks.0
|
||||
|
||||
- [-1, 1, SCDown, [224, 3, 2, False]] # 19 downsample_convs.1
|
||||
- [[-1, 6], 1, Add, []]
|
||||
- [-1, 1, RepNCSPELAN5, [224, 448, 112, 3]] # 21 F5, pan_blocks.1
|
||||
|
||||
# DFineDecoder with the same L-sized width/depth as `deim_dinov3s_sta_l4_l.yaml`.
|
||||
# Args: [nc, hd, nq, ndp, nh, ndl, d_ffn, dropout, act, eval_idx, nd, label_noise_ratio, box_noise_scale,
|
||||
# learnt_init_query, enable_cuda_acceleration, one_to_many_groups, dab_sine_embedding,
|
||||
# efficient_msdeformable_attn, query_select_method, reg_max, reg_scale, layer_scale, mlp_act,
|
||||
# o2m_topk_mode]
|
||||
- [[15, 18, 21], 1, DFineDecoder, [nc, 224, 300, [3, 6, 3], 8, 4, 1792, 0.0, "silu", -1, 100, 0.5, 1.0, False, False, 0, False, False, "default", 32, 4.0, 1.0, "silu", "unshared"]]
|
||||
|
||||
loss:
|
||||
gamma: 1.5
|
||||
alpha: 0.75
|
||||
use_fl: False
|
||||
use_vfl: False
|
||||
use_mal: True
|
||||
use_union_set: True
|
||||
reg_max: 32
|
||||
loss_gain: {class: 1, bbox: 5, giou: 2, fgl: 0.15, ddf: 1.5}
|
||||
matcher:
|
||||
cost_gain: {class: 2, bbox: 5, giou: 2}
|
||||
use_fl: True
|
||||
alpha: 0.25
|
||||
gamma: 2.0
|
||||
change_matcher: True
|
||||
iou_order_alpha: 4.0
|
||||
matcher_change_epoch: 50
|
||||
|
|
@ -75,7 +75,7 @@ class YOLO(Model):
|
|||
# Continue with default YOLO initialization
|
||||
super().__init__(model=model, task=task, verbose=verbose)
|
||||
head_name = self.model.model[-1]._get_name() if hasattr(self.model, "model") else ""
|
||||
if head_name == "DeimDecoder":
|
||||
if head_name in {"DeimDecoder", "DFineDecoder"}:
|
||||
from ultralytics import RTDETRDEIM
|
||||
|
||||
new_instance = RTDETRDEIM(self)
|
||||
|
|
|
|||
Loading…
Reference in a new issue