From 6ebf893880778c24742abe26c341b8c2ca2a5a56 Mon Sep 17 00:00:00 2001 From: Laughing <61612323+Laughing-q@users.noreply.github.com> Date: Thu, 3 Apr 2025 19:49:04 +0800 Subject: [PATCH] `ultralytics 8.3.101` YOLOE visual prompt inference fix for video sources (#19959) Signed-off-by: Glenn Jocher Co-authored-by: UltralyticsAssistant Co-authored-by: Glenn Jocher --- docs/en/models/yoloe.md | 5 +++++ ultralytics/__init__.py | 2 +- ultralytics/models/yolo/model.py | 16 +++++++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/docs/en/models/yoloe.md b/docs/en/models/yoloe.md index ae528b0e6a..1e07cba54b 100644 --- a/docs/en/models/yoloe.md +++ b/docs/en/models/yoloe.md @@ -160,6 +160,11 @@ Object detection is straightforward with the `predict` method, as illustrated be === "Visual Prompt" + !!! note + + If `source` is a video/stream, the first frame of the video/stream will be automatically used as `refer_image`, or you can directly pass any frame from the video/stream as the `refer_image` argument. 
+ + Prompts in source image: ```python diff --git a/ultralytics/__init__.py b/ultralytics/__init__.py index 844647046c..81eb772883 100644 --- a/ultralytics/__init__.py +++ b/ultralytics/__init__.py @@ -1,6 +1,6 @@ # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license -__version__ = "8.3.100" +__version__ = "8.3.101" import os diff --git a/ultralytics/models/yolo/model.py b/ultralytics/models/yolo/model.py index 67dbcb72cb..8aaff44796 100644 --- a/ultralytics/models/yolo/model.py +++ b/ultralytics/models/yolo/model.py @@ -2,6 +2,7 @@ from pathlib import Path +from ultralytics.data.build import load_inference_source from ultralytics.engine.model import Model from ultralytics.models import yolo from ultralytics.nn.tasks import ( @@ -267,7 +268,14 @@ class YOLOE(Model): f"{len(visual_prompts['cls'])} respectively" ) self.predictor = (predictor or self._smart_load("predictor"))( - overrides={"task": "segment", "mode": "predict", "save": False, "verbose": False}, _callbacks=self.callbacks + overrides={ + "task": self.model.task, + "mode": "predict", + "save": False, + "verbose": refer_image is None, + "batch": 1, + }, + _callbacks=self.callbacks, ) if len(visual_prompts): @@ -281,6 +289,12 @@ class YOLOE(Model): self.predictor.set_prompts(visual_prompts.copy()) self.predictor.setup_model(model=self.model) + + if refer_image is None: + dataset = load_inference_source(source) + if dataset.mode in {"video", "stream"}: + # NOTE: set the first frame as refer image for videos/streams inference + refer_image = next(iter(dataset))[1][0] if refer_image is not None and len(visual_prompts): vpe = self.predictor.get_vpe(refer_image) self.model.set_classes(self.model.names, vpe)