diff --git a/docs/en/models/yolov6.md b/docs/en/models/yolov6.md
index 4d781ed065..5c43599b07 100644
--- a/docs/en/models/yolov6.md
+++ b/docs/en/models/yolov6.md
@@ -11,7 +11,7 @@ keywords: Meituan YOLOv6, object detection, real-time applications, BiC module,
 [Meituan](https://www.meituan.com/) YOLOv6 is a cutting-edge object detector that offers remarkable balance between speed and accuracy, making it a popular choice for real-time applications. This model introduces several notable enhancements on its architecture and training scheme, including the implementation of a Bi-directional Concatenation (BiC) module, an anchor-aided training (AAT) strategy, and an improved [backbone](https://www.ultralytics.com/glossary/backbone) and neck design for state-of-the-art accuracy on the COCO dataset.
 
 ![Meituan YOLOv6](https://github.com/ultralytics/docs/releases/download/0/meituan-yolov6.avif)
-![Model example image](https://github.com/ultralytics/docs/releases/download/0/yolov6-architecture-diagram.avif) **Overview of YOLOv6.** Model architecture diagram showing the redesigned network components and training strategies that have led to significant performance improvements. (a) The neck of YOLOv6 (N and S are shown). Note for M/L, RepBlocks is replaced with CSPStackRep. (b) The structure of a BiC module. (c) A SimCSPSPPF block. ([source](https://arxiv.org/pdf/2301.05586.pdf)).
+![Model example image](https://github.com/ultralytics/docs/releases/download/0/yolov6-architecture-diagram.avif) **Overview of YOLOv6.** Model architecture diagram showing the redesigned network components and training strategies that have led to significant performance improvements. (a) The neck of YOLOv6 (N and S are shown). Note for M/L, RepBlocks is replaced with CSPStackRep. (b) The structure of a BiC module. (c) A SimCSPSPPF block. ([source](https://arxiv.org/pdf/2301.05586)).
 
 ### Key Features
 
diff --git a/ultralytics/data/utils.py b/ultralytics/data/utils.py
index 41628d75df..394aecd8f6 100644
--- a/ultralytics/data/utils.py
+++ b/ultralytics/data/utils.py
@@ -175,13 +175,8 @@ def visualize_image_annotations(image_path, txt_path, label_map):
     adjusted for readability, depending on the background color's luminance.
 
     Args:
-        image_path (str): The path to the image file to annotate, and it can be in formats supported by PIL (e.g., .jpg, .png).
-        txt_path (str): The path to the annotation file in YOLO format, that should contain one line per object with:
-                        - class_id (int): The class index.
-                        - x_center (float): The X center of the bounding box (relative to image width).
-                        - y_center (float): The Y center of the bounding box (relative to image height).
-                        - width (float): The width of the bounding box (relative to image width).
-                        - height (float): The height of the bounding box (relative to image height).
+        image_path (str): The path to the image file to annotate, in any format supported by PIL.
+        txt_path (str): The path to the annotation file in YOLO format, which should contain one line per object.
         label_map (dict): A dictionary that maps class IDs (integers) to class labels (strings).
 
     Examples:
@@ -222,8 +217,8 @@ def polygon2mask(imgsz, polygons, color=1, downsample_ratio=1):
         imgsz (tuple): The size of the image as (height, width).
         polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
                                      N is the number of polygons, and M is the number of points such that M % 2 = 0.
-        color (int, optional): The color value to fill in the polygons on the mask. Defaults to 1.
-        downsample_ratio (int, optional): Factor by which to downsample the mask. Defaults to 1.
+        color (int, optional): The color value to fill in the polygons on the mask.
+        downsample_ratio (int, optional): Factor by which to downsample the mask.
 
     Returns:
         (np.ndarray): A binary mask of the specified image size with the polygons filled in.
@@ -246,7 +241,7 @@ def polygons2masks(imgsz, polygons, color, downsample_ratio=1):
         polygons (list[np.ndarray]): A list of polygons. Each polygon is an array with shape [N, M], where
                                      N is the number of polygons, and M is the number of points such that M % 2 = 0.
         color (int): The color value to fill in the polygons on the masks.
-        downsample_ratio (int, optional): Factor by which to downsample each mask. Defaults to 1.
+        downsample_ratio (int, optional): Factor by which to downsample each mask.
 
     Returns:
         (np.ndarray): A set of binary masks of the specified image size with the polygons filled in.
@@ -281,8 +276,7 @@ def find_dataset_yaml(path: Path) -> Path:
     Find and return the YAML file associated with a Detect, Segment or Pose dataset.
 
     This function searches for a YAML file at the root level of the provided directory first, and if not found, it
-    performs a recursive search. It prefers YAML files that have the same stem as the provided path. An AssertionError
-    is raised if no YAML file is found or if multiple YAML files are found.
+    performs a recursive search. It prefers YAML files that have the same stem as the provided path.
 
     Args:
         path (Path): The directory path to search for the YAML file.
@@ -308,7 +302,7 @@ def check_det_dataset(dataset, autodownload=True):
 
     Args:
         dataset (str): Path to the dataset or dataset descriptor (like a YAML file).
-        autodownload (bool, optional): Whether to automatically download the dataset if not found. Defaults to True.
+        autodownload (bool, optional): Whether to automatically download the dataset if not found.
 
     Returns:
         (dict): Parsed dataset information and paths.
@@ -400,7 +394,7 @@ def check_cls_dataset(dataset, split=""):
 
     Args:
         dataset (str | Path): The name of the dataset.
-        split (str, optional): The split of the dataset. Either 'val', 'test', or ''. Defaults to ''.
+        split (str, optional): The split of the dataset. Either 'val', 'test', or ''.
 
     Returns:
         (dict): A dictionary containing the following keys:
@@ -634,8 +628,8 @@ def compress_one_image(f, f_new=None, max_dim=1920, quality=50):
     Args:
         f (str): The path to the input image file.
         f_new (str, optional): The path to the output image file. If not specified, the input file will be overwritten.
-        max_dim (int, optional): The maximum dimension (width or height) of the output image. Default is 1920 pixels.
-        quality (int, optional): The image compression quality as a percentage. Default is 50%.
+        max_dim (int, optional): The maximum dimension (width or height) of the output image.
+        quality (int, optional): The image compression quality as a percentage.
 
     Examples:
         >>> from pathlib import Path
@@ -664,9 +658,9 @@ def autosplit(path=DATASETS_DIR / "coco8/images", weights=(0.9, 0.1, 0.0), annot
     Automatically split a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt files.
 
     Args:
-        path (Path, optional): Path to images directory. Defaults to DATASETS_DIR / 'coco8/images'.
-        weights (list | tuple, optional): Train, validation, and test split fractions. Defaults to (0.9, 0.1, 0.0).
-        annotated_only (bool, optional): If True, only images with an associated txt file are used. Defaults to False.
+        path (Path, optional): Path to images directory.
+        weights (list | tuple, optional): Train, validation, and test split fractions.
+        annotated_only (bool, optional): If True, only images with an associated txt file are used.
 
     Examples:
         >>> from ultralytics.data.utils import autosplit
diff --git a/ultralytics/engine/exporter.py b/ultralytics/engine/exporter.py
index ac6c704256..1c2586843a 100644
--- a/ultralytics/engine/exporter.py
+++ b/ultralytics/engine/exporter.py
@@ -138,7 +138,7 @@ def validate_args(format, passed_args, valid_args):
     Args:
         format (str): The export format.
         passed_args (Namespace): The arguments used during export.
-        valid_args (dict): List of valid arguments for the format.
+        valid_args (List): List of valid arguments for the format.
 
     Raises:
         AssertionError: If an unsupported argument is used, or if the format lacks supported argument listings.
@@ -219,8 +219,8 @@ class Exporter:
 
         Args:
             cfg (str, optional): Path to a configuration file.
-            overrides (dict, optional): Configuration overrides.
-            _callbacks (dict, optional): Dictionary of callback functions.
+            overrides (Dict, optional): Configuration overrides.
+            _callbacks (Dict, optional): Dictionary of callback functions.
         """
         self.args = get_cfg(cfg, overrides)
         if self.args.format.lower() in {"coreml", "mlmodel"}:  # fix attempt for protobuf<3.20.x errors
@@ -1574,7 +1574,7 @@ class NMSModel(torch.nn.Module):
             x (torch.Tensor): The preprocessed tensor with shape (N, 3, H, W).
 
         Returns:
-            out (torch.Tensor): The post-processed results with shape (N, max_det, 4 + 2 + extra_shape).
+            (torch.Tensor): Post-processed results with shape (N, max_det, 4 + 2 + extra_shape) for batch size N.
         """
         from functools import partial
 
diff --git a/ultralytics/engine/trainer.py b/ultralytics/engine/trainer.py
index 1a2cfd366e..7b51bd8963 100644
--- a/ultralytics/engine/trainer.py
+++ b/ultralytics/engine/trainer.py
@@ -95,7 +95,7 @@ class BaseTrainer:
 
     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
         """
-        Initializes the BaseTrainer class.
+        Initialize the BaseTrainer class.
 
         Args:
             cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
@@ -159,11 +159,11 @@ class BaseTrainer:
             callbacks.add_integration_callbacks(self)
 
     def add_callback(self, event: str, callback):
-        """Appends the given callback."""
+        """Append the given callback to the event's callback list."""
         self.callbacks[event].append(callback)
 
     def set_callback(self, event: str, callback):
-        """Overrides the existing callbacks with the given callback."""
+        """Override the existing callbacks with the given callback for the specified event."""
         self.callbacks[event] = [callback]
 
     def run_callbacks(self, event: str):
@@ -219,7 +219,7 @@ class BaseTrainer:
         self.scheduler = optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=self.lf)
 
     def _setup_ddp(self, world_size):
-        """Initializes and sets the DistributedDataParallel parameters for training."""
+        """Initialize and set the DistributedDataParallel parameters for training."""
         torch.cuda.set_device(RANK)
         self.device = torch.device("cuda", RANK)
         # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
@@ -232,7 +232,7 @@ class BaseTrainer:
         )
 
     def _setup_train(self, world_size):
-        """Builds dataloaders and optimizer on correct rank process."""
+        """Build dataloaders and optimizer on correct rank process."""
         # Model
         self.run_callbacks("on_pretrain_routine_start")
         ckpt = self.setup_model()
@@ -320,7 +320,7 @@ class BaseTrainer:
         self.run_callbacks("on_pretrain_routine_end")
 
     def _do_train(self, world_size=1):
-        """Train completed, evaluate and plot if specified by arguments."""
+        """Train the model with the specified world size."""
         if world_size > 1:
             self._setup_ddp(world_size)
         self._setup_train(world_size)
@@ -480,7 +480,7 @@ class BaseTrainer:
         self.run_callbacks("teardown")
 
     def auto_batch(self, max_num_obj=0):
-        """Get batch size by calculating memory occupation of model."""
+        """Calculate optimal batch size based on model and device memory constraints."""
         return check_train_batch_size(
             model=self.model,
             imgsz=self.args.imgsz,
@@ -490,7 +490,7 @@ class BaseTrainer:
         )  # returns batch size
 
     def _get_memory(self, fraction=False):
-        """Get accelerator memory utilization in GB or fraction."""
+        """Get accelerator memory utilization in GB or as a fraction of total memory."""
         memory, total = 0, 0
         if self.device.type == "mps":
             memory = torch.mps.driver_allocated_memory()
@@ -505,7 +505,7 @@ class BaseTrainer:
         return ((memory / total) if total > 0 else 0) if fraction else (memory / 2**30)
 
     def _clear_memory(self):
-        """Clear accelerator memory on different platforms."""
+        """Clear accelerator memory by calling garbage collector and emptying cache."""
         gc.collect()
         if self.device.type == "mps":
             torch.mps.empty_cache()
@@ -515,7 +515,7 @@ class BaseTrainer:
             torch.cuda.empty_cache()
 
     def read_results_csv(self):
-        """Read results.csv into a dict using pandas."""
+        """Read results.csv into a dictionary using pandas."""
         import pandas as pd  # scope for faster 'import ultralytics'
 
         return pd.read_csv(self.csv).to_dict(orient="list")
@@ -557,9 +557,10 @@ class BaseTrainer:
 
     def get_dataset(self):
         """
-        Get train, val path from data dict if it exists.
+        Get train and validation datasets from data dictionary.
 
-        Returns None if data format is not recognized.
+        Returns:
+            (tuple): A tuple containing the training and validation/test datasets.
         """
         try:
             if self.args.task == "classify":
@@ -583,7 +584,12 @@ class BaseTrainer:
         return data["train"], data.get("val") or data.get("test")
 
     def setup_model(self):
-        """Load/create/download model for any task."""
+        """
+        Load, create, or download model for any task.
+
+        Returns:
+            (dict | None): Checkpoint to resume training from, or None when no checkpoint is available.
+        """
         if isinstance(self.model, torch.nn.Module):  # if model is loaded beforehand. No setup needed
             return
 
@@ -613,9 +619,10 @@ class BaseTrainer:
 
     def validate(self):
         """
-        Runs validation on test set using self.validator.
+        Run validation on test set using self.validator.
 
-        The returned dict is expected to contain "fitness" key.
+        Returns:
+            (tuple): A tuple containing metrics dictionary and fitness score.
         """
         metrics = self.validator(self)
         fitness = metrics.pop("fitness", -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
@@ -649,7 +656,7 @@ class BaseTrainer:
         return {"loss": loss_items} if loss_items is not None else ["loss"]
 
     def set_model_attributes(self):
-        """To set or update model parameters before training."""
+        """Set or update model parameters before training."""
         self.model.names = self.data["names"]
 
     def build_targets(self, preds, targets):
@@ -670,7 +677,7 @@ class BaseTrainer:
         pass
 
     def save_metrics(self, metrics):
-        """Saves training metrics to a CSV file."""
+        """Save training metrics to a CSV file."""
         keys, vals = list(metrics.keys()), list(metrics.values())
         n = len(metrics) + 2  # number of cols
         s = "" if self.csv.exists() else (("%s," * n % tuple(["epoch", "time"] + keys)).rstrip(",") + "\n")  # header
@@ -688,7 +695,7 @@ class BaseTrainer:
         self.plots[path] = {"data": data, "timestamp": time.time()}
 
     def final_eval(self):
-        """Performs final evaluation and validation for object detection YOLO model."""
+        """Perform final evaluation and validation for object detection YOLO model."""
         ckpt = {}
         for f in self.last, self.best:
             if f.exists():
@@ -772,8 +779,7 @@ class BaseTrainer:
 
     def build_optimizer(self, model, name="auto", lr=0.001, momentum=0.9, decay=1e-5, iterations=1e5):
         """
-        Constructs an optimizer for the given model, based on the specified optimizer name, learning rate, momentum,
-        weight decay, and number of iterations.
+        Construct an optimizer for the given model.
 
         Args:
             model (torch.nn.Module): The model for which to build an optimizer.
diff --git a/ultralytics/models/sam/modules/sam.py b/ultralytics/models/sam/modules/sam.py
index 8f5c5b7746..96ef3e2046 100644
--- a/ultralytics/models/sam/modules/sam.py
+++ b/ultralytics/models/sam/modules/sam.py
@@ -176,7 +176,7 @@ class SAM2Model(torch.nn.Module):
         compile_image_encoder: bool = False,
     ):
         """
-        Initializes the SAM2Model for video object segmentation with memory-based tracking.
+        Initialize the SAM2Model for video object segmentation with memory-based tracking.
 
         Args:
             image_encoder (nn.Module): Visual encoder for extracting image features.
@@ -213,9 +213,9 @@ class SAM2Model(torch.nn.Module):
                 the encoder.
             proj_tpos_enc_in_obj_ptrs (bool): Whether to add an extra linear projection layer for temporal positional
                 encoding in object pointers.
-            use_signed_tpos_enc_to_obj_ptrs (bool): whether to use signed distance (instead of unsigned absolute distance)
-                in the temporal positional encoding in the object pointers, only relevant when both `use_obj_ptrs_in_encoder=True`
-                and `add_tpos_enc_to_obj_ptrs=True`.
+            use_signed_tpos_enc_to_obj_ptrs (bool): Whether to use signed distance (instead of unsigned absolute distance)
+                in the temporal positional encoding in the object pointers, only relevant when both
+                `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`.
             only_obj_ptrs_in_the_past_for_eval (bool): Whether to only attend to object pointers in the past
                 during evaluation.
             pred_obj_scores (bool): Whether to predict if there is an object in the frame.
@@ -332,18 +332,18 @@ class SAM2Model(torch.nn.Module):
 
     @property
     def device(self):
-        """Returns the device on which the model's parameters are stored."""
+        """Return the device on which the model's parameters are stored."""
         return next(self.parameters()).device
 
     def forward(self, *args, **kwargs):
-        """Processes image and prompt inputs to generate object masks and scores in video sequences."""
+        """Process image and prompt inputs to generate object masks and scores in video sequences."""
         raise NotImplementedError(
             "Please use the corresponding methods in SAM2VideoPredictor for inference."
             "See notebooks/video_predictor_example.ipynb for an example."
         )
 
     def _build_sam_heads(self):
-        """Builds SAM-style prompt encoder and mask decoder for image segmentation tasks."""
+        """Build SAM-style prompt encoder and mask decoder for image segmentation tasks."""
         self.sam_prompt_embed_dim = self.hidden_dim
         self.sam_image_embedding_size = self.image_size // self.backbone_stride
 
@@ -545,7 +545,7 @@ class SAM2Model(torch.nn.Module):
         )
 
     def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs):
-        """Processes mask inputs directly as output, bypassing SAM encoder/decoder."""
+        """Process mask inputs directly as output, bypassing SAM encoder/decoder."""
         # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid).
         out_scale, out_bias = 20.0, -10.0  # sigmoid(-10.0)=4.5398e-05
         mask_inputs_float = mask_inputs.float()
@@ -592,7 +592,7 @@ class SAM2Model(torch.nn.Module):
         )
 
     def forward_image(self, img_batch: torch.Tensor):
-        """Processes image batch through encoder to extract multi-level features for SAM model."""
+        """Process image batch through encoder to extract multi-level features for SAM model."""
         backbone_out = self.image_encoder(img_batch)
         if self.use_high_res_features_in_sam:
             # precompute projected level 0 and level 1 features in SAM decoder
@@ -602,7 +602,7 @@ class SAM2Model(torch.nn.Module):
         return backbone_out
 
     def _prepare_backbone_features(self, backbone_out):
-        """Prepares and flattens visual features from the image backbone output for further processing."""
+        """Prepare and flatten visual features from the image backbone output for further processing."""
         assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
         assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
 
@@ -627,7 +627,7 @@ class SAM2Model(torch.nn.Module):
         num_frames,
         track_in_reverse=False,  # tracking in reverse time order (for demo usage)
     ):
-        """Prepares memory-conditioned features by fusing current frame's visual features with previous memories."""
+        """Prepare memory-conditioned features by fusing current frame's visual features with previous memories."""
         B = current_vision_feats[-1].size(1)  # batch size on this frame
         C = self.hidden_dim
         H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
@@ -788,7 +788,7 @@ class SAM2Model(torch.nn.Module):
         object_score_logits,
         is_mask_from_pts,
     ):
-        """Encodes frame features and masks into a new memory representation for video segmentation."""
+        """Encode frame features and masks into a new memory representation for video segmentation."""
         B = current_vision_feats[-1].size(1)  # batch size on this frame
         C = self.hidden_dim
         H, W = feat_sizes[-1]  # top-level (lowest-resolution) feature size
@@ -838,7 +838,7 @@ class SAM2Model(torch.nn.Module):
         track_in_reverse,
         prev_sam_mask_logits,
     ):
-        """Performs a single tracking step, updating object masks and memory features based on current frame inputs."""
+        """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
         current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs}
         # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW
         if len(current_vision_feats) > 1:
@@ -893,9 +893,7 @@ class SAM2Model(torch.nn.Module):
         object_score_logits,
         current_out,
     ):
-        """Finally run the memory encoder on the predicted mask to encode, it into a new memory feature (that can be
-        used in future frames).
-        """
+        """Run memory encoder on predicted mask to encode it into a new memory feature for future frames."""
         if run_mem_encoder and self.num_maskmem > 0:
             high_res_masks_for_mem_enc = high_res_masks
             maskmem_features, maskmem_pos_enc = self._encode_new_memory(
@@ -932,7 +930,7 @@ class SAM2Model(torch.nn.Module):
         # The previously predicted SAM mask logits (which can be fed together with new clicks in demo).
         prev_sam_mask_logits=None,
     ):
-        """Performs a single tracking step, updating object masks and memory features based on current frame inputs."""
+        """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
         current_out, sam_outputs, _, _ = self._track_step(
             frame_idx,
             is_init_cond_frame,
@@ -970,7 +968,7 @@ class SAM2Model(torch.nn.Module):
         return current_out
 
     def _use_multimask(self, is_init_cond_frame, point_inputs):
-        """Determines whether to use multiple mask outputs in the SAM head based on configuration and inputs."""
+        """Determine whether to use multiple mask outputs in the SAM head based on configuration and inputs."""
         num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1)
         return (
             self.multimask_output_in_sam
@@ -980,7 +978,7 @@ class SAM2Model(torch.nn.Module):
 
     @staticmethod
     def _apply_non_overlapping_constraints(pred_masks):
-        """Applies non-overlapping constraints to masks, keeping the highest scoring object per location."""
+        """Apply non-overlapping constraints to masks, keeping the highest scoring object per location."""
         batch_size = pred_masks.size(0)
         if batch_size == 1:
             return pred_masks
@@ -1001,12 +999,7 @@ class SAM2Model(torch.nn.Module):
         self.binarize_mask_from_pts_for_mem_enc = binarize
 
     def set_imgsz(self, imgsz):
-        """
-        Set image size to make model compatible with different image sizes.
-
-        Args:
-            imgsz (Tuple[int, int]): The size of the input image.
-        """
+        """Set image size to make model compatible with different image sizes."""
         self.image_size = imgsz[0]
         self.sam_prompt_encoder.input_image_size = imgsz
         self.sam_prompt_encoder.image_embedding_size = [x // 16 for x in imgsz]  # fixed ViT patch size of 16
diff --git a/ultralytics/models/sam/modules/tiny_encoder.py b/ultralytics/models/sam/modules/tiny_encoder.py
index 1b181f7a06..e5a3a63d45 100644
--- a/ultralytics/models/sam/modules/tiny_encoder.py
+++ b/ultralytics/models/sam/modules/tiny_encoder.py
@@ -27,7 +27,7 @@ class Conv2d_BN(torch.nn.Sequential):
 
     Attributes:
         c (torch.nn.Conv2d): 2D convolution layer.
-        1 (torch.nn.BatchNorm2d): Batch normalization layer.
+        bn (torch.nn.BatchNorm2d): Batch normalization layer.
 
     Methods:
         __init__: Initializes the Conv2d_BN with specified parameters.
@@ -265,9 +265,9 @@ class ConvLayer(nn.Module):
             dim (int): The dimensionality of the input and output.
             input_resolution (Tuple[int, int]): The resolution of the input image.
             depth (int): The number of MBConv layers in the block.
-            activation (Callable): Activation function applied after each convolution.
+            activation (nn.Module): Activation function applied after each convolution.
             drop_path (float | List[float]): Drop path rate. Single float or a list of floats for each MBConv.
-            downsample (Optional[Callable]): Function for downsampling the output. None to skip downsampling.
+            downsample (Optional[nn.Module]): Function for downsampling the output. None to skip downsampling.
             use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
             out_dim (Optional[int]): The dimensionality of the output. None means it will be the same as `dim`.
             conv_expand_ratio (float): Expansion ratio for the MBConv layers.
@@ -413,12 +413,9 @@ class Attention(torch.nn.Module):
         Args:
             dim (int): The dimensionality of the input and output.
             key_dim (int): The dimensionality of the keys and queries.
-            num_heads (int): Number of attention heads. Default is 8.
-            attn_ratio (float): Attention ratio, affecting the dimensions of the value vectors. Default is 4.
-            resolution (Tuple[int, int]): Spatial resolution of the input feature map. Default is (14, 14).
-
-        Raises:
-            AssertionError: If 'resolution' is not a tuple of length 2.
+            num_heads (int): Number of attention heads.
+            attn_ratio (float): Attention ratio, affecting the dimensions of the value vectors.
+            resolution (Tuple[int, int]): Spatial resolution of the input feature map.
 
         Examples:
             >>> attn = Attention(dim=256, key_dim=64, num_heads=8, resolution=(14, 14))
@@ -821,22 +818,20 @@ class TinyViT(nn.Module):
         attention and convolution blocks, and a classification head.
 
         Args:
-            img_size (int): Size of the input image. Default is 224.
-            in_chans (int): Number of input channels. Default is 3.
-            num_classes (int): Number of classes for classification. Default is 1000.
+            img_size (int): Size of the input image.
+            in_chans (int): Number of input channels.
+            num_classes (int): Number of classes for classification.
             embed_dims (Tuple[int, int, int, int]): Embedding dimensions for each stage.
-                Default is (96, 192, 384, 768).
-            depths (Tuple[int, int, int, int]): Number of blocks in each stage. Default is (2, 2, 6, 2).
+            depths (Tuple[int, int, int, int]): Number of blocks in each stage.
             num_heads (Tuple[int, int, int, int]): Number of attention heads in each stage.
-                Default is (3, 6, 12, 24).
-            window_sizes (Tuple[int, int, int, int]): Window sizes for each stage. Default is (7, 7, 14, 7).
-            mlp_ratio (float): Ratio of MLP hidden dim to embedding dim. Default is 4.0.
-            drop_rate (float): Dropout rate. Default is 0.0.
-            drop_path_rate (float): Stochastic depth rate. Default is 0.1.
-            use_checkpoint (bool): Whether to use checkpointing to save memory. Default is False.
-            mbconv_expand_ratio (float): Expansion ratio for MBConv layer. Default is 4.0.
-            local_conv_size (int): Kernel size for local convolutions. Default is 3.
-            layer_lr_decay (float): Layer-wise learning rate decay factor. Default is 1.0.
+            window_sizes (Tuple[int, int, int, int]): Window sizes for each stage.
+            mlp_ratio (float): Ratio of MLP hidden dim to embedding dim.
+            drop_rate (float): Dropout rate.
+            drop_path_rate (float): Stochastic depth rate.
+            use_checkpoint (bool): Whether to use checkpointing to save memory.
+            mbconv_expand_ratio (float): Expansion ratio for MBConv layer.
+            local_conv_size (int): Kernel size for local convolutions.
+            layer_lr_decay (float): Layer-wise learning rate decay factor.
 
         Examples:
             >>> model = TinyViT(img_size=224, num_classes=1000)
@@ -992,12 +987,7 @@ class TinyViT(nn.Module):
         return self.forward_features(x)
 
     def set_imgsz(self, imgsz=[1024, 1024]):
-        """
-        Set image size to make model compatible with different image sizes.
-
-        Args:
-            imgsz (Tuple[int, int]): The size of the input image.
-        """
+        """Set image size to make model compatible with different image sizes."""
         imgsz = [s // 4 for s in imgsz]
         self.patches_resolution = imgsz
         for i, layer in enumerate(self.layers):
diff --git a/ultralytics/models/sam/predict.py b/ultralytics/models/sam/predict.py
index 345fc7c98f..9017cc232e 100644
--- a/ultralytics/models/sam/predict.py
+++ b/ultralytics/models/sam/predict.py
@@ -701,9 +701,6 @@ class SAM2Predictor(Predictor):
             - The method supports batched inference for multiple objects when points or bboxes are provided.
             - Input prompts (bboxes, points) are automatically scaled to match the input image dimensions.
             - When both bboxes and points are provided, they are merged into a single 'points' input for the model.
-
-        References:
-            - SAM2 Paper: [Add link to SAM2 paper when available]
         """
         features = self.get_im_features(im) if self.features is None else self.features
 
diff --git a/ultralytics/nn/autobackend.py b/ultralytics/nn/autobackend.py
index 8e643a8bba..a23617e234 100644
--- a/ultralytics/nn/autobackend.py
+++ b/ultralytics/nn/autobackend.py
@@ -19,11 +19,7 @@ from ultralytics.utils.downloads import attempt_download_asset, is_url
 
 
 def check_class_names(names):
-    """
-    Check class names.
-
-    Map imagenet class codes to human-readable names if required. Convert lists to dicts.
-    """
+    """Check class names and convert to dict format if needed."""
     if isinstance(names, list):  # names is a list
         names = dict(enumerate(names))  # convert to dict
     if isinstance(names, dict):
@@ -78,8 +74,23 @@ class AutoBackend(nn.Module):
             | IMX                   | *_imx_model/      |
             | RKNN                  | *_rknn_model/     |
 
-    This class offers dynamic backend switching capabilities based on the input model format, making it easier to deploy
-    models across various platforms.
+    Attributes:
+        model (torch.nn.Module): The loaded YOLO model.
+        device (torch.device): The device (CPU or GPU) on which the model is loaded.
+        task (str): The type of task the model performs (detect, segment, classify, pose).
+        names (Dict): A dictionary of class names that the model can detect.
+        stride (int): The model stride, typically 32 for YOLO models.
+        fp16 (bool): Whether the model uses half-precision (FP16) inference.
+
+    Methods:
+        forward: Run inference on an input image.
+        from_numpy: Convert numpy array to tensor.
+        warmup: Warm up the model with a dummy input.
+        _model_type: Determine the model type from file path.
+
+    Examples:
+        >>> model = AutoBackend(weights="yolo11n.pt", device=torch.device("cuda"))
+        >>> results = model(img)
     """
 
     @torch.no_grad()
@@ -101,7 +112,7 @@ class AutoBackend(nn.Module):
             weights (str | torch.nn.Module): Path to the model weights file or a module instance. Defaults to 'yolo11n.pt'.
             device (torch.device): Device to run the model on. Defaults to CPU.
             dnn (bool): Use OpenCV DNN module for ONNX inference. Defaults to False.
-            data (str | Path | optional): Path to the additional data.yaml file containing class names. Optional.
+            data (str | Path, optional): Path to the additional data.yaml file containing class names.
             fp16 (bool): Enable half-precision inference. Supported only on specific backends. Defaults to False.
             batch (int): Batch-size to assume for inference.
             fuse (bool): Fuse Conv2D + BatchNorm layers for optimization. Defaults to True.
@@ -539,12 +550,12 @@ class AutoBackend(nn.Module):
 
         Args:
             im (torch.Tensor): The image tensor to perform inference on.
-            augment (bool): whether to perform data augmentation during inference, defaults to False
-            visualize (bool): whether to visualize the output predictions, defaults to False
-            embed (list, optional): A list of feature vectors/embeddings to return.
+            augment (bool): Whether to perform data augmentation during inference. Defaults to False.
+            visualize (bool): Whether to visualize the output predictions. Defaults to False.
+            embed (List, optional): A list of feature vectors/embeddings to return.
 
         Returns:
-            (tuple): Tuple containing the raw output tensor, and processed output for visualization (if visualize=True)
+            (torch.Tensor | List[torch.Tensor]): The raw output tensor(s) from the model.
         """
         b, ch, h, w = im.shape  # batch, channel, height, width
         if self.fp16 and im.dtype != torch.float16:
@@ -776,10 +787,13 @@ class AutoBackend(nn.Module):
     def _model_type(p="path/to/model.pt"):
         """
         Takes a path to a model file and returns the model type. Possibles types are pt, jit, onnx, xml, engine, coreml,
-        saved_model, pb, tflite, edgetpu, tfjs, ncnn or paddle.
+        saved_model, pb, tflite, edgetpu, tfjs, ncnn, mnn, imx, rknn or paddle.
 
         Args:
-            p (str): path to the model file. Defaults to path/to/model.pt
+            p (str): Path to the model file. Defaults to 'path/to/model.pt'.
+
+        Returns:
+            (List[bool]): List of booleans indicating the model type.
 
         Examples:
             >>> model = AutoBackend(weights="path/to/model.onnx")
diff --git a/ultralytics/nn/modules/block.py b/ultralytics/nn/modules/block.py
index 63472ae2f3..b9f0c7d773 100644
--- a/ultralytics/nn/modules/block.py
+++ b/ultralytics/nn/modules/block.py
@@ -69,7 +69,7 @@ class DFL(nn.Module):
         self.c1 = c1
 
     def forward(self, x):
-        """Applies a transformer layer on input tensor 'x' and returns a tensor."""
+        """Apply the DFL module to input tensor and return transformed output."""
         b, _, a = x.shape  # batch, channels, anchors
         return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
         # return self.conv(x.view(b, self.c1, 4, a).softmax(1)).view(b, 4, a)
@@ -80,9 +80,12 @@ class Proto(nn.Module):
 
     def __init__(self, c1, c_=256, c2=32):
         """
-        Initializes the YOLOv8 mask Proto module with specified number of protos and masks.
+        Initialize the YOLOv8 mask Proto module with specified number of protos and masks.
 
-        Input arguments are ch_in, number of protos, number of masks.
+        Args:
+            c1 (int): Input channels.
+            c_ (int): Intermediate channels (number of protos).
+            c2 (int): Output channels (number of masks).
         """
         super().__init__()
         self.cv1 = Conv(c1, c_, k=3)
@@ -91,7 +94,7 @@ class Proto(nn.Module):
         self.cv3 = Conv(c_, c2)
 
     def forward(self, x):
-        """Performs a forward pass through layers using an upsampled input image."""
+        """Perform a forward pass through layers using an upsampled input image."""
         return self.cv3(self.cv2(self.upsample(self.cv1(x))))
 
 
@@ -103,7 +106,14 @@ class HGStem(nn.Module):
     """
 
     def __init__(self, c1, cm, c2):
-        """Initialize the SPP layer with input/output channels and specified kernel sizes for max pooling."""
+        """
+        Initialize the StemBlock of PPHGNetV2.
+
+        Args:
+            c1 (int): Input channels.
+            cm (int): Middle channels.
+            c2 (int): Output channels.
+        """
         super().__init__()
         self.stem1 = Conv(c1, cm, 3, 2, act=nn.ReLU())
         self.stem2a = Conv(cm, cm // 2, 2, 1, 0, act=nn.ReLU())
@@ -134,7 +144,19 @@ class HGBlock(nn.Module):
     """
 
     def __init__(self, c1, cm, c2, k=3, n=6, lightconv=False, shortcut=False, act=nn.ReLU()):
-        """Initializes a CSP Bottleneck with 1 convolution using specified input and output channels."""
+        """
+        Initialize HGBlock with specified parameters.
+
+        Args:
+            c1 (int): Input channels.
+            cm (int): Middle channels.
+            c2 (int): Output channels.
+            k (int): Kernel size.
+            n (int): Number of LightConv or Conv blocks.
+            lightconv (bool): Whether to use LightConv.
+            shortcut (bool): Whether to use shortcut connection.
+            act (nn.Module): Activation function.
+        """
         super().__init__()
         block = LightConv if lightconv else Conv
         self.m = nn.ModuleList(block(c1 if i == 0 else cm, cm, k=k, act=act) for i in range(n))
@@ -154,7 +176,14 @@ class SPP(nn.Module):
     """Spatial Pyramid Pooling (SPP) layer https://arxiv.org/abs/1406.4729."""
 
     def __init__(self, c1, c2, k=(5, 9, 13)):
-        """Initialize the SPP layer with input/output channels and pooling kernel sizes."""
+        """
+        Initialize the SPP layer with input/output channels and pooling kernel sizes.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            k (Tuple[int, int, int]): Kernel sizes for max pooling.
+        """
         super().__init__()
         c_ = c1 // 2  # hidden channels
         self.cv1 = Conv(c1, c_, 1, 1)
@@ -172,9 +201,15 @@ class SPPF(nn.Module):
 
     def __init__(self, c1, c2, k=5):
         """
-        Initializes the SPPF layer with given input/output channels and kernel size.
+        Initialize the SPPF layer with given input/output channels and kernel size.
 
-        This module is equivalent to SPP(k=(5, 9, 13)).
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            k (int): Kernel size.
+
+        Notes:
+            This module is equivalent to SPP(k=(5, 9, 13)).
         """
         super().__init__()
         c_ = c1 // 2  # hidden channels
@@ -183,7 +218,7 @@ class SPPF(nn.Module):
         self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)
 
     def forward(self, x):
-        """Forward pass through Ghost Convolution block."""
+        """Apply sequential pooling operations to input and return concatenated feature maps."""
         y = [self.cv1(x)]
         y.extend(self.m(y[-1]) for _ in range(3))
         return self.cv2(torch.cat(y, 1))
@@ -193,13 +228,20 @@ class C1(nn.Module):
     """CSP Bottleneck with 1 convolution."""
 
     def __init__(self, c1, c2, n=1):
-        """Initializes the CSP Bottleneck with configurations for 1 convolution with arguments ch_in, ch_out, number."""
+        """
+        Initialize the CSP Bottleneck with 1 convolution.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of convolutions.
+        """
         super().__init__()
         self.cv1 = Conv(c1, c2, 1, 1)
         self.m = nn.Sequential(*(Conv(c2, c2, 3) for _ in range(n)))
 
     def forward(self, x):
-        """Applies cross-convolutions to input in the C3 module."""
+        """Apply convolution and residual connection to input tensor."""
         y = self.cv1(x)
         return self.m(y) + y
 
@@ -208,7 +250,17 @@ class C2(nn.Module):
     """CSP Bottleneck with 2 convolutions."""
 
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initializes a CSP Bottleneck with 2 convolutions and optional shortcut connection."""
+        """
+        Initialize a CSP Bottleneck with 2 convolutions.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of Bottleneck blocks.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__()
         self.c = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, 2 * self.c, 1, 1)
@@ -226,7 +278,17 @@ class C2f(nn.Module):
     """Faster Implementation of CSP Bottleneck with 2 convolutions."""
 
     def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
-        """Initializes a CSP bottleneck with 2 convolutions and n Bottleneck blocks for faster processing."""
+        """
+        Initialize a CSP bottleneck with 2 convolutions.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of Bottleneck blocks.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__()
         self.c = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, 2 * self.c, 1, 1)
@@ -251,7 +313,17 @@ class C3(nn.Module):
     """CSP Bottleneck with 3 convolutions."""
 
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initialize the CSP Bottleneck with given channels, number, shortcut, groups, and expansion values."""
+        """
+        Initialize the CSP Bottleneck with 3 convolutions.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of Bottleneck blocks.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, c_, 1, 1)
@@ -260,7 +332,7 @@ class C3(nn.Module):
         self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, k=((1, 1), (3, 3)), e=1.0) for _ in range(n)))
 
     def forward(self, x):
-        """Forward pass through the CSP bottleneck with 2 convolutions."""
+        """Forward pass through the CSP bottleneck with 3 convolutions."""
         return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))
 
 
@@ -268,7 +340,17 @@ class C3x(C3):
     """C3 module with cross-convolutions."""
 
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initialize C3TR instance and set default parameters."""
+        """
+        Initialize C3 module with cross-convolutions.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of Bottleneck blocks.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__(c1, c2, n, shortcut, g, e)
         self.c_ = int(c2 * e)
         self.m = nn.Sequential(*(Bottleneck(self.c_, self.c_, shortcut, g, k=((1, 3), (3, 1)), e=1) for _ in range(n)))
@@ -278,7 +360,15 @@ class RepC3(nn.Module):
     """Rep C3."""
 
     def __init__(self, c1, c2, n=3, e=1.0):
-        """Initialize CSP Bottleneck with a single convolution using input channels, output channels, and number."""
+        """
+        Initialize RepC3 module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of RepConv blocks.
+            e (float): Expansion ratio.
+        """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, c_, 1, 1)
@@ -287,7 +377,7 @@ class RepC3(nn.Module):
         self.cv3 = Conv(c_, c2, 1, 1) if c_ != c2 else nn.Identity()
 
     def forward(self, x):
-        """Forward pass of RT-DETR neck layer."""
+        """Forward pass of RepC3 module."""
         return self.cv3(self.m(self.cv1(x)) + self.cv2(x))
 
 
@@ -295,7 +385,17 @@ class C3TR(C3):
     """C3 module with TransformerBlock()."""
 
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initialize C3Ghost module with GhostBottleneck()."""
+        """
+        Initialize C3 module with TransformerBlock.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of Transformer blocks.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__(c1, c2, n, shortcut, g, e)
         c_ = int(c2 * e)
         self.m = TransformerBlock(c_, c_, 4, n)
@@ -305,7 +405,17 @@ class C3Ghost(C3):
     """C3 module with GhostBottleneck()."""
 
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initialize 'SPP' module with various pooling sizes for spatial pyramid pooling."""
+        """
+        Initialize C3 module with GhostBottleneck.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of Ghost bottleneck blocks.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__(c1, c2, n, shortcut, g, e)
         c_ = int(c2 * e)  # hidden channels
         self.m = nn.Sequential(*(GhostBottleneck(c_, c_) for _ in range(n)))
@@ -315,7 +425,15 @@ class GhostBottleneck(nn.Module):
     """Ghost Bottleneck https://github.com/huawei-noah/ghostnet."""
 
     def __init__(self, c1, c2, k=3, s=1):
-        """Initializes GhostBottleneck module with arguments ch_in, ch_out, kernel, stride."""
+        """
+        Initialize Ghost Bottleneck module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            k (int): Kernel size.
+            s (int): Stride.
+        """
         super().__init__()
         c_ = c2 // 2
         self.conv = nn.Sequential(
@@ -328,7 +446,7 @@ class GhostBottleneck(nn.Module):
         )
 
     def forward(self, x):
-        """Applies skip connection and concatenation to input tensor."""
+        """Apply the convolution and shortcut branches to the input tensor and return their sum."""
         return self.conv(x) + self.shortcut(x)
 
 
@@ -336,7 +454,17 @@ class Bottleneck(nn.Module):
     """Standard bottleneck."""
 
     def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
-        """Initializes a standard bottleneck module with optional shortcut connection and configurable parameters."""
+        """
+        Initialize a standard bottleneck module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            shortcut (bool): Whether to use shortcut connection.
+            g (int): Groups for convolutions.
+            k (Tuple[int, int]): Kernel sizes for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, c_, k[0], 1)
@@ -344,7 +472,7 @@ class Bottleneck(nn.Module):
         self.add = shortcut and c1 == c2
 
     def forward(self, x):
-        """Applies the YOLO FPN to input data."""
+        """Apply bottleneck with optional shortcut connection."""
         return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
 
 
@@ -352,7 +480,17 @@ class BottleneckCSP(nn.Module):
     """CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks."""
 
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initializes the CSP Bottleneck given arguments for ch_in, ch_out, number, shortcut, groups, expansion."""
+        """
+        Initialize CSP Bottleneck.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of Bottleneck blocks.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, c_, 1, 1)
@@ -364,7 +502,7 @@ class BottleneckCSP(nn.Module):
         self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
 
     def forward(self, x):
-        """Applies a CSP bottleneck with 3 convolutions."""
+        """Apply CSP bottleneck with 4 convolutions."""
         y1 = self.cv3(self.m(self.cv1(x)))
         y2 = self.cv2(x)
         return self.cv4(self.act(self.bn(torch.cat((y1, y2), 1))))
@@ -374,7 +512,15 @@ class ResNetBlock(nn.Module):
     """ResNet block with standard convolution layers."""
 
     def __init__(self, c1, c2, s=1, e=4):
-        """Initialize convolution with given parameters."""
+        """
+        Initialize ResNet block.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            s (int): Stride.
+            e (int): Expansion ratio.
+        """
         super().__init__()
         c3 = e * c2
         self.cv1 = Conv(c1, c2, k=1, s=1, act=True)
@@ -391,7 +537,17 @@ class ResNetLayer(nn.Module):
     """ResNet layer with multiple ResNet blocks."""
 
     def __init__(self, c1, c2, s=1, is_first=False, n=1, e=4):
-        """Initializes the ResNetLayer given arguments."""
+        """
+        Initialize ResNet layer.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            s (int): Stride.
+            is_first (bool): Whether this is the first layer.
+            n (int): Number of ResNet blocks.
+            e (int): Expansion ratio.
+        """
         super().__init__()
         self.is_first = is_first
 
@@ -413,7 +569,17 @@ class MaxSigmoidAttnBlock(nn.Module):
     """Max Sigmoid attention block."""
 
     def __init__(self, c1, c2, nh=1, ec=128, gc=512, scale=False):
-        """Initializes MaxSigmoidAttnBlock with specified arguments."""
+        """
+        Initialize MaxSigmoidAttnBlock.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            nh (int): Number of heads.
+            ec (int): Embedding channels.
+            gc (int): Guide channels.
+            scale (bool): Whether to use learnable scale parameter.
+        """
         super().__init__()
         self.nh = nh
         self.hc = c2 // nh
@@ -424,7 +590,16 @@ class MaxSigmoidAttnBlock(nn.Module):
         self.scale = nn.Parameter(torch.ones(1, nh, 1, 1)) if scale else 1.0
 
     def forward(self, x, guide):
-        """Forward process."""
+        """
+        Forward pass of MaxSigmoidAttnBlock.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+            guide (torch.Tensor): Guide tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after attention.
+        """
         bs, _, h, w = x.shape
 
         guide = self.gl(guide)
@@ -448,7 +623,20 @@ class C2fAttn(nn.Module):
     """C2f module with an additional attn module."""
 
     def __init__(self, c1, c2, n=1, ec=128, nh=1, gc=512, shortcut=False, g=1, e=0.5):
-        """Initializes C2f module with attention mechanism for enhanced feature extraction and processing."""
+        """
+        Initialize C2f module with attention mechanism.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of Bottleneck blocks.
+            ec (int): Embedding channels for attention.
+            nh (int): Number of heads for attention.
+            gc (int): Guide channels for attention.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__()
         self.c = int(c2 * e)  # hidden channels
         self.cv1 = Conv(c1, 2 * self.c, 1, 1)
@@ -457,14 +645,32 @@ class C2fAttn(nn.Module):
         self.attn = MaxSigmoidAttnBlock(self.c, self.c, gc=gc, ec=ec, nh=nh)
 
     def forward(self, x, guide):
-        """Forward pass through C2f layer."""
+        """
+        Forward pass through C2f layer with attention.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+            guide (torch.Tensor): Guide tensor for attention.
+
+        Returns:
+            (torch.Tensor): Output tensor after processing.
+        """
         y = list(self.cv1(x).chunk(2, 1))
         y.extend(m(y[-1]) for m in self.m)
         y.append(self.attn(y[-1], guide))
         return self.cv2(torch.cat(y, 1))
 
     def forward_split(self, x, guide):
-        """Forward pass using split() instead of chunk()."""
+        """
+        Forward pass using split() instead of chunk().
+
+        Args:
+            x (torch.Tensor): Input tensor.
+            guide (torch.Tensor): Guide tensor for attention.
+
+        Returns:
+            (torch.Tensor): Output tensor after processing.
+        """
         y = list(self.cv1(x).split((self.c, self.c), 1))
         y.extend(m(y[-1]) for m in self.m)
         y.append(self.attn(y[-1], guide))
@@ -475,7 +681,17 @@ class ImagePoolingAttn(nn.Module):
     """ImagePoolingAttn: Enhance the text embeddings with image-aware information."""
 
     def __init__(self, ec=256, ch=(), ct=512, nh=8, k=3, scale=False):
-        """Initializes ImagePoolingAttn with specified arguments."""
+        """
+        Initialize ImagePoolingAttn module.
+
+        Args:
+            ec (int): Embedding channels.
+            ch (Tuple): Channel dimensions for feature maps.
+            ct (int): Channel dimension for text embeddings.
+            nh (int): Number of attention heads.
+            k (int): Kernel size for pooling.
+            scale (bool): Whether to use learnable scale parameter.
+        """
         super().__init__()
 
         nf = len(ch)
@@ -493,7 +709,16 @@ class ImagePoolingAttn(nn.Module):
         self.k = k
 
     def forward(self, x, text):
-        """Executes attention mechanism on input tensor x and guide tensor."""
+        """
+        Forward pass of ImagePoolingAttn.
+
+        Args:
+            x (List[torch.Tensor]): List of input feature maps.
+            text (torch.Tensor): Text embeddings.
+
+        Returns:
+            (torch.Tensor): Enhanced text embeddings.
+        """
         bs = x[0].shape[0]
         assert len(x) == self.nf
         num_patches = self.k**2
@@ -521,14 +746,23 @@ class ContrastiveHead(nn.Module):
     """Implements contrastive learning head for region-text similarity in vision-language models."""
 
     def __init__(self):
-        """Initializes ContrastiveHead with specified region-text similarity parameters."""
+        """Initialize ContrastiveHead with region-text similarity parameters."""
         super().__init__()
         # NOTE: use -10.0 to keep the init cls loss consistency with other losses
         self.bias = nn.Parameter(torch.tensor([-10.0]))
         self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log())
 
     def forward(self, x, w):
-        """Forward function of contrastive learning."""
+        """
+        Forward function of contrastive learning.
+
+        Args:
+            x (torch.Tensor): Image features.
+            w (torch.Tensor): Text features.
+
+        Returns:
+            (torch.Tensor): Similarity scores.
+        """
         x = F.normalize(x, dim=1, p=2)
         w = F.normalize(w, dim=-1, p=2)
         x = torch.einsum("bchw,bkc->bkhw", x, w)
@@ -544,7 +778,12 @@ class BNContrastiveHead(nn.Module):
     """
 
     def __init__(self, embed_dims: int):
-        """Initialize ContrastiveHead with region-text similarity parameters."""
+        """
+        Initialize BNContrastiveHead.
+
+        Args:
+            embed_dims (int): Embedding dimensions for features.
+        """
         super().__init__()
         self.norm = nn.BatchNorm2d(embed_dims)
         # NOTE: use -10.0 to keep the init cls loss consistency with other losses
@@ -553,7 +792,16 @@ class BNContrastiveHead(nn.Module):
         self.logit_scale = nn.Parameter(-1.0 * torch.ones([]))
 
     def forward(self, x, w):
-        """Forward function of contrastive learning."""
+        """
+        Forward function of contrastive learning with batch normalization.
+
+        Args:
+            x (torch.Tensor): Image features.
+            w (torch.Tensor): Text features.
+
+        Returns:
+            (torch.Tensor): Similarity scores.
+        """
         x = self.norm(x)
         w = F.normalize(w, dim=-1, p=2)
         x = torch.einsum("bchw,bkc->bkhw", x, w)
@@ -564,7 +812,17 @@ class RepBottleneck(Bottleneck):
     """Rep bottleneck."""
 
     def __init__(self, c1, c2, shortcut=True, g=1, k=(3, 3), e=0.5):
-        """Initializes a RepBottleneck module with customizable in/out channels, shortcuts, groups and expansion."""
+        """
+        Initialize RepBottleneck.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            shortcut (bool): Whether to use shortcut connection.
+            g (int): Groups for convolutions.
+            k (Tuple[int, int]): Kernel sizes for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__(c1, c2, shortcut, g, k, e)
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = RepConv(c1, c_, k[0], 1)
@@ -574,7 +832,17 @@ class RepCSP(C3):
     """Repeatable Cross Stage Partial Network (RepCSP) module for efficient feature extraction."""
 
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):
-        """Initializes RepCSP layer with given channels, repetitions, shortcut, groups and expansion ratio."""
+        """
+        Initialize RepCSP layer.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of RepBottleneck blocks.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__(c1, c2, n, shortcut, g, e)
         c_ = int(c2 * e)  # hidden channels
         self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))
@@ -584,7 +852,16 @@ class RepNCSPELAN4(nn.Module):
     """CSP-ELAN."""
 
     def __init__(self, c1, c2, c3, c4, n=1):
-        """Initializes CSP-ELAN layer with specified channel sizes, repetitions, and convolutions."""
+        """
+        Initialize CSP-ELAN layer.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            c3 (int): Intermediate channels.
+            c4 (int): Intermediate channels for RepCSP.
+            n (int): Number of RepCSP blocks.
+        """
         super().__init__()
         self.c = c3 // 2
         self.cv1 = Conv(c1, c3, 1, 1)
@@ -609,7 +886,15 @@ class ELAN1(RepNCSPELAN4):
     """ELAN1 module with 4 convolutions."""
 
     def __init__(self, c1, c2, c3, c4):
-        """Initializes ELAN1 layer with specified channel sizes."""
+        """
+        Initialize ELAN1 layer.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            c3 (int): Intermediate channels.
+            c4 (int): Intermediate channels for convolutions.
+        """
         super().__init__(c1, c2, c3, c4)
         self.c = c3 // 2
         self.cv1 = Conv(c1, c3, 1, 1)
@@ -622,7 +907,13 @@ class AConv(nn.Module):
     """AConv."""
 
     def __init__(self, c1, c2):
-        """Initializes AConv module with convolution layers."""
+        """
+        Initialize AConv module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+        """
         super().__init__()
         self.cv1 = Conv(c1, c2, 3, 2, 1)
 
@@ -636,7 +927,13 @@ class ADown(nn.Module):
     """ADown."""
 
     def __init__(self, c1, c2):
-        """Initializes ADown module with convolution layers to downsample input from channels c1 to c2."""
+        """
+        Initialize ADown module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+        """
         super().__init__()
         self.c = c2 // 2
         self.cv1 = Conv(c1 // 2, self.c, 3, 2, 1)
@@ -656,7 +953,15 @@ class SPPELAN(nn.Module):
     """SPP-ELAN."""
 
     def __init__(self, c1, c2, c3, k=5):
-        """Initializes SPP-ELAN block with convolution and max pooling layers for spatial pyramid pooling."""
+        """
+        Initialize SPP-ELAN block.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            c3 (int): Intermediate channels.
+            k (int): Kernel size for max pooling.
+        """
         super().__init__()
         self.c = c3
         self.cv1 = Conv(c1, c3, 1, 1)
@@ -676,7 +981,17 @@ class CBLinear(nn.Module):
     """CBLinear."""
 
     def __init__(self, c1, c2s, k=1, s=1, p=None, g=1):
-        """Initializes the CBLinear module, passing inputs unchanged."""
+        """
+        Initialize CBLinear module.
+
+        Args:
+            c1 (int): Input channels.
+            c2s (List[int]): List of output channel sizes.
+            k (int): Kernel size.
+            s (int): Stride.
+            p (int | None): Padding.
+            g (int): Groups.
+        """
         super().__init__()
         self.c2s = c2s
         self.conv = nn.Conv2d(c1, sum(c2s), k, s, autopad(k, p), groups=g, bias=True)
@@ -690,12 +1005,25 @@ class CBFuse(nn.Module):
     """CBFuse."""
 
     def __init__(self, idx):
-        """Initializes CBFuse module with layer index for selective feature fusion."""
+        """
+        Initialize CBFuse module.
+
+        Args:
+            idx (List[int]): Indices for feature selection.
+        """
         super().__init__()
         self.idx = idx
 
     def forward(self, xs):
-        """Forward pass through CBFuse layer."""
+        """
+        Forward pass through CBFuse layer.
+
+        Args:
+            xs (List[torch.Tensor]): List of input tensors.
+
+        Returns:
+            (torch.Tensor): Fused output tensor.
+        """
         target_size = xs[-1].shape[2:]
         res = [F.interpolate(x[self.idx[i]], size=target_size, mode="nearest") for i, x in enumerate(xs[:-1])]
         return torch.sum(torch.stack(res + xs[-1:]), dim=0)
@@ -705,8 +1033,16 @@ class C3f(nn.Module):
     """Faster Implementation of CSP Bottleneck with 2 convolutions."""
 
     def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5):
-        """Initialize CSP bottleneck layer with two convolutions with arguments ch_in, ch_out, number, shortcut, groups,
-        expansion.
+        """
+        Initialize CSP bottleneck layer with two convolutions.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of Bottleneck blocks.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
         """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
@@ -716,7 +1052,7 @@ class C3f(nn.Module):
         self.m = nn.ModuleList(Bottleneck(c_, c_, shortcut, g, k=((3, 3), (3, 3)), e=1.0) for _ in range(n))
 
     def forward(self, x):
-        """Forward pass through C2f layer."""
+        """Forward pass through C3f layer."""
         y = [self.cv2(x), self.cv1(x)]
         y.extend(m(y[-1]) for m in self.m)
         return self.cv3(torch.cat(y, 1))
@@ -726,7 +1062,18 @@ class C3k2(C2f):
     """Faster Implementation of CSP Bottleneck with 2 convolutions."""
 
     def __init__(self, c1, c2, n=1, c3k=False, e=0.5, g=1, shortcut=True):
-        """Initializes the C3k2 module, a faster CSP Bottleneck with 2 convolutions and optional C3k blocks."""
+        """
+        Initialize C3k2 module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of blocks.
+            c3k (bool): Whether to use C3k blocks.
+            e (float): Expansion ratio.
+            g (int): Groups for convolutions.
+            shortcut (bool): Whether to use shortcut connections.
+        """
         super().__init__(c1, c2, n, shortcut, g, e)
         self.m = nn.ModuleList(
             C3k(self.c, self.c, 2, shortcut, g) if c3k else Bottleneck(self.c, self.c, shortcut, g) for _ in range(n)
@@ -737,7 +1084,18 @@ class C3k(C3):
     """C3k is a CSP bottleneck module with customizable kernel sizes for feature extraction in neural networks."""
 
     def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5, k=3):
-        """Initializes the C3k module with specified channels, number of layers, and configurations."""
+        """
+        Initialize C3k module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of Bottleneck blocks.
+            shortcut (bool): Whether to use shortcut connections.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+            k (int): Kernel size.
+        """
         super().__init__(c1, c2, n, shortcut, g, e)
         c_ = int(c2 * e)  # hidden channels
         # self.m = nn.Sequential(*(RepBottleneck(c_, c_, shortcut, g, k=(k, k), e=1.0) for _ in range(n)))
@@ -748,7 +1106,12 @@ class RepVGGDW(torch.nn.Module):
     """RepVGGDW is a class that represents a depth wise separable convolutional block in RepVGG architecture."""
 
     def __init__(self, ed) -> None:
-        """Initializes RepVGGDW with depthwise separable convolutional layers for efficient processing."""
+        """
+        Initialize RepVGGDW module.
+
+        Args:
+            ed (int): Input and output channels.
+        """
         super().__init__()
         self.conv = Conv(ed, ed, 7, 1, 3, g=ed, act=False)
         self.conv1 = Conv(ed, ed, 3, 1, 1, g=ed, act=False)
@@ -757,7 +1120,7 @@ class RepVGGDW(torch.nn.Module):
 
     def forward(self, x):
         """
-        Performs a forward pass of the RepVGGDW block.
+        Perform a forward pass of the RepVGGDW block.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -769,7 +1132,7 @@ class RepVGGDW(torch.nn.Module):
 
     def forward_fuse(self, x):
         """
-        Performs a forward pass of the RepVGGDW block without fusing the convolutions.
+        Perform a forward pass of the RepVGGDW block without fusing the convolutions.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -782,7 +1145,7 @@ class RepVGGDW(torch.nn.Module):
     @torch.no_grad()
     def fuse(self):
         """
-        Fuses the convolutional layers in the RepVGGDW block.
+        Fuse the convolutional layers in the RepVGGDW block.
 
         This method fuses the convolutional layers and updates the weights and biases accordingly.
         """
@@ -819,7 +1182,16 @@ class CIB(nn.Module):
     """
 
     def __init__(self, c1, c2, shortcut=True, e=0.5, lk=False):
-        """Initializes the custom model with optional shortcut, scaling factor, and RepVGGDW layer."""
+        """
+        Initialize the CIB module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            shortcut (bool): Whether to use shortcut connection.
+            e (float): Expansion ratio.
+            lk (bool): Whether to use RepVGGDW.
+        """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
         self.cv1 = nn.Sequential(
@@ -860,7 +1232,18 @@ class C2fCIB(C2f):
     """
 
     def __init__(self, c1, c2, n=1, shortcut=False, lk=False, g=1, e=0.5):
-        """Initializes the module with specified parameters for channel, shortcut, local key, groups, and expansion."""
+        """
+        Initialize C2fCIB module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of CIB modules.
+            shortcut (bool): Whether to use shortcut connection.
+            lk (bool): Whether to use RepVGGDW in the CIB modules.
+            g (int): Groups for convolutions.
+            e (float): Expansion ratio.
+        """
         super().__init__(c1, c2, n, shortcut, g, e)
         self.m = nn.ModuleList(CIB(self.c, self.c, shortcut, e=1.0, lk=lk) for _ in range(n))
 
@@ -885,7 +1268,14 @@ class Attention(nn.Module):
     """
 
     def __init__(self, dim, num_heads=8, attn_ratio=0.5):
-        """Initializes multi-head attention module with query, key, and value convolutions and positional encoding."""
+        """
+        Initialize multi-head attention module.
+
+        Args:
+            dim (int): Input dimension.
+            num_heads (int): Number of attention heads.
+            attn_ratio (float): Attention ratio for key dimension.
+        """
         super().__init__()
         self.num_heads = num_heads
         self.head_dim = dim // num_heads
@@ -944,7 +1334,15 @@ class PSABlock(nn.Module):
     """
 
     def __init__(self, c, attn_ratio=0.5, num_heads=4, shortcut=True) -> None:
-        """Initializes the PSABlock with attention and feed-forward layers for enhanced feature extraction."""
+        """
+        Initialize the PSABlock.
+
+        Args:
+            c (int): Input and output channels.
+            attn_ratio (float): Attention ratio for key dimension.
+            num_heads (int): Number of attention heads.
+            shortcut (bool): Whether to use shortcut connections.
+        """
         super().__init__()
 
         self.attn = Attention(c, attn_ratio=attn_ratio, num_heads=num_heads)
@@ -952,7 +1350,15 @@ class PSABlock(nn.Module):
         self.add = shortcut
 
     def forward(self, x):
-        """Executes a forward pass through PSABlock, applying attention and feed-forward layers to the input tensor."""
+        """
+        Execute a forward pass through PSABlock.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after attention and feed-forward processing.
+        """
         x = x + self.attn(x) if self.add else self.attn(x)
         x = x + self.ffn(x) if self.add else self.ffn(x)
         return x
@@ -983,7 +1389,14 @@ class PSA(nn.Module):
     """
 
     def __init__(self, c1, c2, e=0.5):
-        """Initializes the PSA module with input/output channels and attention mechanism for feature extraction."""
+        """
+        Initialize PSA module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            e (float): Expansion ratio.
+        """
         super().__init__()
         assert c1 == c2
         self.c = int(c1 * e)
@@ -994,7 +1407,15 @@ class PSA(nn.Module):
         self.ffn = nn.Sequential(Conv(self.c, self.c * 2, 1), Conv(self.c * 2, self.c, 1, act=False))
 
     def forward(self, x):
-        """Executes forward pass in PSA module, applying attention and feed-forward layers to the input tensor."""
+        """
+        Execute forward pass in PSA module.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after attention and feed-forward processing.
+        """
         a, b = self.cv1(x).split((self.c, self.c), dim=1)
         b = b + self.attn(b)
         b = b + self.ffn(b)
@@ -1027,7 +1448,15 @@ class C2PSA(nn.Module):
     """
 
     def __init__(self, c1, c2, n=1, e=0.5):
-        """Initializes the C2PSA module with specified input/output channels, number of layers, and expansion ratio."""
+        """
+        Initialize C2PSA module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of PSABlock modules.
+            e (float): Expansion ratio.
+        """
         super().__init__()
         assert c1 == c2
         self.c = int(c1 * e)
@@ -1037,7 +1466,15 @@ class C2PSA(nn.Module):
         self.m = nn.Sequential(*(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n)))
 
     def forward(self, x):
-        """Processes the input tensor 'x' through a series of PSA blocks and returns the transformed tensor."""
+        """
+        Process the input tensor through a series of PSA blocks.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after processing.
+        """
         a, b = self.cv1(x).split((self.c, self.c), dim=1)
         b = self.m(b)
         return self.cv2(torch.cat((a, b), 1))
@@ -1069,7 +1506,15 @@ class C2fPSA(C2f):
     """
 
     def __init__(self, c1, c2, n=1, e=0.5):
-        """Initializes the C2fPSA module, a variant of C2f with PSA blocks for enhanced feature extraction."""
+        """
+        Initialize C2fPSA module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            n (int): Number of PSABlock modules.
+            e (float): Expansion ratio.
+        """
         assert c1 == c2
         super().__init__(c1, c2, n=n, e=e)
         self.m = nn.ModuleList(PSABlock(self.c, attn_ratio=0.5, num_heads=self.c // 64) for _ in range(n))
@@ -1100,13 +1545,29 @@ class SCDown(nn.Module):
     """
 
     def __init__(self, c1, c2, k, s):
-        """Initializes the SCDown module with specified input/output channels, kernel size, and stride."""
+        """
+        Initialize SCDown module.
+
+        Args:
+            c1 (int): Input channels.
+            c2 (int): Output channels.
+            k (int): Kernel size.
+            s (int): Stride.
+        """
         super().__init__()
         self.cv1 = Conv(c1, c2, 1, 1)
         self.cv2 = Conv(c2, c2, k=k, s=s, g=c2, act=False)
 
     def forward(self, x):
-        """Applies convolution and downsampling to the input tensor in the SCDown module."""
+        """
+        Apply convolution and downsampling to the input tensor.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Downsampled output tensor.
+        """
         return self.cv2(self.cv1(x))
 
 
@@ -1128,7 +1589,16 @@ class TorchVision(nn.Module):
     """
 
     def __init__(self, model, weights="DEFAULT", unwrap=True, truncate=2, split=False):
-        """Load the model and weights from torchvision."""
+        """
+        Load the model and weights from torchvision.
+
+        Args:
+            model (str): Name of the torchvision model to load.
+            weights (str): Pre-trained weights to load.
+            unwrap (bool): Whether to unwrap the model.
+            truncate (int): Number of layers to truncate.
+            split (bool): Whether to split the output.
+        """
         import torchvision  # scope for faster 'import ultralytics'
 
         super().__init__()
@@ -1147,7 +1617,15 @@ class TorchVision(nn.Module):
             self.m.head = self.m.heads = nn.Identity()
 
     def forward(self, x):
-        """Forward pass through the model."""
+        """
+        Forward pass through the model.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor | List[torch.Tensor]): Output tensor or list of tensors.
+        """
         if self.split:
             y = [x]
             y.extend(m(y[-1]) for m in self.m)
@@ -1184,7 +1662,7 @@ class AAttn(nn.Module):
 
     def __init__(self, dim, num_heads, area=1):
         """
-        Initializes an Area-attention module for YOLO models.
+        Initialize an Area-attention module for YOLO models.
 
         Args:
             dim (int): Number of hidden channels.
@@ -1203,7 +1681,15 @@ class AAttn(nn.Module):
         self.pe = Conv(all_head_dim, dim, 7, 1, 3, g=dim, act=False)
 
     def forward(self, x):
-        """Processes the input tensor 'x' through the area-attention."""
+        """
+        Process the input tensor through the area-attention.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after area-attention.
+        """
         B, C, H, W = x.shape
         N = H * W
 
@@ -1260,11 +1746,7 @@ class ABlock(nn.Module):
 
     def __init__(self, dim, num_heads, mlp_ratio=1.2, area=1):
         """
-        Initializes an Area-attention block module for efficient feature extraction in YOLO models.
-
-        This module implements an area-attention mechanism combined with a feed-forward network for processing feature
-        maps. It uses a novel area-based attention approach that is more efficient than traditional self-attention
-        while maintaining effectiveness.
+        Initialize an Area-attention block module.
 
         Args:
             dim (int): Number of input channels.
@@ -1281,14 +1763,27 @@ class ABlock(nn.Module):
         self.apply(self._init_weights)
 
     def _init_weights(self, m):
-        """Initialize weights using a truncated normal distribution."""
+        """
+        Initialize weights using a truncated normal distribution.
+
+        Args:
+            m (nn.Module): Module to initialize.
+        """
         if isinstance(m, nn.Conv2d):
             nn.init.trunc_normal_(m.weight, std=0.02)
             if m.bias is not None:
                 nn.init.constant_(m.bias, 0)
 
     def forward(self, x):
-        """Forward pass through ABlock, applying area-attention and feed-forward layers to the input tensor."""
+        """
+        Forward pass through ABlock.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after area-attention and feed-forward processing.
+        """
         x = x + self.attn(x)
         return x + self.mlp(x)
 
@@ -1319,7 +1814,7 @@ class A2C2f(nn.Module):
 
     def __init__(self, c1, c2, n=1, a2=True, area=1, residual=False, mlp_ratio=2.0, e=0.5, g=1, shortcut=True):
         """
-        Area-Attention C2f module for enhanced feature extraction with area-based attention mechanisms.
+        Initialize Area-Attention C2f module.
 
         Args:
             c1 (int): Number of input channels.
@@ -1349,7 +1844,15 @@ class A2C2f(nn.Module):
         )
 
     def forward(self, x):
-        """Forward pass through R-ELAN layer."""
+        """
+        Forward pass through A2C2f layer.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+
+        Returns:
+            (torch.Tensor): Output tensor after processing.
+        """
         y = [self.cv1(x)]
         y.extend(m(y[-1]) for m in self.m)
         y = self.cv2(torch.cat(y, 1))
diff --git a/ultralytics/nn/tasks.py b/ultralytics/nn/tasks.py
index 3e39303fa7..ad7c3c70af 100644
--- a/ultralytics/nn/tasks.py
+++ b/ultralytics/nn/tasks.py
@@ -119,10 +119,10 @@ class BaseModel(torch.nn.Module):
 
         Args:
             x (torch.Tensor): The input tensor to the model.
-            profile (bool):  Print the computation time of each layer if True, defaults to False.
-            visualize (bool): Save the feature maps of the model if True, defaults to False.
-            augment (bool): Augment image during prediction, defaults to False.
-            embed (list, optional): A list of feature vectors/embeddings to return.
+            profile (bool): Print the computation time of each layer if True.
+            visualize (bool): Save the feature maps of the model if True.
+            augment (bool): Augment image during prediction.
+            embed (List, optional): A list of feature vectors/embeddings to return.
 
         Returns:
             (torch.Tensor): The last output of the model.
@@ -137,9 +137,9 @@ class BaseModel(torch.nn.Module):
 
         Args:
             x (torch.Tensor): The input tensor to the model.
-            profile (bool):  Print the computation time of each layer if True, defaults to False.
-            visualize (bool): Save the feature maps of the model if True, defaults to False.
-            embed (list, optional): A list of feature vectors/embeddings to return.
+            profile (bool): Print the computation time of each layer if True.
+            visualize (bool): Save the feature maps of the model if True.
+            embed (List, optional): A list of feature vectors/embeddings to return.
 
         Returns:
             (torch.Tensor): The last output of the model.
@@ -170,13 +170,12 @@ class BaseModel(torch.nn.Module):
 
     def _profile_one_layer(self, m, x, dt):
         """
-        Profile the computation time and FLOPs of a single layer of the model on a given input. Appends the results to
-        the provided list.
+        Profile the computation time and FLOPs of a single layer of the model on a given input.
 
         Args:
             m (torch.nn.Module): The layer to be profiled.
             x (torch.Tensor): The input data to the layer.
-            dt (list): A list to store the computation time of the layer.
+            dt (List): A list to store the computation time of the layer.
         """
         c = m == self.model[-1] and isinstance(x, list)  # is final layer list, copy input as inplace fix
         flops = thop.profile(m, inputs=[x.copy() if c else x], verbose=False)[0] / 1e9 * 2 if thop else 0  # GFLOPs
@@ -192,8 +191,8 @@ class BaseModel(torch.nn.Module):
 
     def fuse(self, verbose=True):
         """
-        Fuse the `Conv2d()` and `BatchNorm2d()` layers of the model into a single layer, in order to improve the
-        computation efficiency.
+        Fuse the `Conv2d()` and `BatchNorm2d()` layers of the model into a single layer for improved computation
+        efficiency.
 
         Returns:
             (torch.nn.Module): The fused model is returned.
@@ -225,7 +224,7 @@ class BaseModel(torch.nn.Module):
         Check if the model has less than a certain threshold of BatchNorm layers.
 
         Args:
-            thresh (int, optional): The threshold number of BatchNorm layers. Default is 10.
+            thresh (int, optional): The threshold number of BatchNorm layers.
 
         Returns:
             (bool): True if the number of BatchNorm layers in the model is less than the threshold, False otherwise.
@@ -235,21 +234,21 @@ class BaseModel(torch.nn.Module):
 
     def info(self, detailed=False, verbose=True, imgsz=640):
         """
-        Prints model information.
+        Print model information.
 
         Args:
-            detailed (bool): if True, prints out detailed information about the model. Defaults to False
-            verbose (bool): if True, prints out the model information. Defaults to False
-            imgsz (int): the size of the image that the model will be trained on. Defaults to 640
+            detailed (bool): If True, prints out detailed information about the model.
+            verbose (bool): If True, prints out the model information.
+            imgsz (int): The size of the image that the model will be trained on.
         """
         return model_info(self, detailed=detailed, verbose=verbose, imgsz=imgsz)
 
     def _apply(self, fn):
         """
-        Applies a function to all the tensors in the model that are not parameters or registered buffers.
+        Apply a function to all tensors in the model that are not parameters or registered buffers.
 
         Args:
-            fn (function): the function to apply to the model
+            fn (function): The function to apply to the model.
 
         Returns:
             (BaseModel): An updated BaseModel object.
@@ -264,11 +263,11 @@ class BaseModel(torch.nn.Module):
 
     def load(self, weights, verbose=True):
         """
-        Load the weights into the model.
+        Load weights into the model.
 
         Args:
             weights (dict | torch.nn.Module): The pre-trained weights to be loaded.
-            verbose (bool, optional): Whether to log the transfer progress. Defaults to True.
+            verbose (bool, optional): Whether to log the transfer progress.
         """
         model = weights["model"] if isinstance(weights, dict) else weights  # torchvision models are not dicts
         csd = model.float().state_dict()  # checkpoint state_dict as FP32
@@ -282,8 +281,8 @@ class BaseModel(torch.nn.Module):
         Compute loss.
 
         Args:
-            batch (dict): Batch to compute loss on
-            preds (torch.Tensor | List[torch.Tensor]): Predictions.
+            batch (dict): Batch to compute loss on.
+            preds (torch.Tensor | List[torch.Tensor], optional): Predictions.
         """
         if getattr(self, "criterion", None) is None:
             self.criterion = self.init_criterion()
@@ -300,7 +299,15 @@ class DetectionModel(BaseModel):
     """YOLO detection model."""
 
     def __init__(self, cfg="yolo11n.yaml", ch=3, nc=None, verbose=True):  # model, input channels, number of classes
-        """Initialize the YOLO detection model with the given config and parameters."""
+        """
+        Initialize the YOLO detection model with the given config and parameters.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
         super().__init__()
         self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg)  # cfg dict
         if self.yaml["backbone"][0][2] == "Silence":
@@ -327,7 +334,7 @@ class DetectionModel(BaseModel):
             m.inplace = self.inplace
 
             def _forward(x):
-                """Performs a forward pass through the model, handling different Detect subclass types accordingly."""
+                """Perform a forward pass through the model, handling different Detect subclass types accordingly."""
                 if self.end2end:
                     return self.forward(x)["one2many"]
                 return self.forward(x)[0] if isinstance(m, (Segment, Pose, OBB)) else self.forward(x)
@@ -345,7 +352,15 @@ class DetectionModel(BaseModel):
             LOGGER.info("")
 
     def _predict_augment(self, x):
-        """Perform augmentations on input image x and return augmented inference and train outputs."""
+        """
+        Perform augmentations on input image x and return augmented inference and train outputs.
+
+        Args:
+            x (torch.Tensor): Input image tensor.
+
+        Returns:
+            (torch.Tensor): Augmented inference output.
+        """
         if getattr(self, "end2end", False) or self.__class__.__name__ != "DetectionModel":
             LOGGER.warning("WARNING ⚠️ Model does not support 'augment=True', reverting to single-scale prediction.")
             return self._predict_once(x)
@@ -363,7 +378,19 @@ class DetectionModel(BaseModel):
 
     @staticmethod
     def _descale_pred(p, flips, scale, img_size, dim=1):
-        """De-scale predictions following augmented inference (inverse operation)."""
+        """
+        De-scale predictions following augmented inference (inverse operation).
+
+        Args:
+            p (torch.Tensor): Predictions tensor.
+            flips (int): Flip type (0=none, 2=ud, 3=lr).
+            scale (float): Scale factor.
+            img_size (tuple): Original image size (height, width).
+            dim (int): Dimension to split at.
+
+        Returns:
+            (torch.Tensor): De-scaled predictions.
+        """
         p[:, :4] /= scale  # de-scale
         x, y, wh, cls = p.split((1, 1, 2, p.shape[dim] - 4), dim)
         if flips == 2:
@@ -373,7 +400,15 @@ class DetectionModel(BaseModel):
         return torch.cat((x, y, wh, cls), dim)
 
     def _clip_augmented(self, y):
-        """Clip YOLO augmented inference tails."""
+        """
+        Clip YOLO augmented inference tails.
+
+        Args:
+            y (List[torch.Tensor]): List of detection tensors.
+
+        Returns:
+            (List[torch.Tensor]): Clipped detection tensors.
+        """
         nl = self.model[-1].nl  # number of detection layers (P3-P5)
         g = sum(4**x for x in range(nl))  # grid points
         e = 1  # exclude layer count
@@ -392,7 +427,15 @@ class OBBModel(DetectionModel):
     """YOLO Oriented Bounding Box (OBB) model."""
 
     def __init__(self, cfg="yolo11n-obb.yaml", ch=3, nc=None, verbose=True):
-        """Initialize YOLO OBB model with given config and parameters."""
+        """
+        Initialize YOLO OBB model with given config and parameters.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
         super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
 
     def init_criterion(self):
@@ -404,7 +447,15 @@ class SegmentationModel(DetectionModel):
     """YOLO segmentation model."""
 
     def __init__(self, cfg="yolo11n-seg.yaml", ch=3, nc=None, verbose=True):
-        """Initialize YOLOv8 segmentation model with given config and parameters."""
+        """
+        Initialize YOLOv8 segmentation model with given config and parameters.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
         super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
 
     def init_criterion(self):
@@ -416,7 +467,16 @@ class PoseModel(DetectionModel):
     """YOLO pose model."""
 
     def __init__(self, cfg="yolo11n-pose.yaml", ch=3, nc=None, data_kpt_shape=(None, None), verbose=True):
-        """Initialize YOLOv8 Pose model."""
+        """
+        Initialize YOLOv8 Pose model.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            data_kpt_shape (tuple): Shape of keypoints data.
+            verbose (bool): Whether to display model information.
+        """
         if not isinstance(cfg, dict):
             cfg = yaml_model_load(cfg)  # load model YAML
         if any(data_kpt_shape) and list(data_kpt_shape) != list(cfg["kpt_shape"]):
@@ -433,12 +493,28 @@ class ClassificationModel(BaseModel):
     """YOLO classification model."""
 
     def __init__(self, cfg="yolo11n-cls.yaml", ch=3, nc=None, verbose=True):
-        """Init ClassificationModel with YAML, channels, number of classes, verbose flag."""
+        """
+        Initialize ClassificationModel with YAML, channels, number of classes, verbose flag.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
         super().__init__()
         self._from_yaml(cfg, ch, nc, verbose)
 
     def _from_yaml(self, cfg, ch, nc, verbose):
-        """Set YOLOv8 model configurations and define the model architecture."""
+        """
+        Set YOLOv8 model configurations and define the model architecture.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
         self.yaml = cfg if isinstance(cfg, dict) else yaml_model_load(cfg)  # cfg dict
 
         # Define model
@@ -455,7 +531,13 @@ class ClassificationModel(BaseModel):
 
     @staticmethod
     def reshape_outputs(model, nc):
-        """Update a TorchVision classification model to class count 'n' if required."""
+        """
+        Update a TorchVision classification model to class count 'nc' if required.
+
+        Args:
+            model (torch.nn.Module): Model to update.
+            nc (int): New number of classes.
+        """
         name, m = list((model.model if hasattr(model, "model") else model).named_children())[-1]  # last module
         if isinstance(m, Classify):  # YOLO Classify() head
             if m.linear.out_features != nc:
@@ -500,10 +582,10 @@ class RTDETRDetectionModel(DetectionModel):
         Initialize the RTDETRDetectionModel.
 
         Args:
-            cfg (str): Configuration file name or path.
+            cfg (str | dict): Configuration file name/path or dictionary.
             ch (int): Number of input channels.
-            nc (int, optional): Number of classes. Defaults to None.
-            verbose (bool, optional): Print additional information during initialization. Defaults to True.
+            nc (int, optional): Number of classes.
+            verbose (bool): Print additional information during initialization.
         """
         super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
 
@@ -519,7 +601,7 @@ class RTDETRDetectionModel(DetectionModel):
 
         Args:
             batch (dict): Dictionary containing image and label data.
-            preds (torch.Tensor, optional): Precomputed model predictions. Defaults to None.
+            preds (torch.Tensor, optional): Precomputed model predictions.
 
         Returns:
             (tuple): A tuple containing the total loss and main three losses in a tensor.
@@ -564,11 +646,11 @@ class RTDETRDetectionModel(DetectionModel):
 
         Args:
             x (torch.Tensor): The input tensor.
-            profile (bool, optional): If True, profile the computation time for each layer. Defaults to False.
-            visualize (bool, optional): If True, save feature maps for visualization. Defaults to False.
-            batch (dict, optional): Ground truth data for evaluation. Defaults to None.
-            augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.
-            embed (list, optional): A list of feature vectors/embeddings to return.
+            profile (bool): If True, profile the computation time for each layer.
+            visualize (bool): If True, save feature maps for visualization.
+            batch (dict, optional): Ground truth data for evaluation.
+            augment (bool): If True, perform data augmentation during inference.
+            embed (List, optional): A list of feature vectors/embeddings to return.
 
         Returns:
             (torch.Tensor): Model's output tensor.
@@ -596,13 +678,28 @@ class WorldModel(DetectionModel):
     """YOLOv8 World Model."""
 
     def __init__(self, cfg="yolov8s-world.yaml", ch=3, nc=None, verbose=True):
-        """Initialize YOLOv8 world model with given config and parameters."""
+        """
+        Initialize YOLOv8 world model with given config and parameters.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
         self.txt_feats = torch.randn(1, nc or 80, 512)  # features placeholder
         self.clip_model = None  # CLIP model placeholder
         super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
 
     def set_classes(self, text, batch=80, cache_clip_model=True):
-        """Set classes in advance so that model could do offline-inference without clip model."""
+        """
+        Set classes in advance so that model could do offline-inference without clip model.
+
+        Args:
+            text (List[str]): List of class names.
+            batch (int): Batch size for processing text tokens.
+            cache_clip_model (bool): Whether to cache the CLIP model.
+        """
         try:
             import clip
         except ImportError:
@@ -628,11 +725,11 @@ class WorldModel(DetectionModel):
 
         Args:
             x (torch.Tensor): The input tensor.
-            profile (bool, optional): If True, profile the computation time for each layer. Defaults to False.
-            visualize (bool, optional): If True, save feature maps for visualization. Defaults to False.
-            txt_feats (torch.Tensor): The text features, use it if it's given. Defaults to None.
-            augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.
-            embed (list, optional): A list of feature vectors/embeddings to return.
+            profile (bool): If True, profile the computation time for each layer.
+            visualize (bool): If True, save feature maps for visualization.
+            txt_feats (torch.Tensor, optional): The text features, use it if it's given.
+            augment (bool): If True, perform data augmentation during inference.
+            embed (List, optional): A list of feature vectors/embeddings to return.
 
         Returns:
             (torch.Tensor): Model's output tensor.
@@ -671,7 +768,7 @@ class WorldModel(DetectionModel):
 
         Args:
             batch (dict): Batch to compute loss on.
-            preds (torch.Tensor | List[torch.Tensor]): Predictions.
+            preds (torch.Tensor | List[torch.Tensor], optional): Predictions.
         """
         if not hasattr(self, "criterion"):
             self.criterion = self.init_criterion()
@@ -689,7 +786,18 @@ class Ensemble(torch.nn.ModuleList):
         super().__init__()
 
     def forward(self, x, augment=False, profile=False, visualize=False):
-        """Function generates the YOLO network's final layer."""
+        """
+        Generate the YOLO network's final layer.
+
+        Args:
+            x (torch.Tensor): Input tensor.
+            augment (bool): Whether to augment the input.
+            profile (bool): Whether to profile the model.
+            visualize (bool): Whether to visualize the features.
+
+        Returns:
+            (tuple): Tuple containing the concatenated predictions and None.
+        """
         y = [module(x, augment, profile, visualize)[0] for module in self]
         # y = torch.stack(y).max(0)[0]  # max ensemble
         # y = torch.stack(y).mean(0)  # mean ensemble
@@ -765,7 +873,16 @@ class SafeUnpickler(pickle.Unpickler):
     """Custom Unpickler that replaces unknown classes with SafeClass."""
 
     def find_class(self, module, name):
-        """Attempt to find a class, returning SafeClass if not among safe modules."""
+        """
+        Attempt to find a class, returning SafeClass if not among safe modules.
+
+        Args:
+            module (str): Module name.
+            name (str): Class name.
+
+        Returns:
+            (type): Found class or SafeClass.
+        """
         safe_modules = (
             "torch",
             "collections",
@@ -791,13 +908,13 @@ def torch_safe_load(weight, safe_only=False):
         weight (str): The file path of the PyTorch model.
         safe_only (bool): If True, replace unknown classes with SafeClass during loading.
 
+    Returns:
+        ckpt (dict): The loaded model checkpoint.
+        file (str): The loaded filename.
+
     Examples:
         >>> from ultralytics.nn.tasks import torch_safe_load
         >>> ckpt, file = torch_safe_load("path/to/best.pt", safe_only=True)
-
-    Returns:
-        ckpt (dict): The loaded model checkpoint.
-        file (str): The loaded filename
     """
     from ultralytics.utils.downloads import attempt_download_asset
 
@@ -858,7 +975,18 @@ def torch_safe_load(weight, safe_only=False):
 
 
 def attempt_load_weights(weights, device=None, inplace=True, fuse=False):
-    """Loads an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a."""
+    """
+    Load an ensemble of models weights=[a,b,c] or a single model weights=[a] or weights=a.
+
+    Args:
+        weights (str | List[str]): Model weights path(s).
+        device (torch.device, optional): Device to load model to.
+        inplace (bool): Whether to do inplace operations.
+        fuse (bool): Whether to fuse model.
+
+    Returns:
+        (torch.nn.Module): Loaded model.
+    """
     ensemble = Ensemble()
     for w in weights if isinstance(weights, list) else [weights]:
         ckpt, w = torch_safe_load(w)  # load ckpt
@@ -896,7 +1024,18 @@ def attempt_load_weights(weights, device=None, inplace=True, fuse=False):
 
 
 def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False):
-    """Loads a single model weights."""
+    """
+    Load the weights of a single model.
+
+    Args:
+        weight (str): Model weight path.
+        device (torch.device, optional): Device to load model to.
+        inplace (bool): Whether to do inplace operations.
+        fuse (bool): Whether to fuse model.
+
+    Returns:
+        (tuple): Tuple containing the model and checkpoint.
+    """
     ckpt, weight = torch_safe_load(weight)  # load ckpt
     args = {**DEFAULT_CFG_DICT, **(ckpt.get("train_args", {}))}  # combine model and default args, preferring model args
     model = (ckpt.get("ema") or ckpt["model"]).to(device).float()  # FP32 model
@@ -922,7 +1061,17 @@ def attempt_load_one_weight(weight, device=None, inplace=True, fuse=False):
 
 
 def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
-    """Parse a YOLO model.yaml dictionary into a PyTorch model."""
+    """
+    Parse a YOLO model.yaml dictionary into a PyTorch model.
+
+    Args:
+        d (dict): Model dictionary.
+        ch (int): Input channels.
+        verbose (bool): Whether to print model details.
+
+    Returns:
+        (tuple): Tuple containing the PyTorch model and sorted list of output layers.
+    """
     import ast
 
     # Args
@@ -1086,7 +1235,15 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
 
 
 def yaml_model_load(path):
-    """Load a YOLOv8 model from a YAML file."""
+    """
+    Load a YOLOv8 model from a YAML file.
+
+    Args:
+        path (str | Path): Path to the YAML file.
+
+    Returns:
+        (dict): Model dictionary.
+    """
     path = Path(path)
     if path.stem in (f"yolov{d}{x}6" for x in "nsmlx" for d in (5, 8)):
         new_stem = re.sub(r"(\d+)([nslmx])6(.+)?$", r"\1\2-p6\3", path.stem)
@@ -1103,15 +1260,13 @@ def yaml_model_load(path):
 
 def guess_model_scale(model_path):
     """
-    Takes a path to a YOLO model's YAML file as input and extracts the size character of the model's scale. The function
-    uses regular expression matching to find the pattern of the model scale in the YAML file name, which is denoted by
-    n, s, m, l, or x. The function returns the size character of the model scale as a string.
+    Extract the size character n, s, m, l, or x of the model's scale from the model path.
 
     Args:
         model_path (str | Path): The path to the YOLO model's YAML file.
 
     Returns:
-        (str): The size character of the model's scale, which can be n, s, m, l, or x.
+        (str): The size character of the model's scale (n, s, m, l, or x).
     """
     try:
         return re.search(r"yolo[v]?\d+([nslmx])", Path(model_path).stem).group(1)  # returns n, s, m, l, or x
@@ -1127,10 +1282,7 @@ def guess_model_task(model):
         model (torch.nn.Module | dict): PyTorch model or model configuration in YAML format.
 
     Returns:
-        (str): Task of the model ('detect', 'segment', 'classify', 'pose').
-
-    Raises:
-        SyntaxError: If the task of the model could not be determined.
+        (str): Task of the model ('detect', 'segment', 'classify', 'pose', 'obb').
     """
 
     def cfg2task(cfg):
diff --git a/ultralytics/utils/__init__.py b/ultralytics/utils/__init__.py
index 7373311795..b7ecf75051 100644
--- a/ultralytics/utils/__init__.py
+++ b/ultralytics/utils/__init__.py
@@ -304,17 +304,24 @@ def plt_settings(rcparams=None, backend="Agg"):
     """
     Decorator to temporarily set rc parameters and the backend for a plotting function.
 
-    Example:
-        decorator: @plt_settings({"font.size": 12})
-        context manager: with plt_settings({"font.size": 12}):
-
     Args:
-        rcparams (dict): Dictionary of rc parameters to set.
+        rcparams (dict, optional): Dictionary of rc parameters to set.
         backend (str, optional): Name of the backend to use. Defaults to 'Agg'.
 
     Returns:
-        (Callable): Decorated function with temporarily set rc parameters and backend. This decorator can be
-            applied to any function that needs to have specific matplotlib rc parameters and backend for its execution.
+        (Callable): Decorated function with temporarily set rc parameters and backend.
+
+    Examples:
+        >>> @plt_settings({"font.size": 12})
+        ... def plot_function():
+        ...     plt.figure()
+        ...     plt.plot([1, 2, 3])
+        ...     plt.show()
+
+        >>> with plt_settings({"font.size": 12}):
+        ...     plt.figure()
+        ...     plt.plot([1, 2, 3])
+        ...     plt.show()
     """
     if rcparams is None:
         rcparams = {"font.size": 11}
@@ -357,6 +364,9 @@ def set_logging(name="LOGGING_NAME", verbose=True):
         name (str): Name of the logger. Defaults to "LOGGING_NAME".
         verbose (bool): Flag to set logging level to INFO if True, ERROR otherwise. Defaults to True.
 
+    Returns:
+        (logging.Logger): Configured logger object.
+
     Examples:
         >>> set_logging(name="ultralytics", verbose=True)
         >>> logger = logging.getLogger("ultralytics")
@@ -376,7 +386,7 @@ def set_logging(name="LOGGING_NAME", verbose=True):
 
         class CustomFormatter(logging.Formatter):
             def format(self, record):
-                """Sets up logging with UTF-8 encoding and configurable verbosity."""
+                """Format log records with UTF-8 encoding for Windows compatibility."""
                 return emojis(super().format(record))
 
         try:
@@ -420,9 +430,10 @@ def emojis(string=""):
 
 class ThreadingLocked:
     """
-    A decorator class for ensuring thread-safe execution of a function or method. This class can be used as a decorator
-    to make sure that if the decorated function is called from multiple threads, only one thread at a time will be able
-    to execute the function.
+    A decorator class for ensuring thread-safe execution of a function or method.
+
+    This class can be used as a decorator to make sure that if the decorated function is called from multiple threads,
+    only one thread at a time will be able to execute the function.
 
     Attributes:
         lock (threading.Lock): A lock object used to manage access to the decorated function.
@@ -435,7 +446,7 @@ class ThreadingLocked:
     """
 
     def __init__(self):
-        """Initializes the decorator class for thread-safe execution of a function or method."""
+        """Initialize the decorator class with a threading lock."""
         self.lock = threading.Lock()
 
     def __call__(self, f):
@@ -536,8 +547,7 @@ DEFAULT_CFG = IterableSimpleNamespace(**DEFAULT_CFG_DICT)
 
 def read_device_model() -> str:
     """
-    Reads the device model information from the system and caches it for quick access. Used by is_jetson() and
-    is_raspberrypi().
+    Reads the device model information from the system and caches it for quick access.
 
     Returns:
         (str): Kernel release information.
@@ -619,7 +629,7 @@ def is_docker() -> bool:
 
 def is_raspberrypi() -> bool:
     """
-    Determines if the Python environment is running on a Raspberry Pi by checking the device model information.
+    Determines if the Python environment is running on a Raspberry Pi.
 
     Returns:
         (bool): True if running on a Raspberry Pi, False otherwise.
@@ -629,7 +639,7 @@ def is_raspberrypi() -> bool:
 
 def is_jetson() -> bool:
     """
-    Determines if the Python environment is running on an NVIDIA Jetson device by checking the device model information.
+    Determines if the Python environment is running on an NVIDIA Jetson device.
 
     Returns:
         (bool): True if running on an NVIDIA Jetson device, False otherwise.
@@ -709,8 +719,7 @@ def is_github_action_running() -> bool:
 
 def get_git_dir():
     """
-    Determines whether the current file is part of a git repository and if so, returns the repository root directory. If
-    the current file is not part of a git repository, returns None.
+    Determines whether the current file is part of a git repository and if so, returns the repository root directory.
 
     Returns:
         (Path | None): Git root directory if found or None if not found.
@@ -722,8 +731,7 @@ def get_git_dir():
 
 def is_git_dir():
     """
-    Determines whether the current file is part of a git repository. If the current file is not part of a git
-    repository, returns None.
+    Determines whether the current file is part of a git repository.
 
     Returns:
         (bool): True if current file is part of a git repository.
@@ -1004,8 +1012,10 @@ def threaded(func):
 
 def set_sentry():
     """
-    Initialize the Sentry SDK for error tracking and reporting. Only used if sentry_sdk package is installed and
-    sync=True in settings. Run 'yolo settings' to see and update settings.
+    Initialize the Sentry SDK for error tracking and reporting.
+
+    Only used if sentry_sdk package is installed and sync=True in settings. Run 'yolo settings' to see and update
+    settings.
 
     Conditions required to send errors (ALL conditions must be met or no errors will be reported):
         - sentry_sdk package is installed
@@ -1016,11 +1026,6 @@ def set_sentry():
         - running with rank -1 or 0
         - online environment
         - CLI used to run package (checked with 'yolo' as the name of the main CLI command)
-
-    The function also configures Sentry SDK to ignore KeyboardInterrupt and FileNotFoundError exceptions and to exclude
-    events with 'out of memory' in their exception message.
-
-    Additionally, the function sets custom tags and user information for Sentry events.
     """
     if (
         not SETTINGS["sync"]
diff --git a/ultralytics/utils/checks.py b/ultralytics/utils/checks.py
index a7ed3d9a77..4ce167f68d 100644
--- a/ultralytics/utils/checks.py
+++ b/ultralytics/utils/checks.py
@@ -182,10 +182,10 @@ def check_version(
     Args:
         current (str): Current version or package name to get version from.
         required (str): Required version or range (in pip-style format).
-        name (str, optional): Name to be used in warning message.
-        hard (bool, optional): If True, raise an AssertionError if the requirement is not met.
-        verbose (bool, optional): If True, print warning message if requirement is not met.
-        msg (str, optional): Extra message to display if verbose.
+        name (str): Name to be used in warning message.
+        hard (bool): If True, raise an AssertionError if the requirement is not met.
+        verbose (bool): If True, print warning message if requirement is not met.
+        msg (str): Extra message to display if verbose.
 
     Returns:
         (bool): True if requirement is met, False otherwise.
@@ -307,7 +307,7 @@ def check_font(font="Arial.ttf"):
         font (str): Path or name of font.
 
     Returns:
-        file (Path): Resolved font file path.
+        (Path): Resolved font file path.
     """
     from matplotlib import font_manager
 
diff --git a/ultralytics/utils/loss.py b/ultralytics/utils/loss.py
index 06267a2ed9..3945f0391a 100644
--- a/ultralytics/utils/loss.py
+++ b/ultralytics/utils/loss.py
@@ -26,7 +26,7 @@ class VarifocalLoss(nn.Module):
 
     @staticmethod
     def forward(pred_score, gt_score, label, alpha=0.75, gamma=2.0):
-        """Computes varfocal loss."""
+        """Compute varifocal loss between predictions and ground truth."""
         weight = alpha * pred_score.sigmoid().pow(gamma) * (1 - label) + gt_score * label
         with autocast(enabled=False):
             loss = (
@@ -41,12 +41,12 @@ class FocalLoss(nn.Module):
     """Wraps focal loss around existing loss_fcn(), i.e. criteria = FocalLoss(nn.BCEWithLogitsLoss(), gamma=1.5)."""
 
     def __init__(self):
-        """Initializer for FocalLoss class with no parameters."""
+        """Initialize FocalLoss class with no parameters."""
         super().__init__()
 
     @staticmethod
     def forward(pred, label, gamma=1.5, alpha=0.25):
-        """Calculates and updates confusion matrix for object detection/classification tasks."""
+        """Calculate focal loss with modulating factors for class imbalance."""
         loss = F.binary_cross_entropy_with_logits(pred, label, reduction="none")
         # p_t = torch.exp(-loss)
         # loss *= self.alpha * (1.000001 - p_t) ** self.gamma  # non-zero power for gradient stability
@@ -63,20 +63,15 @@ class FocalLoss(nn.Module):
 
 
 class DFLoss(nn.Module):
-    """Criterion class for computing DFL losses during training."""
+    """Criterion class for computing Distribution Focal Loss (DFL)."""
 
     def __init__(self, reg_max=16) -> None:
-        """Initialize the DFL module."""
+        """Initialize the DFL module with regularization maximum."""
         super().__init__()
         self.reg_max = reg_max
 
     def __call__(self, pred_dist, target):
-        """
-        Return sum of left and right DFL losses.
-
-        Distribution Focal Loss (DFL) proposed in Generalized Focal Loss
-        https://ieeexplore.ieee.org/document/9792391
-        """
+        """Return sum of left and right DFL losses from https://ieeexplore.ieee.org/document/9792391."""
         target = target.clamp_(0, self.reg_max - 1 - 0.01)
         tl = target.long()  # target left
         tr = tl + 1  # target right
@@ -89,7 +84,7 @@ class DFLoss(nn.Module):
 
 
 class BboxLoss(nn.Module):
-    """Criterion class for computing training losses during training."""
+    """Criterion class for computing training losses for bounding boxes."""
 
     def __init__(self, reg_max=16):
         """Initialize the BboxLoss module with regularization maximum and DFL settings."""
@@ -97,7 +92,7 @@ class BboxLoss(nn.Module):
         self.dfl_loss = DFLoss(reg_max) if reg_max > 1 else None
 
     def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
-        """IoU loss."""
+        """Compute IoU and DFL losses for bounding boxes."""
         weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)
         iou = bbox_iou(pred_bboxes[fg_mask], target_bboxes[fg_mask], xywh=False, CIoU=True)
         loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum
@@ -114,14 +109,14 @@ class BboxLoss(nn.Module):
 
 
 class RotatedBboxLoss(BboxLoss):
-    """Criterion class for computing training losses during training."""
+    """Criterion class for computing training losses for rotated bounding boxes."""
 
     def __init__(self, reg_max):
         """Initialize the BboxLoss module with regularization maximum and DFL settings."""
         super().__init__(reg_max)
 
     def forward(self, pred_dist, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask):
-        """IoU loss."""
+        """Compute IoU and DFL losses for rotated bounding boxes."""
         weight = target_scores.sum(-1)[fg_mask].unsqueeze(-1)
         iou = probiou(pred_bboxes[fg_mask], target_bboxes[fg_mask])
         loss_iou = ((1.0 - iou) * weight).sum() / target_scores_sum
@@ -138,15 +133,15 @@ class RotatedBboxLoss(BboxLoss):
 
 
 class KeypointLoss(nn.Module):
-    """Criterion class for computing training losses."""
+    """Criterion class for computing keypoint losses."""
 
     def __init__(self, sigmas) -> None:
-        """Initialize the KeypointLoss class."""
+        """Initialize the KeypointLoss class with keypoint sigmas."""
         super().__init__()
         self.sigmas = sigmas
 
     def forward(self, pred_kpts, gt_kpts, kpt_mask, area):
-        """Calculates keypoint loss factor and Euclidean distance loss for predicted and actual keypoints."""
+        """Calculate keypoint loss factor and Euclidean distance loss for keypoints."""
         d = (pred_kpts[..., 0] - gt_kpts[..., 0]).pow(2) + (pred_kpts[..., 1] - gt_kpts[..., 1]).pow(2)
         kpt_loss_factor = kpt_mask.shape[1] / (torch.sum(kpt_mask != 0, dim=1) + 1e-9)
         # e = d / (2 * (area * self.sigmas) ** 2 + 1e-9)  # from formula
@@ -155,10 +150,10 @@ class KeypointLoss(nn.Module):
 
 
 class v8DetectionLoss:
-    """Criterion class for computing training losses."""
+    """Criterion class for computing training losses for YOLOv8 object detection."""
 
     def __init__(self, model, tal_topk=10):  # model must be de-paralleled
-        """Initializes v8DetectionLoss with the model, defining model-related properties and BCE loss function."""
+        """Initialize v8DetectionLoss with model parameters and task-aligned assignment settings."""
         device = next(model.parameters()).device  # get model device
         h = model.args  # hyperparameters
 
@@ -178,7 +173,7 @@ class v8DetectionLoss:
         self.proj = torch.arange(m.reg_max, dtype=torch.float, device=device)
 
     def preprocess(self, targets, batch_size, scale_tensor):
-        """Preprocesses the target counts and matches with the input batch size to output a tensor."""
+        """Preprocess targets by converting to tensor format and scaling coordinates."""
         nl, ne = targets.shape
         if nl == 0:
             out = torch.zeros(batch_size, 0, ne - 1, device=self.device)
@@ -261,15 +256,15 @@ class v8DetectionLoss:
 
 
 class v8SegmentationLoss(v8DetectionLoss):
-    """Criterion class for computing training losses."""
+    """Criterion class for computing training losses for YOLOv8 segmentation."""
 
     def __init__(self, model):  # model must be de-paralleled
-        """Initializes the v8SegmentationLoss class, taking a de-paralleled model as argument."""
+        """Initialize the v8SegmentationLoss class with model parameters and mask overlap setting."""
         super().__init__(model)
         self.overlap = model.args.overlap_mask
 
     def __call__(self, preds, batch):
-        """Calculate and return the loss for the YOLO model."""
+        """Calculate and return the combined loss for detection and segmentation."""
         loss = torch.zeros(4, device=self.device)  # box, cls, dfl
         feats, pred_masks, proto = preds if len(preds) == 3 else preds[1]
         batch_size, _, mask_h, mask_w = proto.shape  # batch size, number of masks, mask height, mask width
@@ -444,10 +439,10 @@ class v8SegmentationLoss(v8DetectionLoss):
 
 
 class v8PoseLoss(v8DetectionLoss):
-    """Criterion class for computing training losses."""
+    """Criterion class for computing training losses for YOLOv8 pose estimation."""
 
     def __init__(self, model):  # model must be de-paralleled
-        """Initializes v8PoseLoss with model, sets keypoint variables and declares a keypoint loss instance."""
+        """Initialize v8PoseLoss with model parameters and keypoint-specific loss functions."""
         super().__init__(model)
         self.kpt_shape = model.model[-1].kpt_shape
         self.bce_pose = nn.BCEWithLogitsLoss()
@@ -457,7 +452,7 @@ class v8PoseLoss(v8DetectionLoss):
         self.keypoint_loss = KeypointLoss(sigmas=sigmas)
 
     def __call__(self, preds, batch):
-        """Calculate the total loss and detach it."""
+        """Calculate the total loss and detach it for pose estimation."""
         loss = torch.zeros(5, device=self.device)  # box, cls, dfl, kpt_location, kpt_visibility
         feats, pred_kpts = preds if isinstance(preds[0], list) else preds[1]
         pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
@@ -524,7 +519,7 @@ class v8PoseLoss(v8DetectionLoss):
 
     @staticmethod
     def kpts_decode(anchor_points, pred_kpts):
-        """Decodes predicted keypoints to image coordinates."""
+        """Decode predicted keypoints to image coordinates."""
         y = pred_kpts.clone()
         y[..., :2] *= 2.0
         y[..., 0] += anchor_points[:, [0]] - 0.5
@@ -599,7 +594,7 @@ class v8PoseLoss(v8DetectionLoss):
 
 
 class v8ClassificationLoss:
-    """Criterion class for computing training losses."""
+    """Criterion class for computing training losses for classification."""
 
     def __call__(self, preds, batch):
         """Compute the classification loss between predictions and true labels."""
@@ -613,13 +608,13 @@ class v8OBBLoss(v8DetectionLoss):
     """Calculates losses for object detection, classification, and box distribution in rotated YOLO models."""
 
     def __init__(self, model):
-        """Initializes v8OBBLoss with model, assigner, and rotated bbox loss; note model must be de-paralleled."""
+        """Initialize v8OBBLoss with model, assigner, and rotated bbox loss; model must be de-paralleled."""
         super().__init__(model)
         self.assigner = RotatedTaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0)
         self.bbox_loss = RotatedBboxLoss(self.reg_max).to(self.device)
 
     def preprocess(self, targets, batch_size, scale_tensor):
-        """Preprocesses the target counts and matches with the input batch size to output a tensor."""
+        """Preprocess targets for oriented bounding box detection."""
         if targets.shape[0] == 0:
             out = torch.zeros(batch_size, 0, 6, device=self.device)
         else:
@@ -636,7 +631,7 @@ class v8OBBLoss(v8DetectionLoss):
         return out
 
     def __call__(self, preds, batch):
-        """Calculate and return the loss for the YOLO model."""
+        """Calculate and return the loss for oriented bounding box detection."""
         loss = torch.zeros(3, device=self.device)  # box, cls, dfl
         feats, pred_angle = preds if isinstance(preds[0], list) else preds[1]
         batch_size = pred_angle.shape[0]  # batch size, number of masks, mask height, mask width
@@ -726,7 +721,7 @@ class v8OBBLoss(v8DetectionLoss):
 
 
 class E2EDetectLoss:
-    """Criterion class for computing training losses."""
+    """Criterion class for computing training losses for end-to-end detection."""
 
     def __init__(self, model):
         """Initialize E2EDetectLoss with one-to-many and one-to-one detection losses using the provided model."""
diff --git a/ultralytics/utils/metrics.py b/ultralytics/utils/metrics.py
index cf0e827d9e..5ad8b5a951 100644
--- a/ultralytics/utils/metrics.py
+++ b/ultralytics/utils/metrics.py
@@ -25,7 +25,7 @@ def bbox_ioa(box1, box2, iou=False, eps=1e-7):
         box1 (np.ndarray): A numpy array of shape (n, 4) representing n bounding boxes.
         box2 (np.ndarray): A numpy array of shape (m, 4) representing m bounding boxes.
         iou (bool): Calculate the standard IoU if True else return inter_area/box2_area.
-        eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
+        eps (float, optional): A small value to avoid division by zero.
 
     Returns:
         (np.ndarray): A numpy array of shape (n, m) representing the intersection over box2 area.
@@ -57,7 +57,7 @@ def box_iou(box1, box2, eps=1e-7):
     Args:
         box1 (torch.Tensor): A tensor of shape (N, 4) representing N bounding boxes.
         box2 (torch.Tensor): A tensor of shape (M, 4) representing M bounding boxes.
-        eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
+        eps (float, optional): A small value to avoid division by zero.
 
     Returns:
         (torch.Tensor): An NxM tensor containing the pairwise IoU values for every element in box1 and box2.
@@ -73,7 +73,7 @@ def box_iou(box1, box2, eps=1e-7):
 
 def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7):
     """
-    Calculates the Intersection over Union (IoU) between bounding boxes.
+    Calculate the Intersection over Union (IoU) between bounding boxes.
 
     This function supports various shapes for `box1` and `box2` as long as the last dimension is 4.
     For instance, you may pass tensors shaped like (4,), (N, 4), (B, N, 4), or (B, N, 1, 4).
@@ -84,11 +84,11 @@ def bbox_iou(box1, box2, xywh=True, GIoU=False, DIoU=False, CIoU=False, eps=1e-7
         box1 (torch.Tensor): A tensor representing one or more bounding boxes, with the last dimension being 4.
         box2 (torch.Tensor): A tensor representing one or more bounding boxes, with the last dimension being 4.
         xywh (bool, optional): If True, input boxes are in (x, y, w, h) format. If False, input boxes are in
-                               (x1, y1, x2, y2) format. Defaults to True.
-        GIoU (bool, optional): If True, calculate Generalized IoU. Defaults to False.
-        DIoU (bool, optional): If True, calculate Distance IoU. Defaults to False.
-        CIoU (bool, optional): If True, calculate Complete IoU. Defaults to False.
-        eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
+                               (x1, y1, x2, y2) format.
+        GIoU (bool, optional): If True, calculate Generalized IoU.
+        DIoU (bool, optional): If True, calculate Distance IoU.
+        CIoU (bool, optional): If True, calculate Complete IoU.
+        eps (float, optional): A small value to avoid division by zero.
 
     Returns:
         (torch.Tensor): IoU, GIoU, DIoU, or CIoU values depending on the specified flags.
@@ -143,7 +143,7 @@ def mask_iou(mask1, mask2, eps=1e-7):
                         product of image width and height.
         mask2 (torch.Tensor): A tensor of shape (M, n) where M is the number of predicted objects and n is the
                         product of image width and height.
-        eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
+        eps (float, optional): A small value to avoid division by zero.
 
     Returns:
         (torch.Tensor): A tensor of shape (N, M) representing masks IoU.
@@ -162,7 +162,7 @@ def kpt_iou(kpt1, kpt2, area, sigma, eps=1e-7):
         kpt2 (torch.Tensor): A tensor of shape (M, 17, 3) representing predicted keypoints.
         area (torch.Tensor): A tensor of shape (N,) representing areas from ground truth.
         sigma (list): A list containing 17 values representing keypoint scales.
-        eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
+        eps (float, optional): A small value to avoid division by zero.
 
     Returns:
         (torch.Tensor): A tensor of shape (N, M) representing keypoint similarities.
@@ -177,7 +177,7 @@ def kpt_iou(kpt1, kpt2, area, sigma, eps=1e-7):
 
 def _get_covariance_matrix(boxes):
     """
-    Generating covariance matrix from obbs.
+    Generate covariance matrix from oriented bounding boxes.
 
     Args:
         boxes (torch.Tensor): A tensor of shape (N, 5) representing rotated bounding boxes, with xywhr format.
@@ -199,20 +199,18 @@ def probiou(obb1, obb2, CIoU=False, eps=1e-7):
     """
     Calculate probabilistic IoU between oriented bounding boxes.
 
-    Implements the algorithm from https://arxiv.org/pdf/2106.06072v1.pdf.
-
     Args:
         obb1 (torch.Tensor): Ground truth OBBs, shape (N, 5), format xywhr.
         obb2 (torch.Tensor): Predicted OBBs, shape (N, 5), format xywhr.
-        CIoU (bool, optional): If True, calculate CIoU. Defaults to False.
-        eps (float, optional): Small value to avoid division by zero. Defaults to 1e-7.
+        CIoU (bool, optional): If True, calculate CIoU.
+        eps (float, optional): Small value to avoid division by zero.
 
     Returns:
         (torch.Tensor): OBB similarities, shape (N,).
 
-    Note:
-        OBB format: [center_x, center_y, width, height, rotation_angle].
-        If CIoU is True, returns CIoU instead of IoU.
+    Notes:
+        - OBB format: [center_x, center_y, width, height, rotation_angle].
+        - Implements the algorithm from https://arxiv.org/pdf/2106.06072v1.pdf.
     """
     x1, y1 = obb1[..., :2].split(1, dim=-1)
     x2, y2 = obb2[..., :2].split(1, dim=-1)
@@ -243,15 +241,18 @@ def probiou(obb1, obb2, CIoU=False, eps=1e-7):
 
 def batch_probiou(obb1, obb2, eps=1e-7):
     """
-    Calculate the prob IoU between oriented bounding boxes, https://arxiv.org/pdf/2106.06072v1.pdf.
+    Calculate the probabilistic IoU between oriented bounding boxes.
 
     Args:
         obb1 (torch.Tensor | np.ndarray): A tensor of shape (N, 5) representing ground truth obbs, with xywhr format.
         obb2 (torch.Tensor | np.ndarray): A tensor of shape (M, 5) representing predicted obbs, with xywhr format.
-        eps (float, optional): A small value to avoid division by zero. Defaults to 1e-7.
+        eps (float, optional): A small value to avoid division by zero.
 
     Returns:
         (torch.Tensor): A tensor of shape (N, M) representing obb similarities.
+
+    References:
+        https://arxiv.org/pdf/2106.06072v1.pdf
     """
     obb1 = torch.from_numpy(obb1) if isinstance(obb1, np.ndarray) else obb1
     obb2 = torch.from_numpy(obb2) if isinstance(obb2, np.ndarray) else obb2
@@ -277,16 +278,16 @@ def batch_probiou(obb1, obb2, eps=1e-7):
 
 def smooth_bce(eps=0.1):
     """
-    Computes smoothed positive and negative Binary Cross-Entropy targets.
-
-    This function calculates positive and negative label smoothing BCE targets based on a given epsilon value.
-    For implementation details, refer to https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441.
+    Compute smoothed positive and negative Binary Cross-Entropy targets.
 
     Args:
-        eps (float, optional): The epsilon value for label smoothing. Defaults to 0.1.
+        eps (float, optional): The epsilon value for label smoothing.
 
     Returns:
         (tuple): A tuple containing the positive and negative label smoothing BCE targets.
+
+    References:
+        https://github.com/ultralytics/yolov3/issues/238#issuecomment-598028441
     """
     return 1.0 - 0.5 * eps, 0.5 * eps
 
@@ -304,7 +305,15 @@ class ConfusionMatrix:
     """
 
     def __init__(self, nc, conf=0.25, iou_thres=0.45, task="detect"):
-        """Initialize attributes for the YOLO model."""
+        """
+        Initialize a ConfusionMatrix instance.
+
+        Args:
+            nc (int): Number of classes.
+            conf (float, optional): Confidence threshold for detections.
+            iou_thres (float, optional): IoU threshold for matching detections to ground truth.
+            task (str, optional): Type of task, either 'detect' or 'classify'.
+        """
         self.task = task
         self.matrix = np.zeros((nc + 1, nc + 1)) if self.task == "detect" else np.zeros((nc, nc))
         self.nc = nc  # number of classes
@@ -382,11 +391,16 @@ class ConfusionMatrix:
                 self.matrix[dc, self.nc] += 1  # predicted background
 
     def matrix(self):
-        """Returns the confusion matrix."""
+        """Return the confusion matrix."""
         return self.matrix
 
     def tp_fp(self):
-        """Returns true positives and false positives."""
+        """
+        Return true positives and false positives.
+
+        Returns:
+            (tuple): True positives and false positives.
+        """
         tp = self.matrix.diagonal()  # true positives
         fp = self.matrix.sum(1) - tp  # false positives
         # fn = self.matrix.sum(0) - tp  # false negatives (missed detections)
@@ -454,7 +468,17 @@ def smooth(y, f=0.05):
 
 @plt_settings()
 def plot_pr_curve(px, py, ap, save_dir=Path("pr_curve.png"), names={}, on_plot=None):
-    """Plots a precision-recall curve."""
+    """
+    Plot precision-recall curve.
+
+    Args:
+        px (np.ndarray): X values for the PR curve.
+        py (np.ndarray): Y values for the PR curve.
+        ap (np.ndarray): Average precision values.
+        save_dir (Path, optional): Path to save the plot.
+        names (dict, optional): Dictionary mapping class indices to class names.
+        on_plot (callable, optional): Function to call after plot is saved.
+    """
     fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
     py = np.stack(py, axis=1)
 
@@ -479,7 +503,18 @@ def plot_pr_curve(px, py, ap, save_dir=Path("pr_curve.png"), names={}, on_plot=N
 
 @plt_settings()
 def plot_mc_curve(px, py, save_dir=Path("mc_curve.png"), names={}, xlabel="Confidence", ylabel="Metric", on_plot=None):
-    """Plots a metric-confidence curve."""
+    """
+    Plot metric-confidence curve.
+
+    Args:
+        px (np.ndarray): X values for the metric-confidence curve.
+        py (np.ndarray): Y values for the metric-confidence curve.
+        save_dir (Path, optional): Path to save the plot.
+        names (dict, optional): Dictionary mapping class indices to class names.
+        xlabel (str, optional): X-axis label.
+        ylabel (str, optional): Y-axis label.
+        on_plot (callable, optional): Function to call after plot is saved.
+    """
     fig, ax = plt.subplots(1, 1, figsize=(9, 6), tight_layout=True)
 
     if 0 < len(names) < 21:  # display per-class legend if < 21 classes
@@ -538,33 +573,33 @@ def ap_per_class(
     tp, conf, pred_cls, target_cls, plot=False, on_plot=None, save_dir=Path(), names={}, eps=1e-16, prefix=""
 ):
     """
-    Computes the average precision per class for object detection evaluation.
+    Compute the average precision per class for object detection evaluation.
 
     Args:
         tp (np.ndarray): Binary array indicating whether the detection is correct (True) or not (False).
         conf (np.ndarray): Array of confidence scores of the detections.
         pred_cls (np.ndarray): Array of predicted classes of the detections.
         target_cls (np.ndarray): Array of true classes of the detections.
-        plot (bool, optional): Whether to plot PR curves or not. Defaults to False.
-        on_plot (func, optional): A callback to pass plots path and data when they are rendered. Defaults to None.
-        save_dir (Path, optional): Directory to save the PR curves. Defaults to an empty path.
-        names (dict, optional): Dict of class names to plot PR curves. Defaults to an empty tuple.
-        eps (float, optional): A small value to avoid division by zero. Defaults to 1e-16.
-        prefix (str, optional): A prefix string for saving the plot files. Defaults to an empty string.
+        plot (bool, optional): Whether to plot PR curves or not.
+        on_plot (callable, optional): A callback to pass plots path and data when they are rendered.
+        save_dir (Path, optional): Directory to save the PR curves.
+        names (dict, optional): Dict of class names to plot PR curves.
+        eps (float, optional): A small value to avoid division by zero.
+        prefix (str, optional): A prefix string for saving the plot files.
 
     Returns:
-        tp (np.ndarray): True positive counts at threshold given by max F1 metric for each class.Shape: (nc,).
-        fp (np.ndarray): False positive counts at threshold given by max F1 metric for each class. Shape: (nc,).
-        p (np.ndarray): Precision values at threshold given by max F1 metric for each class. Shape: (nc,).
-        r (np.ndarray): Recall values at threshold given by max F1 metric for each class. Shape: (nc,).
-        f1 (np.ndarray): F1-score values at threshold given by max F1 metric for each class. Shape: (nc,).
-        ap (np.ndarray): Average precision for each class at different IoU thresholds. Shape: (nc, 10).
-        unique_classes (np.ndarray): An array of unique classes that have data. Shape: (nc,).
-        p_curve (np.ndarray): Precision curves for each class. Shape: (nc, 1000).
-        r_curve (np.ndarray): Recall curves for each class. Shape: (nc, 1000).
-        f1_curve (np.ndarray): F1-score curves for each class. Shape: (nc, 1000).
-        x (np.ndarray): X-axis values for the curves. Shape: (1000,).
-        prec_values (np.ndarray): Precision values at mAP@0.5 for each class. Shape: (nc, 1000).
+        tp (np.ndarray): True positive counts at threshold given by max F1 metric for each class.
+        fp (np.ndarray): False positive counts at threshold given by max F1 metric for each class.
+        p (np.ndarray): Precision values at threshold given by max F1 metric for each class.
+        r (np.ndarray): Recall values at threshold given by max F1 metric for each class.
+        f1 (np.ndarray): F1-score values at threshold given by max F1 metric for each class.
+        ap (np.ndarray): Average precision for each class at different IoU thresholds.
+        unique_classes (np.ndarray): An array of unique classes that have data.
+        p_curve (np.ndarray): Precision curves for each class.
+        r_curve (np.ndarray): Recall curves for each class.
+        f1_curve (np.ndarray): F1-score curves for each class.
+        x (np.ndarray): X-axis values for the curves.
+        prec_values (np.ndarray): Precision values at mAP@0.5 for each class.
     """
     # Sort by objectness
     i = np.argsort(-conf)
@@ -651,7 +686,7 @@ class Metric(SimpleClass):
     """
 
     def __init__(self) -> None:
-        """Initializes a Metric instance for computing evaluation metrics for the YOLOv8 model."""
+        """Initialize a Metric instance for computing evaluation metrics for the YOLOv8 model."""
         self.p = []  # (nc, )
         self.r = []  # (nc, )
         self.f1 = []  # (nc, )
@@ -662,7 +697,7 @@ class Metric(SimpleClass):
     @property
     def ap50(self):
         """
-        Returns the Average Precision (AP) at an IoU threshold of 0.5 for all classes.
+        Return the Average Precision (AP) at an IoU threshold of 0.5 for all classes.
 
         Returns:
             (np.ndarray, list): Array of shape (nc,) with AP50 values per class, or an empty list if not available.
@@ -672,7 +707,7 @@ class Metric(SimpleClass):
     @property
     def ap(self):
         """
-        Returns the Average Precision (AP) at an IoU threshold of 0.5-0.95 for all classes.
+        Return the Average Precision (AP) at an IoU threshold of 0.5-0.95 for all classes.
 
         Returns:
             (np.ndarray, list): Array of shape (nc,) with AP50-95 values per class, or an empty list if not available.
@@ -682,7 +717,7 @@ class Metric(SimpleClass):
     @property
     def mp(self):
         """
-        Returns the Mean Precision of all classes.
+        Return the Mean Precision of all classes.
 
         Returns:
             (float): The mean precision of all classes.
@@ -692,7 +727,7 @@ class Metric(SimpleClass):
     @property
     def mr(self):
         """
-        Returns the Mean Recall of all classes.
+        Return the Mean Recall of all classes.
 
         Returns:
             (float): The mean recall of all classes.
@@ -702,7 +737,7 @@ class Metric(SimpleClass):
     @property
     def map50(self):
         """
-        Returns the mean Average Precision (mAP) at an IoU threshold of 0.5.
+        Return the mean Average Precision (mAP) at an IoU threshold of 0.5.
 
         Returns:
             (float): The mAP at an IoU threshold of 0.5.
@@ -712,7 +747,7 @@ class Metric(SimpleClass):
     @property
     def map75(self):
         """
-        Returns the mean Average Precision (mAP) at an IoU threshold of 0.75.
+        Return the mean Average Precision (mAP) at an IoU threshold of 0.75.
 
         Returns:
             (float): The mAP at an IoU threshold of 0.75.
@@ -722,7 +757,7 @@ class Metric(SimpleClass):
     @property
     def map(self):
         """
-        Returns the mean Average Precision (mAP) over IoU thresholds of 0.5 - 0.95 in steps of 0.05.
+        Return the mean Average Precision (mAP) over IoU thresholds of 0.5 - 0.95 in steps of 0.05.
 
         Returns:
             (float): The mAP over IoU thresholds of 0.5 - 0.95 in steps of 0.05.
@@ -730,41 +765,42 @@ class Metric(SimpleClass):
         return self.all_ap.mean() if len(self.all_ap) else 0.0
 
     def mean_results(self):
-        """Mean of results, return mp, mr, map50, map."""
+        """Return the mean results as a list: [mp, mr, map50, map]."""
         return [self.mp, self.mr, self.map50, self.map]
 
     def class_result(self, i):
-        """Class-aware result, return p[i], r[i], ap50[i], ap[i]."""
+        """Return the results for class index i as a tuple: (p[i], r[i], ap50[i], ap[i])."""
         return self.p[i], self.r[i], self.ap50[i], self.ap[i]
 
     @property
     def maps(self):
-        """MAP of each class."""
+        """Return mAP of each class."""
         maps = np.zeros(self.nc) + self.map
         for i, c in enumerate(self.ap_class_index):
             maps[c] = self.ap[i]
         return maps
 
     def fitness(self):
-        """Model fitness as a weighted combination of metrics."""
+        """Return model fitness as a weighted combination of metrics."""
         w = [0.0, 0.0, 0.1, 0.9]  # weights for [P, R, mAP@0.5, mAP@0.5:0.95]
         return (np.array(self.mean_results()) * w).sum()
 
     def update(self, results):
         """
-        Updates the evaluation metrics of the model with a new set of results.
+        Update the evaluation metrics with a new set of results.
 
         Args:
-            results (tuple): A tuple containing the following evaluation metrics:
-                - p (list): Precision for each class. Shape: (nc,).
-                - r (list): Recall for each class. Shape: (nc,).
-                - f1 (list): F1 score for each class. Shape: (nc,).
-                - all_ap (list): AP scores for all classes and all IoU thresholds. Shape: (nc, 10).
-                - ap_class_index (list): Index of class for each AP score. Shape: (nc,).
-
-        Side Effects:
-            Updates the class attributes `self.p`, `self.r`, `self.f1`, `self.all_ap`, and `self.ap_class_index` based
-            on the values provided in the `results` tuple.
+            results (tuple): A tuple containing evaluation metrics:
+                - p (list): Precision for each class.
+                - r (list): Recall for each class.
+                - f1 (list): F1 score for each class.
+                - all_ap (list): AP scores for all classes and all IoU thresholds.
+                - ap_class_index (list): Index of class for each AP score.
+                - p_curve (list): Precision curve for each class.
+                - r_curve (list): Recall curve for each class.
+                - f1_curve (list): F1 curve for each class.
+                - px (list): X values for the curves.
+                - prec_values (list): Precision values for each class.
         """
         (
             self.p,
@@ -781,12 +817,12 @@ class Metric(SimpleClass):
 
     @property
     def curves(self):
-        """Returns a list of curves for accessing specific metrics curves."""
+        """Return a list of curves for accessing specific metrics curves."""
         return []
 
     @property
     def curves_results(self):
-        """Returns a list of curves for accessing specific metrics curves."""
+        """Return a list of curve data entries, each as [x values, y values, x-axis label, y-axis label]."""
         return [
             [self.px, self.prec_values, "Recall", "Precision"],
             [self.px, self.f1_curve, "Confidence", "F1"],
@@ -797,36 +833,26 @@ class Metric(SimpleClass):
 
 class DetMetrics(SimpleClass):
     """
-    Utility class for computing detection metrics such as precision, recall, and mean average precision (mAP) of an
-    object detection model.
-
-    Args:
-        save_dir (Path): A path to the directory where the output plots will be saved. Defaults to current directory.
-        plot (bool): A flag that indicates whether to plot precision-recall curves for each class. Defaults to False.
-        names (dict of str): A dict of strings that represents the names of the classes. Defaults to an empty tuple.
+    Utility class for computing detection metrics such as precision, recall, and mean average precision (mAP).
 
     Attributes:
         save_dir (Path): A path to the directory where the output plots will be saved.
-        plot (bool): A flag that indicates whether to plot the precision-recall curves for each class.
-        names (dict of str): A dict of strings that represents the names of the classes.
-        box (Metric): An instance of the Metric class for storing the results of the detection metrics.
-        speed (dict): A dictionary for storing the execution time of different parts of the detection process.
-
-    Methods:
-        process(tp, conf, pred_cls, target_cls): Updates the metric results with the latest batch of predictions.
-        keys: Returns a list of keys for accessing the computed detection metrics.
-        mean_results: Returns a list of mean values for the computed detection metrics.
-        class_result(i): Returns a list of values for the computed detection metrics for a specific class.
-        maps: Returns a dictionary of mean average precision (mAP) values for different IoU thresholds.
-        fitness: Computes the fitness score based on the computed detection metrics.
-        ap_class_index: Returns a list of class indices sorted by their average precision (AP) values.
-        results_dict: Returns a dictionary that maps detection metric keys to their computed values.
-        curves: TODO
-        curves_results: TODO
+        plot (bool): A flag that indicates whether to plot precision-recall curves for each class.
+        names (dict): A dictionary of class names.
+        box (Metric): An instance of the Metric class for storing detection results.
+        speed (dict): A dictionary for storing execution times of different parts of the detection process.
+        task (str): The task type, set to 'detect'.
     """
 
     def __init__(self, save_dir=Path("."), plot=False, names={}) -> None:
-        """Initialize a DetMetrics instance with a save directory, plot flag, callback function, and class names."""
+        """
+        Initialize a DetMetrics instance with a save directory, plot flag, and class names.
+
+        Args:
+            save_dir (Path, optional): Directory to save plots.
+            plot (bool, optional): Whether to plot precision-recall curves.
+            names (dict, optional): Dictionary mapping class indices to names.
+        """
         self.save_dir = save_dir
         self.plot = plot
         self.names = names
@@ -835,7 +861,16 @@ class DetMetrics(SimpleClass):
         self.task = "detect"
 
     def process(self, tp, conf, pred_cls, target_cls, on_plot=None):
-        """Process predicted results for object detection and update metrics."""
+        """
+        Process predicted results for object detection and update metrics.
+
+        Args:
+            tp (np.ndarray): True positive array.
+            conf (np.ndarray): Confidence array.
+            pred_cls (np.ndarray): Predicted class indices array.
+            target_cls (np.ndarray): Target class indices array.
+            on_plot (callable, optional): Function to call after plots are generated.
+        """
         results = ap_per_class(
             tp,
             conf,
@@ -851,7 +886,7 @@ class DetMetrics(SimpleClass):
 
     @property
     def keys(self):
-        """Returns a list of keys for accessing specific metrics."""
+        """Return a list of keys for accessing specific metrics."""
         return ["metrics/precision(B)", "metrics/recall(B)", "metrics/mAP50(B)", "metrics/mAP50-95(B)"]
 
     def mean_results(self):
@@ -864,32 +899,32 @@ class DetMetrics(SimpleClass):
 
     @property
     def maps(self):
-        """Returns mean Average Precision (mAP) scores per class."""
+        """Return mean Average Precision (mAP) scores per class."""
         return self.box.maps
 
     @property
     def fitness(self):
-        """Returns the fitness of box object."""
+        """Return the fitness of box object."""
         return self.box.fitness()
 
     @property
     def ap_class_index(self):
-        """Returns the average precision index per class."""
+        """Return the average precision index per class."""
         return self.box.ap_class_index
 
     @property
     def results_dict(self):
-        """Returns dictionary of computed performance metrics and statistics."""
+        """Return dictionary of computed performance metrics and statistics."""
         return dict(zip(self.keys + ["fitness"], self.mean_results() + [self.fitness]))
 
     @property
     def curves(self):
-        """Returns a list of curves for accessing specific metrics curves."""
+        """Return a list of curves for accessing specific metrics curves."""
         return ["Precision-Recall(B)", "F1-Confidence(B)", "Precision-Confidence(B)", "Recall-Confidence(B)"]
 
     @property
     def curves_results(self):
-        """Returns dictionary of computed performance metrics and statistics."""
+        """Return the list of curve results computed for the box metrics."""
         return self.box.curves_results
 
 
@@ -897,31 +932,25 @@ class SegmentMetrics(SimpleClass):
     """
     Calculates and aggregates detection and segmentation metrics over a given set of classes.
 
-    Args:
-        save_dir (Path): Path to the directory where the output plots should be saved. Default is the current directory.
-        plot (bool): Whether to save the detection and segmentation plots. Default is False.
-        names (list): List of class names. Default is an empty list.
-
     Attributes:
         save_dir (Path): Path to the directory where the output plots should be saved.
         plot (bool): Whether to save the detection and segmentation plots.
-        names (list): List of class names.
+        names (dict): Dictionary of class names.
         box (Metric): An instance of the Metric class to calculate box detection metrics.
         seg (Metric): An instance of the Metric class to calculate mask segmentation metrics.
         speed (dict): Dictionary to store the time taken in different phases of inference.
-
-    Methods:
-        process(tp_m, tp_b, conf, pred_cls, target_cls): Processes metrics over the given set of predictions.
-        mean_results(): Returns the mean of the detection and segmentation metrics over all the classes.
-        class_result(i): Returns the detection and segmentation metrics of class `i`.
-        maps: Returns the mean Average Precision (mAP) scores for IoU thresholds ranging from 0.50 to 0.95.
-        fitness: Returns the fitness scores, which are a single weighted combination of metrics.
-        ap_class_index: Returns the list of indices of classes used to compute Average Precision (AP).
-        results_dict: Returns the dictionary containing all the detection and segmentation metrics and fitness score.
+        task (str): The task type, set to 'segment'.
     """
 
     def __init__(self, save_dir=Path("."), plot=False, names=()) -> None:
-        """Initialize a SegmentMetrics instance with a save directory, plot flag, callback function, and class names."""
+        """
+        Initialize a SegmentMetrics instance with a save directory, plot flag, and class names.
+
+        Args:
+            save_dir (Path, optional): Directory to save plots.
+            plot (bool, optional): Whether to plot precision-recall curves.
+            names (dict, optional): Dictionary mapping class indices to names.
+        """
         self.save_dir = save_dir
         self.plot = plot
         self.names = names
@@ -932,15 +961,15 @@ class SegmentMetrics(SimpleClass):
 
     def process(self, tp, tp_m, conf, pred_cls, target_cls, on_plot=None):
         """
-        Processes the detection and segmentation metrics over the given set of predictions.
+        Process the detection and segmentation metrics over the given set of predictions.
 
         Args:
-            tp (list): List of True Positive boxes.
-            tp_m (list): List of True Positive masks.
-            conf (list): List of confidence scores.
-            pred_cls (list): List of predicted classes.
-            target_cls (list): List of target classes.
-            on_plot (func): An optional callback to pass plots path and data when they are rendered. Defaults to None.
+            tp (np.ndarray): True positive array for boxes.
+            tp_m (np.ndarray): True positive array for masks.
+            conf (np.ndarray): Confidence array.
+            pred_cls (np.ndarray): Predicted class indices array.
+            target_cls (np.ndarray): Target class indices array.
+            on_plot (callable, optional): Function to call after plots are generated.
         """
         results_mask = ap_per_class(
             tp_m,
@@ -971,7 +1000,7 @@ class SegmentMetrics(SimpleClass):
 
     @property
     def keys(self):
-        """Returns a list of keys for accessing metrics."""
+        """Return a list of keys for accessing metrics."""
         return [
             "metrics/precision(B)",
             "metrics/recall(B)",
@@ -988,32 +1017,36 @@ class SegmentMetrics(SimpleClass):
         return self.box.mean_results() + self.seg.mean_results()
 
     def class_result(self, i):
-        """Returns classification results for a specified class index."""
+        """Return the box and mask results (p, r, ap50, ap) for a specified class index."""
         return self.box.class_result(i) + self.seg.class_result(i)
 
     @property
     def maps(self):
-        """Returns mAP scores for object detection and semantic segmentation models."""
+        """Return the per-class mAP scores of the box and mask metrics, summed element-wise."""
         return self.box.maps + self.seg.maps
 
     @property
     def fitness(self):
-        """Get the fitness score for both segmentation and bounding box models."""
+        """Return the fitness score for both segmentation and bounding box models."""
         return self.seg.fitness() + self.box.fitness()
 
     @property
     def ap_class_index(self):
-        """Boxes and masks have the same ap_class_index."""
+        """
+        Return the class indices.
+
+        Boxes and masks have the same ap_class_index.
+        """
         return self.box.ap_class_index
 
     @property
     def results_dict(self):
-        """Returns results of object detection model for evaluation."""
+        """Return a dictionary mapping each metric key, plus 'fitness', to its computed value."""
         return dict(zip(self.keys + ["fitness"], self.mean_results() + [self.fitness]))
 
     @property
     def curves(self):
-        """Returns a list of curves for accessing specific metrics curves."""
+        """Return a list of curves for accessing specific metrics curves."""
         return [
             "Precision-Recall(B)",
             "F1-Confidence(B)",
@@ -1027,7 +1060,7 @@ class SegmentMetrics(SimpleClass):
 
     @property
     def curves_results(self):
-        """Returns dictionary of computed performance metrics and statistics."""
+        """Return the box curve results followed by the mask curve results as a single list."""
         return self.box.curves_results + self.seg.curves_results
 
 
@@ -1035,18 +1068,14 @@ class PoseMetrics(SegmentMetrics):
     """
     Calculates and aggregates detection and pose metrics over a given set of classes.
 
-    Args:
-        save_dir (Path): Path to the directory where the output plots should be saved. Default is the current directory.
-        plot (bool): Whether to save the detection and segmentation plots. Default is False.
-        names (list): List of class names. Default is an empty list.
-
     Attributes:
         save_dir (Path): Path to the directory where the output plots should be saved.
-        plot (bool): Whether to save the detection and segmentation plots.
-        names (list): List of class names.
+        plot (bool): Whether to save the detection and pose plots.
+        names (dict): Dictionary of class names.
         box (Metric): An instance of the Metric class to calculate box detection metrics.
-        pose (Metric): An instance of the Metric class to calculate mask segmentation metrics.
+        pose (Metric): An instance of the Metric class to calculate pose metrics.
         speed (dict): Dictionary to store the time taken in different phases of inference.
+        task (str): The task type, set to 'pose'.
 
     Methods:
         process(tp_m, tp_b, conf, pred_cls, target_cls): Processes metrics over the given set of predictions.
@@ -1059,7 +1088,14 @@ class PoseMetrics(SegmentMetrics):
     """
 
     def __init__(self, save_dir=Path("."), plot=False, names=()) -> None:
-        """Initialize the PoseMetrics class with directory path, class names, and plotting options."""
+        """
+        Initialize the PoseMetrics class with directory path, class names, and plotting options.
+
+        Args:
+            save_dir (Path, optional): Directory to save plots.
+            plot (bool, optional): Whether to plot precision-recall curves.
+            names (dict, optional): Dictionary mapping class indices to names.
+        """
         super().__init__(save_dir, plot, names)
         self.save_dir = save_dir
         self.plot = plot
@@ -1071,15 +1107,15 @@ class PoseMetrics(SegmentMetrics):
 
     def process(self, tp, tp_p, conf, pred_cls, target_cls, on_plot=None):
         """
-        Processes the detection and pose metrics over the given set of predictions.
+        Process the detection and pose metrics over the given set of predictions.
 
         Args:
-            tp (list): List of True Positive boxes.
-            tp_p (list): List of True Positive keypoints.
-            conf (list): List of confidence scores.
-            pred_cls (list): List of predicted classes.
-            target_cls (list): List of target classes.
-            on_plot (func): An optional callback to pass plots path and data when they are rendered. Defaults to None.
+            tp (np.ndarray): True positive array for boxes.
+            tp_p (np.ndarray): True positive array for keypoints.
+            conf (np.ndarray): Confidence array.
+            pred_cls (np.ndarray): Predicted class indices array.
+            target_cls (np.ndarray): Target class indices array.
+            on_plot (callable, optional): Function to call after plots are generated.
         """
         results_pose = ap_per_class(
             tp_p,
@@ -1110,7 +1146,7 @@ class PoseMetrics(SegmentMetrics):
 
     @property
     def keys(self):
-        """Returns list of evaluation metric keys."""
+        """Return list of evaluation metric keys."""
         return [
             "metrics/precision(B)",
             "metrics/recall(B)",
@@ -1132,17 +1168,17 @@ class PoseMetrics(SegmentMetrics):
 
     @property
     def maps(self):
-        """Returns the mean average precision (mAP) per class for both box and pose detections."""
+        """Return the mean average precision (mAP) per class for both box and pose detections."""
         return self.box.maps + self.pose.maps
 
     @property
     def fitness(self):
-        """Computes classification metrics and speed using the `targets` and `pred` inputs."""
+        """Return combined fitness score for pose and box detection."""
         return self.pose.fitness() + self.box.fitness()
 
     @property
     def curves(self):
-        """Returns a list of curves for accessing specific metrics curves."""
+        """Return a list of curves for accessing specific metrics curves."""
         return [
             "Precision-Recall(B)",
             "F1-Confidence(B)",
@@ -1156,7 +1192,7 @@ class PoseMetrics(SegmentMetrics):
 
     @property
     def curves_results(self):
-        """Returns dictionary of computed performance metrics and statistics."""
+        """Return dictionary of computed performance metrics and statistics."""
         return self.box.curves_results + self.pose.curves_results
 
 
@@ -1167,13 +1203,8 @@ class ClassifyMetrics(SimpleClass):
     Attributes:
         top1 (float): The top-1 accuracy.
         top5 (float): The top-5 accuracy.
-        speed (Dict[str, float]): A dictionary containing the time taken for each step in the pipeline.
-        fitness (float): The fitness of the model, which is equal to top-5 accuracy.
-        results_dict (Dict[str, Union[float, str]]): A dictionary containing the classification metrics and fitness.
-        keys (List[str]): A list of keys for the results_dict.
-
-    Methods:
-        process(targets, pred): Processes the targets and predictions to compute classification metrics.
+        speed (dict): A dictionary containing the time taken for each step in the pipeline.
+        task (str): The task type, set to 'classify'.
     """
 
     def __init__(self) -> None:
@@ -1184,7 +1215,13 @@ class ClassifyMetrics(SimpleClass):
         self.task = "classify"
 
     def process(self, targets, pred):
-        """Target classes and predicted classes."""
+        """
+        Process target classes and predicted classes to compute metrics.
+
+        Args:
+            targets (List[torch.Tensor]): List of target class tensors, concatenated before scoring.
+            pred (List[torch.Tensor]): List of predicted class tensors, concatenated before scoring.
+        """
         pred, targets = torch.cat(pred), torch.cat(targets)
         correct = (targets[:, None] == pred).float()
         acc = torch.stack((correct[:, 0], correct.max(1).values), dim=1)  # (top1, top5) accuracy
@@ -1192,35 +1229,54 @@ class ClassifyMetrics(SimpleClass):
 
     @property
     def fitness(self):
-        """Returns mean of top-1 and top-5 accuracies as fitness score."""
+        """Return mean of top-1 and top-5 accuracies as fitness score."""
         return (self.top1 + self.top5) / 2
 
     @property
     def results_dict(self):
-        """Returns a dictionary with model's performance metrics and fitness score."""
+        """Return a dictionary with model's performance metrics and fitness score."""
         return dict(zip(self.keys + ["fitness"], [self.top1, self.top5, self.fitness]))
 
     @property
     def keys(self):
-        """Returns a list of keys for the results_dict property."""
+        """Return a list of keys for the results_dict property."""
         return ["metrics/accuracy_top1", "metrics/accuracy_top5"]
 
     @property
     def curves(self):
-        """Returns a list of curves for accessing specific metrics curves."""
+        """Return a list of curves for accessing specific metrics curves."""
         return []
 
     @property
     def curves_results(self):
-        """Returns a list of curves for accessing specific metrics curves."""
+        """Return a list of computed curve results (always empty for classification metrics)."""
         return []
 
 
 class OBBMetrics(SimpleClass):
-    """Metrics for evaluating oriented bounding box (OBB) detection, see https://arxiv.org/pdf/2106.06072.pdf."""
+    """
+    Metrics for evaluating oriented bounding box (OBB) detection.
+
+    Attributes:
+        save_dir (Path): Path to the directory where the output plots should be saved.
+        plot (bool): Whether to save the detection plots.
+        names (dict): Dictionary of class names.
+        box (Metric): An instance of the Metric class for storing detection results.
+        speed (dict): A dictionary for storing execution times of different parts of the detection process.
+
+    References:
+        https://arxiv.org/pdf/2106.06072
+    """
 
     def __init__(self, save_dir=Path("."), plot=False, names=()) -> None:
-        """Initialize an OBBMetrics instance with directory, plotting, callback, and class names."""
+        """
+        Initialize an OBBMetrics instance with directory, plotting, and class names.
+
+        Args:
+            save_dir (Path, optional): Directory to save plots.
+            plot (bool, optional): Whether to plot precision-recall curves.
+            names (dict, optional): Dictionary mapping class indices to names.
+        """
         self.save_dir = save_dir
         self.plot = plot
         self.names = names
@@ -1228,7 +1284,16 @@ class OBBMetrics(SimpleClass):
         self.speed = {"preprocess": 0.0, "inference": 0.0, "loss": 0.0, "postprocess": 0.0}
 
     def process(self, tp, conf, pred_cls, target_cls, on_plot=None):
-        """Process predicted results for object detection and update metrics."""
+        """
+        Process predicted results for object detection and update metrics.
+
+        Args:
+            tp (np.ndarray): True positive array.
+            conf (np.ndarray): Confidence array.
+            pred_cls (np.ndarray): Predicted class indices array.
+            target_cls (np.ndarray): Target class indices array.
+            on_plot (callable, optional): Function to call after plots are generated.
+        """
         results = ap_per_class(
             tp,
             conf,
@@ -1244,7 +1309,7 @@ class OBBMetrics(SimpleClass):
 
     @property
     def keys(self):
-        """Returns a list of keys for accessing specific metrics."""
+        """Return a list of keys for accessing specific metrics."""
         return ["metrics/precision(B)", "metrics/recall(B)", "metrics/mAP50(B)", "metrics/mAP50-95(B)"]
 
     def mean_results(self):
@@ -1257,30 +1322,30 @@ class OBBMetrics(SimpleClass):
 
     @property
     def maps(self):
-        """Returns mean Average Precision (mAP) scores per class."""
+        """Return mean Average Precision (mAP) scores per class."""
         return self.box.maps
 
     @property
     def fitness(self):
-        """Returns the fitness of box object."""
+        """Return the fitness of box object."""
         return self.box.fitness()
 
     @property
     def ap_class_index(self):
-        """Returns the average precision index per class."""
+        """Return the average precision index per class."""
         return self.box.ap_class_index
 
     @property
     def results_dict(self):
-        """Returns dictionary of computed performance metrics and statistics."""
+        """Return dictionary of computed performance metrics and statistics."""
         return dict(zip(self.keys + ["fitness"], self.mean_results() + [self.fitness]))
 
     @property
     def curves(self):
-        """Returns a list of curves for accessing specific metrics curves."""
+        """Return a list of curves for accessing specific metrics curves."""
         return []
 
     @property
     def curves_results(self):
-        """Returns a list of curves for accessing specific metrics curves."""
+        """Return a list of computed curve results (always empty for OBB metrics)."""
         return []
diff --git a/ultralytics/utils/ops.py b/ultralytics/utils/ops.py
index 6b351a108f..4744eca3d1 100644
--- a/ultralytics/utils/ops.py
+++ b/ultralytics/utils/ops.py
@@ -18,6 +18,11 @@ class Profile(contextlib.ContextDecorator):
     """
     YOLOv8 Profile class. Use as a decorator with @Profile() or as a context manager with 'with Profile():'.
 
+    Attributes:
+        t (float): Accumulated time.
+        device (torch.device): Device used for model inference.
+        cuda (bool): Whether CUDA is being used.
+
     Examples:
         >>> from ultralytics.utils.ops import Profile
         >>> with Profile(device=device) as dt:
@@ -30,8 +35,8 @@ class Profile(contextlib.ContextDecorator):
         Initialize the Profile class.
 
         Args:
-            t (float): Initial time. Defaults to 0.0.
-            device (torch.device): Devices used for model inference. Defaults to None (cpu).
+            t (float): Initial time.
+            device (torch.device): Device used for model inference.
         """
         self.t = t
         self.device = device
@@ -63,12 +68,12 @@ def segment2box(segment, width=640, height=640):
     Convert 1 segment label to 1 box label, applying inside-image constraint, i.e. (xy1, xy2, ...) to (xyxy).
 
     Args:
-        segment (torch.Tensor): the segment label
-        width (int): the width of the image. Defaults to 640
-        height (int): The height of the image. Defaults to 640
+        segment (torch.Tensor): The segment label.
+        width (int): The width of the image.
+        height (int): The height of the image.
 
     Returns:
-        (np.ndarray): the minimum and maximum x and y values of the segment.
+        (np.ndarray): The minimum and maximum x and y values of the segment.
     """
     x, y = segment.T  # segment xy
     # any 3 out of 4 sides are outside the image, clip coordinates first, https://github.com/ultralytics/ultralytics/pull/18294
@@ -87,21 +92,20 @@ def segment2box(segment, width=640, height=640):
 
 def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None, padding=True, xywh=False):
     """
-    Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
-    specified in (img1_shape) to the shape of a different image (img0_shape).
+    Rescale bounding boxes from img1_shape to img0_shape.
 
     Args:
         img1_shape (tuple): The shape of the image that the bounding boxes are for, in the format of (height, width).
-        boxes (torch.Tensor): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2)
-        img0_shape (tuple): the shape of the target image, in the format of (height, width).
-        ratio_pad (tuple): a tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
+        boxes (torch.Tensor): The bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2).
+        img0_shape (tuple): The shape of the target image, in the format of (height, width).
+        ratio_pad (tuple): A tuple of (ratio, pad) for scaling the boxes. If not provided, the ratio and pad will be
             calculated based on the size difference between the two images.
         padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
             rescaling.
-        xywh (bool): The box format is xywh or not, default=False.
+        xywh (bool): The box format is xywh or not.
 
     Returns:
-        boxes (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
+        (torch.Tensor): The scaled bounding boxes, in the format of (x1, y1, x2, y2).
     """
     if ratio_pad is None:  # calculate from img0_shape
         gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain  = old / new
@@ -146,8 +150,8 @@ def nms_rotated(boxes, scores, threshold=0.45, use_triu=True):
     Args:
         boxes (torch.Tensor): Rotated bounding boxes, shape (N, 5), format xywhr.
         scores (torch.Tensor): Confidence scores, shape (N,).
-        threshold (float, optional): IoU threshold. Defaults to 0.45.
-        use_triu (bool, optional): Whether to use `torch.triu` operator. It'd be useful for disable it
+        threshold (float): IoU threshold.
+        use_triu (bool): Whether to use `torch.triu` operator. It is useful to disable it
             when exporting obb models to some formats that do not support `torch.triu`.
 
     Returns:
@@ -210,7 +214,7 @@ def non_max_suppression(
             list contains the apriori labels for a given image. The list should be in the format
             output by a dataloader, with each label being a tuple of (class_index, x1, y1, x2, y2).
         max_det (int): The maximum number of boxes to keep after NMS.
-        nc (int, optional): The number of classes output by the model. Any indices after this will be considered masks.
+        nc (int): The number of classes output by the model. Any indices after this will be considered masks.
         max_time_img (float): The maximum time (seconds) for processing one image.
         max_nms (int): The maximum number of boxes into torchvision.ops.nms().
         max_wh (int): The maximum box width and height in pixels.
@@ -333,7 +337,7 @@ def clip_boxes(boxes, shape):
     Takes a list of bounding boxes and a shape (height, width) and clips the bounding boxes to the shape.
 
     Args:
-        boxes (torch.Tensor): The bounding boxes to clip.
+        boxes (torch.Tensor | numpy.ndarray): The bounding boxes to clip.
         shape (tuple): The shape of the image.
 
     Returns:
@@ -359,7 +363,7 @@ def clip_coords(coords, shape):
         shape (tuple): A tuple of integers representing the size of the image in the format (height, width).
 
     Returns:
-        (torch.Tensor | numpy.ndarray): Clipped coordinates
+        (torch.Tensor | numpy.ndarray): Clipped coordinates.
     """
     if isinstance(coords, torch.Tensor):  # faster individually (WARNING: inplace .clamp_() Apple MPS bug)
         coords[..., 0] = coords[..., 0].clamp(0, shape[1])  # x
@@ -451,10 +455,11 @@ def xywhn2xyxy(x, w=640, h=640, padw=0, padh=0):
 
     Args:
         x (np.ndarray | torch.Tensor): The bounding box coordinates.
-        w (int): Width of the image. Defaults to 640
-        h (int): Height of the image. Defaults to 640
-        padw (int): Padding width. Defaults to 0
-        padh (int): Padding height. Defaults to 0
+        w (int): Width of the image.
+        h (int): Height of the image.
+        padw (int): Padding width.
+        padh (int): Padding height.
+
     Returns:
         y (np.ndarray | torch.Tensor): The coordinates of the bounding box in the format [x1, y1, x2, y2] where
             x1,y1 is the top-left corner, x2,y2 is the bottom-right corner of the bounding box.
@@ -475,10 +480,10 @@ def xyxy2xywhn(x, w=640, h=640, clip=False, eps=0.0):
 
     Args:
         x (np.ndarray | torch.Tensor): The input bounding box coordinates in (x1, y1, x2, y2) format.
-        w (int): The width of the image. Defaults to 640
-        h (int): The height of the image. Defaults to 640
-        clip (bool): If True, the boxes will be clipped to the image boundaries. Defaults to False
-        eps (float): The minimum value of the box's width and height. Defaults to 0.0
+        w (int): The width of the image.
+        h (int): The height of the image.
+        clip (bool): If True, the boxes will be clipped to the image boundaries.
+        eps (float): The minimum value of the box's width and height.
 
     Returns:
         y (np.ndarray | torch.Tensor): The bounding box coordinates in (x, y, width, height, normalized) format
@@ -598,13 +603,13 @@ def xywhr2xyxyxyxy(x):
 
 def ltwh2xyxy(x):
     """
-    It converts the bounding box from [x1, y1, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right.
+    Convert bounding box from [x1, y1, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right.
 
     Args:
-        x (np.ndarray | torch.Tensor): the input image
+        x (np.ndarray | torch.Tensor): The input bounding boxes in [x1, y1, w, h] format.
 
     Returns:
-        y (np.ndarray | torch.Tensor): the xyxy coordinates of the bounding boxes.
+        (np.ndarray | torch.Tensor): The xyxy coordinates of the bounding boxes.
     """
     y = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
     y[..., 2] = x[..., 2] + x[..., 0]  # width
@@ -614,13 +619,13 @@ def ltwh2xyxy(x):
 
 def segments2boxes(segments):
     """
-    It converts segment labels to box labels, i.e. (cls, xy1, xy2, ...) to (cls, xywh).
+    Convert segment labels to box labels, i.e. (cls, xy1, xy2, ...) to (cls, xywh).
 
     Args:
-        segments (list): list of segments, each segment is a list of points, each point is a list of x, y coordinates
+        segments (List): List of segments, each segment is a list of points, each point is a list of x, y coordinates.
 
     Returns:
-        (np.ndarray): the xywh coordinates of the bounding boxes.
+        (np.ndarray): The xywh coordinates of the bounding boxes.
     """
     boxes = []
     for s in segments:
@@ -634,11 +639,11 @@ def resample_segments(segments, n=1000):
     Inputs a list of segments (n,2) and returns a list of segments (n,2) up-sampled to n points each.
 
     Args:
-        segments (list): a list of (n,2) arrays, where n is the number of points in the segment.
-        n (int): number of points to resample the segment to. Defaults to 1000
+        segments (List): A list of (n,2) arrays, where n is the number of points in the segment.
+        n (int): Number of points to resample the segment to.
 
     Returns:
-        segments (list): the resampled segments.
+        (List): The resampled segments.
     """
     for i, s in enumerate(segments):
         if len(s) == n:
@@ -655,14 +660,14 @@ def resample_segments(segments, n=1000):
 
 def crop_mask(masks, boxes):
     """
-    It takes a mask and a bounding box, and returns a mask that is cropped to the bounding box.
+    Crop masks to bounding boxes.
 
     Args:
-        masks (torch.Tensor): [n, h, w] tensor of masks
-        boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form
+        masks (torch.Tensor): [n, h, w] tensor of masks.
+        boxes (torch.Tensor): [n, 4] tensor of bbox coordinates in relative point form.
 
     Returns:
-        (torch.Tensor): The masks are being cropped to the bounding box.
+        (torch.Tensor): Cropped masks.
     """
     _, h, w = masks.shape
     x1, y1, x2, y2 = torch.chunk(boxes[:, :, None], 4, 1)  # x1 shape(n,1,1)
@@ -681,7 +686,7 @@ def process_mask(protos, masks_in, bboxes, shape, upsample=False):
         masks_in (torch.Tensor): A tensor of shape [n, mask_dim], where n is the number of masks after NMS.
         bboxes (torch.Tensor): A tensor of shape [n, 4], where n is the number of masks after NMS.
         shape (tuple): A tuple of integers representing the size of the input image in the format (h, w).
-        upsample (bool): A flag to indicate whether to upsample the mask to the original image size. Default is False.
+        upsample (bool): A flag to indicate whether to upsample the mask to the original image size.
 
     Returns:
         (torch.Tensor): A binary mask tensor of shape [n, h, w], where n is the number of masks after NMS, and h and w
@@ -707,16 +712,16 @@ def process_mask(protos, masks_in, bboxes, shape, upsample=False):
 
 def process_mask_native(protos, masks_in, bboxes, shape):
     """
-    It takes the output of the mask head, and crops it after upsampling to the bounding boxes.
+    Apply masks to bounding boxes using the output of the mask head with native upsampling.
 
     Args:
-        protos (torch.Tensor): [mask_dim, mask_h, mask_w]
+        protos (torch.Tensor): [mask_dim, mask_h, mask_w].
         masks_in (torch.Tensor): [n, mask_dim], n is number of masks after nms.
         bboxes (torch.Tensor): [n, 4], n is number of masks after nms.
         shape (tuple): The size of the input image (h,w).
 
     Returns:
-        masks (torch.Tensor): The returned masks with dimensions [h, w, n].
+        (torch.Tensor): The returned masks with dimensions [h, w, n].
     """
     c, mh, mw = protos.shape  # CHW
     masks = (masks_in @ protos.float().view(c, -1)).view(-1, mh, mw)
@@ -734,6 +739,9 @@ def scale_masks(masks, shape, padding=True):
         shape (tuple): Height and width.
         padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
             rescaling.
+
+    Returns:
+        (torch.Tensor): Rescaled masks.
     """
     mh, mw = masks.shape[2:]
     gain = min(mh / shape[0], mw / shape[1])  # gain  = old / new
@@ -755,10 +763,10 @@ def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None, normalize=False
 
     Args:
         img1_shape (tuple): The shape of the image that the coords are from.
-        coords (torch.Tensor): the coords to be scaled of shape n,2.
-        img0_shape (tuple): the shape of the image that the segmentation is being applied to.
-        ratio_pad (tuple): the ratio of the image size to the padded image size.
-        normalize (bool): If True, the coordinates will be normalized to the range [0, 1]. Defaults to False.
+        coords (torch.Tensor): The coords to be scaled of shape n,2.
+        img0_shape (tuple): The shape of the image that the segmentation is being applied to.
+        ratio_pad (tuple): The ratio of the image size to the padded image size.
+        normalize (bool): If True, the coordinates will be normalized to the range [0, 1].
         padding (bool): If True, assuming the boxes is based on image augmented by yolo style. If False then do regular
             rescaling.
 
@@ -805,14 +813,14 @@ def regularize_rboxes(rboxes):
 
 def masks2segments(masks, strategy="all"):
     """
-    It takes a list of masks(n,h,w) and returns a list of segments(n,xy).
+    Convert masks to segments.
 
     Args:
-        masks (torch.Tensor): the output of the model, which is a tensor of shape (batch_size, 160, 160)
-        strategy (str): 'all' or 'largest'. Defaults to all
+        masks (torch.Tensor): The output of the model, which is a tensor of shape (batch_size, 160, 160).
+        strategy (str): 'all' or 'largest'.
 
     Returns:
-        segments (List): list of segment masks
+        (List): List of segment masks.
     """
     from ultralytics.data.converter import merge_multi_segment
 
@@ -852,10 +860,10 @@ def clean_str(s):
     Cleans a string by replacing special characters with '_' character.
 
     Args:
-        s (str): a string needing special characters replaced
+        s (str): A string needing special characters replaced.
 
     Returns:
-        (str): a string with special characters replaced by an underscore _
+        (str): A string with special characters replaced by an underscore _.
     """
     return re.sub(pattern="[|@#!¡·$€%&()=?¿^*;:,¨´><+]", repl="_", string=s)
 
diff --git a/ultralytics/utils/plotting.py b/ultralytics/utils/plotting.py
index 04dacd1e1a..2fc9565670 100644
--- a/ultralytics/utils/plotting.py
+++ b/ultralytics/utils/plotting.py
@@ -25,9 +25,9 @@ class Colors:
     RGB values.
 
     Attributes:
-        palette (list of tuple): List of RGB color values.
+        palette (List[Tuple]): List of RGB color values.
         n (int): The number of colors in the palette.
-        pose_palette (np.ndarray): A specific color palette array with dtype np.uint8.
+        pose_palette (np.ndarray): A specific color palette array for pose estimation with dtype np.uint8.
 
     Examples:
         >>> from ultralytics.utils.plotting import Colors
@@ -142,13 +142,13 @@ class Colors:
         )
 
     def __call__(self, i, bgr=False):
-        """Converts hex color codes to RGB values."""
+        """Convert hex color codes to RGB values."""
         c = self.palette[int(i) % self.n]
         return (c[2], c[1], c[0]) if bgr else c
 
     @staticmethod
     def hex2rgb(h):
-        """Converts hex color codes to RGB values (i.e. default PIL order)."""
+        """Convert hex color codes to RGB values (i.e. default PIL order)."""
         return tuple(int(h[1 + i : 1 + i + 2], 16) for i in (0, 2, 4))
 
 
@@ -160,13 +160,15 @@ class Annotator:
     Ultralytics Annotator for train/val mosaics and JPGs and predictions annotations.
 
     Attributes:
-        im (Image.Image or numpy array): The image to annotate.
+        im (Image.Image or np.ndarray): The image to annotate.
         pil (bool): Whether to use PIL or cv2 for drawing annotations.
         font (ImageFont.truetype or ImageFont.load_default): Font used for text annotations.
         lw (float): Line width for drawing.
         skeleton (List[List[int]]): Skeleton structure for keypoints.
         limb_color (List[int]): Color palette for limbs.
         kpt_color (List[int]): Color palette for keypoints.
+        dark_colors (set): Set of colors considered dark for text contrast.
+        light_colors (set): Set of colors considered light for text contrast.
 
     Examples:
         >>> from ultralytics.utils.plotting import Annotator
@@ -256,7 +258,7 @@ class Annotator:
             txt_color (tuple, optional): The color of the text (R, G, B).
 
         Returns:
-            txt_color (tuple): Text color for label
+            (tuple): Text color for label.
 
         Examples:
             >>> from ultralytics.utils.plotting import Annotator
@@ -273,14 +275,14 @@ class Annotator:
 
     def box_label(self, box, label="", color=(128, 128, 128), txt_color=(255, 255, 255), rotated=False):
         """
-        Draws a bounding box to image with label.
+        Draw a bounding box on an image with a given label.
 
         Args:
             box (tuple): The bounding box coordinates (x1, y1, x2, y2).
-            label (str): The text label to be displayed.
+            label (str, optional): The text label to be displayed.
             color (tuple, optional): The background color of the rectangle (B, G, R).
             txt_color (tuple, optional): The color of the text (R, G, B).
-            rotated (bool, optional): Variable used to check if task is OBB
+            rotated (bool, optional): Whether the task is oriented bounding box detection.
 
         Examples:
             >>> from ultralytics.utils.plotting import Annotator
@@ -340,11 +342,11 @@ class Annotator:
         Plot masks on image.
 
         Args:
-            masks (tensor): Predicted masks on cuda, shape: [n, h, w]
-            colors (List[List[Int]]): Colors for predicted masks, [[r, g, b] * n]
-            im_gpu (tensor): Image is in cuda, shape: [3, h, w], range: [0, 1]
-            alpha (float): Mask transparency: 0.0 fully transparent, 1.0 opaque
-            retina_masks (bool): Whether to use high resolution masks or not. Defaults to False.
+            masks (torch.Tensor): Predicted masks on cuda, shape: [n, h, w].
+            colors (List[List[int]]): Colors for predicted masks, [[r, g, b] * n].
+            im_gpu (torch.Tensor): Image is in cuda, shape: [3, h, w], range: [0, 1].
+            alpha (float, optional): Mask transparency: 0.0 fully transparent, 1.0 opaque.
+            retina_masks (bool, optional): Whether to use high resolution masks or not.
         """
         if self.pil:
             # Convert to numpy first
@@ -377,11 +379,11 @@ class Annotator:
 
         Args:
             kpts (torch.Tensor): Keypoints, shape [17, 3] (x, y, confidence).
-            shape (tuple, optional): Image shape (h, w). Defaults to (640, 640).
-            radius (int, optional): Keypoint radius. Defaults to 5.
-            kpt_line (bool, optional): Draw lines between keypoints. Defaults to True.
-            conf_thres (float, optional): Confidence threshold. Defaults to 0.25.
-            kpt_color (tuple, optional): Keypoint color (B, G, R). Defaults to None.
+            shape (tuple, optional): Image shape (h, w).
+            radius (int, optional): Keypoint radius.
+            kpt_line (bool, optional): Draw lines between keypoints.
+            conf_thres (float, optional): Confidence threshold.
+            kpt_color (tuple, optional): Keypoint color (B, G, R).
 
         Note:
             - `kpt_line=True` currently only supports human pose plotting.
@@ -436,7 +438,16 @@ class Annotator:
         self.draw.rectangle(xy, fill, outline, width)
 
     def text(self, xy, text, txt_color=(255, 255, 255), anchor="top", box_style=False):
-        """Adds text to an image using PIL or cv2."""
+        """
+        Add text to an image using PIL or cv2.
+
+        Args:
+            xy (List[int]): Top-left coordinates for text placement.
+            text (str): Text to be drawn.
+            txt_color (tuple, optional): Text color (R, G, B).
+            anchor (str, optional): Text anchor position ('top' or 'bottom').
+            box_style (bool, optional): Whether to draw text with a background box.
+        """
         if anchor == "bottom":  # start y from font bottom
             w, h = self.font.getsize(text)  # text width, height
             xy[1] += 1 - h
@@ -492,7 +503,7 @@ class Annotator:
     @staticmethod
     def get_bbox_dimension(bbox=None):
         """
-        Calculate the area of a bounding box.
+        Calculate the dimensions and area of a bounding box.
 
         Args:
             bbox (tuple): Bounding box coordinates in the format (x_min, y_min, x_max, y_max).
@@ -517,7 +528,16 @@ class Annotator:
 @TryExcept()  # known issue https://github.com/ultralytics/yolov5/issues/5395
 @plt_settings()
 def plot_labels(boxes, cls, names=(), save_dir=Path(""), on_plot=None):
-    """Plot training labels including class histograms and box statistics."""
+    """
+    Plot training labels including class histograms and box statistics.
+
+    Args:
+        boxes (np.ndarray): Bounding box coordinates in format [x, y, width, height].
+        cls (np.ndarray): Class indices.
+        names (dict, optional): Dictionary mapping class indices to class names.
+        save_dir (Path, optional): Directory to save the plot.
+        on_plot (callable, optional): Function to call after plot is saved.
+    """
     import pandas  # scope for faster 'import ultralytics'
     import seaborn  # scope for faster 'import ultralytics'
 
@@ -580,16 +600,16 @@ def save_one_box(xyxy, im, file=Path("im.jpg"), gain=1.02, pad=10, square=False,
 
     Args:
         xyxy (torch.Tensor or list): A tensor or list representing the bounding box in xyxy format.
-        im (numpy.ndarray): The input image.
-        file (Path, optional): The path where the cropped image will be saved. Defaults to 'im.jpg'.
-        gain (float, optional): A multiplicative factor to increase the size of the bounding box. Defaults to 1.02.
-        pad (int, optional): The number of pixels to add to the width and height of the bounding box. Defaults to 10.
-        square (bool, optional): If True, the bounding box will be transformed into a square. Defaults to False.
-        BGR (bool, optional): If True, the image will be saved in BGR format, otherwise in RGB. Defaults to False.
-        save (bool, optional): If True, the cropped image will be saved to disk. Defaults to True.
+        im (np.ndarray): The input image.
+        file (Path, optional): The path where the cropped image will be saved.
+        gain (float, optional): A multiplicative factor to increase the size of the bounding box.
+        pad (int, optional): The number of pixels to add to the width and height of the bounding box.
+        square (bool, optional): If True, the bounding box will be transformed into a square.
+        BGR (bool, optional): If True, the image will be saved in BGR format, otherwise in RGB.
+        save (bool, optional): If True, the cropped image will be saved to disk.
 
     Returns:
-        (numpy.ndarray): The cropped image.
+        (np.ndarray): The cropped image.
 
     Examples:
         >>> from ultralytics.utils.plotting import save_one_box
@@ -653,7 +673,7 @@ def plot_images(
         conf_thres: Confidence threshold for displaying detections.
 
     Returns:
-        np.ndarray: Plotted image grid as a numpy array if save is False, None otherwise.
+        (np.ndarray): Plotted image grid as a numpy array if save is False, None otherwise.
 
     Note:
         This function supports both tensor and numpy array inputs. It will automatically
@@ -789,13 +809,12 @@ def plot_results(file="path/to/results.csv", dir="", segment=False, pose=False,
     pose estimation, and classification. Plots are saved as 'results.png' in the directory where the CSV is located.
 
     Args:
-        file (str, optional): Path to the CSV file containing the training results. Defaults to 'path/to/results.csv'.
-        dir (str, optional): Directory where the CSV file is located if 'file' is not provided. Defaults to ''.
-        segment (bool, optional): Flag to indicate if the data is for segmentation. Defaults to False.
-        pose (bool, optional): Flag to indicate if the data is for pose estimation. Defaults to False.
-        classify (bool, optional): Flag to indicate if the data is for classification. Defaults to False.
+        file (str, optional): Path to the CSV file containing the training results.
+        dir (str, optional): Directory where the CSV file is located if 'file' is not provided.
+        segment (bool, optional): Flag to indicate if the data is for segmentation.
+        pose (bool, optional): Flag to indicate if the data is for pose estimation.
+        classify (bool, optional): Flag to indicate if the data is for classification.
         on_plot (callable, optional): Callback function to be executed after plotting. Takes filename as an argument.
-            Defaults to None.
 
     Examples:
         >>> from ultralytics.utils.plotting import plot_results
@@ -845,15 +864,15 @@ def plot_results(file="path/to/results.csv", dir="", segment=False, pose=False,
 
 def plt_color_scatter(v, f, bins=20, cmap="viridis", alpha=0.8, edgecolors="none"):
     """
-    Plots a scatter plot with points colored based on a 2D histogram.
+    Plot a scatter plot with points colored based on a 2D histogram.
 
     Args:
         v (array-like): Values for the x-axis.
         f (array-like): Values for the y-axis.
-        bins (int, optional): Number of bins for the histogram. Defaults to 20.
-        cmap (str, optional): Colormap for the scatter plot. Defaults to 'viridis'.
-        alpha (float, optional): Alpha for the scatter plot. Defaults to 0.8.
-        edgecolors (str, optional): Edge colors for the scatter plot. Defaults to 'none'.
+        bins (int, optional): Number of bins for the histogram.
+        cmap (str, optional): Colormap for the scatter plot.
+        alpha (float, optional): Alpha for the scatter plot.
+        edgecolors (str, optional): Edge colors for the scatter plot.
 
     Examples:
         >>> v = np.random.rand(100)
@@ -880,7 +899,7 @@ def plot_tune_results(csv_file="tune_results.csv"):
     in the CSV, color-coded based on fitness scores. The best-performing configurations are highlighted on the plots.
 
     Args:
-        csv_file (str, optional): Path to the CSV file containing the tuning results. Defaults to 'tune_results.csv'.
+        csv_file (str, optional): Path to the CSV file containing the tuning results.
 
     Examples:
         >>> plot_tune_results("path/to/tune_results.csv")
@@ -959,8 +978,8 @@ def feature_visualization(x, module_type, stage, n=32, save_dir=Path("runs/detec
         x (torch.Tensor): Features to be visualized.
         module_type (str): Module type.
         stage (int): Module stage within the model.
-        n (int, optional): Maximum number of feature maps to plot. Defaults to 32.
-        save_dir (Path, optional): Directory to save results. Defaults to Path('runs/detect/exp').
+        n (int, optional): Maximum number of feature maps to plot.
+        save_dir (Path, optional): Directory to save results.
     """
     for m in {"Detect", "Segment", "Pose", "Classify", "OBB", "RTDETRDecoder"}:  # all model heads
         if m in module_type:
diff --git a/ultralytics/utils/torch_utils.py b/ultralytics/utils/torch_utils.py
index c70ba7b075..a19bbb4a6a 100644
--- a/ultralytics/utils/torch_utils.py
+++ b/ultralytics/utils/torch_utils.py
@@ -90,12 +90,12 @@ def autocast(enabled: bool, device: str = "cuda"):
     Returns:
         (torch.amp.autocast): The appropriate autocast context manager.
 
-    Note:
+    Notes:
         - For PyTorch versions 1.13 and newer, it uses `torch.amp.autocast`.
         - For older versions, it uses `torch.cuda.autocast`.
 
     Examples:
-        >>> with autocast(amp=True):
+        >>> with autocast(enabled=True):
         ...     # Your mixed precision operations here
         ...     pass
     """
@@ -130,7 +130,7 @@ def get_gpu_info(index):
 
 def select_device(device="", batch=0, newline=False, verbose=True):
     """
-    Selects the appropriate PyTorch device based on the provided arguments.
+    Select the appropriate PyTorch device based on the provided arguments.
 
     The function takes a string specifying the device or a torch.device object and returns a torch.device object
     representing the selected device. The function also validates the number of available devices and raises an
@@ -299,7 +299,18 @@ def fuse_deconv_and_bn(deconv, bn):
 
 
 def model_info(model, detailed=False, verbose=True, imgsz=640):
-    """Print and return detailed model information layer by layer."""
+    """
+    Print and return detailed model information layer by layer.
+
+    Args:
+        model (nn.Module): Model to analyze.
+        detailed (bool, optional): Whether to print detailed layer information. Defaults to False.
+        verbose (bool, optional): Whether to print model information. Defaults to True.
+        imgsz (int | List[int], optional): Input image size. Defaults to 640.
+
+    Returns:
+        (Tuple[int, int, int, float] | None): Number of layers, parameters, gradients, and GFLOPs; None if not verbose.
+    """
     if not verbose:
         return
     n_p = get_num_params(model)  # number of parameters
@@ -343,6 +354,12 @@ def model_info_for_loggers(trainer):
     """
     Return model info dict with useful model information.
 
+    Args:
+        trainer (ultralytics.engine.trainer.BaseTrainer): The trainer object containing model and validation data.
+
+    Returns:
+        (dict): Dictionary containing model parameters, GFLOPs, and inference speeds.
+
     Examples:
         YOLOv8n info for loggers
         >>> results = {
@@ -368,7 +385,16 @@ def model_info_for_loggers(trainer):
 
 
 def get_flops(model, imgsz=640):
-    """Return a YOLO model's FLOPs."""
+    """
+    Return a YOLO model's FLOPs.
+
+    Args:
+        model (nn.Module): The model to calculate FLOPs for.
+        imgsz (int | List[int], optional): Input image size. Defaults to 640.
+
+    Returns:
+        (float): The model's FLOPs in billions.
+    """
     if not thop:
         return 0.0  # if not installed return 0.0 GFLOPs
 
@@ -392,7 +418,16 @@ def get_flops(model, imgsz=640):
 
 
 def get_flops_with_torch_profiler(model, imgsz=640):
-    """Compute model FLOPs (thop package alternative, but 2-10x slower unfortunately)."""
+    """
+    Compute model FLOPs using torch profiler (alternative to thop package, but 2-10x slower).
+
+    Args:
+        model (nn.Module): The model to calculate FLOPs for.
+        imgsz (int | List[int], optional): Input image size. Defaults to 640.
+
+    Returns:
+        (float): The model's FLOPs in billions.
+    """
     if not TORCH_2_0:  # torch profiler implemented in torch>=2.0
         return 0.0
     model = de_parallel(model)
@@ -430,7 +465,18 @@ def initialize_weights(model):
 
 
 def scale_img(img, ratio=1.0, same_shape=False, gs=32):
-    """Scales and pads an image tensor, optionally maintaining aspect ratio and padding to gs multiple."""
+    """
+    Scale and pad an image tensor, optionally maintaining aspect ratio and padding to gs multiple.
+
+    Args:
+        img (torch.Tensor): Input image tensor.
+        ratio (float, optional): Scaling ratio. Defaults to 1.0.
+        same_shape (bool, optional): Whether to maintain the same shape. Defaults to False.
+        gs (int, optional): Grid size for padding. Defaults to 32.
+
+    Returns:
+        (torch.Tensor): Scaled and padded image tensor.
+    """
     if ratio == 1.0:
         return img
     h, w = img.shape[2:]
@@ -442,7 +488,15 @@ def scale_img(img, ratio=1.0, same_shape=False, gs=32):
 
 
 def copy_attr(a, b, include=(), exclude=()):
-    """Copies attributes from object 'b' to object 'a', with options to include/exclude certain attributes."""
+    """
+    Copy attributes from object 'b' to object 'a', with options to include/exclude certain attributes.
+
+    Args:
+        a (object): Destination object to copy attributes to.
+        b (object): Source object to copy attributes from.
+        include (tuple, optional): Attributes to include; if empty, all public attributes are copied. Defaults to ().
+        exclude (tuple, optional): Attributes to exclude. Defaults to ().
+    """
     for k, v in b.__dict__.items():
         if (len(include) and k not in include) or k.startswith("_") or k in exclude:
             continue
@@ -451,7 +505,12 @@ def copy_attr(a, b, include=(), exclude=()):
 
 
 def get_latest_opset():
-    """Return the second-most recent ONNX opset version supported by this version of PyTorch, adjusted for maturity."""
+    """
+    Return the second-most recent ONNX opset version supported by this version of PyTorch, adjusted for maturity.
+
+    Returns:
+        (int): The ONNX opset version.
+    """
     if TORCH_1_13:
         # If the PyTorch>=1.13, dynamically compute the latest opset minus one using 'symbolic_opset'
         return max(int(k[14:]) for k in vars(torch.onnx) if "symbolic_opset" in k) - 1
@@ -461,27 +520,69 @@ def get_latest_opset():
 
 
 def intersect_dicts(da, db, exclude=()):
-    """Returns a dictionary of intersecting keys with matching shapes, excluding 'exclude' keys, using da values."""
+    """
+    Return a dictionary of intersecting keys with matching shapes, excluding 'exclude' keys, using da values.
+
+    Args:
+        da (dict): First dictionary.
+        db (dict): Second dictionary.
+        exclude (tuple, optional): Keys to exclude. Defaults to ().
+
+    Returns:
+        (dict): Dictionary of intersecting keys with matching shapes.
+    """
     return {k: v for k, v in da.items() if k in db and all(x not in k for x in exclude) and v.shape == db[k].shape}
 
 
 def is_parallel(model):
-    """Returns True if model is of type DP or DDP."""
+    """
+    Return True if model is of type DP or DDP.
+
+    Args:
+        model (nn.Module): Model to check.
+
+    Returns:
+        (bool): True if model is DataParallel or DistributedDataParallel.
+    """
     return isinstance(model, (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel))
 
 
 def de_parallel(model):
-    """De-parallelize a model: returns single-GPU model if model is of type DP or DDP."""
+    """
+    De-parallelize a model: returns single-GPU model if model is of type DP or DDP.
+
+    Args:
+        model (nn.Module): Model to de-parallelize.
+
+    Returns:
+        (nn.Module): De-parallelized model.
+    """
     return model.module if is_parallel(model) else model
 
 
 def one_cycle(y1=0.0, y2=1.0, steps=100):
-    """Returns a lambda function for sinusoidal ramp from y1 to y2 https://arxiv.org/pdf/1812.01187.pdf."""
+    """
+    Return a lambda function for sinusoidal ramp from y1 to y2 https://arxiv.org/pdf/1812.01187.
+
+    Args:
+        y1 (float, optional): Initial value. Defaults to 0.0.
+        y2 (float, optional): Final value. Defaults to 1.0.
+        steps (int, optional): Number of steps. Defaults to 100.
+
+    Returns:
+        (function): Lambda function for computing the sinusoidal ramp.
+    """
     return lambda x: max((1 - math.cos(x * math.pi / steps)) / 2, 0) * (y2 - y1) + y1
 
 
 def init_seeds(seed=0, deterministic=False):
-    """Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html."""
+    """
+    Initialize random number generator (RNG) seeds https://pytorch.org/docs/stable/notes/randomness.html.
+
+    Args:
+        seed (int, optional): Random seed. Defaults to 0.
+        deterministic (bool, optional): Whether to set deterministic algorithms. Defaults to False.
+    """
     random.seed(seed)
     np.random.seed(seed)
     torch.manual_seed(seed)
@@ -510,16 +611,30 @@ def unset_deterministic():
 
 class ModelEMA:
     """
-    Updated Exponential Moving Average (EMA) from https://github.com/rwightman/pytorch-image-models. Keeps a moving
-    average of everything in the model state_dict (parameters and buffers).
+    Updated Exponential Moving Average (EMA) from https://github.com/rwightman/pytorch-image-models.
 
+    Keeps a moving average of everything in the model state_dict (parameters and buffers).
     For EMA details see https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage
 
     To disable EMA set the `enabled` attribute to `False`.
+
+    Attributes:
+        ema (nn.Module): Copy of the model in evaluation mode.
+        updates (int): Number of EMA updates.
+        decay (function): Decay function that determines the EMA weight.
+        enabled (bool): Whether EMA is enabled.
     """
 
     def __init__(self, model, decay=0.9999, tau=2000, updates=0):
-        """Initialize EMA for 'model' with given arguments."""
+        """
+        Initialize EMA for 'model' with given arguments.
+
+        Args:
+            model (nn.Module): Model to create EMA for.
+            decay (float, optional): Maximum EMA decay rate. Defaults to 0.9999.
+            tau (int, optional): EMA decay time constant. Defaults to 2000.
+            updates (int, optional): Initial number of updates. Defaults to 0.
+        """
         self.ema = deepcopy(de_parallel(model)).eval()  # FP32 EMA
         self.updates = updates  # number of EMA updates
         self.decay = lambda x: decay * (1 - math.exp(-x / tau))  # decay exponential ramp (to help early epochs)
@@ -528,7 +643,12 @@ class ModelEMA:
         self.enabled = True
 
     def update(self, model):
-        """Update EMA parameters."""
+        """
+        Update EMA parameters.
+
+        Args:
+            model (nn.Module): Model to update EMA from.
+        """
         if self.enabled:
             self.updates += 1
             d = self.decay(self.updates)
@@ -541,7 +661,14 @@ class ModelEMA:
                     # assert v.dtype == msd[k].dtype == torch.float32, f'{k}: EMA {v.dtype},  model {msd[k].dtype}'
 
     def update_attr(self, model, include=(), exclude=("process_group", "reducer")):
-        """Updates attributes and saves stripped model with optimizer removed."""
+        """
+        Update attributes of the EMA model by copying them from the given model.
+
+        Args:
+            model (nn.Module): Model to update attributes from.
+            include (tuple, optional): Attributes to include. Defaults to ().
+            exclude (tuple, optional): Attributes to exclude. Defaults to ("process_group", "reducer").
+        """
         if self.enabled:
             copy_attr(self.ema, model, include, exclude)
 
@@ -551,9 +678,9 @@ def strip_optimizer(f: Union[str, Path] = "best.pt", s: str = "", updates: dict
     Strip optimizer from 'f' to finalize training, optionally save as 's'.
 
     Args:
-        f (str): file path to model to strip the optimizer from. Default is 'best.pt'.
-        s (str): file path to save the model with stripped optimizer to. If not provided, 'f' will be overwritten.
-        updates (dict): a dictionary of updates to overlay onto the checkpoint before saving.
+        f (str | Path): File path to model to strip the optimizer from. Defaults to 'best.pt'.
+        s (str, optional): File path to save the model with stripped optimizer to. If not provided, 'f' will be overwritten.
+        updates (dict, optional): A dictionary of updates to overlay onto the checkpoint before saving.
 
     Returns:
         (dict): The combined checkpoint dictionary.
@@ -563,9 +690,6 @@ def strip_optimizer(f: Union[str, Path] = "best.pt", s: str = "", updates: dict
         >>> from ultralytics.utils.torch_utils import strip_optimizer
         >>> for f in Path("path/to/model/checkpoints").rglob("*.pt"):
         >>>    strip_optimizer(f)
-
-    Note:
-        Use `ultralytics.nn.torch_safe_load` for missing modules with `x = torch_safe_load(f)[0]`
     """
     try:
         x = torch.load(f, map_location=torch.device("cpu"))
@@ -613,7 +737,11 @@ def convert_optimizer_state_dict_to_fp16(state_dict):
     """
     Converts the state_dict of a given optimizer to FP16, focusing on the 'state' key for tensor conversions.
 
-    This method aims to reduce storage size without altering 'param_groups' as they contain non-tensor data.
+    Args:
+        state_dict (dict): Optimizer state dictionary.
+
+    Returns:
+        (dict): Converted optimizer state dictionary with FP16 tensors.
     """
     for state in state_dict["state"].values():
         for k, v in state.items():
@@ -653,6 +781,16 @@ def profile(input, ops, n=10, device=None, max_num_obj=0):
     """
     Ultralytics speed, memory and FLOPs profiler.
 
+    Args:
+        input (torch.Tensor | List[torch.Tensor]): Input tensor(s) to profile.
+        ops (nn.Module | List[nn.Module]): Model or list of operations to profile.
+        n (int, optional): Number of iterations to average. Defaults to 10.
+        device (str | torch.device, optional): Device to profile on. Defaults to None.
+        max_num_obj (int, optional): Maximum number of objects for simulation. Defaults to 0.
+
+    Returns:
+        (List): Profile results for each operation.
+
     Examples:
         >>> from ultralytics.utils.torch_utils import profile
         >>> input = torch.randn(16, 3, 640, 640)
@@ -721,7 +859,15 @@ def profile(input, ops, n=10, device=None, max_num_obj=0):
 
 
 class EarlyStopping:
-    """Early stopping class that stops training when a specified number of epochs have passed without improvement."""
+    """
+    Early stopping class that stops training when a specified number of epochs have passed without improvement.
+
+    Attributes:
+        best_fitness (float): Best fitness value observed.
+        best_epoch (int): Epoch where best fitness was observed.
+        patience (int): Number of epochs to wait after fitness stops improving before stopping.
+        possible_stop (bool): Flag indicating if stopping may occur next epoch.
+    """
 
     def __init__(self, patience=50):
         """
@@ -770,11 +916,12 @@ class FXModel(nn.Module):
     """
     A custom model class for torch.fx compatibility.
 
-    This class extends `torch.nn.Module` and is designed to ensure compatibility with torch.fx for tracing and graph manipulation.
-    It copies attributes from an existing model and explicitly sets the model attribute to ensure proper copying.
+    This class extends `torch.nn.Module` and is designed to ensure compatibility with torch.fx for tracing and graph
+    manipulation. It copies attributes from an existing model and explicitly sets the model attribute to ensure proper
+    copying.
 
-    Args:
-        model (torch.nn.Module): The original model to wrap for torch.fx compatibility.
+    Attributes:
+        model (nn.Module): The original model's layers.
     """
 
     def __init__(self, model):
@@ -782,7 +929,7 @@ class FXModel(nn.Module):
         Initialize the FXModel.
 
         Args:
-            model (torch.nn.Module): The original model to wrap for torch.fx compatibility.
+            model (nn.Module): The original model to wrap for torch.fx compatibility.
         """
         super().__init__()
         copy_attr(self, model)
@@ -793,7 +940,8 @@ class FXModel(nn.Module):
         """
         Forward pass through the model.
 
-        This method performs the forward pass through the model, handling the dependencies between layers and saving intermediate outputs.
+        This method performs the forward pass through the model, handling the dependencies between layers and saving
+        intermediate outputs.
 
         Args:
             x (torch.Tensor): The input tensor to the model.