# Copyright (c) 2021, yolort team. All rights reserved.
import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
import torch
import torchvision
from torch import nn, Tensor
from torchvision.io import read_image
from yolort.data import contains_any_tensor
from . import yolo
from .transform import YOLOTransform, _get_shape_onnx
from .yolo import YOLO
__all__ = ["YOLOv5"]
class YOLOv5(nn.Module):
"""
    Wraps the pre-processing (`LetterBox`) into the YOLO models.
The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each
image, and should be in 0-1 range. Different images can have different sizes but they will be resized
to a fixed size that maintains the aspect ratio before passing it to the backbone.
    The behavior of the model changes depending on whether it is in training or evaluation mode.
    During training, the model expects both the input tensors and the targets (a list of
    dictionaries), containing:
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the class label for each ground-truth box
The model returns a Dict[Tensor] during training, containing the classification and regression
losses.
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
follows, where ``N`` is the number of detections:
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with
``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``.
        - labels (``Int64Tensor[N]``): the predicted labels for each detection
        - scores (``Tensor[N]``): the scores for each detection
Example:
        Demo pipeline for YOLOv5 inference.
.. code-block:: python
from yolort.models import YOLOv5
            # Load the yolov5s release 6.0 model
arch = 'yolov5_darknet_pan_s_r60'
model = YOLOv5(arch=arch, pretrained=True, score_thresh=0.35)
model = model.eval()
# Perform inference on an image file
predictions = model.predict('bus.jpg')
# Perform inference on a list of image files
predictions2 = model.predict(['bus.jpg', 'zidane.jpg'])
        We also support loading custom checkpoints trained with ultralytics/yolov5:
.. code-block:: python
from yolort.models import YOLOv5
# Your trained checkpoint from ultralytics
checkpoint_path = 'yolov5n.pt'
model = YOLOv5.load_from_yolov5(checkpoint_path, score_thresh=0.35)
model = model.eval()
# Perform inference on an image file
predictions = model.predict('bus.jpg')
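
        The model can also be run in training mode to obtain the loss dictionary described
        above. A minimal sketch, assuming the target format documented above (the dummy
        images, boxes and labels below are illustrative only, and the loss keys depend on
        the underlying model):

        .. code-block:: python

            import torch
            from yolort.models import YOLOv5

            model = YOLOv5(arch='yolov5_darknet_pan_s_r60', num_classes=80)
            model = model.train()
            # Two dummy images in the 0-1 range, with different sizes
            images = [torch.rand(3, 640, 480), torch.rand(3, 480, 640)]
            # One ground-truth box per image, in [x1, y1, x2, y2] format
            targets = [
                {"boxes": torch.tensor([[50.0, 60.0, 200.0, 220.0]]), "labels": torch.tensor([7])},
                {"boxes": torch.tensor([[30.0, 40.0, 150.0, 160.0]]), "labels": torch.tensor([2])},
            ]
            losses = model(images, targets)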
Args:
arch (string): YOLO model architecture. Default: None
model (nn.Module): YOLO model. Default: None
        num_classes (int): number of output classes of the model (not including the
            background). Default: 80
        pretrained (bool): If True, returns a model pre-trained on COCO train2017
progress (bool): If True, displays a progress bar of the download to stderr
        size (Tuple[int, int]): the minimum and maximum size of the image to be rescaled.
            Default: (640, 640)
size_divisible (int): stride of the models. Default: 32
        fixed_shape (Tuple[int, int], optional): Padding mode for letterboxing. If specified,
            the image will be padded to the shape ``fixed_shape``; otherwise the image will be
            padded to a minimal rectangle matching ``min_size / max_size``, with each of its
            edges divisible by ``size_divisible``. Default: None
fill_color (int): fill value for padding. Default: 114
"""
def __init__(
self,
arch: Optional[str] = None,
model: Optional[nn.Module] = None,
num_classes: int = 80,
pretrained: bool = False,
progress: bool = True,
size: Tuple[int, int] = (640, 640),
size_divisible: int = 32,
fixed_shape: Optional[Tuple[int, int]] = None,
fill_color: int = 114,
**kwargs: Any,
) -> None:
super().__init__()
self.arch = arch
self.num_classes = num_classes
if model is None:
model = yolo.__dict__[arch](
pretrained=pretrained,
progress=progress,
num_classes=num_classes,
**kwargs,
)
self.model = model
self.transform = YOLOTransform(
size[0],
size[1],
size_divisible=size_divisible,
fixed_shape=fixed_shape,
fill_color=fill_color,
)
        # used only in torchscript mode
self._has_warned = False
def forward(
self,
inputs: List[Tensor],
targets: Optional[List[Dict[str, Tensor]]] = None,
) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]:
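        """
        Args:
            inputs (List[Tensor]): images to be processed, each in the 0-1 range.
            targets (List[Dict[str, Tensor]], optional): ground-truth boxes and labels
                for the images, required only in training mode.

        Returns:
            During training, a ``Dict[str, Tensor]`` containing the classification and
            regression losses; during inference, a ``List[Dict[str, Tensor]]`` with the
            post-processed detections (``boxes``, ``labels`` and ``scores``), one dict
            for each input image.
        """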
# get the original image sizes
original_image_sizes: List[Tuple[int, int]] = []
if not self.training:
for img in inputs:
val = img.shape[-2:]
assert len(val) == 2
original_image_sizes.append((val[0], val[1]))
# Transform the input
samples, targets = self.transform(inputs, targets)
# Compute the detections
outputs = self.model(samples.tensors, targets=targets)
losses = {}
detections: List[Dict[str, Tensor]] = []
if self.training:
# compute the losses
if torch.jit.is_scripting():
losses = outputs[0]
else:
losses = outputs
else:
            # Rescale the coordinates to the original image sizes
if torch.jit.is_scripting():
result = outputs[1]
else:
result = outputs
if torchvision._is_tracing():
im_shape = _get_shape_onnx(samples.tensors)
else:
im_shape = torch.tensor(samples.tensors.shape[-2:])
detections = self.transform.postprocess(result, im_shape, original_image_sizes)
if torch.jit.is_scripting():
if not self._has_warned:
warnings.warn("YOLOv5 always returns a (Losses, Detections) tuple in scripting.")
self._has_warned = True
return losses, detections
else:
return self.eager_outputs(losses, detections)
@torch.jit.unused
def eager_outputs(
self,
losses: Dict[str, Tensor],
detections: List[Dict[str, Tensor]],
    ) -> Union[Dict[str, Tensor], List[Dict[str, Tensor]]]:
if self.training:
return losses
return detections
@torch.no_grad()
def predict(self, x: Any, image_loader: Optional[Callable] = None) -> List[Dict[str, Tensor]]:
"""
        Predict on raw data (e.g. image paths) or processed tensors.
Args:
x: Input to predict. Can be raw data or processed data.
image_loader: Utility function to convert raw data to Tensor.
Returns:
The post-processed model predictions.
"""
image_loader = image_loader or self.default_loader
images = self.collate_images(x, image_loader)
return self.forward(images)
def default_loader(self, img_path: str) -> Tensor:
"""
        Default loader that reads an image from the given path.

        Args:
            img_path (str): path of the image to read.

        Returns:
            Tensor: the image as a CHW tensor rescaled into the 0-1 range.
"""
        # read_image returns a CHW uint8 tensor, rescale it into the 0-1 range
        return read_image(img_path) / 255.0
def collate_images(self, samples: Any, image_loader: Callable) -> List[Tensor]:
"""
Prepare source samples for inference.
Args:
            samples (Any): source of the samples; the following types are supported:
                - str or List[str]: an image path or a list of image paths.
                - Tensor or List[Tensor]: a tensor or a list of tensors.
        Returns:
            List[Tensor]: the processed image samples.
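
        Example:

            A minimal sketch (the image paths reuse those from the class-level example;
            the random tensor is illustrative only):

            .. code-block:: python

                import torch

                # A list of image paths
                images = model.collate_images(['bus.jpg', 'zidane.jpg'], model.default_loader)
                # Or a single tensor in the 0-1 range
                images = model.collate_images(torch.rand(3, 640, 640), model.default_loader)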
"""
p = next(self.parameters()) # for device and type
if isinstance(samples, Tensor):
return [samples.to(p.device).type_as(p)]
if contains_any_tensor(samples):
return [sample.to(p.device).type_as(p) for sample in samples]
if isinstance(samples, str):
samples = [samples]
        if isinstance(samples, (list, tuple)) and all(isinstance(s, str) for s in samples):
outputs = []
for sample in samples:
output = image_loader(sample).to(p.device).type_as(p)
outputs.append(output)
return outputs
        raise NotImplementedError(
            f"The type of the given sample is {type(samples)}, which we do not currently "
            "support. The samples should be a tensor, a list of tensors, an image path, "
            "or a list of image paths."
        )
@classmethod
def load_from_yolov5(
cls,
checkpoint_path: str,
*,
size: Tuple[int, int] = (640, 640),
size_divisible: int = 32,
fixed_shape: Optional[Tuple[int, int]] = None,
fill_color: int = 114,
**kwargs: Any,
):
"""
        Load custom checkpoints trained with ultralytics/yolov5.
Args:
checkpoint_path (str): Path of the YOLOv5 checkpoint model.
            size (Tuple[int, int]): the minimum and maximum size of the image to be rescaled.
                Default: (640, 640)
            size_divisible (int): stride of the models. Default: 32
            fixed_shape (Tuple[int, int], optional): Padding mode for letterboxing. If specified,
                the image will be padded to the shape ``fixed_shape``; otherwise the image will be
                padded to a minimal rectangle matching ``min_size / max_size``, with each of its
                edges divisible by ``size_divisible``. Default: None
fill_color (int): fill value for padding. Default: 114
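
        Example:

            A usage sketch mirroring the class-level example above:

            .. code-block:: python

                from yolort.models import YOLOv5

                # Load a checkpoint trained with ultralytics/yolov5 and set the
                # confidence threshold used by the post-processing
                model = YOLOv5.load_from_yolov5('yolov5n.pt', score_thresh=0.35)
                model = model.eval()
                predictions = model.predict('bus.jpg')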
"""
model = YOLO.load_from_yolov5(checkpoint_path, **kwargs)
yolov5 = cls(
model=model,
size=size,
size_divisible=size_divisible,
fixed_shape=fixed_shape,
fill_color=fill_color,
)
return yolov5