Source code for yolort.models.yolov5

# Copyright (c) 2021, yolort team. All rights reserved.

import warnings
from typing import Any, Dict, List, Callable, Optional, Tuple

import torch
import torchvision
from torch import nn, Tensor
from torchvision.io import read_image
from yolort.data import contains_any_tensor

from . import yolo
from .transform import YOLOTransform, _get_shape_onnx
from .yolo import YOLO

__all__ = ["YOLOv5"]


[docs]class YOLOv5(nn.Module): """ Wrapping the pre-processing (`LetterBox`) into the YOLO models. The input to the model is expected to be a list of tensors, each of shape [C, H, W], one for each image, and should be in 0-1 range. Different images can have different sizes but they will be resized to a fixed size that maintains the aspect ratio before passing it to the backbone. The behavior of the model changes depending if it is in training or evaluation mode. During training, the model expects both the input tensors, as well as a targets (list of dictionary), containing: - boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``. - labels (Int64Tensor[N]): the class label for each ground-truth box The model returns a Dict[Tensor] during training, containing the classification and regression losses. During inference, the model requires only the input tensors, and returns the post-processed predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as follows, where ``N`` is the number of detections: - boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with ``0 <= x1 < x2 <= W`` and ``0 <= y1 < y2 <= H``. - labels (Int64Tensor[N]): the predicted labels for each detection - scores (Tensor[N]): the scores for each detection Example: Demo pipeline for YOLOv5 Inference. .. code-block:: python from yolort.models import YOLOv5 # Load the yolov5s version 6.0 models arch = 'yolov5_darknet_pan_s_r60' model = YOLOv5(arch=arch, pretrained=True, score_thresh=0.35) model = model.eval() # Perform inference on an image file predictions = model.predict('bus.jpg') # Perform inference on a list of image files predictions2 = model.predict(['bus.jpg', 'zidane.jpg']) We also support loading the custom checkpoints trained from ultralytics/yolov5 .. code-block:: python from yolort.models import YOLOv5 # Your trained checkpoint from ultralytics checkpoint_path = 'yolov5n.pt' model = YOLOv5.load_from_yolov5(checkpoint_path, score_thresh=0.35) model = model.eval() # Perform inference on an image file predictions = model.predict('bus.jpg') Args: arch (string): YOLO model architecture. Default: None model (nn.Module): YOLO model. Default: None num_classes (int): number of output classes of the model (doesn't including background). Default: 80 pretrained (bool): If true, returns a model pre-trained on COCO train2017 progress (bool): If True, displays a progress bar of the download to stderr size: (Tuple[int, int]): the minimum and maximum size of the image to be rescaled. Default: (640, 640) size_divisible (int): stride of the models. Default: 32 fixed_shape (Tuple[int, int], optional): Padding mode for letterboxing. If set to `True`, the image will be padded to shape `fixed_shape` if specified. Instead the image will be padded to a minimum rectangle to match `min_size / max_size` and each of its edges is divisible by `size_divisible` if it is not specified. Default: None fill_color (int): fill value for padding. Default: 114 """ def __init__( self, arch: Optional[str] = None, model: Optional[nn.Module] = None, num_classes: int = 80, pretrained: bool = False, progress: bool = True, size: Tuple[int, int] = (640, 640), size_divisible: int = 32, fixed_shape: Optional[Tuple[int, int]] = None, fill_color: int = 114, **kwargs: Any, ) -> None: super().__init__() self.arch = arch self.num_classes = num_classes if model is None: model = yolo.__dict__[arch]( pretrained=pretrained, progress=progress, num_classes=num_classes, **kwargs, ) self.model = model self.transform = YOLOTransform( size[0], size[1], size_divisible=size_divisible, fixed_shape=fixed_shape, fill_color=fill_color, ) # used only on torchscript mode self._has_warned = False def forward( self, inputs: List[Tensor], targets: Optional[List[Dict[str, Tensor]]] = None, ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]: # get the original image sizes original_image_sizes: List[Tuple[int, int]] = [] if not self.training: for img in inputs: val = img.shape[-2:] assert len(val) == 2 original_image_sizes.append((val[0], val[1])) # Transform the input samples, targets = self.transform(inputs, targets) # Compute the detections outputs = self.model(samples.tensors, targets=targets) losses = {} detections: List[Dict[str, Tensor]] = [] if self.training: # compute the losses if torch.jit.is_scripting(): losses = outputs[0] else: losses = outputs else: # Rescale coordinate if torch.jit.is_scripting(): result = outputs[1] else: result = outputs if torchvision._is_tracing(): im_shape = _get_shape_onnx(samples.tensors) else: im_shape = torch.tensor(samples.tensors.shape[-2:]) detections = self.transform.postprocess(result, im_shape, original_image_sizes) if torch.jit.is_scripting(): if not self._has_warned: warnings.warn("YOLOv5 always returns a (Losses, Detections) tuple in scripting.") self._has_warned = True return losses, detections else: return self.eager_outputs(losses, detections) @torch.jit.unused def eager_outputs( self, losses: Dict[str, Tensor], detections: List[Dict[str, Tensor]], ) -> Tuple[Dict[str, Tensor], List[Dict[str, Tensor]]]: if self.training: return losses return detections @torch.no_grad() def predict(self, x: Any, image_loader: Optional[Callable] = None) -> List[Dict[str, Tensor]]: """ Predict function for raw data or processed data Args: x: Input to predict. Can be raw data or processed data. image_loader: Utility function to convert raw data to Tensor. Returns: The post-processed model predictions. """ image_loader = image_loader or self.default_loader images = self.collate_images(x, image_loader) return self.forward(images) def default_loader(self, img_path: str) -> Tensor: """ Default loader of read a image path. Args: img_path (str): a image path Returns: Tensor, processed tensor for prediction. """ return read_image(img_path) / 255.0 def collate_images(self, samples: Any, image_loader: Callable) -> List[Tensor]: """ Prepare source samples for inference. Args: samples (Any): samples source, support the following various types: - str or List[str]: a image path or list of image paths. - Tensor or List[Tensor]: a tensor or list of tensors. Returns: List[Tensor], The processed image samples. """ p = next(self.parameters()) # for device and type if isinstance(samples, Tensor): return [samples.to(p.device).type_as(p)] if contains_any_tensor(samples): return [sample.to(p.device).type_as(p) for sample in samples] if isinstance(samples, str): samples = [samples] if isinstance(samples, (list, tuple)) and all(isinstance(p, str) for p in samples): outputs = [] for sample in samples: output = image_loader(sample).to(p.device).type_as(p) outputs.append(output) return outputs raise NotImplementedError( f"The type of the sample is {type(samples)}, we currently don't support it now, the " "samples should be either a tensor, list of tensors, a image path or list of image paths." ) @classmethod def load_from_yolov5( cls, checkpoint_path: str, *, size: Tuple[int, int] = (640, 640), size_divisible: int = 32, fixed_shape: Optional[Tuple[int, int]] = None, fill_color: int = 114, **kwargs: Any, ): """ Load custom checkpoints trained from YOLOv5. Args: checkpoint_path (str): Path of the YOLOv5 checkpoint model. size: (Tuple[int, int]): the minimum and maximum size of the image to be rescaled. Default: (640, 640) size_divisible (int): stride of the models. Default: 32 fixed_shape (Tuple[int, int], optional): Padding mode for letterboxing. If set to `True`, the image will be padded to shape `fixed_shape` if specified. Instead the image will be padded to a minimum rectangle to match `min_size / max_size` and each of its edges is divisible by `size_divisible` if it is not specified. Default: None fill_color (int): fill value for padding. Default: 114 """ model = YOLO.load_from_yolov5(checkpoint_path, **kwargs) yolov5 = cls( model=model, size=size, size_divisible=size_divisible, fixed_shape=fixed_shape, fill_color=fill_color, ) return yolov5