Target detection tricks (based on detectron2)

Keywords: Python Machine Learning Object Detection

Target detection tricks (based on detectron2)

Attempts that worked


Because the targets are very small relative to the whole picture, the data is cropped into tiles. Besides the tile size, pay attention to the overlap between neighbouring tiles: it should be fairly large, so that each target stays intact in at least one tile instead of being damaged by the cut. Here the tile size is set to 512 and the overlap to 256.

Reference code: DOTA_devkit

Change anchor size and aspect_ratio

Because the targets in the data are small, the default anchor sizes and aspect ratios in detectron2 need to be changed.

# Anchor sizes and aspect ratios that replace detectron2's defaults. The values
# come from k-means clustering of the annotation boxes (area and length-width
# ratio), as described in the text that follows.
cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[35], [68], [87], [130], [149]]
cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[1.1], [1.2], [1.4], [1.8], [2.7]]

Method: the area and the length-to-width ratio of every annotation box are collected and clustered with k-means. The KMeans implementation from sklearn was compared with hand-written k-means code found on the Internet; the clusters obtained from sklearn cover the overall data better and meet the needs of the anchors for this detection task.

Join TTA

Test-time augmentation, or TTA for short, applies data augmentation to the test set. It creates several augmented copies of each image in the test set, lets the model predict on every copy, and then returns an ensemble of those predictions.

# Test-time augmentation settings. The TTA wrapper class in this post passes
# these three values to DatasetMapperTTA when no custom mapper is supplied.
cfg.TEST.AUG.MIN_SIZES = (400, 500, 512, 600, 700, 800)
cfg.TEST.AUG.MAX_SIZE = 1000
cfg.TEST.AUG.FLIP = True

Because the target detection here uses the five-parameter rotated-box format $(x, y, w, h, \theta)$, some changes have been made to the TTA of detectron2, mainly replacing apply_box with apply_rotated_box and fast_rcnn_inference_single_image with fast_rcnn_inference_single_image_rotated.

class GeneralizedRCNNWithTTA(nn.Module):
    """
    A GeneralizedRCNN with test-time augmentation enabled, adapted to rotated
    boxes in the five-parameter (x, y, w, h, theta) format.

    Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`.

    Compared with the axis-aligned TTA wrapper, boxes are mapped between the
    augmented and the original image with ``apply_rotated_box`` and merged with
    ``fast_rcnn_inference_single_image_rotated``.
    """

    def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
        """
        Args:
            cfg (CfgNode):
            model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
            tta_mapper (callable): takes a dataset dict and returns a list of
                augmented versions of the dataset dict. Defaults to
                `DatasetMapperTTA(cfg.TEST.AUG.MIN_SIZES, cfg.TEST.AUG.MAX_SIZE,
                cfg.TEST.AUG.FLIP)`.
            batch_size (int): batch the augmented images into this batch size for inference.
        """
        super().__init__()
        if isinstance(model, DistributedDataParallel):
            model = model.module
        assert isinstance(
            model, GeneralizedRCNN
        ), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
        self.cfg = cfg.clone()
        assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
        assert (
            not self.cfg.MODEL.LOAD_PROPOSALS
        ), "TTA for pre-computed proposals is not supported yet"

        self.model = model

        if tta_mapper is None:
            tta_mapper = DatasetMapperTTA(cfg.TEST.AUG.MIN_SIZES, cfg.TEST.AUG.MAX_SIZE, cfg.TEST.AUG.FLIP)
        self.tta_mapper = tta_mapper
        self.batch_size = batch_size

    def _turn_off_roi_heads(self, attrs):
        """
        Open a context where some heads in `model.roi_heads` are temporarily turned off.

        Args:
            attr (list[str]): the attribute in `model.roi_heads` which can be used
                to turn off a specific head, e.g., "mask_on", "keypoint_on".

        Returns:
            A context manager. The previous attribute values are restored on
            exit, including when the body raises.
        """
        # Imported locally so the snippet does not depend on a file-level import.
        from contextlib import contextmanager

        roi_heads = self.model.roi_heads

        @contextmanager
        def _heads_disabled():
            old = {}
            for attr in attrs:
                try:
                    old[attr] = getattr(roi_heads, attr)
                except AttributeError:
                    # The head may not be implemented in certain ROIHeads
                    pass

            for attr in old:
                setattr(roi_heads, attr, False)
            try:
                yield
            finally:
                for attr, value in old.items():
                    setattr(roi_heads, attr, value)

        return _heads_disabled()

    def _batch_inference(self, batched_inputs, detected_instances=None):
        """
        Execute inference on a list of inputs,
        using batch size = self.batch_size, instead of the length of the list.

        Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
        """
        if detected_instances is None:
            detected_instances = [None] * len(batched_inputs)

        outputs = []
        inputs, instances = [], []
        for idx, input, instance in zip(count(), batched_inputs, detected_instances):
            inputs.append(input)
            instances.append(instance)
            # Flush when the mini-batch is full or the last input has been queued.
            if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
                outputs.extend(
                    self.model.inference(
                        inputs,
                        instances if instances[0] is not None else None,
                        do_postprocess=False,
                    )
                )
                inputs, instances = [], []
        return outputs

    def __call__(self, batched_inputs):
        """
        Same input/output format as :meth:`GeneralizedRCNN.forward`
        """

        def _maybe_read_image(dataset_dict):
            ret = copy.copy(dataset_dict)
            if "image" not in ret:
                image = read_image(ret.pop("file_name"), self.model.input_format)
                image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))  # CHW
                ret["image"] = image
            if "height" not in ret and "width" not in ret:
                # Read the shape from the dict: the caller may have supplied
                # "image" directly, in which case no local `image` was bound.
                ret["height"] = ret["image"].shape[1]
                ret["width"] = ret["image"].shape[2]
            return ret

        return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs]

    def _inference_one_image(self, input):
        """
        Args:
            input (dict): one dataset dict with "image" field being a CHW tensor

        Returns:
            dict: one output dict
        """
        orig_shape = (input["height"], input["width"])
        augmented_inputs, tfms = self._get_augmented_inputs(input)
        # Detect boxes from all augmented versions
        with self._turn_off_roi_heads(["mask_on", "keypoint_on"]):
            # temporarily disable roi heads
            all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
        # merge all detected boxes to obtain final predictions for boxes
        merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)

        if self.cfg.MODEL.MASK_ON:
            # Use the detected boxes to obtain masks
            augmented_instances = self._rescale_detected_boxes(
                augmented_inputs, merged_instances, tfms
            )
            # run forward on the detected boxes
            outputs = self._batch_inference(augmented_inputs, augmented_instances)
            # Delete now useless variables to avoid being out of memory
            del augmented_inputs, augmented_instances
            # average the predictions
            merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms)
            merged_instances = detector_postprocess(merged_instances, *orig_shape)
            return {"instances": merged_instances}
        else:
            return {"instances": merged_instances}

    def _get_augmented_inputs(self, input):
        """Return the augmented dataset dicts and, separately, their transforms."""
        augmented_inputs = self.tta_mapper(input)
        tfms = [x.pop("transforms") for x in augmented_inputs]
        return augmented_inputs, tfms

    def _get_augmented_boxes(self, augmented_inputs, tfms):
        """
        Run the detector on every augmented input and gather all detections,
        with boxes mapped back to the original image.

        Returns:
            tuple: (all_boxes tensor, list of scores, list of classes)
        """
        # 1: forward with all augmented images
        outputs = self._batch_inference(augmented_inputs)
        # 2: union the results
        all_boxes = []
        all_scores = []
        all_classes = []
        for output, tfm in zip(outputs, tfms):
            # Need to inverse the transforms on boxes, to obtain results on original image
            pred_boxes = output.pred_boxes.tensor
            original_pred_boxes = tfm.inverse().apply_rotated_box(pred_boxes.cpu().numpy())
            all_boxes.append(torch.from_numpy(original_pred_boxes).to(pred_boxes.device))

            all_scores.extend(output.scores)
            all_classes.extend(output.pred_classes)
        all_boxes = torch.cat(all_boxes, dim=0)
        return all_boxes, all_scores, all_classes

    def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
        """Merge the union of all augmented detections into final instances."""
        # select from the union of all results
        num_boxes = len(all_boxes)
        num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
        # +1 because fast_rcnn_inference expects background scores as well
        all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
        for idx, cls, score in zip(count(), all_classes, all_scores):
            all_scores_2d[idx, cls] = score

        merged_instances, _ = fast_rcnn_inference_single_image_rotated(
            all_boxes,
            all_scores_2d,
            shape_hw,
            1e-8,  # near-zero score threshold: every collected detection takes part
            self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
            self.cfg.TEST.DETECTIONS_PER_IMAGE,
        )

        return merged_instances

    def _rescale_detected_boxes(self, augmented_inputs, merged_instances, tfms):
        """Map the merged boxes into each augmented image's coordinate space."""
        augmented_instances = []
        # Rebuild boxes with the same box class as the merged detections, so
        # five-parameter rotated boxes stay rotated boxes.
        box_type = type(merged_instances.pred_boxes)
        for input, tfm in zip(augmented_inputs, tfms):
            # Transform the target box to the augmented image's coordinate space
            pred_boxes = merged_instances.pred_boxes.tensor.cpu().numpy()
            pred_boxes = torch.from_numpy(tfm.apply_rotated_box(pred_boxes))

            aug_instances = Instances(
                image_size=input["image"].shape[1:3],
                pred_boxes=box_type(pred_boxes),
                pred_classes=merged_instances.pred_classes,
                scores=merged_instances.scores,
            )
            augmented_instances.append(aug_instances)
        return augmented_instances

    def _reduce_pred_masks(self, outputs, tfms):
        """Average the mask predictions of all augmented outputs."""
        # Should apply inverse transforms on masks.
        # We assume only resize & flip are used. pred_masks is a scale-invariant
        # representation, so we handle flip specially
        for output, tfm in zip(outputs, tfms):
            if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
                output.pred_masks = output.pred_masks.flip(dims=[3])
        all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
        avg_pred_masks = torch.mean(all_pred_masks, dim=0)
        return avg_pred_masks

Some hyperparameter adjustments

The learning rate BASE_LR is adjusted to 0.01, MAX_ITER is adjusted to 100000, and the learning-rate decay STEPS are adjusted to (50000, 75000)

# Solver schedule: base learning rate, total number of iterations, and the
# iterations at which the learning rate is decayed.
cfg.SOLVER.BASE_LR = 0.01
cfg.SOLVER.MAX_ITER = 100000
cfg.SOLVER.STEPS = (50000,75000)
# NOTE(review): presumably the shorter-edge sizes used for training images —
# confirm against the detectron2 config reference.
cfg.INPUT.MIN_SIZE_TRAIN = (512, 832)

Failed or ineffective attempts


At first, I didn't observe the data carefully, so I wanted to remove the fog when I saw the fog. However, because the picture is mostly thick clouds, the common defogging code can't bring good results, and it was found that there was no data under the cloud

(but let's also show two defogging algorithms that feel better after the experiment)


Change anchor size

The lengths and widths of the annotated boxes are collected, k-means clustering is used to obtain length/width clusters, and the square root of each cluster's area is then used as anchor.size. I did this mainly because I was not familiar with detectron2's default convention of specifying anchors by size and aspect ratio rather than directly by length and width, so I followed the online procedure for obtaining anchors by YOLO-style clustering

(still, here is a blog post that I found helpful) Detailed explanation of anchor boxes obtained by k-means clustering in YOLOv3

Data enhancement

In fact, data enhancement for rotated boxes did not succeed in the end, but the trial-and-error process is recorded here

  • Directly use the data augmentation provided by detectron2: the data augmentation strategy in this method is not fully applicable to rotating frame target detection, and this method cannot enhance data for a single category

  • Offline copy a few categories of data and use other libraries for data enhancement: no data enhancement library suitable for rotating frame target detection was found

  • Copy a few categories of data offline; use the data augmentation provided by detectron2 for data enhancement; then convert the enhanced data into coco data in XYWHA_ABS format for training: it is not clear where to add conversion coordinates after data enhancement

Unfulfilled attempts

Add Mosaic enhancements

The mosaic data enhancement of Yolov4 refers to the CutMix data enhancement method, which is an improved version of the CutMix data enhancement method. Four pictures are spliced to get a new picture. Although the mosaic enhancement code for the pictures has been written, it has not been trained and verified for some reasons.

Paste the code. The data read here is in json format and the output is txt (in fact, it has not been changed to json format)

import json
import math
import os

import cv2
import numpy as np
import pandas as pds
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
from PIL import Image, ImageDraw

def rand(a=0, b=1):
    """Draw one uniformly distributed random float from the interval [a, b)."""
    span = b - a
    return a + span * np.random.rand()

def _clip_axis(values, low, high, min_extent):
    """Clip one coordinate axis to [low, high]; return None if the box must be dropped."""
    v_min, v_max = values.min(), values.max()
    if (high is not None and v_min > high) or (low is not None and v_max < low):
        return None  # the box lies entirely on the discarded side of the cut
    cut = high if high is not None else low
    clipped = np.clip(values, low, high)
    # Only boxes that straddle (or touch) the cut are checked for being too thin.
    if v_min <= cut <= v_max and clipped.max() - clipped.min() < min_extent:
        return None
    return clipped


def merge_bboxes(bboxes, cutx, cuty):
    """Clip the boxes of the four mosaic tiles to their quadrants and merge them.

    NOTE(review): the published snippet had lost the body of every guard
    (`continue`) and the final append, and mixed quadrilateral and corner-box
    logic. This is a reconstruction of the apparent intent — confirm before
    relying on it for training data.

    Args:
        bboxes: four lists of boxes, one per tile, in the order top-left,
            bottom-left, bottom-right, top-right. Each box is a flat sequence
            of vertex coordinates ``x1, y1, x2, y2, ...`` (8 values for a
            quadrilateral) with no trailing label. Only the first four lists
            are used.
        cutx: x coordinate of the vertical cut line.
        cuty: y coordinate of the horizontal cut line.

    Returns:
        list[list[float]]: the surviving boxes, clipped to their quadrant.
        A box is dropped when it lies entirely outside its quadrant, or when
        it straddles a cut line and less than 5 pixels of it remain along
        that axis. The input boxes are not modified.
    """
    # Kept region per tile as (x_low, x_high, y_low, y_high); None = unbounded.
    regions = (
        (None, cutx, None, cuty),  # 0: top-left
        (None, cutx, cuty, None),  # 1: bottom-left
        (cutx, None, cuty, None),  # 2: bottom-right
        (cutx, None, None, cuty),  # 3: top-right
    )
    min_extent = 5

    merge_bbox = []
    for tile_boxes, (x_low, x_high, y_low, y_high) in zip(bboxes, regions):
        for box in tile_boxes:
            coords = np.array(box, dtype=float)  # copy, so the caller's box is untouched
            xs = _clip_axis(coords[0::2], x_low, x_high, min_extent)
            if xs is None:
                continue
            ys = _clip_axis(coords[1::2], y_low, y_high, min_extent)
            if ys is None:
                continue
            coords[0::2] = xs
            coords[1::2] = ys
            merge_bbox.append(coords.tolist())

    return merge_bbox

def get_random_data(image_file, annotation_line, input_shape):
    '''Build one mosaic image and its box table from up to four annotations.

    NOTE(review): several lines of the published snippet were lost (canvas
    creation, the points/label collection, `continue`, the result appends).
    They are reconstructed here from the surrounding code — confirm against
    your annotation files.

    Args:
        image_file: directory that the annotations' ``imagePath`` values are
            relative to.
        annotation_line: list of annotation dicts, each with ``imagePath`` and
            ``shapes``; every shape has ``points`` and ``label``. The final
            ``reshape(-1, 8)`` assumes four (x, y) points per shape. Entries
            beyond the fourth are ignored.
        input_shape: ``(h, w)`` of the mosaic canvas.

    Returns:
        ``(new_image, table)``: ``new_image`` is the PIL mosaic; ``table`` is a
        DataFrame with eight coordinate columns followed by the label column,
        or ``[]`` when no tile contributed a box.
    '''
    h, w = input_shape
    half_w, half_h = w // 2, h // 2
    # Paste origin (x, y) of each tile: top-left, bottom-left, top-right,
    # bottom-right. For a 512x512 canvas these are the original 0/256 offsets.
    tile_origins = [(0, 0), (0, half_h), (half_w, 0), (half_w, half_h)]
    new_image = Image.new('RGB', (w, h), (128, 128, 128))
    box_datas = []
    cls_datas = []

    for line, (dx, dy) in zip(annotation_line, tile_origins):
        path = os.path.join(image_file, line['imagePath'])
        image = utils.read_image(path, format='BGR')
        r = np.random.rand(2)
        augs = T.AugmentationList([
            T.RandomFlip(prob=0.5, vertical=True, horizontal=False),
            T.RandomApply(T.RandomBrightness(0.9, 1.1), prob=0.3),
            T.RandomApply(T.RandomSaturation(0.9, 1.1), prob=0.3),
            T.RandomApply(T.RandomContrast(0.9, 1.1), prob=0.3),
            T.RandomApply(T.ColorTransform(lambda x: x * r[0] + r[1] * 10), prob=0.3),
        ])
        image, transforms = T.apply_transform_gens([augs], image)
        image = image[:, :, ::-1]  # reverse the channel order (read as BGR) for PIL
        new_image.paste(Image.fromarray(np.uint8(image)), (dx, dy))

        box = np.array([shape['points'] for shape in line['shapes']], dtype=float)
        cls = [shape['label'] for shape in line['shapes']]
        if box.shape[-1] == 0:
            # no annotated shapes on this tile
            continue
        # Apply the same geometric transform to the vertices as to the image.
        box = transforms.apply_coords(box.reshape(-1, 2)).clip(min=0)

        # Shift the vertices by the tile's paste origin. The original code
        # keyed the shift on the already-incremented index and swapped the
        # offsets of the third and fourth tile relative to where they were
        # pasted.
        box[:, 0] += dx
        box[:, 1] += dy

        box_datas.append(box)
        cls_datas.extend(cls)

    if len(box_datas) == 0:
        return new_image, []

    box_datas = np.concatenate(box_datas, axis=0)

    # One row per shape: x1, y1, ..., x4, y4
    box_line = box_datas.reshape(-1, 8)
    box_table = pds.DataFrame(box_line)
    cls_table = pds.DataFrame(cls_datas)
    return new_image, pds.concat([box_table, cls_table], axis=1)

def normal_(annotation_line, input_shape):
    '''Load one annotated image and flip it, with its boxes, left-to-right.

    Args:
        annotation_line: whitespace-separated string; the first field is the
            image path and every following field is one box given as
            comma-separated integers. Columns 0 and 2 are treated as the
            box's two x coordinates.
        input_shape: unused; kept for interface compatibility.

    Returns:
        ``(image, box)``: the mirrored PIL image and the integer box array
        with its x coordinates mirrored to match.
    '''
    line = annotation_line.split()
    # The published snippet had lost the `Image.open(line` part of this call.
    image = Image.open(line[0])
    box = np.array([np.array(list(map(int, item.split(',')))) for item in line[1:]])

    iw, ih = image.size
    image = image.transpose(Image.FLIP_LEFT_RIGHT)
    if box.size:
        # Mirror x and swap the two columns so the smaller x stays in column 0.
        box[:, [0, 2]] = iw - box[:, [2, 0]]

    return image, box

def get_json(json_path):
    """Load every ``*.json`` file found under ``json_path``, recursively.

    Args:
        json_path: root directory to walk.

    Returns:
        list: the parsed content of each JSON file, in ``os.walk`` order.
        Empty when the directory does not exist or holds no JSON files.
    """
    info_group = []
    for root, _dirs, files in os.walk(json_path):
        for file in files:
            if file.endswith(".json"):
                with open(os.path.join(root, file), encoding="utf-8") as f:
                    # The original loaded the file but never stored it, so the
                    # function always returned an empty list.
                    info_group.append(json.load(f))
    return info_group

if __name__ == "__main__":
    # Build mosaic samples from the annotations under ./train and write each
    # one to ./train_mosaic as <index>.png plus a space-separated <index>.txt.
    json_path = './train'
    output_path = './train_mosaic'
    json_group = get_json(json_path)
    os.makedirs(output_path, exist_ok=True)

    # Consume the annotations four at a time. The `- 3` bound keeps the last
    # complete group of four (the original `- 4` skipped it) while still
    # dropping a trailing partial group.
    for ind in range(0, len(json_group) - 3, 4):
        line = json_group[ind:ind + 4]
        image_data, box_data = get_random_data(json_path, line, [512, 512])
        if len(box_data) == 0:
            # nothing annotated in this mosaic
            continue
        txt_output_path = os.path.join(output_path, str(ind) + '.txt')
        img_output_path = os.path.join(output_path, str(ind) + '.png')
        # box_data.to_json(txt_output_path)
        box_data.to_csv(txt_output_path, sep=' ', index=False, header=None, mode='w')
        image_data.save(img_output_path)
    # img = Image.fromarray((image_data * 255).astype(np.uint8))
    # for j in range(len(box_data)):
    #     x1, y1, x2, y2, x3, y3, x4, y4 = box_data[j][0:8]
    #     draw = ImageDraw.Draw(img)
    #     draw.line([(x1, y1), (x2, y2)], fill='red')
    #     draw.line([(x2, y2), (x3, y3)], fill='red')
    #     draw.line([(x3, y3), (x4, y4)], fill='red')
    #     draw.line([(x4, y4), (x1, y1)], fill='red')
    #     # thickness = 3
    #     # left, top, right, bottom = box_data[j][0:4]
    #     # draw = ImageDraw.Draw(img)
    #     # for i in range(thickness):
    #     #     draw.rectangle([left + i, top + i, right - i, bottom - i], outline=(255, 255, 255))

Change angle offset to coordinate offset

that is RoI Transformer —> Gliding vertex

Then paste the official code of github RoI Transformer Gliding vertex

Posted by nocturne on Thu, 28 Oct 2021 01:31:35 -0700