Object detection tricks (based on detectron2)
Successful attempts
Cropping
Because the targets are small relative to the whole image, the data is cropped into tiles. Besides the tile size, the overlap between neighbouring tiles also matters: it should be made slightly generous so that each target remains intact in at least one tile and is not cut apart. Here the tile size is set to 512 and the overlap to 256.
Reference code: DOTA_devkit
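As a rough illustration of the tiling described above (not the DOTA_devkit implementation; the function name and edge handling are my own), here is a minimal sketch that produces 512x512 tiles with a 256-pixel overlap. Splitting the annotations per tile, which DOTA_devkit also handles, is omitted here.

import numpy as np

def crop_with_overlap(img: np.ndarray, tile: int = 512, overlap: int = 256):
    # Slide a tile x tile window with stride (tile - overlap); clamp the last
    # row/column of tiles to the image border so the whole image is covered.
    h, w = img.shape[:2]
    stride = tile - overlap
    ys = list(range(0, max(h - tile, 0) + 1, stride))
    xs = list(range(0, max(w - tile, 0) + 1, stride))
    if ys[-1] != max(h - tile, 0):
        ys.append(max(h - tile, 0))
    if xs[-1] != max(w - tile, 0):
        xs.append(max(w - tile, 0))
    crops = []
    for y in ys:
        for x in xs:
            crops.append(((x, y), img[y:y + tile, x:x + tile]))
    return crops  # list of (top-left offset, tile array)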
Change anchor size and aspect_ratio
Because the targets in the data are small, the default ANCHOR_GENERATOR.SIZES and ASPECT_RATIOS in detectron2 need to be changed.
cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[35], [68], [87], [130], [149]]
cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[1.1], [1.2], [1.4], [1.8], [2.7]]
Method: compute statistics of the area and aspect ratio of the annotated boxes and obtain the cluster centers with k-means. Comparing the KMeans implementation from scikit-learn with a hand-written k-means found online, the scikit-learn clusters cover the overall data better and fit the anchor requirements here.
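A minimal sketch of this clustering step (my own helper, assuming `wh` is an (N, 2) array of annotated box widths and heights in pixels), using scikit-learn's KMeans as described:

import numpy as np
from sklearn.cluster import KMeans

def cluster_anchor_stats(wh, k=5):
    sizes = np.sqrt(wh[:, 0] * wh[:, 1])   # sqrt(area), the scale detectron2 uses as SIZES
    ratios = wh[:, 1] / wh[:, 0]           # height / width, i.e. ASPECT_RATIOS
    size_centers = np.sort(KMeans(n_clusters=k, n_init=10).fit(sizes.reshape(-1, 1)).cluster_centers_.ravel())
    ratio_centers = np.sort(KMeans(n_clusters=k, n_init=10).fit(ratios.reshape(-1, 1)).cluster_centers_.ravel())
    return size_centers, ratio_centers

# e.g. plug the rounded centers into the config shown above:
# cfg.MODEL.ANCHOR_GENERATOR.SIZES = [[round(float(s))] for s in size_centers]
# cfg.MODEL.ANCHOR_GENERATOR.ASPECT_RATIOS = [[round(float(r), 1)] for r in ratio_centers]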
Add TTA
Test-time augmentation (TTA) applies data augmentation to the test set: several augmented copies of each test image are created, the model makes predictions on each copy, and the set of predictions is then merged.
cfg.TEST.AUG.ENABLED = True
cfg.TEST.AUG.MIN_SIZES = (400, 500, 512, 600, 700, 800)
cfg.TEST.AUG.MAX_SIZE = 1000
cfg.TEST.AUG.FLIP = True
Because the detector uses the five-parameter rotated-box format (x, y, w, h, θ), some changes were made to detectron2's TTA, mainly replacing apply_box with apply_rotated_box and using fast_rcnn_inference_single_image_rotated instead of fast_rcnn_inference_single_image.
# Imports (not in the original paste; module paths follow detectron2's
# test_time_augmentation.py and may need adjusting for your detectron2 version).
import copy
import numpy as np
import torch
from contextlib import contextmanager
from itertools import count
from fvcore.transforms import HFlipTransform
from torch import nn
from torch.nn.parallel import DistributedDataParallel

from detectron2.data.detection_utils import read_image
from detectron2.modeling import GeneralizedRCNN
from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.roi_heads.rotated_fast_rcnn import fast_rcnn_inference_single_image_rotated
from detectron2.modeling.test_time_augmentation import DatasetMapperTTA
from detectron2.structures import Boxes, Instances


class GeneralizedRCNNWithTTA(nn.Module):
    """
    A GeneralizedRCNN with test-time augmentation enabled.
    Its :meth:`__call__` method has the same interface as :meth:`GeneralizedRCNN.forward`.
    """

    def __init__(self, cfg, model, tta_mapper=None, batch_size=3):
        """
        Args:
            cfg (CfgNode):
            model (GeneralizedRCNN): a GeneralizedRCNN to apply TTA on.
            tta_mapper (callable): takes a dataset dict and returns a list of
                augmented versions of the dataset dict. Defaults to
                `DatasetMapperTTA(cfg)`.
            batch_size (int): batch the augmented images into this batch size for inference.
        """
        super().__init__()
        if isinstance(model, DistributedDataParallel):
            model = model.module
        assert isinstance(
            model, GeneralizedRCNN
        ), "TTA is only supported on GeneralizedRCNN. Got a model of type {}".format(type(model))
        self.cfg = cfg.clone()
        assert not self.cfg.MODEL.KEYPOINT_ON, "TTA for keypoint is not supported yet"
        assert (
            not self.cfg.MODEL.LOAD_PROPOSALS
        ), "TTA for pre-computed proposals is not supported yet"

        self.model = model

        if tta_mapper is None:
            tta_mapper = DatasetMapperTTA(cfg.TEST.AUG.MIN_SIZES, cfg.TEST.AUG.MAX_SIZE, cfg.TEST.AUG.FLIP)
        self.tta_mapper = tta_mapper
        self.batch_size = batch_size

    @contextmanager
    def _turn_off_roi_heads(self, attrs):
        """
        Open a context where some heads in `model.roi_heads` are temporarily turned off.
        Args:
            attr (list[str]): the attribute in `model.roi_heads` which can be used
                to turn off a specific head, e.g., "mask_on", "keypoint_on".
        """
        roi_heads = self.model.roi_heads
        old = {}
        for attr in attrs:
            try:
                old[attr] = getattr(roi_heads, attr)
            except AttributeError:
                # The head may not be implemented in certain ROIHeads
                pass

        if len(old.keys()) == 0:
            yield
        else:
            for attr in old.keys():
                setattr(roi_heads, attr, False)
            yield
            for attr in old.keys():
                setattr(roi_heads, attr, old[attr])

    def _batch_inference(self, batched_inputs, detected_instances=None):
        """
        Execute inference on a list of inputs,
        using batch size = self.batch_size, instead of the length of the list.

        Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
        """
        if detected_instances is None:
            detected_instances = [None] * len(batched_inputs)

        outputs = []
        inputs, instances = [], []
        for idx, input, instance in zip(count(), batched_inputs, detected_instances):
            inputs.append(input)
            instances.append(instance)
            if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1:
                outputs.extend(
                    self.model.inference(
                        inputs,
                        instances if instances[0] is not None else None,
                        do_postprocess=False,
                    )
                )
                inputs, instances = [], []
        return outputs

    def __call__(self, batched_inputs):
        """
        Same input/output format as :meth:`GeneralizedRCNN.forward`
        """

        def _maybe_read_image(dataset_dict):
            ret = copy.copy(dataset_dict)
            if "image" not in ret:
                image = read_image(ret.pop("file_name"), self.model.input_format)
                image = torch.from_numpy(np.ascontiguousarray(image.transpose(2, 0, 1)))  # CHW
                ret["image"] = image
            if "height" not in ret and "width" not in ret:
                ret["height"] = image.shape[1]
                ret["width"] = image.shape[2]
            return ret

        return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs]

    def _inference_one_image(self, input):
        """
        Args:
            input (dict): one dataset dict with "image" field being a CHW tensor
        Returns:
            dict: one output dict
        """
        orig_shape = (input["height"], input["width"])
        augmented_inputs, tfms = self._get_augmented_inputs(input)
        # Detect boxes from all augmented versions
        with self._turn_off_roi_heads(["mask_on", "keypoint_on"]):  # temporarily disable roi heads
            all_boxes, all_scores, all_classes = self._get_augmented_boxes(augmented_inputs, tfms)
        # merge all detected boxes to obtain final predictions for boxes
        merged_instances = self._merge_detections(all_boxes, all_scores, all_classes, orig_shape)

        if self.cfg.MODEL.MASK_ON:
            # Use the detected boxes to obtain masks
            augmented_instances = self._rescale_detected_boxes(
                augmented_inputs, merged_instances, tfms
            )
            # run forward on the detected boxes
            outputs = self._batch_inference(augmented_inputs, augmented_instances)
            # Delete now useless variables to avoid being out of memory
            del augmented_inputs, augmented_instances
            # average the predictions
            merged_instances.pred_masks = self._reduce_pred_masks(outputs, tfms)
            merged_instances = detector_postprocess(merged_instances, *orig_shape)
            return {"instances": merged_instances}
        else:
            return {"instances": merged_instances}

    def _get_augmented_inputs(self, input):
        augmented_inputs = self.tta_mapper(input)
        tfms = [x.pop("transforms") for x in augmented_inputs]
        return augmented_inputs, tfms

    def _get_augmented_boxes(self, augmented_inputs, tfms):
        # 1: forward with all augmented images
        outputs = self._batch_inference(augmented_inputs)
        # 2: union the results
        all_boxes = []
        all_scores = []
        all_classes = []
        for output, tfm in zip(outputs, tfms):
            # Need to inverse the transforms on boxes, to obtain results on original image
            pred_boxes = output.pred_boxes.tensor
            original_pred_boxes = tfm.inverse().apply_rotated_box(pred_boxes.cpu().numpy())
            all_boxes.append(torch.from_numpy(original_pred_boxes).to(pred_boxes.device))
            all_scores.extend(output.scores)
            all_classes.extend(output.pred_classes)
        all_boxes = torch.cat(all_boxes, dim=0)
        return all_boxes, all_scores, all_classes

    def _merge_detections(self, all_boxes, all_scores, all_classes, shape_hw):
        # select from the union of all results
        num_boxes = len(all_boxes)
        num_classes = self.cfg.MODEL.ROI_HEADS.NUM_CLASSES
        # +1 because fast_rcnn_inference expects background scores as well
        all_scores_2d = torch.zeros(num_boxes, num_classes + 1, device=all_boxes.device)
        for idx, cls, score in zip(count(), all_classes, all_scores):
            all_scores_2d[idx, cls] = score

        merged_instances, _ = fast_rcnn_inference_single_image_rotated(
            all_boxes,
            all_scores_2d,
            shape_hw,
            1e-8,
            self.cfg.MODEL.ROI_HEADS.NMS_THRESH_TEST,
            self.cfg.TEST.DETECTIONS_PER_IMAGE,
        )
        return merged_instances

    def _rescale_detected_boxes(self, augmented_inputs, merged_instances, tfms):
        augmented_instances = []
        for input, tfm in zip(augmented_inputs, tfms):
            # Transform the target box to the augmented image's coordinate space
            pred_boxes = merged_instances.pred_boxes.tensor.cpu().numpy()
            pred_boxes = torch.from_numpy(tfm.apply_rotated_box(pred_boxes))

            aug_instances = Instances(
                image_size=input["image"].shape[1:3],
                pred_boxes=Boxes(pred_boxes),
                pred_classes=merged_instances.pred_classes,
                scores=merged_instances.scores,
            )
            augmented_instances.append(aug_instances)
        return augmented_instances

    def _reduce_pred_masks(self, outputs, tfms):
        # Should apply inverse transforms on masks.
        # We assume only resize & flip are used. pred_masks is a scale-invariant
        # representation, so we handle flip specially
        for output, tfm in zip(outputs, tfms):
            if any(isinstance(t, HFlipTransform) for t in tfm.transforms):
                output.pred_masks = output.pred_masks.flip(dims=[3])
        all_pred_masks = torch.stack([o.pred_masks for o in outputs], dim=0)
        avg_pred_masks = torch.mean(all_pred_masks, dim=0)
        return avg_pred_masks
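A minimal usage sketch for the class above (the weight path and image file name are placeholders; `cfg` is assumed to be the rotated-box training config with the TEST.AUG keys set earlier):

import torch
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.modeling import build_model

model = build_model(cfg)                              # cfg: rotated-box config with TTA keys set above
DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)  # load trained weights
model.eval()

tta_model = GeneralizedRCNNWithTTA(cfg, model, batch_size=3)
with torch.no_grad():
    preds = tta_model([{"file_name": "patch_0001.png"}])  # placeholder file name
print(preds[0]["instances"])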
Some hyperparameter adjustments
The learning rate BASE_LR is set to 0.01, MAX_ITER to 100000, and the learning-rate decay STEPS to (50000, 75000); the training-time shorter edge is additionally sampled from the range 512 to 832.
cfg.SOLVER.BASE_LR = 0.01
cfg.SOLVER.MAX_ITER = 100000
cfg.SOLVER.STEPS = (50000, 75000)
cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING = 'range'
cfg.INPUT.MIN_SIZE_TRAIN = (512, 832)
Wrong / invalid / failed attempts
Defogging
At first I did not examine the data carefully: as soon as I saw haze in the images I wanted to defog them. However, most of the images are covered by thick clouds, so common defogging code does not give good results, and it turned out there was no usable target data underneath the clouds anyway.
(Still, here are two defogging approaches that performed relatively well in the experiments.)
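For reference, one widely used defogging method is the dark channel prior (He et al.). The following is only a generic, minimal sketch of that method, not the specific implementations alluded to above; the function name and parameters are illustrative.

import cv2
import numpy as np

def dark_channel_dehaze(img_bgr, patch=15, omega=0.95, t0=0.1):
    # Minimal dark channel prior: estimate atmospheric light from the brightest
    # dark-channel pixels, estimate transmission, then invert the haze model.
    img = img_bgr.astype(np.float64) / 255.0
    kernel = np.ones((patch, patch), np.uint8)
    dark = cv2.erode(img.min(axis=2), kernel)
    idx = np.argsort(dark.ravel())[-max(dark.size // 1000, 1):]
    A = img.reshape(-1, 3)[idx].max(axis=0)                   # atmospheric light per channel
    t = 1 - omega * cv2.erode((img / A).min(axis=2), kernel)  # transmission estimate
    t = np.clip(t, t0, 1)[..., None]
    out = (img - A) / t + A
    return np.clip(out * 255, 0, 255).astype(np.uint8)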
Change anchor size
Statistics of the width and height of the annotated boxes were computed, k-means clustering was run on the (width, height) pairs, and the square root of the clustered area was then used as the anchor size. This was mainly because I was not familiar with the fact that detectron2 by default expects an anchor size plus aspect ratio rather than raw widths and heights, so I followed the YOLO-style procedure of obtaining anchors by clustering.
(A blog post I found helpful: Detailed explanation of anchor boxes obtained by k-means clustering in YOLOv3.)
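A sketch of this (failed) variant, under the same assumption that `wh` is an (N, 2) array of box widths and heights: cluster the raw (w, h) pairs YOLO-style, then convert the clusters into detectron2's size / aspect-ratio form.

import numpy as np
from sklearn.cluster import KMeans

def cluster_wh_anchors(wh, k=5):
    centers = KMeans(n_clusters=k, n_init=10).fit(wh).cluster_centers_   # (k, 2) of (w, h)
    centers = centers[np.argsort(centers.prod(axis=1))]                  # sort clusters by area
    sizes = np.sqrt(centers.prod(axis=1))                                # sqrt(w * h) per cluster
    ratios = centers[:, 1] / centers[:, 0]                               # h / w per cluster
    return sizes, ratios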
Data augmentation
In the end, data augmentation for rotated boxes did not work out, but the trial-and-error process is recorded here:
- Directly using the data augmentation provided by detectron2: its built-in augmentation strategies are not fully applicable to rotated-box detection, and they cannot augment a single category only.
- Copying the data of the minority categories offline and augmenting it with other libraries: no augmentation library suitable for rotated-box detection was found.
- Copying the data of the minority categories offline, augmenting it with detectron2's augmentations, and then converting the augmented annotations into COCO data in XYWHA_ABS format for training (see the sketch after this list): it was unclear where the coordinate conversion should happen after augmentation.
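The conversion mentioned in the last item could look roughly like the sketch below (my own helper, assuming the annotations are 8-value quadrilaterals). Fitting a rotated rectangle is straightforward with cv2.minAreaRect; the part that remained unclear is reconciling OpenCV's angle convention with detectron2's XYWHA_ABS (degrees, counter-clockwise), which also changed between OpenCV versions.

import numpy as np
import cv2

def quad_to_xywha(quad):
    # Fit a rotated rectangle to an 8-value quadrilateral [x1, y1, ..., x4, y4].
    # Returns (cx, cy, w, h, angle) from cv2.minAreaRect; before writing this into
    # a COCO-style annotation with bbox_mode XYWHA_ABS, the angle still needs to be
    # converted to detectron2's convention, which is the step left open above.
    pts = np.asarray(quad, dtype=np.float32).reshape(4, 2)
    (cx, cy), (w, h), angle = cv2.minAreaRect(pts)
    return [float(cx), float(cy), float(w), float(h), float(angle)]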
Unfinished attempts
Add Mosaic augmentation
Mosaic augmentation in YOLOv4 is derived from CutMix and can be seen as an improved version of it: four images are stitched into one new image. Although the mosaic code for the images was written, it was not trained and verified for various reasons.
The code is pasted below. The annotations read here are in JSON format and the output is TXT (it has not actually been converted back to JSON).
# Mosaic augmentation script: reads labelme-style JSON annotations with 4-point
# polygons, writes stitched images plus TXT files containing the 8-point boxes.
from PIL import Image, ImageDraw
import numpy as np
from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
import math
import os
from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
import pandas as pds
import json
import cv2


def rand(a=0, b=1):
    return np.random.rand() * (b - a) + a


def merge_bboxes(bboxes, cutx, cuty):
    # Box-merging helper adapted from the YOLOv4 mosaic code (not called below).
    merge_bbox = []
    for i in range(len(bboxes)):
        for box in bboxes[i]:
            tmp_box = []
            x1, y1, x2, y2, x3, y3, x4, y4 = box[0], box[1], box[2], box[3], box[4], box[5], box[6], box[7]

            if i == 0:
                if np.min(box[1::2]) > cuty or np.min(box[::2]) > cutx:
                    continue
                if np.max(box[1::2]) >= cuty and np.min(box[1::2]) <= cuty:
                    box[1::2][np.argmax(box[1::2])] = cuty
                    if cuty - np.min(box[1::2]) < 5:
                        continue
                if np.max(box[::2]) >= cutx and x1 <= cutx:
                    box[::2][np.argmax(box[::2])] = cutx
                    if cutx - x1 < 5:
                        continue

            if i == 1:
                if y2 < cuty or x1 > cutx:
                    continue
                if y2 >= cuty and y1 <= cuty:
                    y1 = cuty
                    if y2 - y1 < 5:
                        continue
                if x2 >= cutx and x1 <= cutx:
                    x2 = cutx
                    if x2 - x1 < 5:
                        continue

            if i == 2:
                if y2 < cuty or x2 < cutx:
                    continue
                if y2 >= cuty and y1 <= cuty:
                    y1 = cuty
                    if y2 - y1 < 5:
                        continue
                if x2 >= cutx and x1 <= cutx:
                    x1 = cutx
                    if x2 - x1 < 5:
                        continue

            if i == 3:
                if y1 > cuty or x2 < cutx:
                    continue
                if y2 >= cuty and y1 <= cuty:
                    y2 = cuty
                    if y2 - y1 < 5:
                        continue
                if x2 >= cutx and x1 <= cutx:
                    x1 = cutx
                    if x2 - x1 < 5:
                        continue

            tmp_box.append(x1)
            tmp_box.append(y1)
            tmp_box.append(x2)
            tmp_box.append(y2)
            tmp_box.append(box[-1])
            merge_bbox.append(tmp_box)
    return merge_bbox


def get_random_data(image_file, annotation_line, input_shape):
    '''random preprocessing for real-time data augmentation:
    stitch four annotated images into one mosaic of size input_shape'''
    h, w = input_shape
    box_datas = []
    cls_datas = []
    index = 0
    place_x = [0, 0, 256, 256]   # paste offsets of the four tiles
    place_y = [0, 256, 0, 256]
    new_image = Image.new('RGB', (w, h), (128, 128, 128))
    for line in annotation_line:
        # Split each line
        # line_content = line.split(",")
        # Open picture
        path = os.path.join(image_file, line['imagePath'])
        image = utils.read_image(path, format='BGR')
        r = np.random.rand(2)
        augs = T.AugmentationList([
            T.RandomFlip(prob=0.5),
            T.RandomFlip(prob=0.5, vertical=True, horizontal=False),
            T.RandomApply(T.RandomBrightness(0.9, 1.1), prob=0.3),
            T.RandomApply(T.RandomSaturation(0.9, 1.1), prob=0.3),
            T.RandomApply(T.RandomContrast(0.9, 1.1), prob=0.3),
            T.RandomApply(T.ColorTransform(lambda x: x * r[0] + r[1] * 10), prob=0.3)
        ])
        image, transforms = T.apply_transform_gens([augs], image)

        dx = place_x[index]
        dy = place_y[index]

        image = image[:, :, ::-1]   # BGR -> RGB for PIL
        new_image.paste(Image.fromarray(np.uint8(image)), (dx, dy))
        # cv2.imshow('new_image', new_image)
        # cv2.imshow('image', Image.fromarray(np.uint8(image)))
        index += 1
        iw, ih = image.shape[:2]

        box = []
        cls = []
        for shape in line['shapes']:
            bbox = []
            for point in shape['points']:
                bbox.append(point[0])
                bbox.append(point[1])
            box.append(bbox)
            cls.append(shape['label'])
        box = np.array(box)
        # box = np.array([np.array(list(map(float, box.split()[1]))) for box in line['shapes'][0:]])
        # cls = [cls.split()[-2:] for cls in line['shapes']['label']]
        if box.shape[-1] == 0:
            continue
        # apply the same transforms to the polygon points ((N*4, 2) array)
        box = transforms.apply_coords(box.reshape(-1, 2)).clip(min=0)
        # if index == 0:
        #     image, transforms = T.apply_transform_gens([T.RandomCrop(crop_type='absolute', crop_size=(cuty, cutx))],
        #                                                image)
        #     box = transforms.apply_coords(box).clip(min=0)
        # if index == 1:
        #     image, transforms = T.apply_transform_gens(
        #         [T.RandomCrop(crop_type='absolute', crop_size=((h - cuty), cutx))],
        #         image)
        #     box = transforms.apply_coords(box).clip(min=0)
        #     box[0, :] += cutx
        # if index == 3:
        #     image, transforms = T.apply_transform_gens(
        #         [T.RandomCrop(crop_type='absolute', crop_size=(cuty, (w - cutx)))],
        #         image)
        #     box = transforms.apply_coords(box).clip(min=0)
        #     box[1, :] += cuty
        # if index == 2:
        #     image, transforms = T.apply_transform_gens(
        #         [T.RandomCrop(crop_type='absolute', crop_size=((h - cuty), (w - cutx)))],
        #         image)
        #     box = transforms.apply_coords(box).clip(min=0)
        #     box[0, :] += cutx
        #     box[1, :] += cuty

        # shift the coordinates to this tile's paste position (dx, dy)
        if index == 2:        # tile pasted at (0, 256)
            box[:, 1] += 256
        elif index == 3:      # tile pasted at (256, 0)
            box[:, 0] += 256
        elif index == 4:      # tile pasted at (256, 256)
            box[:, 0] += 256
            box[:, 1] += 256
        box_datas.append(box)
        cls_datas.extend(cls)

    if len(box_datas) == 0:
        return new_image, []
    box_datas = np.concatenate(box_datas, axis=0)

    # vis box
    box_line = box_datas.reshape(-1, 8)
    # for line in box_line:
    #     x1, y1, x2, y2, x3, y3, x4, y4 = line
    #     draw = ImageDraw.Draw(new_image)
    #     draw.line([(x1, y1), (x2, y2)], fill='red')
    #     draw.line([(x2, y2), (x3, y3)], fill='red')
    #     draw.line([(x3, y3), (x4, y4)], fill='red')
    #     draw.line([(x4, y4), (x1, y1)], fill='red')
    pd = pds.DataFrame(box_line)
    pd2 = pds.DataFrame(cls_datas)
    pd = pds.concat([pd, pd2], axis=1)
    return new_image, pd


def normal_(annotation_line, input_shape):
    '''random preprocessing for real-time data augmentation (plain horizontal flip)'''
    line = annotation_line.split()
    image = Image.open(line[0])
    box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])

    iw, ih = image.size
    image = image.transpose(Image.FLIP_LEFT_RIGHT)
    box[:, [0, 2]] = iw - box[:, [2, 0]]
    return image, box


def get_json(json_path):
    # collect all labelme-style JSON annotation dicts under json_path
    info_group = []
    for root, dirs, files in os.walk(json_path):
        for file in files:
            if file.endswith(".json"):
                with open(os.path.join(root, file)) as f:
                    info = json.load(f)
                    # info = ",".join(info)
                    info_group.append(info)
    return info_group


if __name__ == "__main__":
    json_path = './train'
    output_path = './train_mosaic'
    json_group = get_json(json_path)
    for ind in range(0, len(json_group) - 4, 4):
        line = json_group[ind:ind + 4]
        image_data, box_data = get_random_data(json_path, line, [512, 512])
        if len(box_data) == 0:
            continue
        json_output_path = os.path.join(output_path, str(ind) + '.txt')
        img_output_path = os.path.join(output_path, str(ind) + '.png')
        js = box_data.to_json
        # box_data.to_json(json_output_path)
        box_data.to_csv(json_output_path, sep=' ', index=False, header=None, mode='w')
        image_data.save(img_output_path)
        print(ind)
    print("finished")
    # img = Image.fromarray((image_data * 255).astype(np.uint8))
    # for j in range(len(box_data)):
    #     x1, y1, x2, y2, x3, y3, x4, y4 = box_data[j][0:8]
    #     draw = ImageDraw.Draw(img)
    #     draw.line([(x1, y1), (x2, y2)], fill='red')
    #     draw.line([(x2, y2), (x3, y3)], fill='red')
    #     draw.line([(x3, y3), (x4, y4)], fill='red')
    #     draw.line([(x4, y4), (x1, y1)], fill='red')
    #     # thickness = 3
    #     # left, top, right, bottom = box_data[j][0:4]
    #     # draw = ImageDraw.Draw(img)
    #     # for i in range(thickness):
    #     #     draw.rectangle([left + i, top + i, right - i, bottom - i], outline=(255, 255, 255))
    # img.show()
    # img.save("box_all.jpg")
Change the angle offset to coordinate offsets
That is, move from RoI Transformer to Gliding Vertex: instead of regressing an angle offset, the rotated box is described by coordinate (vertex) offsets.
Official GitHub code: RoI Transformer, Gliding Vertex.
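To illustrate what replacing the angle offset with coordinate offsets means, here is a rough sketch (my own, not taken from the official repositories) of converting an ordered quadrilateral into the Gliding Vertex style representation: a horizontal box, four sliding offsets along its edges, and an obliquity factor. Details should be checked against the paper and the official code.

import numpy as np

def quad_to_gliding_vertex(quad):
    # quad: 4 non-degenerate vertices in order around the object (e.g. DOTA-style annotation)
    pts = np.asarray(quad, dtype=np.float32).reshape(4, 2)
    xmin, ymin = pts.min(axis=0)
    xmax, ymax = pts.max(axis=0)
    w, h = xmax - xmin, ymax - ymin
    top = pts[pts[:, 1].argmin()]      # vertex touching the top edge of the horizontal box
    right = pts[pts[:, 0].argmax()]    # vertex touching the right edge
    bottom = pts[pts[:, 1].argmax()]   # vertex touching the bottom edge
    left = pts[pts[:, 0].argmin()]     # vertex touching the left edge
    alphas = [(top[0] - xmin) / w, (right[1] - ymin) / h,
              (xmax - bottom[0]) / w, (ymax - left[1]) / h]   # sliding offsets in [0, 1]
    x, y = pts[:, 0], pts[:, 1]
    area = 0.5 * abs(np.dot(x, np.roll(y, -1)) - np.dot(y, np.roll(x, -1)))   # shoelace formula
    r = area / (w * h)                 # obliquity factor
    return [float(xmin), float(ymin), float(xmax), float(ymax)], alphas, float(r)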