YOLO (project): YOLOv3 object detection

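After repeated convolution and downsampling, small objects tend to vanish from the feature maps, so YOLOv3 predicts on three grids: 52 * 52 for small objects, 26 * 26 for medium objects and 13 * 13 for large objects. (These grid sizes correspond to a 416 * 416 input; the 320 * 320 input used in the code below gives 40 * 40, 20 * 20 and 10 * 10 grids.)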

(the cat is a big object, so it is detected with a 13 * 13 grid)

Output layer output:

1, Read file

Three files are required:

coco.names,yolov3.cfg,yolov3.weights.

Download address:

https://download.csdn.net/download/great_yzl/34365174

(you can also download it from yolo's official website, but I don't know if the coco.names file is available)

yolov3.cfg and yolov3.weights are the officially released network configuration and pre-trained weights, so they can be used directly without any further setup or training.

# read file
def ReadFile():
    """Load the class names into the module-level ``name_list`` and print them.

    Reads ``coco.names`` from the current working directory; the file holds
    one class name per line, and the line order defines the class index.

    ``str.splitlines()`` is used rather than ``split('\\n')`` so that a
    trailing newline at the end of the file does not add a spurious empty
    name to the list.
    """
    global name_list
    with open('coco.names', encoding='utf-8') as f:
        name_list = f.read().splitlines()
    print(name_list)

2, Neural network initialization

1. Build neural network

According to yolov3 official settings and weights

def Network_Init():
    """Create the module-level ``network`` from the official YOLOv3 files."""
    global network
    model_configuration = 'yolov3.cfg'  # Darknet network configuration
    model_weights = 'yolov3.weights'    # Pre-trained weights
    # 1. Create the neural network from the configuration and the weights
    network = cv2.dnn.readNetFromDarknet(model_configuration, model_weights)

2. Backend and target selection (GPU acceleration is not enabled here: the code below runs on the CPU)

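# 2. Backend/target selection: OpenCV backend on the CPU (GPU acceleration would need e.g. DNN_BACKEND_CUDA / DNN_TARGET_CUDA and a CUDA-enabled OpenCV build)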
    network.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)    # Set opencv as the backend
    network.setPreferableTarget(cv2.dnn.DNN_TARGET_CPU)

3, Turn on the camera and read the image by frame

def Capture_Init():
    """Open the default camera and preview its frames until ``q`` is pressed.

    Sets the module-level globals ``capture`` (the ``cv2.VideoCapture``
    handle for device 0), ``w`` and ``h`` (the network input size, 320 x 320)
    and ``img`` (the most recently read frame).

    The loop stops when a frame cannot be read or when the user presses
    ``q``. ``capture`` is left open; the caller is responsible for releasing
    it.
    """
    global capture, w, h, img
    capture = cv2.VideoCapture(0)
    w, h = 320, 320

    while True:
        # To test with a still image instead of the camera:
        #   img = cv2.imread("Resource/test4.jpg")
        success, img = capture.read()
        if not success:
            # read() reports failure through its first return value and hands
            # back no usable frame; showing that frame would raise in imshow.
            break

        cv2.imshow('img', img)

        # Wait 1 ms between frames; press q to exit
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

4, Input to neural network

# Input to neural network
        Input_to_Network(img)

Convert the image into a blob and set that blob as the input of the network.

# Input to neural network
def Input_to_Network(image):
    """Convert ``image`` to a blob, set it as the network input and print it.

    The blob is built with pixel values scaled by 1/255, resized to the
    module-level ``(w, h)``, no mean subtraction, the first and last colour
    channels swapped (``swapRB``) and no cropping. It is then handed to the
    module-level ``network``.
    """
    net_input = cv2.dnn.blobFromImage(
        image,
        scalefactor=1 / 255,
        size=(w, h),
        mean=[0, 0, 0],
        swapRB=1,
        crop=False,
    )

    # Feed the blob to the network, then dump it for inspection
    network.setInput(net_input)
    print(net_input)

5, Obtain neural network output

1. Get the name of each layer

# Name of each layer of neural network
    layersNames = network.getLayerNames()
    print(layersNames)

2. Get output layer name

# 2. Get the name of neural network output layer
    outputNames = [(layersNames[i[0] - 1]) for i in network.getUnconnectedOutLayers()]
    print(outputNames)

3. Get output layer image (content)

# 3. Get output layer image (content)
    # outputs: three characteristic diagrams: small, medium and large. (13*13,26*26,52*52)
    # Each feature map outputs 85 categories
    outputs = network.forward(outputNames)
    # print(outputs[0].shape)
    # print(outputs[1].shape)
    # print(outputs[2].shape)
    # print(outputs[0][0])
    return outputs

# Obtain neural network output
def Network_Output():
    """Run a forward pass on the module-level ``network``.

    Returns:
        The result of ``network.forward`` for the unconnected (output)
        layers: one array per output layer. Each row of an array describes
        one candidate box as 85 values (centre x, centre y, width, height,
        objectness confidence, then 80 class scores).
    """
    # Ask OpenCV for the output-layer names directly. Indexing the result of
    # getUnconnectedOutLayers() with i[0] breaks on OpenCV builds that return
    # a flat array of ints instead of nested [[id], ...].
    outputNames = network.getUnconnectedOutLayersNames()

    # One output per detection scale (small, medium and large feature maps)
    outputs = network.forward(outputNames)
    return outputs

A series of results in the figure are obtained:

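Each row describes one prediction box: the centre coordinates (x, y), the width and height, the objectness confidence, and then one score per class (80 classes), i.e. 85 values in total, as in the sample row below.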

[5.9418369e-02 7.4009120e-02 5.7651168e-01 1.6734526e-01 6.5560471e-07
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00]

6, Frame object

# Frame object
def GetObject(outputs, image):
    # Create parameter list
    h_p, w_p, c_p = image.shape # Height, width and number of channels of the image
    bboxes = []                 # Prediction frame
    classIds = []               # Classification index
    confidences = []            # Confidence

1. Get all forecast boxes

Feature graph output

# Feature graph output
    for output in outputs:

Iterate over the prediction boxes (rows) of each feature map

# Frame by frame output
        for one_class in output:

Single prediction box results

# Calculate the prediction of each box
            # First get its 85 classification, the score of each classification, then get the subscript of the maximum score as the index, and finally get its confidence
            scores = one_class[5:]          # Get all scores
            classId = np.argmax(scores)     # Get classification index
            confidence = scores[classId]    # Gain confidence

When the confidence exceeds 0.5, the prediction box is treated as a likely detection, and its box parameters, class index and confidence are stored in the lists.

# Add predicted objects (objects considered possible)
            if confidence > 0.5:
                # Obtain the width, height and coordinates of the prediction frame
                w_b, h_b = int(one_class[2] * w_p), int(one_class[3] * h_p)
                x, y = int((one_class[0] * w_p) - w_b / 2), int((one_class[1] * h_p) - h_b / 2)
                # (x, y coordinates of center point)
                bbox = [x, y, w_b, h_b]
                # Add parameters to the list (prediction box parameters, classification index, confidence)
                bboxes.append(bbox)                     # Prediction frame parameters
                classIds.append(classId)                # Classification index
                confidences.append(float(confidence))   # Confidence
                print(confidences)

2. Keep a prediction box

Non-maximum suppression (NMS) is then applied using the confidences and an NMS (overlap) threshold. The boxes were already filtered by confidence in the previous step, so the confidence threshold passed here (the same 0.5) does not remove anything further.

# Keep a prediction box (set from confidence threshold and NMS threshold)
    indices = cv2.dnn.NMSBoxes(bboxes, confidences, 0.5,     0.1)
    #                         Prediction frame confidence threshold NMS threshold (non maximum suppression)

3. Draw the prediction box

# Output prediction box
    for i in indices:
        i = i[0]
        box = bboxes[i]
        x, y, w, h = box[0], box[1], box[2], box[3]
        # print(x,y,w,h)
        cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
        cv2.putText(image, f'{name_list[classIds[i]].upper()} {int(confidences[i] * 100)}%',
                   (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2)
    cv2.imshow('image', image)

The biggest problem with this approach is that prediction is relatively slow. The tiny variant of the model is fast enough, but it is less accurate. There is still a lot of room for improvement.

Total code

import cv2
import numpy as np


# read file
def ReadFile():
    """Load the class names into the module-level ``name_list``.

    Reads ``yolo/coco.names`` (relative to the current working directory);
    the file holds one class name per line, and the line order defines the
    class index.

    ``str.splitlines()`` is used rather than ``split('\\n')`` so that a
    trailing newline at the end of the file does not add a spurious empty
    name to the list.
    """
    global name_list
    with open('yolo/coco.names', encoding='utf-8') as f:
        name_list = f.read().splitlines()


# Build neural network
def Network_Init():
    """Create the module-level ``network`` and choose where inference runs.

    The Darknet configuration and weights are loaded from the ``yolo/``
    directory. Inference is then pinned to OpenCV's own backend with the CPU
    as target, so no GPU acceleration is enabled by this function.
    """
    global network
    cfg_path = 'yolo/yolov3.cfg'          # network configuration
    weights_path = 'yolo/yolov3.weights'  # pre-trained weights

    # 1. Build the network from the configuration and the weights
    network = cv2.dnn.readNetFromDarknet(cfg_path, weights_path)

    # 2. Select the backend (OpenCV) and the target device (CPU)
    dnn = cv2.dnn
    network.setPreferableBackend(dnn.DNN_BACKEND_OPENCV)
    network.setPreferableTarget(dnn.DNN_TARGET_CPU)


# Turn on the camera
def Capture_Init():
    """Open camera 0 and define the network input size.

    Sets the module-level globals ``capture`` (the ``cv2.VideoCapture``
    handle) and ``w`` / ``h`` (both 320), which ``Input_to_Network`` uses as
    the blob size.
    """
    global capture, w, h
    w = h = 320
    capture = cv2.VideoCapture(0)


# Input to neural network
def Input_to_Network(image):
    """Convert ``image`` to a blob and set it as the network input.

    The blob is built with pixel values scaled by 1/255, resized to the
    module-level ``(w, h)``, no mean subtraction, the first and last colour
    channels swapped (``swapRB``) and no cropping. It is then handed to the
    module-level ``network``.
    """
    net_input = cv2.dnn.blobFromImage(
        image,
        scalefactor=1 / 255,
        size=(w, h),
        mean=[0, 0, 0],
        swapRB=1,
        crop=False,
    )

    # Feed the blob to the network
    network.setInput(net_input)


# Obtain neural network output
def Network_Output():
    """Run a forward pass on the module-level ``network``.

    Returns:
        The result of ``network.forward`` for the unconnected (output)
        layers: one array per output layer. Each row of an array describes
        one candidate box as 85 values (centre x, centre y, width, height,
        objectness confidence, then 80 class scores), which is the layout
        ``GetObject`` indexes into.
    """
    # Ask OpenCV for the output-layer names directly. Indexing the result of
    # getUnconnectedOutLayers() with i[0] breaks on OpenCV builds that return
    # a flat array of ints instead of nested [[id], ...].
    outputNames = network.getUnconnectedOutLayersNames()

    # One output per detection scale (small, medium and large feature maps)
    outputs = network.forward(outputNames)
    return outputs


# Frame object
def _collect_detections(outputs, img_w, img_h, conf_threshold):
    """Turn raw network output rows into pixel boxes, class ids and scores."""
    bboxes, class_ids, confidences = [], [], []
    for output in outputs:
        output = np.asarray(output)
        if output.size == 0:
            continue
        # Columns 5.. hold the per-class scores; pick the best class per row
        # in one vectorised step instead of a Python loop over every row.
        scores = output[:, 5:]
        best_ids = np.argmax(scores, axis=1)
        best_scores = scores[np.arange(scores.shape[0]), best_ids]

        # Only the (few) rows above the threshold are converted to boxes
        for row_idx in np.flatnonzero(best_scores > conf_threshold):
            row = output[row_idx]
            # Columns 0-3 are centre x, centre y, width, height, all relative
            # to the image size; convert to a top-left pixel box.
            w_b, h_b = int(row[2] * img_w), int(row[3] * img_h)
            x = int(row[0] * img_w - w_b / 2)
            y = int(row[1] * img_h - h_b / 2)
            bboxes.append([x, y, w_b, h_b])
            class_ids.append(int(best_ids[row_idx]))
            confidences.append(float(best_scores[row_idx]))
    return bboxes, class_ids, confidences


def GetObject(outputs, image, conf_threshold=0.5, nms_threshold=0.1):
    """Draw the detected objects on ``image`` and show it in a window.

    Args:
        outputs: Iterable of 2-D arrays as returned by ``Network_Output``;
            each row is (cx, cy, w, h, objectness, class scores...).
        image: Image array the boxes are scaled to and drawn on (modified in
            place). Only its first two dimensions (height, width) are used,
            so single-channel images are accepted as well.
        conf_threshold: Minimum best-class score for a box to be kept; also
            passed to ``cv2.dnn.NMSBoxes`` as its score threshold.
        nms_threshold: Overlap threshold passed to ``cv2.dnn.NMSBoxes``.

    Labels are looked up in the module-level ``name_list``. The per-detection
    debug print of the confidence list has been removed.
    """
    h_p, w_p = image.shape[:2]
    bboxes, classIds, confidences = _collect_detections(
        outputs, w_p, h_p, conf_threshold)

    # Keep one box per object (non-maximum suppression)
    indices = cv2.dnn.NMSBoxes(bboxes, confidences, conf_threshold, nms_threshold)

    # Depending on the OpenCV build, NMSBoxes returns nested [[i], ...], a flat
    # array of ints, or an empty tuple; flattening handles all three, whereas
    # indexing each entry with [0] fails on the flat form.
    for i in np.array(indices).flatten():
        x, y, box_w, box_h = bboxes[i]
        label = f'{name_list[classIds[i]].upper()} {int(confidences[i] * 100)}%'
        cv2.rectangle(image, (x, y), (x + box_w, y + box_h), (255, 0, 255), 2)
        cv2.putText(image, label, (x, y - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2)
    cv2.imshow('image', image)


if __name__ == '__main__':
    ReadFile()          # Load the class names
    Network_Init()      # Neural network initialization
    Capture_Init()      # Camera initialization (also sets the input size w, h)

    try:
        # Load the still image once. imread signals failure by returning None
        # rather than raising, so check explicitly instead of crashing later
        # inside blobFromImage.
        source = cv2.imread("Resource/test4.jpg")
        if source is None:
            raise FileNotFoundError('Cannot read image "Resource/test4.jpg"')

        while True:
            # To use the camera instead: success, img = capture.read()
            # GetObject draws on the image in place, so each pass works on a
            # fresh copy of the loaded image.
            img = source.copy()

            # Input to neural network
            Input_to_Network(img)

            # Obtain neural network output
            outputs = Network_Output()

            # Frame object
            GetObject(outputs, img)

            # Wait 1 ms between frames; press q to exit
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Always free the camera and close the windows, even on errors
        capture.release()
        cv2.destroyAllWindows()

Posted by adamlacombe on Sun, 24 Oct 2021 21:39:29 -0700

Programmer Group