1, Read file

2, Neural network initialization

  1. Build neural network

2. GPU acceleration

3, Turn on the camera and read the image by frame

4, Input to neural network

5, Obtain neural network output

1. Get the name of each layer

2. Get output layer name

3. Get output layer image (content)

6, Frame object

1. Get all forecast boxes  

Feature graph output

Frame by frame output

Single prediction box results

When the confidence is over half, the possible prediction results considered by the prediction box are stored in the list

2. Keep a prediction box

3. Draw the prediction box

Total code


The image is analysed at three grid scales:

         After repeated convolution and downsampling, small objects tend to disappear from the feature map, so we use 52 * 52, 26 * 26 and 13 * 13 grids to detect small, medium and large objects respectively.

  (the cat is a big object, so it is detected with a 13 * 13 grid)

Output layer output:

1, Read file

  Three files are required:


  Download address:  


(you can also download it from yolo's official website, but I don't know if the coco.names file is available)  

  yolov3.cfg and yolov3.weights are officially given models (those that have been set and trained can be used directly).

# read file
def ReadFile():
    """Load the class labels from ``coco.names`` into the global ``name_list``.

    The file is read as one label per line. The position of a label in
    ``name_list`` is what the drawing code uses as the class index, so the
    order of the lines is preserved.
    """
    global name_list
    # rstrip() + splitlines() avoids the empty trailing entry that
    # split('\n') yields when the file ends with a newline, and also copes
    # with '\r\n' line endings.
    with open('coco.names', encoding='utf-8') as f:
        name_list = [line.strip() for line in f.read().rstrip().splitlines()]

2, Neural network initialization

  1. Build neural network

According to yolov3 official settings and weights

global network
    model_configuration = 'yolov3.cfg'  # Configuration model
    model_weights = 'yolov3.weights'    # weight
    # 1. Creating neural networks
    network = cv2.dnn.readNetFromDarknet(model_configuration, model_weights)
    #                                     Configure model weights

2. GPU acceleration

# 2. GPU acceleration
    network.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)    # Set opencv as the backend

3, Turn on the camera and read the image by frame

def Capture_Init():
    """Open camera 0 and preview its frames until 'q' is pressed.

    Sets the globals ``capture`` (the ``cv2.VideoCapture`` for device 0),
    ``w`` and ``h`` (both 320, used as the blob size by
    ``Input_to_Network``) and ``img`` (the most recently read frame).
    """
    global capture, w, h, img
    capture = cv2.VideoCapture(0)
    w, h = 320, 320
    while True:
        success, img = capture.read()
        # img = cv2.imread("Resource/test4.jpg")
        if not success:
            # No frame was delivered (camera missing or stream ended);
            # stop instead of handing an invalid frame to imshow.
            break

        cv2.imshow('img', img)

        # waitKey gives the GUI time to draw the window and provides the
        # only way out of the loop (press 'q' to exit).
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

4, Input to neural network

# Input to neural network

Convert the image data into a blob and feed that blob to the network as its input.  

# Input to neural network
def Input_to_Network(image):
    """Convert ``image`` to a blob and set it as the input of ``network``.

    Uses the globals ``w`` and ``h`` as the blob size and the global
    ``network`` as the target.

    Args:
        image: Image to feed to the network, in a form accepted by
            ``cv2.dnn.blobFromImage``.
    """
    # Convert the image to a blob: scale pixel values by 1/255, resize to
    # (w, h), subtract a zero mean, swap the R and B channels, no cropping.
    blob = cv2.dnn.blobFromImage(image, 1 / 255, (w, h), [0, 0, 0], 1, crop=False)

    # Set neural network input. Without this call the blob is discarded and
    # the following forward pass never sees the image.
    network.setInput(blob)

5, Obtain neural network output

1. Get the name of each layer

# Name of each layer of neural network
    layersNames = network.getLayerNames()

2. Get output layer name

# 2. Get the name of neural network output layer
    outputNames = [(layersNames[i[0] - 1]) for i in network.getUnconnectedOutLayers()]

3. Get output layer image (content)

# 3. Get output layer image (content)
    # outputs: three characteristic diagrams: small, medium and large. (13*13,26*26,52*52)
    # Each feature map outputs 85 categories
    outputs = network.forward(outputNames)
    # print(outputs[0].shape)
    # print(outputs[1].shape)
    # print(outputs[2].shape)
    # print(outputs[0][0])
    return outputs
# Obtain neural network output
def Network_Output():
    """Run a forward pass on the global ``network`` and return its outputs.

    Returns:
        The value of ``network.forward`` for the unconnected output layers:
        one array per output layer. The rest of this file treats every row
        of each array as one prediction box (box x, y, width, height, a
        confidence value, then the per-class scores).
    """
    # Ask OpenCV for the output layer names directly. The previous
    # layersNames[i[0] - 1] lookup raises on OpenCV builds where
    # getUnconnectedOutLayers() returns a flat array of ints instead of
    # an N x 1 array.
    output_names = network.getUnconnectedOutLayersNames()

    # Forward pass restricted to those layers.
    return network.forward(output_names)

A series of results in the figure are obtained:

Each row holds the prediction box's centre coordinates, its width and height, a confidence value, and the prediction score for each category.

[5.9418369e-02 7.4009120e-02 5.7651168e-01 1.6734526e-01 6.5560471e-07
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00] 

6, Frame object

# Frame object
def GetObject(outputs, image):
    # Create parameter list
    h_p, w_p, c_p = image.shape # Height, width and number of channels of the image
    bboxes = []                 # Prediction frame
    classIds = []               # Classification index
    confidences = []            # Confidence

1. Get all forecast boxes  

Feature graph output

# Feature graph output
    for output in outputs:

Frame by frame output

# Frame by frame output
        for one_class in output:

Single prediction box results

# Calculate the prediction of each box
            # First get its 85 classification, the score of each classification, then get the subscript of the maximum score as the index, and finally get its confidence
            scores = one_class[5:]          # Get all scores
            classId = np.argmax(scores)     # Get classification index
            confidence = scores[classId]    # Gain confidence

When the confidence is over half, the possible prediction results considered by the prediction box are stored in the list

# Add predicted objects (objects considered possible)
            if confidence > 0.5:
                # Obtain the width, height and coordinates of the prediction frame
                w_b, h_b = int(one_class[2] * w_p), int(one_class[3] * h_p)
                x, y = int((one_class[0] * w_p) - w_b / 2), int((one_class[1] * h_p) - h_b / 2)
                # (x, y coordinates of center point)
                bbox = [x, y, w_b, h_b]
                # Add parameters to the list (prediction box parameters, classification index, confidence)
                bboxes.append(bbox)                     # Prediction frame parameters
                classIds.append(classId)                # Classification index
                confidences.append(float(confidence))   # Confidence

2. Keep a prediction box

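         Non-maximum suppression (NMS) is then applied using the confidence scores and an NMS threshold, so that overlapping boxes are reduced to a single box. (The boxes were already filtered by confidence above, so the confidence threshold passed here does not need to be chosen again.)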

# Keep a prediction box (set from confidence threshold and NMS threshold)
    indices = cv2.dnn.NMSBoxes(bboxes, confidences, 0.5,     0.1)
    #                         Prediction frame confidence threshold NMS threshold (non maximum suppression)

3. Draw the prediction box

# Output prediction box
    for i in indices:
        i = i[0]
        box = bboxes[i]
        x, y, w, h = box[0], box[1], box[2], box[3]
        # print(x,y,w,h)
        cv2.rectangle(image, (x, y), (x + w, y + h), (255, 0, 255), 2)
        cv2.putText(image, f'{name_list[classIds[i]].upper()} {int(confidences[i] * 100)}%',
                   (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2)
    cv2.imshow('image', image)



  The biggest problem with prediction is that it is relatively slow. The tiny version of the model is fast enough, but it is not accurate. There is still a lot of room for improvement.

Total code

import cv2
import numpy as np

# read file
def ReadFile():
    """Load the class labels from ``yolo/coco.names`` into ``name_list``.

    The file is read as one label per line. The position of a label in the
    global ``name_list`` is what ``GetObject`` uses as the class index, so
    the order of the lines is preserved.
    """
    global name_list
    # rstrip() + splitlines() avoids the empty trailing entry that
    # split('\n') yields when the file ends with a newline, and also copes
    # with '\r\n' line endings.
    with open('yolo/coco.names', encoding='utf-8') as f:
        name_list = [line.strip() for line in f.read().rstrip().splitlines()]

# Build neural network
def Network_Init():
    """Create the global ``network`` from the Darknet config and weights.

    Reads ``yolo/yolov3.cfg`` and ``yolo/yolov3.weights`` and stores the
    resulting ``cv2.dnn`` network in the module-level ``network``.
    """
    global network
    cfg_path = 'yolo/yolov3.cfg'          # Model configuration
    weights_path = 'yolo/yolov3.weights'  # Trained weights

    # Build the network from the configuration and the weights.
    network = cv2.dnn.readNetFromDarknet(cfg_path, weights_path)

    # Select OpenCV's own DNN implementation as the backend.
    # NOTE: this was labelled "GPU acceleration", but no GPU backend or
    # target is requested here.
    network.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)

# Turn on the camera
def Capture_Init():
    """Open camera 0 and define the network input size.

    Sets the globals ``capture`` (the ``cv2.VideoCapture`` for device 0)
    and ``w`` / ``h`` (both 320), which ``Input_to_Network`` uses as the
    blob size.
    """
    global capture, w, h
    capture = cv2.VideoCapture(0)
    w = h = 320

# Input to neural network
def Input_to_Network(image):
    """Convert ``image`` to a blob and set it as the input of ``network``.

    Uses the globals ``w`` and ``h`` (set by ``Capture_Init``) as the blob
    size and the global ``network`` (set by ``Network_Init``) as the target.

    Args:
        image: Image to feed to the network, in a form accepted by
            ``cv2.dnn.blobFromImage``.
    """
    # Convert the image to a blob: scale pixel values by 1/255, resize to
    # (w, h), subtract a zero mean, swap the R and B channels, no cropping.
    blob = cv2.dnn.blobFromImage(image, 1 / 255, (w, h), [0, 0, 0], 1, crop=False)

    # Set neural network input. Without this call the blob is discarded and
    # the following forward pass never sees the image.
    network.setInput(blob)

# Obtain neural network output
def Network_Output():
    """Run a forward pass on the global ``network`` and return its outputs.

    Returns:
        The value of ``network.forward`` for the unconnected output layers:
        one array per output layer. ``GetObject`` treats every row of each
        array as one prediction box (box x, y, width, height, a confidence
        value, then the per-class scores).
    """
    # Ask OpenCV for the output layer names directly. The previous
    # layersNames[i[0] - 1] lookup raises on OpenCV builds where
    # getUnconnectedOutLayers() returns a flat array of ints instead of
    # an N x 1 array.
    output_names = network.getUnconnectedOutLayersNames()

    # Forward pass restricted to those layers.
    return network.forward(output_names)

# Frame object
def GetObject(outputs, image, conf_threshold=0.5, nms_threshold=0.1):
    """Draw the detected objects onto ``image`` and show it.

    Every row of every array in ``outputs`` is read as one prediction box:
    elements 0-3 are the box centre x, centre y, width and height as
    fractions of the image size, and elements 5 onward are the per-class
    scores. Boxes whose best class score exceeds ``conf_threshold`` are
    kept, reduced with non-maximum suppression, drawn on ``image`` in place
    together with their label from the global ``name_list``, and the result
    is displayed in the window named 'image'.

    Args:
        outputs: Iterable of 2-D arrays as returned by ``Network_Output``.
        image: Image the boxes are drawn on (modified in place).
        conf_threshold: Minimum class score for a box to be considered;
            also passed to ``cv2.dnn.NMSBoxes`` as its score threshold.
        nms_threshold: NMS threshold passed to ``cv2.dnn.NMSBoxes``.
    """
    h_p, w_p = image.shape[:2]  # Height and width of the image
    bboxes = []                 # Prediction boxes as [x, y, width, height]
    classIds = []               # Class index of each kept box
    confidences = []            # Class score of each kept box

    # 1. Collect every prediction box that is confident enough.
    for output in outputs:
        for detection in output:
            scores = detection[5:]                  # Per-class scores
            classId = int(np.argmax(scores))        # Best-scoring class
            confidence = float(scores[classId])     # Score of that class
            if confidence <= conf_threshold:
                continue

            # Scale the relative width/height to pixels, then convert the
            # centre point into the top-left corner of the box.
            w_b, h_b = int(detection[2] * w_p), int(detection[3] * h_p)
            x = int(detection[0] * w_p - w_b / 2)
            y = int(detection[1] * h_p - h_b / 2)

            bboxes.append([x, y, w_b, h_b])
            classIds.append(classId)
            confidences.append(confidence)

    # 2. Non-maximum suppression: keep one box among overlapping ones.
    indices = cv2.dnn.NMSBoxes(bboxes, confidences, conf_threshold, nms_threshold)

    # 3. Draw the surviving boxes. Depending on the OpenCV version the
    # indices arrive as an N x 1 array, a flat array, or an empty tuple;
    # flattening handles all three (the old `i = i[0]` only the first).
    for i in np.array(indices, dtype=int).flatten():
        x, y, w_b, h_b = bboxes[i]
        cv2.rectangle(image, (x, y), (x + w_b, y + h_b), (255, 0, 255), 2)
        cv2.putText(image, f'{name_list[classIds[i]].upper()} {int(confidences[i] * 100)}%',
                    (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (255, 0, 255), 2)
    cv2.imshow('image', image)

if __name__ == '__main__':
    ReadFile()          # Load the class labels
    Network_Init()      # Neural network initialization
    Capture_Init()      # Camera initialization (also sets the blob size w, h)

    while True:
        # To process live camera frames instead of the still image, use:
        #     success, img = capture.read()
        img = cv2.imread("Resource/test4.jpg")
        if img is None:
            # imread returns None when the file cannot be read; fail with a
            # clear message instead of an obscure error further down.
            raise FileNotFoundError('Could not read image "Resource/test4.jpg"')

        # Input to neural network (previously never called, so the forward
        # pass ran without the image).
        Input_to_Network(img)

        # Obtain neural network output
        outputs = Network_Output()

        # Frame object
        GetObject(outputs, img)

        # waitKey lets the GUI draw the window and sets the interval per
        # frame; press 'q' to exit the loop.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break

    # Free the camera and close the display windows on exit.
    capture.release()
    cv2.destroyAllWindows()


