Linux listen() system call

Keywords: socket CentOS network

Note: the analysis in this article is based on kernel version 3.10.0-693.el7, i.e. the kernel shipped with CentOS 7.4.

1. Function prototype

int listen(int sockfd, int backlog);  

Parameter Description:
sockfd: the file descriptor of the socket, that is, the fd returned by the socket() system call
backlog: the maximum length of the queue that holds pending client connection requests

The listen() system call itself is relatively simple, but it involves the backlog parameter, which is more complex: it affects both the half-open (SYN) connection queue and the fully established (accept) connection queue. For a detailed analysis, see "Detailed explanation of TCP's backlog".

2. Kernel Implementation

SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
    struct socket *sock;
    int err, fput_needed;
    int max_backlog;

    /*
     * Resolve fd to its struct socket (the same lookup is covered in the
     * analysis of the bind() system call). When the lookup fails it
     * reports the error through err, which is returned as-is.
     */
    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (!sock)
        return err;

    /*
     * Cap the requested backlog at the somaxconn sysctl of the socket's
     * net. The comparison is done as unsigned, so a negative backlog
     * compares as a very large value and is capped as well.
     */
    max_backlog = sock_net(sock->sk)->core.sysctl_somaxconn;
    if ((unsigned int)backlog > max_backlog)
        backlog = max_backlog;

    /* Security hook first; only call the protocol's listen op if it passes. */
    err = security_socket_listen(sock, backlog);
    if (err == 0)
        err = sock->ops->listen(sock, backlog);

    /* Release the file reference taken by the lookup, if one was taken. */
    fput_light(sock->file, fput_needed);
    return err;
}

Like bind(), listen() operates on a file descriptor, so the first step is to obtain the corresponding socket structure from the fd in order to work with the network-related variables and structures. From the analysis of the previous system calls, we know that sock->ops points to the inet_stream_ops structure, so sock->ops->listen points to inet_listen().

/*
 *  Move a socket into listening state.
 *
 *  sock:    socket to put into the listening state
 *  backlog: requested backlog; stored in sk->sk_max_ack_backlog on success
 *
 *  Returns 0 on success, -EINVAL if the socket is not an unconnected
 *  SOCK_STREAM socket whose sk_state is CLOSE or LISTEN, or the error
 *  returned by fastopen_init_queue() / inet_csk_listen_start().
 *  The socket lock is held from lock_sock() until the common exit at "out".
 */
int inet_listen(struct socket *sock, int backlog)
{
    struct sock *sk = sock->sk;
    unsigned char old_state;
    int err;

    lock_sock(sk);

    /* Default error for every validation failure below. */
    err = -EINVAL;
    if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
        goto out;

    old_state = sk->sk_state;
    // listen() is only allowed on a socket in the CLOSE or LISTEN state
    // (in the LISTEN state only the backlog can be changed)
    if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
        goto out;

    /* Really, if the socket is already in listen state
     * we can only allow the backlog to be adjusted.
     */
    if (old_state != TCP_LISTEN) {
        // TCP Fast Open setup; not analyzed in this article
        if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) != 0 && 
            inet_csk(sk)->icsk_accept_queue.fastopenq == NULL) {
            if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) != 0)
                err = fastopen_init_queue(sk, backlog);
            else if ((sysctl_tcp_fastopen & TFO_SERVER_WO_SOCKOPT2) != 0)
                err = fastopen_init_queue(sk, ((uint)sysctl_tcp_fastopen) >> 16);
            else
                err = 0;
            if (err)
                goto out;

            tcp_fastopen_init_key_once(true);
        }
        // Initialize the listening socket, including the half-open (SYN) connection queue
        err = inet_csk_listen_start(sk, backlog);
        if (err)
            goto out;
    }
    sk->sk_max_ack_backlog = backlog;// This is what actually sets the maximum length of the fully established (accept) connection queue
    err = 0;

out:
    /* Single exit point: always drop the socket lock taken above. */
    release_sock(sk);
    return err;
}

inet_csk_listen_start() mainly does two things: it allocates the half-open (SYN) connection queue, and it adds the socket to the listen hash table.

/*
 * Start listening on sk: allocate its request queue, switch it to the
 * LISTEN state, re-check the port and hash the socket.
 *
 * sk:               socket being moved to the LISTEN state
 * nr_table_entries: size passed to reqsk_queue_alloc() (inet_listen()
 *                   passes the backlog here)
 *
 * Returns 0 on success, the non-zero result of reqsk_queue_alloc() if the
 * allocation fails, or -EADDRINUSE if get_port() rejects the port.
 */
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
    struct inet_sock *inet = inet_sk(sk);
    struct inet_connection_sock *icsk = inet_csk(sk);
    // Allocate the half-open (SYN) connection queue according to nr_table_entries
    // (which is derived from the backlog parameter set by the user)
    int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);

    if (rc != 0)
        return rc;

    sk->sk_max_ack_backlog = 0;
    sk->sk_ack_backlog = 0;
    inet_csk_delack_init(sk);

    sk->sk_state = TCP_LISTEN;// Move the socket to the LISTEN state
    /* bind() has already called sk->sk_prot->get_port() once before listen() is called.
     * It is called again here because bind() and listen() are not one atomic operation:
     * some properties of the socket may have been modified in between,
     * for example sk->sk_reuse or sk->sk_reuseport, so the port is checked again
     * to make sure it is still available. */
    if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
        inet->inet_sport = htons(inet->inet_num);// Set the source port (network byte order)

        sk_dst_reset(sk);
        sk->sk_prot->hash(sk);// Add the listening socket to the listen hash table, keyed by port number

        return 0;
    }

    sk->sk_state = TCP_CLOSE;// Failed: reset the socket state to its initial value, CLOSE
    __reqsk_queue_destroy(&icsk->icsk_accept_queue);// Release the queue allocated above
    return -EADDRINUSE;
}

Finally, the socket is added to the listen hash table.

/*
 * Hash sk via __inet_hash() unless it is in the CLOSE state.
 * The insertion runs with bottom halves disabled.
 */
void inet_hash(struct sock *sk)
{
    /* A socket in the CLOSE state is not hashed. */
    if (sk->sk_state == TCP_CLOSE)
        return;

    local_bh_disable();
    __inet_hash(sk);
    local_bh_enable();
}

/*
 * Insert sk into the hash tables of its protocol.
 * A socket in the LISTEN state goes into the listening hash; any other
 * socket is handed to __inet_hash_nolisten().
 */
static void __inet_hash(struct sock *sk)
{
    struct inet_hashinfo *tables = sk->sk_prot->h.hashinfo;
    struct inet_listen_hashbucket *bucket;

    if (sk->sk_state == TCP_LISTEN) {
        /* A socket about to be hashed is expected to be unhashed. */
        WARN_ON(!sk_unhashed(sk));

        /* Pick the listening-hash bucket for this socket (by port number). */
        bucket = &tables->listening_hash[inet_sk_listen_hashfn(sk)];

        /* Link the socket into the bucket's list under the bucket lock. */
        spin_lock(&bucket->lock);
        __sk_nulls_add_node_rcu(sk, &bucket->head);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        spin_unlock(&bucket->lock);
    } else {
        __inet_hash_nolisten(sk, NULL);
    }
}

In summary, listen() derives the actual request queue length from the backlog set by the user (capped at somaxconn) and allocates the half-open (SYN) connection queue. It then checks again that the port is still available and adds the socket to the listen hash table.

Some structural relationships are as follows:

Posted by KindMan on Sun, 05 Apr 2020 06:20:27 -0700