Implementation of Linux Network IO basic components

Keywords: Linux epoll TCP/IP

1 Theoretical basis

1.1 Epoll model overview

1.1.1 advantages of epoll model over Select/poll model

(1) The maximum number of descriptors that epoll can monitor is limited only by available memory (growing roughly linearly with it), not by a small fixed limit such as the FD_SETSIZE used by select.
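(2) I/O efficiency does not degrade linearly as the number of monitored descriptors grows. The kernel uses a per-descriptor callback to queue descriptors as they become ready, so epoll_wait only has to look at the ready ones instead of scanning the whole set.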
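(3) Unnecessary copying between kernel and user space is reduced. This is often described as epoll sharing a piece of memory with user space through mmap; note, however, that the mainline Linux implementation does not use mmap. The saving comes from registering each descriptor once with epoll_ctl and having epoll_wait copy back only the events that are actually ready.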

1.1.2 basic usage of epoll

Epoll is easy to use, and its usage can be divided into three steps: first, create the epoll instance; second, register, modify, or remove descriptors through the control interface; finally, wait for and process I/O events.
Create interface:

epoll_create(1024);//Create an epoll instance; the size argument is only a hint (ignored since Linux 2.6.8, but it must be greater than 0), not a hard limit of 1024 descriptors

Control interface: used to maintain fd sets

//Operations: add = EPOLL_CTL_ADD, modify = EPOLL_CTL_MOD, delete = EPOLL_CTL_DEL
epoll_ctl(m_epfd, EPOLL_CTL_ADD, fd, &ev);//Used to register fd and add related events

IO event trigger interface:

epoll_wait(m_epfd, events, 1024, wait_timeout);//Wait for I/O events on the registered descriptor set

1.1.3 Epoll trigger mode

(1) Level-triggered mode (epoll LT): once a file descriptor is ready, the kernel keeps reporting it on every epoll_wait call until the user has handled the event and the descriptor is no longer ready. Works with both blocking and non-blocking descriptors.
(2) Edge-triggered mode (epoll ET): the kernel notifies the user only once, at the moment the file descriptor becomes ready. If the user does not handle the event, the notification is not repeated. It must be used with non-blocking descriptors.

2 basic implementation

With the theoretical basis above, we can start designing the basic network components. The design covers two tasks: the basic socket operations, and the event-driven I/O model. The component deals only with network operations and event distribution; business-specific logic is kept outside it and reached through callbacks.

2.1 Socket management component

2.1.1 basic socket operation

This part mainly carries out the general operations of socket, including server listening and starting, client connection request, receiving data, sending data, closing socket connection and so on.

/*
 * Create a non-blocking TCP listening socket bound to server_ip:port and
 * register it with the socket manager and the event dispatcher.
 *
 * this          - base_socket_t instance to set up; NULL yields -1.
 * server_ip     - local address to bind; assigned to priv->local_ip.
 * port          - local TCP port; assigned to priv->local_port.
 * callback      - assigned to priv->callback.
 * callback_data - assigned to priv->callback_data.
 *
 * Returns 0 on success and -1 on failure. When bind() or listen() fails the
 * descriptor is closed and priv->fd is reset to INVALID_SOCKET so that no
 * later call can operate on a closed (and possibly reused) descriptor.
 */
int base_socket_listen(void* this, const char* server_ip, uint16_t port, callback_t callback, void* callback_data)
{
	log_debug("base_socket_listen, server_ip=%s, port=%d", server_ip, port);

	base_socket_t *p = (base_socket_t *)this;
	if (!p)
		return -1;

	base_socket_priv_t *priv = (base_socket_priv_t *)p->priv;
	if (!priv)
		return -1;

	priv->local_ip = server_ip;
	priv->local_port = port;
	priv->callback = callback;
	priv->callback_data = callback_data;

	priv->fd = socket(AF_INET, SOCK_STREAM, 0);
	if (priv->fd == INVALID_SOCKET)
	{
		log_error("socket failed, err_code=%d, server_ip=%s, port=%u", _get_error_code(), server_ip, port);
		return -1;
	}

	_set_reuse_addr(priv->fd);
	_set_non_block(priv->fd);

	struct sockaddr_in serv_addr;
	_set_addr(server_ip, port, &serv_addr);

	int ret = bind(priv->fd, (struct sockaddr *)&serv_addr, sizeof(serv_addr));
	if (ret == SOCKET_ERROR)
	{
		log_error("bind failed, err_code=%d, server_ip=%s, port=%u", _get_error_code(), server_ip, port);
		close(priv->fd);
		priv->fd = INVALID_SOCKET;
		return -1;
	}

	/* 64 is the accept backlog passed to listen(). */
	ret = listen(priv->fd, 64);
	if (ret == SOCKET_ERROR)
	{
		log_error("listen failed, err_code=%d, server_ip=%s, port=%u", _get_error_code(), server_ip, port);
		close(priv->fd);
		priv->fd = INVALID_SOCKET;
		return -1;
	}

	priv->state = SOCKET_STATE_LISTENING;

	log_debug("Listen on %s:%d", server_ip, port);

	/* Make the socket discoverable by fd, then start watching it. */
	manager_add_base_socket(this);
	event_dispatch_add_event(priv->fd, SOCKET_READ | SOCKET_EXCEP);
	return 0;
}

/*
 * Start a non-blocking TCP connection to server_ip:port and register the
 * socket with the socket manager and the event dispatcher.
 *
 * this          - base_socket_t instance to set up; NULL yields -1.
 * server_ip     - remote address; assigned to priv->remote_ip.
 * port          - remote TCP port; assigned to priv->remote_port.
 * callback      - assigned to priv->callback.
 * callback_data - assigned to priv->callback_data.
 *
 * Returns the new socket descriptor on success (note: not 0) and -1 on
 * failure. On return the socket is in SOCKET_STATE_CONNECTING; this function
 * does not wait for the connection to complete.
 */
int base_socket_connect(void* this, const char* server_ip, uint16_t port, callback_t callback, void* callback_data)
{
	log_debug("base_socket_connect, server_ip=%s, port=%d", server_ip, port);

	base_socket_t *p = (base_socket_t *)this;
	if (!p)
		return -1;

	base_socket_priv_t *priv = (base_socket_priv_t *)p->priv;
	if (!priv)
		return -1;

	priv->remote_ip = server_ip;
	priv->remote_port = port;
	priv->callback = callback;
	priv->callback_data = callback_data;

	priv->fd = socket(AF_INET, SOCK_STREAM, 0);
	if (priv->fd == INVALID_SOCKET)
	{
		log_error("socket failed, err_code=%d, server_ip=%s, port=%u", _get_error_code(), server_ip, port);
		return -1;
	}

	_set_non_block(priv->fd);
	_set_no_delay(priv->fd);

	struct sockaddr_in serv_addr;
	_set_addr(server_ip, port, &serv_addr);

	int ret = connect(priv->fd, (struct sockaddr *)&serv_addr, sizeof(serv_addr));
	if (ret == SOCKET_ERROR)
	{
		/* Read the error code once so the check and the log agree. */
		int err_code = _get_error_code();

		/* An error that _is_block() accepts is not treated as a failure
		 * (presumably the "in progress" result of a non-blocking
		 * connect — confirm against _is_block). */
		if (!_is_block(err_code))
		{
			log_error("connect failed, err_code=%d, server_ip=%s, port=%u", err_code, server_ip, port);
			close(priv->fd);
			priv->fd = INVALID_SOCKET;
			return -1;
		}
	}

	priv->state = SOCKET_STATE_CONNECTING;
	manager_add_base_socket(this);
	event_dispatch_add_event(priv->fd, SOCKET_ALL);

	return priv->fd;
}

int base_socket_send(void* this, void* buf, int len)
{
    base_socket_t *p = (void *base_socket_t)this;
    if(!p)  return -1;

    base_socket_priv_t *priv = (base_socket_priv_t *)p->priv;
    if(!priv) return -1;

	if (priv->state != SOCKET_STATE_CONNECTED)
		return -1;

	int ret = send(priv->fd, (char*)buf, len, 0);
	if (ret == SOCKET_ERROR)
	{
		int err_code = _get_error_code();
		if (_is_block(err_code))
		{
			ret = 0;
			//log("socket send block fd=%d", priv->fd);
		}
		else
		{
            log_error("send failed, err_code=%d, len=%d", err_code, len);
		}
	}

	return ret;
}

int base_socket_recv(void* this, void* buf, int len)
{
    base_socket_t *p = (void *base_socket_t)this;
    if(!p)  return -1;

    base_socket_priv_t *priv = (base_socket_priv_t *)p->priv;
    if(!priv) return -1;

	return recv(priv->fd, (char*)buf, len, 0);
}

int base_socket_colse(void* this)
{
    base_socket_t *p = (void *base_socket_t)this;
    if(!p)  return -1;

    base_socket_priv_t *priv = (base_socket_priv_t *)p->priv;
    if(!priv) return -1;

	event_dispatch_remove_event(priv->fd, SOCKET_ALL);
	manager_remove_base_socket(this);
	close(priv->fd);
	base_socket_release_ref();

	return 0;
}

2.1.2 socket event response

This part contains the handlers that run after event_dispatch reports an I/O event: each handler examines the socket state and then invokes the user callback with the corresponding message.

void base_socket_onread(void* this)
{
    base_socket_t *p = (void *base_socket_t)this;
    if(!p)  return ;

    base_socket_priv_t *priv = (base_socket_priv_t *)p->priv;
    if(!priv) return ;

	if (priv->state == SOCKET_STATE_LISTENING)
	{
		_accept_new_socket();
	}
	else
	{
		u_long avail = 0;
        int ret = ioctlsocket(priv->fd, FIONREAD, &avail);
		if ( (SOCKET_ERROR == ret) || (avail == 0) )
		{
			priv->call_back(priv->callback_data, SOCKET_MSG_CLOSE, (int)priv->fd, NULL);
		}
		else
		{
			priv->call_back(priv->callback_data, SOCKET_MSG_READ, (int)priv->fd, NULL);
		}
	}
}

void base_socket_onwrite(void* this)
{
    base_socket_t *p = (void *base_socket_t)this;
    if(!p)  return ;

    base_socket_priv_t *priv = (base_socket_priv_t *)p->priv;
    if(!priv) return ;

	if (priv->state == SOCKET_STATE_CONNECTING)
	{
		int error = 0;
		socklen_t len = sizeof(error);

		getsockopt(priv->fd, SOL_SOCKET, SO_ERROR, (void*)&error, &len);

		if (error) {
			priv->call_back(priv->callback_data, SOCKET_MSG_CLOSE, (int)priv->fd, NULL);
		} else {
			priv->state = SOCKET_STATE_CONNECTED;
			priv->call_back(priv->callback_data, SOCKET_MSG_CONFIRM, (int)priv->fd, NULL);
		}
	}
	else
	{
		priv->call_back(priv->callback_data, SOCKET_MSG_WRITE, (int)priv->fd, NULL);
	}
}

void base_socket_onclose(void* this)
{
    base_socket_t *p = (void *base_socket_t)this;
    if(!p)  return ;

    base_socket_priv_t *priv = (base_socket_priv_t *)p->priv;
    if(!priv) return ;

	priv->state = SOCKET_STATE_CLOSING;
	priv->call_back(priv->callback_data, SOCKET_MSG_CLOSE, (int)priv->fd, NULL);
}

2.2 Epoll event processing component

2.2.1 Epoll event management

This section adds sockets to, and removes them from, the set of descriptors managed by epoll.

/*
 * Register fd with the epoll instance cxt.epfd.
 *
 * fd           - descriptor to watch.
 * socket_event - currently unused: every descriptor is registered with the
 *                same fixed, edge-triggered mask regardless of this value.
 *
 * A failing epoll_ctl() is logged; nothing is reported to the caller.
 */
void event_dispatch_add_event(int fd, uint8_t socket_event)
{
	(void)socket_event;

	/* Zero the whole structure so no uninitialised bytes of the data union
	 * are handed to the kernel. */
	struct epoll_event ev = {0};
	ev.events = EPOLLIN | EPOLLOUT | EPOLLET | EPOLLPRI | EPOLLERR | EPOLLHUP;
	ev.data.fd = fd;

	if (epoll_ctl(cxt.epfd, EPOLL_CTL_ADD, fd, &ev) != 0)
	{
		log_error("epoll_ctl() failed, errno=%d", errno);
	}
}

/*
 * Remove fd from the epoll instance cxt.epfd.
 *
 * fd           - descriptor to stop watching.
 * socket_event - currently unused.
 *
 * A failing epoll_ctl() is logged; nothing is reported to the caller.
 */
void event_dispatch_remove_event(int fd, uint8_t socket_event)
{
	(void)socket_event;

	/* EPOLL_CTL_DEL ignores the event argument, but kernels before 2.6.9
	 * reject a NULL pointer, so pass a dummy structure. */
	struct epoll_event unused = {0};

	if (epoll_ctl(cxt.epfd, EPOLL_CTL_DEL, fd, &unused) != 0)
	{
		log_error("epoll_ctl failed, errno=%d", errno);
	}
}

2.2.2 Epoll event distribution

This part has two pieces. The first handles timer and loop callbacks: registering them, removing them, and checking on every pass which timers have expired. The second starts epoll event management as a whole: the main loop waits for socket events, dispatches them to the socket handlers, and then runs the externally registered timers and loop callbacks.

/*
 * Register a periodic timer, or re-arm an existing one.
 *
 * A timer is identified by the pair (callback, user_data). If such a timer
 * is already in m_timer_list, its interval and next expiry are updated;
 * otherwise a new entry is appended.
 *
 * callback  - function invoked when the timer expires.
 * user_data - opaque pointer handed back to callback.
 * interval  - period, in the unit returned by get_tick_count().
 */
void event_dispatch_add_timer(callback_t callback, void* user_data, uint64_t interval)
{
	list<timer_item_t*>::iterator it;
	for (it = m_timer_list.begin(); it != m_timer_list.end(); ++it)
	{
		timer_item_t* pitem = *it;
		if (pitem->callback == callback && pitem->user_data == user_data)
		{
			pitem->interval = interval;
			pitem->next_tick = get_tick_count() + interval;
			return;
		}
	}

	/* Allocate with new: event_dispatch_remove_timer() releases entries
	 * with delete, so malloc() here would be a mismatched allocation. */
	timer_item_t* pitem = new timer_item_t;
	pitem->callback = callback;
	pitem->user_data = user_data;
	pitem->interval = interval;
	pitem->next_tick = get_tick_count() + interval;
	m_timer_list.push_back(pitem);
}

/*
 * Remove the first timer registered for the pair (callback, user_data).
 *
 * The entry is unlinked from m_timer_list and released with delete.
 * Nothing happens when no matching timer exists.
 */
void event_dispatch_remove_timer(callback_t callback, void* user_data)
{
	list<timer_item_t*>::iterator pos = m_timer_list.begin();
	while (pos != m_timer_list.end())
	{
		timer_item_t* entry = *pos;
		if (entry->callback != callback || entry->user_data != user_data)
		{
			++pos;
			continue;
		}

		/* entry was copied out above, so it stays valid after erase(). */
		m_timer_list.erase(pos);
		delete entry;
		return;
	}
}

void _check_timer()
{
	uint64_t curr_tick = get_tick_count();
	list<timer_item_t*>::iterator it;

	for (it = m_timer_list.begin(); it != m_timer_list.end(); )
	{
		timer_item_t* pitem = *it;
		it++;		// iterator maybe deleted in the callback, so we should increment it before callback
		if (curr_tick >= pitem->next_tick)
		{
			pitem->next_tick += pitem->interval;
			pitem->callback(pitem->user_data, NETLIB_MSG_TIMER, 0, NULL);
		}
	}
}

/*
 * Register a callback that _check_loop() invokes on every pass.
 *
 * The entry reuses timer_item_t; only callback and user_data are set here,
 * the interval and next_tick fields are left untouched.
 */
void event_dispatch_add_loop(callback_t callback, void* user_data)
{
    timer_item_t* entry = new timer_item_t;

    entry->user_data = user_data;
    entry->callback = callback;

    m_loop_list.push_back(entry);
}

void _check_loop()
{
    for (list<timer_item_t*>::iterator it = m_loop_list.begin(); it != m_loop_list.end(); it++) {
        timer_item_t* pitem = *it;
        pitem->callback(pitem->user_data, NETLIB_MSG_LOOP, 0, NULL);
    }
}

/*
 * Run the event loop until event_dispatch_stop_dispatch() clears
 * cxt.running (or epoll_wait() fails with an unrecoverable error).
 *
 * Each pass waits up to wait_timeout on cxt.epfd, dispatches the reported
 * events to the matching socket's handlers, then runs _check_timer() and
 * _check_loop(). Calling this while the loop is already running is a no-op.
 *
 * wait_timeout - timeout handed to epoll_wait() (milliseconds).
 */
void event_dispatch_start_dispatch(uint32_t wait_timeout)
{
	struct epoll_event events[1024];
	/* Derive maxevents from the array so the two can never disagree. */
	const int max_events = (int)(sizeof(events) / sizeof(events[0]));
	int nfds = 0;

	if (cxt.running)
		return;
	cxt.running = true;

	while (cxt.running)
	{
		nfds = epoll_wait(cxt.epfd, events, max_events, wait_timeout);
		if (nfds < 0 && errno != EINTR)
		{
			/* Anything but EINTR (EBADF, EINVAL, EFAULT) will fail again on
			 * every pass; stop instead of spinning at full CPU. */
			log_error("epoll_wait failed, errno=%d", errno);
			cxt.running = false;
			break;
		}

		/* On EINTR nfds is -1, so the loop body is simply skipped. */
		for (int i = 0; i < nfds; i++)
		{
			int ev_fd = events[i].data.fd;
			base_socket_t* psocket = manager_find_base_socket(ev_fd);
			if (!psocket)
				continue;

			/* NOTE(review): the handlers in this file take the socket as a
			 * void* argument, yet they are invoked here with none — confirm
			 * against the base_socket_t declaration. */

			/* NOTE(review): event_dispatch_add_event() does not register
			 * EPOLLRDHUP, so this branch is presumably never taken. */
			if (events[i].events & EPOLLRDHUP)
			{
				psocket->onclose();
			}

			if (events[i].events & EPOLLIN)
			{
				psocket->onread();
			}

			if (events[i].events & EPOLLOUT)
			{
				/* NOTE(review): "onwirte" looks like a typo for "onwrite";
				 * left unchanged because the member declaration is not
				 * visible here. */
				psocket->onwirte();
			}

			if (events[i].events & (EPOLLPRI | EPOLLERR | EPOLLHUP))
			{
				psocket->onclose();
			}

			/* Presumably balances a reference taken by
			 * manager_find_base_socket() — confirm. */
			psocket->release_ref();
		}

		_check_timer();
		_check_loop();
	}
}

/*
 * Ask the event loop to stop by clearing cxt.running; the loop in
 * event_dispatch_start_dispatch() tests this flag at the top of each pass.
 */
void event_dispatch_stop_dispatch()
{
	cxt.running = false;
}

Posted by cryp7 on Tue, 21 Sep 2021 00:57:02 -0700