NAT Unique Quintuple Selection

Keywords: C network IE less Session

When configuring NAT with iptables, the following extension options are available:

# SNAT: source address translation, used in the POSTROUTING and INPUT chains
--to-source [<ipaddr>[-<ipaddr>]][:port[-port]]
--random        # Map to a randomized port number
--random-fully  # Map to a fully randomized port number (taken directly from the PRNG)
--persistent    # Give the same client the same mapped address for every connection

# DNAT: destination address translation, used in the PREROUTING and OUTPUT chains
--to-destination [<ipaddr>[-<ipaddr>]][:port[-port]]
--random        # Map to a randomized port number
--persistent    # Give the same client the same mapped address for every connection

In the kernel, the following flags correspond to the above options:

/* IP range specified */
#define NF_NAT_RANGE_MAP_IPS            (1 << 0)
/* Specific range of ports is specified */
#define NF_NAT_RANGE_PROTO_SPECIFIED        (1 << 1)
/* Port selection is randomized: the starting port offset comes from the
 * L3 protocol's secure_port() function. Corresponds to --random.
 */
#define NF_NAT_RANGE_PROTO_RANDOM        (1 << 2)
/* Persistent mapping: the same client is mapped to the same address.
 * Corresponds to --persistent.
 */
#define NF_NAT_RANGE_PERSISTENT            (1 << 3)
/* Fully random port selection. Corresponds to --random-fully. */
#define NF_NAT_RANGE_PROTO_RANDOM_FULLY        (1 << 4)

/* Some of the above flags can be combined. */

/* Either of the two port-randomization flags. */
#define NF_NAT_RANGE_PROTO_RANDOM_ALL        \
    (NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PROTO_RANDOM_FULLY)
/* Mask covering every range flag defined above. */
#define NF_NAT_RANGE_MASK                    \
    (NF_NAT_RANGE_MAP_IPS | NF_NAT_RANGE_PROTO_SPECIFIED |    \
     NF_NAT_RANGE_PROTO_RANDOM | NF_NAT_RANGE_PERSISTENT |    \
     NF_NAT_RANGE_PROTO_RANDOM_FULLY)

Building nat information

netfilter builds NAT information in two places: when a packet matches a NAT rule, and when a related (expected) connection sets up its NAT in the expect function. In both cases the NAT information is built by the function nf_nat_setup_info; the difference between the two is the range parameter. In the former case it is set by the iptables rule; in the latter it is determined by the helper function. NAT modifies only the reply-direction tuple of the connection tracking entry.

/* Modify nat quintuples according to the type and scope of nat provided */
unsigned int
nf_nat_setup_info(struct nf_conn *ct,
          const struct nf_nat_range *range,
          enum nf_nat_manip_type maniptype)
{
    struct net *net = nf_ct_net(ct);/* Get the network namespace where the connection trace resides */
    struct nf_conntrack_tuple curr_tuple, new_tuple;

    /* Can't setup nat info for confirmed ct. */
    /* Connections that have been confirmed are not being built */
    if (nf_ct_is_confirmed(ct))
        return NF_ACCEPT;

    WARN_ON(maniptype != NF_NAT_MANIP_SRC &&
        maniptype != NF_NAT_MANIP_DST);

    if (WARN_ON(nf_nat_initialized(ct, maniptype)))
        return NF_DROP;

    /* What we've got will look like inverse of reply. Normally
     * this is what is in the conntrack, except for prior
     * manipulations (future optimization: if num_manips == 0,
     * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
     * Gets a quintuple of the direction of the request
     */
    nf_ct_invert_tuplepr(&curr_tuple,
                 &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
    /* Get the five tuples of the request direction after nat according to the five tuples of the request direction */
    get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype);
    /* The only five tuples obtained are flipped and will serve as the five tuples in the direction of the response of the connection trace. */
    /* The new request direction of the quintuple is different from the original quintuple, it needs to change the direction of the response of the quintuple. */
    if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) {
        struct nf_conntrack_tuple reply;

        /* Alter conntrack table so will recognize replies. */
        /* A new quintile that receives the direction of the response based on the new quintile */
        nf_ct_invert_tuplepr(&reply, &new_tuple);
        /* Replace the quintile of the direction of the response */
        nf_conntrack_alter_reply(ct, &reply);

        /* Non-atomic: we own this at the moment. */
        if (maniptype == NF_NAT_MANIP_SRC)
            ct->status |= IPS_SRC_NAT;
        else
            ct->status |= IPS_DST_NAT;
        /* To determine whether help exists for this connection, seq-adj extension must be added if it exists */
        if (nfct_help(ct) && !nfct_seqadj(ct))
            if (!nfct_seqadj_ext_add(ct))
                return NF_DROP;
    }
    /* If it is a source NAT operation, add the quintuple to the nf_nat_bysource hash table */
    /* This table will be used to select the source IP of snat, that is, the same client will use the same source IP. */
    if (maniptype == NF_NAT_MANIP_SRC) {
        unsigned int srchash;
        spinlock_t *lock;

        srchash = hash_by_src(net,
                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
        lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS];
        spin_lock_bh(lock);
        hlist_add_head_rcu(&ct->nat_bysource,
                   &nf_nat_bysource[srchash]);
        spin_unlock_bh(lock);
    }

    /* It's done. nat Finished processing */
    if (maniptype == NF_NAT_MANIP_DST)
        ct->status |= IPS_DST_NAT_DONE;
    else
        ct->status |= IPS_SRC_NAT_DONE;

    return NF_ACCEPT;
}

Focus on get_unique_tuple function

The statement nf_ct_invert_tuplepr(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); computes curr_tuple. For the first packet of a connection that has not been NATed yet, curr_tuple is identical to the original (request-direction) tuple; for a connection that has already been NATed, it differs from the original tuple because it reflects the earlier manipulation.
/* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING,
 * we change the source to map into the range. For NF_INET_PRE_ROUTING
 * and NF_INET_LOCAL_OUT, we change the destination to map into the
 * range. It might not be possible to get a unique tuple, but we try.
 * At worst (or if we race), we will end up with a final duplicate in
 * __ip_conntrack_confirm and drop the packet.
 *
 * @tuple:      output, the tuple that was selected
 * @orig_tuple: current original-direction (request) tuple
 * @range:      address/port range and flags to map into
 * @ct:         conntrack entry the tuple is selected for
 * @maniptype:  NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST
 */
static void
get_unique_tuple(struct nf_conntrack_tuple *tuple,
         const struct nf_conntrack_tuple *orig_tuple,
         const struct nf_nat_range *range,
         struct nf_conn *ct,
         enum nf_nat_manip_type maniptype)
{
    const struct nf_conntrack_zone *zone;
    const struct nf_nat_l3proto *l3proto;
    const struct nf_nat_l4proto *l4proto;
    struct net *net = nf_ct_net(ct);

    zone = nf_ct_zone(ct);

    /* The protocol lookups and every use of their results below happen
     * inside this RCU read-side section; all exits go through "out".
     */
    rcu_read_lock();
    l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
    l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
                    orig_tuple->dst.protonum);

    /* 1) If this srcip/proto/src-proto-part is currently mapped,
     * and that same mapping gives a unique tuple within the given
     * range, use that.
     *
     * This is only required for source (ie. NAT/masq) mappings.
     * So far, we don't do local source mappings, so multiple
     * manips not an issue.
     *
     * Only taken for source NAT when neither random flag is set.
     */
    if (maniptype == NF_NAT_MANIP_SRC &&
        !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        /* Try the original tuple first: if it already lies inside the
         * requested range and is not in use, it is taken unchanged.
         */
        if (in_range(l3proto, l4proto, orig_tuple, range)) {
            if (!nf_nat_used_tuple(orig_tuple, ct)) {
                *tuple = *orig_tuple;
                goto out;
            }
            /* In range but already in use: fall through to step 2. */

        /* The original tuple is not in range: look for an existing
         * mapping of the same source (find_appropriate_src) and reuse
         * its mapped source.
         */
        } else if (find_appropriate_src(net, zone, l3proto, l4proto,
                        orig_tuple, tuple, range)) {
            pr_debug("get_unique_tuple: Found current src map\n");
            /* Use the reused mapping only if the resulting tuple is
             * not already in use.
             */
            if (!nf_nat_used_tuple(tuple, ct))
                goto out;
        }
    }

    /* 2) Select the least-used IP/proto combination in the given range.
     * Reached for destination NAT, for randomized source NAT, and when
     * step 1 did not yield a usable tuple.
     */
    *tuple = *orig_tuple;
    find_best_ips_proto(zone, tuple, range, ct, maniptype);

    /* 3) The per-protocol part of the manip is made to map into
     * the range to make a unique tuple.
     */

    /* Only bother mapping if it's not already in range and unique.
     * This shortcut applies only when no random flag is set.
     */
    if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
        /* A port range was specified: keep the current port if it is
         * inside that range AND either the range consists of a single
         * port (there is nothing else to choose) or the tuple is not
         * in use.
         */
        if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
            if (l4proto->in_range(tuple, maniptype,
                          &range->min_proto,
                          &range->max_proto) &&
                (range->min_proto.all == range->max_proto.all ||
                 !nf_nat_used_tuple(tuple, ct)))
                goto out;
        /* No port range specified: keep the tuple if it is not in use. */
        } else if (!nf_nat_used_tuple(tuple, ct)) {
            goto out;
        }
    }

    /* Last chance: get protocol to try to obtain unique tuple. */
    l4proto->unique_tuple(l3proto, tuple, range, maniptype, ct);
out:
    rcu_read_unlock();
}

find_appropriate_src

/* Only called for SRC manip.
 *
 * Walk the nf_nat_bysource bucket that @tuple hashes to, looking for a
 * conntrack with the same source (as decided by same_src()), in the same
 * network namespace and the same zone. For each such entry, build a
 * candidate in @result from the inverse of that entry's reply tuple, with
 * the destination taken from @tuple.
 *
 * Returns 1 and leaves the candidate in @result if it lies within @range;
 * returns 0 if no entry produced an in-range candidate (@result may have
 * been overwritten in that case).
 */
static int
find_appropriate_src(struct net *net,
             const struct nf_conntrack_zone *zone,
             const struct nf_nat_l3proto *l3proto,
             const struct nf_nat_l4proto *l4proto,
             const struct nf_conntrack_tuple *tuple,
             struct nf_conntrack_tuple *result,
             const struct nf_nat_range *range)
{
    const unsigned int bucket = hash_by_src(net, tuple);
    const struct nf_conn *ct;

    hlist_for_each_entry_rcu(ct, &nf_nat_bysource[bucket], nat_bysource) {
        /* Skip entries from a different source, netns or zone. */
        if (!same_src(ct, tuple))
            continue;
        if (!net_eq(net, nf_ct_net(ct)))
            continue;
        if (!nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL))
            continue;

        /* Copy source part from reply tuple: inverting the reply
         * tuple gives this entry's mapped original-direction tuple.
         */
        nf_ct_invert_tuplepr(result,
                   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
        /* Keep the destination of the tuple we are mapping. */
        result->dst = tuple->dst;

        /* Accept the first candidate that fits the requested range;
         * otherwise keep looking.
         */
        if (in_range(l3proto, l4proto, result, range))
            return 1;
    }

    return 0;
}

find_best_ips_proto

/* For [FUTURE] fragmentation handling, we want the least-used
 * src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
 * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
 * 1-65535, we don't do pro-rata allocation based on ports; we choose
 * the ip with the lowest src-ip/dst-ip/proto usage.
 *
 * As implemented here, no usage is counted: the address is chosen from
 * [range->min_addr, range->max_addr] by hashing, and written into the
 * source or destination address of @tuple depending on @maniptype.
 */
static void
find_best_ips_proto(const struct nf_conntrack_zone *zone,
            struct nf_conntrack_tuple *tuple,
            const struct nf_nat_range *range,
            const struct nf_conn *ct,
            enum nf_nat_manip_type maniptype)
{
    union nf_inet_addr *var_ipp;
    unsigned int i, max;
    /* Host order */
    u32 minip, maxip, j, dist;
    bool full_range;

    /* No IP mapping?  Do nothing. */
    if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
        return;

    /* Point at the address that this NAT type rewrites. */
    if (maniptype == NF_NAT_MANIP_SRC)
        var_ipp = &tuple->src.u3;
    else
        var_ipp = &tuple->dst.u3;

    /* Fast path: only one choice. */
    if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) {
        *var_ipp = range->min_addr;
        return;
    }

    /* Index of the last 32-bit word of the address:
     * 0 for IPv4, 3 for IPv6.
     */
    if (nf_ct_l3num(ct) == NFPROTO_IPV4)
        max = sizeof(var_ipp->ip) / sizeof(u32) - 1;
    else
        max = sizeof(var_ipp->ip6) / sizeof(u32) - 1;

    /* Hashing source and destination IPs gives a fairly even
     * spread in practice (if there are a small number of IPs
     * involved, there usually aren't that many connections
     * anyway).  The consistency means that servers see the same
     * client coming from the same IP (some Internet Banking sites
     * like this), even across reboots.
     *
     * With NF_NAT_RANGE_PERSISTENT the hash seed is 0, so only the
     * source address contributes to the hash. Otherwise the seed is the
     * last word of the destination address XORed with the zone id.
     */
    j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32),
           range->flags & NF_NAT_RANGE_PERSISTENT ?
            0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id);

    /* Choose the address one 32-bit word at a time, starting at word 0,
     * scaling the hash into the allowed span for that word.
     */
    full_range = false;
    for (i = 0; i <= max; i++) {
        /* If first bytes of the address are at the maximum, use the
         * distance. Otherwise use the full range.
         */
        if (!full_range) {
            minip = ntohl((__force __be32)range->min_addr.all[i]);
            maxip = ntohl((__force __be32)range->max_addr.all[i]);
            dist  = maxip - minip + 1;
        } else {
            minip = 0;
            dist  = ~0;
        }

        var_ipp->all[i] = (__force __u32)
            htonl(minip + reciprocal_scale(j, dist));
        /* Once a chosen word differs from the same word of max_addr,
         * all following words are taken from the full 32-bit range.
         */
        if (var_ipp->all[i] != range->max_addr.all[i])
            full_range = true;

        /* Unless persistent, mix this word of the destination address
         * into the hash used for the next word.
         */
        if (!(range->flags & NF_NAT_RANGE_PERSISTENT))
            j ^= (__force u32)tuple->dst.u3.all[i];
    }
}

l4proto->unique_tuple

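l4proto->unique_tuple is implemented by nf_nat_l4proto_unique_tuple. Note that the callback is invoked with five arguments while nf_nat_l4proto_unique_tuple takes a sixth one, rover, so the per-protocol callback presumably wraps this function and supplies its own port rover.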

/*
 * Choose a port for the manipulated side of @tuple so that the tuple is
 * unique.
 *
 * Port range:
 *  - No range specified, destination NAT: the port is left unchanged.
 *  - No range specified, source NAT: the new port stays in the same class
 *    as the original one: ports below 512 map to 1-511, ports 512-1023 map
 *    to 600-1023, and ports 1024 and above map to 1024-65535.
 *  - Range specified (NF_NAT_RANGE_PROTO_SPECIFIED): the given range is
 *    used (min and max are swapped if given in reverse order).
 *
 * Starting offset:
 *  - NF_NAT_RANGE_PROTO_RANDOM: taken from l3proto->secure_port(), which is
 *    passed the tuple and the port on the side that is not rewritten.
 *  - NF_NAT_RANGE_PROTO_RANDOM_FULLY: taken from prandom_u32().
 *  - Otherwise: taken from *rover, which is updated with the offset that
 *    was finally used.
 *
 * The candidate port is min + (offset % range_size). While the resulting
 * tuple is in use the offset is incremented and the next port is tried,
 * for at most range_size candidates.
 */
void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto *l3proto,
                 struct nf_conntrack_tuple *tuple,
                 const struct nf_nat_range *range,
                 enum nf_nat_manip_type maniptype,
                 const struct nf_conn *ct,
                 u16 *rover)
{
    unsigned int range_size, min, max, i;
    __be16 *portptr;
    u_int16_t off;

    /* Point at the port that this NAT type rewrites. */
    if (maniptype == NF_NAT_MANIP_SRC)
        portptr = &tuple->src.u.all;
    else
        portptr = &tuple->dst.u.all;

    /* If no range specified... */
    if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
        /* If it's dst rewrite, can't change port */
        if (maniptype == NF_NAT_MANIP_DST)
            return;
        /* A source port below 1024 is mapped to another port below
         * 1024.
         */
        if (ntohs(*portptr) < 1024) {
            /* Loose convention: >> 512 is credential passing */
            /* Source port below 512: choose from 1-511. */
            if (ntohs(*portptr) < 512) {
                min = 1;
                range_size = 511 - min + 1;
            } else {
                /* Source port 512-1023: choose from 600-1023. */
                min = 600;
                range_size = 1023 - min + 1;
            }
        } else {
            /* Source port 1024 or above: choose from 1024-65535. */
            min = 1024;
            range_size = 65535 - 1024 + 1;
        }
    } else {
        /* A specific port range was given; tolerate min/max being
         * passed in reverse order.
         */
        min = ntohs(range->min_proto.all);
        max = ntohs(range->max_proto.all);
        if (unlikely(max < min))
            swap(max, min);
        range_size = max - min + 1;
    }

    /* Pick the starting offset into the port range. */
    if (range->flags & NF_NAT_RANGE_PROTO_RANDOM) {
        off = l3proto->secure_port(tuple, maniptype == NF_NAT_MANIP_SRC
                          ? tuple->dst.u.all
                          : tuple->src.u.all);
    } else if (range->flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY) {
        off = prandom_u32();
    } else {
        off = *rover;
    }

    for (i = 0; ; ++off) {
        *portptr = htons(min + off % range_size);
        /* If the tuple is already in use, try the next port. After
         * range_size candidates the loop stops anyway and the last
         * candidate is kept without being checked; per the note on
         * get_unique_tuple, a remaining duplicate is expected to be
         * caught when the conntrack is confirmed and the packet dropped.
         */
        if (++i != range_size && nf_nat_used_tuple(tuple, ct))
            continue;
        /* Without randomization, remember the offset that was used so
         * that a later call reading the same rover starts from it.
         */
        if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL))
            *rover = off;
        return;
    }
}

Posted by jmosterb on Wed, 09 Oct 2019 12:37:41 -0700