Linux内核网络的连接跟踪conntrack简单分析

发布于:2025-09-08 ⋅ 阅读:(19) ⋅ 点赞:(0)

连接跟踪conntrack的基本信息

Linux内核的conntrack模块是网络过滤子系统netfilter的重要组成部分,它是网络地址转换NAT和防火墙等网络功能的基础。Linux内核中一个连接(可以为UDP、TCP,或其他)的建立是一个冗长耗时的过程,例如,该连接需经过内核过滤规则(对应防火墙的规则)或端口转发等规则的确认,最终成功建立。当连接建立后,为了让后续数据量庞大、数量众多的网络包快速检测通过(从而降低Linux内核网络的负载),跟踪连接是十分必要的。为了跟踪一个已存在的网络连接,Linux内核(版本为6.6.67)使用了以下结构体作为一个连接的指纹:

/* include/net/netfilter/nf_conntrack_tuple.h */
/* This contains the information to distinguish a connection. */
struct nf_conntrack_tuple {
    struct nf_conntrack_man src;

    /* These are the parts of the tuple which are fixed. */
    struct {
        union nf_inet_addr u3; 
        union {
            /* Add other protocols here. */
            __be16 all;

            struct {
                __be16 port;
            } tcp;
            struct {
                __be16 port;
            } udp;
......
}

可以看到,它包含了一个连接的重要信息:源和目标IP地址、源和目标端口号等。对于NAT,它还包含转换的IP地址和端口号等。该nf_conntrack_tuple结构体在内核函数nf_ct_get_tuple中被填充:

/* net/netfilter/nf_conntrack_core.c */
static bool 
nf_ct_get_tuple(const struct sk_buff *skb,
        unsigned int nhoff,
        unsigned int dataoff,
        u_int16_t l3num,
        u_int8_t protonum,
        struct net *net,
        struct nf_conntrack_tuple *tuple)
{
    unsigned int size;
    const __be32 *ap; 
    __be32 _addrs[8];

    memset(tuple, 0, sizeof(*tuple));

之后通过 __nf_conntrack_find_get函数将该结构体映射到struct nf_conn指针;可以把这一过程简化成把nf_conntrack_tuple结构体作为一个哈希表的键值,查找得到struct nf_conn指针:

/* net/netfilter/nf_conntrack_core.c */
/* Find a connection corresponding to a tuple. */
static struct nf_conntrack_tuple_hash *
__nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone,
            const struct nf_conntrack_tuple *tuple, u32 hash)
{
    struct nf_conntrack_tuple_hash *h;
    struct nf_conn *ct; 

    h = ____nf_conntrack_find(net, zone, tuple, hash);
    ......
    ct = nf_ct_tuplehash_to_ctrack(h);

最后,结构体struct nf_conn包含了一个已建立的(严格地说,也包含待建立的)连接的状态信息:

/* include/net/netfilter/nf_conntrack.h */
struct nf_conn {
    struct nf_conntrack ct_general;

    spinlock_t  lock;
    /* jiffies32 when this ct is considered dead */
    u32 timeout;

#ifdef CONFIG_NF_CONNTRACK_ZONES
    struct nf_conntrack_zone zone;
#endif
    /* XXX should I move this to the tail ? - Y.K */
    /* These are my tuples; original and reply */
    struct nf_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX];

    /* Have we seen traffic both ways yet? (bitset) */
    unsigned long status;

这里我们重点关注timeout/status;其中timeout以jiffies为单位,表示该连接跟踪失效的时间;status则提供了该连接的状态比特标志位等信息。

UDP的连接状态跟踪

笔者为了加深对conntrack的了解,修改了busybox的代码,在其中增加了绑定本地端口的功能:

diff --git a/libbb/xconnect.c b/libbb/xconnect.c
index 0e0b247..6456c65 100644
--- a/libbb/xconnect.c
+++ b/libbb/xconnect.c
@@ -369,6 +369,25 @@ int FAST_FUNC xsocket_type(len_and_sockaddr **lsap, int family, int sock_type)
        lsa = xzalloc(LSA_LEN_SIZE + len);
        lsa->len = len;
        lsa->u.sa.sa_family = family;
+
+       /* bind to local port number for IPv4/IPv6 */
+       if (family == AF_INET || family == AF_INET6) {
+               int pno = -1;
+               const char * lport = getenv("BB_PORTNO");
+               if (lport && lport[0])
+                       pno = (int) strtol(lport, NULL, 0);
+               if (pno > 0 && pno < 65536) {
+                       if (family == AF_INET) {
+                               struct sockaddr_in * addr;
+                               addr = (struct sockaddr_in *) &(lsa->u.sa);
+                               addr->sin_port = htons((unsigned short) pno);
+                       } else {
+                               struct sockaddr_in6 * addr;
+                               addr = (struct sockaddr_in6 *) &(lsa->u.sa);
+                               addr->sin6_port = htons((unsigned short) pno);
+                       }
+               }
+       }
        *lsap = lsa;
        return fd;
 }

这样,通过配置环境变量BB_PORTNO固定为4321,可以强制nslookup命令行工具多次调用时,使用同一端口:

root@localhost:~# export BB_PORTNO=4321
root@localhost:~# nslookup www.baidu.com 223.5.5.5
Server:		223.5.5.5
Address:	223.5.5.5:53

Non-authoritative answer:
www.baidu.com	canonical name = www.a.shifen.com
Name:	www.a.shifen.com
Address: 223.109.82.16
Name:	www.a.shifen.com
Address: 223.109.82.212

以上命令在PC侧执行;此时,在路由器设备上(笔者使用了树莓派做软路由),可以通过conntrack命令行工具查看已建立的UDP连接信息:

root@OpenWrt:~# conntrack -L | grep -e udp 
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp      17 49 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=1 bytes=90 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=2 bytes=266 mark=0 use=1
root@OpenWrt:~# conntrack -L | grep -e udp 
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp      17 46 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=1 bytes=90 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=2 bytes=266 mark=0 use=1
root@OpenWrt:~# conntrack -L | grep -e udp 
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp      17 176 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=2 bytes=180 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=4 bytes=532 [ASSURED] mark=0 use=1
root@OpenWrt:~# conntrack -L | grep -e udp 
conntrack v1.4.8 (conntrack-tools): 13 flow entries have been shown.
udp      17 174 src=192.167.7.169 dst=223.5.5.5 sport=4321 dport=53 packets=3 bytes=270 src=223.5.5.5 dst=192.168.1.3 sport=53 dport=4321 packets=6 bytes=798 [ASSURED] mark=0 use=1

其中,17为网络协议编号,对应UDP;之后的数值单位为秒,即该连接跟踪在多少秒后超时。超时后,DNS服务器223.5.5.5的回应不会被软路由NAT转发。注意到,一开始该UDP连接的超时时间分别为49秒和46秒;但之后变成了176秒,这是笔者在PC上多次执行nslookup www.baidu.com 223.5.5.5命令的结果;简单地说,当一个UDP有了初次的回应后,它的超时时间会变成60秒;当有多次回应后,超时时间会变成180秒。这一变化过程下面有相关说明。

UDP连接的状态的内核调试

笔者编写了一个简单的bpftrace脚本,用于跟踪UDP的连接状态信息:

#!/usr/bin/bpftrace

// Trace the lifecycle of UDP conntrack entries: allocation of the
// struct nf_conn, per-packet state updates, and final release.

#include <net/netfilter/nf_conntrack.h>

// Fires on every tracked UDP packet. arg0 is the struct nf_conn being
// updated, so we can watch its status bits and timeout (in jiffies)
// evolve across packets of the same flow.
kprobe:nf_conntrack_udp_packet {
	$c = (struct nf_conn *) arg0;
	printf("%8d.%06d: PID: %d, comm: %s, nf_conntrack_udp_packet(0x%lx, 0x%lx, 0x%lx), status: 0x%x, timeout: %u",
		elapsed / 1000000, elapsed % 1000000, pid, comm, arg0, arg1, arg2, $c->status, $c->timeout);
	print(kstack);
}

// Fires when a new conntrack entry is allocated; retval is the fresh
// struct nf_conn pointer (it matches arg0 printed above for the flow).
kretprobe:__nf_conntrack_alloc {
	printf("%8d.%06d: PID: %d, comm: %s, __nf_conntrack_alloc has returned: 0x%lx",
		elapsed / 1000000, elapsed % 1000000, pid, comm, retval);
	print(kstack);
}

// Fires when a conntrack entry is freed (e.g. by the gc worker after
// timeout, or via a ctnetlink request); arg0 is the entry released.
kprobe:nf_conntrack_free {
	printf("%8d.%06d: PID: %d, comm: %s, nf_conntrack_free(0x%lx)",
		elapsed / 1000000, elapsed % 1000000, pid, comm, arg0);
	print(kstack);
}

使用该脚本对这一过程进行调试,得到的结果如下(调试结果有精简):

    4675.050648: PID: 0, comm: swapper/0, __nf_conntrack_alloc has returned: 0xffffff8006469200
        init_conntrack.isra.0+976
        nf_conntrack_in+912
        ipv4_conntrack_in+24
        nf_hook_slow+72
        br_nf_pre_routing+444
        br_handle_frame+404
        __netif_receive_skb_core.constprop.0+500
        __netif_receive_skb_one_core+44
        process_backlog+168
        __napi_poll.constprop.0+56
        net_rx_action+344
        handle_softirqs+352
        __softirqentry_text_start+20
        ____do_softirq+16

    4675.180752: PID: 0, comm: swapper/0, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff80050e0e00, 0x14), status: 0x0, timeout: 0
        nf_conntrack_udp_packet+0
        ipv4_conntrack_in+24
        nf_hook_slow+72
        br_nf_pre_routing+444
        br_handle_frame+404
        __netif_receive_skb_core.constprop.0+500
        __netif_receive_skb_one_core+44
        process_backlog+168
        __napi_poll.constprop.0+56
        net_rx_action+344
        handle_softirqs+352

    4681.793946: PID: 2774, comm: bpftrace, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff8001681b00, 0x14), status: 0x198, timeout: 372070
        nf_conntrack_udp_packet+0
        ipv4_conntrack_in+24
        nf_hook_slow+72
        ip_rcv+92
        __netif_receive_skb_one_core+72
        process_backlog+168
        __napi_poll.constprop.0+56
        net_rx_action+344
        handle_softirqs+352
        
    4682.189260: PID: 2774, comm: bpftrace, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff8001681600, 0x14), status: 0x19a, timeout: 372071
        nf_conntrack_udp_packet+0
        ipv4_conntrack_in+24
        nf_hook_slow+72
        ip_rcv+92
        __netif_receive_skb_one_core+72
        process_backlog+168
        __napi_poll.constprop.0+56
        net_rx_action+344
        handle_softirqs+352

当一个连接生成时,会调用__nf_conntrack_alloc函数分配连接跟踪结构体nf_conn。函数nf_conntrack_udp_packet用于检查并更新一个UDP连接的跟踪信息;第一次调用时,可以看到nf_conn中的status和timeout都为0,此时会默认使用以下代码更新跟踪信息:

nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);

注意,timeouts[UDP_CT_UNREPLIED]默认值为 30*HZ,但openwrt系统将之配置为60秒:

root@OpenWrt:~# cat /proc/sys/net/netfilter/nf_conntrack_udp_timeout
60
root@OpenWrt:~# cat /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream 
180

针对笔者使用的树莓派设备,使能了内核选项CONFIG_HZ_100=y,那么HZ值为100;上面的调试结果:status: 0x198, timeout: 372070,表明该UDP连接跟踪会在系统启动的第3720.7秒后超时失效。可以确定,该UDP连接是树莓派设备启动的第3720.7 - 60秒,即第3660.7秒时从PC机上收到的(此时树莓派启动了约一小时)。下面会有数据的变化与此印证。此时,该UDP连接的状态位为0x198,对应着:

	-----------------------------------------------
	Value [0x198] (0x198, 408):
	    28    24    20    16    12    8     4     0
	 0000  0000  0000  0000  0000  0001  1001  1000  
	31    27    23    19    15    11     7     3

	/* Connection is confirmed: originating packet has left box */
    IPS_CONFIRMED_BIT = 3,
    IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT),

    /* Connection needs src nat in orig dir.  This bit never changed. */
    IPS_SRC_NAT_BIT = 4,
    IPS_SRC_NAT = (1 << IPS_SRC_NAT_BIT)

上面调用了nf_conntrack_udp_packet函数两次,分别对应与DNS服务器223.5.5.5之间的一发一收,连接已确认,第3位比特会置1。

下面笔者再次(即第二次)在PC机上执行了nslookup www.baidu.com 223.5.5.5,但连接跟踪信息的超时时间没有变化,仍是系统启动的第3720.7秒。此时,状态位由之前的0x198变为0x19a,即第1位置1(其实是第二次调用nf_conntrack_udp_packet函数返回后的状态值):

    /* We've seen packets both ways: bit 1 set.  Can be set, not unset. */
    IPS_SEEN_REPLY_BIT = 1,
    IPS_SEEN_REPLY = (1 << IPS_SEEN_REPLY_BIT),

第二次执行nslookup的内核调试结果如下:

   21181.259131: PID: 0, comm: swapper/0, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff8001681e00, 0x14), status: 0x19a, timeout: 372071
        nf_conntrack_udp_packet+0
        ipv4_conntrack_in+24
        nf_hook_slow+72
        br_nf_pre_routing+444
        br_handle_frame+404
        __netif_receive_skb_core.constprop.0+500
        __netif_receive_skb_one_core+44
        process_backlog+168
        __napi_poll.constprop.0+56
        net_rx_action+344
        handle_softirqs+352

   21187.751231: PID: 308, comm: kworker/u13:1, nf_conntrack_udp_packet(0xffffff8006469200, 0xffffff80065e6700, 0x14), status: 0x19e, timeout: 385721
        nf_conntrack_udp_packet+0
        ipv4_conntrack_in+24
        nf_hook_slow+72
        ip_rcv+92
        __netif_receive_skb_one_core+72
        process_backlog+168
        __napi_poll.constprop.0+56
        net_rx_action+344
        handle_softirqs+352

除了status中的比特位2置1外:

    /* Conntrack should never be early-expired. */
    IPS_ASSURED_BIT = 2,
    IPS_ASSURED = (1 << IPS_ASSURED_BIT),

该UDP跟踪信息的超时时间由原来的372071变成了385721,二者相差了13650 jiffies,对应着136.5秒;也就是说,内核把这个UDP连接跟踪失效的时间在原来的基础上又推迟了136.5秒,这个超时时间接近nf_conntrack_udp_timeout_stream中指定的180秒超时时间:

cat /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream
180

最后,UDP连接跟踪的状态更新函数内容如下:

/* net/netfilter/nf_conntrack_proto_udp.c */
/*
 * Quoted verbatim from Linux 6.6: the per-packet conntrack state update
 * for UDP. It refreshes the entry's timeout on every packet and, once
 * replies have been seen and the flow is still active after a 2-second
 * grace window, promotes it to a "stream" with the longer REPLIED
 * timeout and the ASSURED status bit.
 */
int nf_conntrack_udp_packet(struct nf_conn *ct,
                struct sk_buff *skb,
                unsigned int dataoff,
                enum ip_conntrack_info ctinfo,
                const struct nf_hook_state *state)
{
    unsigned int *timeouts;
    unsigned long status;

    /* Malformed UDP header/checksum: reject without tracking. */
    if (udp_error(skb, dataoff, state))
        return -NF_ACCEPT;

    /* Per-zone/ct timeout policy if configured, else the sysctl
     * defaults (nf_conntrack_udp_timeout{,_stream}).
     */
    timeouts = nf_ct_timeout_lookup(ct);
    if (!timeouts)
        timeouts = udp_get_timeouts(nf_ct_net(ct));

    status = READ_ONCE(ct->status);
    /* First packet (entry not yet confirmed): open the 2-second
     * window used below to decide whether this becomes a stream.
     */
    if ((status & IPS_CONFIRMED) == 0)
        ct->proto.udp.stream_ts = 2 * HZ + jiffies;

    /* If we've seen traffic both ways, this is some kind of UDP
     * stream. Set Assured.
     */
    if (status & IPS_SEEN_REPLY) {
        unsigned long extra = timeouts[UDP_CT_UNREPLIED];
        bool stream = false;

        /* Still active after two seconds? Extend timeout. */
        if (time_after(jiffies, ct->proto.udp.stream_ts)) {
            extra = timeouts[UDP_CT_REPLIED];
            stream = (status & IPS_ASSURED) == 0;
        }   

        nf_ct_refresh_acct(ct, ctinfo, skb, extra);

        /* never set ASSURED for IPS_NAT_CLASH, they time out soon */
        if (unlikely((status & IPS_NAT_CLASH)))
            return NF_ACCEPT;

        /* Also, more likely to be important, and not a probe */
        if (stream && !test_and_set_bit(IPS_ASSURED_BIT, &ct->status))
            nf_conntrack_event_cache(IPCT_ASSURED, ct);
    } else {
        /* No reply seen yet: keep the short UNREPLIED timeout. */
        nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[UDP_CT_UNREPLIED]);
    }
    return NF_ACCEPT;
}

以上代码中的UDP_CT_REPLIED即对应内核配置/proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream 的值(UDP_CT_UNREPLIED对应nf_conntrack_udp_timeout)。注意到,把超时时间更新到timeouts[UDP_CT_REPLIED]是有条件的,其条件就是间隔两秒之后仍有数据活动(从而该连接被视为持续活动的连接,即udp_stream)。

上面提到,连接跟踪结构体struct nf_conn包含了一些NAT的信息,这个信息是网络地址转换需要的;例如上面的status字段中第4位比特位置1(对应IPS_SRC_NAT),则以下代码会执行:

/* net/netfilter/nf_conntrack_core.c */
static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, ...) {
    if (ct->status & IPS_SRC_NAT) {
        memcpy(tuple.src.u3.all,
               ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all,
               sizeof(tuple.src.u3.all));
        tuple.src.u.all =
            ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all;
    }
    ......
    if (status & IPS_SRC_NAT &&
        nat_hook->manip_pkt(skb, ct, NF_NAT_MANIP_SRC,
                IP_CT_DIR_ORIGINAL) == NF_DROP)
        return -1;
}

上面根据标志位IPS_SRC_NAT更新了tuple中的UDP源地址。之后调用了manip_pkt来进一步处理。下面笔者找到了函数__udp_manip_pkt对应的汇编代码,编写了另一个bpftrace脚本,查看对网络数据包的UDP端口的修改:

#!/usr/bin/bpftrace

/*
net/netfilter/nf_nat_proto.c
static void __udp_manip_pkt(struct sk_buff *skb,
            unsigned int iphdroff, struct udphdr *hdr,
            const struct nf_conntrack_tuple *tuple,
            enum nf_nat_manip_type maniptype, bool do_csum)
{
    __be16 *portptr, newport;
	......
    *portptr = newport;  // => 0xffffffc080810410 <l4proto_manip_pkt+428>:	strh	w25, [x24]
}

Dump of assembler code from 0xffffffc080810400 to 0xffffffc080810420:
   0xffffffc080810400 <l4proto_manip_pkt+412>:	cbnz	w21, 0xffffffc08081063c <l4proto_manip_pkt+984>
   0xffffffc080810404 <l4proto_manip_pkt+416>:	ldrh	w25, [x22, #16]
   0xffffffc080810408 <l4proto_manip_pkt+420>:	mov	x24, x19
   0xffffffc08081040c <l4proto_manip_pkt+424>:	cbnz	w0, 0xffffffc080810398 <l4proto_manip_pkt+308>
   0xffffffc080810410 <l4proto_manip_pkt+428>:	strh	w25, [x24]
 */
// Probe the exact store instruction (offset 0x1ac = 428, the strh in the
// disassembly above) that writes the rewritten UDP port during NAT.
// NOTE(review): the offset is specific to this arm64 kernel image and
// must be re-derived from the disassembly for any other build.
kprobe:l4proto_manip_pkt+0x1ac {
	// Per the disassembly: w25 holds the new port, x24 points at the
	// port field inside the packet's UDP header.
	$n = reg("r25");
	$r = (uint16 *) reg("r24");
	$o = *kptr($r);
	// Ports are big-endian on the wire; swap both to host order.
	$o = ($o >> 8) | (($o << 8) & 0x00FF00);
	$n = ($n >> 8) | (($n << 8) & 0x00FF00);
	printf("PID: %d, comm: %s, UDP/NAT replacing port from %d to %d",
		pid, comm, $o, $n);
	print(kstack);
}

上面脚本的调试结果只有把路由端口修改为4321端口的操作,却没有把4321端口替代成路由端口的操作,需要进一步探究:

PID: 0, comm: swapper/1, UDP/NAT replacing port from 57616 to 4321
        l4proto_manip_pkt+428
        nf_nat_ipv4_manip_pkt+116
        nf_nat_manip_pkt+192
        nf_nat_inet_fn+460
        nf_nat_ipv4_pre_routing+84
        nf_hook_slow+72
        ip_rcv+92

跟踪连接的超时失效

结构体struct nf_conn保存了已建立连接的基本信息;当一个nf_conn失效时,Linux内核会丢弃该数据包(必要时返回TCP/RST或icmp/unreachable),因为不知道如何对该数据包进行NAT转发。连接跟踪超时的判断,目前的调试观察到有两种方式,分别是内核工作线程周期性检测,和应用层的netlink访问(例如上面的conntrack命令行工具)。当一个连接跟踪失效时,会调用nf_conntrack_free释放内存:

  146678.923239: PID: 36, comm: kworker/u12:0, nf_conntrack_free(0xffffff8006469900)
        nf_conntrack_free+0
        nf_ct_gc_expired.part.0+152
        nf_ct_gc_expired+96
        gc_worker+592
        process_one_work+408
        worker_thread+768
        kthread+220
        ret_from_fork+16
 
   235795.008082: PID: 2817, comm: conntrack, nf_conntrack_free(0xffffff8006469200)
        nf_conntrack_free+0
        ctnetlink_dump_table+1024
        netlink_dump+300
        __netlink_dump_start+364
        ctnetlink_get_conntrack+484
        nfnetlink_rcv_msg+560
        netlink_rcv_skb+96
        nfnetlink_rcv+108
        netlink_unicast+488
        netlink_sendmsg+412
        __sys_sendto+224
        __arm64_sys_sendto+40
        invoke_syscall.constprop.0+92
        do_el0_svc+64
        el0_svc+48
        el0t_64_sync_handler+288
        el0t_64_sync+376

至此,我们就对Linux内核的连接跟踪conntrack有了初步的了解;这一块比较复杂,值得深入探究。


网站公告

今日签到

点亮在社区的每一天
去签到