Nat 用途很廣,家里的寬帶就是這種模式,將局域網的私有地址轉換成公網地址。沒有 dr 二層的限制,但是 nat 也有缺點,需要配置路由或是指定為 real server 的網關,同時也會有性能擴展問題。

對于進入的流量,實際上做的是 dnat, 將目標 ip 由 lb ip 換成真正的 rs ip, 此時后端 rs 是能拿到 client ip 的。返回的流量做 snat, 將源地址換成 lb ip.
三層處理 ipv4_rcv
數據接收和上文都是一樣的,直接看 ipv4_rcv
INET_HOOK(INET_HOOK_PRE_ROUTING, mbuf, port, NULL, ipv4_rcv_fin);
INET_HOOK_PRE_ROUTING 注冊兩個函數,dp_vs_pre_routing 和 dp_vs_in,由于 nat 不做 syn_proxy, 所以直接看 dp_vs_in
/*
 * PRE_ROUTING hook entry of the load balancer (excerpt; "......" marks code
 * elided by the article).
 *
 * Looks up the L4 protocol handler, matches the packet to an existing
 * connection or asks conn_sched to create one, runs the protocol state
 * transition, then hands the packet to the transmit path for its direction.
 *
 * Returns INET_ACCEPT when no protocol handler is registered, INET_DROP when
 * conn_lookup sets the drop flag, the verdict filled in by conn_sched when
 * scheduling fails, otherwise the result of xmit_inbound()/xmit_outbound().
 */
static int dp_vs_in(void *priv, struct rte_mbuf *mbuf,
                    const struct inet_hook_state *state)
{
    struct dp_vs_iphdr iph;
    struct dp_vs_proto *prot;
    struct dp_vs_conn *conn;
    int dir, af, verdict, err, related;
    bool drop = false;
    eth_type_t etype;

    /* check the arguments before mbuf is dereferenced for the first time */
    assert(mbuf && state);
    etype = mbuf->packet_type; /* FIXME: use other field ? */
......
    prot = dp_vs_proto_lookup(iph.proto);
    if (unlikely(!prot))
        return INET_ACCEPT;

    /* packet belongs to existing connection ? */
    conn = prot->conn_lookup(prot, &iph, mbuf, &dir, false, &drop);
    if (unlikely(drop)) {
        RTE_LOG(DEBUG, IPVS, "%s: deny ip try to visit.\n", __func__);
        return INET_DROP;
    }

    /* no connection found: let conn_sched pick a real server and create one */
    if (unlikely(!conn)) {
        /* try schedule RS and create new connection */
        if (prot->conn_sched(prot, &iph, mbuf, &conn, &verdict) != EDPVS_OK) {
            /* RTE_LOG(DEBUG, IPVS, "%s: fail to schedule.\n", __func__); */
            return verdict;
        }

        /* only SNAT triggers connection by inside-outside traffic. */
        if (conn->dest->fwdmode == DPVS_FWD_MODE_SNAT)
            dir = DPVS_CONN_DIR_OUTBOUND;
        else
            dir = DPVS_CONN_DIR_INBOUND;
    }
......
    if (prot->state_trans) {
        err = prot->state_trans(prot, conn, mbuf, dir);
        /* a failed transition is only logged; the packet is still forwarded */
        if (err != EDPVS_OK)
            RTE_LOG(WARNING, IPVS, "%s: fail to trans state.\n", __func__);
    }
    conn->old_state = conn->state;

    /* holding the conn, need a "put" later. */
    if (dir == DPVS_CONN_DIR_INBOUND)
        return xmit_inbound(mbuf, prot, conn);
    else
        return xmit_outbound(mbuf, prot, conn);
}
忽略部分源碼,一共五步操作
- dp_vs_proto_lookup 獲取四層處理協議,以 tcp 為例
- conn_lookup 在流表中查找連接,有時叫 session 也可以
- conn_sched 如果不存在 conn,那麼一定是新來的請求,調度 real server 並建立連接
- state_trans 狀態轉移
- xmit_inbound 或是 xmit_outbound 根據不同方向的流量將數據寫回網卡
新請求綁定 nat 回調
上文介紹 dr 時,講到 conn_sched 會根據一定算法選擇后端 rs 建立連接。最重要的一步操作就是 conn_bind_dest
/* install the transmit callbacks that correspond to dest->fwdmode */
switch (dest->fwdmode) {
/* modes that only set the inbound transmit callback */
case DPVS_FWD_MODE_DR:
    conn->packet_xmit = dp_vs_xmit_dr;
    break;
case DPVS_FWD_MODE_TUNNEL:
    conn->packet_xmit = dp_vs_xmit_tunnel;
    break;

/* modes that set both the inbound and the outbound transmit callback */
case DPVS_FWD_MODE_NAT:
    conn->packet_xmit     = dp_vs_xmit_nat;
    conn->packet_out_xmit = dp_vs_out_xmit_nat;
    break;
case DPVS_FWD_MODE_FNAT:
    conn->packet_xmit     = dp_vs_xmit_fnat;
    conn->packet_out_xmit = dp_vs_out_xmit_fnat;
    break;
case DPVS_FWD_MODE_SNAT:
    conn->packet_xmit     = dp_vs_xmit_snat;
    conn->packet_out_xmit = dp_vs_out_xmit_snat;
    break;

/* any other forwarding mode is rejected */
default:
    return EDPVS_NOTSUPP;
}
可以看到當前 dpvs 支持 nat, tunnel, dr, fullnat, snat.
進入流量處理 dp_vs_xmit_nat
int dp_vs_xmit_nat(struct dp_vs_proto *proto,
struct dp_vs_conn *conn,
struct rte_mbuf *mbuf)
{
struct flow4 fl4;
struct ipv4_hdr *iph = ip4_hdr(mbuf);
struct route_entry *rt;
int err, mtu;
if (!fast_xmit_close && !(conn->flags & DPVS_CONN_F_NOFASTXMIT)) {
dp_vs_save_xmit_info(mbuf, proto, conn);
if (!dp_vs_fast_xmit_nat(proto, conn, mbuf)) {
return EDPVS_OK;
}
}
/*
* drop old route. just for safe, because
* NAT is PREROUTING, should not have route.
*/
if (unlikely(mbuf->userdata != NULL)) {
RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\n",
__func__, mbuf->userdata);
route4_put((struct route_entry*)mbuf->userdata);
}
memset(&fl4, 0, sizeof(struct flow4));
fl4.daddr = conn->daddr.in;
fl4.saddr = conn->caddr.in;
fl4.tos = iph->type_of_service;
rt = route4_output(&fl4);
if (!rt) {
err = EDPVS_NOROUTE;
goto errout;
}
這里最重要的就是 route4_output 查找路由
dp_vs_conn_cache_rt(conn, rt, true);
mtu = rt->mtu;
if (mbuf->pkt_len > mtu
&& (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) {
RTE_LOG(DEBUG, IPVS, "%s: frag needed.\n", __func__);
err = EDPVS_FRAG;
goto errout;
}
mbuf->userdata = rt;
將查到的路由賦給 mbuf (保存在 mbuf->userdata 中)
/* after route lookup and before translation */
if (xmit_ttl) {
if (unlikely(iph->time_to_live <= 1)) {
icmp_send(mbuf, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
err = EDPVS_DROP;
goto errout;
}
iph->time_to_live--;
}
/* L3 translation before l4 re-csum */
iph->hdr_checksum = 0;
iph->dst_addr = conn->daddr.in.s_addr;
注意這里 iph->dst_addr = conn->daddr.in.s_addr 將目標地址換成了后端 rs 地址
/*
 * L4 NAT translation.
 * The guard must test the same callback that is invoked below: checking
 * fnat_in_handler while calling nat_in_handler can call a NULL pointer, or
 * skip the translation when only nat_in_handler is set.
 */
if (proto->nat_in_handler) {
    err = proto->nat_in_handler(proto, conn, mbuf);
    if (err != EDPVS_OK)
        goto errout;
}
L4 nat 處理,由於是 tcp 協議,查看 dp_vs_proto_tcp 變量得知這裡會調用 tcp_snat_in_handler
if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) {
iph->hdr_checksum = 0;
} else {
ip4_send_csum(iph);
}
return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output);
errout:
if (rt)
route4_put(rt);
rte_pktmbuf_free(mbuf);
return err;
}
接著執行 INET_HOOK_LOCAL_OUT 鏈上註冊的回調;查看源碼,這條鏈上沒有註冊任何回調,所以最後直接調用 ipv4_output
進入流量處理 tcp_snat_in_handler
static int tcp_snat_in_handler(struct dp_vs_proto *proto,
struct dp_vs_conn *conn, struct rte_mbuf *mbuf)
{
struct tcphdr *th;
int ip4hlen = ip4_hdrlen(mbuf);
struct netif_port *dev = NULL;
struct route_entry *rt = mbuf->userdata;
if (mbuf_may_pull(mbuf, ip4hlen + sizeof(*th)) != 0)
return EDPVS_INVPKT;
th = tcp_hdr(mbuf);
if (unlikely(!th))
return EDPVS_INVPKT;
if (mbuf_may_pull(mbuf, ip4hlen + (th->doff<<2)) != 0)
return EDPVS_INVPKT;
/* L4 translation */
th->dest = conn->dport;
注意這里 th->dest = conn->dport 將目標端口換成了 rs port
/* L4 re-checksum */
if (rt && rt->port)
dev = rt->port;
/* leverage HW TX TCP csum offload if possible */
if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) {
mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - ip4hlen;
mbuf->l3_len = ip4hlen;
mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4);
th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags);
} else {
if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)
return EDPVS_INVPKT;
tcp4_send_csum(ip4_hdr(mbuf), th);
}
return EDPVS_OK;
}
因為修改了數據包內容,所以 checksum 也要重新計算
進入流量處理 ipv4_output
/*
 * Output step for an IPv4 packet that already carries its route.
 *
 * Takes the route from mbuf->userdata (asserted to be non-NULL), passes the
 * packet length to IP4_UPD_PO_STATS, then runs the INET_HOOK_POST_ROUTING
 * chain on the route's port with ipv4_output_fin as the final callback.
 */
int ipv4_output(struct rte_mbuf *mbuf)
{
    struct route_entry *rt = mbuf->userdata;
    struct netif_port *out_port;

    assert(rt);
    out_port = rt->port;

    IP4_UPD_PO_STATS(out, mbuf->pkt_len);

    return INET_HOOK(INET_HOOK_POST_ROUTING, mbuf, NULL, out_port,
                     ipv4_output_fin);
}
查看源碼并沒有 INET_HOOK_POST_ROUTING 回調,所以直接調用 ipv4_output_fin
/*
 * Decide whether the packet must be fragmented before it is sent.
 *
 * Packets no longer than the MTU of the route in mbuf->userdata go straight
 * to ipv4_output_fin2(); longer ones are handed to ipv4_fragment() together
 * with that MTU and ipv4_output_fin2 as the callback.
 */
static int ipv4_output_fin(struct rte_mbuf *mbuf)
{
    struct route_entry *route = mbuf->userdata;

    /* common case: the packet fits, no fragmentation needed */
    if (mbuf->pkt_len <= route->mtu)
        return ipv4_output_fin2(mbuf);

    return ipv4_fragment(mbuf, route->mtu, ipv4_output_fin2);
}
如果包長度大於 mtu,那麼要分片發送;正常情況走 ipv4_output_fin2 邏輯,最後調用 neigh_resolve_output 發送數據到網卡。
返回流量處理 dp_vs_out_xmit_nat
int dp_vs_out_xmit_nat(struct dp_vs_proto *proto,
struct dp_vs_conn *conn,
struct rte_mbuf *mbuf)
{
struct flow4 fl4;
struct ipv4_hdr *iph = ip4_hdr(mbuf);
struct route_entry *rt;
int err, mtu;
...
/* L3 translation before l4 re-csum */
iph->hdr_checksum = 0;
iph->src_addr = conn->vaddr.in.s_addr;
這里省略部份代碼,最重要的就是 iph->src_addr = conn->vaddr.in.s_addr 設置源地址為 lb ip.
/*
 * L4 NAT translation for the return path.
 * The guard must test the same callback that is invoked below: checking
 * fnat_in_handler while calling nat_out_handler can call a NULL pointer, or
 * skip the translation when only nat_out_handler is set.
 */
if (proto->nat_out_handler) {
    err = proto->nat_out_handler(proto, conn, mbuf);
    if (err != EDPVS_OK)
        goto errout;
}
if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) {
iph->hdr_checksum = 0;
} else {
ip4_send_csum(iph);
}
return INET_HOOK(INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output);
}
調用 nat_out_handler 處理數據,查看源碼回調 tcp_snat_out_handler 函數
static int tcp_snat_out_handler(struct dp_vs_proto *proto,
struct dp_vs_conn *conn, struct rte_mbuf *mbuf)
{
...
/* L4 translation */
th->source = conn->vport;
/* L4 re-checksum */
if (rt && rt->port)
dev = rt->port;
/* leverage HW TX TCP csum offload if possible */
if (likely(dev && (dev->flag & NETIF_PORT_FLAG_TX_TCP_CSUM_OFFLOAD))) {
mbuf->l4_len = ntohs(ip4_hdr(mbuf)->total_length) - ip4hlen;
mbuf->l3_len = ip4hlen;
mbuf->ol_flags |= (PKT_TX_TCP_CKSUM | PKT_TX_IP_CKSUM | PKT_TX_IPV4);
th->check = rte_ipv4_phdr_cksum(ip4_hdr(mbuf), mbuf->ol_flags);
} else {
if (mbuf_may_pull(mbuf, mbuf->pkt_len) != 0)
return EDPVS_INVPKT;
tcp4_send_csum(ip4_hdr(mbuf), th);
}
return EDPVS_OK;
}
省略部份源碼,這里最重要的就是 th->source = conn->vport 設置源端口為 lb port
在 nat 的最後也是調用 ipv4_output 將數據寫回網卡,完成返回流量的轉發。
小結
由於有上一篇的存在,所以本文代碼較少,可以看到 nat 實現還是很簡潔明了的。