哥哥的CSDN集 2011-03-31
先来回顾一个潜在的结构,它在 skb 的 pskb_copy、skb_copy 等操作中起到了关键作用:
unsignedshortnr_frags;
unsignedshortgso_size;
/*Warning:thisfieldisnotalwaysfilledin(UFO)!*/
unsignedshortgso_segs;
unsignedshortgso_type;
__be32ip6_frag_id;
__u8tx_flags;
structsk_buff*frag_list;
structskb_shared_hwtstampshwtstamps;
atomic_tdataref;
void*destructor_arg;
skb_frag_tfrags[MAX_SKB_FRAGS];
};想要顺利的访问这个结构就要了解一个宏
/* Yield SKB's struct skb_shared_info: the SKB->end pointer cast to that type. */
#define skb_shinfo(SKB) ((struct skb_shared_info *)((SKB)->end))
再回忆一下头部结构
#ifdefined(__LITTLE_ENDIAN_BITFIELD)
__u8ihl:4,
version:4;
#elifdefined(__BIG_ENDIAN_BITFIELD)
__u8version:4,
ihl:4;
#else
#error"Pleasefix<asm/byteorder.h>"
#endif
__u8tos;
__be16tot_len;
__be16id;
__be16frag_off;
__u8ttl;
__u8protocol;
__sum16check;
__be32saddr;
__be32daddr;
/*Theoptionsstarthere.*/
};好了现在就看看, 上前天说到的 ip_rcv 如何Ip处理包
/*
 * ip_rcv - first-stage validation of a received IPv4 packet.
 *
 * Checks the IP header (minimum size, version, header checksum, total
 * length), trims the buffer to the length stated in the header, clears the
 * IP control block and finally passes the packet to the
 * NF_INET_PRE_ROUTING hook together with ip_rcv_finish.
 *
 * @skb:      received buffer; released with kfree_skb() on the drop paths
 * @dev:      receiving device, used for the per-namespace counters and
 *            passed on to the netfilter hook
 * @pt:       not used in this function
 * @orig_dev: not used in this function
 *
 * Returns the NF_HOOK() result, or NET_RX_DROP when the packet is discarded.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	/*
	 * NOTE(review): the original article remarks that earlier stages
	 * (__netif_receive_skb / eth_type_trans) already prepared the skb
	 * for L3; that is not visible from this function.
	 */
	struct iphdr *hdr;
	u32 total_len;

	/*
	 * Packets addressed to some other host are dropped right away
	 * (presumably they only got this far because the interface is in
	 * promiscuous mode -- confirm).
	 */
	if (skb->pkt_type == PACKET_OTHERHOST)
		goto drop;

	/* Statistics: account this packet under IPSTATS_MIB_IN. */
	IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);

	/*
	 * skb_share_check() may hand back a different skb (presumably a
	 * private copy when the buffer is shared -- confirm). On NULL there
	 * is nothing left to free, so jump past kfree_skb().
	 */
	skb = skb_share_check(skb, GFP_ATOMIC);
	if (!skb) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
		goto out;
	}

	/* A basic IP header must be accessible before any field is read. */
	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;

	/* Fetch the L3 header pointer only after the pull succeeded. */
	hdr = ip_hdr(skb);

	/*
	 * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
	 *
	 * Is the datagram acceptable?
	 *
	 * 1. Length at least the size of an ip header
	 * 2. Version of 4
	 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 * 4. Doesn't have a bogus length
	 */

	/* Header length (in 32-bit words) and version check. */
	if (hdr->ihl < 5 || hdr->version != 4)
		goto inhdr_error;

	/* Same pull as before, now for the full header including options. */
	if (!pskb_may_pull(skb, hdr->ihl*4))
		goto inhdr_error;

	/* Re-read the header pointer after the second pull. */
	hdr = ip_hdr(skb);

	/* Header checksum: a non-zero result is treated as a header error. */
	if (unlikely(ip_fast_csum((u8 *)hdr, hdr->ihl)))
		goto inhdr_error;

	/*
	 * Length sanity. The buffer may be longer than the length announced
	 * in the header (e.g. link-layer padding) but never shorter, and the
	 * announced total length cannot be smaller than the header itself.
	 */
	total_len = ntohs(hdr->tot_len);
	if (skb->len < total_len) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	}
	if (total_len < (hdr->ihl*4))
		goto inhdr_error;

	/* Our transport medium may have padded the buffer out. Now we know it
	 * is IP we can trim to the true length of the frame.
	 * Note this now means skb->len holds ntohs(iph->tot_len).
	 */
	if (pskb_trim_rcsum(skb, total_len)) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Remove any debris in the socket control block */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

	/* Must drop socket now because of tproxy. */
	skb_orphan(skb);

	/*
	 * Run the NF_INET_PRE_ROUTING netfilter hook; ip_rcv_finish is the
	 * function handed to it as the next step of the normal path.
	 */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
		       ip_rcv_finish);

inhdr_error:
	/* Record the header error in the MIB counters, then drop. */
	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
	kfree_skb(skb);
out:
	return NET_RX_DROP;
}
接下来就是看看 ip_rcv_finish
/*
 * ip_rcv_finish - second stage of IPv4 input, run after ip_rcv's checks.
 *
 * Makes sure the skb has a route attached (looking one up if necessary),
 * updates the optional per-class routing accounting, processes IP options
 * when the header carries any, bumps multicast/broadcast input counters
 * and finally hands the packet to dst_input().
 *
 * @skb: packet being received; freed with kfree_skb() on the drop path
 *
 * Returns the result of dst_input(), or NET_RX_DROP if the packet is dropped.
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct rtable *rt;

	/*
	 * No dst attached to the skb yet: ask the routing code for an input
	 * route. On failure, bump the counter matching the error code and
	 * drop the packet.
	 */
	if (skb_dst(skb) == NULL) {
		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					       iph->tos, skb->dev);
		if (unlikely(err)) {
			if (err == -EHOSTUNREACH)
				IP_INC_STATS_BH(dev_net(skb->dev),
						IPSTATS_MIB_INADDRERRORS);
			else if (err == -ENETUNREACH)
				IP_INC_STATS_BH(dev_net(skb->dev),
						IPSTATS_MIB_INNOROUTES);
			else if (err == -EXDEV)
				NET_INC_STATS_BH(dev_net(skb->dev),
						 LINUX_MIB_IPRPFILTER);
			goto drop;
		}
	}

	/*
	 * Update this CPU's ip_rt_acct counters (the original article points
	 * to /proc/net/rt_acct for them). The low byte of tclassid selects
	 * the entry whose "o_" counters are bumped, bits 16..23 select the
	 * entry whose "i_" counters are bumped.
	 */
#ifdef CONFIG_NET_CLS_ROUTE
	if (unlikely(skb_dst(skb)->tclassid)) {
		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
		u32 idx = skb_dst(skb)->tclassid;
		st[idx&0xFF].o_packets++;
		st[idx&0xFF].o_bytes += skb->len;
		st[(idx>>16)&0xFF].i_packets++;
		st[(idx>>16)&0xFF].i_bytes += skb->len;
	}
#endif

	/* A header longer than 5 words carries IP options: process them. */
	if (iph->ihl > 5 && ip_rcv_options(skb))
		goto drop;

	/* Per-destination-type statistics for multicast and broadcast. */
	rt = skb_rtable(skb);
	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
				   skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
				   skb->len);

	/* Where the packet goes next is decided by dst_input(); see the text that follows. */
	return dst_input(skb);

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}
这个决定L3包走向的函数指针在那里设置的呢,跟着我看一下吧
接着上一篇讲过的inet_init()->ip_init()->ip_rt_init()
-->
rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
<route.c __rtnl_register登记 >
-->
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, bool noref)
{
//...
if (our ) {
int res = ip_route_input_mc(skb, daddr, saddr,
tos, dev, our);
//...
}
//...
}
//...
res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
} inet_rtm_getroute()->ip_route_input()->ip_route_input_common()-
A:ip_route_input_slow()
->ip_mkroute_input()->__mkroute_input()
"ip_forward;"
B<our>: ip_route_input_mc()
B1 "ip_local_deliver"
B2 "init_net.loopback_dev";
以上只列出了几条 likely 的常见路径,不考虑异常分支,别的就先不介绍了
int ip_forward(struct sk_buff *skb)
{
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options * opt = &(IPCB(skb)->opt);
if (skb_warn_if_lro(skb))
goto drop;
if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
goto drop;
/*Router Alert option 的处理,后面会分析 ip_call_ra_chain()函数*/
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
/*pkt_type在L2层处理的时候有无设置PACKET_HOST*/
if (skb->pkt_type != PACKET_HOST)
goto drop;
/*什么都没做 直接让pass了*/
skb_forward_csum(skb);
/*rfc 规定防止路由循环等等用的*/
if (ip_hdr(skb)->ttl <= 1)
goto too_many_hops;
/*VPN IPSec交给xfrm 框架处理转发*/
if (!xfrm4_route_forward(skb))
goto drop;
rt = skb_rtable(skb);
/*如果IP包选项指明了要用自己提供的路由来走<Strict Source Routing >,而又不能满*足就失败*/
if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
goto sr_failed;
/*如果包长度大于了目的地的MTU 却禁止分包 就发送一个ICMP<这块参考 TCP/IP详解卷一 9章>*/
if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
(ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(dst_mtu(&rt->dst)));
goto drop;
}
/* We are about to mangle packet. Copy it! 这个之前解释过了 */
if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
goto drop;
iph = ip_hdr(skb);
/* Decrease ttl after skb cow done 协议要求*/
ip_decrease_ttl(iph);
/*
*如果包允许走别的路由而且他也表示希望走一个更好的,就重新计算路由,当然也会 * 引起一个ICMP ,这个函数后面分析 */
if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
ip_rt_send_redirect(skb);
/*根据 IP包头部的TOS 设置包优先级 给后面的 Traffic Control 用<ps: 路由器设计很重视这个选项>*/
skb->priority = rt_tos2priority(iph->tos);
/*之前分析过,经过NetFilter后调用ip_forward_finish*/
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
rt->dst.dev, ip_forward_finish);
sr_failed:
/*
* Strict routing permits no gatewaying 协议规定的参考 TCP/IP-I 1
*/
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
goto drop;
too_many_hops:
/* Tell the sender its packet died... */
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}好吧 ,下面就是
/*
 * ip_forward_finish - last step of forwarding, run after the FORWARD hook.
 *
 * Counts the forwarded datagram, processes IP options if the packet has
 * any, and hands the packet to dst_output().
 *
 * @skb: packet being forwarded
 *
 * Returns the result of dst_output().
 */
static int ip_forward_finish(struct sk_buff *skb)
{
	struct ip_options *fwd_opts = &(IPCB(skb)->opt);

	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);

	/*
	 * ip_forward already dealt with Router Alert and Strict Source
	 * Routing; any options present are handled here. The unlikely()
	 * marks this as the rare case.
	 */
	if (unlikely(fwd_opts->optlen))
		ip_forward_options(skb);

	/* Final hand-off: dst_output() sends the packet on its way. */
	return dst_output(skb);
}
=========================
补充上面说的 ip_call_ra_chain
structip_ra_chain__rcu*next;
structsock*sk;
union{
void(*destructor)(structsock*);
structsock*saved_sk;
};
structrcu_headrcu;
};int ip_call_ra_chain(struct sk_buff *skb)
{
struct ip_ra_chain *ra;
u8 protocol = ip_hdr(skb)->protocol;
struct sock *last = NULL;
struct net_device *dev = skb->dev;
/*这里遍历了整个raw sock 链表 */
for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
/* If socket is bound to an interface, only report
* the packet if it came from that interface.
*/
/*包头端口号和该raw sock 的端口匹配 设备接口序号也匹配*/
if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
net_eq(sock_net(sk), dev_net(dev))) {
/*如果分段过 就去重组整个IP包 ip_fragment.c*/
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
return 1;
}
if (last) {
/*关键就是这里,把包复制一遍然后传给上层
*放入该sock的sk_receive_queue*/
struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (skb2)
raw_rcv(last, skb2);
}
/*下一个raw sock */
last = sk;
}
}
if (last) {
raw_rcv(last, skb);
return 1;
}