The Route Cache Subsystem

1. Initialization

/* memory pool from which each rtable entry is allocated */
ipv4_dst_ops.kmem_cachep =
    kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
              SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 
/* the route cache hash table; 2^rt_hash_log == rt_hash_mask + 1 */
rt_hash_table = (struct rt_hash_bucket *)
    alloc_large_system_hash("IP route cache",
                sizeof(struct rt_hash_bucket),
                rhash_entries,
                (num_physpages >= 128 * 1024) ?
                15 : 17,
                0,
                &rt_hash_log,
                &rt_hash_mask,
                0);
memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
rt_hash_lock_init();
 
ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
ip_rt_max_size = (rt_hash_mask + 1) * 16;
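
rt_hash_lock_init() does not allocate one lock per bucket. On SMP the kernel sets up a small, fixed power-of-two array of spinlocks and maps bucket indexes onto it; rt_hash_lock_addr(hash), used throughout below, returns the lock covering a given bucket. A simplified sketch (in the real source RT_HASH_LOCK_SZ is scaled by NR_CPUS and allocation failure is handled; both are trimmed here):

#ifdef CONFIG_SMP
#define RT_HASH_LOCK_SZ 256            /* power of two: the locking granularity */
static spinlock_t *rt_hash_locks;
#define rt_hash_lock_addr(slot) \
    (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])

static void rt_hash_lock_init(void)
{
    int i;

    /* many buckets share each lock; sharing only costs some contention */
    rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL);
    for (i = 0; i < RT_HASH_LOCK_SZ; i++)
        spin_lock_init(&rt_hash_locks[i]);
}
#else
/* UP: spin_lock_bh(NULL) still disables bottom halves */
#define rt_hash_lock_addr(slot) NULL
#define rt_hash_lock_init()
#endif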
 

In memory the route hash table is laid out as follows:

[figure: rt_hash_table, an array of rt_hash_bucket slots, each heading a collision chain of rtable entries]

Here rtable is IPv4's wrapper around the dst_entry structure. dst_entry is the protocol-independent part; it carries the members needed for route cache management as well as the hooks into the neighbour subsystem and its cache.
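
For orientation, a trimmed sketch of the structures involved (fields abridged from the 2.6.2x headers; the real definitions carry many more members):

struct rt_hash_bucket {
    struct rtable *chain;              /* head of this bucket's collision chain */
};

struct rtable {
    union {
        struct dst_entry dst;          /* must come first, so a struct rtable *
                                        * can be treated as a dst_entry *      */
    } u;
    struct flowi fl;                   /* the flow this entry answers          */
    __be32 rt_dst;                     /* destination address                  */
    __be32 rt_src;                     /* source address                       */
    __be32 rt_gateway;                 /* next hop                             */
    /* ... */
};

struct dst_entry {                     /* protocol-independent part */
    struct rcu_head rcu_head;          /* for deferred freeing under RCU       */
    struct neighbour *neighbour;       /* hook into the neighbour subsystem    */
    struct hh_cache *hh;               /* cached hardware header               */
    int (*input)(struct sk_buff *);
    int (*output)(struct sk_buff *);
    atomic_t __refcnt;
    unsigned long lastuse;
    union {
        struct dst_entry *next;
        struct rtable *rt_next;        /* chain link used by the IPv4 cache    */
    };
    /* ... */
};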

2. Insertion

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
    struct rtable    *rth, **rthp;
    unsigned long    now;
    struct rtable *cand, **candp;
    u32         min_score;
    int        chain_length;
    int attempts = !in_softirq();    /* allow one synchronous GC attempt, but only outside softirq */
 
restart:
    chain_length = 0;
    min_score = ~(u32)0;
    cand = NULL;
    candp = NULL;
    now = jiffies;
 
    rthp = &rt_hash_table[hash].chain;
 
    /* take this bucket's lock */
    spin_lock_bh(rt_hash_lock_addr(hash));
    while ((rth = *rthp) != NULL) {
        /* compare the flowi keys */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
        if (!(rth->u.dst.flags & DST_BALANCED) &&
            compare_keys(&rth->fl, &rt->fl)) {
#else
        if (compare_keys(&rth->fl, &rt->fl)) { 
#endif
            /* Even though we got here because a cache lookup missed,
             * search the chain first: another CPU may have inserted
             * this entry in the meantime */
            /* found: move it to the front of the chain */
            *rthp = rth->u.dst.rt_next;
            /*
             * Since lookup is lockfree, the deletion
             * must be visible to another weakly ordered CPU before
             * the insertion at the start of the hash chain.
             */
            rcu_assign_pointer(rth->u.dst.rt_next,
                       rt_hash_table[hash].chain);
            /*
             * Since lookup is lockfree, the update writes
             * must be ordered for consistency on SMP.
             */
            rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 
            rth->u.dst.__use++;
            dst_hold(&rth->u.dst);
            rth->u.dst.lastuse = now;
            spin_unlock_bh(rt_hash_lock_addr(hash));
 
            rt_drop(rt);
            *rp = rth;
            return 0;
        }
 
        /* while walking the chain, remember the entry most eligible for removal */
        if (!atomic_read(&rth->u.dst.__refcnt)) {
            u32 score = rt_score(rth);
 
            if (score <= min_score) {
                cand = rth;
                candp = rthp;
                min_score = score;
            }
        }
 
        chain_length++;
 
        rthp = &rth->u.dst.rt_next;
    }
 
    if (cand) {
        /* ip_rt_gc_elasticity used to be average length of chain
         * length, when exceeded gc becomes really aggressive.
         *
         * The second limit is less certain. At the moment it allows
         * only 2 entries per bucket. We will see.
         */
        if (chain_length > ip_rt_gc_elasticity) {
            /* to keep the table from growing, remove one entry while inserting one */
            *candp = cand->u.dst.rt_next;
            rt_free(cand);
        }
    }
 
    /* broadcast, multicast and local-delivery routes need no neighbour binding */
    /* Try to bind route to arp only if it is output
       route or unicast forwarding path.
     */
    if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
        int err = arp_bind_neighbour(&rt->u.dst);
        if (err) {
            spin_unlock_bh(rt_hash_lock_addr(hash));
 
            /* if the failure was not caused by memory shortage, return the error */
            if (err != -ENOBUFS) {
                rt_drop(rt);
                return err;
            }
 
            /* run synchronous garbage collection at most once, and only outside softirq context, since it is expensive */
            /* Neighbour tables are full and nothing
               can be released. Try to shrink route cache,
               it is most likely it holds some neighbour records.
             */
            if (attempts-- > 0) {
                int saved_elasticity = ip_rt_gc_elasticity;
                int saved_int = ip_rt_gc_min_interval;
                /* memory is short: lower ip_rt_gc_elasticity and ip_rt_gc_min_interval, then collect garbage */
                ip_rt_gc_elasticity    = 1;
                ip_rt_gc_min_interval    = 0;
                rt_garbage_collect();
                ip_rt_gc_min_interval    = saved_int;
                ip_rt_gc_elasticity    = saved_elasticity;
                goto restart;
            }
 
            if (net_ratelimit())
                printk(KERN_WARNING "Neighbour table overflow.\n");
            rt_drop(rt);
            return -ENOBUFS;
        }
    }
 
    /* insert at the head of the chain */
    rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
    if (rt->u.dst.rt_next) {
        struct rtable *trt;
        printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
               NIPQUAD(rt->rt_dst));
        for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
            printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
        printk("\n");
    }
#endif
    rt_hash_table[hash].chain = rt;
    spin_unlock_bh(rt_hash_lock_addr(hash));
    *rp = rt;
    return 0;
}
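
compare_keys, used above, is essentially a memcmp over the IPv4 part of the flowi plus the interface indexes and mark; roughly the following in this kernel generation (a sketch, per the 2.6.2x source):

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
    /* ip4_u covers daddr, saddr, tos and scope in one block */
    return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u,
                  sizeof(fl1->nl_u.ip4_u)) == 0 &&
           fl1->oif  == fl2->oif &&
           fl1->iif  == fl2->iif &&
           fl1->mark == fl2->mark;
}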

The scoring function is as follows:

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
    /* the longer since last use, the lower the score */
    u32 score = jiffies - rt->u.dst.lastuse;
 
    score = ~score & ~(3<<30);
 
    /* routes added by ICMP redirects or flagged for notification, and routes with an expiry set, are the most valuable */
    if (rt_valuable(rt))
        score |= (1<<31);
 
    /* output routes, and input routes that are neither broadcast, multicast nor local (i.e. forwarded unicast), rank next */
    if (!rt->fl.iif ||
        !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
        score |= (1<<30);
 
    /* the lower the score, the more likely the entry is to be removed */
    return score;
}
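
rt_valuable, which decides bit 31, checks for redirect-installed or notify-flagged routes, and for routes carrying an expiry (roughly, per the same source):

static inline int rt_valuable(struct rtable *rth)
{
    return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
           rth->u.dst.expires;
}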

3. Lookup

The routing subsystem exposes two lookup functions, ip_route_input and ip_route_output_key, for the input and output directions respectively. Both search the route cache first, since the cache is shared by input and output routes; on a cache miss they fall back to the routing tables.
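
Both directions use the same hash over (daddr, saddr, interface index). In this era it is roughly a jhash of the two addresses keyed by the random value rt_hash_rnd (reseeded on every flush; see rt_run_flush below), with the interface index folded into the source address:

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
    return jhash_2words(daddr, saddr, rt_hash_rnd) & rt_hash_mask;
}

/* idx is fl.iif on the input path and fl.oif on the output path */
#define rt_hash(daddr, saddr, idx) \
    rt_hash_code((__force u32)(__be32)(daddr), \
                 (__force u32)(__be32)(saddr) ^ ((idx) << 5))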

Input path:

/* Whether the route lookup succeeds or fails, skb->dst->input and skb->dst->output get initialized.
 * skb->dst is the cache entry satisfying the request; on a cache miss a new dst_entry is created and attached to skb->dst.
 * The packet is then processed by dst_input or dst_output, which call the two functions initialized here.
 */
int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
           u8 tos, struct net_device *dev)
{
    struct rtable * rth;
    unsigned    hash;
    int iif = dev->ifindex;
 
    tos &= IPTOS_RT_MASK;
    hash = rt_hash(daddr, saddr, iif);
 
    rcu_read_lock();
    for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
         rth = rcu_dereference(rth->u.dst.rt_next)) {
        /* the input path must also match the input device */
        if (rth->fl.fl4_dst == daddr &&
            rth->fl.fl4_src == saddr &&
            rth->fl.iif == iif &&
            rth->fl.oif == 0 &&
            rth->fl.mark == skb->mark &&
            rth->fl.fl4_tos == tos) {
            rth->u.dst.lastuse = jiffies;
            dst_hold(&rth->u.dst);
            rth->u.dst.__use++;
            RT_CACHE_STAT_INC(in_hit);
            rcu_read_unlock();
            skb->dst = (struct dst_entry*)rth;
            return 0;
        }
        RT_CACHE_STAT_INC(in_hlist_search);
    }
    rcu_read_unlock();
 
    /* Multicast recognition logic is moved from route cache to here.
       The problem was that too many Ethernet cards have broken/missing
       hardware multicast filters :-( As result the host on multicasting
       network acquires a lot of useless route cache entries, sort of
       SDR messages from all the world. Now we try to get rid of them.
       Really, provided software IP multicast filter is organized
       reasonably (at least, hashed), it does not result in a slowdown
       comparing with route cache reject entries.
       Note, that multicast routers are not affected, because
       route cache entry is created eventually.
     */
    if (MULTICAST(daddr)) {
        struct in_device *in_dev;
 
        rcu_read_lock();
        if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
            /* check whether the multicast address is configured locally */
            int our = ip_check_mc(in_dev, daddr, saddr,
                skb->nh.iph->protocol);
            if (our
#ifdef CONFIG_IP_MROUTE
                /* compiled in only if multicast routing is supported */
                || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
#endif
                ) {
                rcu_read_unlock();
                return ip_route_input_mc(skb, daddr, saddr,
                             tos, dev, our);
            }
        }
        rcu_read_unlock();
        return -EINVAL;
    }
 
    /* route cache miss: fall back to the routing tables */
    return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

Output path:

int ip_route_output_key(struct rtable **rp, struct flowi *flp)
{
    return ip_route_output_flow(rp, flp, NULL, 0);
}
int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
{
    int err;
 
    if ((err = __ip_route_output_key(rp, flp)) != 0)
        return err;
 
    if (flp->proto) {
        if (!flp->fl4_src)
            flp->fl4_src = (*rp)->rt_src;
        if (!flp->fl4_dst)
            flp->fl4_dst = (*rp)->rt_dst;
        return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
    }
 
    return 0;
}
int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
{
    unsigned hash;
    struct rtable *rth;
 
    hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
 
    rcu_read_lock_bh();
    for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
        rth = rcu_dereference(rth->u.dst.rt_next)) {
        /* If RTO_ONLINK is set it must match as well; the flag means the destination is on an attached subnet and needs no gateway */
        if (rth->fl.fl4_dst == flp->fl4_dst &&
            rth->fl.fl4_src == flp->fl4_src &&
            rth->fl.iif == 0 &&
            rth->fl.oif == flp->oif &&
            rth->fl.mark == flp->mark &&
            !((rth->fl.fl4_tos ^ flp->fl4_tos) &
                (IPTOS_RT_MASK | RTO_ONLINK))) {
 
            /* check for multipath routes and choose one if
             * necessary
             */
            if (multipath_select_route(flp, rth, rp)) {
                dst_hold(&(*rp)->u.dst);
                RT_CACHE_STAT_INC(out_hit);
                rcu_read_unlock_bh();
                return 0;
            }
 
            rth->u.dst.lastuse = jiffies;
            dst_hold(&rth->u.dst);
            rth->u.dst.__use++;
            RT_CACHE_STAT_INC(out_hit);
            rcu_read_unlock_bh();
            *rp = rth;
            return 0;
        }
        RT_CACHE_STAT_INC(out_hlist_search);
    }
    rcu_read_unlock_bh();
 
    /* cache miss: fall back to the routing tables */
    return ip_route_output_slow(rp, flp);
}
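
For context, a hypothetical caller of the output lookup looks roughly like this (dip, sip and tos are placeholder variables, and IPPROTO_UDP is just an example choice; error handling abridged):

struct flowi fl = { .oif = 0,                    /* 0 = any output device */
                    .nl_u = { .ip4_u = { .daddr = dip,
                                         .saddr = sip,   /* 0 lets routing choose */
                                         .tos   = RT_TOS(tos) } },
                    .proto = IPPROTO_UDP };
struct rtable *rt;

if (ip_route_output_key(&rt, &fl) == 0) {
    /* success: rt is held; use rt->u.dst, then release the reference */
    ip_rt_put(rt);
}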

4. Flush

Events that trigger a flush:

  1. a device going up or down
  2. adding or removing one of a device's IP addresses
  3. a change of the global or per-device forwarding setting
  4. deletion of a route
  5. a write to /proc/sys/net/ipv4/route/flush

All of these paths end up in rt_cache_flush:
void rt_cache_flush(int delay)
{
    unsigned long now = jiffies;
    int user_mode = !in_softirq();

    /* delay == 0: flush now; delay > 0: flush after 'delay';
     * delay < 0: flush after ip_rt_min_delay */
    if (delay < 0)
        delay = ip_rt_min_delay;

    /* flush existing multipath state*/
    multipath_flush();

    spin_lock_bh(&rt_flush_lock);

    if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
        long tmo = (long)(rt_deadline - now);

        /* If flush timer is already running
           and flush request is not immediate (delay > 0):
           if deadline is not achieved, prolongate timer to "delay",
           otherwise fire it at deadline time.
         */

        if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
            tmo = 0;

        if (delay > tmo)
            delay = tmo;
    }

    if (delay <= 0) {
        spin_unlock_bh(&rt_flush_lock);
        rt_run_flush(0);
        return;
    }

    if (rt_deadline == 0)
        rt_deadline = now + ip_rt_max_delay;

    mod_timer(&rt_flush_timer, now+delay);
    spin_unlock_bh(&rt_flush_lock);
}
/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
    int i;
    struct rtable *rth, *next;
 
    rt_deadline = 0;
 
    /* reseed the hash input, so entries land in new buckets after the flush */
    get_random_bytes(&rt_hash_rnd, 4);
 
    for (i = rt_hash_mask; i >= 0; i--) {
        spin_lock_bh(rt_hash_lock_addr(i));
        rth = rt_hash_table[i].chain;
        if (rth)
            rt_hash_table[i].chain = NULL;
        spin_unlock_bh(rt_hash_lock_addr(i));
 
        /* free every entry that was on the chain */
        for (; rth; rth = next) {
            next = rth->u.dst.rt_next;
            rt_free(rth);
        }
    }
}

5. Deletion

A dst_entry's reference count is incremented by dst_hold and decremented by dst_release; when dst_release drops the count to zero it does not delete the entry. In IPv4, entries are deleted through rt_free and rt_drop.
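
rt_free and rt_drop are thin wrappers (essentially as in the 2.6.2x source): both hand the entry to an RCU callback, so lockless readers still walking a hash chain can finish before the memory is reclaimed; rt_drop additionally drops the caller's reference:

static void dst_rcu_free(struct rcu_head *head)
{
    struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
    dst_free(dst);
}

static inline void rt_free(struct rtable *rt)
{
    call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
    ip_rt_put(rt);                     /* release the caller's reference */
    call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}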

Two points to note before a cache entry is deleted:

  1. When an entry cannot be deleted because it is still referenced, its obsolete field is set to 2; calling dst_free on an entry that is already obsolete does nothing.
  2. A dst_entry may have a child: IPsec, for example, chains dst_entry structures into transformation bundles, and only the last dst_entry in the chain acts as the actual route.
static inline void dst_free(struct dst_entry * dst)
{
    /* already marked dead */
    if (dst->obsolete > 1)
        return;
 
    /* reference count is zero */
    if (!atomic_read(&dst->__refcnt)) {
        dst = dst_destroy(dst);
        /* everything was freed */
        if (!dst)
            return;
    }
 
    /* otherwise a DST_NOHASH child is still referenced: mark the entry dead and queue it on dst_garbage_list for later */
    __dst_free(dst);
}
struct dst_entry *dst_destroy(struct dst_entry * dst)
{
    struct dst_entry *child;
    struct neighbour *neigh;
    struct hh_cache *hh;
 
    smp_rmb();
 
again:
    neigh = dst->neighbour;
    hh = dst->hh;
    child = dst->child;
 
    dst->hh = NULL;
    if (hh && atomic_dec_and_test(&hh->hh_refcnt))
        kfree(hh);
 
    if (neigh) {
        dst->neighbour = NULL;
        neigh_release(neigh);
    }
 
    atomic_dec(&dst->ops->entries);
 
    /* for IPv4 this mainly does in_dev_put(rt->idev) */
    if (dst->ops->destroy)
        dst->ops->destroy(dst);
    if (dst->dev)
        dev_put(dst->dev);
#if RT_CACHE_DEBUG >= 2
    atomic_dec(&dst_total);
#endif
    kmem_cache_free(dst->ops->kmem_cachep, dst);
 
    /* walk the chain of child dst_entrys, if any */
    dst = child;
    if (dst) {
        /* DST_NOHASH marks a dst_entry used as part of a transformation bundle */
        int nohash = dst->flags & DST_NOHASH;
 
        if (atomic_dec_and_test(&dst->__refcnt)) {
            /* We were real parent of this dst, so kill child. */
            if (nohash)
                goto again;
        } else {
            /* Child is still referenced, return it for freeing. */
            if (nohash)
                return dst;
            /* Child is still in his hash table */
        }
    }
    return NULL;
}
void __dst_free(struct dst_entry * dst)
{
    spin_lock_bh(&dst_lock);
    ___dst_free(dst);
    
    /* queue it on dst_garbage_list */
    dst->next = dst_garbage_list;
    dst_garbage_list = dst;
 
    if (dst_gc_timer_inc > DST_GC_INC) {
        dst_gc_timer_inc = DST_GC_INC;
        dst_gc_timer_expires = DST_GC_MIN;
        mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
    }
    spin_unlock_bh(&dst_lock);
}
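
___dst_free, called here and again from dst_run_gc below, is what actually marks an entry dead (roughly, per net/core/dst.c of this era):

static void ___dst_free(struct dst_entry *dst)
{
    /* if the device is gone or down, no packet may use this entry any more */
    if (dst->dev == NULL || !(dst->dev->flags & IFF_UP)) {
        dst->input  = dst_discard_in;
        dst->output = dst_discard_out;
    }
    dst->obsolete = 2;                 /* the "dead" mark tested by dst_free() */
}
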
static void dst_run_gc(unsigned long dummy)
{
    int    delayed = 0;
    int    work_performed;
    struct dst_entry * dst, **dstp;
 
    /* could not get the lock: retry in a tenth of a second */
    if (!spin_trylock(&dst_lock)) {
        mod_timer(&dst_gc_timer, jiffies + HZ/10);
        return;
    }
 
    del_timer(&dst_gc_timer);
    dstp = &dst_garbage_list;
    work_performed = 0;
    while ((dst = *dstp) != NULL) {
        /* still referenced */
        if (atomic_read(&dst->__refcnt)) {
            dstp = &dst->next;
            delayed++;
            continue;
        }
        *dstp = dst->next;
        work_performed = 1;
 
        dst = dst_destroy(dst);
        if (dst) {
            /* NOHASH and still referenced. Unless it is already
             * on gc list, invalidate it and add to gc list.
             *
             * Note: this is temporary. Actually, NOHASH dst's
             * must be obsoleted when parent is obsoleted.
             * But we do not have state "obsoleted, but
             * referenced by parent", so it is right.
             */
            if (dst->obsolete > 1)
                continue;
 
            ___dst_free(dst);
            dst->next = *dstp;
            *dstp = dst;
            dstp = &dst->next;
        }
    }
    if (!dst_garbage_list) {
        dst_gc_timer_inc = DST_GC_MAX;
        goto out;
    }
    if (!work_performed) {
        if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
            dst_gc_timer_expires = DST_GC_MAX;
        dst_gc_timer_inc += DST_GC_INC;
    } else {
        dst_gc_timer_inc = DST_GC_INC;
        dst_gc_timer_expires = DST_GC_MIN;
    }
#if RT_CACHE_DEBUG >= 2
    printk("dst_total: %d/%d %ld\n",
           atomic_read(&dst_total), delayed,  dst_gc_timer_expires);
#endif
    /* if the next desired timer is more than 4 seconds in the future
     * then round the timer to whole seconds
     */
    if (dst_gc_timer_expires > 4*HZ)
        mod_timer(&dst_gc_timer,
            round_jiffies(jiffies + dst_gc_timer_expires));
    else
        mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
 
out:
    spin_unlock(&dst_lock);
}
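
The back-off constants referenced above come from net/core/dst.c. While nothing can be freed, the retry interval grows by an ever larger increment, capped at two minutes; as soon as a pass frees something, it snaps back to the minimum:

#define DST_GC_MIN  (HZ/10)            /* shortest retry interval: 0.1 s */
#define DST_GC_INC  (HZ/2)             /* base step for growing it       */
#define DST_GC_MAX  (120*HZ)           /* longest retry interval: 2 min  */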

6. Garbage Collection

Synchronous garbage collection

Synchronous garbage collection is invoked when the DST subsystem detects a memory shortage. The DST layer decides that a collection is needed, while the protocol's VFT provides the implementation; IPv4 initializes dst_ops->gc to rt_garbage_collect. It is invoked in the following two cases:

  1. In rt_intern_hash, when binding the neighbour to the route cache entry: if the neighbour cannot be created because a memory allocation failed, garbage collection is invoked, since freeing unused route cache entries can also release the neighbour entries they hold:
        if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
            int err = arp_bind_neighbour(&rt->u.dst);
            if (err) {
                spin_unlock_bh(rt_hash_lock_addr(hash));
     
                if (err != -ENOBUFS) {
                    rt_drop(rt);
                    return err;
                }
     
                /* Neighbour tables are full and nothing
                   can be released. Try to shrink route cache,
                   it is most likely it holds some neighbour records.
                 */
                if (attempts-- > 0) {
                    int saved_elasticity = ip_rt_gc_elasticity;
                    int saved_int = ip_rt_gc_min_interval;
                    ip_rt_gc_elasticity    = 1;
                    ip_rt_gc_min_interval    = 0;
                /* run one round of garbage collection when not in softirq context */
                    rt_garbage_collect();
                    ip_rt_gc_min_interval    = saved_int;
                    ip_rt_gc_elasticity    = saved_elasticity;
                    goto restart;
                }
     
                if (net_ratelimit())
                    printk(KERN_WARNING "Neighbour table overflow.\n");
                rt_drop(rt);
                return -ENOBUFS;
            }
        }
  2. In dst_alloc, when the number of route cache entries exceeds gc_thresh:
void * dst_alloc(struct dst_ops * ops)
{
    struct dst_entry * dst;
 
    /* run the protocol's synchronous garbage collector if over gc_thresh */
    if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
        if (ops->gc())
            return NULL;
    }
    dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC);
    if (!dst)
        return NULL;
    atomic_set(&dst->__refcnt, 0);
    dst->ops = ops;
    dst->lastuse = jiffies;
    dst->path = dst;
    dst->input = dst_discard_in;
    dst->output = dst_discard_out;
#if RT_CACHE_DEBUG >= 2
    atomic_inc(&dst_total);
#endif
    atomic_inc(&ops->entries);
    return dst;
}


/*
   Short description of GC goals.
   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.
   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */
 
static int rt_garbage_collect(void)
{
    static unsigned long expire = RT_GC_TIMEOUT;
    static unsigned long last_gc;
    static int rover;
    static int equilibrium;
    struct rtable *rth, **rthp;
    unsigned long now = jiffies;
    int goal;
 
    /*
     * Garbage collection is pretty expensive,
     * do not make it too frequently.
     */
 
    RT_CACHE_STAT_INC(gc_total);
 
    /* less than ip_rt_gc_min_interval since the last run, and the cache has not reached its maximum size */
    if (now - last_gc < ip_rt_gc_min_interval &&
        atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
        /* avoid running this expensive operation too often */
        RT_CACHE_STAT_INC(gc_ignored);
        goto out;
    }
 
    /* goal is the number of cache entries to remove; equilibrium is the number expected to remain afterwards */
    /* Calculate number of entries, which we want to expire now. */
    goal = atomic_read(&ipv4_dst_ops.entries) -
        (ip_rt_gc_elasticity << rt_hash_log);
    if (goal <= 0) {
        if (equilibrium < ipv4_dst_ops.gc_thresh)
            equilibrium = ipv4_dst_ops.gc_thresh;
        goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
        if (goal > 0) {
            equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
            goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
        }
    } else {
        /* the entry count exceeds ip_rt_gc_elasticity * 2^rt_hash_log */
        /* We are in dangerous area. Try to reduce cache really
         * aggressively.
         */
        goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
        equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
    }
 
    if (now - last_gc >= ip_rt_gc_min_interval)
        last_gc = now;
 
    if (goal <= 0) {
        equilibrium += goal;
        goto work_done;
    }
 
    do {
        int i, k;
 
        /* rover remembers the last bucket cleaned on the previous run, so the scan does not always restart at bucket 0 */
        for (i = rt_hash_mask, k = rover; i >= 0; i--) {
            unsigned long tmo = expire;
 
            k = (k + 1) & rt_hash_mask;
            rthp = &rt_hash_table[k].chain;
            spin_lock_bh(rt_hash_lock_addr(k));
            while ((rth = *rthp) != NULL) {
                if (!rt_may_expire(rth, tmo, expire)) {
                    /* this one cannot expire: halve tmo */
                    tmo >>= 1;
                    rthp = &rth->u.dst.rt_next;
                    continue;
                }
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
                /* remove all related balanced entries
                 * if necessary
                 */
                if (rth->u.dst.flags & DST_BALANCED) {
                    int r;
 
                    rthp = rt_remove_balanced_route(
                        &rt_hash_table[k].chain,
                        rth,
                        &r);
                    goal -= r;
                    if (!rthp)
                        break;
                } else {
                    *rthp = rth->u.dst.rt_next;
                    rt_free(rth);
                    goal--;
                }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
                *rthp = rth->u.dst.rt_next;
                rt_free(rth);
                goal--;
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
            }
            spin_unlock_bh(rt_hash_lock_addr(k));
 
            /* once enough entries are removed, stop scanning further buckets */
            if (goal <= 0)
                break;
        }
        rover = k;
 
        if (goal <= 0)
            goto work_done;
 
        /* Goal is not achieved. We stop process if:
           - if expire reduced to zero. Otherwise, expire is halfed.
           - if table is not full.
           - if we are called from interrupt.
           - jiffies check is just fallback/debug loop breaker.
             We will not spin here for long time in any case.
         */
 
        RT_CACHE_STAT_INC(gc_goal_miss);
 
        if (expire == 0)
            break;
 
        /* the whole table was scanned but not enough entries were removed: halve expire and scan the hash table again */
        expire >>= 1;
#if RT_CACHE_DEBUG >= 2
        printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
                atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif
 
        if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
            goto out;
    } while (!in_softirq() && time_before_eq(jiffies, now)); /* loop again only outside softirq, and never past the current jiffy */
 
    if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
        goto out;
    if (net_ratelimit())
        printk(KERN_WARNING "dst cache overflow\n");
    RT_CACHE_STAT_INC(gc_dst_overflow);
    return 1;
 
work_done:
    expire += ip_rt_gc_min_interval;
    if (expire > ip_rt_gc_timeout ||
        atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
        expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
    printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
            atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:    return 0;
}

The hash table has rt_hash_mask + 1 (= 2^rt_hash_log) buckets; rt_garbage_collect is invoked when the number of cached entries exceeds gc_thresh (which defaults to the hash table size).

The cache can hold at most ip_rt_max_size = 16 * (rt_hash_mask + 1) entries.

When the entry count exceeds ip_rt_gc_elasticity * 2^rt_hash_log, the collector sets a comparatively large goal and cleans aggressively.
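
As a concrete (hypothetical) example, on a box where alloc_large_system_hash settled on rt_hash_log = 15 and ip_rt_gc_elasticity keeps its default of 8:

buckets          = 1 << 15;        /* 32768 == rt_hash_mask + 1                  */
gc_thresh        = buckets;        /* 32768: dst_alloc starts invoking gc        */
aggressive_limit = 8 << 15;        /* 262144: rt_garbage_collect turns really
                                    * aggressive past this point                 */
ip_rt_max_size   = 16 * buckets;   /* 524288: hard cap; past it dst_alloc fails  */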


rt_may_expire, the function that decides whether a cache entry may be removed:

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
    unsigned long age;
    int ret = 0;
 
    if (atomic_read(&rth->u.dst.__refcnt))
        goto out;
 
    ret = 1;
    if (rth->u.dst.expires &&
        time_after_eq(jiffies, rth->u.dst.expires))
        goto out;
 
    age = jiffies - rth->u.dst.lastuse;
    ret = 0;
    if ((age <= tmo1 && !rt_fast_clean(rth)) ||
        (age <= tmo2 && rt_valuable(rth)))
        goto out;
    ret = 1;
out:    return ret;
}
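
The first condition also consults rt_fast_clean, which (per the same source) singles out input broadcast/multicast entries that collide with other entries in their bucket; these are cheap to recreate and so are expired aggressively:

static __inline__ int rt_fast_clean(struct rtable *rth)
{
    /* kill broadcast/multicast entries very aggressively if they
     * collide in the hash table with more useful entries */
    return (rth->rt_flags & (RTCF_FAST | RTCF_MULTICAST)) &&
           rth->fl.iif && rth->u.dst.rt_next;
}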

Asynchronous garbage collection

Asynchronous garbage collection runs rt_check_expire from rt_periodic_timer to remove entries that may have expired; the timer fires every ip_rt_gc_interval. Each run scans goal = (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout buckets, so on average the whole table is covered once per ip_rt_gc_timeout.

/* This runs via a timer and thus is always in BH context. */
static void rt_check_expire(unsigned long dummy)
{
    static unsigned int rover;
    unsigned int i = rover, goal;
    struct rtable *rth, **rthp;
    unsigned long now = jiffies;
    u64 mult;
 
    mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
    if (ip_rt_gc_timeout > 1)
        do_div(mult, ip_rt_gc_timeout);
    goal = (unsigned int)mult;
    if (goal > rt_hash_mask) 
        goal = rt_hash_mask + 1;
    for (; goal > 0; goal--) {
        unsigned long tmo = ip_rt_gc_timeout;
 
        /* like the synchronous collector, the asynchronous one remembers the last bucket scanned on the previous run */
        i = (i + 1) & rt_hash_mask;
        rthp = &rt_hash_table[i].chain;
 
        if (*rthp == 0)
            continue;
        spin_lock(rt_hash_lock_addr(i));
        while ((rth = *rthp) != NULL) {
            if (rth->u.dst.expires) {
                /* Entry is expired even if it is in use */
                if (time_before_eq(now, rth->u.dst.expires)) {
                    tmo >>= 1;
                    rthp = &rth->u.dst.rt_next;
                    continue;
                }
            } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                tmo >>= 1;
                rthp = &rth->u.dst.rt_next;
                continue;
            }
 
            /* Cleanup aged off entries. */
#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
            /* remove all related balanced entries if necessary */
            if (rth->u.dst.flags & DST_BALANCED) {
                rthp = rt_remove_balanced_route(
                    &rt_hash_table[i].chain,
                    rth, NULL);
                if (!rthp)
                    break;
            } else {
                *rthp = rth->u.dst.rt_next;
                rt_free(rth);
            }
#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
            /* remove the cache entry */
            *rthp = rth->u.dst.rt_next;
            rt_free(rth);
#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
        }
        spin_unlock(rt_hash_lock_addr(i));
 
        /* avoid running too long: never scan past the current jiffy */
        /* Fallback loop breaker. */
        if (time_after(jiffies, now))
            break;
    }
    rover = i;
    mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
}

The code above also checks whether dst_entry->expires has passed. expires is initialized to 0, meaning the entry never expires; it is set in the following cases, each of which goes through dst_set_expires (sketched after the list):

  1. When an ICMP UNREACHABLE / FRAGMENTATION NEEDED message arrives, every cached route whose destination is the IP named in the ICMP payload must be made to expire; ip_rt_frag_needed sets the expiry to ip_rt_mtu_expires.
  2. When TCP updates a route's MTU via path MTU discovery, ip_rt_update_pmtu sets an expiry the same way.
  3. When a destination IP becomes unreachable, the dst_entry structures associated with it are made to expire; IPv4 does this through dst_ops->link_failure.
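
dst_set_expires (net/dst.h in this tree) only ever moves the deadline earlier, never later:

static inline void dst_set_expires(struct dst_entry *dst, int timeout)
{
    unsigned long expires = jiffies + timeout;

    if (expires == 0)                  /* 0 means "never expires": avoid it */
        expires = 1;

    /* keep the earlier of the existing and the new deadline */
    if (dst->expires == 0 || time_before(expires, dst->expires))
        dst->expires = expires;
}
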
Original article: https://www.cnblogs.com/chanwai1219/p/2780131.html