Building the infrastructure for borrowing memory (zonelists)

Linux manages memory through layered abstractions, each backed by a data structure: memory is first organized as a set of nodes, each node as a set of zones, and each zone as a set of pages.
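As a quick orientation, the sketch below (not from the original article; the function name walk_nodes_and_zones is made up for illustration) walks exactly that hierarchy with the kernel's own accessors, printing every populated zone of every online node.

/*
 * Minimal sketch: iterate node -> zone using the standard helpers of
 * this kernel generation (for_each_online_node, NODE_DATA, populated_zone).
 */
static void walk_nodes_and_zones(void)
{
    int nid;
    enum zone_type i;

    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);

        for (i = 0; i < MAX_NR_ZONES; i++) {
            struct zone *zone = &pgdat->node_zones[i];

            if (!populated_zone(zone))
                continue;
            printk(KERN_INFO "node %d zone %-8s: %lu pages\n",
                   nid, zone->name, zone->present_pages);
        }
    }
}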

The pglist_data structure describes a node:

typedef struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];
    struct zonelist node_zonelists[MAX_ZONELISTS];
    int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP    /* means !SPARSEMEM */
    struct page *node_mem_map;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
    struct page_cgroup *node_page_cgroup;
#endif
#endif
    struct bootmem_data *bdata;
#ifdef CONFIG_MEMORY_HOTPLUG
    /*
     * Must be held any time you expect node_start_pfn, node_present_pages
     * or node_spanned_pages stay constant.  Holding this will also
     * guarantee that any pfn_valid() stays that way.
     *
     * Nests above zone->lock and zone->size_seqlock.
     */
    spinlock_t node_size_lock;
#endif
    unsigned long node_start_pfn;
    unsigned long node_present_pages; /* total number of physical pages */
    unsigned long node_spanned_pages; /* total size of physical page
                         range, including holes */
    int node_id;
    wait_queue_head_t kswapd_wait;
    struct task_struct *kswapd;
    int kswapd_max_order;
} pg_data_t;

The zone structure describes a zone:

struct zone {
    /* Fields commonly accessed by the page allocator */
    unsigned long        pages_min, pages_low, pages_high;
    /*
     * We don't know if the memory that we're going to allocate will be freeable
     * or/and it will be released eventually, so to avoid totally wasting several
     * GB of ram we must reserve some of the lower zone memory (otherwise we risk
     * to run OOM on the lower zones despite there's tons of freeable ram
     * on the higher zones). This array is recalculated at runtime if the
     * sysctl_lowmem_reserve_ratio sysctl changes.
     */
    unsigned long        lowmem_reserve[MAX_NR_ZONES];

#ifdef CONFIG_NUMA
    int node;
    /*
     * zone reclaim becomes active if more unmapped pages exist.
     */
    unsigned long        min_unmapped_pages;
    unsigned long        min_slab_pages;
    struct per_cpu_pageset    *pageset[NR_CPUS];
#else
    struct per_cpu_pageset    pageset[NR_CPUS];
#endif
    /*
     * free areas of different sizes
     */
    spinlock_t        lock;
#ifdef CONFIG_MEMORY_HOTPLUG
    /* see spanned/present_pages for more description */
    seqlock_t        span_seqlock;
#endif
    struct free_area    free_area[MAX_ORDER];

#ifndef CONFIG_SPARSEMEM
    /*
     * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
     * In SPARSEMEM, this map is stored in struct mem_section
     */
    unsigned long        *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */


    ZONE_PADDING(_pad1_)

    /* Fields commonly accessed by the page reclaim scanner */
    spinlock_t        lru_lock;    
    struct {
        struct list_head list;
        unsigned long nr_scan;
    } lru[NR_LRU_LISTS];

    struct zone_reclaim_stat reclaim_stat;

    unsigned long        pages_scanned;       /* since last reclaim */
    unsigned long        flags;           /* zone flags, see below */

    /* Zone statistics */
    atomic_long_t        vm_stat[NR_VM_ZONE_STAT_ITEMS];

    /*
     * prev_priority holds the scanning priority for this zone.  It is
     * defined as the scanning priority at which we achieved our reclaim
     * target at the previous try_to_free_pages() or balance_pgdat()
     * invokation.
     *
     * We use prev_priority as a measure of how much stress page reclaim is
     * under - it drives the swappiness decision: whether to unmap mapped
     * pages.
     *
     * Access to both this field is quite racy even on uniprocessor.  But
     * it is expected to average out OK.
     */
    int prev_priority;

    /*
     * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
     * this zone's LRU.  Maintained by the pageout code.
     */
    unsigned int inactive_ratio;


    ZONE_PADDING(_pad2_)
    /* Rarely used or read-mostly fields */

    /*
     * wait_table        -- the array holding the hash table
     * wait_table_hash_nr_entries    -- the size of the hash table array
     * wait_table_bits    -- wait_table_size == (1 << wait_table_bits)
     *
     * The purpose of all these is to keep track of the people
     * waiting for a page to become available and make them
     * runnable again when possible. The trouble is that this
     * consumes a lot of space, especially when so few things
     * wait on pages at a given time. So instead of using
     * per-page waitqueues, we use a waitqueue hash table.
     *
     * The bucket discipline is to sleep on the same queue when
     * colliding and wake all in that wait queue when removing.
     * When something wakes, it must check to be sure its page is
     * truly available, a la thundering herd. The cost of a
     * collision is great, but given the expected load of the
     * table, they should be so rare as to be outweighed by the
     * benefits from the saved space.
     *
     * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
     * primary users of these fields, and in mm/page_alloc.c
     * free_area_init_core() performs the initialization of them.
     */
    wait_queue_head_t    * wait_table;
    unsigned long        wait_table_hash_nr_entries;
    unsigned long        wait_table_bits;

    /*
     * Discontig memory support fields.
     */
    struct pglist_data    *zone_pgdat;
    /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
    unsigned long        zone_start_pfn;

    /*
     * zone_start_pfn, spanned_pages and present_pages are all
     * protected by span_seqlock.  It is a seqlock because it has
     * to be read outside of zone->lock, and it is done in the main
     * allocator path.  But, it is written quite infrequently.
     *
     * The lock is declared along with zone->lock because it is
     * frequently read in proximity to zone->lock.  It's good to
     * give them a chance of being in the same cacheline.
     */
    unsigned long        spanned_pages;    /* total size, including holes */
    unsigned long        present_pages;    /* amount of memory (excluding holes) */

    /*
     * rarely used fields:
     */
    const char        *name;
};

1. The structures for borrowing memory

The central structure here is struct zonelist, which backs the pg_data_t->node_zonelists[] array.

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * If zlcache_ptr is not NULL, then it is just the address of zlcache,
 * as explained above.  If zlcache_ptr is NULL, there is no zlcache.
 * *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()    - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()    - Return the index of the zone for an entry
 * zonelist_node_idx()    - Return the index of the node for an entry
 */
struct zonelist {
    struct zonelist_cache *zlcache_ptr;             // NULL or &zlcache
    struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
    struct zonelist_cache zlcache;                 // optional ...
#endif
};

A related structure is struct zoneref:

struct zoneref {
    struct zone *zone;    /* Pointer to actual zone */
    int zone_idx;        /* zone_idx(zoneref->zone) */
};
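The accessor helpers named in the zonelist comment above (zonelist_zone(), zonelist_zone_idx(), zonelist_node_idx()) are thin wrappers around these two fields; in this kernel generation they look roughly like the following (quoted from memory, include/linux/mmzone.h):

static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
    return zoneref->zone;
}

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
    return zoneref->zone_idx;
}

static inline int zonelist_node_idx(struct zoneref *zoneref)
{
#ifdef CONFIG_NUMA
    return zoneref->zone->node;
#else
    return 0;
#endif
}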

The other related structure is struct zonelist_cache:

#ifdef CONFIG_NUMA

/*
 * The NUMA zonelists are doubled becausse we need zonelists that restrict the
 * allocations to a single node for GFP_THISNODE.
 *
 * [0]    : Zonelist with fallback
 * [1]    : No fallback (GFP_THISNODE)
 */
#define MAX_ZONELISTS 2

struct zonelist_cache {
    unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];        /* zone->nid */
    DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);    /* zone full? */
    unsigned long last_full_zap;        /* when last zap'd (jiffies) */
};
#else
#define MAX_ZONELISTS 1
struct zonelist_cache;
#endif

In pg_data_t->node_zonelists[MAX_ZONELISTS], the value of MAX_ZONELISTS depends on the CONFIG_NUMA configuration: 2 with NUMA, 1 without.

With NUMA enabled there are multiple memory nodes: node_zonelists[0] holds the fallback list, which may borrow from other nodes, while node_zonelists[1] holds the zonelist restricted to the node itself (used for GFP_THISNODE allocations).
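At allocation time the choice between the two lists is driven by the GFP flags. A sketch of how this looks in kernels of this generation (based on include/linux/gfp.h; __GFP_THISNODE selects list 1):

static inline int gfp_zonelist(gfp_t flags)
{
    if (NUMA_BUILD && unlikely(flags & __GFP_THISNODE))
        return 1;    /* the no-fallback, this-node-only list */

    return 0;        /* the normal fallback list */
}

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
    return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}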

1. set_zonelist_order

There are two strategies for borrowing memory. Node order: allocate from the current node first, and fall back to other nodes only when the local node has no usable memory; this favors allocation speed (locality).

Zone order: allocate from the cheapest (least precious) zone type first; if the local node has none, try the same zone type on other nodes, and only when that fails move on to the more precious zone types; this favors reliability, since scarce zones such as DMA are used last.
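For a hypothetical two-node machine where each node has only a DMA and a Normal zone, node 0's fallback list for a Normal-zone request would be ordered roughly like this under the two policies:

    node order:  Node0/Normal -> Node0/DMA -> Node1/Normal -> Node1/DMA
    zone order:  Node0/Normal -> Node1/Normal -> Node0/DMA -> Node1/DMA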

/*
 *  zonelist_order:
 *  0 = automatic detection of better ordering.
 *  1 = order by ([node] distance, -zonetype)
 *  2 = order by (-zonetype, [node] distance)
 *
 *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
 *  the same zonelist. So only NUMA can configure this param.
 */
#define ZONELIST_ORDER_DEFAULT  0
#define ZONELIST_ORDER_NODE     1
#define ZONELIST_ORDER_ZONE     2

For UMA memory there is only one node; both orders would produce the same zonelist, and the kernel simply uses ZONELIST_ORDER_ZONE.

current_zonelist_order is a global variable defined in mm/page_alloc.c as static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;

static void set_zonelist_order(void)
{
    current_zonelist_order = ZONELIST_ORDER_ZONE;
}

For NUMA memory there are multiple nodes, and the borrowing order has to be chosen according to how memory is distributed across the different zones.

static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;

static void set_zonelist_order(void)
{
    if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
        current_zonelist_order = default_zonelist_order();
    else
        current_zonelist_order = user_zonelist_order;
}

If there is no DMA zone, or the DMA zone is comparatively large, default_zonelist_order() selects node order (ZONELIST_ORDER_NODE); otherwise it selects zone order (ZONELIST_ORDER_ZONE).
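The following is a condensed sketch of that core check (the real default_zonelist_order() in mm/page_alloc.c also performs per-node balance checks, omitted here; the _sketch suffix marks this as an illustration, not the full function):

/* Condensed sketch of the decision in default_zonelist_order() */
static int default_zonelist_order_sketch(void)
{
    int nid, zone_type;
    unsigned long low_kmem_size = 0, total_size = 0;
    struct zone *z;

    for_each_online_node(nid) {
        for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
            z = &NODE_DATA(nid)->node_zones[zone_type];
            if (populated_zone(z)) {
                if (zone_type < ZONE_NORMAL)
                    low_kmem_size += z->present_pages;
                total_size += z->present_pages;
            }
        }
    }

    /* no DMA/DMA32 zone at all, or DMA/DMA32 makes up more than half of RAM */
    if (!low_kmem_size || low_kmem_size > total_size / 2)
        return ZONELIST_ORDER_NODE;

    return ZONELIST_ORDER_ZONE;
}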

2. __build_all_zonelists

The main work of building the fallback lists is done in the __build_all_zonelists function.

static int __build_all_zonelists(void *dummy)
{
    int nid;

    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);

        build_zonelists(pgdat);
        build_zonelist_cache(pgdat);
    }
    return 0;
}

2.1 build_zonelists initializes both pg_data_t->node_zonelists[0], the fallback list, and pg_data_t->node_zonelists[1], the zone list of the node itself.

The bulk of this work is filling in the zonelist->_zonerefs array; the definition of struct zoneref is repeated here:

struct zoneref {
    struct zone *zone;    /* Pointer to actual zone */
    int zone_idx;        /* zone_idx(zoneref->zone) */
};

The function that ultimately fills in a zoneref entry is zoneref_set_zone:

static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
    zoneref->zone = zone;
    zoneref->zone_idx = zone_idx(zone);
}
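zoneref_set_zone() is called from the per-node helper build_zonelists_node(), which appends a node's populated zones to _zonerefs from the highest zone type down to the lowest. In this kernel generation the helper looks roughly like this (quoted from memory, mm/page_alloc.c):

static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
                int nr_zones, enum zone_type zone_type)
{
    struct zone *zone;

    BUG_ON(zone_type >= MAX_NR_ZONES);
    zone_type++;

    do {
        zone_type--;
        zone = pgdat->node_zones + zone_type;
        if (populated_zone(zone)) {
            zoneref_set_zone(zone,
                &zonelist->_zonerefs[nr_zones++]);
            check_highest_zone(zone_type);
        }
    } while (zone_type);

    return nr_zones;
}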

2.2 build_zonelist_cache initializes node_zonelists[0].zlcache, i.e. the zonelist_cache structure.

The definition of zonelist_cache and the build_zonelist_cache function are quoted below. The purpose of this structure is to speed up zonelist scanning.

struct zonelist_cache {
    unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];        /* zone->nid */
    DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);    /* zone full? */
    unsigned long last_full_zap;        /* when last zap'd (jiffies) */
};

/* Construct the zonelist performance cache - see further mmzone.h */
static void build_zonelist_cache(pg_data_t *pgdat)
{
    struct zonelist *zonelist;
    struct zonelist_cache *zlc;
    struct zoneref *z;

    zonelist = &pgdat->node_zonelists[0];
    zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
    bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
    for (z = zonelist->_zonerefs; z->zone; z++)
        zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
}
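At allocation time the cache is consulted before each zone in the list is tried: z_to_n[] yields the zone's node id without touching the struct zone itself, and the fullzones bitmap lets recently full zones be skipped. A condensed sketch of that check, based on zlc_zone_worth_trying() in mm/page_alloc.c of this kernel generation:

/* Sketch of the fast check the allocator does against the zonelist cache */
static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
                        nodemask_t *allowednodes)
{
    struct zonelist_cache *zlc;    /* cached zonelist speedup info */
    int i;                         /* index of *z in zonelist->_zonerefs */
    int n;                         /* node that zone *z is on */

    zlc = zonelist->zlcache_ptr;
    if (!zlc)
        return 1;

    i = z - zonelist->_zonerefs;
    n = zlc->z_to_n[i];

    /* this zone is worth trying if it is allowed but not full */
    return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
}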

3. mminit_verify_zonelist()

This function prints the fallback-list information for every node in the system. The key piece worth explaining is how the for_each_zone_zonelist(zone, z, zonelist, zoneid) macro iterates over a zonelist.

3.1 The body of the for loop

/* 
Iterate the zonelist
struct zone *zone;  @zone - the current zone in the iteration (zone = &pgdat->node_zones[zoneid] before the loop)
struct zoneref *z;  @z - the current pointer within zonelist->_zonerefs being iterated
struct zonelist *zonelist;  fixed pointer to the pg_data_t fallback list
int zoneid;   fixed value used for comparison
 */
for_each_zone_zonelist(zone, z, zonelist, zoneid) 
    -->for_each_zone_zonelist_nodemask(zone, z, zonelist, zoneid, NULL)
        -->for (  z = first_zones_zonelist(zonelist, zoneid, NULL, &zone);    
              zone;                            
              z = next_zones_zonelist(++z, zoneid, NULL, &zone)        )    

3.2 Loop initialization

/*
struct zonelist *zonelist;  fixed pointer to the pg_data_t fallback list
struct zoneref *z;  @z - the current pointer within zonelist->_zonerefs being iterated
struct zone *zone;  @zone - the current zone in the iteration
int zoneid;   fixed value used for comparison
*/
z = first_zones_zonelist(zonelist, zoneid, NULL, &zone)
    -->next_zones_zonelist(zonelist->_zonerefs, zoneid, NULL, &zone); // returns a pointer to a struct zoneref
    

3.3 Advancing the loop

z = next_zones_zonelist(++z, zoneid, NULL, &zone)
    -->while (zonelist_zone_idx(z) > zoneid)    z++;
    -->*zone = zonelist_zone(z);
    -->return z;
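Putting it together, here is a condensed sketch of how mminit_verify_zonelist() uses the iterator for one node and one starting zone type (the function name print_fallback_chain is made up for illustration; the real function also distinguishes the NUMA case and prints zone->node):

/*
 * Sketch: print the fallback chain of node `nid` starting at zone type
 * `zoneid`, the way mminit_verify_zonelist() does for every combination.
 */
static void print_fallback_chain(int nid, int zoneid)
{
    pg_data_t *pgdat = NODE_DATA(nid);
    struct zonelist *zonelist = &pgdat->node_zonelists[0];
    struct zoneref *z;
    struct zone *zone;

    printk(KERN_DEBUG "zonelist %d:%s = ", nid,
           pgdat->node_zones[zoneid].name);
    for_each_zone_zonelist(zone, z, zonelist, zoneid)
        printk(KERN_CONT "%s ", zone->name);
    printk(KERN_CONT "\n");
}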
Original article: https://www.cnblogs.com/yangjiguang/p/9498939.html