内存管理(buddy system)

伙伴系统已经出现很长一段时间了,有了一些优化,看了一下,比想象中的复杂很多啊。

 /*
  * Allocate the struct page array (node_mem_map) for one node if it does not
  * exist yet and, on builds without multiple nodes, point the global mem_map
  * at node 0's array. Nothing is done for a node that spans no pages.
  */
 1 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
2 {
3 /* A node that spans no pages needs no mem_map: skip it */
4 if (!pgdat->node_spanned_pages)
5 return;
6
7 #ifdef CONFIG_FLAT_NODE_MEM_MAP
8 /* Allocate node_mem_map, the struct page array describing this node, only once */
9 if (!pgdat->node_mem_map) {
10 unsigned long size, start, end;
11 struct page *map;
12 /* Round the pfn range out to MAX_ORDER_NR_PAGES boundaries and size the array for it */
13 start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
14 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
15 end = ALIGN(end, MAX_ORDER_NR_PAGES);
16 size = (end - start) * sizeof(struct page);
17 /* Allocate the array: try alloc_remap() first, fall back to the boot-time allocator */
18 map = alloc_remap(pgdat->node_id, size);
19 if (!map)
20 map = alloc_bootmem_node(pgdat, size);
/* The array starts at the aligned pfn; offset it so node_mem_map corresponds to node_start_pfn */
21 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
22 }
23 #ifndef CONFIG_NEED_MULTIPLE_NODES
24 /* Single-node build: node 0's array also serves as the global mem_map */
25 if (pgdat == NODE_DATA(0)) {
26 mem_map = NODE_DATA(0)->node_mem_map;
27 #ifdef CONFIG_ARCH_POPULATES_NODE_MAP
/* If the node does not start at the pfn mem_map maps to, shift mem_map back by the offset from ARCH_PFN_OFFSET */
28 if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
29 mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
30 #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
31 }
32 #endif
33 #endif /* CONFIG_FLAT_NODE_MEM_MAP */
34 }

alloc_node_mem_map(struct pglist_data *pgdat)用来为节点分配struct page数组(node_mem_map)的空间,并在单节点配置下设置全局的mem_map;各个page本身的初始化要等到后面的memmap_init。

 /*
  * Initialise the per-node bookkeeping in pgdat and every zone of the node:
  * sizes, locks, LRU lists, per-CPU pagesets, and (for non-empty zones) the
  * usemap, the free lists and the struct page entries.
  *
  * zones_size / zholes_size are passed through to the helpers that compute
  * each zone's spanned and absent page counts.
  */
 1 static void __paginginit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size)
2 {
3 enum zone_type j;
4 int nid = pgdat->node_id;
5 unsigned long zone_start_pfn = pgdat->node_start_pfn;
6 int ret;
7
8 /* Set up the node's resize state (presumably the lock taken when the node's span is changed - confirm) */
9 pgdat_resize_init(pgdat);
10 pgdat->nr_zones = 0;
11 /* Initialise the wait queue head kswapd sleeps on for this node */
12 init_waitqueue_head(&pgdat->kswapd_wait);
13 pgdat->kswapd_max_order = 0;
14 /* Initialise each zone structure of the node in turn */
15 for (j = 0; j < MAX_NR_ZONES; j++) {
16 struct zone *zone = pgdat->node_zones + j;
17 unsigned long size, realsize, memmap_pages;
18 /* Number of pages spanned by this zone, holes included */
19 size = zone_spanned_pages_in_node(nid, j, zones_size);
20 /* Subtract the holes to get the pages really present */
21 realsize = size - zone_absent_pages_in_node(nid, j, zholes_size);
22 memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;
23 /* Deduct the pages taken up by the memmap (the struct page array for this zone) */
24 if (realsize >= memmap_pages) {
25 realsize -= memmap_pages;
26 printk(KERN_DEBUG" %s zone: %lu pages used for memmap\n",zone_names[j], memmap_pages);
27 } else
28 printk(KERN_WARNING" %s zone: %lu pages exceeds realsize %lu\n", zone_names[j], memmap_pages, realsize);
29 /* Deduct dma_reserve pages, but only from the first zone (index 0) and only if it is large enough */
30 if (j == 0 && realsize > dma_reserve) {
31 realsize -= dma_reserve;
32 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", zone_names[0], dma_reserve);
33 }
34 /* Update the global counters: nr_kernel_pages counts non-highmem zones only, nr_all_pages counts every zone */
35 if (!is_highmem_idx(j))
36 nr_kernel_pages += realsize;
37 nr_all_pages += realsize;
38 /* Record the sizes in the zone */
39 zone->spanned_pages = size;
40 zone->present_pages = realsize;
41 #ifdef CONFIG_NUMA
42 zone->node = nid;
43 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) / 100;
44 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
45 #endif
46 zone->name = zone_names[j];
47 spin_lock_init(&zone->lock);
48 spin_lock_init(&zone->lru_lock);
49 zone_seqlock_init(zone);
50 zone->zone_pgdat = pgdat;
51
52 zone->prev_priority = DEF_PRIORITY;
53
54 zone_pcp_init(zone);
55 INIT_LIST_HEAD(&zone->active_list);
56 INIT_LIST_HEAD(&zone->inactive_list);
57 zone->nr_scan_active = 0;
58 zone->nr_scan_inactive = 0;
59 /* Reset all of the zone's VM statistics to zero */
60 zap_zone_vm_stats(zone);
61 zone->flags = 0;
62 /* A zone that spans no pages needs no usemap, free lists or memmap: stop here */
63 if (!size)
64 continue;
65
66 set_pageblock_order(pageblock_default_order());
67 setup_usemap(pgdat, zone, size);
68 /* Initialise the zone's wait table and free lists; failure here is fatal */
69 ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY);
70 BUG_ON(ret);
71 /* Initialise the struct page entries covered by this zone */
72 memmap_init(size, nid, j, zone_start_pfn);
/* The next zone starts where this one ends */
73 zone_start_pfn += size;
74 }
75 }

上面这段代码初始化了zone结构(其中的各个字段,详细的都在注释里了)。其中还调用了几个函数:初始化per-CPU相关数据结构的zone_pcp_init、初始化free_area的init_currently_empty_zone、初始化内存域中各个页的memmap_init。zone_pcp_init函数的代码如下:

 /*
  * Set up the per-CPU pageset of a zone for every CPU slot and log the
  * zone's size and batch value when the zone has pages.
  */
 1 static __meminit void zone_pcp_init(struct zone *zone)
2 {
3 int cpu;
4 /* Batch size derived from the zone size by zone_batchsize() */
5 unsigned long batch = zone_batchsize(zone);
6 /* Handle each of the NR_CPUS slots in turn */
7 for (cpu = 0; cpu < NR_CPUS; cpu++) {
8 #ifdef CONFIG_NUMA
9 zone_pcp(zone, cpu) = &boot_pageset[cpu];
10 /* NUMA: every CPU starts out on the shared boot_pageset, set up with batch 0 */
11 setup_pageset(&boot_pageset[cpu],0);
12 #else
/* Non-NUMA: set up the zone's own pageset with the computed batch */
13 setup_pageset(zone_pcp(zone,cpu), batch);
14 #endif
15 }
16 if (zone->present_pages)
17 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
18 zone->name, zone->present_pages, batch);
19 }
/*
 * Compute the per-CPU batch value for a zone from its present page count.
 * Returns a value of the form 2^k - 1, never negative.
 */
20 static int zone_batchsize(struct zone *zone)
21 {
22 int batch;
23 /* Start from about 1/1024 of the zone's pages, capped at 512KB worth of pages */
24 batch = zone->present_pages / 1024;
25 if (batch * PAGE_SIZE > 512 * 1024) batch = (512 * 1024) / PAGE_SIZE;
/* Use a quarter of that, but at least one page */
26 batch /= 4;
27 if (batch < 1) batch = 1;
/* Round 1.5 * batch down to a power of two and subtract one (assumes fls() gives the 1-based index of the highest set bit - confirm) */
28 batch = (1 << (fls(batch + batch/2)-1)) - 1;
29 return batch;
30 }

其中,per-CPU缓存里的“热页”很可能此刻还在CPU的硬件高速缓存中。

下面是init_currently_empty_zone的代码:

 /*
  * Prepare a zone that holds no pages yet: set up its wait table, record its
  * start pfn, bump the node's zone count and create empty free lists.
  * Returns 0 on success or the error returned by zone_wait_table_init().
  * The context argument is not used in the body shown here.
  */
 1 __meminit int init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long 
2
3 size, enum memmap_context context)
4 {
5 struct pglist_data *pgdat = zone->zone_pgdat;
6 int ret;
7 /* Initialise the zone's wait-queue hash table and wait-queue heads */
8 ret = zone_wait_table_init(zone, size);
9 if (ret)
10 return ret;
/* The node now has zones up to and including this one */
11 pgdat->nr_zones = zone_idx(zone) + 1;
12 zone->zone_start_pfn = zone_start_pfn;
13 /* Initialise the zone's lists of free blocks */
14 zone_init_free_lists(zone);
15 return 0;
16 }
/* Make every free list of the zone empty and reset the free-block counters. */
17 static void __meminit zone_init_free_lists(struct zone *zone)
18 {
19 int order, t;
20 /* The macro expands to a nested loop over every order and every migrate type; each list starts out empty */
21 for_each_migratetype_order(order, t) {
22 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
23 zone->free_area[order].nr_free = 0;
24 }
25 }
/* Iterate over every (order, migrate type) pair: orders 0..MAX_ORDER-1 outside, types 0..MIGRATE_TYPES-1 inside. */
26 #define for_each_migratetype_order(order, type) \
27 for (order = 0; order < MAX_ORDER; order++) \
28 for (type = 0; type < MIGRATE_TYPES; type++)

这段代码比想象中的好理解吧(也许我想错了)。下面是初始化mem_map数组的代码(memmap_init):

 /*
  * memmap_init(): initialise a zone's struct page entries by calling
  * memmap_init_zone() with the MEMMAP_EARLY context.
  * NOTE(review): the #endif below closes a conditional whose opening line is
  * not part of this excerpt.
  */
 1 #define memmap_init(size, nid, zone, start_pfn) \
2 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3 #endif
/*
 * Initialise the struct page of every pfn in [start_pfn, start_pfn + size)
 * for zone index `zone` of node `nid`: zone/node links, reference counts,
 * the reserved flag, the lru list head and, at pageblock boundaries inside
 * the zone, the MIGRATE_MOVABLE pageblock type.
 */
4 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5 unsigned long start_pfn, enum memmap_context context)
6 {
7 struct page *page;
8 unsigned long end_pfn = start_pfn + size;
9 unsigned long pfn;
10 struct zone *z;
11
12 z = &NODE_DATA(nid)->node_zones[zone];
13 /* Walk every pfn of the range */
14 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/* In the MEMMAP_EARLY context, skip pfns rejected by the early validity and node-membership checks */
15 if (context == MEMMAP_EARLY) {
16 if (!early_pfn_valid(pfn))
17 continue;
18 if (!early_pfn_in_nid(pfn, nid))
19 continue;
20 }
21 /* Look up the struct page for this pfn */
22 page = pfn_to_page(pfn);
23 /* Record the zone and node links of the page (kept in page->flags according to the author's note) */
24 set_page_links(page, zone, nid, pfn);
25 /* Set page->_count to 1 */
26 init_page_count(page);
27 /* Set page->_mapcount to -1 */
28 reset_page_mapcount(page);
29 /* Mark the page as reserved */
30 SetPageReserved(page);
31 /* For the first page of each pageblock that lies inside the zone, mark the whole block MIGRATE_MOVABLE */
32 if ((z->zone_start_pfn <= pfn)
33 && (pfn < z->zone_start_pfn + z->spanned_pages)
34 && !(pfn & (pageblock_nr_pages - 1)))
35 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
36
37 INIT_LIST_HEAD(&page->lru);
38 #ifdef WANT_PAGE_VIRTUAL
39 /* Only pages outside highmem get their kernel virtual address recorded */
40 if (!is_highmem_idx(zone))
41 set_page_address(page, __va(pfn << PAGE_SHIFT));
42 #endif
43 }
44 }

通过上面这几步就建立了buddy system需要的数据结构。下面就开始用它来管理内存了。下面是分配页面的代码:

  /*
   * Allocate 2^order pages, preferring node `nid` (or the node returned by
   * numa_node_id() when nid is negative). Returns NULL for order >= MAX_ORDER
   * or when __alloc_pages() fails.
   */
  1 static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
2 {
3 /* Blocks of order MAX_ORDER and above cannot be allocated */
4 if (unlikely(order >= MAX_ORDER))
5 return NULL;
6 /* A negative nid means no node was chosen: use the node reported by numa_node_id() (presumably the current CPU's node) */
7 if (nid < 0)
8 nid = numa_node_id();
9 /* The real allocation function (the heart of the buddy system), given the node's zonelist selected by gfp_mask */
10 return __alloc_pages(gfp_mask, order, NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_mask));
11 }
/*
 * Core page allocation routine. Tries get_page_from_freelist() with
 * progressively weaker constraints: low watermark, then min watermark after
 * waking kswapd, then no watermarks for PF_MEMALLOC/TIF_MEMDIE tasks, then
 * direct reclaim, and finally the OOM killer. Returns the page or NULL.
 */
12 struct page* __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist)
13 {
14 const gfp_t wait = gfp_mask & __GFP_WAIT;
15 struct zone **z;
16 struct page *page;
17 struct reclaim_state reclaim_state;
18 struct task_struct *p = current;
19 int do_retry;
20 int alloc_flags;
21 int did_some_progress;
22
23 /* Debug check: the caller may sleep here when __GFP_WAIT is set */
24 might_sleep_if(wait);
25 /* Ask should_fail_alloc_page() (fail_page_alloc settings and the gfp flags) whether to fail right away */
26 if (should_fail_alloc_page(gfp_mask, order))
27 return NULL;
28
29 restart:
30 z = zonelist->zones;
/* An empty zonelist cannot satisfy anything */
31 if (unlikely(*z == NULL)) {
32 return NULL;
33 }
34 /* First attempt: low watermark, cpuset enforced as a hard wall */
35 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
36 if (page)
37 goto got_pg;
38 /* On NUMA builds a GFP_THISNODE request gives up here instead of trying harder */
39 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
40 goto nopage;
41 /* Wake kswapd for every zone in the zonelist */
42 for (z = zonelist->zones; *z; z++)
43 wakeup_kswapd(*z, order);
44
45 /* Relax the constraints: min watermark, plus HARDER / HIGH / CPUSET depending on the caller */
46 alloc_flags = ALLOC_WMARK_MIN;
47 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
48 alloc_flags |= ALLOC_HARDER;
49 if (gfp_mask & __GFP_HIGH)
50 alloc_flags |= ALLOC_HIGH;
51 if (wait)
52 alloc_flags |= ALLOC_CPUSET;
53 /* Second attempt with the relaxed flags */
54 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
55 if (page)
56 goto got_pg;
57
58 rebalance:
/* Tasks flagged PF_MEMALLOC or TIF_MEMDIE (outside interrupt context) may ignore watermarks unless __GFP_NOMEMALLOC forbids it */
59 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) {
60 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
61 nofail_alloc:
62 /* Try once more, ignoring the watermarks */
63 page = get_page_from_freelist(gfp_mask, order, zonelist, ALLOC_NO_WATERMARKS);
64 if (page)
65 goto got_pg;
66 /* __GFP_NOFAIL: the allocation must not fail, so wait a little and try again */
67 if (gfp_mask & __GFP_NOFAIL) {
68 congestion_wait(WRITE, HZ/50);
69 goto nofail_alloc;
70 }
71 }
72 goto nopage;
73 }
74
/* Callers that cannot wait stop here */
75 if (!wait)
76 goto nopage;
77 /* Give a higher-priority task the chance to run */
78 cond_resched();
79
80 /* Start synchronous (direct) reclaim; PF_MEMALLOC is set on the task for its duration */
81 cpuset_memory_pressure_bump();
82 p->flags |= PF_MEMALLOC;
83 reclaim_state.reclaimed_slab = 0;
84 p->reclaim_state = &reclaim_state;
85 /* Try to free some pages */
86 did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
87
88 p->reclaim_state = NULL;
89 p->flags &= ~PF_MEMALLOC;
90
91 /* Give a higher-priority task the chance to run */
92 cond_resched();
93 /* For multi-page requests, release the pages held in the per-CPU lists */
94 if (order != 0) drain_all_pages();
95 /* If reclaim freed something, try the allocation again */
96 if (likely(did_some_progress)) {
97 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
98 if (page)
99 goto got_pg;
100 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
101 /* No progress: consider the OOM killer. If the zonelist cannot be claimed for OOM handling, sleep briefly and start over */
102 if (!try_set_zone_oom(zonelist)) {
103 schedule_timeout_uninterruptible(1);
104 goto restart;
105 }
/* One more attempt against the high watermark before killing anything */
106 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
107 if (page) {
108 /* Clear the zonelist OOM marker: the OOM killer is not needed */
109 clear_zonelist_oom(zonelist);
110 goto got_pg;
111 }
112 /* Failing a very large allocation is normal: do not kill a process for it */
113 if (order > PAGE_ALLOC_COSTLY_ORDER) {
114 /* Clear the zonelist OOM marker: the OOM killer is not invoked */
115 clear_zonelist_oom(zonelist);
116 goto nopage;
117 }
118 /* Pick a process and kill it, then start over */
119 out_of_memory(zonelist, gfp_mask, order);
120 clear_zonelist_oom(zonelist);
121 goto restart;
122 }
123
/* Decide whether to loop: small orders, __GFP_REPEAT and __GFP_NOFAIL retry unless __GFP_NORETRY is set */
124 do_retry = 0;
125 if (!(gfp_mask & __GFP_NORETRY)) {
126 if ((order <= PAGE_ALLOC_COSTLY_ORDER) || (gfp_mask & __GFP_REPEAT))
127 do_retry = 1;
128 if (gfp_mask & __GFP_NOFAIL)
129 do_retry = 1;
130 }
131 /* Wait for some writes to complete, then retry */
132 if (do_retry) {
133 congestion_wait(WRITE, HZ/50);
134 goto rebalance;
135 }
136
137 nopage:
138 /* Give up: report the failure unless __GFP_NOWARN is set or the message is rate-limited */
139 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
140 printk(KERN_WARNING "%s: page allocation failure." " order:%d, mode:0x%x\n",p->comm, order, gfp_mask);
141 dump_stack();
142 show_mem();
143 }
144 got_pg:
145 return page;
146 }

可以好好研究这段代码,我们会发现内核在分配页面的时候真的尽力了。先是尝试一下(大部分的分配这个时候会成功),如果不行的话就开始进程调度、“没收”其他地方占用的内存,甚至不惜杀死进程。可以看出,如果没有内存了,我们找谁要?谁有就朝谁要(有点像《让子弹飞》里面的意思了)。下面是从buddy system中具体的分配页面的代码(很好理解,但是细节比较多):

  /*
   * Walk the zonelist and return a block of 2^order pages from the first
   * zone that passes the cpuset and watermark checks selected by
   * alloc_flags. Returns NULL when no zone can satisfy the request.
   */
  1 static struct page* get_page_from_freelist(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, int alloc_flags)
2 {
3 struct zone **z;
4 struct page *page = NULL;
5 int classzone_idx = zone_idx(zonelist->zones[0]);
6 struct zone *zone;
7 nodemask_t *allowednodes = NULL;
8 int zlc_active = 0;
9 int did_zlc_setup = 0;
10 enum zone_type highest_zoneidx = -1;
11
12 zonelist_scan:
13 z = zonelist->zones;
14 /* Scan the zonelist for a zone with enough free pages */
15 do {
16 /* When the zonelist needs filtering, skip zones above the highest zone index allowed by gfp_mask */
17 if (unlikely(alloc_should_filter_zonelist(zonelist))) {
18 if (highest_zoneidx == -1)
19 highest_zoneidx = gfp_zone(gfp_mask);
20 if (zone_idx(*z) > highest_zoneidx)
21 continue;
22 }
23
/* NUMA builds: while the zonelist cache is active, skip zones it reports as not worth trying */
24 if (NUMA_BUILD && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes))
25 continue;
26 zone = *z;
27 /* Honour cpuset restrictions when ALLOC_CPUSET is requested */
28 if ((alloc_flags & ALLOC_CPUSET) &&
29 !cpuset_zone_allowed_softwall(zone, gfp_mask))
30 goto try_next_zone;
31 /* Unless watermarks are ignored, pick the watermark selected by alloc_flags and check the zone against it */
32 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
33 unsigned long mark;
34 if (alloc_flags & ALLOC_WMARK_MIN)
35 mark = zone->pages_min;
36 else if (alloc_flags & ALLOC_WMARK_LOW)
37 mark = zone->pages_low;
38 else
39 mark = zone->pages_high;
40 if (!zone_watermark_ok(zone, order, mark, classzone_idx, alloc_flags)) {
/* Below the watermark: treat the zone as full unless zone reclaim is enabled and succeeds */
41 if (!zone_reclaim_mode || !zone_reclaim(zone, gfp_mask, order))
42 goto this_zone_full;
43 }
44 }
45 /* The actual allocation */
46 page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
47 if (page)
48 break;
49 this_zone_full:
50 /* NUMA builds: record in the zonelist cache that this zone is full */
51 if (NUMA_BUILD)
52 zlc_mark_zone_full(zonelist, z);
53 try_next_zone:
54 /* NUMA builds: set up and activate the zonelist cache once, after the first zone has been examined */
55 if (NUMA_BUILD && !did_zlc_setup) {
56 allowednodes = zlc_setup(zonelist, alloc_flags);
57 zlc_active = 1;
58 did_zlc_setup = 1;
59 }
60 } while (*(++z) != NULL);
61 /* Nothing found with the zonelist cache active: disable it and scan once more, then give up */
62 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
63 zlc_active = 0;
64 goto zonelist_scan;
65 }
66 return page;
67 }
68 /* Two paths from here: order-0 requests are served from the per-CPU page cache, larger ones straight from the buddy free lists. Returns NULL on failure. */
69 static struct page *buffered_rmqueue(struct zonelist *zonelist, struct zone *zone, int order, gfp_t gfp_flags)
70 {
71 unsigned long flags;
72 struct page *page;
73 int cold = !!(gfp_flags & __GFP_COLD);
74 struct per_cpu_pageset *pset;
75 int migratetype = allocflags_to_migratetype(gfp_flags);
76 int this_cpu;
77 again:
78 /* Get the per_cpu_pageset of the zone (paired with put_zone_pcp() below; presumably this takes a lock - confirm) */
79 pset = get_zone_pcp(zone, &flags, &this_cpu);
80 /* Single-page request: by far the most common case */
81 if (likely(order == 0)) {
82 /* The per-CPU page list */
83 struct per_cpu_pages *pcp = &pset->pcp;
84 /* The cache holds no pages */
85 if (!pcp->count) {
86 /* Refill the cache with a batch of pages from the buddy system */
87 pcp->count = rmqueue_bulk(zone, 0, pcp->batch, &pcp->list, migratetype);
88 if (unlikely(!pcp->count))
89 goto failed;
90 }
91 /* Search the cache for a page of the wanted migrate type: cold requests scan from the tail, hot ones from the head */
92 if (cold) {
93 list_for_each_entry_reverse(page, &pcp->list, lru)
94 if (page_private(page) == migratetype)
95 break;
96 } else {
97 list_for_each_entry(page, &pcp->list, lru)
98 if (page_private(page) == migratetype)
99 break;
100 }
101
102 /* The scan ran off the list (no page of that type cached): pull in another batch and take the first entry */
103 if (unlikely(&page->lru == &pcp->list)) {
104 pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list, migratetype);
105 page = list_entry(pcp->list.next, struct page, lru);
106 }
107
108 list_del(&page->lru);
109 pcp->count--;
110 } else {
111 /* Multi-page requests cannot be served from the cache: go to the buddy lists under zone->lock */
112 spin_lock(&zone->lock);
113 page = __rmqueue(zone, order, migratetype);
114 spin_unlock(&zone->lock);
115 if (!page)
116 goto failed;
117 }
118
119 __count_zone_vm_events(PGALLOC, zone, 1 << order);
120 /* Update the allocation statistics */
121 zone_statistics(zonelist, zone);
122 /* Release what get_zone_pcp() acquired */
123 put_zone_pcp(zone, flags, this_cpu);
124
125 VM_BUG_ON(bad_range(zone, page));
/* A non-zero return from prep_new_page() rejects this page: start over */
126 if (prep_new_page(page, order, gfp_flags))
127 goto again;
128 return page;
129 failed:
130 put_zone_pcp(zone, flags, this_cpu);
131 return NULL;
132 }
133 /* Remove one block of the given order from the buddy system; returns NULL if nothing is available. */
134 static struct page *__rmqueue(struct zone *zone, unsigned int order, int migratetype)
135 {
136 struct page *page;
137 /* First try the smallest adequate block of the requested migrate type */
138 page = __rmqueue_smallest(zone, order, migratetype);
139 /* Nothing of that type: fall back to the lists of other migrate types */
140 if (unlikely(!page))
141 page = __rmqueue_fallback(zone, order, migratetype);
142
143 return page;
144 }
145 /* Take the smallest free block of at least the requested order from the given migrate type's lists; returns NULL if every order is empty. */
146 static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype)
147 {
148 unsigned int current_order;
149 struct free_area * area;
150 struct page *page;
151
152 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
153 area = &(zone->free_area[current_order]);
154 /* This free_area has nothing of the wanted type: look at the next larger order */
155 if (list_empty(&area->free_list[migratetype]))
156 continue;
157 /* Take the first block on the list and unlink it */
158 page = list_entry(area->free_list[migratetype].next, struct page, lru);
159 list_del(&page->lru);
160 /* Clear the buddy marking and reset page->private to 0 */
161 rmv_page_order(page);
162 area->nr_free--;
163 /* Lower NR_FREE_PAGES by the 1 << order pages being handed out */
164 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
165 /* Split the block down to the requested order, returning the unused halves to the free lists */
166 expand(zone, page, order, current_order, area, migratetype);
167 return page;
168 }
169 return NULL;
170 }
171 /* Split a block of order `high` down to order `low`: at each step the upper half goes back on the free list of its order and the lower half is kept. `area` must be the free_area of order `high`. */
172 static inline void expand(struct zone *zone, struct page *page, int low, int high, struct free_area *area, int migratetype)
173 {
174 unsigned long size = 1 << high;
175
176 while (high > low) {
/* Step down one order: previous free_area entry, half the size */
177 area--;
178 high--;
179 size >>= 1;
180 VM_BUG_ON(bad_range(zone, &page[size]));
181 /* The upper half (starting at page[size]) is returned to the free list; the lower half starting at page stays with the caller */
182 list_add(&page[size].lru, &area->free_list[migratetype]);
183 area->nr_free++;
184 set_page_order(&page[size], high);
185 }
186 }
187 /* Fallback path: when the requested migrate type has no free block, take one from the other migrate types in the order given by the fallbacks[] table, and finally from MIGRATE_RESERVE. */
188 static struct page *__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
189 {
190 struct free_area * area;
191 int current_order;
192 struct page *page;
193 int migratetype, i;
194 /* This time scan from the largest order down to the requested one */
195 for (current_order = MAX_ORDER-1; current_order >= order; --current_order) {
196 /* Try each fallback type listed for start_migratetype */
197 for (i = 0; i < MIGRATE_TYPES - 1; i++) {
198 migratetype = fallbacks[start_migratetype][i];
/* MIGRATE_RESERVE is skipped here; it is only used as the last resort at the end of the function */
199 if (migratetype == MIGRATE_RESERVE)
200 continue;
201
202 area = &(zone->free_area[current_order]);
203 if (list_empty(&area->free_list[migratetype]))
204 continue;
205 /* Found a non-empty list: take its first block */
206 page = list_entry(area->free_list[migratetype].next, struct page, lru);
207 area->nr_free--;
208
/* For large blocks, or when allocating for MIGRATE_RECLAIMABLE, move the free pages of the pageblock over to start_migratetype */
209 if (unlikely(current_order >= (pageblock_order >> 1)) || start_migratetype == MIGRATE_RECLAIMABLE) {
210 unsigned long pages;
211 pages = move_freepages_block(zone, page, start_migratetype);
212
/* If at least half of the pageblock was moved, retype the whole pageblock */
213 if (pages >= (1 << (pageblock_order-1)))
214 set_pageblock_migratetype(page, start_migratetype);
215
216 migratetype = start_migratetype;
217 }
218
219 list_del(&page->lru);
220 rmv_page_order(page);
221 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order));
222
/* A block that is exactly one pageblock takes on the requested type */
223 if (current_order == pageblock_order)
224 set_pageblock_migratetype(page, start_migratetype);
225
/* Split the block; the leftovers go on the lists of migratetype */
226 expand(zone, page, order, current_order, area, migratetype);
227 return page;
228 }
229 }
230 /* Last resort: take from the MIGRATE_RESERVE lists */
231 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
232 }

以前在看书的时候有个问题不怎么清楚,在释放page的时候怎么知道它在freearea中的位置?所有的答案都在代码中:

  1 /* Entry point for freeing pages: nothing is freed unless put_page_testzero() returns true; order-0 pages then go to the per-CPU cache, larger blocks to the buddy system. */
2 void __free_pages(struct page *page, unsigned int order)
3 {
4 if (put_page_testzero(page)) {
5 if (order == 0)
6 free_hot_page(page); /* a single page is put into the per-CPU page cache */
7 else
8 __free_pages_ok(page, order); /* free a multi-page block */
9 }
10 }
/* Free a single page as a "hot" page (cold == 0). */
11 void free_hot_page(struct page *page)
12 {
13 free_hot_cold_page(page, 0);
14 }
15 /* Free a single page into the per-CPU cache as a "cold" page (list tail) or a "hot" page (list head). */
16 static void free_hot_cold_page(struct page *page, int cold)
17 {
18 struct zone *zone = page_zone(page);
19 struct per_cpu_pageset *pset;
20 struct per_cpu_pages *pcp;
21 unsigned long flags;
22 int this_cpu;
23 /* An anonymous page has its mapping pointer cleared */
24 if (PageAnon(page))
25 page->mapping = NULL;
26 /* Sanity-check the page; a page that fails the check is not freed */
27 if (free_pages_check(page))
28 return;
29 /* Only for pages that are NOT in highmem */
30 if (!PageHighMem(page))
31 debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
32 arch_free_page(page, 0);
33 kernel_map_pages(page, 1, 0);
34
35 /* Get the per-CPU page list of the zone (paired with put_zone_pcp() below) */
36 pset = get_zone_pcp(zone, &flags, &this_cpu);
37 pcp = &pset->pcp;
38
39 count_vm_event(PGFREE);
40
/* Cold pages go to the tail of the list, hot pages to the head */
41 if (cold)
42 list_add_tail(&page->lru, &pcp->list);
43 else
44 list_add(&page->lru, &pcp->list);
/* Remember the pageblock's migrate type in page->private; buffered_rmqueue() matches on it */
45 set_page_private(page, get_pageblock_migratetype(page));
46 pcp->count++;
47 /* Once the count reaches the high mark, hand batch pages back to the buddy system */
48 if (pcp->count >= pcp->high) {
49 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
50 pcp->count -= pcp->batch;
51 }
52 /* Release what get_zone_pcp() acquired (presumably a lock - confirm) */
53 put_zone_pcp(zone, flags, this_cpu);
54 }
/* Free a block of 2^order pages directly into the buddy system, bypassing the per-CPU cache. */
55 static void __free_pages_ok(struct page *page, unsigned int order)
56 {
57 unsigned long flags;
58 int reserved = 0;
59 int this_cpu;
60 int i;
61 /* Sanity-check every page of the block; if any check fails, free nothing */
62 for (i = 0 ; i < (1 << order) ; ++i)
63 reserved += free_pages_check(page + i);
64 if (reserved)
65 return;
/* Only for blocks that are NOT in highmem */
66 if (!PageHighMem(page))
67 debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
68 arch_free_page(page, order);
69 kernel_map_pages(page, 1 << order, 0);
70
71 lock_cpu_pcp(&flags, &this_cpu);
72 count_vm_events(PGFREE, 1 << order);
73 /* Free the block into the buddy lists */
74 free_one_page(page_zone(page), page, order);
75 unlock_cpu_pcp(flags, this_cpu);
76 }
/* Free one block into the zone's buddy lists while holding zone->lock; also clears ZONE_ALL_UNRECLAIMABLE and resets pages_scanned. */
77 static void free_one_page(struct zone *zone, struct page *page, int order)
78 {
79 spin_lock(&zone->lock);
80 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
81 zone->pages_scanned = 0;
82 __free_one_page(page, zone, order);
83 spin_unlock(&zone->lock);
84 }
85 /* This is where a freed block finds its place in free_area: it is merged with its free buddy repeatedly, one order at a time, and the resulting block is put on the free list of its final order. */
86 static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order)
87 {
88 unsigned long page_idx;
89 int order_size = 1 << order;
90 /* Migrate type of the pageblock that contains the page */
91 int migratetype = get_pageblock_migratetype(page);
92 /* A compound page is torn down first */
93 if (unlikely(PageCompound(page)))
94 destroy_compound_page(page, order);
95
/* Index of the page within its MAX_ORDER-sized region */
96 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
97
/* The block must be aligned to its own size and lie inside the zone */
98 VM_BUG_ON(page_idx & (order_size - 1));
99 VM_BUG_ON(bad_range(zone, page));
100
101 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
102 /* Merge upwards, from the current order towards MAX_ORDER-1 */
103 while (order < MAX_ORDER-1) {
104 unsigned long combined_idx;
105 struct page *buddy;
106 /* Compute where the buddy would be (its location only) */
107 buddy = __page_find_buddy(page, page_idx, order);
108 /* Stop merging as soon as that page is not a buddy of this order */
109 if (!page_is_buddy(page, buddy, order))
110 break;
111 /* Unlink the buddy from the free list it is on */
112 list_del(&buddy->lru);
113 zone->free_area[order].nr_free--;
114 /* Clear the buddy's order marking */
115 rmv_page_order(buddy);
116 /* Locate the merged block and continue from there at the next order */
117 combined_idx = __find_combined_index(page_idx, order);
118 page = page + (combined_idx - page_idx);
119 page_idx = combined_idx;
120 order++;
121 }
122 /* Finally record the order on the resulting block and add it to the matching free list */
123 set_page_order(page, order);
124 list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
125 zone->free_area[order].nr_free++;
126 }
/* Return the struct page at the buddy position of `page` for the given order. */
127 static inline struct page * __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
128 {
129 /* The buddy's index differs from page_idx only in bit `order`, so flip that bit and apply the index difference to the page pointer */
130 unsigned long buddy_idx = page_idx ^ (1 << order);
131 return page + (buddy_idx - page_idx);
132 }

这里的代码都很好理解,看的过程也都做了注释。先是定下策略,怎么找?找不到怎么办?然后找的时候检查标志,看是否用缓存。最后就是找的过程了。看的匆忙,难免有错的地方。

----------------------------

个人理解,欢迎拍砖。

原文地址:https://www.cnblogs.com/ggzwtj/p/2126408.html