Analysis of the pageset member of the memory zone

    struct per_cpu_pageset __percpu *pageset;
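
For context, the structures behind this member look roughly like the following in kernels of the vintage quoted below (the statistics fields are omitted here, and the exact field set varies between versions):

    /* Per-CPU cache of 0-order pages sitting in front of the buddy allocator. */
    struct per_cpu_pages {
        int count;              /* number of pages on the lists */
        int high;               /* high watermark, emptying needed */
        int batch;              /* chunk size for buddy add/remove */

        /* one list of cached pages per migrate type */
        struct list_head lists[MIGRATE_PCPTYPES];
    };

    struct per_cpu_pageset {
        struct per_cpu_pages pcp;
        /* (NUMA/SMP statistics fields omitted) */
    };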

First, let's analyze a function, __free_pages. It is an API provided by the Buddy System for freeing a previously allocated group of pages (how many pages depends on the order).

    void __free_pages(struct page *page, unsigned int order)
    {
        if (put_page_testzero(page)) {
            if (order == 0)
                free_hot_cold_page(page, 0);
            else
                __free_pages_ok(page, order);
        }
    }
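
As a quick usage sketch (not part of the quoted source; the wrapper function is hypothetical): pages obtained from alloc_pages are handed back through this interface with the same order.

    #include <linux/gfp.h>
    #include <linux/mm.h>

    /* Sketch: allocate 2^2 = 4 contiguous pages, then hand them back to the Buddy System. */
    static void alloc_free_sketch(void)
    {
        struct page *pg = alloc_pages(GFP_KERNEL, 2);

        if (pg)
            __free_pages(pg, 2);    /* the order must match the one used at allocation time */
    }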

First, put_page_testzero is called to check whether the page still has other references (the _count field in struct page).

That is, the current reference is dropped first (the count is decremented by 1), and then the count is checked to see whether it has fallen to zero.

    /*
     * Drop a ref, return true if the refcount fell to zero (the page has no users)
     */
    static inline int put_page_testzero(struct page *page)
    {
        VM_BUG_ON(atomic_read(&page->_count) == 0);
        return atomic_dec_and_test(&page->_count);
    }

The atomic_xxx helpers here are the kernel's atomic-operation primitives; they are worth a deeper look if you are interested.
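
A minimal sketch of the atomic_t primitives that put_page_testzero relies on (the wrapper function is only illustrative):

    #include <linux/atomic.h>
    #include <linux/printk.h>

    /* Reference counting with the kernel's atomic primitives. */
    static void refcount_sketch(void)
    {
        atomic_t refs = ATOMIC_INIT(2);         /* two users currently hold the object */

        if (atomic_dec_and_test(&refs))         /* 2 -> 1: not zero, so this returns false */
            pr_info("not reached\n");

        if (atomic_dec_and_test(&refs))         /* 1 -> 0: returns true, last reference gone */
            pr_info("last reference dropped, safe to free\n");
    }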

Next, if order is 0, meaning only a single page needs to be freed, the free_hot_cold_page function is called; otherwise the higher-order block goes through __free_pages_ok.

    /*
     * Free a 0-order page
     * cold == 1 ? free a cold page : free a hot page
     */
    void free_hot_cold_page(struct page *page, int cold)
    {
        struct zone *zone = page_zone(page);
        struct per_cpu_pages *pcp;
        unsigned long flags;
        int migratetype;
        int wasMlocked = __TestClearPageMlocked(page);

        if (!free_pages_prepare(page, 0))
            return;

        migratetype = get_pageblock_migratetype(page);
        set_page_private(page, migratetype);
        local_irq_save(flags);
        if (unlikely(wasMlocked))
            free_page_mlock(page);
        __count_vm_event(PGFREE);

        /*
         * We only track unmovable, reclaimable and movable on pcp lists.
         * Free ISOLATE pages back to the allocator because they are being
         * offlined but treat RESERVE as movable pages so we can get those
         * areas back if necessary. Otherwise, we may have to free
         * excessively into the page allocator
         */
        if (migratetype >= MIGRATE_PCPTYPES) {
            if (unlikely(migratetype == MIGRATE_ISOLATE)) {
                free_one_page(zone, page, 0, migratetype);
                goto out;
            }
            migratetype = MIGRATE_MOVABLE;
        }

        pcp = &this_cpu_ptr(zone->pageset)->pcp;
        if (cold)
            list_add_tail(&page->lru, &pcp->lists[migratetype]);
        else
            list_add(&page->lru, &pcp->lists[migratetype]);
        pcp->count++;
        if (pcp->count >= pcp->high) {
            free_pcppages_bulk(zone, pcp->batch, pcp);
            pcp->count -= pcp->batch;
        }

    out:
        local_irq_restore(flags);
    }
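
The cold flag only decides which end of the per-CPU list the page is placed on; once count reaches high, a batch of pages is drained back to the buddy allocator via free_pcppages_bulk. A small worked illustration of that count/high/batch logic, using made-up threshold values (the real values are tuned per zone at boot):

    /* Illustration only: made-up thresholds, mirroring the count/high/batch logic above. */
    static int pcp_drain_sketch(void)
    {
        int count = 185, high = 186, batch = 31;

        count++;                    /* one more page has just been cached: count == 186 */
        if (count >= high)          /* high watermark reached */
            count -= batch;         /* 31 pages are handed back to the buddy lists */

        return count;               /* 155 pages remain cached on this CPU */
    }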

page_zone is the function that finds the zone a given page belongs to; concretely, page->flags contains dedicated bits recording which zone the page was allocated from.
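
A sketch of how those bits are read back, following the helpers of the same kernel era (exact signatures may differ slightly between versions):

    /* Recover the zone from the bits stored in page->flags (see set_page_zone() further below). */
    static inline enum zone_type page_zonenum(const struct page *page)
    {
        return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
    }

    static inline struct zone *page_zone(const struct page *page)
    {
        return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
    }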

So when does page->flags start carrying this information?

First, all struct page instances are kept in a region of memory pointed to by the node_mem_map member of pglist_data.
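
Under the FLATMEM memory model this region is the global mem_map array, and pfn_to_page, which appears in the initialization code below, is essentially plain array indexing:

    /* FLATMEM sketch: a pfn is just an index into the mem_map array (offset by ARCH_PFN_OFFSET). */
    #define __pfn_to_page(pfn)      (mem_map + ((pfn) - ARCH_PFN_OFFSET))
    #define __page_to_pfn(page)     ((unsigned long)((page) - mem_map) + ARCH_PFN_OFFSET)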

    /*
     * Initially all pages are reserved - free ones are freed
     * up by free_all_bootmem() once the early boot process is
     * done. Non-atomic initialization, single-pass.
     */
    void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
            unsigned long start_pfn, enum memmap_context context)
    {
        struct page *page;
        unsigned long end_pfn = start_pfn + size;
        unsigned long pfn;
        struct zone *z;

        if (highest_memmap_pfn < end_pfn - 1)
            highest_memmap_pfn = end_pfn - 1;

        z = &NODE_DATA(nid)->node_zones[zone];
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
            /*
             * There can be holes in boot-time mem_map[]s
             * handed to this function.  They do not
             * exist on hotplugged memory.
             */
            if (context == MEMMAP_EARLY) {
                if (!early_pfn_valid(pfn))
                    continue;
                if (!early_pfn_in_nid(pfn, nid))
                    continue;
            }
            page = pfn_to_page(pfn);
            set_page_links(page, zone, nid, pfn);
            mminit_verify_page_links(page, zone, nid, pfn);
            init_page_count(page);
            reset_page_mapcount(page);
            SetPageReserved(page);
            /*
             * Mark the block movable so that blocks are reserved for
             * movable at startup. This will force kernel allocations
             * to reserve their blocks rather than leaking throughout
             * the address space during boot when many long-lived
             * kernel allocations are made. Later some blocks near
             * the start are marked MIGRATE_RESERVE by
             * setup_zone_migrate_reserve()
             *
             * bitmap is created for zone's valid pfn range. but memmap
             * can be created for invalid pages (for alignment)
             * check here not to call set_pageblock_migratetype() against
             * pfn out of zone.
             */
            if ((z->zone_start_pfn <= pfn)
                && (pfn < z->zone_start_pfn + z->spanned_pages)
                && !(pfn & (pageblock_nr_pages - 1)))
                set_pageblock_migratetype(page, MIGRATE_MOVABLE);

            INIT_LIST_HEAD(&page->lru);
    #ifdef WANT_PAGE_VIRTUAL
            /* The shift won't overflow because ZONE_NORMAL is below 4G. */
            if (!is_highmem_idx(zone))
                set_page_address(page, __va(pfn << PAGE_SHIFT));
    #endif
        }
    }

During Buddy System initialization, memmap_init_zone is called. It walks every struct page belonging to the zone and calls set_page_links on each one to establish the mapping between the page and its zone.

    static inline void set_page_zone(struct page *page, enum zone_type zone)
    {
        page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
        page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
    }

    static inline void set_page_node(struct page *page, unsigned long node)
    {
        page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
        page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
    }

    static inline void set_page_links(struct page *page, enum zone_type zone,
        unsigned long node, unsigned long pfn)
    {
        set_page_zone(page, zone);
        set_page_node(page, node);
    #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
        set_page_section(page, pfn_to_section_nr(pfn));
    #endif
    }
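
The zone, node and (with sparsemem) section numbers live in the upper bits of page->flags, above the ordinary flag bits; reading the node id back is symmetric with set_page_node above. A sketch, assuming a configuration where the node id is kept in page->flags:

    /*
     * Upper part of page->flags, schematically (field widths are config dependent):
     *
     *   | SECTION | NODE | ZONE | ... ordinary flag bits (PG_locked, PG_reserved, ...) |
     */
    static inline int page_to_nid(const struct page *page)
    {
        return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
    }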

Memory initialization during the boot process

    void __init setup_arch(char **cmdline_p)
    {
    ......
    /* max_pfn_mapped is updated here */
    max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
    max_pfn_mapped = max_low_pfn_mapped;
    ......
    paging_init();
    ......
    }
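
Note the conversion in the call above: shifting a page frame number left by PAGE_SHIFT turns it into a byte address, so max_low_pfn<<PAGE_SHIFT is the byte address of the end of low memory. A trivial sketch of that arithmetic:

    /* pfn <-> byte address arithmetic used on this boot path (PAGE_SHIFT is 12 for 4 KiB pages). */
    unsigned long pfn  = 0x38000;               /* an example page frame number */
    unsigned long addr = pfn << PAGE_SHIFT;     /* byte address of the start of that frame */
    unsigned long back = addr >> PAGE_SHIFT;    /* and back again: back == pfn */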

setup_arch calls init_memory_mapping:

    /*
     * Setup the direct mapping of the physical memory at PAGE_OFFSET.
     * This runs before bootmem is initialized and gets pages directly from
     * the physical memory. To access them they are temporarily mapped.
     */
    unsigned long __init_refok init_memory_mapping(unsigned long start,
                               unsigned long end)
    {
    ......

    for (i = 0; i < nr_range; i++)
        ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
                           mr[i].page_size_mask);

    ......
    }

init_memory_mapping in turn calls kernel_physical_mapping_init:

    /*
     * This maps the physical memory to kernel virtual address space, a total
     * of max_low_pfn pages, by creating page tables starting from address
     * PAGE_OFFSET:
     */
    unsigned long __init
    kernel_physical_mapping_init(unsigned long start,
                     unsigned long end,
                     unsigned long page_size_mask)
    {
        int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
        unsigned long last_map_addr = end;
        unsigned long start_pfn, end_pfn;
        pgd_t *pgd_base = swapper_pg_dir;
        int pgd_idx, pmd_idx, pte_ofs;
        unsigned long pfn;
        pgd_t *pgd;
        pmd_t *pmd;
        pte_t *pte;
        unsigned pages_2m, pages_4k;
        int mapping_iter;

        start_pfn = start >> PAGE_SHIFT;
        end_pfn = end >> PAGE_SHIFT;

        /*
         * First iteration will setup identity mapping using large/small pages
         * based on use_pse, with other attributes same as set by
         * the early code in head_32.S
         *
         * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
         * as desired for the kernel identity mapping.
         *
         * This two pass mechanism conforms to the TLB app note which says:
         *
         *     "Software should not write to a paging-structure entry in a way
         *      that would change, for any linear address, both the page size
         *      and either the page frame or attributes."
         */
        mapping_iter = 1;

        if (!cpu_has_pse)
            use_pse = 0;

    repeat:
        pages_2m = pages_4k = 0;
        pfn = start_pfn;
        pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
        pgd = pgd_base + pgd_idx;
        for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
            pmd = one_md_table_init(pgd);

            if (pfn >= end_pfn)
                continue;
    #ifdef CONFIG_X86_PAE
            pmd_idx = pmd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
            pmd += pmd_idx;
    #else
            pmd_idx = 0;
    #endif
            for (; pmd_idx < PTRS_PER_PMD && pfn < end_pfn;
                 pmd++, pmd_idx++) {
                unsigned int addr = pfn * PAGE_SIZE + PAGE_OFFSET;

                /*
                 * Map with big pages if possible, otherwise
                 * create normal page tables:
                 */
                if (use_pse) {
                    unsigned int addr2;
                    pgprot_t prot = PAGE_KERNEL_LARGE;
                    /*
                     * first pass will use the same initial
                     * identity mapping attribute + _PAGE_PSE.
                     */
                    pgprot_t init_prot =
                        __pgprot(PTE_IDENT_ATTR |
                             _PAGE_PSE);

                    addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
                        PAGE_OFFSET + PAGE_SIZE-1;

                    if (is_kernel_text(addr) ||
                        is_kernel_text(addr2))
                        prot = PAGE_KERNEL_LARGE_EXEC;

                    pages_2m++;
                    if (mapping_iter == 1)
                        set_pmd(pmd, pfn_pmd(pfn, init_prot));
                    else
                        set_pmd(pmd, pfn_pmd(pfn, prot));

                    pfn += PTRS_PER_PTE;
                    continue;
                }
                pte = one_page_table_init(pmd);

                pte_ofs = pte_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
                pte += pte_ofs;
                for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
                     pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
                    pgprot_t prot = PAGE_KERNEL;
                    /*
                     * first pass will use the same initial
                     * identity mapping attribute.
                     */
                    pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);

                    if (is_kernel_text(addr))
                        prot = PAGE_KERNEL_EXEC;

                    pages_4k++;
                    if (mapping_iter == 1) {
                        set_pte(pte, pfn_pte(pfn, init_prot));
                        last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
                    } else
                        set_pte(pte, pfn_pte(pfn, prot));
                }
            }
        }
        if (mapping_iter == 1) {
            /*
             * update direct mapping page count only in the first
             * iteration.
             */
            update_page_count(PG_LEVEL_2M, pages_2m);
            update_page_count(PG_LEVEL_4K, pages_4k);

            /*
             * local global flush tlb, which will flush the previous
             * mappings present in both small and large page TLB's.
             */
            __flush_tlb_all();

            /*
             * Second iteration will set the actual desired PTE attributes.
             */
            mapping_iter = 2;
            goto repeat;
        }
        return last_map_addr;
    }

Here, swapper_pg_dir serves as the pgd_t (page directory) pointer: the memory it points to is populated so that the mappings for the Normal (lowmem) region are installed into this page directory.
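
The result is the kernel's direct mapping of low memory. A sketch of the address relationship it establishes, assuming the usual x86-32 PAGE_OFFSET of 0xC0000000:

    /* Once this mapping exists, a lowmem physical address P is reachable at PAGE_OFFSET + P. */
    unsigned long phys_addr = 0x100000;         /* example: 1 MiB, well inside lowmem */
    void *kva = __va(phys_addr);                /* == (void *)(PAGE_OFFSET + phys_addr) for lowmem */
    unsigned long back = __pa(kva);             /* inverse translation: back == phys_addr */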

Then, inside paging_init, pagetable_init is invoked:

    static void __init pagetable_init(void)
    {
        pgd_t *pgd_base = swapper_pg_dir;

        permanent_kmaps_init(pgd_base);
    }
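
permanent_kmaps_init prepares the page tables for the persistent kernel map (pkmap) area that backs kmap() for highmem pages. A usage sketch of that facility (the wrapper function is only illustrative):

    #include <linux/highmem.h>
    #include <linux/gfp.h>
    #include <linux/string.h>

    /* Sketch: mapping a highmem page through the pkmap window prepared above. */
    static void pkmap_sketch(void)
    {
        struct page *page = alloc_pages(GFP_KERNEL | __GFP_HIGHMEM, 0);
        void *vaddr;

        if (!page)
            return;

        vaddr = kmap(page);             /* install a temporary kernel virtual mapping (may sleep) */
        memset(vaddr, 0, PAGE_SIZE);    /* access the page through that mapping */
        kunmap(page);                   /* tear the mapping down again */
        __free_pages(page, 0);
    }
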
Original article: https://www.cnblogs.com/long123king/p/3492856.html