page cache 与free

我们经常用free查看服务器的内存使用情况，而free中的输出却有些让人困惑，如下：

先看看各个数字的意义以及如何计算得到：

free命令输出的第二行(Mem)：这行分别显示了物理内存的总量(total)、已使用的(used)、空闲的(free)、共享的(shared)、buffers(buffer的大小)、cached(cache的大小)的内存。我们知道total、free、buffers、cached这几个字段是从/proc/meminfo中获取的，而used = total - free。shared列已经过时，忽略(见参考)。

free命令输出的第三行(-/+ buffers/cache)：

它显示的第一个值(used)：210236，这个值表示系统本身使用的内存总量，即除去buffer/cache，等于Mem行used列 - Mem行buffers列 - Mem行cached列。

它显示的第二个值(free)：814956，这个值表示系统当前可用内存，它等于Mem行total列 - 本行(-/+ buffers/cache)的used列，也等于Mem行free列 + Mem行buffers列 + Mem行cached列。

free命令输出的第四行(Swap) 这行显示交换内存的总量、已使用量、空闲量。

我们都知道free是从/proc/meminfo中读取相关的数据的。

下面是/proc/meminfo的实现：

复制代码
/*
 * Excerpt of the read handler behind /proc/meminfo: collects memory
 * counters and formats them as text into the caller-supplied buffer
 * `page`. Portions of the function are elided in this listing (the
 * "......" lines), including the remaining format fields, the remaining
 * arguments and the return path.
 */
static int meminfo_read_proc(char *page, char **start, off_t off,
int count, int *eof, void *data)
{
struct sysinfo i;
int len;
unsigned long committed;
unsigned long allowed;
struct vmalloc_info vmi;
long cached;

/*
 * K(x) shifts x left by (PAGE_SHIFT - 10); it is applied to the
 * page-based counters below so they are displayed in kilobytes.
 */
#define K(x) ((x) << (PAGE_SHIFT - 10))
/* Fill `i` with the RAM counters, then the swap counters. */
si_meminfo(&i);
si_swapinfo(&i);
committed = atomic_read(&vm_committed_space);
allowed = ((totalram_pages - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100) + total_swap_pages;

/*
 * "Cached" is the NR_FILE_PAGES counter minus the swap cache pages and
 * minus the pages already reported as "Buffers" (i.bufferram).
 * The subtraction is done in a signed long and clamped at zero.
 */
cached = global_page_state(NR_FILE_PAGES) -
total_swapcache_pages - i.bufferram;
if (cached < 0)
cached = 0;

get_vmalloc_info(&vmi);

/*
 * Tagged format, for easy grepping and expansion.
 */
len = sprintf(page,
"MemTotal: %8lu kB "
"MemFree: %8lu kB "
"Buffers: %8lu kB "
"Cached: %8lu kB "
"SwapCached: %8lu kB "

......

K(i.totalram),
K(i.freeram),
K(i.bufferram),
K(cached),
K(total_swapcache_pages),

......

#undef K
}

/*
 * System-wide statistics record. si_meminfo() fills the RAM-related
 * fields and sets mem_unit.
 */
struct sysinfo {
	/* Seconds since boot */
	long uptime;
	/* 1, 5, and 15 minute load averages */
	unsigned long loads[3];
	/* Total usable main memory size */
	unsigned long totalram;
	/* Available memory size */
	unsigned long freeram;
	/* Amount of shared memory */
	unsigned long sharedram;
	/* Memory used by buffers */
	unsigned long bufferram;
	/* Total swap space size */
	unsigned long totalswap;
	/* Swap space still available */
	unsigned long freeswap;
	/* Number of current processes */
	unsigned short procs;
	/* Explicit padding for m68k */
	unsigned short pad;
	/* Total high memory size */
	unsigned long totalhigh;
	/* Available high memory size */
	unsigned long freehigh;
	/* Memory unit size in bytes */
	unsigned int mem_unit;
	/* Padding: libc5 uses this.. */
	char _f[20-2*sizeof(long)-sizeof(int)];
};
复制代码

图中，Buffers对应sysinfo.bufferram，内核中以页框为单位，通过宏K转化成以KB为单位输出。

复制代码
/*
 * Fill the RAM-related fields of *val from the kernel's global counters.
 *
 * totalram/freeram/bufferram/totalhigh/freehigh are taken straight from
 * page counters; mem_unit is set to PAGE_SIZE. sharedram is always
 * reported as 0.
 */
void si_meminfo(struct sysinfo *val)
{
	/* Fixed values first. */
	val->sharedram = 0;
	val->mem_unit = PAGE_SIZE;

	/* Whole-system totals. */
	val->totalram = totalram_pages;
	val->totalhigh = totalhigh_pages;

	/* Current usage: free pages, pages held by block devices, free highmem. */
	val->freeram = global_page_state(NR_FREE_PAGES);
	val->bufferram = nr_blockdev_pages();
	val->freehigh = nr_free_highpages();
}

long nr_blockdev_pages(void)
{
struct block_device *bdev;
long ret = 0;
spin_lock(&bdev_lock);
list_for_each_entry(bdev, &all_bdevs, bd_list) {
ret += bdev->bd_inode->i_mapping->nrpages;
}
spin_unlock(&bdev_lock);
return ret;
}
复制代码

nr_blockdev_pages计算块设备使用的页框数，遍历所有块设备，将使用的页框数相加。而不包含普通文件使用的页框数。

cached = global_page_state(NR_FILE_PAGES) - total_swapcache_pages - i.bufferram;

复制代码
/*
 * Read the global VM statistics counter selected by `item`.
 *
 * With CONFIG_SMP, a negative reading is reported as 0 instead of being
 * converted to a huge unsigned value.
 */
static inline unsigned long global_page_state(enum zone_stat_item item)
{
	long count = atomic_long_read(&vm_stat[item]);

#ifdef CONFIG_SMP
	if (count < 0)
		return 0;
#endif
	return count;
}
复制代码
Cache的大小为内核总的page cache减去swap cache和块设备占用的页框数量，也就是说，Cached即为普通文件占用的page cache。在较早的2.6内核中，函数add_to_page_cache和__add_to_swap_cache都会通过调用pagecache_acct对内核变量nr_pagecache进行累加；在上面代码所对应的内核版本中，这一计数则通过global_page_state(NR_FILE_PAGES)读取。前者对应page cache，内核读块设备和普通文件时使用；后者对应swap cache，内核读交换分区时使用。

Page cache(页面缓存)

在linux系统中，为了加快文件的读写，内核中提供了page cache作为缓存，称为页面缓存(page cache)。为了加快对块设备的读写，内核中还提供了buffer cache作为缓存。在2.4内核中，这两者是分开的。这样就造成了双缓冲，因为文件读写最后还是转化为对块设备的读写。在2.6中，buffer cache合并到page cache中，对应的页面叫作buffer page。当进行文件读写时，如果文件在磁盘上的存储块是连续的，那么文件在page cache中对应的页是普通的page，如果文件在磁盘上的数据块是不连续的，或者是设备文件，那么文件在page cache中对应的页是buffer page。buffer page与普通的page相比，每个页多了几个buffer_head结构体(个数视块的大小而定)。此外，如果对单独的块（如超级块）直接进行读写，对应的page cache中的页也是buffer page。这两种页面虽然形式略有不同，但是最终他们的数据都会被封装成bio结构体，提交到通用块设备驱动层，统一进行I/O调度。

复制代码
/**
 * struct buffer_head - descriptor of one block buffer
 *
 * Ties a block-sized piece of a page to a block on a block device.
 */
struct buffer_head {
	/* Buffer state bitmap, e.g. BH_Uptodate */
	unsigned long b_state;
	/* Next buffer belonging to the same page (circular list of the page's buffers) */
	struct buffer_head *b_this_page;
	/* The page this bh is mapped to */
	struct page *b_page;

	/* Start block number */
	sector_t b_blocknr;
	/* Size of the mapping (block length) */
	size_t b_size;
	/* Pointer to the data within the page */
	char *b_data;

	/* Backing block device */
	struct block_device *b_bdev;
	/* I/O completion callback */
	bh_end_io_t *b_end_io;
	/* Reserved for b_end_io */
	void *b_private;
	/* Associated with another mapping */
	struct list_head b_assoc_buffers;
	/* Mapping (address space) this buffer is associated with */
	struct address_space *b_assoc_map;
	/* Users using this buffer_head */
	atomic_t b_count;
};
复制代码

在kernel2.6之后，buffer_head没有别的作用，主要用来保持页框与块设备中数据块的映射关系。

Buffer page(缓冲页)

如果内核需要单独访问一个块，就会涉及到buffer page，并会检查对应的buffer head。

内核创建buffer page的两种常见情况：

(1)当读或者写一个文件页的数据块不相邻时。发生这种情况是因为文件系统为文件分配了非连续的块，或者文件有洞。具体请参见block_read_full_page(fs/buffer.c)函数:

复制代码
/**
 * block_read_full_page - read a whole page through its buffer heads
 * @page: the page to fill; it must already be locked (checked with BUG_ON)
 * @get_block: callback used to map a file block number to a disk block
 *
 * Works in three stages: (1) walk the page's buffers, mapping or zeroing
 * them as needed and collecting the ones that still require a read;
 * (2) lock the collected buffers and mark them for async read;
 * (3) submit a read for each of them.
 *
 * Every visible return path returns 0; a get_block() failure is recorded
 * on the page with SetPageError().
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
struct inode *inode = page->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize;
int nr, i;
int fully_mapped = 1;

BUG_ON(!PageLocked(page));
blocksize = 1 << inode->i_blkbits;
if (!page_has_buffers(page))/* no buffers attached yet: create empty ones */
create_empty_buffers(page, blocksize, 0);
/* First buffer head attached to the page */
head = page_buffers(page);

/*
 * iblock: file block number of the first block in this page.
 * lblock: number of blocks covered by the inode size, rounded up.
 */
iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
bh = head;
nr = 0;
i = 0;

/* Stage one: walk every buffer of the page */
do {
if (buffer_uptodate(bh))/* already up to date, nothing to do */
continue;

if (!buffer_mapped(bh)) {/* not mapped to a disk block yet */
int err = 0;

fully_mapped = 0;
if (iblock < lblock) {/* block lies within the file size: try to map it */
WARN_ON(bh->b_size != blocksize);
/* Ask get_block for the on-disk location of this file block */
err = get_block(inode, iblock, bh, 0);
if (err)
SetPageError(page);
}
if (!buffer_mapped(bh)) {/* still unmapped: zero-fill this part of the page */
zero_user_page(page, i * blocksize, blocksize,
KM_USER0);
/* Only mark it up to date if get_block() did not fail */
if (!err)
set_buffer_uptodate(bh);
continue;
}
/*
* get_block() might have updated the buffer
* synchronously
*/
if (buffer_uptodate(bh))/* get_block filled the buffer; move on to the next one */
continue;
}
/* Mapped but not up to date: remember it for the read stages */
arr[nr++] = bh;
} while (i++, iblock++, (bh = bh->b_this_page) != head);

if (fully_mapped)
SetPageMappedToDisk(page);

if (!nr) {/* no buffer needs to be read */
/*
* All buffers are uptodate - we can set the page uptodate
* as well. But not if get_block() returned an error.
*/
if (!PageError(page))/* mark the page up to date, then finish */
SetPageUptodate(page);
unlock_page(page);
return 0;
}

/* Stage two: lock the buffers */
for (i = 0; i < nr; i++) {/* lock each collected buffer and mark it for async read */
bh = arr[i];
lock_buffer(bh);
mark_buffer_async_read(bh);
}

/*
* Stage 3: start the IO. Check for uptodateness
* inside the buffer lock in case another process reading
* the underlying blockdev brought it uptodate (the sct fix).
*/
for (i = 0; i < nr; i++) {/* walk every buffer that was collected for reading */
bh = arr[i];
if (buffer_uptodate(bh))/* became up to date before we took the lock: complete it without I/O */
end_buffer_async_read(bh, 1);
else
submit_bh(READ, bh);/* submit the read request */
}
return 0;
}
复制代码

这里使用buffer head主要是为了建立页框与数据块的映射关系：因为页面中的数据在磁盘上不是连续的，而页框描述符struct page的字段又不足以表达这种信息。

该函数会调用create_empty_buffers来创建一组全新的缓冲区，并与page关联起来

复制代码
/**
 * create_empty_buffers - build a new set of buffer heads and attach them to a page
 * @page: page that will own the buffers
 * @blocksize: size passed to alloc_page_buffers() for each buffer
 * @b_state: state bits OR-ed into every new buffer head
 *
 * The NULL-terminated chain returned by alloc_page_buffers() is closed
 * into a ring. Under the mapping's private_lock, the page's dirty and
 * uptodate flags are then copied to the buffers and the ring is attached
 * to the page.
 */
void create_empty_buffers(struct page *page,
			unsigned long blocksize, unsigned long b_state)
{
	struct buffer_head *first, *cur, *last;

	/* Allocate the chain of buffer heads; `first` is its head. */
	first = alloc_page_buffers(page, blocksize, 1);

	/* Apply the requested state bits and find the last element ... */
	cur = first;
	do {
		cur->b_state |= b_state;
		last = cur;
		cur = cur->b_this_page;
	} while (cur);
	/* ... then link it back to the head to make the list circular. */
	last->b_this_page = first;

	spin_lock(&page->mapping->private_lock);
	if (PageUptodate(page) || PageDirty(page)) {
		/* Propagate the page's flags to each buffer head in the ring. */
		cur = first;
		do {
			if (PageDirty(page))
				set_buffer_dirty(cur);
			if (PageUptodate(page))
				set_buffer_uptodate(cur);
			cur = cur->b_this_page;
		} while (cur != first);
	}
	/* Hook the ring of buffers onto the page. */
	attach_page_buffers(page, first);
	spin_unlock(&page->mapping->private_lock);
}
复制代码

create_empty_buffers调用alloc_page_buffers来创建一组buffer head链表，但还不是循环链表：

复制代码
/*
 * Allocate a chain of buffer heads for `page`, one per `size`-byte block.
 *
 * Each new buffer head is pushed onto the front of the chain, and offsets
 * are visited from high to low, so the returned head is the buffer with
 * the lowest offset. The chain is NULL-terminated, not circular.
 *
 * NOTE(review): this listing elides the rest of the function ("......"),
 * including the no_grow label and whatever jumps back to try_again.
 * `retry` is not used in the visible part; presumably it is consulted on
 * that elided failure path — confirm against the full source.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
int retry)
{
struct buffer_head *bh, *head;
long offset;

try_again:
head = NULL;
offset = PAGE_SIZE;
/* Step down by `size` from PAGE_SIZE while the offset stays non-negative */
while ((offset -= size) >= 0) {
bh = alloc_buffer_head(GFP_NOFS);
if (!bh)
goto no_grow;

/* Not bound to any device or block yet; prepend to the chain */
bh->b_bdev = NULL;
bh->b_this_page = head;
bh->b_blocknr = -1;
head = bh;

bh->b_state = 0;
atomic_set(&bh->b_count, 0);
bh->b_private = NULL;
bh->b_size = size;

/* Link the buffer to its page */
set_bh_page(bh, page, offset);

init_buffer(bh, NULL, NULL);
}
return head;

......

}
复制代码

alloc_page_buffers调用set_bh_page来设置b_page和b_data：

复制代码
/*
 * Bind a buffer head to a page: record the page in b_page and point
 * b_data at `offset` within it. `offset` must be below PAGE_SIZE
 * (enforced with BUG_ON).
 */
void set_bh_page(struct buffer_head *bh,
		struct page *page, unsigned long offset)
{
	bh->b_page = page;
	BUG_ON(offset >= PAGE_SIZE);

	if (!PageHighMem(page)) {
		/* Data pointer is the page's address plus the offset. */
		bh->b_data = page_address(page) + offset;
		return;
	}

	/*
	 * Highmem page: store only the offset as the pointer value.
	 * This catches illegal uses and preserves the offset.
	 */
	bh->b_data = (char *)(0 + offset);
}
复制代码

(2)访问一个单独的磁盘块(比如，读超级块或者索引节点块时)。参见ext2_fill_super(fs/ext2/super.c)，该函数在安装ext2文件系统时调用。

Buffer page和buffer head的关系：

因此，对于普通文件，如果页面中的块是连续的，则页面没有对应buffer head；如果不连续，则页面有对应的buffer head，参见do_mpage_readpage函数。对于块设备，无论是读取单独的数据块，还是作为设备文件来进行读取，页面始终有对应的buffer head，参见block_read_full_page/__bread函数。