磁盘性能统计

iostat统计磁盘信息的时候,使用的是/proc/diskstats 。而/proc/diskstats是谁在写入呢?

主要数据结构:

//genhd.h
/*
 * Per-partition I/O counters that back the /proc/diskstats output.
 * The two-element arrays are indexed by READ and WRITE (see the
 * part_stat_read() calls in diskstats_show).
 */
struct disk_stats {
    unsigned long sectors[2];    /* READs and WRITEs */
    unsigned long ios[2];
    unsigned long merges[2];
    unsigned long ticks[2]; // jiffies delta; converted with jiffies_to_msecs() when printed
    unsigned long io_ticks; // time from entering the queue until the I/O completes (per the original author's note -- TODO confirm)
    unsigned long time_in_queue;
};

proc初始化:

//block/genhd.c
/*
 * Init-time hook that creates the "diskstats" and "partitions" proc
 * entries and binds each to its file_operations table.
 * The results of proc_create() are not checked; always returns 0.
 */
static int __init proc_genhd_init(void)
{
    proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
    proc_create("partitions", 0, NULL, &proc_partitions_operations);
    return 0;
}
/*
 * File operations for the "diskstats" proc entry: open is routed through
 * diskstats_open, while read/llseek/release use the generic seq_file
 * helpers.
 */
static const struct file_operations proc_diskstats_operations = {
    .open        = diskstats_open,
    .read        = seq_read,
    .llseek        = seq_lseek,
    .release    = seq_release,
};
/*
 * open() handler for the "diskstats" entry: sets the file up as a
 * seq_file driven by diskstats_op and returns seq_open()'s result.
 * The inode argument is unused.
 */
static int diskstats_open(struct inode *inode, struct file *file)
{
    return seq_open(file, &diskstats_op);
}
/*
 * seq_file callbacks for "diskstats": the disk_seqf_* functions handle
 * start/next/stop, and diskstats_show is installed as the show callback
 * that formats the output.
 */
static const struct seq_operations diskstats_op = {
    .start    = disk_seqf_start,
    .next    = disk_seqf_next,
    .stop    = disk_seqf_stop,
    .show    = diskstats_show
};

看到,diskstats_show这个函数才是关键:

static int diskstats_show(struct seq_file *seqf, void *v)
{
    ......
     disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
    while ((hd = disk_part_iter_next(&piter))) {
        cpu = part_stat_lock();
        part_round_stats(cpu, hd);
        part_stat_unlock();
        seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
               "%u %lu %lu %lu %u %u %u %u
",
               MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
               disk_name(gp, hd->partno, buf),
               part_stat_read(hd, ios[READ]),
               part_stat_read(hd, merges[READ]),
               part_stat_read(hd, sectors[READ]),
               jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
               part_stat_read(hd, ios[WRITE]),
               part_stat_read(hd, merges[WRITE]),
               part_stat_read(hd, sectors[WRITE]),
               jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
               part_in_flight(hd),
               jiffies_to_msecs(part_stat_read(hd, io_ticks)),
               jiffies_to_msecs(part_stat_read(hd, time_in_queue))
            );
    }
    disk_part_iter_exit(&piter);
}

/proc/diskstats各列具体的含义参考下面:

$cat /proc/diskstats

22 0 hdc 159807 57894 6328277 1476593 179991 467858 5184662 2664218 0 886604 4140851

$cat /sys/block/hdc/stat

159807 57894 6328277 1476593 179989 467844 5184534 2664218 0 886604 4140851


/proc/diskstats文件比/sys/block/hdc/stat文件多3个域,从左至右分别对应主设备号,次设备号和设备名称。后续的11个域在这两个文件里是相同的,它们的含义将在下面解释。除了第9个域,所有的域都是从启动时开始的累积值。

第1个域:读磁盘的次数,成功完成读的总次数。

第2个域:合并读次数;第6个域:合并写次数。为了效率可能会合并相邻的读和写。从而两次4K的读在它最终被处理到磁盘上之前可能会变成一次8K的读,才被计数(和排队),因此只有一次I/O操作。这个域使你知道这样的操作有多频繁。

第3个域:读扇区的数量,成功读取的扇区总数。

第4个域:读花费的毫秒数,这是所有读操作所花费的毫秒数(用__make_request()到end_that_request_last()测量)。

第5个域:写完成的次数,成功写完成的总次数。

第7个域:写扇区的数量,成功写入的扇区总数。

第8个域:写花费的毫秒数,这是所有写操作所花费的毫秒数(用__make_request()到end_that_request_last()测量)。

第9个域:当前正在进行的I/O数量,只有这个域会归零。当请求被交给适当的request_queue_t时增加,请求完成时减小。

第10个域:花在I/O操作上的毫秒数,只要第9个域不为0,这个域就会增长。

第11个域:花在I/O操作上的加权毫秒数。在每次I/O开始、I/O结束、I/O合并时这个域都会增加,增量为正在进行的I/O数量(第9个域)乘以自上次更新以来花在I/O上的毫秒数。这可以为I/O完成时间和可能正在累积的积压请求提供一个便利的测量标准。

而驱动层需要怎么提供这些数据呢?driver需要调用类似这样的一组函数:

part_stat_inc、part_stat_add、__part_stat_add(其中part_stat_add是调用 __part_stat_add,只不过它同时操作partition)

iostat是怎么根据/proc/diskstats来得到各项数据呢?

 1 //iostat.c function read_diskstats_stat
 2 if ((fp = fopen(DISKSTATS, "r")) == NULL)
 3       return;
 4 
 5    while (fgets(line, 256, fp) != NULL) {
 6 
 7       /* major minor name rio rmerge rsect ruse wio wmerge wsect wuse running use aveq */
 8       i = sscanf(line, "%u %u %s %lu %lu %llu %lu %lu %lu %llu %lu %lu %lu %lu",
 9          &major, &minor, dev_name,
10          &rd_ios, &rd_merges_or_rd_sec, &rd_sec_or_wr_ios, &rd_ticks_or_wr_sec,
11          &wr_ios, &wr_merges, &wr_sec, &wr_ticks, &ios_pgr, &tot_ticks, &rq_ticks);
12 
13       if (i == 14) {
14      /* Device */
15      sdev.rd_ios     = rd_ios;
16      sdev.rd_merges  = rd_merges_or_rd_sec;
17      sdev.rd_sectors = rd_sec_or_wr_ios;
18      sdev.rd_ticks   = rd_ticks_or_wr_sec;
19      sdev.wr_ios     = wr_ios;
20      sdev.wr_merges  = wr_merges;
21      sdev.wr_sectors = wr_sec;
22      sdev.wr_ticks   = wr_ticks;
23      sdev.ios_pgr    = ios_pgr;
24      sdev.tot_ticks  = tot_ticks;
25      sdev.rq_ticks   = rq_ticks;
26       }
27       else if (i == 7) {
28      /* Partition */
29      if (DISPLAY_EXTENDED(flags) || (!dlist_idx && !DISPLAY_PARTITIONS(flags)))
30         continue;
31 
32      sdev.rd_ios     = rd_ios;
33      sdev.rd_sectors = rd_merges_or_rd_sec;
34      sdev.wr_ios     = rd_sec_or_wr_ios;
35      sdev.wr_sectors = rd_ticks_or_wr_sec;
36       }
37       else
38      /* Unknown entry: Ignore it */
39      continue;
 1 void write_ext_stat(int curr, unsigned long long itv, int flags, int fctr,
 2             struct io_hdr_stats *shi, struct io_stats *ioi,
 3             struct io_stats *ioj)
 4 {
 5    unsigned long long rd_sec, wr_sec;
 6    double tput, util, await, svctm, arqsz, nr_ios;
 7     
 8    /*
 9     * Counters overflows are possible, but don't need to be handled in
10     * a special way: the difference is still properly calculated if the
11     * result is of the same type as the two values.
12     * Exception is field rq_ticks which is incremented by the number of
13     * I/O in progress times the number of milliseconds spent doing I/O.
14     * But the number of I/O in progress (field ios_pgr) happens to be
15     * sometimes negative...
16     */
17    nr_ios = (ioi->rd_ios - ioj->rd_ios) + (ioi->wr_ios - ioj->wr_ios);
18    tput = ((double) nr_ios) * HZ / itv;
19    util = S_VALUE(ioj->tot_ticks, ioi->tot_ticks, itv);
20    svctm = tput ? util / tput : 0.0;
21    /*
22     * Kernel gives ticks already in milliseconds for all platforms
23     * => no need for further scaling.
24     */
25    await = nr_ios ?
26       ((ioi->rd_ticks - ioj->rd_ticks) + (ioi->wr_ticks - ioj->wr_ticks)) /
27       nr_ios : 0.0;
28 
29    rd_sec = ioi->rd_sectors - ioj->rd_sectors;
30    if ((ioi->rd_sectors < ioj->rd_sectors) && (ioj->rd_sectors <= 0xffffffff))
31       rd_sec &= 0xffffffff;
32    wr_sec = ioi->wr_sectors - ioj->wr_sectors;
33    if ((ioi->wr_sectors < ioj->wr_sectors) && (ioj->wr_sectors <= 0xffffffff))
34       wr_sec &= 0xffffffff;
35 
36    arqsz = nr_ios ? (rd_sec + wr_sec) / nr_ios : 0.0;
37 
38    /*      DEV   rrq/s wrq/s   r/s   w/s  rsec  wsec  rqsz  qusz await svctm %util */
39    printf("%-13s %8.2f %8.2f %7.2f %7.2f %8.2f %8.2f %8.2f %8.2f %7.2f %6.2f %6.2f\n",
40       shi->name,
41       S_VALUE(ioj->rd_merges, ioi->rd_merges, itv),
42       S_VALUE(ioj->wr_merges, ioi->wr_merges, itv),
43       S_VALUE(ioj->rd_ios, ioi->rd_ios, itv),
44       S_VALUE(ioj->wr_ios, ioi->wr_ios, itv),
45       ll_s_value(ioj->rd_sectors, ioi->rd_sectors, itv) / fctr,
46       ll_s_value(ioj->wr_sectors, ioi->wr_sectors, itv) / fctr,
47       arqsz,
48       S_VALUE(ioj->rq_ticks, ioi->rq_ticks, itv) / 1000.0,
49       await,
50       /* The ticks output is biased to output 1000 ticks per second */
51       svctm,
52       /* Again: Ticks in milliseconds */
53       util / 10.0);
54 }
原文地址:https://www.cnblogs.com/hbt19860104/p/3457929.html