HBase之缓存扫描加快读取速度

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.client.metrics.ScanMetrics;

import java.io.IOException;

/**
 * Created by similarface on 16/8/23.
 */
public class ScanDataUseCache {
    // Lazily-initialized table handle, cached after the first successful connection.
    private static Table table = null;

    /**
     * Returns a handle to the HBase table "testtable", creating the cluster
     * connection on first use and caching the handle for subsequent calls.
     *
     * <p>Fix: the original returned {@code connection.getTable(...)} without
     * assigning the static {@code table} field, so the cache never took effect
     * and every call opened (and leaked) a new {@link Connection}.
     *
     * @return the cached table handle, or {@code null} if the connection failed
     */
    public static Table getTable() {
        if (table == null) {
            try {
                Configuration configuration = HBaseConfiguration.create();
                Connection connection = ConnectionFactory.createConnection(configuration);
                // Assign so the handle (and its connection) is reused next time.
                table = connection.getTable(TableName.valueOf("testtable"));
            } catch (IOException e) {
                // Best-effort: report the failure instead of swallowing it silently;
                // table stays null and the caller sees the same null as before.
                System.out.println(e);
            }
        }
        return table;
    }

    /**
     * Performs a full scan with the given tuning parameters and prints the
     * number of Results received and the number of RPC calls used.
     *
     * @param caching rows fetched per RPC (Scan.setCaching, default 1); larger
     *                values reduce RPC count but make each transfer bigger
     * @param batch   max columns per Result (Scan.setBatch); wide rows are
     *                split across several Results so the client is not sent
     *                an entire huge row at once
     * @param small   whether to use the "small scan" optimization
     */
    private static void scan(int caching, int batch, boolean small) {
        // setScanMetricsEnabled(true) is required for getScanMetrics() below.
        Scan scan = new Scan()
                .setCaching(caching)
                .setBatch(batch)
                .setSmall(small)
                .setScanMetricsEnabled(true);
        int count = 0;
        // try-with-resources guarantees the scanner is closed even if the
        // iteration throws (the original leaked it on failure paths).
        try (ResultScanner scanner = getTable().getScanner(scan)) {
            for (Result result : scanner) {
                count++;
            }
        } catch (IOException e) {
            System.out.println(e);
            System.out.println("Error");
            return;
        }
        ScanMetrics metrics = scan.getScanMetrics();
        System.out.println("Caching: " + caching + ", Batch: " + batch + ", Small: " + small
                + ", Results: " + count + ", RPCs: " + metrics.countOfRPCcalls);
    }

    public static void main(String[] args) throws IOException {
        // Caching: 1, Batch: 1, Small: false, Results: 9, RPCs: 12
        scan(1, 1, false);

        // Caching: 1, Batch: 0, Small: false, Results: 4, RPCs: 7
        scan(1, 0, false);

        // Caching: 1, Batch: 0, Small: true, Results: 4, RPCs: 0
        scan(1, 0, true);

        // Caching: 200, Batch: 1, Small: false, Results: 9, RPCs: 3
        scan(200, 1, false);

        // Caching: 200, Batch: 0, Small: false, Results: 4, RPCs: 3
        scan(200, 0, false);

        // Caching: 200, Batch: 0, Small: true, Results: 4, RPCs: 0
        scan(200, 0, true);

        // Caching: 2000, Batch: 100, Small: false, Results: 4, RPCs: 3
        scan(2000, 100, false);

        // Caching: 2, Batch: 100, Small: false, Results: 4, RPCs: 5
        scan(2, 100, false);

        // Caching: 2, Batch: 10, Small: false, Results: 4, RPCs: 5
        scan(2, 10, false);

        // Caching: 5, Batch: 100, Small: false, Results: 4, RPCs: 3
        scan(5, 100, false);

        // Caching: 5, Batch: 20, Small: false, Results: 4, RPCs: 3
        scan(5, 20, false);

        // Caching: 10, Batch: 10, Small: false, Results: 4, RPCs: 3
        scan(10, 10, false);
    }
}

/**
 Caching: 1, Batch: 0, Small: false, Results: 5, RPCs: 8
 Caching: 1, Batch: 0, Small: true, Results: 5, RPCs: 0
 Caching: 200, Batch: 1, Small: false, Results: 1009, RPCs: 8
 Caching: 200, Batch: 0, Small: false, Results: 5, RPCs: 3
 Caching: 200, Batch: 0, Small: true, Results: 5, RPCs: 0
 Caching: 2000, Batch: 100, Small: false, Results: 14, RPCs: 3
 Caching: 2, Batch: 100, Small: false, Results: 14, RPCs: 10
 Caching: 2, Batch: 10, Small: false, Results: 104, RPCs: 55
 Caching: 5, Batch: 100, Small: false, Results: 14, RPCs: 5
 Caching: 5, Batch: 20, Small: false, Results: 54, RPCs: 13
 Caching: 10, Batch: 10, Small: false, Results: 104, RPCs: 13
 **/

 

示例:一张有 9 行数据的表,每行包含若干列。

使用缓存为 6、批量为 3 的扫描器时,共需要 3 次 RPC:

每 3 个列装入一个 Result 实例;

每 6 个 Result 填满一次缓存,组成一次 RPC 传输。

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.client.metrics.ScanMetrics;

import java.io.IOException;

/**
 * Created by similarface on 16/8/24.
 */
public class ScanWithOffsetAndLimit {
    // Lazily-initialized table handle, cached after the first successful connection.
    private static Table table = null;

    /**
     * Returns a handle to the HBase table "testtable", creating the cluster
     * connection on first use and caching the handle for subsequent calls.
     *
     * <p>Fix: the original returned {@code connection.getTable(...)} without
     * assigning the static {@code table} field, so the cache never took effect
     * and every call opened (and leaked) a new {@link Connection}.
     *
     * @return the cached table handle, or {@code null} if the connection failed
     */
    public static Table getTable() {
        if (table == null) {
            try {
                Configuration configuration = HBaseConfiguration.create();
                Connection connection = ConnectionFactory.createConnection(configuration);
                // Assign so the handle (and its connection) is reused next time.
                table = connection.getTable(TableName.valueOf("testtable"));
            } catch (IOException e) {
                // Report instead of silently swallowing; table stays null.
                System.out.println(e);
            }
        }
        return table;
    }

    /**
     * Scans the table with the given tuning parameters and prints the number
     * of Results received and the number of RPC calls used.
     *
     * @param num           sequence number of this run, for the progress message
     * @param caching       rows fetched per RPC (Scan.setCaching)
     * @param batch         max columns per Result (Scan.setBatch)
     * @param offset        columns to skip per row and column family
     *                      (Scan.setRowOffsetPerColumnFamily)
     * @param maxResults    max columns returned per row and column family
     *                      (Scan.setMaxResultsPerColumnFamily)
     * @param maxResultSize max total bytes per fetch (Scan.setMaxResultSize)
     * @param dump          whether to print each Result as it is received
     * @throws IOException if obtaining the scanner or iterating fails
     */
    private static void scan(int num, int caching, int batch, int offset,
                             int maxResults, int maxResultSize, boolean dump) throws IOException {
        // setScanMetricsEnabled(true) is required for getScanMetrics() below.
        Scan scan = new Scan()
                .setCaching(caching)
                .setBatch(batch)
                .setRowOffsetPerColumnFamily(offset)
                .setMaxResultsPerColumnFamily(maxResults)
                .setMaxResultSize(maxResultSize)
                .setScanMetricsEnabled(true);
        int count = 0;
        System.out.println("Scan #" + num + " running...");
        // try-with-resources guarantees the scanner is closed even if the
        // iteration throws (the original leaked it on failure paths).
        try (ResultScanner scanner = getTable().getScanner(scan)) {
            for (Result result : scanner) {
                count++;
                if (dump) {
                    System.out.println("Result [" + count + "]:" + result);
                }
            }
        }
        ScanMetrics metrics = scan.getScanMetrics();
        System.out.println("Caching: " + caching + ", Batch: " + batch +
                ", Offset: " + offset + ", maxResults: " + maxResults +
                ", maxSize: " + maxResultSize + ", Results: " + count +
                ", RPCs: " + metrics.countOfRPCcalls);
    }

    public static void main(String[] args) throws IOException {
        // Offset 0, at most 2 cells per family: returns columns 1 and 2.
        scan(1, 11, 0, 0, 2, -1, true);
        // Offset 4, at most 2 cells per family: returns columns 5 and 6.
        scan(2, 11, 0, 4, 2, -1, true);
        // Same limit as #1 but smaller caching; more RPCs, no dump.
        scan(3, 5, 0, 0, 2, -1, false);
        // Batch of 2 with up to 5 cells per family: rows split across Results.
        scan(4, 11, 2, 0, 5, -1, true);
        // Tiny maxResultSize (1 byte) forces an RPC per Result.
        scan(5, 11, -1, -1, -1, 1, false);
        // Larger maxResultSize (10000 bytes) amortizes RPCs.
        scan(6, 11, -1, -1, -1, 10000, false);
    }
}

/**
 Caching: 11, Batch: 0, Offset: 0, maxResults: 2, maxSize: -1, Results: 5005, RPCs: 458
 Caching: 11, Batch: 0, Offset: 4, maxResults: 2, maxSize: -1, Results: 1, RPCs: 3
 Caching: 5, Batch: 0, Offset: 0, maxResults: 2, maxSize: -1, Results: 5005, RPCs: 1004
 Caching: 11, Batch: 2, Offset: 0, maxResults: 5, maxSize: -1, Results: 5009, RPCs: 458
 Caching: 11, Batch: -1, Offset: -1, maxResults: -1, maxSize: 1, Results: 5005, RPCs: 11012
 Caching: 11, Batch: -1, Offset: -1, maxResults: -1, maxSize: 10000, Results: 5005, RPCs: 469
**/
原文地址:https://www.cnblogs.com/similarface/p/5800509.html