impala通过jdbc写入效率低下优化

背景:

Kudu+Impala很适合数据分析, 但直接使用Insert values语句往Kudu表插入数据, 效率实在不好, 测试下来insert的速度仅为80笔/秒. 原因也是显然的, Kudu本身写入效率很高, 但是Impala并没有做这方面优化, 观察下来每次Impala语句执行的overhead都太大了, 导致频繁小批次写入效率非常差, Kudu官方推荐使用Java API或Python API完成数据写入工作. 下面是使用Java API的测试用例, 也可以看出Kudu API的大致用法. 

解决方法:

1、采用外部表的方式,对于自动化流程比较麻烦

2、采用文件导入,对于自动化流程比较麻烦

3、使用kudu对应原生的java api,用于提升写入、更新和删除操作,本文主要是采用java api的方式

maven依赖:

        <dependency>
            <groupId>org.apache.kudu</groupId>
            <artifactId>kudu-client</artifactId>
<!--            <version>1.4.0</version>-->
            <version>1.11.1</version>
        </dependency>

写入程序:

package org.kudu;


import org.apache.kudu.client.*;

import java.sql.*;
import java.util.Date;

public class InsertRow {

    public static void main(String[] args) throws Exception {

        // impala链接地址:用于模拟数据
        Connection con = null;
        ResultSet rs = null;
        PreparedStatement ps = null;
        String JDBC_DRIVER = "com.cloudera.impala.jdbc41.Driver";
        String CONNECTION_URL = "jdbc:impala://ecs-kudu-0004:21051/pro_dfcv_tsp";
        Class.forName(JDBC_DRIVER);
        con = (Connection) DriverManager.getConnection(CONNECTION_URL);


        // kudu 链接地址
        String masterAddr = "cdh02.dongfeng.com:7051";
        KuduClient client = new KuduClient.KuduClientBuilder(masterAddr)
                .defaultSocketReadTimeoutMs(6000).build();


        // 对应的表名称:这个表名称特殊,补全前缀
        KuduTable table = client.openTable("impala::pro_dfcv_tsp.hy_car2");
        KuduSession kuduSession = client.newSession();

        // 采用手动刷新
        kuduSession.setFlushMode(SessionConfiguration.FlushMode.MANUAL_FLUSH);
        // 用于批量写入的临时缓冲区要足够大
        kuduSession.setMutationBufferSpace(300000000);


        // 一共73个字段,取出15000条
        String sql = "select * from hy_car  limit 15000";
        ps = con.prepareStatement(sql);
        rs = ps.executeQuery();
        final ResultSetMetaData metaData = rs.getMetaData();
        final int columnCount = metaData.getColumnCount();


        int idx = 0;

        while (rs.next()) {

            idx++;

            // 创建每条的Insert
            Insert insert = table.newInsert();
            for(int i=1;i<=columnCount;i++){
                final String columnTypeName = metaData.getColumnTypeName(i);
                if ("BIGINT".equals(columnTypeName)){
                    insert.getRow().addLong(metaData.getColumnName(i), rs.getLong(i));
                }else if ("INT".equals(columnTypeName)){
                    insert.getRow().addInt(metaData.getColumnName(i), rs.getInt(i));
                }else{
                    if (rs.getString(i)!= null && rs.getString(i).equals("")){
                        insert.getRow().addString(metaData.getColumnName(i), "");
                    }
                }
            }

            // 先不提交kudu
            kuduSession.apply(insert);

            if (idx % 2000 == 0){

                System.out.println(new Date());
                // 批量写入kudu
                kuduSession.flush();

            }

        }


        // 最后,补加一条批量写入
        kuduSession.flush();

        // 关闭和kudu的会话
        kuduSession.close();

        // 关闭impala的链接
        con.close();


    }
}

更改表数据:

package cn.wcb.kudu;

import org.apache.kudu.client.*;

public class UpdateRow {

    public static void main(String[] args) {
        //master地址
        String masterAddr = "192.168.56.56";

        KuduClient client = new KuduClient.KuduClientBuilder(masterAddr)
                .defaultSocketReadTimeoutMs(6000).build();

        try {
            KuduTable table = client.openTable("student");
            KuduSession session = client.newSession();

            session.setFlushMode(SessionConfiguration.FlushMode.AUTO_FLUSH_SYNC);

            //更新数据
            Update update = table.newUpdate();
            PartialRow row = update.getRow();
            row.addInt("id",1);
            row.addString("name","didi");
            session.apply(update);

            session.close();
        } catch (KuduException e) {
            e.printStackTrace();
        } finally {
            try {
                client.close();
            } catch (KuduException e) {
                e.printStackTrace();
            }
        }

    }
}

删除数据:

package cn.wcb.kudu;

import org.apache.kudu.client.*;

public class DeleteRow {

    public static void main(String[] args) {
        //master地址
        String masterAddr = "192.168.56.56";

        KuduClient client = new KuduClient.KuduClientBuilder(masterAddr)
                .defaultSocketReadTimeoutMs(6000).build();

        try {
            KuduTable table = client.openTable("student");
            KuduSession session = client.newSession();

            //删除数据
            Delete delete = table.newDelete();
            //注意:行删除和更新操作必须指定要更改行的行的完整主键
            delete.getRow().addInt("id",1);
            session.flush();
            session.apply(delete);

            session.close();
        } catch (KuduException e) {
            e.printStackTrace();
        } finally {
            try {
                client.close();
            } catch (KuduException e) {
                e.printStackTrace();
            }
        }

    }
}

效果:

原文地址:https://www.cnblogs.com/lilei2blog/p/15670920.html