[flink][kafka][hive] Consuming Kafka data with Flink and writing it to Hive

1. Maven dependencies (pom.xml)

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.xiaostudy.flink</groupId>
    <artifactId>flink-kafka-to-hive</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <flink.version>1.9.0</flink.version>
        <flink-table.version>1.9.0-csa1.0.0.0</flink-table.version>
        <kafka.version>2.3.0</kafka.version>
        <hadoop.version>3.0.0</hadoop.version>
        <fastjson.version>1.2.70</fastjson.version>
        <hive.version>2.1.1</hive.version>
    </properties>


    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>${flink.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>javax.el</groupId>
                    <artifactId>javax.el-api</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-sql-parser</artifactId>
            <version>${flink-table.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>${kafka.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-api-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table-planner-blink_2.11</artifactId>
            <version>${flink-table.version}</version>
        </dependency>

        <!-- hive -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>${hive.version}</version>
        </dependency>

        <!-- hdfs -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>com.google.guava</groupId>
                    <artifactId>guava</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <!-- mr2 -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>javax.el</groupId>
            <artifactId>javax.el-api</artifactId>
            <version>3.0.1-b06</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>

        <dependency>
            <groupId>org.oracle</groupId>
            <artifactId>ojdbc14</artifactId>
            <version>10.2.0.5</version>
        </dependency>

    </dependencies>


    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
                <version>3.1</version>
            </plugin>


            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.2</version>
                <configuration>
                    <filters>
                        <filter>
                            <artifact>*:*</artifact>
                            <excludes>
                                <exclude>META-INF/*.SF</exclude>
                                <exclude>META-INF/*.DSA</exclude>
                                <exclude>META-INF/*.RSA</exclude>
                            </excludes>
                        </filter>
                    </filters>
                </configuration>


                <!--<executions>-->
                    <!--<execution>-->
                        <!--<phase>-->
                            <!--package-->
                        <!--</phase>-->
                        <!--<goals>-->
                            <!--<goal>-->
                                <!--shade-->
                            <!--</goal>-->
                        <!--</goals>-->
                        <!--<configuration>-->
                            <!--<transformers>-->
                                <!--<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">-->
                                    <!--<resource>-->
                                        <!--reference.conf-->
                                    <!--</resource>-->
                                <!--</transformer>-->
                            <!--</transformers>-->
                        <!--</configuration>-->
                    <!--</execution>-->
                <!--</executions>-->



            </plugin>
            <!-- Package with the maven-assembly-plugin -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.2.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <addClasspath>true</addClasspath>
                            <classpathPrefix>lib/</classpathPrefix>
                            <mainClass>com.xiaostudy.flink.job.StartMain</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>


                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>


            </plugin>
        </plugins>
    </build>

</project>
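
Build with mvn clean package; the assembly plugin's jar-with-dependencies descriptor produces target/flink-kafka-to-hive-1.0-SNAPSHOT-jar-with-dependencies.jar, which is the jar submitted in step 12.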

2. Code structure

The project follows a standard Maven layout; the classes shown in the sections below live in these packages:

src/main/java/com/xiaostudy/flink/
    constant/StringConstants.java
    entity/HiveDataEntity.java
    enumeration/KafkaFieldEnum.java
    helper/ConfigHelper.java
    job/StartMain.java
    job/MyProcessFunction.java
    sink/SinkHive.java
    util/HiveConfig.java
src/main/resources/
    kafka.properties

3. Job entry point: StartMain.java

package com.xiaostudy.flink.job;

import com.xiaostudy.flink.entity.HiveDataEntity;
import com.xiaostudy.flink.helper.ConfigHelper;
import com.xiaostudy.flink.sink.SinkHive;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.Properties;

public class StartMain {

    /**
     * Job entry point.
     *
     * @param args
     */
    public static void main(String[] args) throws Exception {
        // 1. Build the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//        // A restart strategy only takes effect once checkpointing is enabled
//        env.enableCheckpointing(30 * 1000);
//        // Restart strategy: on failure, restart up to 3 times, 10 seconds apart
//        env.getConfig().setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 10000));
//        // Keep checkpoint data when the job exits abnormally or is cancelled manually
//        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
//        // Checkpointing mode (when integrating with Kafka, it should be EXACTLY_ONCE)
//        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        // Trigger a checkpoint every 20 minutes
        env.enableCheckpointing(20 * 60 * 1000, CheckpointingMode.AT_LEAST_ONCE);


        // Kafka connection settings
        Properties properties = ConfigHelper.getKafkaProperties();
        FlinkKafkaConsumer<String> myConsumer = new FlinkKafkaConsumer<>("test", new SimpleStringSchema(), properties);
        myConsumer.setStartFromGroupOffsets();

        // Source: raw Kafka messages
        DataStreamSource<String> stringDataStreamSource = env.addSource(myConsumer);

        // Convert each Kafka message into a Hive entity
        SingleOutputStreamOperator<HiveDataEntity> outputStreamOperator = stringDataStreamSource.process(new MyProcessFunction());

        outputStreamOperator.addSink(new SinkHive()).name("kafka-to-hive");
        env.execute("kafka-to-hive-job");

    }

}
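
Note that the job sets TimeCharacteristic.EventTime but never assigns timestamps or watermarks. Nothing in this pipeline actually uses event time, so it runs as-is; if event-time windows or timers are added later, the source stream needs a watermark assigner. A minimal sketch (not part of the original job; treating the first "||"-separated field as a "yyyy-MM-dd HH:mm:ss" timestamp is an assumption based on HiveDataEntity):

package com.xiaostudy.flink.job;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.windowing.time.Time;

public class WatermarkSupport {

    // Call right after env.addSource(myConsumer) in StartMain
    public static DataStream<String> withEventTime(DataStream<String> source) {
        return source.assignTimestampsAndWatermarks(
                new BoundedOutOfOrdernessTimestampExtractor<String>(Time.seconds(10)) {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public long extractTimestamp(String element) {
                        // First "||"-separated field is the sjsj timestamp (assumed yyyy-MM-dd HH:mm:ss)
                        return java.sql.Timestamp.valueOf(element.split("\\|\\|")[0]).getTime();
                    }
                });
    }
}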

4. The ProcessFunction class

package com.xiaostudy.flink.job;

import com.xiaostudy.flink.constant.StringConstants;
import com.xiaostudy.flink.entity.HiveDataEntity;
import com.xiaostudy.flink.enumeration.KafkaFieldEnum;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;

import java.util.List;

public class MyProcessFunction extends ProcessFunction<String, HiveDataEntity> {

    private static final long serialVersionUID = -2852475205749156550L;

    @Override
    public void processElement(String value, Context context, Collector<HiveDataEntity> out) {
        System.out.println(String.format("message: %s", value));
        out.collect(convert(value));
    }

    private static HiveDataEntity convert(String kafkaValue) {
        try {

            HiveDataEntity glycEntity = new HiveDataEntity();
            if (null == kafkaValue || kafkaValue.trim().length() <= 0) {
                return glycEntity;
            }
            String[] values = kafkaValue.split(StringConstants.KAFKA_SPLIT_CHAR);
            List<String> fieldNames = KafkaFieldEnum.getNameList();
            if (fieldNames.size() != values.length) {
                return glycEntity;
            }
            String sjsj = values[0];
            String ds = values[1];

            glycEntity.setDs(ds);
            glycEntity.setSjsj(sjsj);

            return glycEntity;
        } catch (Exception e) {
            e.printStackTrace();
            return new HiveDataEntity();
        }
    }

}
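
Each Kafka message is a single string whose fields are joined by "||" in the order defined by KafkaFieldEnum (step 8). A quick standalone check of the split logic, using a made-up sample message:

public class SampleMessageCheck {

    public static void main(String[] args) {
        // Hypothetical message: sjsj (timestamp) and ds, joined by "||"
        String kafkaValue = "2021-04-21 12:00:00||1";
        // Same regex as StringConstants.KAFKA_SPLIT_CHAR
        String[] values = kafkaValue.split("\\|\\|");
        System.out.println(values[0]); // 2021-04-21 12:00:00
        System.out.println(values[1]); // 1
    }
}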

5. The SinkFunction class

package com.xiaostudy.flink.sink;

import com.xiaostudy.flink.constant.StringConstants;
import com.xiaostudy.flink.entity.HiveDataEntity;
import com.xiaostudy.flink.util.HiveConfig;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

public class SinkHive extends RichSinkFunction<HiveDataEntity> {

    private static final long serialVersionUID = 1L;

    private final String ycHiveTableName = "hive_table";

    private transient Connection connection;

    // 1. Initialization; under normal circumstances this runs only once
    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        Class.forName(StringConstants.HIVE_DRIVER_NAME);
        connection = DriverManager.getConnection(HiveConfig.HIVE_ADDR, HiveConfig.HIVE_USER, HiveConfig.HIVE_PASSWORD);
        System.out.println("SinkHive-open");
    }

    // 2. Called once per record
    @Override
    public void invoke(HiveDataEntity glycEntity, Context context) throws Exception {
        if (null == glycEntity) {
            return;
        }
        // Bind values as parameters instead of concatenating them into the SQL,
        // and close the statement after each call so statements do not leak
        String sql = String.format("insert into test.%s partition(dt=20210421) values (?, ?)", ycHiveTableName);
        try (PreparedStatement statement = connection.prepareStatement(sql)) {
            statement.setString(1, glycEntity.getSjsj());
            statement.setString(2, glycEntity.getDs());
            statement.execute();
        }

        System.out.println("SinkHive-invoke");
    }

    // 3. Cleanup; under normal circumstances this runs only once
    @Override
    public void close() throws Exception {
        super.close();
        if (connection != null) {
            connection.close();
        }
        System.out.println("SinkHive-close");
    }
}
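
The sink assumes the target table test.hive_table already exists. A one-time setup sketch over the same Hive JDBC connection; the schema (string columns sjsj and ds, partitioned by dt) is an assumption inferred from the insert statement above, since the original post does not show the DDL:

package com.xiaostudy.flink.sink;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.Statement;

import com.xiaostudy.flink.constant.StringConstants;
import com.xiaostudy.flink.util.HiveConfig;

public class CreateHiveTable {

    public static void main(String[] args) throws Exception {
        Class.forName(StringConstants.HIVE_DRIVER_NAME);
        try (Connection conn = DriverManager.getConnection(HiveConfig.HIVE_ADDR, HiveConfig.HIVE_USER, HiveConfig.HIVE_PASSWORD);
             Statement stmt = conn.createStatement()) {
            // Assumed schema: matches the insert in SinkHive (sjsj, ds; partitioned by dt)
            stmt.execute("create table if not exists test.hive_table ("
                    + "sjsj string, ds string"
                    + ") partitioned by (dt string)");
        }
    }
}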

6. The HiveConfig class

package com.xiaostudy.flink.util;

public class HiveConfig {

    // Hive connection info
    public static final String HIVE_ADDR = "jdbc:hive2://localhost:10001/test";
    public static final String HIVE_USER = "liwei";
    public static final String HIVE_PASSWORD = "liwei";
}

7. The Kafka configuration helper class

package com.xiaostudy.flink.helper;

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

public class ConfigHelper {

    /**
     * The Kafka configuration.
     */
    private final static Properties KAFKA_PROPERTIES = new Properties();

    /**
     * Get the Kafka configuration.
     *
     * @return a copy of the loaded properties
     */
    public static Properties getKafkaProperties() {
        if (KAFKA_PROPERTIES.isEmpty()) {
            synchronized (KAFKA_PROPERTIES) {
                loadProperties(KAFKA_PROPERTIES, "kafka.properties");
            }
        }
        Properties properties = new Properties();
        for (String key : KAFKA_PROPERTIES.stringPropertyNames()) {
            properties.setProperty(key, KAFKA_PROPERTIES.getProperty(key));
        }
        return properties;
    }

    /**
     * @param properties target properties object to fill
     * @param file       name of the resource file on the classpath
     */
    private static synchronized void loadProperties(Properties properties, String file) {
        if (properties.isEmpty()) {
            try (InputStream in = Thread.currentThread().getContextClassLoader().getResourceAsStream(file)) {
                // getResourceAsStream returns null (not an IOException) when the file is missing
                if (null == in) {
                    throw new RuntimeException("config file not found on classpath: " + file);
                }
                properties.load(in);
                if (properties.isEmpty()) {
                    throw new RuntimeException("config file is empty: " + file);
                }
            } catch (IOException e) {
                throw new RuntimeException("failed to read the kafka config file", e);
            }
        }
    }
}
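
Note that loadProperties resolves the file through the context class loader, so kafka.properties must be on the classpath; with the Maven layout from step 2 that means src/main/resources/kafka.properties (contents in step 11).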

8. The Kafka field enum

package com.xiaostudy.flink.enumeration;

import java.util.ArrayList;
import java.util.List;

/**
 * Kafka record fields; the declaration order must not change.
 *
 * @author liwei
 * @date 2021/3/5 21:32
 */
public enum KafkaFieldEnum {
    SJSJ("sjsj")
    , DS("ds")
    ;

    private String name;

    private KafkaFieldEnum(String name) {
        this.name = name;
    }

    public static List<String> getNameList() {
        List<String> list = new ArrayList<>();
        for (KafkaFieldEnum field : KafkaFieldEnum.values()) {
            list.add(field.name);
        }
        return list;
    }
}
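
The position of each constant doubles as an index into the split message: MyProcessFunction reads values[0] as sjsj and values[1] as ds, and drops messages whose field count differs from the enum's size, so the enum and the message layout must stay in step.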

9. The Hive entity class

package com.xiaostudy.flink.entity;

import java.io.Serializable;

/**
 * @author liwei
 * @since 2021-03-10
 */
public class HiveDataEntity implements Serializable {

    private static final long serialVersionUID = 1L;

    // record timestamp, format yyyy-MM-dd HH:mm:ss
    private String sjsj;

    private String ds;

    public String getSjsj() {
        return sjsj;
    }

    public void setSjsj(String sjsj) {
        this.sjsj = sjsj;
    }

    public String getDs() {
        return ds;
    }

    public void setDs(String ds) {
        this.ds = ds;
    }
}

10. The constants class

package com.xiaostudy.flink.constant;

public class StringConstants {

    // "||" delimiter, double-escaped because String.split takes a regex
    public static final String KAFKA_SPLIT_CHAR = "\\|\\|";

    public static String HIVE_DRIVER_NAME = "org.apache.hive.jdbc.HiveDriver";

}

11. kafka.properties

bootstrap.servers=localhost:9092
group.id=test1
enable.auto.commit=true
auto.commit.interval.ms=1000
session.timeout.ms=300000
key.deserializer=org.apache.kafka.common.serialization.StringDeserializer
value.deserializer=org.apache.kafka.common.serialization.StringDeserializer
message.timeout.ms=300000
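
To push a test record through the pipeline, a throwaway producer is enough. A minimal sketch (topic "test" as in StartMain, broker address from the file above; the payload is a made-up example in the sjsj||ds format):

import java.util.Properties;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class TestProducer {

    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "localhost:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // One record in the format MyProcessFunction expects
            producer.send(new ProducerRecord<>("test", "2021-04-21 12:00:00||1"));
        }
    }
}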

12. Run commands, for example:

flink run -m yarn-cluster -yjm 1G -ytm 1G -yn 1 -ys 1 -c com.xiaostudy.flink.job.StartMain /.../flink-kafka-to-hive.jar

Or:

/.../flink/bin/yarn-session.sh -n 7 -s 15 -jm 8g -tm 32g -nm flink_kafka_to_hive -d
/.../bin/flink  run  -yid  application_***_0001 /.../flink-kafka-to-hive.jar
Note: application_***_0001 is the YARN application ID of the yarn-session started by the first command above.
Original post (in Chinese): https://www.cnblogs.com/xiaostudy/p/14682290.html