Flume Notes -- Analysis and Application of batchSize and transactionCapacity

Understanding Flume NG's batchSize and transactionCapacity parameters and the transaction mechanism: http://blog.itpub.net/29754888/viewspace-1220545/
A concrete analysis of the transactionCapacity and batchSize concepts in Flume: http://www.mamicode.com/info-detail-513301.html

Custom sink implementation and property injection: http://www.coderli.com/flume-ng-sink-properties/
Custom interceptors: http://blog.csdn.net/xiao_jun_0820/article/details/38333171
Custom Kafka sink: www.itnose.net/detail/6187977.html


batchSize is a concept defined for sources and sinks: it limits how many events a source or sink processes as one batch. "One batch" here means within a single transaction; the component keeps handling events until it reaches batchSize (or the channel runs dry), at which point the transaction is committed.
The larger the value, the more events each committed transaction covers, so bookkeeping such as clearing the takeList happens less often and performance improves; the trade-off is that when an error occurs, the scope of the rollback grows correspondingly.

Official description of transactionCapacity: the maximum number of events the channel passes through in one transaction. Concretely, it is the capacity of the channel's putList and takeList. In Flume 1.5, the SpillableMemoryChannel sizes its putList and takeList from the largestTakeTxSize and largestPutTxSize parameters, whose value is 5000.

Official description of capacity: "The maximum number of events stored in the channel", i.e. the maximum number of events the channel can hold.
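
Taken together, a sensible configuration keeps batchSize <= transactionCapacity <= capacity. The sketch below shows where each parameter lives in an agent configuration; the component names (a1, c1, k1), the memory channel, and the HDFS sink are illustrative placeholders, not from the original post:

# channel-side limits
a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000             # max events the channel can hold
a1.channels.c1.transactionCapacity = 1000   # max events per put/take transaction

# sink-side batch size; should not exceed the channel's transactionCapacity
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.batchSize = 100
a1.sinks.k1.channel = c1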

Example:

Constants.java

package com.evor.kafkawithbatch; // added: KafkaSinkBatch references Constants without an import, so it must share this package

public class Constants {
    static final String BATCH_SIZE = "batchSize";
    static final int DEFAULT_BATCH_SIZE = 50; // default batch size: 50
    static final String TOPIC = "topic";
    static final String DEFAULT_TOPIC = "kafka_flume_topic"; // default topic name
}


KafkaUtil.java
Builds the Kafka producer Properties

package com.evor.kafkawithbatch; // added: referenced from KafkaSinkBatch without an import, so it must share this package

import java.util.Properties;
import org.apache.flume.Context;

public class KafkaUtil {
    public static Properties getKafkaConfig(Context context) {
        // note: the context parameter is not actually used here; the broker list is hardcoded
        Properties props = new Properties();
        props.setProperty("metadata.broker.list", "192.168.1.160:9092,192.168.1.164:9093");
        props.setProperty("serializer.class", "kafka.serializer.StringEncoder");
        props.put("request.required.acks", "1");

        return props;
    }
}
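
Since the Context argument goes unused, the broker addresses are hardcoded. A minimal sketch of how the same method could read them from the agent configuration instead; the property key "brokerList" is an assumption for illustration, not part of the original code:

// Hypothetical drop-in replacement for getKafkaConfig, reading the broker
// list from the Flume config, e.g. a1.sinks.k1.brokerList = host1:9092,host2:9093
public static Properties getKafkaConfig(Context context) {
    Properties props = new Properties();
    // "brokerList" is an illustrative key; the original hardcoded value remains the fallback
    String brokers = context.getString("brokerList", "192.168.1.160:9092,192.168.1.164:9093");
    props.setProperty("metadata.broker.list", brokers);
    props.setProperty("serializer.class", "kafka.serializer.StringEncoder");
    props.put("request.required.acks", "1");
    return props;
}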


KafkaSinkBatch.java
Custom Kafka sink

package com.evor.kafkawithbatch;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Simplified version of KafkaSinkBatch2 (see appendix below)
public class KafkaSinkBatch extends AbstractSink implements Configurable {
    private static final Logger log = LoggerFactory.getLogger(KafkaSinkBatch.class);
    private Properties kafkaProps;
    private Producer<String,String> producer;
    private int batchSize; // number of events per transaction, committed as one batch
    private List<KeyedMessage<String, String>> messageList;

    @Override
    public Status process() throws EventDeliveryException {
        Status result = Status.READY;
        Channel channel = getChannel();
        Transaction transaction = null;
        Event event = null;
        String eventTopic = "kafka_flume_topic"; // topic is hardcoded in this simplified version
        try {
            long processedEvent = 0;
            transaction = channel.getTransaction();
            transaction.begin(); // start the transaction
            messageList.clear(); // must clear, or events from previous batches would be re-sent (this line was commented out in the original)
            for (; processedEvent < batchSize; processedEvent++) {
                event = channel.take(); // take one event from the channel
                if (event == null) {
                    break;
                }
                // a Flume event consists of headers and a body
                byte[] eventBody = event.getBody();
                KeyedMessage<String, String> data = new KeyedMessage<String, String>(eventTopic, new String(eventBody) + " batch size: " + batchSize);
                messageList.add(data);
            }
            if (processedEvent > 0) {
                producer.send(messageList);
            }
            transaction.commit(); // up to batchSize events handled; commit the transaction

        } catch (Exception e) {
            String errorMsg = "Failed to publish events!";
            log.error(errorMsg, e);
            result = Status.BACKOFF;
            if (transaction != null) { // guard: getTransaction() itself may have thrown
                transaction.rollback(); // return the taken events to the channel
                log.debug("transaction rolled back");
            }
        } finally {
            if (transaction != null) {
                transaction.close();
            }
        }
        return result;
    }

    @Override
    public synchronized void start() {
        ProducerConfig config = new ProducerConfig(kafkaProps);
        producer = new Producer<String, String>(config);
        super.start();
    }

    @Override
    public synchronized void stop() {
        producer.close();
        super.stop();
    }

    @Override
    public void configure(Context context) {
        // read settings from the Flume agent configuration, falling back to defaults
        batchSize = context.getInteger(Constants.BATCH_SIZE, Constants.DEFAULT_BATCH_SIZE);
        messageList = new ArrayList<KeyedMessage<String, String>>(batchSize);
        kafkaProps = KafkaUtil.getKafkaConfig(context); // build the producer properties via the helper
    }

}
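
To use the sink, point a sink definition at the class's fully qualified name and set batchSize, the key read in configure() via Constants.BATCH_SIZE. A minimal wiring sketch, with illustrative agent/component names and channel sizes:

a1.sinks.k1.type = com.evor.kafkawithbatch.KafkaSinkBatch
a1.sinks.k1.batchSize = 100                 # falls back to DEFAULT_BATCH_SIZE (50) if omitted
a1.sinks.k1.channel = c1

a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000
a1.channels.c1.transactionCapacity = 1000   # must be >= the sink's batchSize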


Appendix: KafkaSinkBatch2.java
(the full version of KafkaSinkBatch.java, with fuller logging and handling)

package com.evor.kafkawithbatch;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Version of KafkaSinkBatch.java with full logging
public class KafkaSinkBatch2 extends AbstractSink implements Configurable {
    private static final Logger log = LoggerFactory.getLogger(KafkaSinkBatch2.class);

    public static final String KEY_HDR = "key";
    public static final String TOPIC_HDR = "topic";
    private static final String CHARSET = "UTF-8";
    private Properties kafkaProps;
    private Producer<String, String> producer;
    private String topic;
    private int batchSize; // number of events per transaction, committed as one batch
    private List<KeyedMessage<String, String>> messageList;

    @Override
    public Status process() throws EventDeliveryException {
        Status result = Status.READY;
        Channel channel = getChannel();
        Transaction transaction = null;
        Event event = null;
        String eventTopic = null;
        String eventKey = null;
        try {
            long processedEvent = 0;
            transaction = channel.getTransaction();
            transaction.begin(); // start the transaction
            messageList.clear(); // clear messages left over from the previous batch
            for (; processedEvent < batchSize; processedEvent++) {
                event = channel.take(); // take one event from the channel
                if (event == null) {
                    break;
                }
                // a Flume event consists of headers and a body
                Map<String, String> headers = event.getHeaders();
                byte[] eventBody = event.getBody();
                if ((eventTopic = headers.get(TOPIC_HDR)) == null) { // fall back to the configured topic if the event header has none
                    eventTopic = topic;
                }
                eventKey = headers.get(KEY_HDR);

                if (log.isDebugEnabled()) {
                    log.debug("{Event}" + eventTopic + ":" + eventKey + ":"    + new String(eventBody, CHARSET));
                    log.debug("event #{}", processedEvent);
                }

                KeyedMessage<String, String> data = new KeyedMessage<String, String>(eventTopic, eventKey, new String(eventBody, CHARSET));
                messageList.add(data);
                // (a per-event producer.send(data) here is replaced by the single batched send below)

            }
            if (processedEvent > 0) {
                producer.send(messageList);
            }
            transaction.commit(); // up to batchSize events handled; commit the transaction

        } catch (Exception e) {
            String errorMsg = "Failed to publish events!";
            log.error(errorMsg, e);
            result = Status.BACKOFF;
            if (transaction != null) { // guard: getTransaction() itself may have thrown
                transaction.rollback(); // return the taken events to the channel
                log.debug("transaction rolled back");
            }
        } finally {
            if (transaction != null) {
                transaction.close();
            }
        }
        return result;
    }

    @Override
    public synchronized void start() {
        ProducerConfig config = new ProducerConfig(kafkaProps);
        producer = new Producer<String, String>(config);
        super.start();
    }

    @Override
    public synchronized void stop() {
        producer.close();
        super.stop();
    }

    @Override
    public void configure(Context context) {
        // read settings from the Flume agent configuration, falling back to defaults
        batchSize = context.getInteger(Constants.BATCH_SIZE, Constants.DEFAULT_BATCH_SIZE);
        messageList = new ArrayList<KeyedMessage<String, String>>(batchSize);

        log.debug("batch size: {}", batchSize); // fixed: the original lacked the {} placeholder
        
        topic = context.getString(Constants.TOPIC, Constants.DEFAULT_TOPIC);
        if (topic.equals(Constants.DEFAULT_TOPIC)) {
            log.warn("Default topic name [" + Constants.DEFAULT_TOPIC + "]");
        } else {
            log.info("configured topic:[" + topic + "], may be overridden by event headers");
        }
        
        kafkaProps = KafkaUtil.getKafkaConfig(context);
        if (log.isDebugEnabled()) {
            log.debug("Kafka producer properties : " + kafkaProps);
        }
    }

}
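
KafkaSinkBatch2 resolves the topic per event from the "topic" header (TOPIC_HDR), falling back to the configured default. One way to populate that header is Flume's built-in static interceptor on the source; a sketch with illustrative component names:

a1.sources.r1.interceptors = i1
a1.sources.r1.interceptors.i1.type = static
a1.sources.r1.interceptors.i1.key = topic        # matches TOPIC_HDR in the sink
a1.sources.r1.interceptors.i1.value = my_topic   # illustrative topic name

# default used when an event carries no "topic" header (Constants.TOPIC)
a1.sinks.k1.topic = kafka_flume_topic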
Original post: https://www.cnblogs.com/gnivor/p/4988935.html