storm(5)-分布式单词计数例子

例子需求

spout:向后端发送{"sentence":"my dog has fleas"}。一般要连数据源,此处简化写死了。

语句分割bolt(SplitSentenceBolt):订阅spout发送的tuple。每收到一个tuple,bolt会获取"sentence"对应值域的值,然后分割为一个个的单词。最后,每个单词向后发送1个tuple:

{"word":"my"}
{"word":"dog"}
{"word":"has"}
{"word":"fleas"}

单词计数bolt(WordCountBolt):订阅SplitSentenceBolt的输出。每当接收到1个tuple,会将对应单词的计数加1,最后向后发送该单词当前的计数。

{"word":"dog","count":5}

上报bolt:接收WordCountBolt输出,维护各单词对应计数表,并修改累计计数值。

代码实现:

package com.ebc.spout;

import com.ebc.Utils;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;

import java.util.Map;

/**
 * @author yaoyuan2
 * @date 2019/4/11
 */
public class SentenceSpout extends BaseRichSpout {
    private SpoutOutputCollector collector;
    private final String [] sentences = {"a b", "c a"};
    private int index = 0;
    /**
     * 所有spout组件在初始化时调用这个方法。
     * @param conf:storm配置信息
     * @param context:topology中组件信息
     * @param collector:提供了emit tuple方法
     * @return void
     */
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void nextTuple() {
        this.collector.emit(new Values(sentences[index]));
        index++;
        if (index >= sentences.length) {
            index = 0;
        }
        Utils.waitForMillis(1);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("sentence"));
    }
}
package com.ebc.blot;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import java.util.Map;

/**
 * @author yaoyuan2
 * @date 2019/4/11
 */
public class SplitSentenceBolt extends BaseRichBolt {
    private OutputCollector collector;
    /**
     * 类似spout中的open()方法,可初始化数据库连接
     * @param stormConf
     * @param context
     * @param collector
     * @return void
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }
    /**
     * <pre>
     * 每当从订阅的数据流中接收1个tuple,都会调用这个方法。
     * 形如:{"sentence":"my dog has fleas"}
     * </pre>
     * @param input
     * @return void
     */
    @Override
    public void execute(Tuple input) {
        String sentence = input.getStringByField("sentence");
        String [] words = sentence.split(" ");
        for (String word:words) {
            this.collector.emit(new Values(word));
        }
    }
    /**
     * <pre>
     * 每个tuple包含一个"word",如
     * {"word":"my"}
     * {"word":"dog"}
     * {"word":"has"}
     * {"word":"fleas"}
     * </pre>
     * @param declarer
     * @return void
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }
}
package com.ebc.blot;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

import java.util.HashMap;
import java.util.Map;

/**
 * @author yaoyuan2
 * @date 2019/4/11
 */
public class WordCountBolt extends BaseRichBolt {
    private OutputCollector collector;
    private HashMap<String,Long> counts = null;
    /**
     * <pre>
     *     一般情况下,
     *     在构造函数中,只能对可序列化的对象赋值和实例化
     *     在prepare中,对不可序列化的对象实例化。
     * </pre>
     * @param stormConf
     * @param context
     * @param collector
     * @return void
     */
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
        this.counts = new HashMap<String,Long>();
    }

    @Override
    public void execute(Tuple input) {
        String word = input.getStringByField("word");
        Long count = this.counts.get(word);
        if (count == null) {
            count = 0L;
        }
        count++;
        this.counts.put(word,count);
        this.collector.emit(new Values(word,count));

    }
    /**
     * 形如:{"word":"dog","count":5}
     * @param declarer 
     * @return void
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word","count"));
    }
}
package com.ebc.blot;

import lombok.extern.slf4j.Slf4j;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

import java.util.*;

/**
 * @author yaoyuan2
 * @date 2019/4/11
 */
@Slf4j
public class ReportBolt extends BaseRichBolt {
    private HashMap<String,Long> counts = null;
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.counts = new HashMap<String,Long>();
    }

    @Override
    public void execute(Tuple input) {
        String word = input.getStringByField("word");
        Long count = input.getLongByField("count");
        this.counts.put(word,count);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        //该bolt不再向后emit任何数据流
    }
    /**
     * <pre>
     * 当topology关闭时输出最终的计数结果。
     * 通常,clean()方法用来释放bolt占用的资源,如数据库连接。
     * 注意,当topology在strom集群上运行时,clean()不能保证会执行,但本地模式能保证执行。
     * </pre>
     * @return void
     */
    @Override
    public void cleanup() {
        log.info("最终counts");
        List<String> keys = new ArrayList<String>();
        keys.addAll(this.counts.keySet());
        Collections.sort(keys);
        for (String key : keys) {
            System.out.println(key + " : " + this.counts.get(key));
        }
        log.info("---------");

    }
}
package com.ebc;

import com.ebc.blot.ReportBolt;
import com.ebc.blot.SplitSentenceBolt;
import com.ebc.blot.WordCountBolt;
import com.ebc.spout.SentenceSpout;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;

/**
 * @author yaoyuan2
 * @date 2019/4/12
 */
public class WordCountTopology {
    private static final String SENTENCE_SPOUT_ID = "sentence-spout";
    private static final String SPLIT_BOLT_ID = "split-bolt";
    private static final String COUNT_BOLT_ID = "count-bolt";
    private static final String REPORT_BOLT_ID = "report-bolt";
    private static final String TOPOLOGY_NAME = "word-count-topology";

    public static void main(String[] args) {
        SentenceSpout spout = new SentenceSpout();
        SplitSentenceBolt splitBolt = new SplitSentenceBolt();
        WordCountBolt countBolt = new WordCountBolt();
        ReportBolt reportBolt = new ReportBolt();

        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout(SENTENCE_SPOUT_ID,spout);
        //SentenceSpout --> SplitSentenceBolt。shuffleGrouping:要求spout将tuple随机均匀的分发给splitBolt实例
        builder.setBolt(SPLIT_BOLT_ID,splitBolt).shuffleGrouping(SENTENCE_SPOUT_ID);
        //SplitSentenceBolt --> WordCountBolt。fieldsGrouping:field="word"的tuple会被路由到同一个WordCountBolt实例中。
        builder.setBolt(COUNT_BOLT_ID,countBolt).fieldsGrouping(SPLIT_BOLT_ID,new Fields("word"));
        //WordCountBolt --> ReportBolt。globalGrouping:将WordCountBolt上所有的tuple都发送到唯一的一个ReportBolt实例中。此时并发度失去了意义。
        builder.setBolt(REPORT_BOLT_ID,reportBolt).globalGrouping(COUNT_BOLT_ID);

        Config config = new Config();
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology(TOPOLOGY_NAME,config,builder.createTopology());
        Utils.waitForSeconds(10);
        cluster.killTopology(TOPOLOGY_NAME);
        cluster.shutdown();
    }
}

输出:

10:16:22.675 [SLOT_1027] INFO com.ebc.blot.ReportBolt:45 - 最终counts
a : 7526
b : 3763
c : 3763
10:16:22.686 [SLOT_1027] INFO com.ebc.blot.ReportBolt:52 - ---------

原文地址:https://www.cnblogs.com/yaoyuan2/p/10690863.html