MapReduce的DBInputFormat使用

  1 package com.mengyao.hadoop.mapreduce;
  2 
  3 import java.io.DataInput;
  4 import java.io.DataOutput;
  5 import java.io.IOException;
  6 import java.sql.PreparedStatement;
  7 import java.sql.ResultSet;
  8 import java.sql.SQLException;
  9 
 10 import org.apache.hadoop.conf.Configuration;
 11 import org.apache.hadoop.conf.Configured;
 12 import org.apache.hadoop.fs.Path;
 13 import org.apache.hadoop.io.LongWritable;
 14 import org.apache.hadoop.io.Text;
 15 import org.apache.hadoop.io.Writable;
 16 import org.apache.hadoop.mapreduce.Job;
 17 import org.apache.hadoop.mapreduce.Mapper;
 18 import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
 19 import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
 20 import org.apache.hadoop.mapreduce.lib.db.DBWritable;
 21 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 22 import org.apache.hadoop.util.Tool;
 23 import org.apache.hadoop.util.ToolRunner;
 24 
 25 /**
 26  * 使用DBInputFormat类读取数据库并将结果数据写到HDFS的/mapreduces/dboutput目录下,
 27  *         /mapreduces/dboutput/_SUCCESS:_SUCCESS空文件表示作业执行成功
 28  *         /mapreduces/dboutput/part-r-00000:文件表示作业的结果,内容如下:
 29  *             0    1    New Balance 999复古跑鞋    ML999GY    ML999GY    999.0
 30  *             1    2    New Balance 999复古跑鞋    ML999BB    ML999BB    999.0
 31  *             2    3    New Balance 996复古跑鞋    MRL996DG    MRL996DG    819.0
 32  *             3    4    New Balance 996复古跑鞋    MRL996EM    MRL996EM    819.0
 33  *             4    5    New Balance 996复古跑鞋    MRL996ES    MRL996ES    819.0
 34  * 这个作业没有Reducer类,在默认的MapReduce作业中,如果输出的key,value是默认的LongWritable, Text,则Reducer类可以省略,省略不写时则默认启动一个Reducer
 35  *
 36  * 一定要记住在使用MapReduce操作数据库时一定要添加JDBC驱动jar包到Hadoop的classpath中,否则会报无法加载JDBC Driver类异常,如下:
 37  *       1、我这里添加到/usr/local/installs/hadoop/share/hadoop/mapreduce/lib/mysql-connector-java-5.1.26-bin.jar这里了,务必要重启集群使classpath生效。
 38  *       2、将JDBC驱动jar包打包到这个MapReduce作业jar包中。
 39  *
 40  * @author mengyao
 41  *
 42  */
 43 public class DBInputFormatApp extends Configured implements Tool {
 44 
 45     /**
 46      * 这个JavaBean需要实现Hadoop的序列化接口Writable和与数据库交互时的序列化接口DBWritable
 47      * 官方API中解释如下:
 48      *         public class DBInputFormat<T extends DBWritable>
 49      *             extends InputFormat<LongWritable, T> implements Configurable
 50      *             即Mapper的Key是LongWritable类型,不可改变;Value是继承自DBWritable接口的自定义JavaBean
 51      *         
 52      * @author mengyao
 53      *
 54      */
 55     static class ProductWritable implements Writable, DBWritable {
 56 
 57         private long id;            // bigint(20) NOT NULL AUTO_INCREMENT,
 58         private String name;        // varchar(50) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT '商品名称',
 59         private String model;        // varchar(30) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT '型号',
 60         private String color;        // varchar(10) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT '颜色',
 61         private double price;        // decimal(10,0) DEFAULT NULL COMMENT '售价',
 62         
 63         @Override
 64         public void write(PreparedStatement ps) throws SQLException {
 65             ps.setLong(1, id);
 66             ps.setString(2, name);
 67             ps.setString(3, model);
 68             ps.setString(4, color);
 69             ps.setDouble(5, price);
 70         }
 71 
 72         @Override
 73         public void readFields(ResultSet rs) throws SQLException {
 74             this.id = rs.getLong(1);
 75             this.name = rs.getString(2);
 76             this.model = rs.getString(3);
 77             this.color = rs.getString(4);
 78             this.price = rs.getDouble(5);
 79         }
 80 
 81         @Override
 82         public void readFields(DataInput in) throws IOException {
 83             this.id = in.readLong();
 84             this.name = in.readUTF();
 85             this.model = in.readUTF();
 86             this.color = in.readUTF();
 87             this.price = in.readDouble();
 88         }
 89 
 90         @Override
 91         public void write(DataOutput output) throws IOException {
 92             output.writeLong(id);
 93             output.writeUTF(name);
 94             output.writeUTF(model);
 95             output.writeUTF(color);
 96             output.writeDouble(price);
 97         }
 98 
 99         @Override
100         public String toString() {
101             return id +"\t"+ name +"\t"+ model +"\t"+ color +"\t"+ price;
102         }
103         
104     }
105     
106     static class DBInputFormatMapper extends Mapper<LongWritable, ProductWritable, LongWritable, Text> {
107         
108         private LongWritable outputKey;
109         private Text outputValue;
110         
111         @Override
112         protected void setup(
113                 Mapper<LongWritable, ProductWritable, LongWritable, Text>.Context context)
114                 throws IOException, InterruptedException {
115             this.outputKey = new LongWritable();
116             this.outputValue = new Text();
117         }
118         
119         @Override
120         protected void map(LongWritable key, ProductWritable value,
121                 Mapper<LongWritable, ProductWritable, LongWritable, Text>.Context context)
122                 throws IOException, InterruptedException {
123             outputKey.set(key.get());
124             outputValue.set(value.toString());
125             context.write(outputKey, outputValue);
126         }
127     }
128     
129     @Override
130     public int run(String[] args) throws Exception {
131         Configuration conf = getConf();
132         //在创建Configuration对象时紧跟着配置当前作业需要使用的JDBC配置
133         DBConfiguration.configureDB(
134                 conf,
135                 "com.mysql.jdbc.Driver",
136                 "jdbc:mysql://192.168.1.10:3306/shops",
137                 "root",
138                 "123456");
139         
140         Job job = Job.getInstance(conf, DBInputFormatApp.class.getSimpleName());
141         job.setJarByClass(DBInputFormatApp.class);
142         
143         job.setInputFormatClass(DBInputFormat.class);
144         FileOutputFormat.setOutputPath(job, new Path(args[0]));
145         
146         job.setMapperClass(DBInputFormatMapper.class);
147         job.setMapOutputKeyClass(LongWritable.class);
148         job.setMapOutputValueClass(Text.class);
149         
150         job.setOutputKeyClass(LongWritable.class);
151         job.setOutputValueClass(Text.class);
152         //配置当前作业要查询的SQL语句和接收查询结果的JavaBean
153         DBInputFormat.setInput(
154                 job,
155                 ProductWritable.class,
156                 "SELECT `id`,`name`,`model`,`color`,`price` FROM `product`",
157                 "SELECT COUNT(1) FROM `product`");
158         
159         return job.waitForCompletion(true)?0:1;
160     }
161      
162     public static int createJob(String[] args) {
163         Configuration conf = new Configuration();
164         conf.set("dfs.datanode.socket.write.timeout", "7200000");
165         conf.set("mapreduce.input.fileinputformat.split.minsize", "268435456");
166         conf.set("mapreduce.input.fileinputformat.split.maxsize", "536870912");
167         int status = 0;
168         
169         try {
170             status = ToolRunner.run(conf, new DBInputFormatApp(), args);
171         } catch (Exception e) {
172             e.printStackTrace();
173         }
174         
175         return status;
176     }
177     
178     public static void main(String[] args) {
179         args = new String[]{"/mapreduces/dboutput"};
180         if (args.length!=1) {
181             System.out.println("Usage: "+DBInputFormatApp.class.getName()+" Input paramters <OUTPUT_PATH>");
182         } else {
183             int status = createJob(args);
184             System.exit(status);
185         }
186 
187     }
188 
189 }
原文地址:https://www.cnblogs.com/mengyao/p/4865565.html