Big Data Homework: Simple Data Operations with MapReduce

Map/Reduce Programming Assignment

Given student.txt and student_score.txt, upload both files to HDFS (a minimal upload sketch follows the task list below), then use the Map/Reduce framework to complete the tasks that follow.

student.txt

2016001,王毅
2016002,张小明
2016003,李学彭
2016004,王东
2016005,王笑笑

student_score.txt

2016001,操作系统,60
2016001,数据库,88
2016001,大数据概论,85
2016002,操作系统,91
2016002,大数据概论,91
2016003,大数据概论,56
2016003,操作系统,88
2016004,数据库,90
2016004,大数据概论,82
2016004,操作系统,78
2016005,操作系统,69
2016005,大数据概论,70
2016005,数据库,89

1) Join student.txt and student_score.txt, and output the student ID, name, course, and score fields.

2) Compute each student's average score; output the student ID, name, and average, sorted by average in descending order.

3) For each course, report the highest score, the lowest score, and the average score.
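
Before anything runs, the two files have to be in HDFS. The jobs below read from /StudentInput, so a minimal upload sketch (equivalent to hdfs dfs -put; it assumes a default Configuration that points at your cluster and the two files sitting in the current local directory) looks like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class Upload {
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration(); // picks up core-site.xml from the classpath
		FileSystem fs = FileSystem.get(conf);
		fs.mkdirs(new Path("/StudentInput"));
		// Copy the local files into the input directory the jobs read from
		fs.copyFromLocalFile(new Path("student.txt"), new Path("/StudentInput/student.txt"));
		fs.copyFromLocalFile(new Path("student_score.txt"), new Path("/StudentInput/student_score.txt"));
		fs.close();
	}
}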

Problem 1:

        StudentScore1.java

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class StudentScore1 {

	public static void main(String[] args) throws IllegalArgumentException, IOException, ClassNotFoundException, InterruptedException {
		Configuration conf=new Configuration();
		Job job=Job.getInstance(conf,"StudentScore1");
		job.setJarByClass(StudentScore1.class);
        
		job.setMapperClass(ScoreMapper.class);
		// Declare the map output key/value types explicitly (they differ from the job's final output types)
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(SC.class);
		
		job.setReducerClass(ScoreReduce.class);
		// Final (reducer) output types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		
		// Input location
		FileInputFormat.addInputPath(job,new Path("/StudentInput"));
		// Output location
		FileOutputFormat.setOutputPath(job, new Path("/Output1"));
		
		System.exit(job.waitForCompletion(true)?0:1);
	}
     public static class ScoreMapper extends Mapper<Object, Text, Text, SC>{

		@Override
		protected void map(Object key, Text value, Mapper<Object, Text, Text, SC>.Context context)
				throws IOException, InterruptedException {
			// Split on ","
			// student:        2016001,王毅         -> [2016001, 王毅]
			// student_score:  2016001,操作系统,60   -> [2016001, 操作系统, 60]
			String[] words=value.toString().split(",");
			// The student ID is the join key
			String Sid=words[0];
			SC sc=new SC();
			// Decide which file this record came from by its field count
			if(words.length==2) {// two fields: a student record
				sc.setSid(Sid);
				sc.setName(words[1]);
				sc.setTable("Student");
			    context.write(new Text(Sid), sc);
			}else {// three fields: a course-score record
				sc.setSid(Sid);
				sc.setCourse(words[1]);
				sc.setScore(Integer.parseInt(words[2]));
				sc.setTable("Student_score");
				context.write(new Text(Sid), sc);
			}
    	 
         }
      }
     public static class ScoreReduce extends Reducer<Text, SC, Text, NullWritable>{

		@Override
		protected void reduce(Text key, Iterable<SC> values,
				Reducer<Text, SC, Text,NullWritable>.Context context) throws IOException, InterruptedException {
			
			List<SC> list=new ArrayList<SC>();
			String Name="";
			// Iterate over every record that shares this student ID
			for(SC value:values) {
				
				if(value.getTable().equals("Student")) {// the student record: remember the name
					Name=value.getName();
				}else {// a score record: buffer it for output
					// Hadoop reuses the value object across iterations of this loop,
					// so each score record must be deep-copied before it is buffered
					SC sc=new SC();
					try {
						BeanUtils.copyProperties(sc, value);
						list.add(sc);
					} catch (IllegalAccessException | InvocationTargetException e) {
						throw new IOException("failed to copy score record", e);
					}
				}
			}
			// Attach the name to each buffered score record and emit it
			for(SC sc:list) {
				sc.setName(Name);
				context.write(new Text(sc.toString()), NullWritable.get());
			}
		}
    	 
     }
}

         SC.java

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Value type shared by both inputs; Table records which file a record came from
public class SC implements Writable{

	private String Name="";
	private String Sid="";
	private String Course="";
	private String Table="";
	private int Score=0;
	public String getName() {
		return Name;
	}
	public void setName(String name) {
		Name = name;
	}
	public String getSid() {
		return Sid;
	}
	public void setSid(String sid) {
		Sid = sid;
	}
	public String getCourse() {
		return Course;
	}
	public void setCourse(String course) {
		Course = course;
	}
	public String getTable() {
		return Table;
	}
	public void setTable(String table) {
		Table = table;
	}
	public int getScore() {
		return Score;
	}
	public void setScore(int score) {
		Score = score;
	}
	
	@Override
	public String toString() {
		return  Sid + "," + Name + "," + Course + "," + Score;
	}
	@Override
	public void readFields(DataInput in) throws IOException {
		this.Sid=in.readUTF();
		this.Name=in.readUTF();
		this.Course=in.readUTF();
		this.Table=in.readUTF();
		this.Score=in.readInt();
		
	}
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(Sid);
		out.writeUTF(Name);
		out.writeUTF(Course);
		out.writeUTF(Table);
		out.writeInt(Score);
	}
}
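
The Writable contract requires write() and readFields() to handle the fields in exactly the same order, as they do above. A quick stand-alone round-trip check (a hypothetical helper, not part of the assignment) confirms the symmetry:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class SCRoundTrip {
	public static void main(String[] args) throws Exception {
		SC in = new SC();
		in.setSid("2016001");
		in.setName("王毅");
		in.setCourse("操作系统");
		in.setTable("Student_score");
		in.setScore(60);

		// Serialize, then deserialize into a fresh object
		ByteArrayOutputStream bytes = new ByteArrayOutputStream();
		in.write(new DataOutputStream(bytes));
		SC out = new SC();
		out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

		System.out.println(out); // prints: 2016001,王毅,操作系统,60
	}
}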

Result:

2016001,王毅,操作系统,60
2016001,王毅,数据库,88
2016001,王毅,大数据概论,85
2016002,张小明,操作系统,91
2016002,张小明,大数据概论,91
2016003,李学彭,操作系统,88
2016003,李学彭,大数据概论,56
2016004,王东,大数据概论,82
2016004,王东,操作系统,78
2016004,王东,数据库,90
2016005,王笑笑,数据库,89
2016005,王笑笑,操作系统,69
2016005,王笑笑,大数据概论,70

Problem 2:

Average2 reads the joined records that Problem 1 wrote to /Output1, so the join job must run first.

         Average2.java

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Average2 {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf=new Configuration();
		Job job=Job.getInstance(conf,"Average2");
		
		job.setJarByClass(Average2.class);
		job.setMapperClass(Average2Mapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(DoubleWritable.class);
		
		
		job.setReducerClass(Average2Reduce.class);
		// The in-reducer TreeMap sort below is only a global sort with a single reducer
		job.setNumReduceTasks(1);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(DoubleWritable.class);
		
		FileInputFormat.addInputPath(job, new Path("/Output1"));
		FileOutputFormat.setOutputPath(job, new Path("/Output2"));
		System.exit(job.waitForCompletion(true)?0:1);

	}
 
     public static class Average2Mapper extends Mapper<Object,Text,Text,DoubleWritable>{
		@Override
		protected void map(Object key, Text value, Mapper<Object, Text, Text, DoubleWritable>.Context context)
				throws IOException, InterruptedException {
			// Split the joined record: ID,name,course,score
			String[] words=value.toString().split(",");
			// keybuf = "2016001,王毅," -- student ID and name together
			StringBuffer keybuf=new StringBuffer();
			keybuf.append(words[0]).append(",").append(words[1]).append(",");
			// the score for one course
			Double score=Double.parseDouble(words[3]);
			context.write(new Text(keybuf.toString()), new DoubleWritable(score));
		}
     }
     
    public static class Average2Reduce extends Reducer<Text,DoubleWritable,Text,DoubleWritable>{
        // Sort averages in descending order; keep a list of students per average
        // so that students with identical averages are not overwritten
    	private TreeMap<Double, List<String>> treeMap=new TreeMap<Double, List<String>>(new Comparator<Double>() {
			@Override
			public int compare(Double x, Double y) {
				return y.compareTo(x);
			}
		});
    	 
		@Override
		protected void reduce(Text key, Iterable<DoubleWritable> values,
				Reducer<Text, DoubleWritable, Text, DoubleWritable>.Context context)
				throws IOException, InterruptedException {
			// reduce sees [key, <score1, score2, ...>] for one student
			Double sumscore=0.0;
			int num=0;
			for(DoubleWritable value:values) {
				num++;
				sumscore=sumscore+value.get();
			}
			Double avg= sumscore/num;
			// Buffer the result instead of writing it; the TreeMap keeps it sorted.
			// Students sharing the same average are appended to the same list.
			List<String> students=treeMap.get(avg);
			if(students==null) {
				students=new ArrayList<String>();
				treeMap.put(avg, students);
			}
			students.add(key.toString());
		}
                // Emit the sorted results once every student has been reduced
		@Override
		protected void cleanup(Context context) throws IOException, InterruptedException {
			for(Map.Entry<Double, List<String>> entry:treeMap.entrySet()) {
				for(String student:entry.getValue()) {
					context.write(new Text(student), new DoubleWritable(entry.getKey()));
				}
			}
		}
    	 
     }
}

Result (for example, 王东's average is (90 + 82 + 78) / 3 ≈ 83.33):

2016002,张小明,	91.0
2016004,王东,	83.33333333333333
2016001,王毅,	77.66666666666667
2016005,王笑笑,	76.0
2016003,李学彭,	72.0

Problem 3:

Course3 likewise takes the joined records in /Output1 as its input.

         Course3.java

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Course3 {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		Configuration conf=new Configuration();
		Job job=Job.getInstance(conf,"Course3");
		
		job.setJarByClass(Course3.class);
		job.setMapperClass(Course3Mapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		
		job.setReducerClass(Course3Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		
		FileInputFormat.addInputPath(job, new Path("/Output1"));
		FileOutputFormat.setOutputPath(job, new Path("/Output3"));
		System.exit(job.waitForCompletion(true)?0:1);
		

	}
 
     public static class Course3Mapper extends Mapper<Object,Text,Text,IntWritable>{

		@Override
		protected void map(Object key, Text value, Mapper<Object, Text,Text, IntWritable>.Context context)
				throws IOException, InterruptedException {
			// Split the joined record: ID,name,course,score
			String[] words=value.toString().split(",");
			int Score=Integer.parseInt(words[3]);
			// key = course name, value = one student's score in that course
			context.write(new Text(words[2]), new IntWritable(Score));
			
		}
     }
     
     
     public static class Course3Reduce extends Reducer<Text,IntWritable,Text,Text>{

		@Override
		protected void reduce(Text key, Iterable<IntWritable> values,
				Reducer<Text, IntWritable, Text, Text>.Context context) throws IOException, InterruptedException {
			
			int mmax=0;//highest score seen so far
			int mmin=101;//lowest score seen so far (scores never exceed 100)
			double avg=0;//running sum, divided by num below to get the average
			int num=0;//number of students taking this course
			for(IntWritable value:values) {
				num++;
				if(value.get()>mmax) mmax=value.get();
				if(value.get()<mmin) mmin=value.get();
				avg=avg+value.get();
			}
			avg=avg/num;
			String score=String.valueOf(mmax)+","+String.valueOf(mmin)+","+String.valueOf(avg);
			context.write(key,new Text(score));
		}
     }
}
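
As a quick cross-check of the job's output, a small stand-alone program (a hypothetical helper, no Hadoop involved; it assumes student_score.txt sits in the current local directory) can recompute the same statistics:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.IntSummaryStatistics;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class CourseCheck {
	public static void main(String[] args) throws IOException {
		// Group the raw scores by course name
		Map<String, List<Integer>> byCourse = new TreeMap<String, List<Integer>>();
		for (String line : Files.readAllLines(Paths.get("student_score.txt"))) {
			String[] f = line.split(",");
			byCourse.computeIfAbsent(f[1], k -> new ArrayList<>()).add(Integer.parseInt(f[2]));
		}
		// Print course -> max,min,average in the same shape as the job's output
		for (Map.Entry<String, List<Integer>> e : byCourse.entrySet()) {
			IntSummaryStatistics s = e.getValue().stream().mapToInt(Integer::intValue).summaryStatistics();
			System.out.println(e.getKey() + "\t" + s.getMax() + "," + s.getMin() + "," + s.getAverage());
		}
	}
}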

Result:

大数据概论	91,56,76.8
操作系统	91,60,77.2
数据库	90,88,89.0
Original article: https://www.cnblogs.com/msq2000/p/11801074.html