MapReduce的倒排索引

索引：

什么是索引：索引（Index）是帮助数据库高效获取数据的数据结构。索引基于数据库表创建，它保存表中某些列的值以及对应记录的地址，并把这些值组织在特定的数据结构中。最常见的索引结构是哈希表和 B+ 树。

索引的具体分析：https://blog.csdn.net/meiLin_Ya/article/details/80854232

用代码说事，先来看看我的数据吧：

package com.huhu.day05;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.huhu.day04.ProgenyCount;

公共类InvertedIndex扩展ToolRunner实现工具{

	私人配置conf;

	公共静态类MyMapper扩展Mapper <LongWritable，文本，文本，文本> {

		私人FileSplit拆分;
		private Text va = new Text（）;

		@覆盖
		保护无效设置（Mapper <LongWritable，Text，Text，Text> .Context上下文）
				抛出IOException，InterruptedException {
			split =（FileSplit）context.getInputSplit（）;
		}

		@覆盖
		protected void map（LongWritable key，Text value，Context context）throws IOException，InterruptedException {
			String [] line = value.toString（）。split（“”）;
			通信System.err.println（线）;
			String filename = split.getPath（）。getName（）;
			for（String s：line）{
				va.set（“fileName：”+ filename +“：”+ key.get（）+“ t索引位置：”+ value.toString（）。indexOf（s）+“ t”）;
				context.write（new Text（“搜索词：”+ s +“ r”），new Text（va））;
			}

		}
	}

	公共静态类MyReduce扩展Reducer <文本，文本，文本，文本> {

		@覆盖
		保护无效设置（上下文上下文）抛出IOException，InterruptedException {
		}

		@覆盖
		protected void reduce（Text key，Iterable <Text> values，Context context）
				抛出IOException，InterruptedException {
			StringBuffer sb = new StringBuffer（）;
			for（Text v：values）{
				sb.append（v.toString（））;
			}
			context.write（new Text（key），new Text（sb.toString（）））;
		}

		@覆盖
		保护无效清理（上下文上下文）抛出IOException，InterruptedException {
		}
	}

	公共静态无效的主要（字符串[]参数）抛出异常{
		InvertedIndex t = new InvertedIndex（）;
		配置conf = t.getConf（）;
		String [] other = new GenericOptionsParser（conf，args）.getRemainingArgs（）;
		if（other.length！= 2）{
			System.err.println（“number is fail”）;
		}
		int run = ToolRunner.run（conf，t，args）;
		System.exit（运行）;
	}

	@覆盖
	public Configuration getConf（）{
		if（conf！= null）{
			返回conf;
		}
		返回新的配置（）;
	}

	@覆盖
	public void setConf（Configuration arg0）{

	}

	@覆盖
	公共诠释运行（字符串[]其他）抛出异常{
		配置con = getConf（）;
		Job job = Job.getInstance（con）;
		job.setJarByClass（ProgenyCount.class）;
		job.setMapperClass（MyMapper.class）;
		job.setMapOutputKeyClass（Text.class）;
		job.setMapOutputValueClass（Text.class）;

		//默认分区
		// job.setPartitionerClass（HashPartitioner.class）;

		job.setReducerClass（MyReduce.class）;
		job.setOutputKeyClass（Text.class）;
		job.setOutputValueClass（Text.class）;

		FileInputFormat.addInputPath（job，new Path（“hdfs：// ry-hadoop1：8020 / in / day05 / InvertedIndex”））;
		Path path = new Path（“hdfs：// ry-hadoop1：8020 / out / day05.txt”）;
		FileSystem fs = FileSystem.get（getConf（））;
		if（fs.exists（path））{
			fs.delete（path，true）;
		}
		FileOutputFormat.setOutputPath（job，path）;

		返回job.waitForCompletion（true）？0：1;
	}

}

索引很重要：

详情：https://blog.csdn.net/meiLin_Ya/article/details/80854232