ORC Format Files

1. Hive Support
Specify the ORC format when creating the table:

create table tmp.orc_test(id bigint, name string, age int) stored as orc TBLPROPERTIES('orc.compress'='SNAPPY')

The compression codec can be "SNAPPY" or "ZLIB" (the Hive default); set whichever you need via 'orc.compress'.

2. Spark Support

Spark read:

df = spark.read.orc("/tmp/test/orc_data")  # the result is a DataFrame

Spark write:

df.write.format("orc").save("/tmp/test/orc_data2")
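
The same round trip in Spark's Java API, with the write compression made explicit — a minimal sketch; the SparkSession setup and app name are illustrative assumptions, and "compression" is the standard write option for the ORC data source:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class OrcRoundTrip {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("orc-round-trip")          // hypothetical app name
                .getOrCreate();

        // Read: an ORC directory loads as a DataFrame (Dataset<Row>).
        Dataset<Row> df = spark.read().orc("/tmp/test/orc_data");

        // Write: "compression" accepts none, snappy, zlib, lzo for ORC.
        df.write()
          .format("orc")
          .option("compression", "snappy")
          .save("/tmp/test/orc_data2");

        spark.stop();
    }
}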

3. Hadoop Streaming Support

3.1 Read an ORC file and output text (a common way to inspect ORC files)

hadoop jar /usr/local/hadoop-2.7.0/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar \
    -libjars /usr/local/hive-1.2.0/lib/hive-exec-1.2.0-SNAPSHOT.jar \
    -mapper /bin/cat -reducer /bin/cat \
    -input /tmp/test/orc_test1 \
    -output /tmp/test/orc_streaming_test3 \
    -inputformat org.apache.hadoop.hive.ql.io.orc.OrcInputFormat

3.2 Read an ORC file and write an ORC file

hadoop jar /usr/local/hadoop-2.7.0/share/hadoop/tools/lib/hadoop-streaming-2.7.0.jar \
    -libjars orc_maprd_test.jar \
    -D orc.mapred.output.schema="struct<id:string,name:string,sex:string,age:string>" \
    -input /tmp/test/orc_streaming_test \
    -output /tmp/test/orc_streaming_test2 \
    -inputformat org.apache.orc.mapred.OrcInputFormat \
    -outputformat org.apache.orc.mapred.OrcOutputFormat \
    -mapper is.orc.MyMapper -reducer is.orc.MyReducer
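
The command references is.orc.MyMapper and is.orc.MyReducer, packaged in orc_maprd_test.jar, but the original post never shows them. A minimal sketch of what such a pair could look like against the old mapred API, assuming the mapper flattens each row to tab-separated text (keyed by the id column, an illustrative choice) and the reducer rebuilds rows against the struct<id:string,name:string,sex:string,age:string> schema:

// MyMapper.java
package is.orc;   // package name taken from the streaming command above

import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.mapred.OrcStruct;

// OrcInputFormat delivers each row as (NullWritable, OrcStruct). Rows are
// flattened to tab-separated text so plain Text, not OrcStruct, goes
// through the shuffle.
public class MyMapper extends MapReduceBase
        implements Mapper<NullWritable, OrcStruct, Text, Text> {
    @Override
    public void map(NullWritable key, OrcStruct row,
                    OutputCollector<Text, Text> out, Reporter reporter) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < row.getNumFields(); i++) {
            if (i > 0) sb.append('\t');
            sb.append(row.getFieldValue(i));
        }
        out.collect((Text) row.getFieldValue("id"), new Text(sb.toString()));
    }
}

// MyReducer.java
package is.orc;

import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcStruct;

// Rebuild OrcStruct rows from the shuffled text; OrcOutputFormat writes
// them with the schema passed via -D orc.mapred.output.schema.
public class MyReducer extends MapReduceBase
        implements Reducer<Text, Text, NullWritable, OrcStruct> {
    private final TypeDescription schema = TypeDescription.fromString(
            "struct<id:string,name:string,sex:string,age:string>");
    private final OrcStruct row = (OrcStruct) OrcStruct.createValue(schema);

    @Override
    public void reduce(Text key, Iterator<Text> values,
                       OutputCollector<NullWritable, OrcStruct> out, Reporter reporter)
            throws IOException {
        while (values.hasNext()) {
            String[] cols = values.next().toString().split("\t", -1);
            for (int i = 0; i < cols.length; i++) {
                row.setFieldValue(i, new Text(cols[i]));
            }
            out.collect(NullWritable.get(), row);
        }
    }
}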

Example:

Maven dependencies:

<!-- ORC files -->
<dependency>
    <groupId>org.apache.orc</groupId>
    <artifactId>orc-core</artifactId>
    <version>1.2.3</version>
</dependency>
<dependency>
    <groupId>org.apache.orc</groupId>
    <artifactId>orc-mapreduce</artifactId>
    <version>1.1.0</version>
</dependency>
<dependency>
    <groupId>com.yammer.metrics</groupId>
    <artifactId>metrics-core</artifactId>
    <version>2.2.0</version>
</dependency>
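
Note that orc-core (1.2.3) and orc-mapreduce (1.1.0) are pinned to different versions above; since orc-mapreduce is a binding layer over orc-core, keeping both artifacts on the same version is the safer choice.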

Writing an ORC file

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.orc.TypeDescription;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcOutputFormat;

/**
 * Write an ORC file from comma-separated text input.
 * https://orc.apache.org/docs/mapreduce.html
 */
public class OrcWriterMR {
    public static class OrcWriterMapper extends Mapper<LongWritable, Text, NullWritable, OrcStruct> {
        // Field types of the ORC file to be created
        private TypeDescription schema = TypeDescription.fromString(
                //"struct<str:string>"
                "struct<datano:bigint,datatime:bigint,type:int,val:int>"
        );
        private OrcStruct pair = (OrcStruct) OrcStruct.createValue(schema);
        private final NullWritable outKey = NullWritable.get();

        @Override
        public void map(LongWritable key, Text value, Context output) throws IOException, InterruptedException {
            if (!"".equals(value.toString())) {
                //String lineStr = value.toString().trim();
                //pair.setFieldValue("str", new Text(lineStr));
                String[] lineStrs = value.toString().split(",");  // was split("\,"), an illegal escape in Java
                pair.setFieldValue("datano", new LongWritable(Long.parseLong(lineStrs[0])));
                pair.setFieldValue("datatime", new LongWritable(Long.parseLong(lineStrs[1])));
                pair.setFieldValue("type", new IntWritable(Integer.parseInt(lineStrs[2])));
                pair.setFieldValue("val", new IntWritable(Integer.parseInt(lineStrs[3])));
                output.write(outKey, pair);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // The output schema must match the one used in the mapper.
        //conf.set("orc.mapred.output.schema", "struct<str:string>");
        conf.set("orc.mapred.output.schema", "struct<datano:bigint,datatime:bigint,type:int,val:int>");
        Job job = Job.getInstance(conf);
        job.setJarByClass(OrcWriterMR.class);
        job.setJobName("Writer");
        String in = "file:///C:/Users/Administrator/Desktop/CAN.txt";
        String out = "file:///C:/Users/Administrator/Desktop/CAN1.orc";
        job.setMapperClass(OrcWriterMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setNumReduceTasks(0);  // map-only: mapper output goes straight to the OrcOutputFormat
        job.setOutputFormatClass(OrcOutputFormat.class);
        FileInputFormat.addInputPath(job, new Path(in));
        OrcOutputFormat.setOutputPath(job, new Path(out));
        job.waitForCompletion(true);
    }
}
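
Note on input: the mapper splits each line on commas and parses four numeric fields, so CAN.txt is expected to contain lines shaped like 1001,1588888888,1,42 (values hypothetical, matching the datano, datatime, type, val schema).
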
Reading an ORC file and writing it out as text
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcInputFormat;

/**
 * Read an ORC file and write it out as a text file.
 */
public class OrcReaderMR {

    public static class OrcMap extends Mapper<NullWritable, OrcStruct, NullWritable, Text> {
        Text text = new Text();

        @Override
        public void map(NullWritable key, OrcStruct value, Context output) throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            if (!"".equals(value.getFieldValue(0).toString())) {
                sb.append(value.getFieldValue(0).toString()).append("\t");  // tab separator, written explicitly
            }
            text.set(sb.toString());
            output.write(NullWritable.get(), text);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(OrcReaderMR.class);
        job.setJobName("OrcReaderMR");
        String in = "file:///C:/Users/Administrator/Desktop/gps1/gps1.orc";
        String out = "file:///C:/Users/Administrator/Desktop/CAN信息";
        job.setMapperClass(OrcMap.class);
        OrcInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));
        job.setInputFormatClass(OrcInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setNumReduceTasks(0);
        job.waitForCompletion(true);
    }
}

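The OrcMap above only emits the first column of each row. To dump every column, the map body can iterate over the struct's fields instead — a sketch of a drop-in replacement for the body of map(), using OrcStruct's getNumFields():

// Alternative map body: emit all columns, tab-separated.
StringBuilder sb = new StringBuilder();
for (int i = 0; i < value.getNumFields(); i++) {
    if (i > 0) sb.append('\t');
    Object field = value.getFieldValue(i);  // null for SQL NULLs
    sb.append(field == null ? "" : field.toString());
}
text.set(sb.toString());
output.write(NullWritable.get(), text);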

Original post: https://www.cnblogs.com/zyanrong/p/12726543.html