Hadoop In-Depth Study (16): Avro Serialization and Deserialization

Please credit the original source when reposting: http://blog.csdn.net/lastsweetop/article/details/9773233

All source code is on GitHub: https://github.com/lastsweetop/styhadoop

 

In many cases using Avro means retrofitting an existing system: the framework and data formats are already defined, so all we can do is use Avro directly to integrate the existing data. (If you are building a new system, it is better to use Avro's data file format, which the next chapter covers.)

Preparation

Save the following schema as a file named StringPair.avsc and put it in the src/test/resources directory:
{
    "type":"record",
    "name":"StringPair",
    "doc":"A pair of strings",
    "fields":[
        {"name":"left","type":"string"},
        {"name":"right","type":"string"}
    ]
}
Take care when pulling in the latest version of Avro. The latest Avro release is 1.7.4, which depends on the org.codehaus.jackson:jackson-core-asl:1.8.8 package, but that version is no longer in the Maven repository, so you have to switch to a different one:
<dependency>
    <groupId>org.codehaus.jackson</groupId>
    <artifactId>jackson-core-asl</artifactId>
    <version>1.9.9</version>
</dependency>
If you are using Hadoop 1.0.4 (or some other version), it depends on jackson-mapper-asl; if its version does not match that of jackson-core-asl, you will get exceptions such as methods not being found, so you need to pull in the matching version:
<dependency>
    <groupId>org.codehaus.jackson</groupId>
    <artifactId>jackson-mapper-asl</artifactId>
    <version>1.9.9</version>
</dependency>

The generic approach

We will walk through this section with code:
package com.sweetop.styhadoop;

import junit.framework.Assert;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.*;
import org.junit.Test;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;

/**
 * Created with IntelliJ IDEA.
 * User: lastsweetop
 * Date: 13-8-5
 * Time: 7:59 PM
 * To change this template use File | Settings | File Templates.
 */
public class TestGenericMapping {
    @Test
    public void test() throws IOException {
        // Load the schema from the StringPair.avsc file
        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(getClass().getResourceAsStream("/StringPair.avsc"));

        // Create a record instance according to the schema
        GenericRecord datum = new GenericData.Record(schema);
        datum.put("left", "L");
        datum.put("right", "R");


        ByteArrayOutputStream out = new ByteArrayOutputStream();
        // The DatumWriter translates the GenericRecord into a form the encoder can understand
        DatumWriter<GenericRecord> writer = new GenericDatumWriter<GenericRecord>(schema);
        // The encoder writes the data to the stream; binaryEncoder's second argument is an encoder to reuse, here we don't reuse one so we pass null
        Encoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        writer.write(datum,encoder);
        encoder.flush();
        out.close();

        DatumReader<GenericRecord> reader=new GenericDatumReader<GenericRecord>(schema);
        Decoder decoder=DecoderFactory.get().binaryDecoder(out.toByteArray(),null);
        GenericRecord result=reader.read(null,decoder);
        Assert.assertEquals("L",result.get("left").toString());
        Assert.assertEquals("R",result.get("right").toString());
    }
}

result.get returns the value as Avro's UTF-8 string type (org.apache.avro.util.Utf8), so you need to call toString() before it will compare equal to a Java String.
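
To make this concrete, here is a minimal, self-contained sketch (assuming Avro 1.7.x, whose generic reader decodes plain "string" fields as org.apache.avro.util.Utf8; the class name Utf8VsString is invented for illustration):

package com.sweetop.styhadoop;

import org.apache.avro.util.Utf8;

public class Utf8VsString {
    public static void main(String[] args) {
        // Utf8 implements CharSequence but is not a java.lang.String
        Utf8 left = new Utf8("L");
        System.out.println("L".equals(left));             // false: different classes
        System.out.println("L".equals(left.toString()));  // true: convert before comparing
    }
}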

The specific approach

First use the avro-maven-plugin to generate the code; the pom configuration:
<plugin>
    <groupId>org.apache.avro</groupId>
    <artifactId>avro-maven-plugin</artifactId>
    <version>1.7.0</version>
    <executions>
        <execution>
            <id>schemas</id>
            <phase>generate-sources</phase>
            <goals>
                <goal>schema</goal>
            </goals>
            <configuration>
                <includes>
                    <include>StringPair.avsc</include>
                </includes>
                <sourceDirectory>src/test/resources</sourceDirectory>
                <outputDirectory>${project.build.directory}/generated-sources/java</outputDirectory>
            </configuration>
        </execution>
    </executions>
</plugin>

The avro-maven-plugin is bound to the generate-sources phase, so running mvn generate-sources is enough to generate the source code. Let's take a look at the generated source:
package com.sweetop.styhadoop;

/**
 * Autogenerated by Avro
 * <p/>
 * DO NOT EDIT DIRECTLY
 */
@SuppressWarnings("all")
/** A pair of strings */
public class StringPair extends org.apache.avro.specific.SpecificRecordBase implements org.apache.avro.specific.SpecificRecord {
    public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"StringPair\",\"doc\":\"A pair of strings\",\"fields\":[{\"name\":\"left\",\"type\":\"string\",\"avro.java.string\":\"String\"},{\"name\":\"right\",\"type\":\"string\"}]}");
    @Deprecated
    public java.lang.CharSequence left;
    @Deprecated
    public java.lang.CharSequence right;

    public org.apache.avro.Schema getSchema() {
        return SCHEMA$;
    }

    // Used by DatumWriter.  Applications should not call.
    public java.lang.Object get(int field$) {
        switch (field$) {
            case 0:
                return left;
            case 1:
                return right;
            default:
                throw new org.apache.avro.AvroRuntimeException("Bad index");
        }
    }

    // Used by DatumReader.  Applications should not call.
    @SuppressWarnings(value = "unchecked")
    public void put(int field$, java.lang.Object value$) {
        switch (field$) {
            case 0:
                left = (java.lang.CharSequence) value$;
                break;
            case 1:
                right = (java.lang.CharSequence) value$;
                break;
            default:
                throw new org.apache.avro.AvroRuntimeException("Bad index");
        }
    }

    /**
     * Gets the value of the 'left' field.
     */
    public java.lang.CharSequence getLeft() {
        return left;
    }

    /**
     * Sets the value of the 'left' field.
     *
     * @param value the value to set.
     */
    public void setLeft(java.lang.CharSequence value) {
        this.left = value;
    }

    /**
     * Gets the value of the 'right' field.
     */
    public java.lang.CharSequence getRight() {
        return right;
    }

    /**
     * Sets the value of the 'right' field.
     *
     * @param value the value to set.
     */
    public void setRight(java.lang.CharSequence value) {
        this.right = value;
    }
}

For compatibility with earlier versions, a pair of get/put methods is generated; since 1.6.0 the generator also adds getter/setter methods, along with a Builder-related class, which is of little use here and which I removed from the listing above.
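
For reference, a hedged sketch of how the generated Builder is typically used in Avro 1.7.x (this assumes the Builder had been kept in StringPair, which the trimmed listing above no longer contains; the class name BuilderExample is invented for illustration):

package com.sweetop.styhadoop;

public class BuilderExample {
    public static void main(String[] args) {
        // Hypothetical usage, assuming StringPair still carries its generated Builder
        StringPair datum = StringPair.newBuilder()
                .setLeft("L")
                .setRight("R")
                .build();
        System.out.println(datum.getLeft() + "," + datum.getRight());
    }
}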

Another point the previous article did not cover: the name in the schema can include a namespace, for example com.sweetop.styhadoop.StringPair; only then will the generated source carry a package declaration.
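
As a small sketch of that point (assuming Avro 1.7.x; the class name NamespaceExample is invented for illustration), a record name that embeds its namespace is split into namespace and simple name, and the generated class then lands in the matching Java package:

package com.sweetop.styhadoop;

import org.apache.avro.Schema;

public class NamespaceExample {
    public static void main(String[] args) {
        // The record name carries its namespace, as suggested above
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"com.sweetop.styhadoop.StringPair\","
              + "\"fields\":[{\"name\":\"left\",\"type\":\"string\"},"
              + "{\"name\":\"right\",\"type\":\"string\"}]}");
        System.out.println(schema.getFullName());  // com.sweetop.styhadoop.StringPair
        System.out.println(schema.getNamespace()); // com.sweetop.styhadoop
        System.out.println(schema.getName());      // StringPair
    }
}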

Now let's see how to use this generated class and how it differs from the generic approach:

 

package com.sweetop.styhadoop;

import junit.framework.Assert;
import org.apache.avro.Schema;
import org.apache.avro.io.*;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificDatumWriter;
import org.junit.Test;

import java.io.ByteArrayOutputStream;
import java.io.IOException;

/**
 * Created with IntelliJ IDEA.
 * User: lastsweetop
 * Date: 13-8-6
 * Time: 2:19 PM
 * To change this template use File | Settings | File Templates.
 */
public class TestSprecificMapping {
    @Test
    public void test() throws IOException {
        // Since the StringPair source has been generated, we no longer use the schema directly; just call the setters and getters
        StringPair datum=new StringPair();
        datum.setLeft("L");
        datum.setRight("R");

        ByteArrayOutputStream out=new ByteArrayOutputStream();
        // No schema needs to be passed any more; StringPair is used directly as the generic type and constructor argument
        DatumWriter<StringPair> writer=new SpecificDatumWriter<StringPair>(StringPair.class);
        Encoder encoder= EncoderFactory.get().binaryEncoder(out,null);
        writer.write(datum, encoder);
        encoder.flush();
        out.close();

        DatumReader<StringPair> reader=new SpecificDatumReader<StringPair>(StringPair.class);
        Decoder decoder= DecoderFactory.get().binaryDecoder(out.toByteArray(),null);
        StringPair result=reader.read(null,decoder);
        Assert.assertEquals("L",result.getLeft().toString());
        Assert.assertEquals("R",result.getRight().toString());
    }
}

To sum up the differences: schema -> StringPair.class, and GenericRecord -> StringPair.

Original post: https://www.cnblogs.com/riskyer/p/3241690.html