【hadoop】python通过hdfs模块读hdfs数据

hdfs 库（HdfsCLI，Python 的 HDFS 客户端）官方 API 文档：http://hdfscli.readthedocs.io/en/latest/api.html

一个非常好的博客：http://blog.csdn.net/gamer_gyt/article/details/52446757

hdfs 库自带 Avro 序列化与反序列化扩展模块（hdfs.ext.avro），不需要自己单独实现

#!/usr/bin/env python
# encoding: utf-8

"""Avro extension example."""

from hdfs import Config
from hdfs.ext.avro import AvroReader, AvroWriter


# Get the default alias' client.
# NOTE(review): presumably the alias is resolved from the local HdfsCLI
# configuration file -- confirm against the installed library version.
client = Config().get_client()

# Some sample data.
records = [
  {'name': 'Ann', 'age': 23},
  {'name': 'Bob', 'age': 22},
]

# Write an Avro File to HDFS (since our records' schema is very simple, we let
# the writer infer it automatically, otherwise we would pass it as argument).
# NOTE(review): `overwrite=True` presumably replaces an existing 'names.avro',
# which would make the example safe to re-run -- confirm in the library docs.
with AvroWriter(client, 'names.avro', overwrite=True) as writer:
  for record in records:
    writer.write(record)

# Read it back.
# `schema` and `content` are not used again in this example; they are read
# only to show which attributes the reader exposes inside the `with` block.
with AvroReader(client, 'names.avro') as reader:
  schema = reader.schema # The inferred schema.
  content = reader.content # The remote file's HDFS content object.
  assert list(reader) == records # The records match!

遍历 HDFS 目录，并逐个读取其中的 Avro 文件

from hdfs import *
import os
from hdfs.ext.avro import AvroReader, AvroWriter


def main():
    """Print each file under an HDFS directory, followed by its Avro records.

    Connects to the WebHDFS endpoint at http://127.0.0.1:50070, walks the
    directory /test/tmp_data and, for every file reported by the walk,
    prints the file's full HDFS path and then every record that AvroReader
    yields for it, one record per line.

    Every file found is opened as Avro; a non-Avro file in the tree will make
    the reader raise, and that error is deliberately not swallowed here.
    """
    # HDFS paths always use '/' as separator, whatever OS the client runs on,
    # so join with posixpath rather than os.path (which uses '\\' on Windows).
    import posixpath

    client = Client("http://127.0.0.1:50070")
    path = "/test/tmp_data"
    # Sub-directory names are not needed: walk() itself descends into them.
    for root, _dirs, files in client.walk(path):
        for name in files:
            full_path = posixpath.join(root, name)
            # Single-argument print(...) behaves identically on Python 2 and 3.
            print(full_path)
            with AvroReader(client, full_path) as reader:
                # Iterate the reader directly so records are streamed instead
                # of being loaded into a list first.
                for user in reader:
                    print(user)


# Only run when executed as a script, so importing this module does not
# open a connection to HDFS.
if __name__ == "__main__":
    main()