【hadoop】Reading HDFS data in Python with the hdfs module
hdfs library API documentation: http://hdfscli.readthedocs.io/en/latest/api.html
A very good blog post on the subject: http://blog.csdn.net/gamer_gyt/article/details/52446757
The hdfs library ships with an Avro serialization/deserialization extension (hdfs.ext.avro), so there is no need to implement it yourself. The following example comes from the library's documentation:
#!/usr/bin/env python
# encoding: utf-8

"""Avro extension example."""

from hdfs import Config
from hdfs.ext.avro import AvroReader, AvroWriter

# Get the default alias' client.
client = Config().get_client()

# Some sample data.
records = [
    {'name': 'Ann', 'age': 23},
    {'name': 'Bob', 'age': 22},
]

# Write an Avro file to HDFS (since our records' schema is very simple, we let
# the writer infer it automatically, otherwise we would pass it as argument).
with AvroWriter(client, 'names.avro', overwrite=True) as writer:
    for record in records:
        writer.write(record)

# Read it back.
with AvroReader(client, 'names.avro') as reader:
    schema = reader.schema    # The inferred schema.
    content = reader.content  # The remote file's HDFS content object.
    assert list(reader) == records  # The records match!
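Note that Config().get_client() resolves the default alias from the hdfscli configuration file (~/.hdfscli.cfg by default). A minimal sketch of such a file, assuming a local NameNode on port 50070; the alias name "dev" and the user "hadoop" below are placeholders:

[global]
default.alias = dev

[dev.alias]
; InsecureClient passes the given user to WebHDFS; adjust for your cluster.
client = InsecureClient
url = http://127.0.0.1:50070
user = hadoop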
Walking an HDFS directory tree
import os

from hdfs import Client
from hdfs.ext.avro import AvroReader


def main():
    # Connect to the NameNode's WebHDFS endpoint (port 50070 by default).
    client = Client("http://127.0.0.1:50070")
    path = "/test/tmp_data"
    # client.walk() mirrors os.walk(): it yields a (root, directories, files)
    # tuple for every directory under `path`.
    for root, dirs, files in client.walk(path):
        for name in files:
            full_path = os.path.join(root, name)
            print(full_path)
            with AvroReader(client, full_path) as reader:
                schema = reader.schema    # The inferred schema.
                content = reader.content  # The remote file's HDFS content object.
                for user in reader:       # Iterate over the decoded records.
                    print(user)


if __name__ == '__main__':
    main()
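The same client can also read plain (non-Avro) files. A minimal sketch using client.read(), assuming a UTF-8 text file at the hypothetical path /test/tmp_data/part-00000:

from hdfs import Client

client = Client("http://127.0.0.1:50070")

# With a delimiter set, the reader yields the remote file's contents split
# on that delimiter, so this iterates over the file line by line.
with client.read("/test/tmp_data/part-00000", encoding="utf-8", delimiter="\n") as reader:
    for line in reader:
        print(line)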