Calling the Hadoop HDFS API from Python
I. Calling the HDFS API from Java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

public class HadoopClient {
    private static FileSystem fs;

    @Before
    public void init() throws URISyntaxException, IOException, InterruptedException {
        // connect to the NameNode on hadoop01 as user root
        URI uri = new URI("hdfs://hadoop01:9000");
        Configuration config = new Configuration();
        config.set("dfs.clent.use.datanode.hostname", "true");
        config.set("dfs.replication", "1");
        fs = FileSystem.get(uri, config, "root");
    }

    @Test
    public void mkdir() throws IOException {
        // create the /java directory on HDFS
        Path path = new Path("/java");
        fs.mkdirs(path);
    }

    @Test
    public void put() throws IOException {
        // upload a local file to /java/local.txt on HDFS
        Path srcpath = new Path("D:/HadoopDemo/local.txt");
        Path dstpath = new Path("/java/local.txt");
        fs.copyFromLocalFile(srcpath, dstpath);
    }

    // @Test
    // public void rmdir() throws IOException {
    //     Path path = new Path("/java");
    //     fs.deleteOnExit(path);
    // }

    @After
    public void close() throws IOException {
        fs.close();
    }
}
I was learning from the Atguigu (尚硅谷) Hadoop course and tried calling the HDFS API from Java myself. Creating a directory on HDFS worked fine, but uploading a local file threw an error. In the HDFS web UI I could see the file name, but its size was 0, so apparently the NameNode did its part while the DataNode never received any data.
org.apache.hadoop.ipc.RemoteException(java.io.IOException): File /java/local.txt could only be written to 0 of the 1 minReplication nodes. There are 1 datanode(s) running and 1 node(s) are excluded in this operation.
    at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.chooseTarget4NewBlock(BlockManager.java:2350)
    at org.apache.hadoop.hdfs.server.namenode.FSDirWriteFileOp.chooseTargetForNewBlock(FSDirWriteFileOp.java:294)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getAdditionalBlock(FSNamesystem.java:2989)
    at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.addBlock(NameNodeRpcServer.java:912)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.addBlock(ClientNamenodeProtocolServerSideTranslatorPB.java:595)
    at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:621)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:589)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:573)
    at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1227)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1094)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1017)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899)
    at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3048)
    at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1567)
    at org.apache.hadoop.ipc.Client.call(Client.java:1513)
    at org.apache.hadoop.ipc.Client.call(Client.java:1410)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:258)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:139)
    at jdk.proxy2/jdk.proxy2.$Proxy18.addBlock(Unknown Source)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.addBlock(ClientNamenodeProtocolTranslatorPB.java:531)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:568)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:433)
    at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:166)
    at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:158)
    at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:96)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:362)
    at jdk.proxy2/jdk.proxy2.$Proxy19.addBlock(Unknown Source)
    at org.apache.hadoop.hdfs.DFSOutputStream.addBlock(DFSOutputStream.java:1088)
    at org.apache.hadoop.hdfs.DataStreamer.locateFollowingBlock(DataStreamer.java:1915)
    at org.apache.hadoop.hdfs.DataStreamer.nextBlockOutputStream(DataStreamer.java:1717)
    at org.apache.hadoop.hdfs.DataStreamer.run(DataStreamer.java:713)
File /java/local.txt could only be written to 0 of the 1 minReplication nodes. There are 1 datanode(s) running and 1 node(s) are excluded in this operation.
I searched online for a fix (see https://blog.csdn.net/xiaoyao_zhy/article/details/127134090). Some people said it was a hostname-resolution problem, others that the disk was full; I even reformatted HDFS. In my earlier Hadoop deployment post I had set the fs.defaultFS property in core-site.xml to localhost, while every tutorial I saw used the node's hostname, so I suspected that was the problem. I changed it to fs.defaultFS=hdfs://hadoop01:9000, renamed the machine instance to hadoop01 accordingly, and added a mapping between the hostname hadoop01 and its IP address in /etc/hosts on the cloud host. My original core-site.xml is shown below, followed by a sketch of the adjusted version.
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://localhost:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/Hadoop/hadooptest/hdata</value>
    </property>
</configuration>
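For comparison, this is a sketch of what the adjusted core-site.xml looks like after switching fs.defaultFS to the hostname, with hadoop.tmp.dir left as it was:

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://hadoop01:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/home/Hadoop/hadooptest/hdata</value>
    </property>
</configuration>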
On my local Windows machine I edited the hosts file as well, adding a line mapping the cloud host's IP to hadoop01 (the entry is sketched at the end of this section). Yesterday, after all that fiddling, I thought the problem would finally be solved, yet it still failed with exactly the same error, and I was so fed up I almost gave up. Today I decided to try Python instead, to find out whether the problem was in my Hadoop deployment or on the Java side. It turned out to be worth trying: with Python I could upload a local file to the server without any trouble. That is how this post got its title; the tutorials online all use Java, so I simply followed them using Python.
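For completeness, the hostname mapping mentioned above boils down to a single line, added both to /etc/hosts on the cloud host and to the hosts file on the Windows machine; the IP below is only a placeholder for the cloud host's actual address:

# /etc/hosts on the cloud host, and C:\Windows\System32\drivers\etc\hosts on Windows
# (placeholder IP -- substitute the cloud host's real address)
<cloud-host-ip>    hadoop01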
II. Calling the HDFS API from Python
Calling Hadoop from Python mainly relies on the PyHDFS library; a simple pip install PyHDFS is all you need to get started.
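Before the longer examples below, a minimal sketch like the following (reusing the hadoop01:9870 WebHDFS address and root user from this post; adjust them to your own cluster) is enough to confirm that the client can reach the NameNode at all:

# -*- coding: utf-8 -*-
import pyhdfs

# connect to the NameNode's WebHDFS endpoint (HTTP port 9870) as user root
fs = pyhdfs.HdfsClient(hosts='hadoop01:9870', user_name='root')

# if this prints the listing of the HDFS root, connectivity and permissions are fine
print(fs.listdir('/'))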
1. Creating a directory and uploading a file
The code below creates a cyw directory on HDFS and copies the local text file test.txt into it. The printed output and the file information shown in the HDFS web UI both came out as expected, which means calling the HDFS API from Python works and the Hadoop environment itself is configured correctly. The problem must therefore be on the Java side, probably some misconfiguration in my Java project; I will come back to that later.
# -*- coding: utf-8 -*-
import pyhdfs

fs = pyhdfs.HdfsClient(hosts='hadoop01:9870', user_name='root')

home_directory = fs.get_home_directory()    # home directory of this user
print(home_directory)

active_namenode = fs.get_active_namenode()  # currently active NameNode
print(active_namenode)

path = '/cyw/'
if not fs.exists(path):
    fs.mkdirs(path)                         # create the directory on HDFS

file = 'test.txt'
file_name = path + file
if not fs.exists(file_name):
    fs.copy_from_local('test.txt', file_name)  # copy a local file to HDFS
2. Other common operations

The code below is a broader test of the HDFS API through pyhdfs, covering the common operations: upload, download, write, merge, delete and so on.
# -*- coding: utf-8 -*-
import pyhdfs

fs = pyhdfs.HdfsClient(hosts='hadoop01:9870', user_name='root')

home_directory = fs.get_home_directory()    # home directory of this user
print(home_directory)

active_namenode = fs.get_active_namenode()  # currently active NameNode
print(active_namenode)

path = '/cyw/'
if not fs.exists(path):
    fs.mkdirs(path)  # create the directory on HDFS

# create a nested directory (mkdirs is recursive)
path = '/cyw/sub1'
if not fs.exists(path):
    fs.mkdirs(path)

# copy a local file to HDFS
file = 'test.txt'
file_name = path + file  # note: no '/' between them, so this is '/cyw/sub1test.txt' directly under /cyw
if not fs.exists(file_name):
    fs.copy_from_local('test.txt', file_name)

# write (append) to a file
file = '/cyw/test.txt'
if not fs.exists(file):
    fs.append(path=file, data="new hello", encoding="utf-8")

# open the file and read its contents
response = fs.open('/cyw/test.txt')
print(response.read())

# copy the HDFS file back to the local machine with copy_to_local
fs.copy_to_local('/cyw/test.txt', 'test1.txt')

# concat: merge /java/data.txt into /java/move.txt; /java/data.txt disappears afterwards
path = '/java/data.txt'
if fs.exists(path):
    fs.concat('/java/move.txt', ['/java/data.txt'])

# rename a file or directory
path = '/java/move.txt'
if fs.exists(path):
    fs.rename("/java/move.txt", "/java/new_move.txt")

# create first, then delete
path = '/cyw/delete'
if not fs.exists(path):
    fs.mkdirs(path)
if fs.exists(path):
    fs.delete(path)

path = '/cyw/sub1'
if fs.exists(path):
    # if recursive is omitted or False, only an empty directory can be deleted
    # recursive=True deletes recursively
    fs.delete(path, recursive=True)

# content summary for a path
path = '/cyw/test.txt'
content_summary = fs.get_content_summary(path)
print(content_summary)

# status of a path
path = '/cyw/test.txt'
file_status = fs.get_file_status(path)
print(file_status)

path = '/cyw'
file_status = fs.get_file_status(path)
print(file_status)
filetype = file_status.get('type')  # get the file type (FILE or DIRECTORY)
print(filetype)

# checksum of a file
path = '/cyw/test.txt'
file_checksum = fs.get_file_checksum(path)
print(file_checksum)

# list the files under a directory
path = '/cyw'
dirlist = fs.listdir(path)
print(dirlist)
/user/root
hadoop01:9870
b'abc\r\n123new hellonew hellonew hellonew hellonew hello'
ContentSummary(directoryCount=0, ecPolicy='Replicated', fileCount=1, length=53, quota=-1, snapshotDirectoryCount=0, snapshotFileCount=0, snapshotLength=0, snapshotSpaceConsumed=0, spaceConsumed=53, spaceQuota=-1, typeQuota={})
FileStatus(accessTime=1694563976129, blockSize=134217728, childrenNum=0, fileId=16443, group='supergroup', length=53, modificationTime=1694564428530, owner='root', pathSuffix='', permission='644', replication=1, storagePolicy=0, type='FILE')
FileStatus(accessTime=0, blockSize=0, childrenNum=2, fileId=16442, group='supergroup', length=0, modificationTime=1694566764561, owner='root', pathSuffix='', permission='755', replication=0, storagePolicy=0, type='DIRECTORY')
DIRECTORY
FileChecksum(algorithm='MD5-of-0MD5-of-512CRC32C', bytes='0000020000000000000000007e9365afb9323129fbe488ed4bc6071500000000', length=28)
['sub1test.txt', 'test.txt']