Calling the Hadoop HDFS API from Python

1. Calling the HDFS API from Java

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;


public class HadoopClient {

    private static FileSystem fs;

    @Before
    public void init() throws URISyntaxException, IOException, InterruptedException {
        URI uri = new URI("hdfs://hadoop01:9000");
        Configuration config = new Configuration();
        // have the client reach DataNodes by hostname rather than the (possibly internal) IP they registered with
        config.set("dfs.client.use.datanode.hostname", "true");
        config.set("dfs.replication", "1");
        fs = FileSystem.get(uri, config, "root");
    }

    @Test
    public void mkdir() throws IOException {
        Path path= new Path("/java");
        fs.mkdirs(path);
    }

    @Test
    public void put() throws IOException {

        Path srcpath= new Path("D:/HadoopDemo/local.txt");
        Path dstpath= new Path("/java/local.txt");

        fs.copyFromLocalFile(srcpath,dstpath);
    }

//    @Test
//    public void rmdir() throws IOException {
//        Path path= new Path("/java");
//        fs.deleteOnExit(path);
//    }
    @After
    public void close() throws IOException {
        fs.close();
    }

}

I was following the 尚硅谷 Hadoop course and tried calling the HDFS API from Java myself. Creating a directory on HDFS worked fine, but uploading a local file threw the error below. The file name did show up in the HDFS web UI, but with size = 0, which suggests the NameNode handled the request while the DataNode was never written to.

org.apache.hadoop.ipc.RemoteException(java.io.IOException): File /java/local.txt could only be written to 0 of the 1 minReplication nodes. There are 1 datanode(s) running and 1 node(s) are excluded in this operation.
    at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.chooseTarget4NewBlock(BlockManager.java:2350)
    at org.apache.hadoop.hdfs.server.namenode.FSDirWriteFileOp.chooseTargetForNewBlock(FSDirWriteFileOp.java:294)
    at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getAdditionalBlock(FSNamesystem.java:2989)
    at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.addBlock(NameNodeRpcServer.java:912)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.addBlock(ClientNamenodeProtocolServerSideTranslatorPB.java:595)
    at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:621)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:589)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:573)
    at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1227)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1094)
    at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1017)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1899)
    at org.apache.hadoop.ipc.Server$Handler.run(Server.java:3048)


    at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1567)
    at org.apache.hadoop.ipc.Client.call(Client.java:1513)
    at org.apache.hadoop.ipc.Client.call(Client.java:1410)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:258)
    at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:139)
    at jdk.proxy2/jdk.proxy2.$Proxy18.addBlock(Unknown Source)
    at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.addBlock(ClientNamenodeProtocolTranslatorPB.java:531)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
    at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.base/java.lang.reflect.Method.invoke(Method.java:568)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:433)
    at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:166)
    at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:158)
    at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:96)
    at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:362)
    at jdk.proxy2/jdk.proxy2.$Proxy19.addBlock(Unknown Source)
    at org.apache.hadoop.hdfs.DFSOutputStream.addBlock(DFSOutputStream.java:1088)
    at org.apache.hadoop.hdfs.DataStreamer.locateFollowingBlock(DataStreamer.java:1915)
    at org.apache.hadoop.hdfs.DataStreamer.nextBlockOutputStream(DataStreamer.java:1717)
    at org.apache.hadoop.hdfs.DataStreamer.run(DataStreamer.java:713)

I searched online for a fix (see https://blog.csdn.net/xiaoyao_zhy/article/details/127134090). Some posts blamed hostname configuration, others a full disk; I even reformatted HDFS. In my earlier Hadoop deployment post I had set the fs.defaultFS property in core-site.xml to localhost, whereas every tutorial I saw used the node's hostname. Suspecting that this was the problem, I changed it to fs.defaultFS=hdfs://hadoop01:9000, renamed the machine instance to hadoop01 to match, and added a hostname-to-IP mapping for hadoop01 in /etc/hosts on the cloud host.

<configuration>
 <property>
 <name>fs.defaultFS</name>
 <value>hdfs://localhost:9000</value>
 </property>
 <property>
 <name>hadoop.tmp.dir</name>
 <value>/home/Hadoop/hadooptest/hdata</value>
 </property>
 </configuration>
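
For reference, after the change the core-site.xml would look roughly like the following (a sketch: the hadoop.tmp.dir value is simply carried over from the configuration above, and the actual IP used for the hadoop01 entry in /etc/hosts is not shown here):

<configuration>
 <property>
 <name>fs.defaultFS</name>
 <value>hdfs://hadoop01:9000</value>
 </property>
 <property>
 <name>hadoop.tmp.dir</name>
 <value>/home/Hadoop/hadooptest/hdata</value>
 </property>
</configuration>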

On my local Windows machine I also edited the hosts file, mapping the cloud host's IP to hadoop01. After all that fiddling yesterday I expected the problem to be gone, but the same error kept coming back, and I was frustrated enough to think about giving up. Today I decided to try Python instead, to find out whether the problem was my Hadoop deployment or the Java side. It was worth trying: to my surprise, uploading a local file to the server worked fine from Python. Hence the title of this post, calling the HDFS API from Python; the tutorials online use Java, so I simply followed them with Python.

2. Calling the HDFS API from Python

Calling Hadoop from Python mainly relies on the PyHDFS library; install it with pip install PyHDFS and you are ready to go.
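
Note that PyHDFS talks to the NameNode over WebHDFS (HTTP), so the client connects to the NameNode's web port (9870 is the Hadoop 3.x default) rather than the RPC port 9000 used by the Java client above. A minimal connectivity check, assuming the same hadoop01 host and root user as in the scripts below:

# -*- coding: utf-8 -*-
import pyhdfs

# PyHDFS is a WebHDFS client: use the NameNode HTTP port (9870), not the RPC port (9000)
fs = pyhdfs.HdfsClient(hosts='hadoop01:9870', user_name='root')
print(fs.get_active_namenode())  # raises an exception if no NameNode can be reached
print(fs.listdir('/'))           # list the entries under the HDFS root directory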

1. Creating a directory and uploading a file

The code below creates a cyw directory on HDFS and copies the local test.txt file to it. The printed output and the file information shown in the HDFS web UI both look correct, which means calling the HDFS API from Python works and the Hadoop environment itself is configured properly. The problem must therefore be on the Java side, probably some misconfiguration in my Java project; I will come back to that later.

# -*- coding: utf-8 -*-
import pyhdfs

fs = pyhdfs.HdfsClient(hosts='hadoop01:9870', user_name='root')

home_directory = fs.get_home_directory()  # home directory of this user
print(home_directory)
active_namenode = fs.get_active_namenode()  # address of the active NameNode
print(active_namenode)

path = '/cyw/'
if not fs.exists(path):
    fs.mkdirs(path)  # create the directory on HDFS

file = 'test.txt'
file_name = path + file
if not fs.exists(file_name):
    fs.copy_from_local('test.txt', file_name)  # copy a local file to HDFS
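
To double-check that the upload really reached a DataNode (unlike the Java attempt, where the file showed up with size 0), the file status can be queried and its length inspected; a small sketch, reusing fs and file_name from the script above:

status = fs.get_file_status(file_name)  # FileStatus of the uploaded file
print(status.length)  # should equal the size of the local test.txt, not 0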

The code below is a test of the HDFS API through pyhdfs, covering the common operations: upload, download, append, concat, delete, and so on.

# -*- coding: utf-8 -*-
import pyhdfs

fs = pyhdfs.HdfsClient(hosts='hadoop01:9870', user_name='root')

home_directory = fs.get_home_directory()  # home directory of this user
print(home_directory)
active_namenode = fs.get_active_namenode()  # address of the active NameNode
print(active_namenode)

path = '/cyw/'
if not fs.exists(path):
    fs.mkdirs(path)  # create the directory on HDFS

# create nested directories
path = '/cyw/sub1'
if not fs.exists(path):
    fs.mkdirs(path)

# copy a local file to HDFS
# note: path has no trailing slash, so path + file is '/cyw/sub1test.txt',
# which is why 'sub1test.txt' appears in the directory listing further down
file = 'test.txt'
file_name = path + file
if not fs.exists(file_name):
    fs.copy_from_local('test.txt', file_name)

# append to an existing file (append fails if the file does not exist yet)
file = '/cyw/test.txt'
if fs.exists(file):
    fs.append(path=file, data="new hello", encoding="utf-8")

# open the file and read its contents
response = fs.open('/cyw/test.txt')
print(response.read())

# copy an HDFS file back to the local filesystem with copy_to_local
fs.copy_to_local('/cyw/test.txt', 'test1.txt')

# concat: append the blocks of /java/data.txt onto /java/move.txt;
# /java/data.txt disappears afterwards
path = '/java/data.txt'
if fs.exists(path):
    fs.concat('/java/move.txt', ['/java/data.txt'])

# rename (move) a file
path = '/java/move.txt'
if fs.exists(path):
    fs.rename("/java/move.txt", "/java/new_move.txt")

# create a directory and then delete it
path = '/cyw/delete'
if not fs.exists(path):
    fs.mkdirs(path)
if fs.exists(path):
    fs.delete(path)

path = '/cyw/sub1'
if fs.exists(path):
    # with recursive omitted or False, only an empty directory can be deleted
    # recursive=True deletes the directory and its contents
    fs.delete(path, recursive=True)

# content summary of a path
path = '/cyw/test.txt'
content_summary = fs.get_content_summary(path)
print(content_summary)

# status of a path
path = '/cyw/test.txt'
file_status = fs.get_file_status(path)
print(file_status)

path = '/cyw'
file_status = fs.get_file_status(path)
print(file_status)
filetype = file_status.get('type')  # entry type: FILE or DIRECTORY
print(filetype)

# checksum of a file
path = '/cyw/test.txt'
file_checksum = fs.get_file_checksum(path)
print(file_checksum)

# list the entries under a directory
path = '/cyw'
dirlist = fs.listdir(path)
print(dirlist)

The script prints the following output (the repeated "new hello" in the file content has accumulated from appending across several runs):

/user/root
hadoop01:9870
b'abc\r\n123new hellonew hellonew hellonew hellonew hello'
ContentSummary(directoryCount=0, ecPolicy='Replicated', fileCount=1, length=53, quota=-1, snapshotDirectoryCount=0, snapshotFileCount=0, snapshotLength=0, snapshotSpaceConsumed=0, spaceConsumed=53, spaceQuota=-1, typeQuota={})
FileStatus(accessTime=1694563976129, blockSize=134217728, childrenNum=0, fileId=16443, group='supergroup', length=53, modificationTime=1694564428530, owner='root', pathSuffix='', permission='644', replication=1, storagePolicy=0, type='FILE')
FileStatus(accessTime=0, blockSize=0, childrenNum=2, fileId=16442, group='supergroup', length=0, modificationTime=1694566764561, owner='root', pathSuffix='', permission='755', replication=0, storagePolicy=0, type='DIRECTORY')
DIRECTORY
FileChecksum(algorithm='MD5-of-0MD5-of-512CRC32C', bytes='0000020000000000000000007e9365afb9323129fbe488ed4bc6071500000000', length=28)
['sub1test.txt', 'test.txt']

 
