QJM
Error demonstration
Connecting directly to a non-Active NameNode and attempting to write a file produces the error below.
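For context, a minimal sketch of the kind of client that triggers it (the class name mirrors the WriteToHDFS frame at the bottom of the trace; the address and path are illustrative assumptions):

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WriteToHDFS {
    public static void main(String[] args) throws Exception {
        // Hard-coding a single NameNode address; if that node happens to be
        // in standby state, any write operation is rejected.
        FileSystem fs = FileSystem.get(
                URI.create("hdfs://192.168.6.212:8020"), new Configuration());
        try (FSDataOutputStream out = fs.create(new Path("/tmp/demo.txt"))) {
            out.writeUTF("hello");
        }
    }
}

Run against a standby NameNode, it fails with: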
Exception in thread "main" org.apache.hadoop.ipc.RemoteException(org.apache.hadoop.ipc.StandbyException): Operation category WRITE is not supported in state standby. Visit https://s.apache.org/sbnn-error
at org.apache.hadoop.hdfs.server.namenode.ha.StandbyState.checkOperation(StandbyState.java:108)
at org.apache.hadoop.hdfs.server.namenode.NameNode$NameNodeHAContext.checkOperation(NameNode.java:2094)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.checkOperation(FSNamesystem.java:1550)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.create(NameNodeRpcServer.java:789)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.create(ClientNamenodeProtocolServerSideTranslatorPB.java:494)
at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:604)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:572)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine2.java:556)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:1093)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:1043)
at org.apache.hadoop.ipc.Server$RpcCall.run(Server.java:971)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1878)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2976)
at org.apache.hadoop.ipc.Client.getRpcResponse(Client.java:1584)
at org.apache.hadoop.ipc.Client.call(Client.java:1530)
at org.apache.hadoop.ipc.Client.call(Client.java:1427)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:258)
at org.apache.hadoop.ipc.ProtobufRpcEngine2$Invoker.invoke(ProtobufRpcEngine2.java:139)
at com.sun.proxy.$Proxy9.create(Unknown Source)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.create(ClientNamenodeProtocolTranslatorPB.java:383)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:433)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:166)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:158)
at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:96)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:362)
at com.sun.proxy.$Proxy10.create(Unknown Source)
at org.apache.hadoop.hdfs.DFSOutputStream.newStreamForCreate(DFSOutputStream.java:280)
at org.apache.hadoop.hdfs.DFSClient.create(DFSClient.java:1271)
at org.apache.hadoop.hdfs.DFSClient.create(DFSClient.java:1250)
at org.apache.hadoop.hdfs.DFSClient.create(DFSClient.java:1232)
at org.apache.hadoop.hdfs.DFSClient.create(DFSClient.java:1170)
at org.apache.hadoop.hdfs.DistributedFileSystem$8.doCall(DistributedFileSystem.java:556)
at org.apache.hadoop.hdfs.DistributedFileSystem$8.doCall(DistributedFileSystem.java:553)
at org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)
at org.apache.hadoop.hdfs.DistributedFileSystem.create(DistributedFileSystem.java:567)
at org.apache.hadoop.hdfs.DistributedFileSystem.create(DistributedFileSystem.java:494)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1233)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1210)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1091)
at org.apache.hadoop.fs.FileSystem.create(FileSystem.java:1078)
at org.example.WriteToHDFS.main(WriteToHDFS.java:30)
Solution
Put HAProxy in front of the cluster (/etc/haproxy/haproxy.cfg) and let its health checks pick out the Active NameNode. The principle: the NameNode web UI on port 9870 serves an /isActive endpoint that returns HTTP 200 on the Active node, while a non-Active node returns 405, so HAProxy marks only the Active node as up.
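The endpoint behavior is easy to verify before touching HAProxy; a quick probe sketch (class name is ours; the addresses are the NameNodes from the config below):

import java.net.HttpURLConnection;
import java.net.URL;

public class IsActiveProbe {
    public static void main(String[] args) throws Exception {
        // Expect 200 from the Active NameNode and 405 from standbys.
        for (String host : new String[]{"192.168.6.211", "192.168.6.212", "192.168.6.213"}) {
            HttpURLConnection conn = (HttpURLConnection)
                    new URL("http://" + host + ":9870/isActive").openConnection();
            System.out.println(host + " -> " + conn.getResponseCode());
            conn.disconnect();
        }
    }
}

With that confirmed, the HAProxy configuration is: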
# NameNode RPC: traffic is plain TCP on 8020, but health is checked
# against the HTTP UI on 9870 (check port 9870), so only the Active
# node receives RPC traffic.
listen nn-rpc
    mode tcp
    option tcplog
    bind 192.168.6.119:8020
    balance roundrobin
    option httpchk GET /isActive
    http-check expect rstatus 200
    server up1 192.168.6.211:8020 check port 9870
    server up2 192.168.6.212:8020 check port 9870
    server up3 192.168.6.213:8020 check port 9870
# NameNode web UI: same check, but traffic and check both use 9870.
listen nn-http
    mode tcp
    option tcplog
    bind 192.168.6.119:9870
    balance roundrobin
    option httpchk GET /isActive
    http-check expect rstatus 200
    server up1 192.168.6.211:9870 check
    server up2 192.168.6.212:9870 check
    server up3 192.168.6.213:9870 check
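With both blocks in place, clients can target the HAProxy frontend instead of any particular NameNode, so a failover no longer breaks them; a minimal sketch reusing the illustrative client from above:

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class WriteViaProxy {
    public static void main(String[] args) throws Exception {
        // 192.168.6.119 is the HAProxy bind address from the config above;
        // HAProxy forwards to whichever NameNode passes the /isActive check.
        FileSystem fs = FileSystem.get(
                URI.create("hdfs://192.168.6.119:8020"), new Configuration());
        try (FSDataOutputStream out = fs.create(new Path("/tmp/demo.txt"))) {
            out.writeUTF("hello");
        }
    }
}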
The same approach finds the active yarn-resourcemanager node: the RM REST endpoint /ws/v1/cluster/info reports the HA state, and the check below passes only when the response body contains ACTIVE.
# YARN ResourceManager: the active RM's /ws/v1/cluster/info response
# contains "haState":"ACTIVE", so match on the string ACTIVE.
listen yarn-rm
    mode tcp
    option tcplog
    bind 192.168.2.119:8088
    balance roundrobin
    option httpchk GET /ws/v1/cluster/info
    http-check expect rstring .*ACTIVE.*
    server up1 192.168.2.211:8088 check
    server up2 192.168.2.212:8088 check
    server up3 192.168.2.213:8088 check
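To see what the rstring match keys on, the body can be fetched directly; a sketch against one of the RMs above (the haState field is what distinguishes active from standby):

import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class RmStateProbe {
    public static void main(String[] args) throws Exception {
        HttpURLConnection conn = (HttpURLConnection)
                new URL("http://192.168.2.211:8088/ws/v1/cluster/info").openConnection();
        try (InputStream in = conn.getInputStream()) {
            String body = new String(in.readAllBytes(), StandardCharsets.UTF_8);
            // The active RM reports "haState":"ACTIVE" in this JSON.
            System.out.println(body.contains("ACTIVE") ? "active" : "not active");
        }
    }
}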
While we're at it, the same trick picks out the active PD of TiDB's PD component, saving the extra hop when opening /dashboard. The principle: GET /dashboard/ returns 200 on the active PD, while non-active PDs answer with a 307 redirect.
# TiDB PD: the PD serving the dashboard answers /dashboard/ with 200,
# the others with a 307 redirect, which fails the rstatus check.
listen tidb-pd
    mode tcp
    option tcplog
    bind 192.168.2.119:2379
    balance roundrobin
    option httpchk GET /dashboard/
    http-check expect rstatus 200
    server up1 192.168.2.211:2379 check
    server up2 192.168.2.212:2379 check
    server up3 192.168.2.213:2379 check
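One subtlety: a browser follows the 307, so every PD appears to serve the dashboard; HAProxy's httpchk does not follow redirects, which is why the rstatus 200 expect works. A sketch that reproduces the check's view (class name is ours; addresses are the PDs above):

import java.net.HttpURLConnection;
import java.net.URL;

public class PdDashboardProbe {
    public static void main(String[] args) throws Exception {
        for (String host : new String[]{"192.168.2.211", "192.168.2.212", "192.168.2.213"}) {
            HttpURLConnection conn = (HttpURLConnection)
                    new URL("http://" + host + ":2379/dashboard/").openConnection();
            // Disable redirect-following so the 307 from non-active
            // PDs is visible, mirroring HAProxy's behavior.
            conn.setInstanceFollowRedirects(false);
            System.out.println(host + " -> " + conn.getResponseCode());
            conn.disconnect();
        }
    }
}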
- Tags: hadoop