Hadoop + ZooKeeper High-Availability Cluster Setup
1. Configure the Docker environment
- Perform on all three nodes
#1. Permanently disable SELinux
vim /etc/selinux/config
SELINUX=disabled
#2. Disable it temporarily (current session)
setenforce 0
#3. Adjust the firewall rules
cd /etc/firewalld/zones/
vim public.xml
firewall-cmd --reload
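If you would rather open ports with firewall-cmd than edit public.xml by hand, a minimal sketch is shown below. The port list is an assumption pieced together from the configuration later in this guide (ZooKeeper 2181/28888/38888, NameNode 20007/9870, JournalNode 8485, JobHistory 10020/19888, ResourceManager web UI 8088); adjust it to your environment.
# Assumed port list; derived from the configs that follow in this guide
for p in 2181 28888 38888 20007 9870 8485 8088 10020 19888; do
  firewall-cmd --permanent --add-port=${p}/tcp
done
firewall-cmd --reload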
#4. Configure the Docker environment
2. Configure environment variables: Java, Hadoop, ZooKeeper
- Perform on all three nodes
mkdir -p /usr/lib/jvm
tar xf jdk-8u341-linux-x64.tar.gz -C /usr/lib/jvm
tar xf apache-zookeeper-3.8.1-bin.tar.gz -C /usr/local
tar xf hadoop-3.3.1.tar.gz -C /usr/local
cd /usr/local
mv apache-zookeeper-3.8.1-bin/ zookeeper
mv hadoop-3.3.1/ hadoop
#Edit /etc/profile and append the following lines
vim /etc/profile
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_341
export PATH=$JAVA_HOME/bin:$PATH
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export ZOOKEEPER_HOME=/usr/local/zookeeper
export HADOOP_HOME=/usr/local/hadoop
export PATH=$PATH:$ZOOKEEPER_HOME/bin
export PATH=$PATH:$HADOOP_HOME/sbin
export PATH=/usr/local/hadoop/bin:$PATH
#Reload /etc/profile so the changes take effect
source /etc/profile
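A quick optional sanity check that the variables took effect on each node:
java -version          # should report 1.8.0_341
hadoop version         # should report Hadoop 3.3.1
echo $ZOOKEEPER_HOME   # should print /usr/local/zookeeper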
3. Set the hostnames and edit the hosts file
- Perform on all three nodes
#1. Set the hostname on master, slave1 and slave2
#20.164.2.13
hostnamectl set-hostname master
#20.164.2.14
hostnamectl set-hostname slave1
#20.164.2.15
hostnamectl set-hostname slave2
#2. Edit the hosts file
Add the entries on all three nodes
vim /etc/hosts
20.164.2.13 master
20.164.2.14 slave1
20.164.2.15 slave2
4. Passwordless SSH
- Perform on all three nodes
ssh-keygen
ssh-copy-id master
ssh-copy-id slave1
ssh-copy-id slave2
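Optionally verify that every hostname resolves and logs in without a password prompt (run from any node):
for h in master slave1 slave2; do ssh $h hostname; done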
5. ZooKeeper configuration
#Go to the ZooKeeper installation directory
[root@master /usr/local]# ll
total 4
drwxr-xr-x.  2 root root    6 Apr 11 2018  bin
drwxr-xr-x.  2 root root    6 Apr 11 2018  etc
drwxr-xr-x.  2 root root    6 Apr 11 2018  games
drwxr-xr-x. 13 root root  250 Apr 21 18:16 hadoop
drwxr-xr-x.  2 root root    6 Apr 11 2018  include
drwxr-xr-x.  2 root root    6 Apr 11 2018  lib
drwxr-xr-x.  2 root root    6 Apr 11 2018  lib64
drwxr-xr-x.  2 root root    6 Apr 11 2018  libexec
drwxr-xr-x.  2 root root 4096 Apr 13 19:58 nsfocusagent
drwxr-xr-x.  2 root root    6 Apr 11 2018  sbin
drwxr-xr-x.  5 root root   49 Apr 11 18:08 share
drwxr-xr-x.  2 root root    6 Apr 11 2018  src
drwxr-xr-x.  8 root root  157 Apr 21 18:11 zookeeper
5.1 Edit the zoo.cfg configuration file
#Config file path
/usr/local/zookeeper/conf
#Copy zoo_sample.cfg to zoo.cfg, then edit it
cp /usr/local/zookeeper/conf/zoo_sample.cfg /usr/local/zookeeper/conf/zoo.cfg
vim zoo.cfg
[root@master /usr/local/zookeeper/conf]# cat zoo.cfg
# The number of milliseconds of each tick
tickTime=2000
# The number of ticks that the initial
# synchronization phase can take
initLimit=10
# The number of ticks that can pass between
# sending a request and getting an acknowledgement
syncLimit=5
# the directory where the snapshot is stored.
# do not use /tmp for storage, /tmp here is just
# example sakes.
# The data directory must be created first (see 5.2)
dataDir=/usr/local/zookeeper/data
# the port at which the clients will connect
clientPort=2181
# the maximum number of client connections.
# increase this if you need to handle more clients
#maxClientCnxns=60
#
# Be sure to read the maintenance section of the
# administrator guide before turning on autopurge.
#
# https://zookeeper.apache.org/doc/current/zookeeperAdmin.html#sc_maintenance
#
# The number of snapshots to retain in dataDir
# Enable autopurge: keep 3 snapshots, purge once an hour
autopurge.snapRetainCount=3
# Purge task interval in hours
# Set to "0" to disable auto purge feature
autopurge.purgeInterval=1
## Metrics Providers
#
# https://prometheus.io Metrics Exporter
#metricsProvider.className=org.apache.zookeeper.metrics.prometheus.PrometheusMetricsProvider
#metricsProvider.httpHost=0.0.0.0
#metricsProvider.httpPort=7000
#metricsProvider.exportJvmInfo=true
# Cluster members: IP, quorum port and leader-election port for each node
server.0=20.164.2.13:28888:38888
server.1=20.164.2.14:28888:38888
server.2=20.164.2.15:28888:38888
5.2 Create the data directory and the myid file
Under the data directory /usr/local/zookeeper/data, create an identification file named myid and write a numeric ID into it. The ID must match the server.x entry in zoo.cfg (x is the number), so each node gets its own value.
#The data directory is the dataDir set in zoo.cfg
mkdir /usr/local/zookeeper/data
#Create the myid file in the data directory
touch /usr/local/zookeeper/data/myid
#Write the ID into the myid file
#Match the server.0 / server.1 / server.2 entries configured in zoo.cfg
#Write the value that corresponds to each node:
#master:  echo 0 > /usr/local/zookeeper/data/myid
#slave1:  echo 1 > /usr/local/zookeeper/data/myid
#slave2:  echo 2 > /usr/local/zookeeper/data/myid
6. Hadoop configuration
The Hadoop configuration files live in this directory:
/usr/local/hadoop/etc/hadoop
The files that need to be modified are:
hadoop-env.sh
core-site.xml
hdfs-site.xml
yarn-site.xml
mapred-site.xml
- hadoop-env.sh
[root@master /usr/local/hadoop/etc/hadoop]# egrep -v '^#|^$' hadoop-env.sh
export HADOOP_OS_TYPE=${HADOOP_OS_TYPE:-$(uname -s)}
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_341
export HDFS_JOURNALNODE_USER=root
export HDFS_ZKFC_USER=root
- core-site.xml
[root@master /usr/local/hadoop/etc/hadoop]# cat core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!-- Set fs.defaultFS to the HDFS nameservice (mycluster) -->
<property>
<name>fs.defaultFS</name>
<value>hdfs://mycluster</value>
</property>
<!-- Directory for Hadoop temporary files -->
<property>
<name>hadoop.tmp.dir</name>
<value>/usr/local/hadoop/tmp</value>
</property>
<!-- io.file.buffer.size: I/O buffer size; tune it to server performance in production. The default is 4096 (4 KB) -->
<property>
<name>io.file.buffer.size</name>
<value>131702</value>
</property>
<!-- fs.trash.interval: trash retention in minutes. The default 0 means files deleted from HDFS skip the trash and are removed immediately; adjust as needed -->
<property>
<name>fs.trash.interval</name>
<value>1440</value>
</property>
<!-- Location and client port of each ZooKeeper server -->
<property>
<name>ha.zookeeper.quorum</name>
<value>master:2181,slave1:2181,slave2:2181</value>
</property>
<!-- Avoid permission errors when creating or deleting files from the HDFS web UI -->
<property>
<name>hadoop.http.staticuser.user</name>
<value>root</value>
</property>
<property>
<name>hadoop.proxyuser.root.hosts</name>
<value>*</value>
</property>
<property>
<name>hadoop.proxyuser.root.groups</name>
<value>*</value>
</property>
</configuration>
- hdfs-site.xml
[root@master /usr/local/hadoop/etc/hadoop]# cat hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!-- Nameservice (logical cluster) name -->
<property>
<name>dfs.nameservices</name>
<value>mycluster</value>
</property>
<!-- NameNodes that belong to the nameservice -->
<property>
<name>dfs.ha.namenodes.mycluster</name>
<value>nn1,nn2</value>
</property>
<!-- NameNode RPC addresses -->
<property>
<name>dfs.namenode.rpc-address.mycluster.nn1</name>
<value>master:20007</value>
</property>
<property>
<name>dfs.namenode.rpc-address.mycluster.nn2</name>
<value>slave1:20007</value>
</property>
<!-- NameNode HTTP addresses; the default in Hadoop 3 is 9870 -->
<property>
<name>dfs.namenode.http-address.mycluster.nn1</name>
<value>master:9870</value>
</property>
<property>
<name>dfs.namenode.http-address.mycluster.nn2</name>
<value>slave1:9870</value>
</property>
<!-- Where the NameNode shared edits (metadata) are stored on the JournalNodes -->
<property>
<name>dfs.namenode.shared.edits.dir</name>
<value>qjournal://master:8485;slave1:8485;slave2:8485/mycluster</value>
</property>
<!-- Local storage directory for JournalNode data -->
<property>
<name>dfs.journalnode.edits.dir</name>
<value>/usr/local/hadoop/hdfs/journal/data</value>
</property>
<!-- Enable automatic NameNode failover -->
<property>
<name>dfs.ha.automatic-failover.enabled</name>
<value>true</value>
</property>
<!-- Failover proxy provider: lets clients determine which NameNode is active -->
<property>
<name>dfs.client.failover.proxy.provider.mycluster</name>
<value>org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider</value>
</property>
<!-- Fencing method, so that only one NameNode serves clients at any time -->
<property>
<name>dfs.ha.fencing.methods</name>
<value>sshfence</value>
</property>
<!-- sshfence requires key-based SSH login -->
<property>
<name>dfs.ha.fencing.ssh.private-key-files</name>
<value>/root/.ssh/id_rsa</value>
</property>
<!-- Fencing SSH connect timeout (milliseconds) -->
<property>
<name>dfs.ha.fencing.ssh.connect-timeout</name>
<value>30000</value>
</property>
<property>
<name>dfs.namenode.datanode.registration.ip-hostname-check</name>
<value>false</value>
</property>
</configuration>
- yarn-site.xml
[root@master /usr/local/hadoop/etc/hadoop]# cat yarn-site.xml
<?xml version="1.0"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<configuration>
<!-- Site specific YARN configuration properties -->
<!-- Enable ResourceManager HA -->
<property>
<name>yarn.resourcemanager.ha.enabled</name>
<value>true</value>
</property>
<!-- ResourceManager cluster ID -->
<property>
<name>yarn.resourcemanager.cluster-id</name>
<value>yrc</value>
</property>
<!-- Logical IDs of the ResourceManagers -->
<property>
<name>yarn.resourcemanager.ha.rm-ids</name>
<value>rm1,rm2</value>
</property>
<!-- Hostname of each ResourceManager -->
<property>
<name>yarn.resourcemanager.hostname.rm1</name>
<value>master</value>
</property>
<property>
<name>yarn.resourcemanager.hostname.rm2</name>
<value>slave1</value>
</property>
<!-- ZooKeeper ensemble address -->
<property>
<name>yarn.resourcemanager.zk-address</name>
<value>master:2181,slave1:2181,slave2:2181</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
</configuration>
- mapred-site.xml
[root@master /usr/local/hadoop/etc/hadoop]# cat mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<!-- Put site-specific property overrides in this file. -->
<configuration>
<!-- Use YARN as the resource-scheduling framework for MapReduce -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>master:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>master:19888</value>
</property>
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_MAPRED_HOME=$HADOOP_HOME</value>
</property>
</configuration>
- workers file
[root@master /usr/local/hadoop/etc/hadoop]# cat workers
master
slave1
slave2
- Create the name directory, otherwise the NameNode will not start
mkdir -p /usr/local/hadoop/tmp/dfs/name
7. Copy the files to slave1 and slave2 (ZooKeeper and Hadoop)
#zookeeper
scp -r /usr/local/zookeeper slave1:/usr/local/
scp -r /usr/local/zookeeper slave2:/usr/local/
#hadoop
scp -r /usr/local/hadoop slave1:/usr/local/
scp -r /usr/local/hadoop slave2:/usr/local/
Note the per-node difference: the myid file must match the server.x entries in zoo.cfg, so set the right value on each node after copying (see the check below).
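A quick way to confirm every node ends up with its own ID (a minimal check, assuming the paths used above):
for h in master slave1 slave2; do echo -n "$h: "; ssh $h cat /usr/local/zookeeper/data/myid; done
# Expected: master: 0, slave1: 1, slave2: 2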
8. Start-up and testing
Start the ZooKeeper ensemble; run on master, slave1 and slave2
#Run the start command on every node
zkServer.sh start
#Check the status of every node
zkServer.sh status
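Once all three nodes are started, two should report Mode: follower and one Mode: leader. A sketch that checks them all in one go (assumes passwordless SSH and the install path used above):
for h in master slave1 slave2; do
  echo -n "$h -> "
  ssh $h "/usr/local/zookeeper/bin/zkServer.sh status 2>&1 | grep Mode"
done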
- Start the JournalNode on master, slave1 and slave2
hdfs --daemon start journalnode
[root@master hadoop]# hdfs --daemon start journalnode
WARNING: /opt/hadoop-3.1.3/logs does not exist. Creating.
[root@master hadoop]# jps
122130 JournalNode
122178 Jps
7781 QuorumPeerMain
- On master, format HDFS
hdfs namenode -format
- After formatting, start the NameNode process on master
hdfs --daemon start namenode
[root@master hadoop]# hdfs --daemon start namenode
[root@master hadoop]# jps
122130 JournalNode
7781 QuorumPeerMain
122406 NameNode
122446 Jps
- On the other NameNode machine (slave1), synchronize the metadata and then start the NameNode on that node
hdfs namenode -bootstrapStandby    #synchronize
hdfs --daemon start namenode       #start
# Synchronize the metadata
[root@slave1 ~]# hdfs namenode -bootstrapStandby
# Output
=====================================================
About to bootstrap Standby ID nn2 from:
Nameservice ID: mycluster
Other Namenode ID: nn1
Other NN's HTTP address: http://master:9870
Other NN's IPC address: master/192.168.64.102:9820
Namespace ID: 668316271
Block pool ID: BP-1895467582-192.168.64.102-1677179048358
Cluster ID: CID-a14a7c8c-5b81-4f8b-9cf6-ff25735f6543
Layout version: -64
isUpgradeFinalized: true
=====================================================
2023-02-24 03:10:58,141 INFO common.Storage: Storage directory /opt/hadoop-3.1.3/data/dfs/name has been successfully formatted.
# Start the NameNode
[root@slave1 ~]# hdfs --daemon start namenode
[root@slave1 ~]# jps
53650 Jps
53431 JournalNode
53581 NameNode    #the NameNode is now running
1935 QuorumPeerMain
- On master, format the ZKFC state in ZooKeeper
hdfs zkfc -formatZK
# Output
2023-02-24 03:09:29,842 INFO ha.ActiveStandbyElector: Successfully created /hadoop-ha/mycluster in ZK.
At this point the 3 DataNodes and the 2 ZKFC processes have not been started yet
Start the Hadoop cluster; run on master
#master
cd /usr/local/hadoop/sbin
#./start-all.sh
start-all.sh
hadoop-daemon.sh start zkfc
#slave1
yarn-daemon.sh start resourcemanager
hadoop-daemon.sh start zkfc
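To confirm that ResourceManager HA came up, the state of each RM can be queried (rm1/rm2 are the IDs defined in yarn-site.xml):
yarn rmadmin -getServiceState rm1   # one of the two should report active
yarn rmadmin -getServiceState rm2   # the other should report standby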
- Then start zkCli.sh and inspect the znodes
zkCli.sh -server 20.164.2.14
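Inside the zkCli shell, the HA znodes created by ZKFC can be inspected; the paths below follow the usual /hadoop-ha/<nameservice> layout with the mycluster name configured above:
# inside zkCli.sh
ls /hadoop-ha/mycluster
get /hadoop-ha/mycluster/ActiveStandbyElectorLock   # shows which NameNode currently holds the active lock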
- Test: kill the NameNode process on the node that is currently in the active state
[root@master hadoop]# jps
123520 DFSZKFailoverController
122130 JournalNode
123106 DataNode
7781 QuorumPeerMain
122406 NameNode
124156 Jps
[root@master hadoop]# kill 122406
[root@master hadoop]# jps
123520 DFSZKFailoverController
122130 JournalNode
123106 DataNode
7781 QuorumPeerMain
124200 Jps
After the kill, the standby NameNode on slave1 should transition to active
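The NameNode states can be verified with hdfs haadmin (nn1/nn2 are the IDs defined in hdfs-site.xml):
hdfs haadmin -getServiceState nn1   # the killed NameNode; this call will fail to connect
hdfs haadmin -getServiceState nn2   # should now report active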
Note: if both NameNodes end up in standby, the ZKFC state has to be reformatted
#Stop the ZKFC processes first, then format, then start them again; the format runs on master only, the stop and start run on both NameNode nodes
#In this setup the active/standby NameNodes are master and slave1
#Run the stop command on both the active and the standby node
hadoop-daemon.sh stop zkfc
#Format on the master node
hdfs zkfc -formatZK
#Start on both the active and the standby node
hadoop-daemon.sh start zkfc