Deploying Spark + Hadoop + Livy with Docker

The first Dockerfile uses python:3.6 as its base image; the underlying Linux distribution of that image is Debian:

FROM python:3.6

ARG WORK_DIR=/opt

WORKDIR $WORK_DIR

# java
ADD jdk-8u281-linux-x64.tar.gz $WORK_DIR
RUN mv jdk1.8.0_281 jdk
ENV JAVA_HOME $WORK_DIR/jdk
ENV JRE_HOME $JAVA_HOME/jre
ENV CLASSPATH .:$JAVA_HOME/lib:$JRE_HOME/lib
ENV PATH $PATH:$JAVA_HOME/bin

# hadoop
ADD hadoop-2.7.7.tar.gz $WORK_DIR
RUN mv hadoop-2.7.7 hadoop
RUN mkdir -p /home/hadoop/tmp /home/hadoop/dfs/name /home/hadoop/dfs/data
ENV HADOOP_HOME $WORK_DIR/hadoop
ENV PATH $PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
COPY core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
COPY hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
COPY mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
COPY yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
COPY slaves $HADOOP_HOME/etc/hadoop/slaves
RUN echo export JAVA_HOME=$JAVA_HOME >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh

# spark
ADD spark-2.4.7-bin-hadoop2.7.tgz $WORK_DIR
RUN mv spark-2.4.7-bin-hadoop2.7 spark
ENV SPARK_HOME /opt/spark
ENV PATH $PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
RUN echo export JAVA_HOME=$JAVA_HOME >> $SPARK_HOME/conf/spark-env.sh
COPY slaves $SPARK_HOME/conf/slaves

RUN sed -i s@deb.debian.org@mirrors.aliyun.com@g /etc/apt/sources.list
RUN sed -i s@security.debian.org@mirrors.aliyun.com@g /etc/apt/sources.list
RUN apt-get update && apt-get install -y openssh-server unzip

RUN mkdir -p ~/.ssh && \
    ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    chmod 600 ~/.ssh/authorized_keys
RUN echo '    StrictHostKeyChecking no' >> /etc/ssh/ssh_config

COPY apache-livy-0.7.0-incubating-bin.zip $WORK_DIR
RUN cd $WORK_DIR && unzip apache-livy-0.7.0-incubating-bin.zip && \
    rm -f apache-livy-0.7.0-incubating-bin.zip && \
    mv apache-livy-0.7.0-incubating-bin livy
ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
ENV PATH $PATH:$WORK_DIR/livy/bin

RUN mkdir -p /var/run/sshd
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]

CentOS variant. This version requires installing Python by hand; the Dockerfile below does not install it yet, and I will add the Python installation steps when I get the time:

FROM centos

ARG WORK_DIR=/opt

# java
ADD jdk-8u281-linux-x64.tar.gz $WORK_DIR
RUN mv $WORK_DIR/jdk1.8.0_281 $WORK_DIR/jdk
ENV JAVA_HOME $WORK_DIR/jdk
ENV JRE_HOME $JAVA_HOME/jre
ENV CLASSPATH .:$JAVA_HOME/lib:$JRE_HOME/lib
ENV PATH $PATH:$JAVA_HOME/bin

# hadoop
ADD hadoop-2.7.7.tar.gz $WORK_DIR
RUN mv $WORK_DIR/hadoop-2.7.7 $WORK_DIR/hadoop
RUN mkdir -p /home/hadoop/tmp /home/hadoop/dfs/name /home/hadoop/dfs/data
ENV HADOOP_HOME $WORK_DIR/hadoop
ENV PATH $PATH:$JAVA_HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
COPY core-site.xml $HADOOP_HOME/etc/hadoop/core-site.xml
COPY hdfs-site.xml $HADOOP_HOME/etc/hadoop/hdfs-site.xml
COPY mapred-site.xml $HADOOP_HOME/etc/hadoop/mapred-site.xml
COPY yarn-site.xml $HADOOP_HOME/etc/hadoop/yarn-site.xml
COPY slaves $HADOOP_HOME/etc/hadoop/slaves
RUN echo export JAVA_HOME=$JAVA_HOME >> $HADOOP_HOME/etc/hadoop/hadoop-env.sh

# spark
ADD spark-2.4.7-bin-hadoop2.7.tgz $WORK_DIR
RUN mv $WORK_DIR/spark-2.4.7-bin-hadoop2.7 $WORK_DIR/spark
ENV SPARK_HOME /opt/spark
ENV PATH $PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
RUN echo export JAVA_HOME=$JAVA_HOME >> $SPARK_HOME/conf/spark-env.sh
COPY slaves $SPARK_HOME/conf/slaves

RUN yum install -y openssh-server openssh-clients which unzip

RUN ssh-keygen -t dsa -N '' -f /etc/ssh/ssh_host_dsa_key
RUN ssh-keygen -t rsa -N '' -f /etc/ssh/ssh_host_rsa_key
RUN ssh-keygen -t ecdsa -N '' -f /etc/ssh/ssh_host_ecdsa_key
RUN ssh-keygen -t ed25519 -N '' -f /etc/ssh/ssh_host_ed25519_key
RUN mkdir -p ~/.ssh && \
    ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa && \
    cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \
    chmod 600 ~/.ssh/authorized_keys
RUN echo -e 'Host *\n  StrictHostKeyChecking no' > /etc/ssh/ssh_config.d/default.conf

COPY apache-livy-0.7.0-incubating-bin.zip $WORK_DIR
RUN cd $WORK_DIR && unzip apache-livy-0.7.0-incubating-bin.zip && \
    rm -f apache-livy-0.7.0-incubating-bin.zip && \
    mv apache-livy-0.7.0-incubating-bin livy
ENV HADOOP_CONF_DIR $HADOOP_HOME/etc/hadoop
ENV PATH $PATH:$WORK_DIR/livy/bin

WORKDIR $WORK_DIR

RUN mkdir -p /var/run/sshd
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]

docker-compose.yml

version: '3.7'

services:
  master:
    image: spark
    restart: unless-stopped
    ports:
    - 8080:8080   # Spark master web UI
    - 8088:8088   # YARN ResourceManager web UI
    - 9000:9000   # HDFS NameNode RPC
    - 8998:8998   # Livy server (REST API)
  slave1:
    image: spark
    restart: unless-stopped

  slave2:
    image: spark
    restart: unless-stopped
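With the image built, the three-node cluster can be started and entered from the directory containing docker-compose.yml:

docker-compose up -d                # start master, slave1 and slave2 in the background
docker-compose exec master bash     # open a shell in the master container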

core-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>file:/home/hadoop/tmp</value>
    </property>
</configuration>
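Once the cluster is up, you can verify from inside the master container that Hadoop picked up this setting:

hdfs getconf -confKey fs.defaultFS   # should print hdfs://master:9000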

hdfs-site.xml

<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>dfs.namenode.secondary.http-address</name>
        <value>master:9001</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/home/hadoop/dfs/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/home/hadoop/dfs/data</value>
    </property>
    <property>
        <name>dfs.replication</name>
        <value>2</value>
    </property>
</configuration>
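After HDFS has been started (see the commands at the end of this post), a quick way to confirm that the DataNodes on all three hosts in the slaves file registered and that replication is in effect:

hdfs dfsadmin -report   # lists live DataNodes with their capacity and usage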

mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->

<!-- Put site-specific property overrides in this file. -->

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>
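With mapreduce.framework.name set to yarn, the examples jar shipped with Hadoop 2.7.7 makes for a simple smoke test of the MapReduce-on-YARN path; the job should show up in the ResourceManager UI on port 8088:

# estimate pi with 2 map tasks and 10 samples per task
hadoop jar /opt/hadoop/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.7.jar pi 2 10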

yarn-site.xml

<?xml version="1.0"?>
<!--
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. See accompanying LICENSE file.
-->
<configuration>
    <!-- Site specific YARN configuration properties -->
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <!-- Which host the ResourceManager runs on -->
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
    <property>
        <name>yarn.resourcemanager.address</name>
        <value>master:8032</value>
    </property>
    <property>
        <name>yarn.resourcemanager.scheduler.address</name>
        <value>master:8030</value>
    </property>
    <property>
        <name>yarn.resourcemanager.resource-tracker.address</name>
        <value>master:8031</value>
    </property>
    <property>
        <name>yarn.resourcemanager.admin.address</name>
        <value>master:8033</value>
    </property>
    <property>
        <name>yarn.resourcemanager.webapp.address</name>
        <value>master:8088</value>
    </property>
    <!-- Enable log aggregation -->
    <property>
        <name>yarn.log-aggregation-enable</name>
        <value>true</value>
    </property>
</configuration>
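With yarn.log-aggregation-enable turned on, the container logs of finished applications are collected into HDFS and can be fetched through the yarn CLI (the application ID below is a placeholder; use the one printed when you submit a job):

yarn logs -applicationId application_1234567890123_0001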

For the contents of these four configuration files, consult the Hadoop documentation and adjust them to your needs.

slaves

# These are the hostnames of the three nodes. Since the cluster is started with docker-compose, each service name defined in docker-compose.yml resolves as a hostname on the compose network.
master
slave1
slave2

After the compose stack is up, exec into the master container and run the following commands:

# First run only: format the NameNode before starting HDFS
hdfs namenode -format
# Start Hadoop, Spark and Livy, in that order
/opt/hadoop/sbin/start-all.sh
/opt/spark/sbin/start-all.sh
livy-server start
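Once Livy is listening on port 8998, its REST API can be smoke-tested with curl, either from inside the master container or from the host via the published port. Session startup is asynchronous, so poll the session until its state is idle; the session id 0 below assumes this is the first session created:

# create a PySpark session
curl -s -X POST -H 'Content-Type: application/json' -d '{"kind": "pyspark"}' http://localhost:8998/sessions
# poll until "state" becomes "idle"
curl -s http://localhost:8998/sessions/0
# run a statement in the session
curl -s -X POST -H 'Content-Type: application/json' -d '{"code": "1 + 1"}' http://localhost:8998/sessions/0/statements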

References:

https://blog.csdn.net/qq_39494664/article/details/106001216

https://www.cnblogs.com/Fordestiny/p/9401161.html

https://www.cnblogs.com/cpaulyz/p/13740540.html
