build.sh:
#!/bin/bash
#
# -- Build Apache Spark Standalone Cluster Docker Images

# ----------------------------------------------------------------------------------------------------------------------
# -- Variables ---------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------

BUILD_DATE="$(date -u +'%Y-%m-%d')"
SPARK_VERSION="3.5.4"
HADOOP_VERSION="3"
# DELTA_SPARK_VERSION="2.4.0"
# DELTALAKE_VERSION="0.10.0"
# JUPYTERLAB_VERSION="4.0.2"
# PANDAS_VERSION="2.0.1"
DELTA_PACKAGE_VERSION="delta-core_2.12:2.4.0"
# SPARK_VERSION_MAJOR=${SPARK_VERSION:0:1}
SPARK_XML_PACKAGE_VERSION="spark-xml_2.12:0.16.0"
# SPARKSQL_MAGIC_VERSION="0.0.3"
# KAFKA_PYTHON_VERSION="2.0.2"

# ----------------------------------------------------------------------------------------------------------------------
# -- Functions ---------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------

function cleanContainers() {
    container="$(docker ps -a | grep 'jupyterlab' | awk '{print $1}')"
    docker stop "${container}"
    docker rm "${container}"

    container="$(docker ps -a | grep 'spark-worker' -m 1 | awk '{print $1}')"
    while [ -n "${container}" ]; do
        docker stop "${container}"
        docker rm "${container}"
        container="$(docker ps -a | grep 'spark-worker' -m 1 | awk '{print $1}')"
    done

    container="$(docker ps -a | grep 'spark-master' | awk '{print $1}')"
    docker stop "${container}"
    docker rm "${container}"

    container="$(docker ps -a | grep 'spark-base' | awk '{print $1}')"
    docker stop "${container}"
    docker rm "${container}"

    container="$(docker ps -a | grep 'base' | awk '{print $1}')"
    docker stop "${container}"
    docker rm "${container}"
}

function cleanImages() {
    docker rmi -f "$(docker images | grep -m 1 'jupyterlab' | awk '{print $3}')"
    docker rmi -f "$(docker images | grep -m 1 'spark-worker' | awk '{print $3}')"
    docker rmi -f "$(docker images | grep -m 1 'spark-master' | awk '{print $3}')"
    docker rmi -f "$(docker images | grep -m 1 'spark-base' | awk '{print $3}')"
    docker rmi -f "$(docker images | grep -m 1 'base' | awk '{print $3}')"
}

function cleanVolume() {
    docker volume rm "distributed-file-system"
}

function buildImages() {
    docker build \
      --build-arg build_date="${BUILD_DATE}" \
      -f docker/base/Dockerfile \
      -t base:latest .

    docker build \
      --build-arg build_date="${BUILD_DATE}" \
      --build-arg spark_version="${SPARK_VERSION}" \
      --build-arg hadoop_version="${HADOOP_VERSION}" \
      --build-arg delta_package_version="${DELTA_PACKAGE_VERSION}" \
      --build-arg spark_xml_package_version="${SPARK_XML_PACKAGE_VERSION}" \
      -f docker/spark-base/Dockerfile \
      -t spark-base:${SPARK_VERSION} .

    docker build \
      --build-arg build_date="${BUILD_DATE}" \
      --build-arg spark_version="${SPARK_VERSION}" \
      -f docker/spark-master/Dockerfile \
      -t spark-master:${SPARK_VERSION} .

    docker build \
      --build-arg build_date="${BUILD_DATE}" \
      --build-arg spark_version="${SPARK_VERSION}" \
      -f docker/spark-worker/Dockerfile \
      -t spark-worker:${SPARK_VERSION} .

    docker build \
      --build-arg build_date="${BUILD_DATE}" \
      --build-arg spark_version="${SPARK_VERSION}" \
      -f docker/jupyterlab/Dockerfile \
      -t jupyterlab:spark-${SPARK_VERSION} .
}

# ----------------------------------------------------------------------------------------------------------------------
# -- Main --------------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------

cleanContainers;
cleanImages;
cleanVolume;
buildImages;
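The script is run from the project root so that the docker/<image>/Dockerfile paths resolve. On a first run the clean functions will print harmless errors, since there are no containers, images, or volume to remove yet. A minimal invocation looks like this:

chmod +x build.sh
./build.sh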
base/Dockerfile:
ARG java_image_tag=17-jre
FROM eclipse-temurin:${java_image_tag}

# -- Layer: Image Metadata
ARG build_date
LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Data Engineering with Apache Spark and Delta Lake Cookbook - Cluster base image"
LABEL org.label-schema.schema-version="1.0"

# -- Layer: OS + Python + Scala
ARG shared_workspace=/opt/workspace

RUN mkdir -p ${shared_workspace}/data
RUN mkdir -p /usr/share/man/man1
RUN apt-get update -y
RUN apt-get install -y python3 python3-dev python3-venv
RUN apt-get install -y curl r-base netcat-traditional build-essential manpages-dev
RUN apt-get clean
RUN rm -rf /var/lib/apt/lists/*
RUN python3 -m venv /opt/myenv
RUN /opt/myenv/bin/pip install --no-cache-dir --upgrade pip

# Install the Python libraries used throughout the cookbook into the virtual environment.
RUN /opt/myenv/bin/pip install --quiet --no-cache-dir delta-spark
RUN /opt/myenv/bin/pip install --quiet --no-cache-dir deltalake
RUN /opt/myenv/bin/pip install --quiet --no-cache-dir pandas

ENV SCALA_HOME="/usr/bin/scala"
ENV PATH=/opt/myenv/bin:${PATH}:${SCALA_HOME}/bin
ENV SHARED_WORKSPACE=${shared_workspace}

# -- Runtime
VOLUME ${shared_workspace}
CMD ["bash"]
Successfully built.
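Because /opt/myenv/bin is put first on the PATH, python3 inside the image resolves to the virtual environment. A quick smoke test (not part of the build) confirms that the libraries are importable:

docker run --rm base:latest python3 -c "import pandas, deltalake; print(pandas.__version__)"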
spark-base/Dockerfile:
FROM base

# -- Layer: Image Metadata
ARG build_date
ARG delta_package_version
ARG spark_xml_package_version
LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Data Engineering with Apache Spark and Delta Lake Cookbook - Spark base image"
LABEL org.label-schema.schema-version="1.0"

# -- Layer: Apache Spark
ARG spark_version
ARG hadoop_version

RUN curl https://archive.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz -o spark.tgz
RUN tar -xf spark.tgz
RUN mv spark-${spark_version}-bin-hadoop${hadoop_version} /usr/bin/
RUN echo "alias pyspark=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/bin/pyspark" >> ~/.bashrc
RUN echo "alias spark-shell=/usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/bin/spark-shell" >> ~/.bashrc
RUN mkdir /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}/logs
RUN rm spark.tgz

ENV SPARK_HOME /usr/bin/spark-${spark_version}-bin-hadoop${hadoop_version}
ENV SPARK_MASTER_HOST spark-master
ENV SPARK_MASTER_PORT 7077
ENV PYSPARK_PYTHON python3

# -- Runtime
WORKDIR ${SPARK_HOME}

USER root
ARG NBuser=NBuser
ARG GROUP=NBuser
RUN groupadd -r ${GROUP} && useradd -r -m -g ${GROUP} ${NBuser}
RUN chown -R "${NBuser}":"${GROUP}" /home/"${NBuser}"/
RUN chown -R "${NBuser}":"${GROUP}" "${SPARK_HOME}"
RUN chown -R "${NBuser}":"${GROUP}" "${SHARED_WORKSPACE}"
USER ${NBuser}

# Warm the local Ivy cache: each throwaway spark-shell run downloads the Delta Lake,
# spark-xml and Kafka connector packages so they are already present at runtime.
RUN ${SPARK_HOME}/bin/spark-shell --packages io.delta:${delta_package_version} \
    --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" \
    --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog"
RUN ${SPARK_HOME}/bin/spark-shell --packages com.databricks:${spark_xml_package_version}
RUN ${SPARK_HOME}/bin/spark-shell --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1
Successfully built.
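Since WORKDIR already points at SPARK_HOME, the unpacked Spark distribution can be verified directly from the image:

docker run --rm spark-base:3.5.4 bin/spark-submit --version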
spark-master/Dockerfile:
ARG spark_version
FROM spark-base:${spark_version}

# -- Layer: Image Metadata
ARG build_date
LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Spark master image"
LABEL org.label-schema.schema-version="1.0"

# -- Runtime
EXPOSE 8080 7077
CMD bin/spark-class org.apache.spark.deploy.master.Master >> logs/spark-master.out
Successfully built.
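The master image can be smoke-tested outside Compose. The sketch below uses a user-defined network and the names spark-net and spark-master purely for illustration; naming the container spark-master matters because the worker image resolves that hostname:

docker network create spark-net
docker run -d --name spark-master --network spark-net -p 8080:8080 -p 7077:7077 spark-master:3.5.4
sleep 5   # give the master a moment to come up
curl -s http://localhost:8080 | grep -o "Spark Master at spark://[^<]*"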
spark-worker/Dockerfile:
ARG spark_version
FROM spark-base:${spark_version}

# -- Layer: Image Metadata
ARG build_date
LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.description="Spark worker image"
LABEL org.label-schema.schema-version="1.0"

# -- Runtime
EXPOSE 8081
CMD bin/spark-class org.apache.spark.deploy.worker.Worker spark://${SPARK_MASTER_HOST}:${SPARK_MASTER_PORT} >> logs/spark-worker.out
Successfully built.
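Continuing the smoke test above, a worker started on the same network picks up the SPARK_MASTER_HOST and SPARK_MASTER_PORT defaults baked into spark-base and registers with the master (the container name spark-worker-test is illustrative):

docker run -d --name spark-worker-test --network spark-net -p 8081:8081 spark-worker:3.5.4
# The worker should now be listed under "Workers" on http://localhost:8080
docker rm -f spark-worker-test spark-master && docker network rm spark-net   # clean up before starting the Compose stack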
jupyterlab/Dockerfile:
FROM base

# -- Layer: Image Metadata
ARG build_date
LABEL org.label-schema.build-date=${build_date}
LABEL org.label-schema.name="Data Engineering with Apache Spark and Delta Lake Cookbook - JupyterLab Image"
LABEL org.label-schema.description="JupyterLab image"

# -- Layer: Notebooks and data
# ADD docker/jupyterlab/kafka-producer.py /

# -- Layer: JupyterLab + Python kernel for PySpark
ARG spark_version
RUN pip install --no-cache-dir wget
RUN pip install --no-cache-dir pyspark==${spark_version}
RUN pip install --no-cache-dir jupyterlab
RUN pip install --no-cache-dir sparksql-magic
RUN pip install --no-cache-dir kafka-python

EXPOSE 8888
WORKDIR ${SHARED_WORKSPACE}
# COPY docker/jupyterlab/00-first.py /root/.ipython/profile_default/startup/00-first.py
CMD jupyter lab --ip=0.0.0.0 --port=8888 --no-browser --allow-root --NotebookApp.token=
Successfully built.
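Because pip in this image resolves to the virtual environment inherited from base, the PySpark version pinned by the spark_version build argument can be checked against the cluster's Spark version:

docker run --rm jupyterlab:spark-3.5.4 python -c "import pyspark; print(pyspark.__version__)"
# Expected output: 3.5.4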
docker-compose.yml:
volumes:
  shared-workspace:
    name: "distributed-file-system"
    driver: local
    driver_opts:
      o: bind
      type: none
      device: ./docker_volumes

services:
  zookeeper:
    image: docker.io/bitnami/zookeeper:3.8.2
    container_name: zookeeper
    ports:
      - "2181:2181"
    volumes:
      - shared-workspace:/opt/workspace
    environment:
      - ALLOW_ANONYMOUS_LOGIN=yes
  kafka:
    image: docker.io/bitnami/kafka:3.5.1
    container_name: kafka
    ports:
      - "9092:9092"
    environment:
      - BITNAMI_DEBUG=yes
      - KAFKA_BROKER_ID=1
      - KAFKA_ENABLE_KRAFT=false
      - KAFKA_CFG_LISTENERS=PLAINTEXT://kafka:9092
      - KAFKA_CFG_ADVERTISED_LISTENERS=PLAINTEXT://kafka:9092
      - KAFKA_CFG_ZOOKEEPER_CONNECT=zookeeper:2181
      - KAFKA_CFG_INTER_BROKER_LISTENER_NAME=PLAINTEXT
      - KAFKA_CFG_LISTENER_SECURITY_PROTOCOL_MAP=PLAINTEXT:PLAINTEXT
      - ALLOW_PLAINTEXT_LISTENER=yes
    depends_on:
      - zookeeper
  jupyterlab:
    image: jupyterlab:spark-3.5.4
    container_name: jupyterlab
    ports:
      - 8888:8888
      - 4040:4040
    volumes:
      - shared-workspace:/opt/workspace
  spark-master:
    image: spark-master:3.5.4
    container_name: spark-master
    ports:
      - 8080:8080
      - 7077:7077
    volumes:
      - shared-workspace:/opt/workspace
  spark-worker-1:
    image: spark-worker:3.5.4
    container_name: spark-worker-1
    environment:
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=512m
    ports:
      - 8081:8081
    volumes:
      - shared-workspace:/opt/workspace
    depends_on:
      - spark-master
  spark-worker-2:
    image: spark-worker:3.5.4
    container_name: spark-worker-2
    environment:
      - SPARK_WORKER_CORES=1
      - SPARK_WORKER_MEMORY=512m
    ports:
      - 8082:8081
    volumes:
      - shared-workspace:/opt/workspace
    depends_on:
      - spark-master
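The shared-workspace volume bind-mounts ./docker_volumes from the host, so that directory must exist before the stack is started. A typical launch from the project root looks like this (Docker Compose v2 syntax assumed; use docker-compose with v1):

mkdir -p docker_volumes
docker compose up -d

# Web UIs once everything is up:
#   Spark master  -> http://localhost:8080
#   Spark workers -> http://localhost:8081 and http://localhost:8082
#   JupyterLab    -> http://localhost:8888 (no token, per the CMD above)
#   Spark app UI  -> http://localhost:4040 (once a notebook starts a SparkSession)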