FROM debian:10-slim

# Suppress interactive prompts.
ENV DEBIAN_FRONTEND=noninteractive

# Required: Install utilities required by Spark scripts.
RUN apt update && apt install -y procps tini

# Optional: Add extra jars.
ENV SPARK_EXTRA_JARS_DIR=/opt/spark/jars/
ENV SPARK_EXTRA_CLASSPATH='/opt/spark/jars/*'
RUN mkdir -p "${SPARK_EXTRA_JARS_DIR}"
COPY *.jar "${SPARK_EXTRA_JARS_DIR}"

# Optional: Install and configure Miniconda3.
ENV CONDA_HOME=/opt/miniconda3
ENV PYSPARK_PYTHON=${CONDA_HOME}/bin/python
ENV PYSPARK_DRIVER_PYTHON=${CONDA_HOME}/bin/python

ENV PATH=${CONDA_HOME}/bin:${PATH}
COPY Miniconda3-py39_4.10.3-Linux-x86_64.sh .
RUN bash Miniconda3-py39_4.10.3-Linux-x86_64.sh -b -p /opt/miniconda3 \
  && ${CONDA_HOME}/bin/conda config --system --set always_yes True \
  && ${CONDA_HOME}/bin/conda config --system --set auto_update_conda False \
  && ${CONDA_HOME}/bin/conda config --system --prepend channels conda-forge \
  && ${CONDA_HOME}/bin/conda config --system --set channel_priority strict

# Optional: Install Conda packages.
#
# The following packages are installed in the default image. It is strongly
# recommended to include all of them.
#
# Use mamba to install packages quickly.
RUN ${CONDA_HOME}/bin/conda install mamba -n base -c conda-forge \
  && ${CONDA_HOME}/bin/mamba install \
    conda \
    cython \
    fastavro \
    fastparquet \
    gcsfs \
    google-cloud-bigquery-storage \
    google-cloud-bigquery[pandas] \
    google-cloud-bigtable \
    google-cloud-container \
    google-cloud-datacatalog \
    google-cloud-dataproc \
    google-cloud-datastore \
    google-cloud-language \
    google-cloud-logging \
    google-cloud-monitoring \
    google-cloud-pubsub \
    google-cloud-redis \
    google-cloud-spanner \
    google-cloud-speech \
    google-cloud-storage \
    google-cloud-texttospeech \
    google-cloud-translate \
    google-cloud-vision \
    koalas \
    matplotlib \
    nltk \
    numba \
    numpy \
    openblas \
    orc \
    pandas \
    pyarrow \
    pysal \
    pytables \
    python \
    regex \
    requests \
    rtree \
    scikit-image \
    scikit-learn \
    scipy \
    seaborn \
    sqlalchemy \
    sympy \
    virtualenv

# Optional: Add extra Python modules.
ENV PYTHONPATH=/opt/python/packages
RUN mkdir -p "${PYTHONPATH}"
COPY test_util.py "${PYTHONPATH}"

# Required: Create the 'yarn_docker_user' group/user.
# The GID and UID must be 1099. Home directory is required.
RUN groupadd -g 1099 yarn_docker_user
RUN useradd -u 1099 -g 1099 -d /home/yarn_docker_user -m yarn_docker_user
USER yarn_docker_user
Build and push the image

The following commands build and push the example Docker image; adjust them to match your customizations.
# Increase the version number when there is a change to avoid referencing
# a cached older image. Avoid reusing the version number, including the default
# `latest` version.
IMAGE=gcr.io/my-project/my-image:1.0.1

# Download the BigQuery connector.
gcloud storage cp \
    gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.22.2.jar .

# Download the Miniconda3 installer.
wget https://repo.anaconda.com/miniconda/Miniconda3-py39_4.10.3-Linux-x86_64.sh

# Python module example:
cat >test_util.py <<EOF
def hello(name):
    print("hello {}".format(name))

def read_lines(path):
    with open(path) as f:
        return f.readlines()
EOF

# Build and push the image.
docker build -t "${IMAGE}" .
docker push "${IMAGE}"
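Optionally, you can smoke-test the image locally before relying on it in a job. The following checks are an illustrative sketch, not part of the original workflow; they assume the image was built from the Dockerfile above, which installs the Conda packages and copies test_util.py into the PYTHONPATH directory.

# Optional sanity checks (illustrative): confirm the Conda environment and the
# extra Python module are present in the image.
docker run --rm "${IMAGE}" /opt/miniconda3/bin/python -c "import pandas, pyarrow; print('conda packages OK')"
docker run --rm "${IMAGE}" /opt/miniconda3/bin/python -c "import test_util; test_util.hello('docker')"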
Create a Dataproc cluster

After creating a Docker image that customizes your Spark environment, create a Dataproc cluster that will use your Docker image when running Spark jobs.

gcloud dataproc clusters create CLUSTER_NAME \
    --region=REGION \
    --image-version=DP_IMAGE \
    --optional-components=DOCKER \
    --properties=dataproc:yarn.docker.enable=true,dataproc:yarn.docker.image=DOCKER_IMAGE \
    other flags

Replace the following:

- CLUSTER_NAME: The cluster name.
- REGION: The cluster region.
- DP_IMAGE: The Dataproc image version, which must be 2.0.49 or later (--image-version=2.0 uses a qualified minor version later than 2.0.49).
- --optional-components=DOCKER: Enables the Docker component on the cluster.
- --properties flag:
  - dataproc:yarn.docker.enable=true: Required property to enable the Dataproc Docker on YARN feature.
  - dataproc:yarn.docker.image: Optional property that you can add to specify your DOCKER_IMAGE using the following Container Registry image naming format: {hostname}/{project-id}/{image}:{tag}.

    Example: dataproc:yarn.docker.image=gcr.io/project-id/image:1.0.1

    Requirement: You must host your Docker image on Container Registry or Artifact Registry (Dataproc cannot fetch containers from other registries).

    Recommendation: Add this property when you create your cluster to cache your Docker image and avoid YARN timeouts later when you submit a job that uses the image.

When dataproc:yarn.docker.enable is set to true, Dataproc updates Hadoop and Spark configurations to enable the Docker on YARN feature in the cluster. For example, spark.submit.deployMode is set to cluster, and spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS and spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_MOUNTS are set to mount directories from the host into the container.

Submit a Spark job to the cluster

After creating the Dataproc cluster, submit a Spark job that uses your Docker image. The example in this section submits a PySpark job to the cluster.

Set job properties:

# Set the Docker image URI.
IMAGE=(e.g., gcr.io/my-project/my-image:1.0.1)

# Required: Use `#` as the delimiter for properties to avoid conflicts.
JOB_PROPERTIES='^#^'

# Required: Set Spark properties with the Docker image.
JOB_PROPERTIES="${JOB_PROPERTIES}#spark.yarn.appMasterEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=${IMAGE}"
JOB_PROPERTIES="${JOB_PROPERTIES}#spark.executorEnv.YARN_CONTAINER_RUNTIME_DOCKER_IMAGE=${IMAGE}"

# Optional: Add custom jars to the Spark classpath. Don't set these properties if
# there are no customizations.
JOB_PROPERTIES="${JOB_PROPERTIES}#spark.driver.extraClassPath=/opt/spark/jars/*"
JOB_PROPERTIES="${JOB_PROPERTIES}#spark.executor.extraClassPath=/opt/spark/jars/*"

# Optional: Set a custom PySpark Python path only if there are customizations.
JOB_PROPERTIES="${JOB_PROPERTIES}#spark.pyspark.python=/opt/miniconda3/bin/python"
JOB_PROPERTIES="${JOB_PROPERTIES}#spark.pyspark.driver.python=/opt/miniconda3/bin/python"

# Optional: Set a custom Python module path only if there are customizations.
# Since the `PYTHONPATH` environment variable defined in the Dockerfile is
# overridden by Spark, it must be set as a job property.
JOB_PROPERTIES="${JOB_PROPERTIES}#spark.yarn.appMasterEnv.PYTHONPATH=/opt/python/packages"
JOB_PROPERTIES="${JOB_PROPERTIES}#spark.executorEnv.PYTHONPATH=/opt/python/packages"

Note: See Launching Applications Using Docker Containers (https://hadoop.apache.org/docs/r3.2.3/hadoop-yarn/hadoop-yarn-site/DockerContainers.html) for information on related properties.

Submit the job to the cluster:

gcloud dataproc jobs submit pyspark PYFILE \
    --cluster=CLUSTER_NAME \
    --region=REGION \
    --properties=${JOB_PROPERTIES}

Replace the following:

- PYFILE: The file path to your PySpark job file. It can be a local file path or the URI of the file in Cloud Storage (gs://BUCKET_NAME/your PySpark file).
- CLUSTER_NAME: The cluster name.
- REGION: The cluster region.
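For illustration only, a minimal PYFILE might look like the following sketch; the file name and contents are hypothetical and not part of the original example. It assumes the job is submitted with the properties above, so test_util is resolved through the PYTHONPATH job properties and pandas comes from the Miniconda environment in the image.

# Illustrative example only: a small PySpark job that exercises the image's customizations.
cat >docker_job.py <<EOF
from pyspark.sql import SparkSession

# Resolved through the PYTHONPATH job properties set above.
import test_util

# Provided by the Miniconda environment installed in the image.
import pandas as pd

spark = SparkSession.builder.appName("docker-on-yarn-example").getOrCreate()

test_util.hello("yarn_docker_user")

# Convert a small pandas DataFrame to a Spark DataFrame and count its rows.
df = spark.createDataFrame(pd.DataFrame({"x": [1, 2, 3]}))
print(df.count())

spark.stop()
EOF

# Submit it as the PYFILE argument shown above, for example:
gcloud dataproc jobs submit pyspark docker_job.py \
    --cluster=CLUSTER_NAME \
    --region=REGION \
    --properties=${JOB_PROPERTIES}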