Docker for Data Engineering
Containerization for Data Workloads
Overview
Docker enables packaging data applications and their dependencies into containers. For data engineering, Docker ensures reproducible environments across development, testing, and production.
Docker Basics
Dockerfile for Python Data Application
# Dockerfile
# Base image
FROM python:3.11-slim

# Metadata
LABEL maintainer="data-engineering@my-company.com"
LABEL description="Python data engineering environment"

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    gcc \
    g++ \
    libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (for caching)
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create non-root user
RUN useradd -m -u 1000 datauser && \
    chown -R datauser:datauser /app
USER datauser

# Set environment variables
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Default command
CMD ["python", "main.py"]
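With the Dockerfile in place, the image is built and run with the standard Docker CLI. A minimal sketch, assuming the build context is the project root; the image tag, environment variable, and volume mount are illustrative, not part of the project above:

# Build the image and tag it
docker build -t data-app:0.1.0 .

# Run the pipeline once; pass configuration at run time rather than baking it in
docker run --rm \
  -e LOG_LEVEL=INFO \
  -v "$(pwd)/data:/app/data" \
  data-app:0.1.0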
Requirements File

# Data processing
pandas==2.1.0
numpy==1.24.0
pyarrow==14.0.0
polars==0.19.0

# Storage
boto3==1.28.0
google-cloud-storage==2.10.0
azure-storage-blob==12.19.0

# Databases
psycopg2-binary==2.9.7
pymongo==4.5.0
snowflake-connector-python==3.4.0
redshift_connector==2.0.910

# Orchestration
apache-airflow==2.7.0
dagster==1.5.0
prefect==2.14.0

# Streaming
kafka-python==2.0.2
confluent-kafka==2.3.0

# ML
scikit-learn==1.3.0
xgboost==2.0.0
mlflow==2.8.0

# Utilities
python-dotenv==1.0.0
pydantic==2.4.0
requests==2.31.0
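Pinning only top-level packages can still drift through transitive dependencies. One hedged way to lock the full tree is pip-tools; the requirements.in filename here is an assumption, not part of the project above:

# Compile an exact, fully pinned requirements.txt from top-level dependencies
pip install pip-tools
pip-compile requirements.in --output-file requirements.txt

# Install the locked set (this is what the Dockerfile runs at build time)
pip install --no-cache-dir -r requirements.txt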
Docker Compose
Local Development Stack
version: '3.8'
services:
  # Jupyter notebook
  jupyter:
    build:
      context: .
      dockerfile: Dockerfile.jupyter
    container_name: jupyter
    ports:
      - "8888:8888"
    volumes:
      - ./notebooks:/home/jovyan/work
      - ./data:/home/jovyan/data
    environment:
      - JUPYTER_ENABLE_LAB=yes
    networks:
      - data-platform

  # PostgreSQL database
  postgres:
    image: postgres:15-alpine
    container_name: postgres
    environment:
      POSTGRES_USER: datauser
      POSTGRES_PASSWORD: datapass
      POSTGRES_DB: datalake
    ports:
      - "5432:5432"
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - data-platform

  # MinIO (S3-compatible storage)
  minio:
    image: minio/minio:latest
    container_name: minio
    ports:
      - "9000:9000"
      - "9001:9001"
    environment:
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    command: server /data --console-address ":9001"
    volumes:
      - minio-data:/data
    networks:
      - data-platform

  # Kafka
  zookeeper:
    image: confluentinc/cp-zookeeper:7.5.0
    container_name: zookeeper
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000
    networks:
      - data-platform

  kafka:
    image: confluentinc/cp-kafka:7.5.0
    container_name: kafka
    depends_on:
      - zookeeper
    ports:
      - "9092:9092"
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
    networks:
      - data-platform

  # Spark master
  spark-master:
    image: bitnami/spark:3.5.0
    container_name: spark-master
    ports:
      - "8080:8080"
      - "7077:7077"
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
    networks:
      - data-platform

  # Spark worker
  spark-worker:
    image: bitnami/spark:3.5.0
    container_name: spark-worker
    depends_on:
      - spark-master
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=2G
      - SPARK_WORKER_CORES=2
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
    networks:
      - data-platform

networks:
  data-platform:
    driver: bridge

volumes:
  postgres-data:
  minio-data:
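Saved as docker-compose.yml, the stack is managed with the Compose CLI; a brief sketch of the usual loop:

# Start every service in the background
docker compose up -d

# Check status and follow the logs of a single service
docker compose ps
docker compose logs -f kafka

# Stop and remove containers; add -v to also remove the named volumes
docker compose down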
Dockerfile Patterns
PySpark Application
FROM python:3.11-slim
# Install Java (required for Spark) and wget (to download Spark; neither ships with the slim image)
RUN apt-get update && apt-get install -y \
    openjdk-17-jre-headless \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Set JAVA_HOME
ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64

# Install Spark
ENV SPARK_VERSION=3.5.0
ENV SPARK_HOME=/opt/spark
ENV PATH=$SPARK_HOME/bin:$PATH
ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH

RUN wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    tar -xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz -C /opt && \
    rm spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    ln -s /opt/spark-${SPARK_VERSION}-bin-hadoop3 $SPARK_HOME

# Install Python dependencies
COPY requirements-pyspark.txt .
RUN pip install --no-cache-dir -r requirements-pyspark.txt

WORKDIR /app
COPY . .

# Default Spark submit
CMD ["spark-submit", "--master", "local[*]", "pyspark_job.py"]
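A sketch of how the image might be built and exercised; the Dockerfile.pyspark filename and image tag are assumptions:

# Build the PySpark image
docker build -f Dockerfile.pyspark -t pyspark-job:0.1.0 .

# Run the job with the default CMD (local[*] master)
docker run --rm pyspark-job:0.1.0

# Override the command, e.g. to cap parallelism and driver memory
docker run --rm pyspark-job:0.1.0 \
  spark-submit --master "local[2]" --driver-memory 2g pyspark_job.py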
Airflow

FROM apache/airflow:2.7.0-python3.10

# Install Airflow providers
RUN pip install --no-cache-dir \
    apache-airflow-providers-amazon==8.2.0 \
    apache-airflow-providers-google==10.8.0 \
    apache-airflow-providers-postgres==5.5.0 \
    apache-airflow-providers-slack==8.0.0

# Copy DAGs
COPY dags /opt/airflow/dags

# Copy requirements
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Set environment (sql_alchemy_conn lives in the [database] section since Airflow 2.3)
ENV AIRFLOW__CORE__DAGS_FOLDER=/opt/airflow/dags
ENV AIRFLOW__CORE__LOAD_EXAMPLES=False
ENV AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql://airflow:airflow@postgres:5432/airflow
EXPOSE 8080
CMD ["airflow", "webserver"]Data Science Environment
Data Science Environment

FROM jupyter/scipy-notebook:latest

# Install data science libraries
RUN pip install --no-cache-dir \
    pandas==2.1.0 \
    numpy==1.24.0 \
    scikit-learn==1.3.0 \
    xgboost==2.0.0 \
    matplotlib==3.7.0 \
    seaborn==0.12.0 \
    plotly==5.17.0 \
    mlflow==2.8.0

# Install additional libraries
RUN pip install --no-cache-dir \
    pyarrow==14.0.0 \
    polars==0.19.0 \
    sqlalchemy==2.0.0 \
    psycopg2-binary==2.9.7 \
    redis==5.0.0

# Set working directory
WORKDIR /home/jovyan/work

# Expose notebook port
EXPOSE 8888

# Start notebook
CMD ["start-notebook.sh"]
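A sketch for running the notebook image locally; the Dockerfile.jupyter filename and tag are assumptions, and the volume mount mirrors the Compose service above:

# Build the notebook image
docker build -f Dockerfile.jupyter -t ds-notebook:0.1.0 .

# Run it with the notebook port forwarded and local notebooks mounted
docker run --rm -p 8888:8888 \
  -v "$(pwd)/notebooks:/home/jovyan/work" \
  ds-notebook:0.1.0

On startup the container logs a tokenized URL; open it in a browser to reach JupyterLab.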
Docker Best Practices
DO
# 1. Use specific versions
FROM python:3.11-slim   # Good
FROM python:latest      # Bad

# 2. Use multi-stage builds
FROM python:3.11 as builder
RUN pip install -r requirements.txt

FROM python:3.11-slim
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages

# 3. Use .dockerignore
# .dockerignore
.git
venv
__pycache__
*.pyc
.env

# 4. Minimize layers (verify with the docker history sketch after this list)
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*   # Good
RUN apt-get update            # Bad (creates an extra layer with a stale apt cache)
RUN apt-get install -y curl

# 5. Use a non-root user
USER 1000:1000
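Rules 2 and 4 are easy to verify on a built image, since every layer and its size is listed directly; the tag below is illustrative:

# Inspect each layer and its size; extra layers and leftover caches show up here
docker history data-app:0.1.0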
DON'T

# 1. Don't use latest tags
#    Pin specific versions instead

# 2. Don't ignore security
#    Never run as root

# 3. Don't skip the .dockerignore
#    Exclude unnecessary files from the build context

# 4. Don't install unnecessary packages
#    Prefer minimal base images (slim, alpine)

# 5. Don't hardcode credentials
#    Pass environment variables or secrets at run time (see the sketch below)
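For point 5, credentials stay out of the image and are supplied when the container starts; the variable names, the .env file, and the image tag are illustrative:

# Forward individual secrets from the host environment at run time
docker run --rm -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY data-app:0.1.0

# Or load a git-ignored .env file (list it in .dockerignore as well)
docker run --rm --env-file .env data-app:0.1.0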
Docker Optimization
Multi-Stage Builds
# Multi-stage build for smaller images
# Builder stage
FROM python:3.11 as builder
WORKDIR /app
COPY requirements.txt .
RUN pip install --user --no-cache-dir -r requirements.txt

# Final stage
FROM python:3.11-slim
WORKDIR /app
COPY --from=builder /root/.local /root/.local
COPY . .
ENV PATH=/root/.local/bin:$PATH
CMD ["python", "main.py"]
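The payoff shows up when comparing a single-stage and a multi-stage build of the same application; the Dockerfile names and tags here are assumptions:

# Build both variants and compare the resulting image sizes
docker build -f Dockerfile.single -t data-app:single .
docker build -f Dockerfile.multi  -t data-app:multi .
docker images data-app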
BuildKit Features

# syntax=docker/dockerfile:1
# Dockerfile with BuildKit (the parser directive above must be the very first line)
FROM python:3.11-slim
# Mount a cache for pip installs (the requirements file still has to be copied in)
COPY requirements.txt .
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# Use build secrets (the token is available only during this RUN step, never in a layer)
RUN --mount=type=secret,id=github_token \
    git clone https://$(cat /run/secrets/github_token)@github.com/my-repo.git

# Run as non-root
RUN adduser --uid 1000 --disabled-password --gecos '' datauser
USER datauser
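Building this Dockerfile needs BuildKit (the default builder in current Docker releases). The secret is read from a local file at build time and mounted only for the RUN step, so it never lands in an image layer; the token filename and tag are assumptions:

# Enable BuildKit explicitly on older Docker versions
export DOCKER_BUILDKIT=1

# Pass the secret by id; it is mounted at /run/secrets/github_token during the build
docker build --secret id=github_token,src=./github_token.txt -t data-app:buildkit .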
Key Takeaways
- Reproducibility: Same environment everywhere
- Isolation: No dependency conflicts
- Portability: Run anywhere
- Scalability: Easy to scale with Kubernetes
- Best Practices: Specific versions, multi-stage builds, non-root user
- Compose: Local development stack
- Optimization: Multi-stage builds, BuildKit features
- Use Cases: Development, testing, production