
Docker for Data Engineering

Containerization for Data Workloads


Overview

Docker packages data applications together with their dependencies into portable container images. For data engineering, this guarantees reproducible environments across development, testing, and production.
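
A small demonstration of the idea (using the same pinned base image as the Dockerfile below): any machine that runs this command gets an identical interpreter.

# Same pinned image, same Python, on every host
docker run --rm python:3.11-slim python --version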


Docker Basics

Dockerfile for Python Data Application

# Dockerfile
# Base image
FROM python:3.11-slim

# Metadata
LABEL maintainer="data-engineering@my-company.com"
LABEL description="Python data engineering environment"

# Set working directory
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
        build-essential \
        curl \
        gcc \
        g++ \
        libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first (so this layer stays cached until requirements change)
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create non-root user
RUN useradd -m -u 1000 datauser && \
    chown -R datauser:datauser /app
USER datauser

# Set environment variables
ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# Default command
CMD ["python", "main.py"]

Requirements File

Pinning exact versions keeps image builds reproducible. One caveat: Airflow, Dagster, and Prefect each pin many transitive dependencies of their own, so installing all three into a single image often triggers resolver conflicts; in practice each orchestrator usually gets its own image.

requirements.txt
# Data processing
pandas==2.1.0
numpy==1.24.0
pyarrow==14.0.0
polars==0.19.0
# Storage
boto3==1.28.0
google-cloud-storage==2.10.0
azure-storage-blob==12.19.0
# Databases
psycopg2-binary==2.9.7
pymongo==4.5.0
snowflake-connector-python==3.4.0
redshift_connector==2.0.910
# Orchestration
apache-airflow==2.7.0
dagster==1.5.0
prefect==2.14.0
# Streaming
kafka-python==2.0.2
confluent-kafka==2.3.0
# ML
scikit-learn==1.3.0
xgboost==2.0.0
mlflow==2.8.0
# Utilities
python-dotenv==1.0.0
pydantic==2.4.0
requests==2.31.0
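
Airflow in particular is sensitive to dependency drift; its documentation recommends installing it against the published constraints file for the matching Airflow and Python versions, along these lines:

# Install Airflow pinned by its official constraints file
pip install "apache-airflow==2.7.0" \
    --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.7.0/constraints-3.11.txt"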

Docker Compose

Local Development Stack

docker-compose.yml
version: '3.8'

services:
  # Jupyter notebook
  jupyter:
    build:
      context: .
      dockerfile: Dockerfile.jupyter
    container_name: jupyter
    ports:
      - "8888:8888"
    volumes:
      - ./notebooks:/home/jovyan/work
      - ./data:/home/jovyan/data
    environment:
      - JUPYTER_ENABLE_LAB=yes
    networks:
      - data-platform

  # PostgreSQL database
  postgres:
    image: postgres:15-alpine
    container_name: postgres
    environment:
      POSTGRES_USER: datauser
      POSTGRES_PASSWORD: datapass
      POSTGRES_DB: datalake
    ports:
      - "5432:5432"
    volumes:
      - postgres-data:/var/lib/postgresql/data
    networks:
      - data-platform

  # MinIO (S3-compatible storage)
  minio:
    image: minio/minio:latest
    container_name: minio
    ports:
      - "9000:9000"
      - "9001:9001"
    environment:
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    command: server /data --console-address ":9001"
    volumes:
      - minio-data:/data
    networks:
      - data-platform

  # Kafka (with ZooKeeper)
  zookeeper:
    image: confluentinc/cp-zookeeper:7.5.0
    container_name: zookeeper
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000
    networks:
      - data-platform

  kafka:
    image: confluentinc/cp-kafka:7.5.0
    container_name: kafka
    depends_on:
      - zookeeper
    ports:
      - "9092:9092"
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: zookeeper:2181
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
    networks:
      - data-platform

  # Spark master
  spark-master:
    image: bitnami/spark:3.5.0
    container_name: spark-master
    ports:
      - "8080:8080"
      - "7077:7077"
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
    networks:
      - data-platform

  # Spark worker
  spark-worker:
    image: bitnami/spark:3.5.0
    container_name: spark-worker
    depends_on:
      - spark-master
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=2G
      - SPARK_WORKER_CORES=2
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
    networks:
      - data-platform

networks:
  data-platform:
    driver: bridge

volumes:
  postgres-data:
  minio-data:
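
The whole stack starts with one command; services are then reachable on their published ports (the psql client on the host is optional and just for illustration):

# Start every service in the background
docker compose up -d

# Confirm all containers are running
docker compose ps

# Connect to Postgres from the host (password: datapass)
psql -h localhost -p 5432 -U datauser -d datalake

# Stop everything; named volumes (postgres-data, minio-data) survive
docker compose down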

Dockerfile Patterns

PySpark Application

Dockerfile.pyspark
FROM python:3.11-slim

# Install Java (required for Spark) plus wget for the download below
RUN apt-get update && apt-get install -y \
        openjdk-17-jre-headless \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Set JAVA_HOME
ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64

# Install Spark and put its binaries on PATH so spark-submit resolves
ENV SPARK_VERSION=3.5.0
ENV SPARK_HOME=/opt/spark
ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH
ENV PATH=$SPARK_HOME/bin:$PATH
RUN wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    tar -xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz -C /opt && \
    rm spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    ln -s /opt/spark-${SPARK_VERSION}-bin-hadoop3 $SPARK_HOME

# Install Python dependencies
WORKDIR /app
COPY requirements-pyspark.txt .
RUN pip install --no-cache-dir -r requirements-pyspark.txt

COPY . .

# Default Spark submit
CMD ["spark-submit", "--master", "local[*]", "pyspark_job.py"]

Airflow

Dockerfile.airflow
FROM apache/airflow:2.7.0-python3.10

# Install Airflow providers
RUN pip install --no-cache-dir \
        apache-airflow-providers-amazon==8.2.0 \
        apache-airflow-providers-google==10.8.0 \
        apache-airflow-providers-postgres==5.5.0 \
        apache-airflow-providers-slack==8.0.0

# Copy DAGs
COPY dags /opt/airflow/dags

# Copy and install project requirements
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Set environment
ENV AIRFLOW__CORE__DAGS_FOLDER=/opt/airflow/dags
ENV AIRFLOW__CORE__LOAD_EXAMPLES=False
# Airflow 2.3+ moved this setting from the [core] to the [database] section.
# In production, inject the connection string at runtime instead of baking it in.
ENV AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql://airflow:airflow@postgres:5432/airflow

EXPOSE 8080
CMD ["airflow", "webserver"]

Data Science Environment

Dockerfile.datascience
FROM jupyter/scipy-notebook:latest

# Install data science libraries
RUN pip install --no-cache-dir \
        pandas==2.1.0 \
        numpy==1.24.0 \
        scikit-learn==1.3.0 \
        xgboost==2.0.0 \
        matplotlib==3.7.0 \
        seaborn==0.12.0 \
        plotly==5.17.0 \
        mlflow==2.8.0

# Install additional libraries
RUN pip install --no-cache-dir \
        pyarrow==14.0.0 \
        polars==0.19.0 \
        sqlalchemy==2.0.0 \
        psycopg2-binary==2.9.7 \
        redis==5.0.0

# Set working directory
WORKDIR /home/jovyan/work

# Expose notebook port
EXPOSE 8888

# Start notebook
CMD ["start-notebook.sh"]

Docker Best Practices

DO

# 1. Use specific versions
FROM python:3.11-slim     # Good: pinned tag
FROM python:latest        # Bad: moving target

# 2. Use multi-stage builds
FROM python:3.11 AS builder
COPY requirements.txt .
RUN pip install -r requirements.txt

FROM python:3.11-slim
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages

# 3. Use .dockerignore
# .dockerignore
.git
venv
__pycache__
*.pyc
.env

# 4. Minimize layers
RUN apt-get update && apt-get install -y curl && rm -rf /var/lib/apt/lists/*   # Good
RUN apt-get update            # Bad: separate layers can cache a stale apt index
RUN apt-get install -y curl

# 5. Use non-root user
USER 1000:1000
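
A quick check that the non-root rule is actually in effect, using the data-app image built earlier as an example:

# Prints the UID the container runs as; anything other than 0 means non-root
docker run --rm data-app:0.1.0 id -u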

DON’T

# 1. Don't use latest tags
#    Pin specific versions instead
# 2. Don't ignore security
#    Never run as root in production
# 3. Don't skip .dockerignore
#    Exclude unnecessary files from the build context
# 4. Don't install unnecessary packages
#    Prefer minimal base images (slim, alpine)
# 5. Don't hardcode credentials
#    Use environment variables or secrets
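
For the last point, credentials belong outside the image and should be injected at run time. A minimal sketch (the variable name and file are examples):

# Pass a single secret as an environment variable
docker run --rm -e DATABASE_URL="postgresql://user:pass@host:5432/db" data-app:0.1.0

# Or keep all secrets in an untracked file and load them at once
docker run --rm --env-file .env data-app:0.1.0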

Docker Optimization

Multi-Stage Builds

# Multi-stage build for smaller images
# Builder stage
FROM python:3.11 as builder
WORKDIR /app
COPY requirements.txt .
RUN pip install --user --no-cache-dir -r requirements.txt
# Final stage
FROM python:3.11-slim
WORKDIR /app
COPY --from=builder /root/.local /root/.local
COPY . .
ENV PATH=/root/.local/bin:$PATH
CMD ["python", "main.py"]

BuildKit Features

# syntax=docker/dockerfile:1
# Dockerfile with BuildKit features
# (the syntax directive above must be the very first line of the file)
FROM python:3.11-slim

# git is needed for the clone below
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

# Mount a cache for pip so repeated builds reuse downloaded wheels
COPY requirements.txt .
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt

# Use build secrets (the token is mounted only for this step, never stored in a layer)
RUN --mount=type=secret,id=github_token \
    git clone https://$(cat /run/secrets/github_token)@github.com/my-repo.git

# Run as non-root
RUN adduser --uid 1000 --disabled-password --gecos '' datauser
USER datauser
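
Building this file requires BuildKit (the default builder in recent Docker releases) and the secret supplied on the command line:

# Mount the secret from a local file; it exists only during the RUN step
docker build --secret id=github_token,src=./github_token.txt -t buildkit-demo .

# On older Docker versions, enable BuildKit explicitly
DOCKER_BUILDKIT=1 docker build --secret id=github_token,src=./github_token.txt -t buildkit-demo .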

Key Takeaways

  1. Reproducibility: Same environment everywhere
  2. Isolation: No dependency conflicts
  3. Portability: Run anywhere
  4. Scalability: Easy to scale with Kubernetes
  5. Best Practices: Specific versions, multi-stage builds, non-root user
  6. Compose: Local development stack
  7. Optimization: Multi-stage builds, BuildKit features
  8. Use Cases: Development, testing, production
