
Ansible for Data Platform Configuration

Configuration Management for Data Infrastructure


Overview

Ansible is an agentless configuration management tool: it connects to hosts over SSH and automates the provisioning and configuration of data platform infrastructure. Where Terraform creates and manages cloud resources, Ansible configures the servers themselves: it installs software, renders configuration files, and manages services. Its modules are idempotent, so playbooks can be re-run safely.
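
As a quick illustration of the model, the ad-hoc commands below check SSH connectivity to a host group and perform an idempotent package install. The inventory path matches the production inventory shown later on this page; the package name is only an example.

# Verify SSH connectivity to every host in the data_servers group
ansible data_servers -i inventory/production.ini -m ping

# Ad-hoc, idempotent install (reports "changed" only if something was actually installed)
ansible data_servers -i inventory/production.ini -m apt -a "name=openjdk-11-jdk state=present" --become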


Ansible Fundamentals

Playbook Structure

site.yml
---
- name: Configure Data Platform
  hosts: data_servers
  become: true
  vars_files:
    - vars/{{ env }}.yml
  roles:
    - role: common
      tags: ['common']
    - role: java
      tags: ['java']
    - role: hadoop
      tags: ['hadoop']
    - role: spark
      tags: ['spark']
    - role: airflow
      tags: ['airflow']
    - role: jupyter
      tags: ['jupyter']
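
The env variable used in vars_files is not defined in the playbook itself; it comes from the inventory (shown in the next section) or from the command line. A typical invocation, with tags used to limit the run to a subset of roles, might look like this (the -e override is optional when the inventory already sets env):

# Full run against production, loading vars/production.yml
ansible-playbook -i inventory/production.ini site.yml -e env=production

# Only the Spark and Airflow roles, selected via the role tags
ansible-playbook -i inventory/production.ini site.yml -e env=production --tags "spark,airflow"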

Inventory

inventory/production.ini
[data_servers]
prod-spark-01 ansible_host=10.0.1.10
prod-spark-02 ansible_host=10.0.1.11
prod-spark-03 ansible_host=10.0.1.12

[airflow_servers]
prod-airflow-01 ansible_host=10.0.2.10

[jupyter_servers]
prod-jupyter-01 ansible_host=10.0.3.10

[all:vars]
ansible_user=ubuntu
ansible_ssh_private_key_file=~/.ssh/prod_key.pem
env=production
spark_version=3.5.0
hadoop_version=3.3.4
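
The same inventory can also be written in YAML, which some teams prefer for consistency with the rest of the codebase. A sketch of an equivalent inventory/production.yml (same hosts and variables as above, the file name is hypothetical):

all:
  children:
    data_servers:
      hosts:
        prod-spark-01: { ansible_host: 10.0.1.10 }
        prod-spark-02: { ansible_host: 10.0.1.11 }
        prod-spark-03: { ansible_host: 10.0.1.12 }
    airflow_servers:
      hosts:
        prod-airflow-01: { ansible_host: 10.0.2.10 }
    jupyter_servers:
      hosts:
        prod-jupyter-01: { ansible_host: 10.0.3.10 }
  vars:
    ansible_user: ubuntu
    ansible_ssh_private_key_file: ~/.ssh/prod_key.pem
    env: production
    spark_version: "3.5.0"
    hadoop_version: "3.3.4"

Either format can be checked with ansible-inventory -i <inventory> --graph, which prints the group membership tree.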

Ansible Roles

Role Structure

roles/
├── common/
│   ├── tasks/
│   │   └── main.yml
│   ├── handlers/
│   │   └── main.yml
│   ├── templates/
│   │   └── config.j2
│   ├── files/
│   │   └── script.sh
│   ├── defaults/
│   │   └── main.yml
│   └── vars/
│       └── main.yml
├── spark/
│   ├── tasks/
│   │   └── main.yml
│   ├── templates/
│   │   └── spark-env.sh.j2
│   └── defaults/
│       └── main.yml
└── airflow/
    ├── tasks/
    │   └── main.yml
    ├── templates/
    │   ├── airflow.cfg.j2
    │   └── DAGs/
    └── defaults/
        └── main.yml
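
New roles following this layout do not have to be created by hand: ansible-galaxy init generates the standard skeleton (tasks, handlers, templates, files, defaults, vars, plus meta and tests). For example, for the jupyter role referenced in site.yml:

cd roles/
ansible-galaxy init jupyter    # creates roles/jupyter/ with the standard directory skeleton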

Spark Role

roles/spark/tasks/main.yml
---
- name: Create Spark user
  user:
    name: spark
    system: yes
    shell: /bin/bash
    home: /opt/spark
    create_home: no   # /opt/spark is created below as a symlink to the versioned install

- name: Download Spark
  get_url:
    url: "https://downloads.apache.org/spark/spark-{{ spark_version }}/spark-{{ spark_version }}-bin-hadoop3.tgz"
    dest: /tmp/spark.tgz
    mode: '0644'

- name: Extract Spark
  unarchive:
    src: /tmp/spark.tgz
    dest: /opt
    remote_src: yes
    owner: spark
    group: spark

- name: Create Spark symlink
  file:
    src: /opt/spark-{{ spark_version }}-bin-hadoop3
    dest: /opt/spark
    state: link
    owner: spark
    group: spark

- name: Configure Spark environment
  template:
    src: spark-env.sh.j2
    dest: /opt/spark/conf/spark-env.sh
    owner: spark
    group: spark
    mode: '0644'
  notify: restart spark

- name: Configure Spark defaults
  template:
    src: spark-defaults.conf.j2
    dest: /opt/spark/conf/spark-defaults.conf
    owner: spark
    group: spark
    mode: '0644'
  notify: restart spark

- name: Create Spark service file
  template:
    src: spark.service.j2
    dest: /etc/systemd/system/spark.service
    mode: '0644'
  notify:
    - reload systemd
    - restart spark

- name: Start and enable Spark
  systemd:
    name: spark
    state: started
    enabled: yes

roles/spark/templates/spark-env.sh.j2
#!/usr/bin/env bash
# Spark environment variables
export SPARK_HOME=/opt/spark
export SPARK_CONF_DIR=$SPARK_HOME/conf
export SPARK_LOG_DIR=/var/log/spark
# Java options
export SPARK_EXECUTOR_MEMORY={{ spark_executor_memory }}
export SPARK_DRIVER_MEMORY={{ spark_driver_memory }}
# Hadoop configuration
export HADOOP_CONF_DIR={{ hadoop_conf_dir }}
# Python
export PYSPARK_PYTHON={{ python_executable }}
# Spark master
export SPARK_MASTER_HOST={{ ansible_host }}
export SPARK_MASTER_PORT={{ spark_master_port }}
roles/spark/templates/spark-defaults.conf.j2
# Spark default configuration
# Master
spark.master {{ spark_master_url }}
# Application
spark.app.name {{ spark_app_name | default('Spark Application') }}
spark.submit.deployMode {{ spark_deploy_mode | default('client') }}
# Execution
spark.executor.memory {{ spark_executor_memory }}
spark.executor.cores {{ spark_executor_cores }}
spark.executor.instances {{ spark_executor_instances }}
spark.driver.memory {{ spark_driver_memory }}
spark.driver.cores {{ spark_driver_cores }}
# Dynamic allocation
spark.dynamicAllocation.enabled {{ spark_dynamic_allocation_enabled }}
spark.dynamicAllocation.minExecutors {{ spark_min_executors }}
spark.dynamicAllocation.maxExecutors {{ spark_max_executors }}
spark.dynamicAllocation.initialExecutors {{ spark_initial_executors }}
# Shuffle
spark.shuffle.service.enabled {{ spark_shuffle_service_enabled }}
# Serialization
spark.serializer org.apache.spark.serializer.KryoSerializer
# Compression
spark.rdd.compress {{ spark_rdd_compress }}
spark.shuffle.compress {{ spark_shuffle_compress }}
# Event log
spark.eventLog.enabled {{ spark_event_log_enabled }}
spark.eventLog.dir {{ spark_event_log_dir }}
# Metrics
spark.metrics.conf {{ spark_metrics_conf }}
roles/spark/defaults/main.yml
---
spark_version: "3.5.0"
spark_memory: "4g"
spark_cores: "2"
spark_executor_memory: "4g"
spark_executor_cores: "2"
spark_executor_instances: "4"
spark_driver_memory: "2g"
spark_driver_cores: "1"
spark_master_url: "spark://{{ ansible_host }}:7077"
spark_deploy_mode: "cluster"
spark_master_port: "7077"
spark_dynamic_allocation_enabled: "true"
spark_min_executors: "2"
spark_max_executors: "10"
spark_initial_executors: "4"
spark_shuffle_service_enabled: "true"
spark_rdd_compress: "true"
spark_shuffle_compress: "true"
spark_event_log_enabled: "true"
spark_event_log_dir: "/var/log/spark/events"
spark_metrics_conf: "/opt/spark/conf/metrics.properties"
hadoop_conf_dir: "/etc/hadoop/conf"
python_executable: "/usr/bin/python3"
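
The tasks above render a spark.service.j2 template that is not listed on this page. A minimal sketch of what such a unit could look like for a standalone Spark master is shown below; the start/stop scripts are the standard ones shipped with Spark, but the exact service layout (master vs. workers, one unit per daemon) depends on your cluster topology and is an assumption here.

# roles/spark/templates/spark.service.j2 (illustrative sketch)
[Unit]
Description=Apache Spark standalone master
After=network-online.target

[Service]
Type=forking
User=spark
Group=spark
Environment=SPARK_HOME=/opt/spark
ExecStart=/opt/spark/sbin/start-master.sh
ExecStop=/opt/spark/sbin/stop-master.sh
Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target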

Airflow Role

roles/airflow/tasks/main.yml
---
- name: Create Airflow user
  user:
    name: airflow
    system: yes
    shell: /bin/bash
    home: /opt/airflow

- name: Install Airflow dependencies
  apt:
    name:
      - python3-pip
      - python3-venv
      - build-essential
      - libssl-dev
      - libffi-dev
      - python3-dev
    state: present
    update_cache: yes

- name: Create Airflow virtual environment
  command: python3 -m venv /opt/airflow/venv
  args:
    creates: /opt/airflow/venv/bin/activate

- name: Upgrade pip in venv
  pip:
    name: pip
    state: latest
    virtualenv: /opt/airflow/venv

- name: Install Apache Airflow
  pip:
    name:
      - apache-airflow=={{ airflow_version }}
      - apache-airflow-providers-cncf-kubernetes=={{ airflow_kubernetes_provider_version }}
      - apache-airflow-providers-amazon=={{ airflow_amazon_provider_version }}
      - apache-airflow-providers-postgres=={{ airflow_postgres_provider_version }}
    virtualenv: /opt/airflow/venv

- name: Create Airflow directories
  file:
    path: "{{ item }}"
    state: directory
    owner: airflow
    group: airflow
    mode: '0755'
  loop:
    - /opt/airflow/dags
    - /opt/airflow/logs
    - /opt/airflow/plugins
    - /var/log/airflow

- name: Configure Airflow
  template:
    src: airflow.cfg.j2
    dest: /opt/airflow/airflow.cfg
    owner: airflow
    group: airflow
    mode: '0644'
  notify: restart airflow

- name: Initialize Airflow database
  become_user: airflow
  environment:
    AIRFLOW_HOME: /opt/airflow   # ensure airflow reads /opt/airflow/airflow.cfg
  shell: |
    . /opt/airflow/venv/bin/activate
    airflow db init
  args:
    creates: /opt/airflow/airflow.db

- name: Create Airflow admin user
  become_user: airflow
  environment:
    AIRFLOW_HOME: /opt/airflow
  shell: |
    . /opt/airflow/venv/bin/activate
    airflow users create \
      --username {{ airflow_admin_user }} \
      --firstname Admin \
      --lastname User \
      --role Admin \
      --email {{ airflow_admin_email }} \
      --password {{ airflow_admin_password }}
  run_once: yes
  when: inventory_hostname == groups['airflow_servers'][0]

- name: Copy DAG files
  synchronize:
    src: dags/
    dest: /opt/airflow/dags/
    delete: yes
  notify: restart airflow

- name: Create Airflow service file
  template:
    src: airflow.service.j2
    dest: /etc/systemd/system/airflow.service
    mode: '0644'
  notify:
    - reload systemd
    - restart airflow

- name: Start and enable Airflow
  systemd:
    name: airflow
    state: started
    enabled: yes

roles/airflow/templates/airflow.cfg.j2
[core]
# Airflow home directory
airflow_home = /opt/airflow
# The default airflow timezone
default_timezone = {{ airflow_timezone | default('UTC') }}
# The executor class
executor = {{ airflow_executor | default('LocalExecutor') }}
# SQLAlchemy connection string
sql_alchemy_conn = {{ airflow_db_connection }}
# The amount of parallelism as a setting to the executor
parallelism = {{ airflow_parallelism | default(32) }}
# Dag concurrency
dag_concurrency = {{ airflow_dag_concurrency | default(16) }}
# Max active runs per DAG
max_active_runs_per_dag = {{ airflow_max_active_runs_per_dag | default(4) }}
# Load examples
load_examples = False
[logging]
# Base log folder
base_log_folder = /var/log/airflow
# Log level
log_level = {{ airflow_log_level | default('INFO') }}
[webserver]
# Webserver listen port
web_server_port = {{ airflow_webserver_port | default(8080) }}
# Worker class
worker_class = sync
# Expose config
expose_config = False
[database]
# SQLAlchemy connection pool size
sql_alchemy_pool_size = {{ airflow_db_pool_size | default(5) }}
# The max overflow value
sql_alchemy_max_overflow = {{ airflow_db_max_overflow | default(10) }}
[cli]
# Name of the dag to run
dag_id_pos = 2
[smtp]
# SMTP email backend
email_backend = airflow.utils.email.send_email_smtp
# SMTP host
smtp_host = {{ airflow_smtp_host }}
smtp_port = {{ airflow_smtp_port | default(587) }}
smtp_starttls = true
smtp_ssl = false
# SMTP user and password
smtp_user = {{ airflow_smtp_user }}
smtp_password = {{ airflow_smtp_password }}
# Email from
smtp_mail_from = {{ airflow_smtp_from }}
roles/airflow/defaults/main.yml
---
airflow_version: "2.7.0"
airflow_kubernetes_provider_version: "7.4.0"
airflow_amazon_provider_version: "8.2.0"
airflow_postgres_provider_version: "5.5.0"
airflow_executor: "LocalExecutor"
airflow_db_connection: "postgresql://airflow:password@postgres:5432/airflow"
airflow_parallelism: 32
airflow_dag_concurrency: 16
airflow_max_active_runs_per_dag: 4
airflow_timezone: "UTC"
airflow_log_level: "INFO"
airflow_webserver_port: 8080
airflow_db_pool_size: 5
airflow_db_max_overflow: 10
airflow_smtp_host: "smtp.gmail.com"
airflow_smtp_port: 587
airflow_smtp_user: "airflow@my-company.com"
airflow_smtp_password: "secure_password"
airflow_smtp_from: "Airflow <airflow@my-company.com>"
airflow_admin_user: "admin"
airflow_admin_email: "admin@my-company.com"
airflow_admin_password: "admin"
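
The plain-text passwords in these defaults are placeholders; the Vault section below shows how to override them from encrypted variables. The tasks above also render an airflow.service.j2 that is not shown here. In practice the scheduler and webserver usually get separate units; a minimal sketch for the scheduler, assuming the virtualenv path used in the tasks above:

# roles/airflow/templates/airflow.service.j2 (illustrative sketch, scheduler only)
[Unit]
Description=Airflow scheduler
After=network-online.target

[Service]
User=airflow
Group=airflow
Environment=AIRFLOW_HOME=/opt/airflow
ExecStart=/opt/airflow/venv/bin/airflow scheduler
Restart=on-failure
RestartSec=10

[Install]
WantedBy=multi-user.target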

Ansible Handlers

Each role defines the handlers it notifies:

roles/spark/handlers/main.yml
---
- name: reload systemd
  systemd:
    daemon_reload: yes

- name: restart spark
  systemd:
    name: spark
    state: restarted

roles/airflow/handlers/main.yml
---
- name: reload systemd
  systemd:
    daemon_reload: yes

- name: restart airflow
  systemd:
    name: airflow
    state: restarted
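
Handlers run once, at the end of the play, no matter how many tasks notified them. When a restart has to happen earlier, for example before a later task that talks to the restarted service, meta: flush_handlers runs all pending handlers at that point. A sketch (the health-check URL and port are illustrative only):

- name: Configure Spark defaults
  template:
    src: spark-defaults.conf.j2
    dest: /opt/spark/conf/spark-defaults.conf
  notify: restart spark

# Run any notified handlers now instead of waiting for the end of the play
- name: Flush handlers
  meta: flush_handlers

- name: Wait for the Spark master UI to come back
  uri:
    url: "http://{{ ansible_host }}:8080"
    status_code: 200
  register: spark_ui
  until: spark_ui.status == 200
  retries: 10
  delay: 5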

Ansible Variables

Environment-Specific Variables

vars/production.yml
---
spark_executor_memory: "8g"
spark_executor_cores: "4"
spark_executor_instances: "20"
airflow_executor: "KubernetesExecutor"
airflow_db_connection: "postgresql://airflow:{{ vault_airflow_db_password }}@prod-postgres:5432/airflow"
airflow_smtp_user: "prod-airflow@my-company.com"
airflow_smtp_password: "{{ vault_smtp_password }}"
vars/development.yml
---
spark_executor_memory: "2g"
spark_executor_cores: "1"
spark_executor_instances: "2"
airflow_executor: "LocalExecutor"
airflow_db_connection: "sqlite:////opt/airflow/airflow.db"
airflow_smtp_user: "dev-airflow@my-company.com"
airflow_smtp_password: "dev_password"

Ansible Vault

Encrypting Secrets

Terminal window
# Create encrypted file
ansible-vault create vars/secrets.yml
# Edit encrypted file
ansible-vault edit vars/secrets.yml
# View encrypted file
ansible-vault view vars/secrets.yml
# Encrypt existing file
ansible-vault encrypt vars/secrets.yml
# Decrypt file
ansible-vault decrypt vars/secrets.yml
# Change password
ansible-vault rekey vars/secrets.yml
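
Encrypted files are decrypted transparently at run time as long as a vault password is supplied. Both of the following are standard options; the password file path is just an example and should itself be kept out of version control:

# Prompt interactively for the vault password
ansible-playbook -i inventory/production.ini site.yml --ask-vault-pass

# Or read it from a file (e.g. injected by CI)
ansible-playbook -i inventory/production.ini site.yml --vault-password-file ~/.vault_pass.txt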

Using Encrypted Variables

# vars/secrets.yml (encrypted with ansible-vault)
---
vault_airflow_db_password: "secure_password_123"
vault_smtp_password: "secure_smtp_password"
vault_aws_access_key: "AKIAIOSFODNN7EXAMPLE"
vault_aws_secret_key: "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
# Playbook using encrypted variables
---
- name: Configure Airflow
  hosts: airflow_servers
  become: true
  vars_files:
    - vars/secrets.yml
    - vars/production.yml
  roles:
    - role: airflow
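
Instead of encrypting a whole file, individual values can be encrypted inline with ansible-vault encrypt_string and pasted into an otherwise plain-text vars file:

ansible-vault encrypt_string 'secure_smtp_password' --name 'vault_smtp_password'

The command prints a !vault-tagged YAML block that can be committed safely alongside unencrypted variables.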

Ansible Best Practices

DO

# 1. Use roles for modularity
roles:
  - spark
  - airflow
# 2. Use tags for selective execution
ansible-playbook site.yml --tags "spark"
# 3. Use handlers for service restarts
handlers:
  - name: restart spark
    systemd:
      name: spark
      state: restarted
# 4. Use templates for configuration
template:
  src: config.j2
  dest: /etc/service/config
# 5. Use vault for secrets
ansible-vault encrypt vars/secrets.yml

DON’T

# 1. Don't hardcode values
# Use variables instead
# 2. Don't ignore idempotency
# Always check state before making changes
# 3. Don't use shell when a module exists
# Use the appropriate module
# 4. Don't ignore error handling
# Use failed_when and changed_when (see the sketch below)
# 5. Don't store secrets in plain text
# Use ansible-vault
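
For point 4, a minimal sketch of failed_when and changed_when on a command task (the check itself is illustrative):

- name: Check that spark-submit is available
  command: /opt/spark/bin/spark-submit --version
  register: spark_check
  changed_when: false          # a read-only check should never report "changed"
  failed_when: spark_check.rc != 0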

Ansible CI/CD

GitHub Actions

.github/workflows/ansible.yml
name: Ansible

on:
  pull_request:
    paths:
      - 'ansible/**'
  push:
    branches:
      - main
    paths:
      - 'ansible/**'

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install ansible-lint yamllint
      - name: Run yamllint
        run: yamllint ansible/
      - name: Run ansible-lint
        run: ansible-lint ansible/

  syntax-check:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install ansible
      - name: Ansible syntax check
        run: |
          cd ansible
          ansible-playbook site.yml --syntax-check

  dry-run:
    needs: [lint, syntax-check]
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: pip install ansible
      - name: Ansible check mode
        run: |
          cd ansible
          ansible-playbook site.yml --check
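
Note that the check-mode job only works if the runner can reach the target hosts (or a staging inventory), which this workflow assumes. The same checks are worth running locally before pushing; a sketch of a pre-push sequence, assuming a development inventory exists alongside the production one:

cd ansible
yamllint .
ansible-lint .
ansible-playbook site.yml --syntax-check
ansible-playbook -i inventory/development.ini site.yml --check --diff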

Key Takeaways

  1. Configuration Management: Configure servers, install software
  2. Roles: Reusable, modular configuration
  3. Templates: Jinja2 for configuration files
  4. Vault: Encrypt secrets with ansible-vault
  5. Idempotent: Safe to run multiple times
  6. CI/CD: Lint and test before deployment
  7. Best Practices: Use roles, tags, handlers
  8. Integration: Use with Terraform (Terraform creates, Ansible configures)

Back to Module 3