
Terraform for Data Infrastructure

Infrastructure as Code for Data Platforms


Overview

Terraform is a declarative Infrastructure as Code (IaC) tool that enables reproducible, version-controlled infrastructure. For data platforms, Terraform manages warehouses, storage, networking, IAM, and orchestration infrastructure across AWS, GCP, and Azure.
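A typical configuration pins the Terraform and provider versions so runs are reproducible across machines and CI. A minimal sketch (the version constraints shown are illustrative, not requirements of any specific module in this guide):

versions.tf
terraform {
  required_version = ">= 1.5.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0" # illustrative constraint
    }
    google = {
      source  = "hashicorp/google"
      version = "~> 5.0" # illustrative constraint
    }
    snowflake = {
      source  = "Snowflake-Labs/snowflake"
      version = "~> 0.90" # illustrative constraint
    }
  }
}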


Terraform Fundamentals

Declarative Syntax

main.tf
# Provider configuration
provider "aws" {
  region = "us-east-1"
}

# S3 bucket for data lake
resource "aws_s3_bucket" "data_lake" {
  bucket = "my-company-data-lake"

  tags = {
    Environment = "production"
    ManagedBy   = "terraform"
  }
}

# DynamoDB table for Terraform state locking
resource "aws_dynamodb_table" "terraform_state_lock" {
  name         = "terraform-state-lock"
  billing_mode = "PAY_PER_REQUEST"
  hash_key     = "LockID"

  attribute {
    name = "LockID"
    type = "S"
  }
}

# Terraform backend configuration (state stored in S3, locked via DynamoDB)
terraform {
  backend "s3" {
    bucket         = "my-company-terraform-state"
    key            = "data-platform/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-state-lock"
  }
}

Terraform State Management

Remote State

backend.tf
terraform {
  backend "s3" {
    bucket         = "my-company-terraform-state"
    key            = "data-platform/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-state-lock"
  }
}

# State versioning comes from enabling versioning on the state bucket itself,
# not from an argument in the backend block.
# Benefits:
# - Team collaboration (shared state)
# - State locking (prevent concurrent updates)
# - State versioning (rollback capability)
# - Encryption (security)

State Outputs

outputs.tf
# Output values for other Terraform configurations
output "data_lake_bucket_name" {
  value       = aws_s3_bucket.data_lake.id
  description = "Name of the data lake S3 bucket"
}

output "redshift_cluster_endpoint" {
  value       = aws_redshift_cluster.analytics.endpoint
  description = "Redshift cluster endpoint"
  sensitive   = true # Don't show in plain text
}

output "vpc_id" {
  value       = aws_vpc.main.id
  description = "VPC ID for networking"
}

# Consume outputs from another configuration's state:
# data "terraform_remote_state" "networking" {
#   backend = "s3"
#   config = {
#     bucket = "my-company-terraform-state"
#     key    = "networking/terraform.tfstate"
#     region = "us-east-1"
#   }
# }
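Values exported by the other configuration are then available under the data source's outputs attribute. A minimal sketch, assuming the networking configuration declares vpc_id and private_subnet_ids outputs:

# Reference outputs exported by the networking configuration
# (assumes it declares output "vpc_id" and output "private_subnet_ids")
resource "aws_security_group" "redshift" {
  name   = "redshift-sg"
  vpc_id = data.terraform_remote_state.networking.outputs.vpc_id
}

resource "aws_redshift_subnet_group" "analytics" {
  name       = "analytics-subnets"
  subnet_ids = data.terraform_remote_state.networking.outputs.private_subnet_ids
}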

Terraform Modules

Module Structure

terraform/
├── modules/
│   ├── data_lake/
│   │   ├── main.tf
│   │   ├── variables.tf
│   │   ├── outputs.tf
│   │   └── README.md
│   ├── redshift/
│   │   ├── main.tf
│   │   ├── variables.tf
│   │   ├── outputs.tf
│   │   └── README.md
│   └── airflow/
│       ├── main.tf
│       ├── variables.tf
│       ├── outputs.tf
│       └── README.md
├── environments/
│   ├── dev/
│   │   ├── main.tf
│   │   └── terraform.tfvars
│   ├── staging/
│   │   ├── main.tf
│   │   └── terraform.tfvars
│   └── production/
│       ├── main.tf
│       └── terraform.tfvars
└── main.tf
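Each environment directory carries its own terraform.tfvars with environment-specific values. A small sketch for the dev environment (variable names are illustrative and assume matching declarations in environments/dev/main.tf):

environments/dev/terraform.tfvars
environment         = "dev"
aws_region          = "us-east-1"
data_lake_bucket    = "my-company-data-lake-dev"
redshift_node_type  = "dc2.large"
redshift_node_count = 1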

Module Example

modules/data_lake/main.tf
resource "aws_s3_bucket" "data_lake" {
bucket = var.bucket_name
tags = var.tags
}
resource "aws_s3_bucket_versioning" "data_lake_versioning" {
bucket = aws_s3_bucket.data_lake.id
versioning_configuration {
status = "Enabled"
}
}
resource "aws_s3_bucket_server_side_encryption_configuration" "data_lake_encryption" {
bucket = aws_s3_bucket.data_lake.id
rule {
apply_server_side_encryption_by_default {
sse_algorithm = "AES256"
}
}
}
resource "aws_s3_bucket_lifecycle_configuration" "data_lake_lifecycle" {
bucket = aws_s3_bucket.data_lake.id
rule {
id = "transition-to-ia"
status = "Enabled"
transition {
days = 30
storage_class = "STANDARD_IA"
}
transition {
days = 90
storage_class = "GLACIER"
}
expiration {
days = 365
}
}
}
modules/data_lake/variables.tf
variable "bucket_name" {
description = "Name of the S3 bucket"
type = string
}
variable "tags" {
description = "Tags to apply to resources"
type = map(string)
default = {}
}
variable "versioning_enabled" {
description = "Enable S3 versioning"
type = bool
default = true
}
variable "lifecycle_rules" {
description = "Lifecycle rules for the bucket"
type = map(object({
transition_days_ia = number
transition_days_glacier = number
expiration_days = number
}))
default = {}
}
modules/data_lake/outputs.tf
output "bucket_name" {
description = "Name of the S3 bucket"
value = aws_s3_bucket.data_lake.id
}
output "bucket_arn" {
description = "ARN of the S3 bucket"
value = aws_s3_bucket.data_lake.arn
}
output "bucket_id" {
description = "ID of the S3 bucket"
value = aws_s3_bucket.data_lake.id
}

Using Modules

environments/production/main.tf
module "data_lake_raw" {
source = "../../modules/data_lake"
bucket_name = "my-company-raw-data"
tags = {
Environment = "production"
Layer = "raw"
ManagedBy = "terraform"
}
lifecycle_rules = {
default = {
transition_days_ia = 30
transition_days_glacier = 90
expiration_days = 365
}
}
}
module "data_lake_curated" {
source = "../../modules/data_lake"
bucket_name = "my-company-curated-data"
tags = {
Environment = "production"
Layer = "curated"
ManagedBy = "terraform"
}
lifecycle_rules = {
default = {
transition_days_ia = 90
transition_days_glacier = 365
expiration_days = 2555 # 7 years
}
}
}
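Module outputs are referenced as module.<name>.<output>. For example, the same root configuration could define a read-only IAM policy scoped to the raw bucket created above (the policy name is illustrative):

# Read-only access to the raw data lake bucket created by the module
resource "aws_iam_policy" "raw_read_only" {
  name = "raw-data-lake-read-only"

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect = "Allow"
        Action = ["s3:GetObject", "s3:ListBucket"]
        Resource = [
          module.data_lake_raw.bucket_arn,
          "${module.data_lake_raw.bucket_arn}/*"
        ]
      }
    ]
  })
}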

Data Platform Resources

Redshift Cluster

modules/redshift/main.tf
resource "aws_redshift_cluster" "analytics" {
cluster_identifier = var.cluster_identifier
database_name = var.database_name
master_username = var.master_username
master_password = var.master_password
node_type = var.node_type
number_of_nodes = var.number_of_nodes
# VPC configuration
cluster_subnet_group_name = var.cluster_subnet_group_name
vpc_security_group_ids = var.security_group_ids
cluster_public_key = var.cluster_public_key
# IAM roles
iam_roles = [
aws_iam_role.redshift_s3_access.arn
]
# Parameters
automated_snapshot_retention_period = var.snapshot_retention_period
preferred_maintenance_window = var.maintenance_window
cluster_type = var.cluster_type # single-node or multi-node
# Enhanced VPC routing
enhanced_vpc_routing = var.enhanced_vpc_routing
# Logging
enable_logging = var.enable_logging
log_exports = var.log_exports
tags = var.tags
}
# IAM role for S3 access
resource "aws_iam_role" "redshift_s3_access" {
name = "${var.cluster_identifier}-s3-access"
assume_role_policy = jsonencode({
Version = "2012-10-17"
Statement = [
{
Action = "sts:AssumeRole"
Effect = "Allow"
Principal = {
Service = "redshift.amazonaws.com"
}
}
]
})
}
resource "aws_iam_role_policy_attachment" "redshift_s3_read_only" {
role = aws_iam_role.redshift_s3_access.name
policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
}
# Snapshot schedule
resource "aws_redshift_snapshot_schedule" "daily" {
identifier = "${var.cluster_identifier}-daily-snapshot"
schedules {
snapshot_identifier_prefix = var.cluster_identifier
snapshot_create_tags = {
SnapshotType = "daily"
}
}
definition {
schedule = "cron(0 0 * * ? *)" # Daily at midnight UTC
}
}
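Calling the module from an environment configuration might look like the following sketch. The variable names mirror the module above; the subnet group, security group, and redshift_master_password variable are assumptions standing in for resources defined elsewhere:

module "redshift_analytics" {
  source = "../../modules/redshift"

  cluster_identifier = "analytics-prod"
  database_name      = "analytics"
  master_username    = "admin"
  master_password    = var.redshift_master_password # hypothetical variable; supply via TF_VAR_... or a secrets manager, never hard-code

  node_type       = "ra3.4xlarge"
  number_of_nodes = 4
  cluster_type    = "multi-node"

  # Assumed to be defined in the networking configuration
  cluster_subnet_group_name = aws_redshift_subnet_group.analytics.name
  security_group_ids        = [aws_security_group.redshift.id]

  snapshot_retention_period = 7
  maintenance_window        = "sun:03:00-sun:04:00"
  enhanced_vpc_routing      = true
  enable_logging            = true
  log_exports               = ["connectionlog", "userlog"]

  tags = {
    Environment = "production"
    ManagedBy   = "terraform"
  }
}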

BigQuery Dataset

modules/bigquery/main.tf
resource "google_bigquery_dataset" "analytics" {
dataset_id = var.dataset_id
project = var.project_id
location = var.location
labels = var.labels
default_table_expiration_ms = var.default_table_expiration_ms
access {
role = "OWNER"
user_by_email = var.owner_email
}
access {
role = "READER"
domain = "my-company.com"
}
depends_on = [
google_project_iam_member.bigquery_admin
]
}
resource "google_bigquery_table" "sales" {
dataset_id = google_bigquery_dataset.analytics.dataset_id
table_id = "sales"
project = var.project_id
schema = <<EOF
[
{
"name": "sale_id",
"type": "INTEGER",
"mode": "REQUIRED"
},
{
"name": "customer_id",
"type": "INTEGER",
"mode": "REQUIRED"
},
{
"name": "sale_date",
"type": "DATE",
"mode": "REQUIRED"
},
{
"name": "amount",
"type": "NUMERIC",
"mode": "NULLABLE"
}
]
EOF
time_partitioning {
type = "DAY"
field = "sale_date"
expiration_ms = 7776000000 # 90 days
require_partition_filter = true
}
clustering = ["customer_id"]
labels = var.labels
}
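A matching variables.tf for this module might look like the following sketch; the names mirror the references above and the defaults are illustrative:

modules/bigquery/variables.tf
variable "dataset_id" {
  description = "BigQuery dataset ID"
  type        = string
}

variable "project_id" {
  description = "GCP project ID"
  type        = string
}

variable "location" {
  description = "Dataset location (e.g. US, EU)"
  type        = string
  default     = "US"
}

variable "owner_email" {
  description = "Email of the dataset owner"
  type        = string
}

variable "default_table_expiration_ms" {
  description = "Default table expiration in milliseconds (null = never expire)"
  type        = number
  default     = null
}

variable "labels" {
  description = "Labels to apply to the dataset and tables"
  type        = map(string)
  default     = {}
}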

Snowflake Resources

modules/snowflake/main.tf
# Configure Snowflake provider
provider "snowflake" {
  account = var.snowflake_account
  region  = var.snowflake_region
  role    = var.snowflake_role
}

# Create warehouse
resource "snowflake_warehouse" "compute_wh" {
  name              = var.warehouse_name
  warehouse_size    = var.warehouse_size       # X-Small to 4X-Large
  auto_suspend      = var.auto_suspend_seconds # seconds of inactivity before suspending
  auto_resume       = true
  scaling_policy    = var.scaling_policy       # STANDARD or ECONOMY
  max_cluster_count = var.max_cluster_count
  min_cluster_count = var.min_cluster_count
  comment           = "Managed by Terraform (${var.environment})"
}

# Create database
resource "snowflake_database" "analytics" {
  name    = var.database_name
  comment = "Analytics database (${var.environment}), managed by Terraform"
}

# Create schema
resource "snowflake_schema" "public" {
  database   = snowflake_database.analytics.name
  name       = "public"
  comment    = "Public schema"
  is_managed = false
}

# Create role
resource "snowflake_role" "data_engineer" {
  name    = "data_engineer"
  comment = "Data engineer role"
}

# Grant privileges (grant resources vary by provider version; this uses the
# classic snowflake_schema_grant / snowflake_table_grant resources)
resource "snowflake_schema_grant" "data_engineer_usage" {
  database_name = snowflake_database.analytics.name
  schema_name   = snowflake_schema.public.name
  privilege     = "USAGE"
  roles         = [snowflake_role.data_engineer.name]
}

resource "snowflake_table_grant" "data_engineer_select" {
  database_name = snowflake_database.analytics.name
  schema_name   = snowflake_schema.public.name
  privilege     = "SELECT"
  on_future     = true # also applies to tables created later
  roles         = [snowflake_role.data_engineer.name]
}
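The role itself still has to be granted to users or a parent role. A minimal sketch using snowflake_role_grants (the user names are hypothetical):

# Assign the data_engineer role to users and roll it up under SYSADMIN
resource "snowflake_role_grants" "data_engineer_members" {
  role_name = snowflake_role.data_engineer.name

  users = [
    "ALICE", # hypothetical user names
    "BOB"
  ]

  roles = [
    "SYSADMIN"
  ]
}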

Terraform Best Practices

Environment Management

Workspaces

# Use Terraform workspaces for environment management

# Create and switch to a workspace
terraform workspace new dev

# Use workspace-specific variables
variable "cluster_size" {
  type = map(string)
  default = {
    "dev"     = "dc2.large"
    "staging" = "dc2.8xlarge"
    "prod"    = "ra3.4xlarge"
  }
}

locals {
  cluster_size = var.cluster_size[terraform.workspace]
}

# Use the workspace-specific value in a resource
resource "aws_redshift_cluster" "analytics" {
  node_type = local.cluster_size
  # ... remaining cluster arguments ...
}
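The same pattern extends to names and tags, so each workspace gets its own copy of a resource. A small sketch:

locals {
  env_name = terraform.workspace # dev, staging, prod

  common_tags = {
    Environment = local.env_name
    ManagedBy   = "terraform"
  }
}

resource "aws_s3_bucket" "data_lake" {
  bucket = "my-company-data-lake-${local.env_name}"
  tags   = local.common_tags
}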

Terraform CI/CD

GitHub Actions

.github/workflows/terraform.yml
name: Terraform

on:
  pull_request:
    paths:
      - 'terraform/**'
  push:
    branches:
      - main
    paths:
      - 'terraform/**'

env:
  TF_VERSION: '1.5.0'
  AWS_REGION: 'us-east-1'

defaults:
  run:
    working-directory: terraform

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Terraform Format Check
        run: terraform fmt -check -recursive
      - name: Terraform Init
        run: terraform init -backend=false
      - name: Terraform Validate
        run: terraform validate

  plan:
    needs: validate
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Terraform Init
        run: terraform init
      - name: Terraform Plan
        run: terraform plan -out=tfplan
      - name: Terraform Plan Summary
        run: terraform show -no-color tfplan

  apply:
    needs: validate
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}
      - name: Terraform Init
        run: terraform init
      - name: Terraform Apply
        run: terraform apply -auto-approve

Terraform State Drift

Detecting Drift

Terminal window
# Detect drift without making changes (replaces the deprecated `terraform refresh`)
terraform plan -refresh-only

# A regular plan also surfaces drift alongside pending configuration changes
terraform plan

# Inspect resources currently recorded in state
terraform show

Managing Drift

# Use lifecycle settings to guard against manual changes
resource "aws_s3_bucket" "data_lake" {
  bucket = "my-company-data-lake"

  lifecycle {
    ignore_changes  = [] # Detect all changes
    # ignore_changes = [tags["LastModified"]] # Ignore specific changes
    prevent_destroy = true # Prevent accidental deletion
  }
}
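When drift takes the form of resources created outside Terraform, they can be adopted into state instead of recreated. With Terraform 1.5+ this can be declared in configuration via an import block (the bucket name here is illustrative):

# Adopt a manually created bucket into Terraform state (Terraform >= 1.5)
import {
  to = aws_s3_bucket.manually_created
  id = "my-company-manually-created-bucket"
}

resource "aws_s3_bucket" "manually_created" {
  bucket = "my-company-manually-created-bucket"
}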

Key Takeaways

  1. Declarative: Define desired state, Terraform handles the rest
  2. State Management: Remote state with locking for team collaboration
  3. Modules: Reusable infrastructure components
  4. Environments: Separate workspaces or directories for dev/staging/prod
  5. CI/CD: Integrate Terraform with GitHub Actions
  6. Drift Detection: Regularly check for manual changes
  7. Best Practices: Use modules, version control, automated testing
  8. Cost: Use Terraform for cost optimization (spot instances, auto-scaling)
