Terraform for Data Infrastructure

Infrastructure as Code for Data Platforms

Overview

Terraform is a declarative Infrastructure as Code (IaC) tool that enables reproducible, version-controlled infrastructure. For data platforms, Terraform manages warehouses, storage, networking, IAM, and orchestration infrastructure across AWS, GCP, and Azure.

Terraform Fundamentals

Declarative Syntax

# Provider configuration
provider "aws" {
  region = "us-east-1"
}

# S3 bucket for data lake
resource "aws_s3_bucket" "data_lake" {
  bucket = "my-company-data-lake"

  tags = {
    Environment = "production"
    ManagedBy   = "terraform"
  }
}

# DynamoDB table for Terraform state locking
resource "aws_dynamodb_table" "terraform_state_lock" {
  name           = "terraform-state-lock"
  billing_mode   = "PAY_PER_REQUEST"
  hash_key       = "LockID"

  attribute {
    name = "LockID"
    type = "S"
  }
}

# Terraform backend configuration
terraform {
  backend "s3" {
    bucket         = "my-company-terraform-state"
    key            = "data-platform/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-state-lock"
  }
}

Terraform State Management

Remote State

terraform {
  backend "s3" {
    bucket         = "my-company-terraform-state"
    key            = "data-platform/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-state-lock"

    # State versioning
    key_prefix = "data-platform:"
  }
}

# Benefits:
# - Team collaboration (shared state)
# - State locking (prevent concurrent updates)
# - State versioning (rollback capability)
# - Encryption (security)

State Outputs

# Output values for other Terraform configurations
output "data_lake_bucket_name" {
  value = aws_s3_bucket.data_lake.id
  description = "Name of the data lake S3 bucket"
}

output "redshift_cluster_endpoint" {
  value = aws_redshift_cluster.analytics.endpoint
  description = "Redshift cluster endpoint"
  sensitive = true  # Don't show in plain text
}

output "vpc_id" {
  value = aws_vpc.main.id
  description = "VPC ID for networking"
}

# Use outputs in other configurations:
# data "terraform_remote_state" "networking" {
#   backend = "s3"
#   config = {
#     bucket = "my-company-terraform-state"
#     key    = "networking/terraform.tfstate"
#     region = "us-east-1"
#   }
# }

Terraform Modules

Module Structure

terraform/
├── modules/
│   ├── data_lake/
│   │   ├── main.tf
│   │   ├── variables.tf
│   │   ├── outputs.tf
│   │   └── index.md
│   ├── redshift/
│   │   ├── main.tf
│   │   ├── variables.tf
│   │   ├── outputs.tf
│   │   └── index.md
│   └── airflow/
│       ├── main.tf
│       ├── variables.tf
│       ├── outputs.tf
│       └── index.md
├── environments/
│   ├── dev/
│   │   ├── main.tf
│   │   └── terraform.tfvars
│   ├── staging/
│   │   ├── main.tf
│   │   └── terraform.tfvars
│   └── production/
│       ├── main.tf
│       └── terraform.tfvars
└── main.tf

Module Example

resource "aws_s3_bucket" "data_lake" {
  bucket = var.bucket_name

  tags = var.tags
}

resource "aws_s3_bucket_versioning" "data_lake_versioning" {
  bucket = aws_s3_bucket.data_lake.id

  versioning_configuration {
    status = "Enabled"
  }
}

resource "aws_s3_bucket_server_side_encryption_configuration" "data_lake_encryption" {
  bucket = aws_s3_bucket.data_lake.id

  rule {
    apply_server_side_encryption_by_default {
      sse_algorithm = "AES256"
    }
  }
}

resource "aws_s3_bucket_lifecycle_configuration" "data_lake_lifecycle" {
  bucket = aws_s3_bucket.data_lake.id

  rule {
    id     = "transition-to-ia"
    status = "Enabled"

    transition {
      days          = 30
      storage_class = "STANDARD_IA"
    }

    transition {
      days          = 90
      storage_class = "GLACIER"
    }

    expiration {
      days = 365
    }
  }
}

variable "bucket_name" {
  description = "Name of the S3 bucket"
  type        = string
}

variable "tags" {
  description = "Tags to apply to resources"
  type        = map(string)
  default     = {}
}

variable "versioning_enabled" {
  description = "Enable S3 versioning"
  type        = bool
  default     = true
}

variable "lifecycle_rules" {
  description = "Lifecycle rules for the bucket"
  type = map(object({
    transition_days_ia = number
    transition_days_glacier = number
    expiration_days = number
  }))
  default = {}
}

output "bucket_name" {
  description = "Name of the S3 bucket"
  value       = aws_s3_bucket.data_lake.id
}

output "bucket_arn" {
  description = "ARN of the S3 bucket"
  value       = aws_s3_bucket.data_lake.arn
}

output "bucket_id" {
  description = "ID of the S3 bucket"
  value       = aws_s3_bucket.data_lake.id
}

Using Modules

module "data_lake_raw" {
  source = "../../modules/data_lake"

  bucket_name = "my-company-raw-data"
  tags = {
    Environment = "production"
    Layer       = "raw"
    ManagedBy   = "terraform"
  }

  lifecycle_rules = {
    default = {
      transition_days_ia       = 30
      transition_days_glacier  = 90
      expiration_days         = 365
    }
  }
}

module "data_lake_curated" {
  source = "../../modules/data_lake"

  bucket_name = "my-company-curated-data"
  tags = {
    Environment = "production"
    Layer       = "curated"
    ManagedBy   = "terraform"
  }

  lifecycle_rules = {
    default = {
      transition_days_ia       = 90
      transition_days_glacier  = 365
      expiration_days         = 2555  # 7 years
    }
  }
}

Data Platform Resources

Redshift Cluster

resource "aws_redshift_cluster" "analytics" {
  cluster_identifier      = var.cluster_identifier
  database_name           = var.database_name
  master_username         = var.master_username
  master_password         = var.master_password
  node_type               = var.node_type
  number_of_nodes         = var.number_of_nodes

  # VPC configuration
  cluster_subnet_group_name = var.cluster_subnet_group_name
  vpc_security_group_ids    = var.security_group_ids
  cluster_public_key         = var.cluster_public_key

  # IAM roles
  iam_roles = [
    aws_iam_role.redshift_s3_access.arn
  ]

  # Parameters
  automated_snapshot_retention_period = var.snapshot_retention_period
  preferred_maintenance_window       = var.maintenance_window
  cluster_type                        = var.cluster_type  # single-node or multi-node

  # Enhanced VPC routing
  enhanced_vpc_routing = var.enhanced_vpc_routing

  # Logging
  enable_logging = var.enable_logging
  log_exports   = var.log_exports

  tags = var.tags
}

# IAM role for S3 access
resource "aws_iam_role" "redshift_s3_access" {
  name = "${var.cluster_identifier}-s3-access"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Action = "sts:AssumeRole"
        Effect = "Allow"
        Principal = {
          Service = "redshift.amazonaws.com"
        }
      }
    ]
  })
}

resource "aws_iam_role_policy_attachment" "redshift_s3_read_only" {
  role       = aws_iam_role.redshift_s3_access.name
  policy_arn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
}

# Snapshot schedule
resource "aws_redshift_snapshot_schedule" "daily" {
  identifier = "${var.cluster_identifier}-daily-snapshot"

  schedules {
    snapshot_identifier_prefix = var.cluster_identifier
    snapshot_create_tags = {
      SnapshotType = "daily"
    }
  }

  definition {
    schedule = "cron(0 0 * * ? *)"  # Daily at midnight UTC
  }
}

BigQuery Dataset

resource "google_bigquery_dataset" "analytics" {
  dataset_id  = var.dataset_id
  project     = var.project_id
  location    = var.location

  labels = var.labels

  default_table_expiration_ms = var.default_table_expiration_ms

  access {
    role          = "OWNER"
    user_by_email = var.owner_email
  }

  access {
    role   = "READER"
    domain = "my-company.com"
  }

  depends_on = [
    google_project_iam_member.bigquery_admin
  ]
}

resource "google_bigquery_table" "sales" {
  dataset_id = google_bigquery_dataset.analytics.dataset_id
  table_id   = "sales"
  project    = var.project_id

  schema = <<EOF
[
  {
    "name": "sale_id",
    "type": "INTEGER",
    "mode": "REQUIRED"
  },
  {
    "name": "customer_id",
    "type": "INTEGER",
    "mode": "REQUIRED"
  },
  {
    "name": "sale_date",
    "type": "DATE",
    "mode": "REQUIRED"
  },
  {
    "name": "amount",
    "type": "NUMERIC",
    "mode": "NULLABLE"
  }
]
EOF

  time_partitioning {
    type                  = "DAY"
    field                 = "sale_date"
    expiration_ms         = 7776000000  # 90 days
    require_partition_filter = true
  }

  clustering = ["customer_id"]

  labels = var.labels
}

Snowflake Resources

# Configure Snowflake provider
provider "snowflake" {
  account = var.snowflake_account
  region  = var.snowflake_region
  role    = var.snowflake_role
}

# Create warehouse
resource "snowflake_warehouse" "compute_wh" {
  name           = var.warehouse_name
  warehouse_size = var.warehouse_size  # X-Small to 4X-Large
  auto_suspend   = var.auto_suspend_minutes
  auto_resume    = true

  scaling_policy = var.scaling_policy  # STANDARD or ECONOMY
  max_cluster_count = var.max_cluster_count
  min_cluster_count = var.min_cluster_count

  tags = {
    Environment = var.environment
    ManagedBy   = "terraform"
  }
}

# Create database
resource "snowflake_database" "analytics" {
  name    = var.database_name
  comment = "Analytics database"

  tags = {
    Environment = var.environment
  }
}

# Create schema
resource "snowflake_schema" "public" {
  database   = snowflake_database.analytics.name
  name       = "public"
  comment    = "Public schema"
  is_managed = false
}

# Create role
resource "snowflake_role" "data_engineer" {
  name = "data_engineer"
  comment = "Data engineer role"
}

# Grant privileges
resource "snowflake_role_grants" "data_engineer" {
  role_name = snowflake_role.data_engineer.name

  grants {
    privilege = "USAGE"
    on_schema {
      schema_name = "${snowflake_database.analytics.name}.${snowflake_schema.public.name}"
    }
  }

  grants {
    privilege = "SELECT"
    on_schema_object {
      object_type = "TABLES"
      object_name = "${snowflake_database.analytics.name}.${snowflake_schema.public.name}"
    }
  }
}

Terraform Best Practices

Environment Management

Workspaces

# Use Terraform workspaces for environment management

# Initialize workspace
terraform workspace new dev

# Use workspace-specific variables
variable "cluster_size" {
  default = {
    "dev"     = "dc2.large"
    "staging" = "dc2.8xlarge"
    "prod"    = "ra3.4xlarge"
  }
}

locals {
  cluster_size = var.cluster_size[terraform.workspace]
}

# Use workspace in resource
resource "aws_redshift_cluster" "analytics" {
  node_type = local.cluster_size
}

Terraform CI/CD

GitHub Actions

name: Terraform

on:
  pull_request:
    paths:
      - 'terraform/**'
  push:
    branches:
      - main
    paths:
      - 'terraform/**'

env:
  TF_VERSION: '1.5.0'
  AWS_REGION: 'us-east-1'

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Terraform Format Check
        run: terraform fmt -check -recursive

      - name: Terraform Init
        run: terraform init

      - name: Terraform Validate
        run: terraform validate

  plan:
    needs: validate
    if: github.event_name == 'pull_request'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Terraform Init
        run: terraform init

      - name: Terraform Plan
        run: terraform plan -out=tfplan

      - name: Terraform Plan Summary
        run: terraform show -no-color tfplan

  apply:
    needs: plan
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Terraform Init
        run: terraform init

      - name: Terraform Apply
        run: terraform apply -auto-approve

Terraform State Drift

Detecting Drift

# Refresh state to detect drift
terraform refresh

# Plan to show drift
terraform plan

# Or use terraform show to see drifted resources
terraform show

Managing Drift

# Use lifecycle to prevent manual changes
resource "aws_s3_bucket" "data_lake" {
  bucket = "my-company-data-lake"

  lifecycle {
    ignore_changes = []  # Detect all changes
    # ignore_changes = [tags["LastModified"]]  # Ignore specific changes
    prevent_destroy = true  # Prevent accidental deletion
  }
}

Key Takeaways

Declarative: Define desired state, Terraform handles the rest
State Management: Remote state with locking for team collaboration
Modules: Reusable infrastructure components
Environments: Separate workspaces or directories for dev/staging/prod
CI/CD: Integrate Terraform with GitHub Actions
Drift Detection: Regularly check for manual changes
Best Practices: Use modules, version control, automated testing
Cost: Use Terraform for cost optimization (spot instances, auto-scaling)

Back to Module 3