Lifecycle Management
Data Tiering and Retention Policies
Overview
Lifecycle management automatically moves data between storage tiers based on age, access patterns, and business requirements, reducing storage costs while maintaining data availability.
Lifecycle Architecture
Tiering Strategy
Tier Definitions:
- Hot: 0-30 days old, frequent access, high-performance storage
- Warm: 30-90 days old, occasional access, standard storage
- Cold: 90+ days old, rare access, archive storage
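These boundaries map naturally onto S3 storage classes. Below is a minimal sketch of the tier policy as a lookup table, assuming the storage classes used later on this page; the thresholds themselves are a business decision:

# Tier boundaries as a simple policy table (thresholds and storage classes are illustrative)
TIER_POLICY = {
    'hot':  {'max_age_days': 30,   'storage_class': 'STANDARD'},     # frequent access
    'warm': {'max_age_days': 90,   'storage_class': 'STANDARD_IA'},  # occasional access
    'cold': {'max_age_days': None, 'storage_class': 'GLACIER'},      # rare access, archive
}

def tier_for_age(age_days: int) -> str:
    """Pick the tier for an object of a given age in days."""
    if age_days <= TIER_POLICY['hot']['max_age_days']:
        return 'hot'
    if age_days <= TIER_POLICY['warm']['max_age_days']:
        return 'warm'
    return 'cold'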
S3 Lifecycle Policies
S3 Lifecycle Rules
# S3 lifecycle policy configuration
import boto3

def create_lifecycle_policy(
    bucket_name: str,
    transition_days_standard: int = 30,
    transition_days_ia: int = 90,
    expiration_days: int = 365
):
    """Create S3 lifecycle policy"""
    s3_client = boto3.client('s3')

    lifecycle_configuration = {
        'Rules': [
            {
                'ID': 'TransitionAndExpireRule',
                'Status': 'Enabled',
                'Filter': {'Prefix': 'data/'},
                'Transitions': [
                    {
                        'Days': transition_days_standard,
                        'StorageClass': 'STANDARD_IA'  # Infrequent Access
                    },
                    {
                        'Days': transition_days_ia,
                        'StorageClass': 'GLACIER'  # Archive
                    }
                ],
                'Expiration': {'Days': expiration_days}
            }
        ]
    }

    s3_client.put_bucket_lifecycle_configuration(
        Bucket=bucket_name,
        LifecycleConfiguration=lifecycle_configuration
    )

    print(f"Lifecycle policy created for {bucket_name}")

# Example usage
create_lifecycle_policy(
    bucket_name='my-company-data-lake',
    transition_days_standard=30,  # Move to IA after 30 days
    transition_days_ia=90,        # Move to Glacier after 90 days
    expiration_days=365           # Delete after 1 year
)
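After applying a policy it is worth confirming what S3 actually stored. A small check using boto3's get_bucket_lifecycle_configuration, reusing the bucket name from the example above:

# Verify the applied lifecycle rules (bucket name reused from the example above)
import boto3

s3_client = boto3.client('s3')
response = s3_client.get_bucket_lifecycle_configuration(Bucket='my-company-data-lake')

for rule in response['Rules']:
    print(rule['ID'], rule['Status'])
    for transition in rule.get('Transitions', []):
        print(f"  transition after {transition['Days']} days -> {transition['StorageClass']}")
    if 'Expiration' in rule:
        print(f"  expire after {rule['Expiration']['Days']} days")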
Intelligent Tiering
# S3 Intelligent-Tiering
import boto3

def enable_intelligent_tiering(bucket_name: str):
    """Transition objects under data/ into the INTELLIGENT_TIERING storage class"""
    s3_client = boto3.client('s3')

    # Note: put_bucket_lifecycle_configuration replaces the bucket's existing rules,
    # so in practice combine this rule with any others in a single call.
    s3_client.put_bucket_lifecycle_configuration(
        Bucket=bucket_name,
        LifecycleConfiguration={
            'Rules': [
                {
                    'ID': 'IntelligentTiering',
                    'Status': 'Enabled',
                    'Filter': {'Prefix': 'data/'},
                    'Transitions': [
                        {
                            'Days': 90,
                            'StorageClass': 'INTELLIGENT_TIERING'
                        }
                    ]
                }
            ]
        }
    )

    print(f"Intelligent tiering enabled for {bucket_name}")
Database Lifecycle
Table Partition Lifecycle
-- Partition lifecycle management

-- Create partitioned table
CREATE TABLE sales (
    order_id    BIGINT,
    customer_id BIGINT,
    order_date  DATE,
    amount      DECIMAL(10,2)
) PARTITION BY RANGE (order_date);

-- Create partitions (upper bounds are exclusive)
-- Current month (hot)
CREATE TABLE sales_2025_01 PARTITION OF sales
    FOR VALUES FROM ('2025-01-01') TO ('2025-02-01');

-- Previous month (warm)
CREATE TABLE sales_2024_12 PARTITION OF sales
    FOR VALUES FROM ('2024-12-01') TO ('2025-01-01');

-- Older months (cold)
CREATE TABLE sales_2024_11 PARTITION OF sales
    FOR VALUES FROM ('2024-11-01') TO ('2024-12-01');

-- Archive old partitions: detach and move to cheaper storage, or drop them

-- Drop old partitions (after 1 year)
DROP TABLE sales_2024_01;
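Partition cleanup is easy to automate from Python. A minimal sketch, assuming the psycopg2 driver, the sales_YYYY_MM naming convention above, and a 12-month retention window; adapt before running against a real database:

# Drop monthly partitions older than the retention window
# (sketch; assumes psycopg2 and the sales_YYYY_MM naming convention)
from datetime import date
import psycopg2

def drop_expired_partitions(dsn: str, retention_months: int = 12):
    """Drop sales_YYYY_MM partitions whose month is older than the retention window."""
    today = date.today()
    current_index = today.year * 12 + (today.month - 1)   # months since year 0, for simple arithmetic
    cutoff_index = current_index - retention_months

    with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
        # Find child partitions by naming convention
        cur.execute(
            "SELECT tablename FROM pg_tables "
            "WHERE tablename ~ '^sales_[0-9]{4}_[0-9]{2}$'"
        )
        for (tablename,) in cur.fetchall():
            _, year, month = tablename.split('_')
            partition_index = int(year) * 12 + (int(month) - 1)
            if partition_index < cutoff_index:
                # Table name already validated by the regex above
                cur.execute(f'DROP TABLE IF EXISTS {tablename}')
                print(f"Dropped expired partition {tablename}")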
Data Retention Policies
-- Data retention policy implementation

-- Create retention function
CREATE OR REPLACE FUNCTION apply_retention_policy()
RETURNS void AS $$
BEGIN
    -- Archive data older than 6 months: move it to the archive table
    INSERT INTO sales_archive
    SELECT * FROM sales
    WHERE order_date < CURRENT_DATE - INTERVAL '6 months';

    -- Delete the archived rows from the main table
    DELETE FROM sales
    WHERE order_date < CURRENT_DATE - INTERVAL '6 months';

    -- Purge archived data older than 1 year
    DELETE FROM sales_archive
    WHERE order_date < CURRENT_DATE - INTERVAL '1 year';
END;
$$ LANGUAGE plpgsql;

-- Schedule the retention function
CREATE EXTENSION IF NOT EXISTS pg_cron;

-- Run daily at 2 AM
SELECT cron.schedule(
    'apply-retention',
    '0 2 * * *',
    'SELECT apply_retention_policy()'
);
Delta Lake Lifecycle
Vacuum and Optimize
# Delta Lake lifecycle management
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("DeltaLifecycle")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

def delta_lifecycle(
    table_path: str,
    vacuum_hours: int = 168,     # 7 days
    zorder_columns: list = None
):
    """Apply Delta Lake lifecycle policies"""
    delta_table = DeltaTable.forPath(spark, table_path)

    # Vacuum: remove unreferenced data files older than the retention window
    delta_table.vacuum(vacuum_hours)

    # Optimize: Z-Order by the given columns, or just compact small files
    if zorder_columns:
        delta_table.optimize().executeZOrderBy(*zorder_columns)
    else:
        delta_table.optimize().executeCompaction()

    print(f"Lifecycle management complete for {table_path}")

# Example usage
delta_lifecycle(
    table_path="s3://bucket/delta/sales/",
    vacuum_hours=168,  # Keep 7 days of history
    zorder_columns=['order_date', 'customer_id']
)
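Vacuum only deletes files older than the table's retention settings, so those settings are part of lifecycle management too. A minimal sketch that reuses the Spark session above to set the deleted-file and log retention table properties on an illustrative table path:

# Adjust Delta retention windows via table properties (illustrative path; reuses the Spark session above)
spark.sql("""
    ALTER TABLE delta.`s3://bucket/delta/sales/`
    SET TBLPROPERTIES (
        'delta.deletedFileRetentionDuration' = 'interval 7 days',
        'delta.logRetentionDuration' = 'interval 30 days'
    )
""")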
Tiering Strategy
Access-Based Tiering
Tiering Implementation
# Automated tiering based on access
from datetime import datetime
import boto3

class TieringManager:
    """Automated data tiering"""

    def __init__(self, bucket_name: str):
        self.s3_client = boto3.client('s3')
        self.bucket_name = bucket_name

    def classify_data(self, access_logs: list) -> dict:
        """Classify data into tiers based on access"""
        # Aggregate access statistics per object key
        data_classification = {}

        for log in access_logs:
            key = log['key']
            if key not in data_classification:
                data_classification[key] = {
                    'access_count': 0,
                    'last_accessed': log['timestamp']
                }
            data_classification[key]['access_count'] += 1
            data_classification[key]['last_accessed'] = max(
                data_classification[key]['last_accessed'],
                log['timestamp']
            )

        # Classify into tiers
        tier_classification = {}

        for key, stats in data_classification.items():
            days_since_access = (datetime.now() - stats['last_accessed']).days
            access_frequency = stats['access_count'] / 30  # Accesses per day over a 30-day window

            if access_frequency > 1:
                tier = 'hot'
            elif access_frequency > 0.1:  # More than ~3 accesses per month
                tier = 'warm'
            else:
                tier = 'cold'

            tier_classification[key] = {
                'tier': tier,
                'access_frequency': access_frequency,
                'days_since_access': days_since_access
            }

        return tier_classification

    def move_to_tier(self, key: str, tier: str):
        """Move data to the storage class for its tier"""
        if tier == 'hot':
            storage_class = 'STANDARD'
        elif tier == 'warm':
            storage_class = 'STANDARD_IA'
        elif tier == 'cold':
            storage_class = 'GLACIER'
        else:
            raise ValueError(f"Unknown tier: {tier}")

        # Copy the object in place with the new storage class
        self.s3_client.copy_object(
            CopySource={'Bucket': self.bucket_name, 'Key': key},
            Bucket=self.bucket_name,
            Key=key,
            StorageClass=storage_class
        )

        print(f"Moved {key} to {tier} tier ({storage_class})")
Lifecycle Best Practices
DO
1. Define clear tier boundaries: combine age with access frequency.
2. Automate lifecycle policies: no manual intervention.
3. Monitor tier effectiveness: track access patterns over time.
4. Test data recovery: ensure archives can actually be restored (see the restore sketch below).
5. Document retention policies: compliance requirements depend on them.
DON'T
1. Don't delete data without a backup: ensure recoverability.
2. Don't ignore compliance requirements: legal retention rules may apply.
3. Don't set retention too short: risk of data loss.
4. Don't forget egress fees: retrieval from archive tiers costs money.
5. Don't ignore access patterns: review tier effectiveness regularly.
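Testing recovery from the cold tier (DO item 4) can itself be scripted. A minimal sketch, assuming an illustrative bucket and key, that initiates a Glacier restore and then checks its progress:

# Initiate and check a Glacier restore (assumed bucket/key; 'Standard' retrieval tier)
import boto3

s3_client = boto3.client('s3')

# Ask S3 to make an archived object readable for 7 days
s3_client.restore_object(
    Bucket='my-company-data-lake',
    Key='data/orders/2023/part-0042.parquet',
    RestoreRequest={
        'Days': 7,
        'GlacierJobParameters': {'Tier': 'Standard'}
    }
)

# head_object reports restore progress, e.g. 'ongoing-request="true"'
head = s3_client.head_object(
    Bucket='my-company-data-lake',
    Key='data/orders/2023/part-0042.parquet'
)
print(head.get('Restore', 'restore status not yet available'))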
Key Takeaways
- Hot tier: 0-30 days, frequent access, high-performance
- Warm tier: 30-90 days, occasional access, standard storage
- Cold tier: 90+ days, rare access, archive storage
- S3 lifecycle: Automated tiering with policies
- Intelligent tiering: Automatic optimization
- Vacuum: Remove old files (Delta Lake)
- Retention policies: Delete/archive based on age
- Use when: any data platform where storage cost optimization matters