
Lifecycle Management

Data Tiering and Retention Policies


Overview

Lifecycle management automatically moves data between storage tiers based on age, access patterns, and business requirements, reducing storage costs while maintaining data availability.


Lifecycle Architecture

Tiering Strategy

Tier Definitions (a short code sketch of these thresholds follows the list):

  • Hot: 0-30 days old, frequent access, high-performance storage
  • Warm: 30-90 days old, occasional access, standard storage
  • Cold: 90+ days old, rare access, archive storage
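
The same boundaries can be expressed directly in code. A minimal sketch: the day thresholds and S3 storage classes are the ones listed above, while the dictionary name, helper function, and structure are purely illustrative.

# Illustrative mapping of the tiers above to S3 storage classes.
TIER_POLICY = {
    'hot':  {'max_age_days': 30,   'storage_class': 'STANDARD'},
    'warm': {'max_age_days': 90,   'storage_class': 'STANDARD_IA'},
    'cold': {'max_age_days': None, 'storage_class': 'GLACIER'},  # 90+ days
}

def tier_for_age(age_days: int) -> str:
    """Return the tier name for an object of the given age in days."""
    if age_days < 30:
        return 'hot'
    if age_days < 90:
        return 'warm'
    return 'cold'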

S3 Lifecycle Policies

S3 Lifecycle Rules

# S3 lifecycle policy configuration
import boto3

def create_lifecycle_policy(
    bucket_name: str,
    transition_days_standard: int = 30,
    transition_days_ia: int = 90,
    expiration_days: int = 365
):
    """Create S3 lifecycle policy"""
    s3_client = boto3.client('s3')
    lifecycle_configuration = {
        'Rules': [
            {
                'ID': 'TransitionAndExpireRule',
                'Status': 'Enabled',
                'Filter': {
                    'Prefix': 'data/'
                },
                'Transitions': [
                    {
                        'Days': transition_days_standard,
                        'StorageClass': 'STANDARD_IA'  # Infrequent Access
                    },
                    {
                        'Days': transition_days_ia,
                        'StorageClass': 'GLACIER'  # Archive
                    }
                ],
                'Expiration': {
                    'Days': expiration_days
                }
            }
        ]
    }
    s3_client.put_bucket_lifecycle_configuration(
        Bucket=bucket_name,
        LifecycleConfiguration=lifecycle_configuration
    )
    print(f"Lifecycle policy created for {bucket_name}")

# Example usage
create_lifecycle_policy(
    bucket_name='my-company-data-lake',
    transition_days_standard=30,   # Move to IA after 30 days
    transition_days_ia=90,         # Move to Glacier after 90 days
    expiration_days=365            # Delete after 1 year
)
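
After applying a policy, it can be read back to confirm the rule is active. A small verification sketch using boto3's get_bucket_lifecycle_configuration (the bucket name is the example one used above):

# Verify the lifecycle policy that was just applied (illustrative check).
import boto3

def show_lifecycle_rules(bucket_name: str):
    """Print the lifecycle rules currently attached to a bucket."""
    s3_client = boto3.client('s3')
    response = s3_client.get_bucket_lifecycle_configuration(Bucket=bucket_name)
    for rule in response['Rules']:
        print(rule['ID'], rule['Status'], rule.get('Transitions', []))

show_lifecycle_rules('my-company-data-lake')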

Intelligent Tiering

# S3 Intelligent-Tiering
def enable_intelligent_tiering(bucket_name: str, prefix: str = 'data/'):
    """Transition objects to the INTELLIGENT_TIERING storage class via a lifecycle rule"""
    s3_client = boto3.client('s3')
    # Note: this call replaces any existing lifecycle configuration on the bucket
    s3_client.put_bucket_lifecycle_configuration(
        Bucket=bucket_name,
        LifecycleConfiguration={
            'Rules': [
                {
                    'ID': 'IntelligentTiering',
                    'Status': 'Enabled',
                    'Filter': {'Prefix': prefix},
                    'Transitions': [
                        {
                            'Days': 90,
                            'StorageClass': 'INTELLIGENT_TIERING'
                        }
                    ]
                }
            ]
        }
    )
    print(f"Intelligent tiering enabled for {bucket_name}")

Database Lifecycle

Table Partition Lifecycle

-- Partition lifecycle management
-- Create partitioned table
CREATE TABLE sales (
    order_id    BIGINT,
    customer_id BIGINT,
    order_date  DATE,
    amount      DECIMAL(10,2)
) PARTITION BY RANGE (order_date);

-- Create partitions (the upper bound is exclusive)
-- Current month (hot)
CREATE TABLE sales_2025_01 PARTITION OF sales
    FOR VALUES FROM ('2025-01-01') TO ('2025-02-01');

-- Previous month (warm)
CREATE TABLE sales_2024_12 PARTITION OF sales
    FOR VALUES FROM ('2024-12-01') TO ('2025-01-01');

-- Older months (cold)
CREATE TABLE sales_2024_11 PARTITION OF sales
    FOR VALUES FROM ('2024-11-01') TO ('2024-12-01');

-- Archive old partitions: detach and move to cheaper storage, or delete
-- Drop old partitions (after 1 year)
DROP TABLE sales_2024_01;
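
Creating the next month's partition is usually automated rather than done by hand. A small Python sketch, assuming psycopg2 and a placeholder connection string, that follows the sales_YYYY_MM naming convention above:

# Create next month's sales partition ahead of time (illustrative sketch).
from datetime import date
import psycopg2

def create_next_month_partition(dsn: str):
    """Create the sales partition for the upcoming month if it is missing."""
    today = date.today()
    # First day of next month, and of the month after (exclusive upper bound).
    start = date(today.year + (today.month == 12), today.month % 12 + 1, 1)
    end = date(start.year + (start.month == 12), start.month % 12 + 1, 1)
    name = f"sales_{start.year}_{start.month:02d}"
    sql = (
        f"CREATE TABLE IF NOT EXISTS {name} PARTITION OF sales "
        f"FOR VALUES FROM ('{start}') TO ('{end}')"
    )
    with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(sql)

create_next_month_partition("dbname=analytics user=etl")  # DSN is an assumption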

Data Retention Policies

-- Data retention policy implementation
-- Create retention function
CREATE OR REPLACE FUNCTION apply_retention_policy()
RETURNS void AS $$
BEGIN
    -- Delete data older than 1 year
    DELETE FROM sales
    WHERE order_date < CURRENT_DATE - INTERVAL '1 year';

    -- Archive data older than 6 months: move to archive table
    INSERT INTO sales_archive
    SELECT * FROM sales
    WHERE order_date < CURRENT_DATE - INTERVAL '6 months';

    -- Delete the archived rows from the main table
    DELETE FROM sales
    WHERE order_date < CURRENT_DATE - INTERVAL '6 months';
END;
$$ LANGUAGE plpgsql;

-- Schedule retention function
CREATE EXTENSION IF NOT EXISTS pg_cron;

-- Schedule daily at 2 AM
SELECT cron.schedule(
    'apply-retention',
    '0 2 * * *',
    'SELECT apply_retention_policy()'
);
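
The function can also be exercised ad hoc, and the pg_cron schedule inspected, from Python. A small sketch (psycopg2 and the DSN are assumptions; cron.job must be queried in the database where pg_cron is installed):

# Run the retention function manually and list scheduled jobs (sketch).
import psycopg2

def run_retention_now(dsn: str):
    """Invoke apply_retention_policy() and print the registered cron jobs."""
    with psycopg2.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute("SELECT apply_retention_policy()")
        cur.execute("SELECT jobid, schedule, command FROM cron.job")
        for jobid, schedule, command in cur.fetchall():
            print(jobid, schedule, command)

run_retention_now("dbname=analytics user=etl")  # DSN is an assumption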

Delta Lake Lifecycle

Vacuum and Optimize

# Delta Lake lifecycle management
from pyspark.sql import SparkSession
from delta.tables import DeltaTable

spark = SparkSession.builder \
    .appName("DeltaLifecycle") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

def delta_lifecycle(
    table_path: str,
    vacuum_hours: int = 168,       # 7 days (Delta's default retention)
    zorder_columns: list = None
):
    """Apply Delta Lake lifecycle policies"""
    delta_table = DeltaTable.forPath(spark, table_path)

    # Vacuum: remove data files no longer referenced by the table
    delta_table.vacuum(vacuum_hours)

    # Optimize: compact small files, Z-Ordering by the given columns if specified
    if zorder_columns:
        delta_table.optimize().executeZOrderBy(*zorder_columns)
    else:
        delta_table.optimize().executeCompaction()

    print(f"Lifecycle management complete for {table_path}")

# Example usage
delta_lifecycle(
    table_path="s3://bucket/delta/sales/",
    vacuum_hours=168,              # Keep 7 days of history
    zorder_columns=['order_date', 'customer_id']
)
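
Vacuum only removes files older than the table's retention window, so it is worth reviewing the table history and retention settings before tightening that window. A short sketch against the example table above; the property values shown are illustrative assumptions to tune per table.

# Inspect recent operations and set retention windows (illustrative sketch).
from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "s3://bucket/delta/sales/")

# Last few operations (writes, OPTIMIZE, VACUUM) recorded in the Delta log
delta_table.history(5).select("version", "timestamp", "operation").show()

# Retention-related table properties (example values, adjust per table)
spark.sql("""
    ALTER TABLE delta.`s3://bucket/delta/sales/`
    SET TBLPROPERTIES (
        'delta.deletedFileRetentionDuration' = 'interval 7 days',
        'delta.logRetentionDuration' = 'interval 30 days'
    )
""")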

Tiering Strategy

Access-Based Tiering

Tiering Implementation

# Automated tiering based on access
from datetime import datetime
import boto3

class TieringManager:
    """Automated data tiering"""

    def __init__(self, bucket_name: str):
        self.bucket_name = bucket_name
        self.s3_client = boto3.client('s3')

    def classify_data(self, access_logs: list) -> dict:
        """Classify data into tiers based on access"""
        # Aggregate access statistics per object key
        data_classification = {}
        for log in access_logs:
            key = log['key']
            if key not in data_classification:
                data_classification[key] = {
                    'access_count': 0,
                    'last_accessed': log['timestamp']
                }
            data_classification[key]['access_count'] += 1
            data_classification[key]['last_accessed'] = max(
                data_classification[key]['last_accessed'],
                log['timestamp']
            )
        # Classify into tiers
        tier_classification = {}
        for key, stats in data_classification.items():
            days_since_access = (datetime.now() - stats['last_accessed']).days
            access_frequency = stats['access_count'] / 30  # Accesses per day, assuming a 30-day log window
            if access_frequency > 1:
                tier = 'hot'
            elif access_frequency > 0.1:  # 3+ times per month
                tier = 'warm'
            else:
                tier = 'cold'
            tier_classification[key] = {
                'tier': tier,
                'access_frequency': access_frequency,
                'days_since_access': days_since_access
            }
        return tier_classification

    def move_to_tier(self, key: str, tier: str):
        """Move data to the appropriate tier"""
        if tier == 'hot':
            storage_class = 'STANDARD'
        elif tier == 'warm':
            storage_class = 'STANDARD_IA'
        elif tier == 'cold':
            storage_class = 'GLACIER'
        else:
            raise ValueError(f"Unknown tier: {tier}")
        # Copy the object onto itself with the new storage class
        self.s3_client.copy_object(
            CopySource={'Bucket': self.bucket_name, 'Key': key},
            Bucket=self.bucket_name,
            Key=key,
            StorageClass=storage_class
        )
        print(f"Moved {key} to {tier} tier ({storage_class})")

Lifecycle Best Practices

DO

# 1. Define clear tier boundaries
# Age + access frequency
# 2. Automate lifecycle policies
# No manual intervention
# 3. Monitor tier effectiveness
# Track access patterns
# 4. Test data recovery
# Ensure archives can be restored
# 5. Document retention policies
# Compliance requirements

DON’T

# 1. Don't delete data without backup
# Ensure recoverability
# 2. Don't ignore compliance requirements
# Legal retention requirements
# 3. Don't set too short retention
# Data loss risk
# 4. Don't forget egress fees
# Retrieval costs from archive
# 5. Don't ignore access patterns
# Review tier effectiveness

Key Takeaways

  1. Hot tier: 0-30 days, frequent access, high-performance
  2. Warm tier: 30-90 days, occasional access, standard storage
  3. Cold tier: 90+ days, rare access, archive storage
  4. S3 lifecycle: Automated tiering with policies
  5. Intelligent tiering: Automatic optimization
  6. Vacuum: Remove old files (Delta Lake)
  7. Retention policies: Delete/archive based on age
  8. Use When: All data platforms, cost optimization
