Skip to content

Data Vault 2.0

Agile Data Warehouse Modeling


Overview

Data Vault 2.0 is a data modeling methodology designed for enterprise data warehousing that provides agility, auditability, and scalability. It combines the best of dimensional modeling (Kimball) and enterprise modeling (Inmon) with patterns for Hub, Link, and Satellite entities.


Data Vault Architecture

Core Concepts

Entity Types:

  • Hub: Core business concepts (Customer, Product, Order)
  • Link: Relationships between hubs (Customer-Order, Order-Item)
  • Satellite: Descriptive attributes attached to hubs/links

Hubs

Hub Design

-- Hub: Core business entity with business keys
-- Purpose: Store unique business keys with integration logic
CREATE TABLE hub_customer (
hub_customer_key BIGINT IDENTITY(1,1) PRIMARY KEY,
customer_key VARCHAR(255) NOT NULL, -- Business key
customer_source_key VARCHAR(255), -- Source system key
load_date TIMESTAMP NOT NULL, -- When loaded
record_source VARCHAR(100) NOT NULL, -- Source system
CONSTRAINT uk_hub_customer_key UNIQUE (customer_key, customer_source_key)
);
CREATE INDEX idx_hub_customer_load_date ON hub_customer(load_date);
CREATE INDEX idx_hub_customer_key ON hub_customer(customer_key);
-- Benefits:
-- - Single source of truth for customer
-- - All customer business keys (from multiple sources)
-- - Integration logic for matching customers
-- - Audit trail (load_date, record_source)
-- Best Practices:
-- 1. One hub per core business concept
-- 2. Hash business keys for consistency
-- 3. Include source system tracking
-- 4. Always include load_date and record_source

Hub with Multiple Sources

-- Customer data from multiple source systems
-- Sources: E-commerce (web), CRM (sales), ERP (finance)
-- Hash business keys (e.g., email, customer_id)
-- MD5 or SHA256 hash of business key
-- Example: SHA256(customer_id + source_system) AS customer_key
INSERT INTO hub_customer (customer_key, customer_source_key, load_date, record_source)
SELECT
SHA2(CONCAT('ECOMMERCE', customer_id), 256) AS customer_key,
customer_id AS customer_source_key,
CURRENT_TIMESTAMP AS load_date,
'ECOMMERCE' AS record_source
FROM ecommerce.customers;
INSERT INTO hub_customer (customer_key, customer_source_key, load_date, record_source)
SELECT
SHA2(CONCAT('CRM', account_id), 256) AS customer_key,
account_id AS customer_source_key,
CURRENT_TIMESTAMP AS load_date,
'CRM' AS record_source
FROM crm.accounts;
-- Integration logic: Match customers across sources
-- - Same email = same customer
-- - Same phone = same customer
-- - Same SSN (if applicable) = same customer

-- Link: Relationships between hubs
-- Purpose: Store relationships between business entities
CREATE TABLE link_customer_order (
link_customer_order_key BIGINT IDENTITY(1,1) PRIMARY KEY,
hub_customer_key BIGINT NOT NULL,
hub_order_key BIGINT NOT NULL,
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL,
CONSTRAINT fk_link_customer_customer FOREIGN KEY (hub_customer_key) REFERENCES hub_customer(hub_customer_key),
CONSTRAINT fk_link_customer_order FOREIGN KEY (hub_order_key) REFERENCES hub_order(hub_order_key),
CONSTRAINT uk_link_customer_order UNIQUE (hub_customer_key, hub_order_key, load_date)
);
CREATE INDEX idx_link_customer_order_load_date ON link_customer_order(load_date);
CREATE INDEX idx_link_customer_order_customer ON link_customer_order(hub_customer_key);
CREATE INDEX idx_link_customer_order_order ON link_customer_order(hub_order_key);
-- Multi-active link: Multiple relationships between same entities
-- Example: Customer can have multiple addresses
CREATE TABLE link_customer_address (
link_customer_address_key BIGINT IDENTITY(1,1) PRIMARY KEY,
hub_customer_key BIGINT NOT NULL,
hub_address_key BIGINT NOT NULL,
is_current BOOLEAN DEFAULT TRUE, -- Current address
is_deleted BOOLEAN DEFAULT FALSE, -- Soft delete
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL,
CONSTRAINT fk_link_customer_customer FOREIGN KEY (hub_customer_key) REFERENCES hub_customer(hub_customer_key),
CONSTRAINT fk_link_customer_address FOREIGN KEY (hub_address_key) REFERENCES hub_address(hub_address_key)
);
-- Benefits:
-- - Capture all relationships
-- - Multi-active (multiple relationships between same entities)
-- - Historized (track relationship changes over time)
-- - No foreign key constraints in staging (better performance)
-- Link with relationship attributes
-- Example: Order-Item with quantity, unit price
CREATE TABLE link_order_item (
link_order_item_key BIGINT IDENTITY(1,1) PRIMARY KEY,
hub_order_key BIGINT NOT NULL,
hub_product_key BIGINT NOT NULL,
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL,
CONSTRAINT fk_link_order_order FOREIGN KEY (hub_order_key) REFERENCES hub_order(hub_order_key),
CONSTRAINT fk_link_order_product FOREIGN KEY (hub_product_key) REFERENCES hub_product(hub_product_key)
);
-- Satellite for link attributes
CREATE TABLE sat_order_item_details (
sat_order_item_details_key BIGINT IDENTITY(1,1) PRIMARY KEY,
link_order_item_key BIGINT NOT NULL,
quantity INT NOT NULL,
unit_price DECIMAL(18,2) NOT NULL,
discount_percent DECIMAL(5,2) DEFAULT 0,
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL,
CONSTRAINT fk_sat_order_item_link FOREIGN KEY (link_order_item_key) REFERENCES link_order_item(link_order_item_key)
);
CREATE INDEX idx_sat_order_item_link ON sat_order_item_details(link_order_item_key);
CREATE INDEX idx_sat_order_item_load_date ON sat_order_item_details(load_date);

Satellites

Satellite Design

-- Satellite: Descriptive attributes for hubs/links
-- Purpose: Store all descriptive attributes, historized
CREATE TABLE sat_customer_details (
sat_customer_details_key BIGINT IDENTITY(1,1) PRIMARY KEY,
hub_customer_key BIGINT NOT NULL,
customer_name VARCHAR(255),
customer_email VARCHAR(255),
customer_phone VARCHAR(50),
customer_segment VARCHAR(50),
customer_tier VARCHAR(20),
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL,
CONSTRAINT fk_sat_customer_details_hub FOREIGN KEY (hub_customer_key) REFERENCES hub_customer(hub_customer_key)
);
CREATE INDEX idx_sat_customer_details_hub ON sat_customer_details(hub_customer_key);
CREATE INDEX idx_sat_customer_details_load_date ON sat_customer_details(load_date);
-- Benefits:
-- - All attributes in one place (no hunting)
-- - Historized (see attribute changes over time)
-- - Source system tracking (record_source)
-- - No updates, only inserts (append-only)
-- Loading pattern:
INSERT INTO sat_customer_details (hub_customer_key, customer_name, customer_email, load_date, record_source)
SELECT
h.hub_customer_key,
c.customer_name,
c.customer_email,
CURRENT_TIMESTAMP AS load_date,
'ECOMMERCE' AS record_source
FROM hub_customer h
JOIN ecommerce.customers c ON h.customer_source_key = c.customer_id
WHERE h.record_source = 'ECOMMERCE'
AND NOT EXISTS (
-- Only insert if changed (CDC pattern)
SELECT 1
FROM sat_customer_details s
WHERE s.hub_customer_key = h.hub_customer_key
AND s.load_date >= CURRENT_TIMESTAMP - INTERVAL '1 hour'
AND s.customer_name = c.customer_name
AND s.customer_email = c.customer_email
);

Satellite Types

-- 1. Descriptive satellite: Business attributes
CREATE TABLE sat_customer_demographics (
sat_customer_demographics_key BIGINT IDENTITY(1,1) PRIMARY KEY,
hub_customer_key BIGINT NOT NULL,
age INT,
gender VARCHAR(10),
income_bracket VARCHAR(20),
occupation VARCHAR(100),
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL
);
-- 2. Status satellite: Status attributes (frequently changing)
CREATE TABLE sat_customer_status (
sat_customer_status_key BIGINT IDENTITY(1,1) PRIMARY KEY,
hub_customer_key BIGINT NOT NULL,
is_active BOOLEAN DEFAULT TRUE,
customer_status VARCHAR(50) DEFAULT 'ACTIVE',
churn_probability DECIMAL(3,2),
loyalty_points INT DEFAULT 0,
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL
);
-- 3. Multi-active satellite: Multiple records per hub key
CREATE TABLE sat_customer_addresses (
sat_customer_addresses_key BIGINT IDENTITY(1,1) PRIMARY KEY,
hub_customer_key BIGINT NOT NULL,
address_type VARCHAR(20) NOT NULL, -- BILLING, SHIPPING, MAILING
address_line1 VARCHAR(255),
address_line2 VARCHAR(255),
city VARCHAR(100),
state VARCHAR(50),
postal_code VARCHAR(20),
country VARCHAR(100),
is_primary BOOLEAN DEFAULT FALSE,
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL
);

Data Vault Loading Patterns

Hub Loading

-- Stage data from source systems
CREATE TABLE stg_customers AS
SELECT
customer_id,
customer_name,
customer_email,
customer_phone,
'ECOMMERCE' AS record_source,
CURRENT_TIMESTAMP AS load_date
FROM ecommerce.customers;
-- Load hub (deduplicated)
INSERT INTO hub_customer (customer_key, customer_source_key, load_date, record_source)
SELECT
SHA2(CONCAT('ECOMMERCE', customer_id), 256) AS customer_key,
customer_id AS customer_source_key,
load_date,
record_source
FROM (
SELECT
customer_id,
load_date,
record_source,
ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY load_date DESC) as rn
FROM stg_customers
) deduplicated
WHERE rn = 1;
-- Load link from source transactions
INSERT INTO link_customer_order (hub_customer_key, hub_order_key, load_date, record_source)
SELECT DISTINCT
SHA2(CONCAT('ECOMMERCE', c.customer_id), 256) AS hub_customer_key,
SHA2(CONCAT('ECOMMERCE', o.order_id), 256) AS hub_order_key,
CURRENT_TIMESTAMP AS load_date,
'ECOMMERCE' AS record_source
FROM ecommerce.orders o
JOIN ecommerce.customers c ON o.customer_id = c.customer_id;

Satellite Loading

-- Load satellite with CDC (Change Data Capture)
INSERT INTO sat_customer_details (hub_customer_key, customer_name, customer_email, load_date, record_source)
SELECT
h.hub_customer_key,
s.customer_name,
s.customer_email,
s.load_date,
s.record_source
FROM hub_customer h
JOIN stg_customers s ON SHA2(CONCAT('ECOMMERCE', s.customer_id), 256) = h.customer_key
WHERE h.record_source = 'ECOMMERCE'
AND NOT EXISTS (
-- Only insert if changed (delta detection)
SELECT 1
FROM sat_customer_details prev
WHERE prev.hub_customer_key = h.hub_customer_key
AND prev.load_date >= CURRENT_TIMESTAMP - INTERVAL '1 day'
AND prev.customer_name = s.customer_name
AND prev.customer_email = s.customer_email
);

Data Vault Patterns

Driving Key

-- Driving key: Hash of natural keys + source system
-- Used for consistent identification across systems
-- Example: Customer driving key
CREATE TABLE hub_customer (
hub_customer_key BIGINT IDENTITY(1,1) PRIMARY KEY,
customer_key VARCHAR(255) NOT NULL, -- Driving key
customer_source_key VARCHAR(255),
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL
);
-- Driving key format:
-- SHA2(source_system + business_key_1 + business_key_2, 256)
-- Example: SHA2('ECOMMERCE' + 'customer_123' + 'email@example.com', 256)

Sequence Numbers

-- Sequence: Auto-increment for ordering records within satellite
CREATE TABLE sat_customer_details (
sat_customer_details_key BIGINT IDENTITY(1,1) PRIMARY KEY,
hub_customer_key BIGINT NOT NULL,
customer_name VARCHAR(255),
customer_email VARCHAR(255),
load_date TIMESTAMP NOT NULL,
load_end_date TIMESTAMP, -- NULL = current record
record_source VARCHAR(100) NOT NULL,
sequence_number INT NOT NULL, -- For ordering
CONSTRAINT uk_sat_customer_details UNIQUE (hub_customer_key, sequence_number)
);
-- Insert with sequence number
INSERT INTO sat_customer_details (hub_customer_key, customer_name, customer_email, load_date, record_source, sequence_number)
SELECT
h.hub_customer_key,
c.customer_name,
c.customer_email,
CURRENT_TIMESTAMP AS load_date,
'ECOMMERCE' AS record_source,
ROW_NUMBER() OVER (PARTITION BY h.hub_customer_key ORDER BY CURRENT_TIMESTAMP) AS sequence_number
FROM hub_customer h
JOIN ecommerce.customers c ON h.customer_source_key = c.customer_id;

Effectivity Satellites

-- Effectivity satellite: Track when records are effective
CREATE TABLE sat_customer_effectivity (
sat_customer_effectivity_key BIGINT IDENTITY(1,1) PRIMARY KEY,
hub_customer_key BIGINT NOT NULL,
effective_from DATE NOT NULL, -- When record became effective
effective_to DATE, -- NULL = currently effective
is_current BOOLEAN DEFAULT TRUE,
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL
);
-- Query current records
SELECT
h.hub_customer_key,
s.customer_name,
s.effective_from,
s.effective_to
FROM hub_customer h
JOIN sat_customer_effectivity e ON h.hub_customer_key = e.hub_customer_key
JOIN sat_customer_details s ON h.hub_customer_key = s.hub_customer_key
WHERE e.is_current = TRUE;

Data Vault vs. Kimball

DimensionData Vault 2.0Kimball
ApproachEntity-relationshipDimensional modeling
EntitiesHub, Link, SatelliteFact, Dimension
AgilityVery highMedium
Audit trailExcellentLimited
PerformanceGood (with vault)Excellent (star schema)
ComplexityHigherLower
Best forEnterprise, agile, auditBusiness users, performance

Bridge Tables

Info Bridge

-- Info Bridge: Derived from Data Vault for dimensional modeling
-- Purpose: Performance-optimized layer for BI tools
-- Star schema from Data Vault
CREATE TABLE dim_customer AS
SELECT DISTINCT
h.hub_customer_key AS customer_key,
s.customer_name,
s.customer_email,
s.customer_phone,
s.customer_segment,
s.customer_tier
FROM hub_customer h
JOIN sat_customer_details s ON h.hub_customer_key = s.hub_customer_key
WHERE s.load_end_date IS NULL; -- Current record
CREATE TABLE fact_sales AS
SELECT
h.hub_order_key AS order_key,
h.hub_product_key AS product_key,
h.hub_customer_key AS customer_key,
s.quantity,
s.unit_price,
s.discount_percent,
o.order_date
FROM hub_order h
JOIN link_order_item l ON h.hub_order_key = l.hub_order_key
JOIN link_customer_order co ON h.hub_customer_key = co.hub_customer_key
JOIN sat_order_item_details s ON l.link_order_item_key = s.link_order_item_key
JOIN sat_order_details o ON h.hub_order_key = o.hub_order_key
WHERE s.load_end_date IS NULL
AND o.load_end_date IS NULL;

Data Vault Best Practices

DO

-- 1. Use consistent naming conventions
-- HUB_<entity>, LINK_<entity>_<entity>, SAT_<entity>_<attribute>
-- 2. Always include load_date and record_source
load_date TIMESTAMP NOT NULL,
record_source VARCHAR(100) NOT NULL
-- 3. Use hash keys for business keys
SHA2(CONCAT(source_system, business_key), 256) AS customer_key
-- 4. Create indexes on foreign keys and load_date
CREATE INDEX idx_sat_hub_key ON sat_customer_details(hub_customer_key);
CREATE INDEX idx_sat_load_date ON sat_customer_details(load_date);
-- 5. Use append-only pattern (no updates)
-- Always insert new records, never update

DON’T

-- 1. Don't use natural keys as primary keys
-- Use surrogate keys (IDENTITY)
-- 2. Don't update existing records
-- Insert new records for changes
-- 3. Don't ignore audit trail
-- Always track load_date and record_source
-- 4. Don't mix data from different sources in same satellite
-- Separate satellites per source system
-- 5. Don't create indexes on every column
-- Index strategically (foreign keys, load_date)

Key Takeaways

  1. Hubs: Core business entities with business keys
  2. Links: Relationships between hubs (multi-active)
  3. Satellites: Descriptive attributes (historized)
  4. Agility: Easy to adapt to changing requirements
  5. Audit trail: Complete history of all changes
  6. Integration: Multiple source systems, single source of truth
  7. Performance: Use Info Bridge for BI queries
  8. Use When: Enterprise data warehouse, agile requirements, audit trail needed

Back to Module 4