
Prompt Engineering Pipelines

Production Prompt Management for LLM Systems


Overview

Prompt engineering pipelines provide systematic, version-controlled, and tested prompt management for production LLM systems. They enable prompt versioning, A/B testing, automated evaluation, and deployment strategies.


Prompt Pipeline Architecture

Pipeline Components

Key Components (see the wiring sketch after this list):

  • Prompt Registry: Version-controlled prompt storage
  • Prompt Templates: Parameterized, reusable prompts
  • Testing Framework: Automated prompt evaluation
  • CI/CD Pipeline: Automated testing and deployment
  • LLM Gateway: Unified API for multiple LLM providers
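
A minimal wiring sketch of how these components fit together on the request path. The PromptPipeline and LLMGateway names and the gateway's complete() method are illustrative assumptions; PromptRegistry and PromptMonitor are defined later on this page.

# Minimal wiring sketch (class and method names are illustrative)
from dataclasses import dataclass


@dataclass
class PromptPipeline:
    registry: "PromptRegistry"   # versioned prompt storage (defined below)
    gateway: "LLMGateway"        # hypothetical unified client over LLM providers
    monitor: "PromptMonitor"     # invocation logging (defined below)

    def run(self, prompt_name: str, parameters: dict) -> str:
        rendered = self.registry.render_prompt(prompt_name, parameters)
        response = self.gateway.complete(rendered)   # hypothetical gateway call
        self.monitor.log_invocation(
            prompt_name=prompt_name,
            prompt_version=self.registry.get_prompt(prompt_name).version,
            parameters=parameters,
            response=response,
            tokens_used=0,    # populate from the gateway's usage data
            latency_ms=0.0,   # populate from request timing
        )
        return response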

Prompt Registry

Prompt Storage

# Prompt registry with versioning
import json
import os
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, List, Optional


@dataclass
class Prompt:
    """A single prompt version."""
    id: str
    name: str
    version: str
    template: str
    parameters: List[str]
    metadata: Dict
    created_at: datetime
    created_by: str


class PromptRegistry:
    """Centralized prompt registry."""

    def __init__(self, storage_path: str = "prompts/"):
        self.storage_path = storage_path
        self.prompts: Dict[str, List[Prompt]] = {}

    def register_prompt(
        self,
        name: str,
        template: str,
        parameters: List[str],
        metadata: Dict,
        created_by: str
    ) -> Prompt:
        """Register a new prompt version."""
        # Generate the next version number
        versions = self.prompts.get(name, [])
        version = f"v{len(versions) + 1}"

        # Create the prompt record
        prompt = Prompt(
            id=f"{name}:{version}",
            name=name,
            version=version,
            template=template,
            parameters=parameters,
            metadata=metadata,
            created_at=datetime.now(),
            created_by=created_by
        )

        # Store in memory
        self.prompts.setdefault(name, []).append(prompt)

        # Persist to file
        self._save_prompt(prompt)
        return prompt

    def get_prompt(self, name: str, version: Optional[str] = None) -> Prompt:
        """Get a prompt by name and (optionally) version."""
        versions = self.prompts.get(name, [])
        if not versions:
            raise ValueError(f"Prompt {name} not found")
        if version:
            for prompt in versions:
                if prompt.version == version:
                    return prompt
            raise ValueError(f"Version {version} not found")
        # Return the latest version
        return versions[-1]

    def render_prompt(self, name: str, parameters: Dict) -> str:
        """Render a prompt with parameters."""
        prompt = self.get_prompt(name)

        # Validate parameters
        missing_params = set(prompt.parameters) - set(parameters.keys())
        if missing_params:
            raise ValueError(f"Missing parameters: {missing_params}")

        # Render the template
        return prompt.template.format(**parameters)

    def _save_prompt(self, prompt: Prompt):
        """Save a prompt version to disk."""
        prompt_dir = os.path.join(self.storage_path, prompt.name)
        os.makedirs(prompt_dir, exist_ok=True)
        file_path = os.path.join(prompt_dir, f"{prompt.version}.json")
        with open(file_path, 'w') as f:
            json.dump({
                'id': prompt.id,
                'name': prompt.name,
                'version': prompt.version,
                'template': prompt.template,
                'parameters': prompt.parameters,
                'metadata': prompt.metadata,
                'created_at': prompt.created_at.isoformat(),
                'created_by': prompt.created_by
            }, f, indent=2)


# Example usage
registry = PromptRegistry()

# Register a RAG prompt
rag_prompt = registry.register_prompt(
    name="rag_query",
    template="""
You are a helpful assistant. Answer the following question based on the context provided.
Context:
{context}
Question: {query}
Answer:
""".strip(),
    parameters=["context", "query"],
    metadata={
        "description": "RAG query prompt",
        "model": "gpt-4",
        "max_tokens": 500,
        "temperature": 0.7
    },
    created_by="data-engineering@my-company.com"
)

# Render the prompt
rendered = registry.render_prompt(
    "rag_query",
    parameters={
        "context": "Machine learning is a subset of AI.",
        "query": "What is machine learning?"
    }
)
print(rendered)

Prompt Templates

Template Types

# Prompt template patterns
from jinja2 import Template as JinjaTemplate

# 1. String format templates (simple, rendered with str.format)
simple_template = """
Context: {context}
Question: {query}
Answer:
"""

# 2. Jinja2 templates (advanced: conditionals and loops)
jinja_template = JinjaTemplate("""
You are a {{ role }} assistant.
{% if context %}
Context:
{{ context }}
{% endif %}
Question: {{ query }}
{% if examples %}
Examples:
{% for example in examples %}
Q: {{ example.question }}
A: {{ example.answer }}
{% endfor %}
{% endif %}
Answer:
""")

# 3. Few-shot template
few_shot_template = """
You are a sentiment classifier. Classify the sentiment of the following text as Positive, Negative, or Neutral.
Examples:
Text: "I love this product!"
Sentiment: Positive
Text: "This is terrible."
Sentiment: Negative
Text: "It's okay."
Sentiment: Neutral
Text: "{{ text }}"
Sentiment:
"""

# 4. Chain-of-thought template
cot_template = """
You are a reasoning assistant. Think step-by-step to answer the question.
Question: {{ question }}
Let's think step-by-step:
1.
2.
3.
Answer:
"""

# 5. Structured output template
structured_template = """
You are a data extractor. Extract the following fields from the text:
- Name
- Email
- Phone
Text: {{ text }}
Output format: JSON
"""

Parameter Validation

# Parameter validation for prompts
from pydantic import BaseModel, validator


class PromptParameters(BaseModel):
    """Prompt parameters with validation."""
    context: str
    query: str
    max_tokens: int = 500
    temperature: float = 0.7

    @validator('context')
    def context_not_empty(cls, v):
        if not v or len(v.strip()) == 0:
            raise ValueError('Context cannot be empty')
        return v

    @validator('query')
    def query_not_empty(cls, v):
        if not v or len(v.strip()) == 0:
            raise ValueError('Query cannot be empty')
        return v

    @validator('max_tokens')
    def max_tokens_valid(cls, v):
        if v < 1 or v > 4096:
            raise ValueError('max_tokens must be between 1 and 4096')
        return v

    @validator('temperature')
    def temperature_valid(cls, v):
        if v < 0 or v > 2:
            raise ValueError('temperature must be between 0 and 2')
        return v


# Usage
try:
    params = PromptParameters(
        context="Machine learning is a subset of AI.",
        query="What is machine learning?",
        max_tokens=1000,
        temperature=0.7
    )
    print("Parameters valid:", params.dict())
except ValueError as e:
    print("Validation error:", e)

Prompt Testing

Unit Tests

# Prompt testing framework
from typing import Dict, List

from openai import OpenAI


class PromptTester:
    """Test prompt templates."""

    def __init__(self, llm_client: OpenAI):
        self.llm = llm_client

    def test_prompt_rendering(
        self,
        template: str,
        parameters: Dict,
        expected_contains: List[str]
    ) -> bool:
        """Test that the prompt renders correctly."""
        try:
            rendered = template.format(**parameters)
            for expected in expected_contains:
                assert expected in rendered, f"Expected '{expected}' in rendered prompt"
            return True
        except Exception as e:
            print(f"Render test failed: {e}")
            return False

    def test_prompt_length(
        self,
        rendered: str,
        max_length: int = 4000
    ) -> bool:
        """Test that the prompt token count is within limits."""
        import tiktoken
        enc = tiktoken.encoding_for_model("gpt-4")
        tokens = enc.encode(rendered)
        if len(tokens) > max_length:
            print(f"Prompt too long: {len(tokens)} > {max_length}")
            return False
        return True

    def test_prompt_output(
        self,
        rendered: str,
        expected_output_type: str = "str"
    ) -> bool:
        """Test that the prompt produces the expected output type."""
        response = self.llm.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": rendered}],
            max_tokens=100
        )
        output = response.choices[0].message.content
        if expected_output_type == "json":
            import json
            try:
                json.loads(output)
                return True
            except (json.JSONDecodeError, TypeError):
                return False
        return isinstance(output, str)

    def test_prompt_consistency(
        self,
        rendered: str,
        num_runs: int = 3,
        temperature: float = 0.0
    ) -> bool:
        """Test that the prompt produces consistent outputs at temperature 0."""
        outputs = []
        for _ in range(num_runs):
            response = self.llm.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": rendered}],
                temperature=temperature
            )
            outputs.append(response.choices[0].message.content)
        # All outputs should be identical
        return len(set(outputs)) == 1


# Example usage
tester = PromptTester(llm_client=OpenAI())

# Test rendering
assert tester.test_prompt_rendering(
    template="Context: {context}\nQuery: {query}",
    parameters={"context": "Test", "query": "Test query"},
    expected_contains=["Context:", "Query:"]
)

# Test length
assert tester.test_prompt_length(
    rendered="Context: Test\nQuery: Test query",
    max_length=100
)

# Test output
assert tester.test_prompt_output(
    rendered="Say 'Hello, World!'",
    expected_output_type="str"
)

Evaluation Metrics

# Prompt evaluation metrics
from openai import OpenAI


class PromptEvaluator:
    """Evaluate prompt performance."""

    def __init__(self, llm_client: OpenAI):
        self.llm = llm_client

    def evaluate_relevance(
        self,
        prompt: str,
        response: str,
        reference: str
    ) -> float:
        """Evaluate response relevance with an LLM judge."""
        eval_prompt = f"""
Rate the relevance of the following response to the prompt.
Prompt: {prompt}
Reference answer: {reference}
Response: {response}
Rate from 1 (not relevant) to 5 (highly relevant).
Output only the number.
"""
        eval_response = self.llm.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": eval_prompt}],
            temperature=0
        )
        score = float(eval_response.choices[0].message.content.strip())
        return score / 5  # Normalize to 0-1

    def evaluate_conciseness(self, response: str) -> float:
        """Evaluate response conciseness (ideal length: 50-200 words)."""
        words = response.split()
        if 50 <= len(words) <= 200:
            return 1.0
        elif len(words) < 50:
            return len(words) / 50
        else:
            return max(0, 1 - (len(words) - 200) / 500)

    def evaluate_safety(self, response: str) -> float:
        """Evaluate response safety (no harmful content) via the moderation API."""
        moderation = self.llm.moderations.create(input=response)
        # Safe only if the moderation result is not flagged
        return 0.0 if moderation.results[0].flagged else 1.0

    def evaluate_token_efficiency(
        self,
        prompt: str,
        response: str
    ) -> float:
        """Evaluate token efficiency (output/input ratio, ideal: 0.5-2.0)."""
        import tiktoken
        enc = tiktoken.encoding_for_model("gpt-4")
        input_tokens = len(enc.encode(prompt))
        output_tokens = len(enc.encode(response))
        ratio = output_tokens / input_tokens
        if 0.5 <= ratio <= 2.0:
            return 1.0
        else:
            return max(0, 1 - abs(ratio - 1.0))
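
A brief usage sketch combining these metrics into one weighted score; the weights and example strings are illustrative.

evaluator = PromptEvaluator(llm_client=OpenAI())

prompt = "What is machine learning?"
response = "Machine learning is a subset of AI that learns patterns from data."
reference = "Machine learning is a branch of AI focused on learning from data."

scores = {
    "relevance": evaluator.evaluate_relevance(prompt, response, reference),
    "conciseness": evaluator.evaluate_conciseness(response),
    "safety": evaluator.evaluate_safety(response),
    "token_efficiency": evaluator.evaluate_token_efficiency(prompt, response),
}

# Weighted overall score (weights are illustrative)
weights = {"relevance": 0.5, "conciseness": 0.2, "safety": 0.2, "token_efficiency": 0.1}
overall = sum(weights[name] * score for name, score in scores.items())
print(scores, overall)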

Prompt Deployment

CI/CD Pipeline

# .github/workflows/prompt-ci-cd.yml
name: Prompt CI/CD

on:
  push:
    paths:
      - 'prompts/**'
      - 'tests/prompts/**'

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          pip install openai pytest
      - name: Run prompt tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          pytest tests/prompts/
      - name: Evaluate prompts
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          python scripts/evaluate_prompts.py

  deploy:
    needs: test
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    steps:
      - name: Deploy to production
        env:
          PROMPT_REGISTRY_API_KEY: ${{ secrets.PROMPT_REGISTRY_API_KEY }}
        run: |
          python scripts/deploy_prompts.py --env production
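
The workflow above calls scripts/evaluate_prompts.py. Below is a minimal sketch of such a quality-gate script, assuming the prompt files written by PromptRegistry are checked in under prompts/ and the PromptEvaluator class from the evaluation section is importable; the golden example and threshold are illustrative.

# scripts/evaluate_prompts.py -- illustrative CI quality gate
import json
import sys
from pathlib import Path

from openai import OpenAI

THRESHOLD = 0.7  # illustrative pass/fail threshold

# Hypothetical golden example for the rag_query prompt
GOLDEN = {
    "parameters": {
        "context": "Machine learning is a subset of AI.",
        "query": "What is machine learning?",
    },
    "reference": "Machine learning is a branch of AI that learns patterns from data.",
}


def main() -> int:
    client = OpenAI()
    evaluator = PromptEvaluator(llm_client=client)  # class from the evaluation section

    # Load the latest stored version of the prompt from the registry directory
    versions = sorted(
        Path("prompts/rag_query").glob("v*.json"),
        key=lambda p: int(p.stem[1:]),
    )
    template = json.loads(versions[-1].read_text())["template"]

    rendered = template.format(**GOLDEN["parameters"])
    answer = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": rendered}],
    ).choices[0].message.content

    score = evaluator.evaluate_relevance(rendered, answer, GOLDEN["reference"])
    print(f"Relevance score: {score:.2f}")
    return 0 if score >= THRESHOLD else 1  # non-zero exit fails the CI job


if __name__ == "__main__":
    sys.exit(main())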

Blue-Green Deployment

# Blue-green deployment for prompts
import random


class PromptDeployment:
    """Prompt deployment manager."""

    def __init__(self, registry: PromptRegistry):
        self.registry = registry

    def deploy_blue_green(
        self,
        prompt_name: str,
        new_version: str,
        traffic_percentage: float = 0.5
    ):
        """Deploy a new version with a blue-green strategy."""
        # Get both versions
        blue_version = self.registry.get_prompt(prompt_name)                # Current
        green_version = self.registry.get_prompt(prompt_name, new_version)  # New

        # Route traffic between the two versions
        def get_prompt_version():
            if random.random() < traffic_percentage:
                return green_version
            return blue_version

        # Monitor metrics for both versions:
        # - Response quality
        # - Token usage
        # - Latency
        # - Error rate
        # Gradually shift traffic to green.
        # If metrics are good, shift to 100% green.
        # If metrics are bad, roll back to blue.
        return get_prompt_version

    def canary_deployment(
        self,
        prompt_name: str,
        new_version: str,
        canary_percentage: float = 0.1
    ):
        """Deploy a new version with a canary strategy."""
        # Start with ~10% of traffic on the new version, monitor metrics,
        # and gradually increase to 100% if they hold.
        return self.deploy_blue_green(prompt_name, new_version, canary_percentage)
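
A minimal sketch of the gradual traffic shift described in the comments above, assuming a hypothetical get_error_rate(version) callback supplied by your monitoring stack.

import time


def progressive_rollout(
    deployment: PromptDeployment,
    prompt_name: str,
    new_version: str,
    get_error_rate,              # hypothetical: returns the error rate for a version
    error_threshold: float = 0.05,
    step: float = 0.1,
    wait_seconds: int = 300,
) -> bool:
    """Shift traffic to the new version in steps, rolling back on bad metrics."""
    traffic = step
    while traffic <= 1.0:
        deployment.deploy_blue_green(prompt_name, new_version, traffic_percentage=traffic)
        time.sleep(wait_seconds)  # let metrics accumulate at this traffic level
        if get_error_rate(new_version) > error_threshold:
            # Roll back: send all traffic to the current (blue) version
            deployment.deploy_blue_green(prompt_name, new_version, traffic_percentage=0.0)
            return False
        traffic = round(traffic + step, 2)
    return True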

Prompt Monitoring

Metrics Tracking

# Prompt monitoring
import json
import os
import statistics
from datetime import datetime
from typing import Dict


class PromptMonitor:
    """Monitor prompt performance."""

    def __init__(self, storage_path: str = "metrics/"):
        self.storage_path = storage_path

    def log_invocation(
        self,
        prompt_name: str,
        prompt_version: str,
        parameters: Dict,
        response: str,
        tokens_used: int,
        latency_ms: float,
        error: str = None
    ):
        """Log a single prompt invocation."""
        log_entry = {
            'timestamp': datetime.now().isoformat(),
            'prompt_name': prompt_name,
            'prompt_version': prompt_version,
            'parameters': parameters,
            'response_length': len(response),
            'tokens_used': tokens_used,
            'latency_ms': latency_ms,
            'error': error
        }

        # Append to a JSON Lines file per prompt
        os.makedirs(self.storage_path, exist_ok=True)
        file_path = f"{self.storage_path}/{prompt_name}.jsonl"
        with open(file_path, 'a') as f:
            f.write(json.dumps(log_entry) + '\n')

    def get_metrics(
        self,
        prompt_name: str,
        version: str = None
    ) -> Dict:
        """Get aggregated metrics for a prompt (optionally a single version)."""
        # Read logs
        logs = []
        file_path = f"{self.storage_path}/{prompt_name}.jsonl"
        try:
            with open(file_path, 'r') as f:
                for line in f:
                    log = json.loads(line)
                    if version is None or log['prompt_version'] == version:
                        logs.append(log)
        except FileNotFoundError:
            return {}

        if not logs:
            return {}

        # Calculate metrics
        latencies = sorted(log['latency_ms'] for log in logs)
        metrics = {
            'invocations': len(logs),
            'avg_tokens': statistics.mean(log['tokens_used'] for log in logs),
            'p50_latency': statistics.median(latencies),
            'p95_latency': latencies[min(int(len(logs) * 0.95), len(logs) - 1)],
            'p99_latency': latencies[min(int(len(logs) * 0.99), len(logs) - 1)],
            'error_rate': sum(1 for log in logs if log['error']) / len(logs)
        }
        return metrics
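
A brief usage sketch wiring the monitor around a single LLM call; the prompt text and version label are illustrative.

import time

from openai import OpenAI

client = OpenAI()
monitor = PromptMonitor()

prompt_text = "Context: Machine learning is a subset of AI.\nQuestion: What is machine learning?\nAnswer:"

start = time.perf_counter()
error = None
try:
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt_text}],
    )
    response_text = completion.choices[0].message.content
    tokens_used = completion.usage.total_tokens
except Exception as e:
    response_text, tokens_used, error = "", 0, str(e)

monitor.log_invocation(
    prompt_name="rag_query",
    prompt_version="v1",
    parameters={"query": "What is machine learning?"},
    response=response_text,
    tokens_used=tokens_used,
    latency_ms=(time.perf_counter() - start) * 1000,
    error=error,
)
print(monitor.get_metrics("rag_query", version="v1"))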

Prompt Best Practices

DO

  • Use version control: keep prompt templates in Git
  • Test prompts: unit tests plus evaluation metrics
  • Use templates: Jinja2 for complex prompts
  • Monitor performance: track token usage, latency, and quality
  • A/B test: compare prompt versions

DON’T

  • Don't hardcode prompts: use templates and a registry
  • Don't skip testing: it is essential for production
  • Don't ignore token limits: respect context window constraints
  • Don't forget versioning: track every prompt change
  • Don't ignore monitoring: it is essential for operations

Key Takeaways

  1. Prompt registry: Version-controlled prompt storage
  2. Prompt templates: Parameterized, reusable prompts
  3. Testing framework: Automated prompt evaluation
  4. CI/CD pipeline: Automated testing and deployment
  5. Blue-green deployment: Gradual rollout with rollback
  6. Monitoring: Token usage, latency, quality metrics
  7. A/B testing: Compare prompt versions
  8. Use when: production LLM systems that need systematic prompt management
