Prompt Engineering Pipelines
Production Prompt Management for LLM Systems
Overview
Prompt engineering pipelines bring systematic, version-controlled, and tested prompt management to production LLM systems. They enable prompt versioning, A/B testing, automated evaluation, and controlled deployment strategies.
Prompt Pipeline Architecture
Pipeline Components
Key Components:
- Prompt Registry: Version-controlled prompt storage
- Prompt Templates: Parameterized, reusable prompts
- Testing Framework: Automated prompt evaluation
- CI/CD Pipeline: Automated testing and deployment
- LLM Gateway: Unified API for multiple LLM providers (see the minimal gateway sketch after this list)
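The registry, templates, testing framework, and CI/CD pipeline are developed in the sections below; the LLM Gateway is not, so here is a minimal sketch of the idea. The LLMGateway class, its register_provider/complete methods, and the provider wiring are illustrative assumptions rather than any specific library's API.

# Minimal LLM gateway sketch (hypothetical; adapt to your providers)
from typing import Callable, Dict


class LLMGateway:
    """Route completion requests to different LLM providers behind one interface"""

    def __init__(self):
        # provider name -> handler taking (model, prompt) and returning the completion text
        self.providers: Dict[str, Callable[[str, str], str]] = {}

    def register_provider(self, name: str, handler: Callable[[str, str], str]):
        """Register a provider handler (e.g. OpenAI, a self-hosted model)"""
        self.providers[name] = handler

    def complete(self, provider: str, model: str, prompt: str) -> str:
        """Send a rendered prompt to the chosen provider"""
        if provider not in self.providers:
            raise ValueError(f"Unknown provider: {provider}")
        return self.providers[provider](model, prompt)


# Example wiring for OpenAI (assumes OPENAI_API_KEY is set):
# from openai import OpenAI
# client = OpenAI()
# gateway = LLMGateway()
# gateway.register_provider(
#     "openai",
#     lambda model, prompt: client.chat.completions.create(
#         model=model,
#         messages=[{"role": "user", "content": prompt}],
#     ).choices[0].message.content,
# )
# print(gateway.complete("openai", "gpt-4", "Say hello"))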
Prompt Registry
Prompt Storage
# Prompt registry with versioning
from typing import Dict, List, Optional
from dataclasses import dataclass
from datetime import datetime
import json


@dataclass
class Prompt:
    """Prompt version"""
    id: str
    name: str
    version: str
    template: str
    parameters: List[str]
    metadata: Dict
    created_at: datetime
    created_by: str
class PromptRegistry:
    """Centralized prompt registry"""

    def __init__(self, storage_path: str = "prompts/"):
        self.storage_path = storage_path
        self.prompts: Dict[str, List[Prompt]] = {}

    def register_prompt(
        self,
        name: str,
        template: str,
        parameters: List[str],
        metadata: Dict,
        created_by: str
    ) -> Prompt:
        """Register new prompt version"""

        # Generate version
        versions = self.prompts.get(name, [])
        version = f"v{len(versions) + 1}"

        # Create prompt
        prompt = Prompt(
            id=f"{name}:{version}",
            name=name,
            version=version,
            template=template,
            parameters=parameters,
            metadata=metadata,
            created_at=datetime.now(),
            created_by=created_by
        )

        # Store prompt
        if name not in self.prompts:
            self.prompts[name] = []

        self.prompts[name].append(prompt)

        # Persist to file
        self._save_prompt(prompt)

        return prompt
    def get_prompt(self, name: str, version: Optional[str] = None) -> Prompt:
        """Get prompt by name and version"""

        versions = self.prompts.get(name, [])

        if not versions:
            raise ValueError(f"Prompt {name} not found")

        if version:
            for prompt in versions:
                if prompt.version == version:
                    return prompt
            raise ValueError(f"Version {version} not found")
        else:
            # Return latest version
            return versions[-1]
    def render_prompt(self, name: str, parameters: Dict) -> str:
        """Render prompt with parameters"""

        prompt = self.get_prompt(name)

        # Validate parameters
        missing_params = set(prompt.parameters) - set(parameters.keys())
        if missing_params:
            raise ValueError(f"Missing parameters: {missing_params}")

        # Render template
        rendered = prompt.template.format(**parameters)

        return rendered
    def _save_prompt(self, prompt: Prompt):
        """Save prompt to file"""

        import os
        # Create the per-prompt directory, not just the registry root
        os.makedirs(f"{self.storage_path}/{prompt.name}", exist_ok=True)

        file_path = f"{self.storage_path}/{prompt.name}/{prompt.version}.json"

        with open(file_path, 'w') as f:
            json.dump({
                'id': prompt.id,
                'name': prompt.name,
                'version': prompt.version,
                'template': prompt.template,
                'parameters': prompt.parameters,
                'metadata': prompt.metadata,
                'created_at': prompt.created_at.isoformat(),
                'created_by': prompt.created_by
            }, f, indent=2)
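# Hypothetical counterpart to _save_prompt (not part of the registry above):
# a sketch of how persisted prompt files could be reloaded into memory,
# assuming the JSON layout written by _save_prompt.
def load_registry(storage_path: str = "prompts/") -> PromptRegistry:
    import os

    registry = PromptRegistry(storage_path)
    if not os.path.isdir(storage_path):
        return registry

    for name in sorted(os.listdir(storage_path)):
        prompt_dir = os.path.join(storage_path, name)
        if not os.path.isdir(prompt_dir):
            continue
        # Note: sorted() is lexicographic; fine for v1-v9 in this sketch
        for file_name in sorted(os.listdir(prompt_dir)):
            if not file_name.endswith('.json'):
                continue
            with open(os.path.join(prompt_dir, file_name)) as f:
                data = json.load(f)
            registry.prompts.setdefault(name, []).append(Prompt(
                id=data['id'],
                name=data['name'],
                version=data['version'],
                template=data['template'],
                parameters=data['parameters'],
                metadata=data['metadata'],
                created_at=datetime.fromisoformat(data['created_at']),
                created_by=data['created_by']
            ))

    return registry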
# Example usage
registry = PromptRegistry()

# Register RAG prompt
rag_prompt = registry.register_prompt(
    name="rag_query",
    template="""You are a helpful assistant. Answer the following question based on the context provided.

Context:
{context}

Question: {query}

Answer:""".strip(),
    parameters=["context", "query"],
    metadata={
        "description": "RAG query prompt",
        "model": "gpt-4",
        "max_tokens": 500,
        "temperature": 0.7
    },
    created_by="data-engineering@my-company.com"
)

# Render prompt
rendered = registry.render_prompt(
    "rag_query",
    parameters={
        "context": "Machine learning is a subset of AI.",
        "query": "What is machine learning?"
    }
)

print(rendered)

Prompt Templates
Template Types
# Prompt template patterns
from jinja2 import Template as JinjaTemplate

# 1. String format templates (simple)
simple_template = """Context: {context}
Question: {query}
Answer:"""

# 2. Jinja2 templates (advanced)
jinja_template = JinjaTemplate("""You are a {{ role }} assistant.

{% if context %}
Context:
{{ context }}
{% endif %}

Question: {{ query }}

{% if examples %}
Examples:
{% for example in examples %}
Q: {{ example.question }}
A: {{ example.answer }}
{% endfor %}
{% endif %}

Answer:""")

# 3. Few-shot template
few_shot_template = """You are a sentiment classifier. Classify the sentiment of the following text as Positive, Negative, or Neutral.

Examples:
Text: "I love this product!"
Sentiment: Positive

Text: "This is terrible."
Sentiment: Negative

Text: "It's okay."
Sentiment: Neutral

Text: "{{ text }}"
Sentiment:"""

# 4. Chain-of-thought template
cot_template = """You are a reasoning assistant. Think step-by-step to answer the question.

Question: {{ question }}

Let's think step-by-step:
1.
2.
3.

Answer:"""

# 5. Structured output template
structured_template = """You are a data extractor. Extract the following fields from the text:

Text: {{ text }}

Fields to extract:
- Name
- Email
- Phone

Output format: JSON"""

Parameter Validation
# Parameter validation for prompts
from pydantic import BaseModel, validator


class PromptParameters(BaseModel):
    """Prompt parameters with validation"""

    context: str
    query: str
    max_tokens: int = 500
    temperature: float = 0.7

    @validator('context')
    def context_not_empty(cls, v):
        if not v or len(v.strip()) == 0:
            raise ValueError('Context cannot be empty')
        return v

    @validator('query')
    def query_not_empty(cls, v):
        if not v or len(v.strip()) == 0:
            raise ValueError('Query cannot be empty')
        return v

    @validator('max_tokens')
    def max_tokens_valid(cls, v):
        if v < 1 or v > 4096:
            raise ValueError('max_tokens must be between 1 and 4096')
        return v

    @validator('temperature')
    def temperature_valid(cls, v):
        if v < 0 or v > 2:
            raise ValueError('temperature must be between 0 and 2')
        return v


# Usage
try:
    params = PromptParameters(
        context="Machine learning is a subset of AI.",
        query="What is machine learning?",
        max_tokens=1000,
        temperature=0.7
    )
    print("Parameters valid:", params.dict())
except ValueError as e:
    print("Validation error:", e)

Prompt Testing
Unit Tests
# Prompt testing framework
import pytest
import tiktoken
from typing import List, Dict
from openai import OpenAI


class PromptTester:
    """Test prompt templates"""

    def __init__(self, llm_client: OpenAI):
        self.llm = llm_client

    def test_prompt_rendering(
        self,
        template: str,
        parameters: Dict,
        expected_contains: List[str]
    ) -> bool:
        """Test prompt renders correctly"""

        try:
            rendered = template.format(**parameters)

            for expected in expected_contains:
                assert expected in rendered, f"Expected '{expected}' in rendered prompt"

            return True
        except Exception as e:
            print(f"Render test failed: {e}")
            return False

    def test_prompt_length(
        self,
        rendered: str,
        max_length: int = 4000
    ) -> bool:
        """Test prompt length is within limits"""

        enc = tiktoken.encoding_for_model("gpt-4")
        tokens = enc.encode(rendered)

        if len(tokens) > max_length:
            print(f"Prompt too long: {len(tokens)} > {max_length}")
            return False

        return True

    def test_prompt_output(
        self,
        rendered: str,
        expected_output_type: str = "str"
    ) -> bool:
        """Test prompt produces expected output"""

        response = self.llm.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": rendered}],
            max_tokens=100
        )

        output = response.choices[0].message.content

        if expected_output_type == "json":
            import json
            try:
                json.loads(output)
                return True
            except json.JSONDecodeError:
                return False

        return isinstance(output, str)

    def test_prompt_consistency(
        self,
        rendered: str,
        num_runs: int = 3,
        temperature: float = 0.0
    ) -> bool:
        """Test prompt produces consistent outputs"""

        outputs = []

        for _ in range(num_runs):
            response = self.llm.chat.completions.create(
                model="gpt-4",
                messages=[{"role": "user", "content": rendered}],
                temperature=temperature
            )

            outputs.append(response.choices[0].message.content)

        # All outputs should be identical
        return len(set(outputs)) == 1


# Example usage
tester = PromptTester(llm_client=OpenAI())

# Test rendering
assert tester.test_prompt_rendering(
    template="Context: {context}\nQuery: {query}",
    parameters={"context": "Test", "query": "Test query"},
    expected_contains=["Context:", "Query:"]
)

# Test length
assert tester.test_prompt_length(
    rendered="Context: Test\nQuery: Test query",
    max_length=100
)

# Test output
assert tester.test_prompt_output(
    rendered="Say 'Hello, World!'",
    expected_output_type="str"
)

Evaluation Metrics
# Prompt evaluation metrics
from typing import Dict


class PromptEvaluator:
    """Evaluate prompt performance"""

    def __init__(self, llm_client: OpenAI):
        self.llm = llm_client

    def evaluate_relevance(
        self,
        prompt: str,
        response: str,
        reference: str
    ) -> float:
        """Evaluate response relevance"""

        # Use LLM to evaluate relevance
        eval_prompt = f"""
        Rate the relevance of the following response to the prompt.
        Reference answer: {reference}
        Response: {response}

        Rate from 1 (not relevant) to 5 (highly relevant).
        Output only the number.
        """

        eval_response = self.llm.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": eval_prompt}],
            temperature=0
        )

        score = float(eval_response.choices[0].message.content.strip())
        return score / 5  # Normalize to 0-1

    def evaluate_conciseness(self, response: str) -> float:
        """Evaluate response conciseness"""

        words = response.split()

        # Ideal length: 50-200 words
        if 50 <= len(words) <= 200:
            return 1.0
        elif len(words) < 50:
            return len(words) / 50
        else:
            return max(0, 1 - (len(words) - 200) / 500)

    def evaluate_safety(self, response: str) -> float:
        """Evaluate response safety (no harmful content)"""

        # Use moderation API
        moderation = self.llm.moderations.create(input=response)

        # Safe if the moderation endpoint did not flag the response
        return 0.0 if moderation.results[0].flagged else 1.0
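    # Hypothetical aggregation helper (not in the original evaluator): runs the
    # individual metrics above and returns them in one dict, so an evaluation
    # set can be scored in a single call. Weighting and thresholds are left to
    # the caller.
    def evaluate_all(
        self,
        prompt: str,
        response: str,
        reference: str
    ) -> Dict[str, float]:
        """Run all evaluation metrics for one prompt/response pair"""

        return {
            'relevance': self.evaluate_relevance(prompt, response, reference),
            'conciseness': self.evaluate_conciseness(response),
            'safety': self.evaluate_safety(response),
            'token_efficiency': self.evaluate_token_efficiency(prompt, response)
        }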
    def evaluate_token_efficiency(
        self,
        prompt: str,
        response: str
    ) -> float:
        """Evaluate token efficiency (output/input ratio)"""

        import tiktoken

        enc = tiktoken.encoding_for_model("gpt-4")

        input_tokens = len(enc.encode(prompt))
        output_tokens = len(enc.encode(response))

        # Ideal ratio: 0.5-2.0
        ratio = output_tokens / input_tokens

        if 0.5 <= ratio <= 2.0:
            return 1.0
        else:
            return max(0, 1 - abs(ratio - 1.0))

Prompt Deployment
CI/CD Pipeline
name: Prompt CI/CD
on:
  push:
    paths:
      - 'prompts/**'
      - 'tests/prompts/**'

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          pip install openai pytest

      - name: Run prompt tests
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          pytest tests/prompts/

      - name: Evaluate prompts
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          python scripts/evaluate_prompts.py

  deploy:
    needs: test
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    steps:
      - name: Deploy to production
        env:
          PROMPT_REGISTRY_API_KEY: ${{ secrets.PROMPT_REGISTRY_API_KEY }}
        run: |
          python scripts/deploy_prompts.py --env production

Blue-Green Deployment
# Blue-green deployment for prompts
import random


class PromptDeployment:
    """Prompt deployment manager"""

    def __init__(self, registry: PromptRegistry):
        self.registry = registry

    def deploy_blue_green(
        self,
        prompt_name: str,
        new_version: str,
        traffic_percentage: float = 0.5
    ):
        """Deploy new version with blue-green strategy"""

        # Get both versions
        blue_version = self.registry.get_prompt(prompt_name)                # Current
        green_version = self.registry.get_prompt(prompt_name, new_version)  # New

        # Route a share of traffic to the new (green) version
        def get_prompt_version():
            if random.random() < traffic_percentage:
                return green_version
            return blue_version

        # Monitor metrics for both versions:
        # - Response quality
        # - Token usage
        # - Latency
        # - Error rate
        #
        # If the green metrics hold up, shift traffic to 100% green;
        # if they degrade, roll back to blue.
        return get_prompt_version
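    # Hypothetical A/B comparison helper (not in the original): given metric
    # dicts for the blue and green versions (e.g. from PromptMonitor.get_metrics
    # in the next section), decide whether the new version looks safe to
    # promote. The thresholds are illustrative assumptions.
    def should_promote(
        self,
        blue_metrics: dict,
        green_metrics: dict,
        max_latency_regression: float = 1.1,
        max_error_rate: float = 0.01
    ) -> bool:
        """Return True if the green (new) version looks safe to promote"""

        if not blue_metrics or not green_metrics:
            return False

        latency_ok = (
            green_metrics['p95_latency']
            <= blue_metrics['p95_latency'] * max_latency_regression
        )
        errors_ok = green_metrics['error_rate'] <= max_error_rate

        return latency_ok and errors_ok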
    def canary_deployment(
        self,
        prompt_name: str,
        new_version: str,
        canary_percentage: float = 0.1
    ):
        """Deploy new version with canary strategy"""

        # Start with ~10% of traffic on the new version, monitor metrics,
        # and gradually increase to 100% if they stay healthy.
        return self.deploy_blue_green(prompt_name, new_version, canary_percentage)

Prompt Monitoring
Metrics Tracking
# Prompt monitoring
from typing import Dict, Optional
from datetime import datetime
import json
import os
import statistics


class PromptMonitor:
    """Monitor prompt performance"""

    def __init__(self, storage_path: str = "metrics/"):
        self.storage_path = storage_path

    def log_invocation(
        self,
        prompt_name: str,
        prompt_version: str,
        parameters: Dict,
        response: str,
        tokens_used: int,
        latency_ms: float,
        error: Optional[str] = None
    ):
        """Log prompt invocation"""

        log_entry = {
            'timestamp': datetime.now().isoformat(),
            'prompt_name': prompt_name,
            'prompt_version': prompt_version,
            'parameters': parameters,
            'response_length': len(response),
            'tokens_used': tokens_used,
            'latency_ms': latency_ms,
            'error': error
        }

        # Append to a JSONL file per prompt
        os.makedirs(self.storage_path, exist_ok=True)

        file_path = f"{self.storage_path}/{prompt_name}.jsonl"

        with open(file_path, 'a') as f:
            f.write(json.dumps(log_entry) + '\n')

    def get_metrics(
        self,
        prompt_name: str,
        version: Optional[str] = None
    ) -> Dict:
        """Get metrics for prompt"""

        # Read logs
        logs = []
        file_path = f"{self.storage_path}/{prompt_name}.jsonl"

        try:
            with open(file_path, 'r') as f:
                for line in f:
                    log = json.loads(line)
                    if version is None or log['prompt_version'] == version:
                        logs.append(log)
        except FileNotFoundError:
            return {}

        # Calculate metrics
        if not logs:
            return {}

        latencies = sorted(log['latency_ms'] for log in logs)

        metrics = {
            'invocations': len(logs),
            'avg_tokens': statistics.mean(log['tokens_used'] for log in logs),
            'p50_latency': statistics.median(latencies),
            'p95_latency': latencies[int(len(latencies) * 0.95)],
            'p99_latency': latencies[int(len(latencies) * 0.99)],
            'error_rate': sum(1 for log in logs if log['error']) / len(logs)
        }

        return metrics

Prompt Best Practices
DO
# 1. Use version control
#    Git for prompt templates

# 2. Test prompts
#    Unit tests, evaluation metrics

# 3. Use templates
#    Jinja2 for complex prompts

# 4. Monitor performance
#    Track token usage, latency, quality

# 5. A/B test
#    Compare prompt versions

DON’T

# 1. Don't hardcode prompts
#    Use templates and registries

# 2. Don't skip testing
#    Essential for production

# 3. Don't ignore token limits
#    Context window constraints

# 4. Don't forget versioning
#    Track prompt changes

# 5. Don't ignore monitoring
#    Essential for operations

Key Takeaways
- Prompt registry: Version-controlled prompt storage
- Prompt templates: Parameterized, reusable prompts
- Testing framework: Automated prompt evaluation
- CI/CD pipeline: Automated testing and deployment
- Blue-green deployment: Gradual rollout with rollback
- Monitoring: Token usage, latency, quality metrics
- A/B testing: Compare prompt versions
- Use When: Production LLM systems, prompt management