LLM Ops
Production Operations for Large Language Model Systems
Overview
LLM Ops encompasses the practices, tools, and processes for deploying, monitoring, and maintaining production LLM systems. It includes RAG architecture, prompt engineering, embedding generation, and system monitoring.
LLM Ops Architecture
Production Pipeline
Key Components (a wiring sketch follows this list):
- Prompt Management: Versioned, tested prompts
- Embedding Service: Scalable embedding generation with caching
- Vector Database: Efficient similarity search
- LLM Gateway: Unified API for multiple LLM providers
- Monitoring: Token usage, cost, latency metrics
- Evaluation: Quality, accuracy, faithfulness metrics
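To make the wiring concrete, here is a hedged sketch of how these components might be composed into a single pipeline object. The attribute types and the `answer` flow are illustrative assumptions, not a prescribed design; concrete `LLMGateway`, `LLMMonitor`, and `LLMEvaluator` classes appear later on this page.

```python
# Hedged architecture sketch: one object wiring together the components
# listed above. Attribute types are placeholders (Any) because the concrete
# implementations vary; the flow in answer() is illustrative only.
from dataclasses import dataclass
from typing import Any, Dict, List


@dataclass
class LLMPipeline:
    prompt_registry: Any   # versioned, tested prompts
    embedder: Any          # scalable embedding generation with caching
    vector_store: Any      # similarity search over document embeddings
    gateway: Any           # unified API over multiple LLM providers
    monitor: Any           # token usage, cost, latency metrics
    evaluator: Any         # quality, accuracy, faithfulness metrics

    def answer(self, question: str) -> str:
        """Retrieve context, render a versioned prompt, and call the LLM."""
        query_vector = self.embedder.embed([question])[0]
        context_chunks: List[str] = self.vector_store.search(query_vector, top_k=3)
        prompt = self.prompt_registry.render(
            "rag_answer", "v1",
            context="\n\n".join(context_chunks),
            question=question,
        )
        messages: List[Dict] = [{"role": "user", "content": prompt}]
        return self.gateway.chat(messages)
```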
LLM Ops Components
1. RAG Architecture
- Naive RAG: Simple retrieve and generate (see the sketch after this list)
- Advanced RAG: Query rewriting, reranking, context compression
- Hybrid Search: Vector + BM25 for best results
- Evaluation: Precision, recall, MRR, NDCG
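A minimal sketch of the naive retrieve-and-generate flow, assuming `embed_fn` and `chat_fn` callables (for example, the `LLMGateway.embed` and `LLMGateway.chat` methods defined later on this page); the helper names are illustrative, not part of any library.

```python
# Minimal naive-RAG sketch: embed the query, rank chunks by cosine
# similarity, and pass the top hits to the LLM as context.
from math import sqrt
from typing import Callable, Dict, List


def cosine(a: List[float], b: List[float]) -> float:
    """Cosine similarity between two embedding vectors."""
    dot = sum(x * y for x, y in zip(a, b))
    norm = sqrt(sum(x * x for x in a)) * sqrt(sum(x * x for x in b))
    return dot / norm if norm else 0.0


def naive_rag(
    question: str,
    chunks: List[str],
    embed_fn: Callable[[List[str]], List[List[float]]],
    chat_fn: Callable[[List[Dict]], str],
    top_k: int = 3,
) -> str:
    """Retrieve the top_k most similar chunks, then generate an answer."""
    query_vec = embed_fn([question])[0]
    chunk_vecs = embed_fn(chunks)
    ranked = sorted(
        zip(chunks, chunk_vecs),
        key=lambda pair: cosine(query_vec, pair[1]),
        reverse=True,
    )
    context = "\n\n".join(chunk for chunk, _ in ranked[:top_k])
    messages = [
        {"role": "system", "content": "Answer using only the provided context."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
    ]
    return chat_fn(messages)
```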
2. Prompt Engineering
- Prompt Registry: Version-controlled prompt storage (a minimal sketch follows this list)
- Prompt Templates: Parameterized, reusable prompts
- Testing: Unit tests, evaluation metrics
- Deployment: Blue-green, canary deployments
Prompt Engineering Pipelines Guide
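As a minimal sketch of the registry and template ideas above: templates are stored under a (name, version) key and rendered with parameters. This in-memory version is an assumption for illustration; a production registry would be backed by git or a database and covered by unit tests.

```python
# Minimal versioned prompt-registry sketch (in-memory, illustrative).
from typing import Dict


class PromptRegistry:
    """Store parameterized prompt templates under (name, version) keys."""

    def __init__(self):
        self._templates: Dict[str, Dict[str, str]] = {}

    def register(self, name: str, version: str, template: str) -> None:
        self._templates.setdefault(name, {})[version] = template

    def render(self, name: str, version: str, **params) -> str:
        """Fill a template's placeholders with the given parameters."""
        return self._templates[name][version].format(**params)


# Usage
registry = PromptRegistry()
registry.register(
    "summarize", "v1",
    "Summarize the following text in {max_words} words:\n\n{text}",
)
prompt = registry.render("summarize", "v1", max_words=50, text="...")
```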
3. Vector Embeddings
- Model Selection: OpenAI, Cohere, open-source
- Batch Processing: Parallel, distributed generation
- Caching: Redis for repeated embeddings (see the caching sketch after this list)
- Cost Optimization: Deduplication, chunking
Vector Embeddings at Scale Guide
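A minimal caching-and-deduplication sketch, assuming an `embed_fn` callable such as `LLMGateway.embed` from the gateway section below; it keys the cache on a content hash, and a production setup would swap the dict for Redis.

```python
# Minimal embedding cache with deduplication: repeated texts are only
# embedded once, keyed by a SHA-256 hash of their content.
import hashlib
from typing import Callable, Dict, List


class CachedEmbedder:
    """Wrap an embedding function so repeated texts are embedded only once."""

    def __init__(self, embed_fn: Callable[[List[str]], List[List[float]]]):
        self.embed_fn = embed_fn  # assumed callable, e.g. LLMGateway.embed
        self.cache: Dict[str, List[float]] = {}

    def _key(self, text: str) -> str:
        return hashlib.sha256(text.encode("utf-8")).hexdigest()

    def embed(self, texts: List[str]) -> List[List[float]]:
        # Deduplicate inputs and embed only texts not already cached.
        missing = [t for t in dict.fromkeys(texts) if self._key(t) not in self.cache]
        if missing:
            for text, vector in zip(missing, self.embed_fn(missing)):
                self.cache[self._key(text)] = vector
        return [self.cache[self._key(t)] for t in texts]
```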
LLM Gateway
Unified API
```python
# LLM Gateway for multiple providers
import os
from abc import ABC, abstractmethod
from typing import Dict, List, Optional


class LLMProvider(ABC):
    """Base LLM provider"""

    @abstractmethod
    def chat(self, messages: List[Dict], **kwargs) -> str:
        pass

    @abstractmethod
    def embed(self, texts: List[str]) -> List[List[float]]:
        pass


class OpenAIProvider(LLMProvider):
    """OpenAI provider"""

    def __init__(self, api_key: str):
        from openai import OpenAI
        self.client = OpenAI(api_key=api_key)

    def chat(self, messages: List[Dict], **kwargs) -> str:
        response = self.client.chat.completions.create(
            model=kwargs.get("model", "gpt-4"),
            messages=messages,
            temperature=kwargs.get("temperature", 0.7),
            max_tokens=kwargs.get("max_tokens", 500)
        )
        return response.choices[0].message.content

    def embed(self, texts: List[str]) -> List[List[float]]:
        response = self.client.embeddings.create(
            model="text-embedding-3-small",
            input=texts
        )
        return [item.embedding for item in response.data]


class AnthropicProvider(LLMProvider):
    """Anthropic provider"""

    def __init__(self, api_key: str):
        import anthropic
        self.client = anthropic.Anthropic(api_key=api_key)

    def chat(self, messages: List[Dict], **kwargs) -> str:
        # Note: the Anthropic Messages API takes system prompts via the
        # `system` parameter, not as a "system" role message.
        response = self.client.messages.create(
            model=kwargs.get("model", "claude-3-opus-20240229"),
            max_tokens=kwargs.get("max_tokens", 500),
            messages=messages
        )
        return response.content[0].text

    def embed(self, texts: List[str]) -> List[List[float]]:
        raise NotImplementedError("Anthropic doesn't support embeddings")


class LLMGateway:
    """Unified LLM gateway"""

    def __init__(self):
        # Read API keys from the environment rather than hardcoding them
        self.providers = {
            "openai": OpenAIProvider(api_key=os.environ["OPENAI_API_KEY"]),
            "anthropic": AnthropicProvider(api_key=os.environ["ANTHROPIC_API_KEY"])
        }
        self.default_provider = "openai"

    def chat(
        self,
        messages: List[Dict],
        provider: Optional[str] = None,
        **kwargs
    ) -> str:
        """Chat with LLM"""
        provider = provider or self.default_provider
        llm = self.providers[provider]
        return llm.chat(messages, **kwargs)

    def embed(
        self,
        texts: List[str],
        provider: Optional[str] = None
    ) -> List[List[float]]:
        """Generate embeddings"""
        provider = provider or self.default_provider
        llm = self.providers[provider]
        return llm.embed(texts)


# Example usage
gateway = LLMGateway()

response = gateway.chat(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"}
    ],
    provider="openai",
    temperature=0.7
)

embeddings = gateway.embed(["Hello world"], provider="openai")
```
LLM Monitoring
Metrics
```python
# LLM monitoring framework
import json
import os
import statistics
import time
from datetime import datetime
from typing import Dict, List


class LLMMonitor:
    """Monitor LLM operations"""

    def __init__(self, storage_path: str = "metrics/llm/"):
        self.storage_path = storage_path

    def log_invocation(
        self,
        provider: str,
        model: str,
        operation: str,  # "chat" or "embed"
        prompt_tokens: int,
        completion_tokens: int,
        latency_ms: float,
        error: str = None
    ):
        """Log LLM invocation"""
        # Calculate cost (simplified: one blended rate per model; real
        # pricing differs for prompt vs. completion tokens)
        cost_per_1k = {
            "gpt-4": 0.03,
            "gpt-3.5-turbo": 0.002,
            "text-embedding-3-small": 0.00002
        }

        cost = (
            (prompt_tokens / 1000) * cost_per_1k.get(model, 0)
            + (completion_tokens / 1000) * cost_per_1k.get(model, 0)
        )

        log_entry = {
            'timestamp': datetime.now().isoformat(),
            'provider': provider,
            'model': model,
            'operation': operation,
            'prompt_tokens': prompt_tokens,
            'completion_tokens': completion_tokens,
            'total_tokens': prompt_tokens + completion_tokens,
            'latency_ms': latency_ms,
            'cost_usd': cost,
            'error': error
        }

        # Append to a JSONL file per provider
        os.makedirs(self.storage_path, exist_ok=True)
        file_path = f"{self.storage_path}/{provider}.jsonl"

        with open(file_path, 'a') as f:
            f.write(json.dumps(log_entry) + '\n')

    def get_metrics(self, provider: str) -> Dict:
        """Get metrics for provider"""
        # Read logs
        logs = []
        file_path = f"{self.storage_path}/{provider}.jsonl"

        try:
            with open(file_path, 'r') as f:
                for line in f:
                    logs.append(json.loads(line))
        except FileNotFoundError:
            return {}

        if not logs:
            return {}

        # Calculate metrics
        latencies = sorted(log['latency_ms'] for log in logs)
        metrics = {
            'total_invocations': len(logs),
            'total_tokens': sum(log['total_tokens'] for log in logs),
            'total_cost_usd': sum(log['cost_usd'] for log in logs),
            'avg_latency_ms': statistics.mean(latencies),
            'p50_latency': statistics.median(latencies),
            'p95_latency': latencies[min(int(len(latencies) * 0.95), len(latencies) - 1)],
            'p99_latency': latencies[min(int(len(latencies) * 0.99), len(latencies) - 1)],
            'error_rate': sum(1 for log in logs if log['error']) / len(logs)
        }

        return metrics


# Decorator for automatic monitoring
def monitor_llm(monitor: LLMMonitor):
    """Decorator to monitor LLM calls"""

    def decorator(func):
        def wrapper(*args, **kwargs):
            start = time.time()

            try:
                result = func(*args, **kwargs)

                # Log success; token counts are read from the wrapped
                # function's result dict
                monitor.log_invocation(
                    provider=kwargs.get("provider", "openai"),
                    model=kwargs.get("model", "gpt-4"),
                    operation="chat",
                    prompt_tokens=result.get("prompt_tokens", 0),
                    completion_tokens=result.get("completion_tokens", 0),
                    latency_ms=(time.time() - start) * 1000
                )

                return result

            except Exception as e:
                # Log error
                monitor.log_invocation(
                    provider=kwargs.get("provider", "openai"),
                    model=kwargs.get("model", "gpt-4"),
                    operation="chat",
                    prompt_tokens=0,
                    completion_tokens=0,
                    latency_ms=(time.time() - start) * 1000,
                    error=str(e)
                )
                raise

        return wrapper
    return decorator
```
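An illustrative usage sketch for the decorator: it assumes the wrapped function returns a dict carrying `prompt_tokens` and `completion_tokens`, which is the contract `monitor_llm` reads from. The `ask` function, its token values, and the reuse of `gateway` from the earlier example are assumptions, not part of the original.

```python
# Illustrative use of monitor_llm; the wrapped function must return a dict
# with prompt_tokens / completion_tokens for the decorator to log them.
monitor = LLMMonitor()

@monitor_llm(monitor)
def ask(question: str, provider: str = "openai", model: str = "gpt-4") -> Dict:
    answer = gateway.chat(
        messages=[{"role": "user", "content": question}],
        provider=provider,
        model=model,
    )
    # Placeholder token counts; a real implementation would read usage
    # from the provider's response object.
    return {"text": answer, "prompt_tokens": 12, "completion_tokens": 48}

result = ask("What is LLM Ops?")
print(monitor.get_metrics("openai"))
```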
LLM Evaluation
Quality Metrics
```python
# LLM evaluation framework
from typing import Dict, List


class LLMEvaluator:
    """Evaluate LLM outputs"""

    def __init__(self, llm_gateway: LLMGateway):
        self.gateway = llm_gateway

    def evaluate_relevance(
        self,
        prompt: str,
        response: str,
        reference: str
    ) -> float:
        """Evaluate response relevance"""
        # Use an LLM as the judge
        eval_prompt = f"""
        Rate the relevance of the following response to the prompt.
        Prompt: {prompt}
        Reference answer: {reference}
        Response: {response}

        Rate from 1 (not relevant) to 5 (highly relevant).
        Output only the number.
        """

        judgment = self.gateway.chat(
            messages=[{"role": "user", "content": eval_prompt}],
            temperature=0
        )

        score = float(judgment.strip())
        return score / 5

    def evaluate_coherence(self, response: str) -> float:
        """Evaluate response coherence"""
        eval_prompt = f"""
        Rate the coherence of the following response.
        Response: {response}

        Rate from 1 (not coherent) to 5 (highly coherent).
        Output only the number.
        """

        judgment = self.gateway.chat(
            messages=[{"role": "user", "content": eval_prompt}],
            temperature=0
        )

        score = float(judgment.strip())
        return score / 5

    def evaluate_safety(self, response: str) -> float:
        """Evaluate response safety"""
        # Use the OpenAI moderation API
        from openai import OpenAI
        client = OpenAI()

        moderation = client.moderations.create(input=response)

        # Safe only if the moderation API raised no flags
        return 0.0 if moderation.results[0].flagged else 1.0
```
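A short illustrative usage of the evaluator, assuming the `gateway` instance created in the gateway section above; the example strings are placeholders.

```python
# Illustrative LLMEvaluator usage; `gateway` is the LLMGateway instance
# created in the gateway example above.
evaluator = LLMEvaluator(gateway)

relevance = evaluator.evaluate_relevance(
    prompt="What is retrieval-augmented generation?",
    response="RAG retrieves relevant documents and feeds them to the LLM as context.",
    reference="Retrieval-augmented generation grounds LLM answers in retrieved documents.",
)
coherence = evaluator.evaluate_coherence(
    "RAG retrieves relevant documents and feeds them to the LLM as context."
)
safety = evaluator.evaluate_safety("RAG is a pattern for grounding LLM answers.")

print(f"relevance={relevance:.2f} coherence={coherence:.2f} safety={safety:.1f}")
```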
LLM Cost Optimization
Cost Strategies
```python
# LLM cost optimization
from typing import Optional


class LLMCostOptimizer:
    """Optimize LLM costs"""

    def __init__(self, gateway: LLMGateway):
        self.gateway = gateway

    def choose_model(
        self,
        task: str,
        complexity: str  # "simple", "medium", "complex"
    ) -> str:
        """Choose appropriate model for task"""
        # Model selection strategy
        if task == "chat":
            if complexity == "simple":
                return "gpt-3.5-turbo"  # Cheaper
            elif complexity == "medium":
                return "gpt-4"  # Balanced
            else:  # complex
                return "gpt-4-turbo"  # Most capable

        elif task == "embed":
            return "text-embedding-3-small"  # Cost-optimized

        return "gpt-3.5-turbo"

    def optimize_prompt(self, prompt: str) -> str:
        """Optimize prompt to reduce tokens"""
        # Remove redundant text, use concise language, and drop examples
        # that aren't needed. This is a simplified example.
        optimized = prompt.strip()

        # Collapse repeated whitespace
        import re
        optimized = re.sub(r'\s+', ' ', optimized)

        return optimized

    def use_cache(self, prompt: str) -> Optional[str]:
        """Check cache for previous responses"""
        # Placeholder: a production system would look the prompt up in
        # Redis, Memcached, etc. and return the cached response on a hit.
        return None
```
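An illustrative sketch combining the strategies above, assuming the `gateway` instance from earlier on this page; the prompt text is a placeholder.

```python
# Illustrative cost-optimization flow: pick a model by task complexity,
# trim the prompt, and check the cache before calling the LLM.
optimizer = LLMCostOptimizer(gateway)

model = optimizer.choose_model(task="chat", complexity="simple")  # "gpt-3.5-turbo"
prompt = optimizer.optimize_prompt("  Please   summarize   this  document   briefly.  ")

cached = optimizer.use_cache(prompt)
if cached is None:
    response = gateway.chat(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )
else:
    response = cached
```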
LLM Best Practices
DO
1. Use the gateway pattern: a unified API for multiple providers
2. Monitor everything: tokens, cost, latency, and errors
3. Use caching: cache embeddings and responses
4. Choose the appropriate model: balance cost and quality
5. Evaluate quality: relevance, coherence, safety
DON'T
1. Don't skip monitoring: it is essential for operations
2. Don't ignore costs: LLM costs add up quickly
3. Don't use the wrong model: choose based on task complexity
4. Don't skip evaluation: quality degrades over time
5. Don't hardcode prompts: use versioned templates
LLM Ops Guides
- RAG Architecture - Retrieval-augmented generation patterns
- Prompt Engineering Pipelines - Prompt management and deployment
- Vector Embeddings at Scale - Scalable embedding generation
Key Takeaways
- RAG: Combine LLMs with external knowledge retrieval
- Prompt management: Version control, testing, deployment
- Embeddings: Scalable generation with caching
- Gateway: Unified API for multiple LLM providers
- Monitoring: Token usage, cost, latency, errors
- Evaluation: Relevance, coherence, safety metrics
- Cost optimization: Model selection, prompt optimization, caching
- Use When: Production LLM systems, chatbots, search
Back to Module 5