LLM Ops

Production Operations for Large Language Model Systems


Overview

LLM Ops encompasses the practices, tools, and processes for deploying, monitoring, and maintaining production LLM systems. It includes RAG architecture, prompt engineering, embedding generation, and system monitoring.


LLM Ops Architecture

Production Pipeline

Key Components:

  • Prompt Management: Versioned, tested prompts
  • Embedding Service: Scalable embedding generation with caching
  • Vector Database: Efficient similarity search
  • LLM Gateway: Unified API for multiple LLM providers
  • Monitoring: Token usage, cost, latency metrics
  • Evaluation: Quality, accuracy, faithfulness metrics
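
These components compose into a single request path. The sketch below shows its shape; `embed_query`, `vector_search`, and `llm_chat` are hypothetical stand-ins for the embedding service, vector database, and LLM gateway:

# Shape of a RAG request path through the components above.
# embed_query, vector_search, and llm_chat are hypothetical stand-ins.
def answer(question: str, prompt_template: str) -> str:
    query_vector = embed_query(question)           # Embedding Service
    chunks = vector_search(query_vector, top_k=5)  # Vector Database
    context = "\n\n".join(chunks)
    prompt = prompt_template.format(context=context, question=question)  # Prompt Management
    return llm_chat([{"role": "user", "content": prompt}])  # LLM Gateway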

LLM Ops Components

1. RAG Architecture

  • Naive RAG: Simple retrieve and generate
  • Advanced RAG: Query rewriting, reranking, context compression
  • Hybrid Search: Combine dense vector search with BM25 keyword matching
  • Evaluation: Precision, recall, MRR, NDCG (see the sketch below)

RAG Architecture Guide
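
MRR rewards retrieving the first relevant chunk early; NDCG rewards a well-ordered ranking overall. A minimal sketch with binary relevance labels:

import math
from typing import List, Set

def reciprocal_rank(ranked_ids: List[str], relevant_ids: Set[str]) -> float:
    """1/rank of the first relevant document, or 0.0 if none retrieved.
    MRR is the mean of this value over a set of queries."""
    for rank, doc_id in enumerate(ranked_ids, start=1):
        if doc_id in relevant_ids:
            return 1.0 / rank
    return 0.0

def ndcg_at_k(ranked_ids: List[str], relevant_ids: Set[str], k: int = 10) -> float:
    """Binary-relevance NDCG@k: DCG of the actual ranking divided by the
    DCG of an ideal ranking that puts all relevant documents first."""
    dcg = sum(
        1.0 / math.log2(rank + 1)
        for rank, doc_id in enumerate(ranked_ids[:k], start=1)
        if doc_id in relevant_ids
    )
    ideal_hits = min(len(relevant_ids), k)
    idcg = sum(1.0 / math.log2(rank + 1) for rank in range(1, ideal_hits + 1))
    return dcg / idcg if idcg > 0 else 0.0

# Example: the only relevant chunk lands at rank 2
print(reciprocal_rank(["c7", "c2", "c9"], {"c2"}))  # 0.5
print(ndcg_at_k(["c7", "c2", "c9"], {"c2"}, k=3))   # ~0.63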

2. Prompt Engineering

  • Prompt Registry: Version-controlled prompt storage (a toy version is sketched below)
  • Prompt Templates: Parameterized, reusable prompts
  • Testing: Unit tests, evaluation metrics
  • Deployment: Blue-green, canary deployments

Prompt Engineering Pipelines Guide
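
A toy in-memory version of a prompt registry. Production registries typically persist templates in git or a database, but the name-plus-version lookup below is the core idea:

from string import Template

class PromptRegistry:
    """Toy registry: templates stored under (name, version)"""

    def __init__(self):
        self._prompts = {}  # (name, version) -> Template

    def register(self, name: str, version: str, text: str):
        self._prompts[(name, version)] = Template(text)

    def render(self, name: str, version: str, **params) -> str:
        # Pinning a version makes rollback a one-line change at the call site
        return self._prompts[(name, version)].substitute(**params)

registry = PromptRegistry()
registry.register(
    "summarize", "1.0.0",
    "Summarize the following text in $n_sentences sentences:\n$text",
)
prompt = registry.render("summarize", "1.0.0", n_sentences=2, text="...")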

3. Vector Embeddings

  • Model Selection: OpenAI, Cohere, open-source
  • Batch Processing: Parallel, distributed generation
  • Caching: Redis for repeated embeddings (see the caching sketch below)
  • Cost Optimization: Deduplication, chunking

Vector Embeddings at Scale Guide
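
A sketch of content-hash caching with deduplication. Here `cache` is any dict-like store (Redis in production) and `embed_batch` is a hypothetical call to the embedding provider:

import hashlib
from typing import Callable, Dict, List

def embed_with_cache(
    texts: List[str],
    cache: Dict[str, list],
    embed_batch: Callable[[List[str]], List[list]],
) -> List[list]:
    """Embed texts, reusing cached vectors for previously seen content"""
    keys = [hashlib.sha256(t.encode()).hexdigest() for t in texts]
    missing = [t for t, k in zip(texts, keys) if k not in cache]
    if missing:
        # Deduplicate before the paid API call: identical chunks embed once
        unique = list(dict.fromkeys(missing))
        for text, vector in zip(unique, embed_batch(unique)):
            cache[hashlib.sha256(text.encode()).hexdigest()] = vector
    return [cache[k] for k in keys]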


LLM Gateway

Unified API

# LLM Gateway for multiple providers
import os
from abc import ABC, abstractmethod
from typing import Dict, List, Optional

class LLMProvider(ABC):
    """Base LLM provider"""

    @abstractmethod
    def chat(self, messages: List[Dict], **kwargs) -> str:
        pass

    @abstractmethod
    def embed(self, texts: List[str]) -> List[List[float]]:
        pass

class OpenAIProvider(LLMProvider):
    """OpenAI provider"""

    def __init__(self, api_key: str):
        from openai import OpenAI
        self.client = OpenAI(api_key=api_key)

    def chat(self, messages: List[Dict], **kwargs) -> str:
        response = self.client.chat.completions.create(
            model=kwargs.get("model", "gpt-4"),
            messages=messages,
            temperature=kwargs.get("temperature", 0.7),
            max_tokens=kwargs.get("max_tokens", 500),
        )
        return response.choices[0].message.content

    def embed(self, texts: List[str]) -> List[List[float]]:
        response = self.client.embeddings.create(
            model="text-embedding-3-small",
            input=texts,
        )
        return [item.embedding for item in response.data]

class AnthropicProvider(LLMProvider):
    """Anthropic provider"""

    def __init__(self, api_key: str):
        import anthropic
        self.client = anthropic.Anthropic(api_key=api_key)

    def chat(self, messages: List[Dict], **kwargs) -> str:
        # Anthropic takes the system prompt as a separate parameter,
        # not as a message with role "system", so split it out first
        system = "\n".join(m["content"] for m in messages if m["role"] == "system")
        chat_messages = [m for m in messages if m["role"] != "system"]
        extra = {"system": system} if system else {}
        response = self.client.messages.create(
            model=kwargs.get("model", "claude-3-opus-20240229"),
            max_tokens=kwargs.get("max_tokens", 500),
            messages=chat_messages,
            **extra,
        )
        return response.content[0].text

    def embed(self, texts: List[str]) -> List[List[float]]:
        raise NotImplementedError("Anthropic does not offer an embeddings API")

class LLMGateway:
    """Unified LLM gateway"""

    def __init__(self):
        # Read API keys from the environment rather than hardcoding secrets
        self.providers = {
            "openai": OpenAIProvider(api_key=os.environ["OPENAI_API_KEY"]),
            "anthropic": AnthropicProvider(api_key=os.environ["ANTHROPIC_API_KEY"]),
        }
        self.default_provider = "openai"

    def chat(
        self,
        messages: List[Dict],
        provider: Optional[str] = None,
        **kwargs,
    ) -> str:
        """Chat with an LLM via the selected provider"""
        provider = provider or self.default_provider
        llm = self.providers[provider]
        return llm.chat(messages, **kwargs)

    def embed(
        self,
        texts: List[str],
        provider: Optional[str] = None,
    ) -> List[List[float]]:
        """Generate embeddings via the selected provider"""
        provider = provider or self.default_provider
        llm = self.providers[provider]
        return llm.embed(texts)

# Example usage
gateway = LLMGateway()
response = gateway.chat(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ],
    provider="openai",
    temperature=0.7,
)
embeddings = gateway.embed(["Hello world"], provider="openai")
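
Because every provider implements the same LLMProvider interface, failover is a short loop. A minimal sketch (an extension, not part of the gateway above) that tries providers in order until one succeeds:

# Failover across providers; assumes the LLMGateway defined above
def chat_with_fallback(
    gateway: LLMGateway,
    messages: List[Dict],
    providers=("openai", "anthropic"),
    **kwargs,
) -> str:
    last_error = None
    for name in providers:
        try:
            return gateway.chat(messages, provider=name, **kwargs)
        except Exception as e:  # in practice, catch provider-specific errors
            last_error = e
    raise last_error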

LLM Monitoring

Metrics

# LLM monitoring framework
import functools
import json
import os
import statistics
import time
from datetime import datetime
from typing import Dict

class LLMMonitor:
    """Monitor LLM operations"""

    # Simplified flat prices per 1K tokens; real pricing differs for
    # input vs. output tokens and changes over time
    COST_PER_1K = {
        "gpt-4": 0.03,
        "gpt-3.5-turbo": 0.002,
        "text-embedding-3-small": 0.00002,
    }

    def __init__(self, storage_path: str = "metrics/llm/"):
        self.storage_path = storage_path

    def log_invocation(
        self,
        provider: str,
        model: str,
        operation: str,  # "chat" or "embed"
        prompt_tokens: int,
        completion_tokens: int,
        latency_ms: float,
        error: str = None,
    ):
        """Log one LLM invocation as a JSONL record"""
        rate = self.COST_PER_1K.get(model, 0)
        cost = ((prompt_tokens + completion_tokens) / 1000) * rate
        log_entry = {
            'timestamp': datetime.now().isoformat(),
            'provider': provider,
            'model': model,
            'operation': operation,
            'prompt_tokens': prompt_tokens,
            'completion_tokens': completion_tokens,
            'total_tokens': prompt_tokens + completion_tokens,
            'latency_ms': latency_ms,
            'cost_usd': cost,
            'error': error,
        }
        # Append to a per-provider JSONL file
        os.makedirs(self.storage_path, exist_ok=True)
        file_path = f"{self.storage_path}/{provider}.jsonl"
        with open(file_path, 'a') as f:
            f.write(json.dumps(log_entry) + '\n')

    def get_metrics(self, provider: str) -> Dict:
        """Aggregate metrics for a provider from its log file"""
        logs = []
        file_path = f"{self.storage_path}/{provider}.jsonl"
        try:
            with open(file_path, 'r') as f:
                for line in f:
                    logs.append(json.loads(line))
        except FileNotFoundError:
            return {}
        if not logs:
            return {}
        latencies = sorted(log['latency_ms'] for log in logs)
        return {
            'total_invocations': len(logs),
            'total_tokens': sum(log['total_tokens'] for log in logs),
            'total_cost_usd': sum(log['cost_usd'] for log in logs),
            'avg_latency_ms': statistics.mean(latencies),
            'p50_latency': statistics.median(latencies),
            'p95_latency': latencies[int(len(latencies) * 0.95)],
            'p99_latency': latencies[int(len(latencies) * 0.99)],
            'error_rate': sum(1 for log in logs if log['error']) / len(logs),
        }

# Decorator for automatic monitoring
def monitor_llm(monitor: LLMMonitor):
    """Decorator to monitor LLM calls"""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = time.time()
            try:
                result = func(*args, **kwargs)
                # Log success; token counts are read from the result dict
                monitor.log_invocation(
                    provider=kwargs.get("provider", "openai"),
                    model=kwargs.get("model", "gpt-4"),
                    operation="chat",
                    prompt_tokens=result.get("prompt_tokens", 0),
                    completion_tokens=result.get("completion_tokens", 0),
                    latency_ms=(time.time() - start) * 1000,
                )
                return result
            except Exception as e:
                # Log the failed call, then re-raise
                monitor.log_invocation(
                    provider=kwargs.get("provider", "openai"),
                    model=kwargs.get("model", "gpt-4"),
                    operation="chat",
                    prompt_tokens=0,
                    completion_tokens=0,
                    latency_ms=(time.time() - start) * 1000,
                    error=str(e),
                )
                raise
        return wrapper
    return decorator
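
A minimal wiring example for the decorator, assuming the `gateway` instance from the previous section. The token counts here are hardcoded stand-ins for values that would normally be read from the provider response:

monitor = LLMMonitor()

@monitor_llm(monitor)
def ask(question: str, provider: str = "openai", model: str = "gpt-4") -> dict:
    text = gateway.chat(
        [{"role": "user", "content": question}],
        provider=provider, model=model,
    )
    # Stand-in token counts; real code would take these from the API response
    return {"text": text, "prompt_tokens": 12, "completion_tokens": 40}

ask("What is LLM Ops?", provider="openai", model="gpt-4")
print(monitor.get_metrics("openai"))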

LLM Evaluation

Quality Metrics

# LLM evaluation framework
class LLMEvaluator:
    """Evaluate LLM outputs"""

    def __init__(self, llm_gateway: LLMGateway):
        self.gateway = llm_gateway

    def _score(self, eval_prompt: str) -> float:
        """Ask the judge LLM for a 1-5 rating and normalize to 0-1"""
        answer = self.gateway.chat(
            messages=[{"role": "user", "content": eval_prompt}],
            temperature=0,
        )
        return float(answer.strip()) / 5

    def evaluate_relevance(
        self,
        prompt: str,
        response: str,
        reference: str,
    ) -> float:
        """Evaluate response relevance against a reference answer"""
        eval_prompt = f"""
Rate the relevance of the following response to the prompt.
Prompt: {prompt}
Reference answer: {reference}
Response: {response}
Rate from 1 (not relevant) to 5 (highly relevant).
Output only the number.
"""
        return self._score(eval_prompt)

    def evaluate_coherence(self, response: str) -> float:
        """Evaluate response coherence"""
        eval_prompt = f"""
Rate the coherence of the following response.
Response: {response}
Rate from 1 (not coherent) to 5 (highly coherent).
Output only the number.
"""
        return self._score(eval_prompt)

    def evaluate_safety(self, response: str) -> float:
        """Evaluate response safety with the OpenAI moderation API"""
        from openai import OpenAI
        client = OpenAI()
        moderation = client.moderations.create(input=response)
        # 1.0 if the moderation endpoint raised no flags
        return 0.0 if moderation.results[0].flagged else 1.0
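
A short usage sketch that runs all three checks over a single hand-written test case (illustrative only, not real evaluation data):

evaluator = LLMEvaluator(gateway)
case = {
    "prompt": "What is RAG?",
    "response": "RAG retrieves relevant documents and passes them to an LLM.",
    "reference": "Retrieval-augmented generation grounds LLM answers in retrieved documents.",
}
print({
    "relevance": evaluator.evaluate_relevance(case["prompt"], case["response"], case["reference"]),
    "coherence": evaluator.evaluate_coherence(case["response"]),
    "safety": evaluator.evaluate_safety(case["response"]),
})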

LLM Cost Optimization

Cost Strategies

# LLM cost optimization
import re
from typing import Optional

class LLMCostOptimizer:
    """Optimize LLM costs"""

    def __init__(self, gateway: LLMGateway):
        self.gateway = gateway

    def choose_model(
        self,
        task: str,
        complexity: str,  # "simple", "medium", or "complex"
    ) -> str:
        """Choose the cheapest model that fits the task"""
        if task == "chat":
            if complexity == "simple":
                return "gpt-3.5-turbo"  # cheapest
            elif complexity == "medium":
                return "gpt-4"  # balanced
            else:  # complex
                return "gpt-4-turbo"  # most capable
        elif task == "embed":
            return "text-embedding-3-small"  # cost-optimized
        return "gpt-3.5-turbo"

    def optimize_prompt(self, prompt: str) -> str:
        """Trim a prompt to reduce token count.

        A fuller version would also drop redundant instructions and
        unneeded few-shot examples; this only normalizes whitespace.
        """
        optimized = prompt.strip()
        optimized = re.sub(r'\s+', ' ', optimized)
        return optimized

    def use_cache(self, prompt: str) -> Optional[str]:
        """Check a cache for a previous response.

        Stub; a Redis-backed version is sketched below.
        """
        return None
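
One way `use_cache` could be backed in practice. A minimal sketch assuming a local Redis instance and the redis Python package; keys hash the model-prompt pair, and entries expire after a TTL so stale answers age out:

import hashlib
import json
import redis  # assumption: redis-py installed, Redis running locally

class ResponseCache:
    """Hash-keyed LLM response cache with expiry"""

    def __init__(self, ttl_seconds: int = 3600):
        self.client = redis.Redis()
        self.ttl = ttl_seconds

    def _key(self, model: str, prompt: str) -> str:
        digest = hashlib.sha256(f"{model}:{prompt}".encode()).hexdigest()
        return f"llm:response:{digest}"

    def get(self, model: str, prompt: str):
        """Return the cached response, or None on a miss"""
        cached = self.client.get(self._key(model, prompt))
        return json.loads(cached) if cached else None

    def set(self, model: str, prompt: str, response: str):
        """Store a response with a TTL"""
        self.client.setex(self._key(model, prompt), self.ttl, json.dumps(response))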

LLM Best Practices

DO

  1. Use the gateway pattern: a unified API for multiple providers
  2. Monitor everything: tokens, cost, latency, errors
  3. Cache aggressively: embeddings and repeated responses
  4. Choose the appropriate model: balance cost and quality
  5. Evaluate quality: relevance, coherence, safety

DON’T

  1. Don't skip monitoring: it is essential for operating LLM systems
  2. Don't ignore costs: LLM spend adds up quickly
  3. Don't use the wrong model: choose based on task complexity
  4. Don't skip evaluation: quality degrades over time
  5. Don't hardcode prompts: use versioned templates



Key Takeaways

  1. RAG: Combine LLMs with external knowledge retrieval
  2. Prompt management: Version control, testing, deployment
  3. Embeddings: Scalable generation with caching
  4. Gateway: Unified API for multiple LLM providers
  5. Monitoring: Token usage, cost, latency, errors
  6. Evaluation: Relevance, coherence, safety metrics
  7. Cost optimization: Model selection, prompt optimization, caching
  8. Use when: Production LLM systems, chatbots, and search

Back to Module 5