
RAG Architecture

Retrieval-Augmented Generation for Production LLM Systems


Overview

Retrieval-Augmented Generation (RAG) combines large language models (LLMs) with external knowledge retrieval. It lets an LLM ground its answers in up-to-date, domain-specific information without fine-tuning, which makes it well suited to enterprise use cases such as question answering over internal documents.


RAG Architecture

End-to-End Pipeline

Key Components:

  • Vector Database: Stores document embeddings for semantic search
  • Retrieval Strategy: Semantic search, hybrid search, metadata filtering
  • Context Building: Prompt construction with retrieved documents
  • LLM Generation: Text generation with context
  • Citations: Source attribution for transparency

RAG Patterns

Naive RAG

Naive RAG Implementation:

import openai
from pinecone import Pinecone

# Initialize clients
llm_client = openai.OpenAI(api_key="your-api-key")
vector_db = Pinecone(api_key="your-pinecone-api-key")
index = vector_db.Index("documents")


def naive_rag(query: str, top_k: int = 5) -> str:
    """Naive RAG: simple retrieve-and-generate."""
    # 1. Embed the query
    query_vector = llm_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query
    ).data[0].embedding

    # 2. Retrieve documents
    results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True
    )

    # 3. Build context
    context = "\n".join([
        f"Document {i+1}: {match['metadata']['text']}\n"
        for i, match in enumerate(results['matches'])
    ])

    # 4. Generate a response
    prompt = f"""
Context:
{context}

Question: {query}

Answer:
"""
    response = llm_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content
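
A one-line usage sketch (the question is just an illustrative placeholder):

print(naive_rag("What is the refund policy for annual plans?", top_k=5))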

Advanced RAG

Advanced RAG Implementation:

from typing import List, Dict


class AdvancedRAG:
    """Advanced RAG with query rewriting, reranking, and context compression"""

    def __init__(self, vector_index, llm_client):
        self.index = vector_index
        self.llm = llm_client
        self.conversation_history = []

    def query_rewriting(self, query: str) -> List[str]:
        """Rewrite the query for better retrieval"""
        # Generate multiple query variations
        prompt = f"""
Generate 3 different ways to ask this question:
{query}
Format: One query per line
"""
        response = self.llm.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )
        queries = [query]  # Include the original query
        queries.extend(
            q.strip()
            for q in response.choices[0].message.content.split("\n")
            if q.strip()  # drop blank lines from the LLM output
        )
        return queries

    def hybrid_search(self, queries: List[str], top_k: int = 10) -> List[Dict]:
        """Multi-query retrieval (vector search only; a BM25 keyword leg can be added for true hybrid search)"""
        all_results = []
        for query in queries:
            # Vector search
            query_vector = self.llm.embeddings.create(
                model="text-embedding-ada-002",
                input=query
            ).data[0].embedding
            vector_results = self.index.query(
                vector=query_vector,
                top_k=top_k,
                include_metadata=True
            )
            all_results.extend(vector_results['matches'])
        # Remove duplicates by document ID
        unique_results = {r['id']: r for r in all_results}.values()
        return list(unique_results)

    def reciprocal_rerank(self, results: List[Dict], k: int = 60) -> List[Dict]:
        """Reciprocal rank fusion (RRF) for result fusion"""
        scores = {}
        for i, result in enumerate(results):
            doc_id = result['id']
            if doc_id not in scores:
                scores[doc_id] = {'doc': result, 'score': 0}
            # RRF score: 1 / (k + rank)
            scores[doc_id]['score'] += 1 / (k + i + 1)
        # Sort by fused score
        reranked = sorted(
            scores.values(),
            key=lambda x: x['score'],
            reverse=True
        )
        return [r['doc'] for r in reranked[:10]]

    def context_compression(self, documents: List[Dict], query: str) -> str:
        """Compress the retrieved context using the LLM"""
        # Concatenate documents
        docs_text = "\n".join(
            f"Doc {i+1}: {doc['metadata']['text']}"
            for i, doc in enumerate(documents)
        )
        # Compress with the LLM
        prompt = f"""
Compress the following documents into a concise summary
relevant to the query: {query}
Documents:
{docs_text}
Summary:
"""
        response = self.llm.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1000
        )
        return response.choices[0].message.content

    def generate_with_citations(self, query: str, context: str, sources: List[Dict]) -> str:
        """Generate a response with citations"""
        # Format citations
        citations = "\n".join(
            f"[{i+1}] {doc['metadata']['source']}"
            for i, doc in enumerate(sources)
        )
        # Generate the response
        prompt = f"""
Answer the following question based on the context provided.
Include citations in your answer using [1], [2], etc.
Context:
{context}
Question: {query}
Sources:
{citations}
Answer:
"""
        response = self.llm.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )
        return response.choices[0].message.content

    def run(self, query: str) -> str:
        """Run the advanced RAG pipeline"""
        # 1. Query rewriting
        queries = self.query_rewriting(query)
        # 2. Multi-query retrieval
        results = self.hybrid_search(queries)
        # 3. Reranking
        reranked = self.reciprocal_rerank(results)
        # 4. Context compression
        compressed_context = self.context_compression(reranked, query)
        # 5. Generate with citations
        response = self.generate_with_citations(
            query,
            compressed_context,
            reranked
        )
        return response
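
A minimal usage sketch, reusing the index and llm_client objects created in the naive RAG example above (the question itself is just an illustrative placeholder):

rag = AdvancedRAG(vector_index=index, llm_client=llm_client)
answer = rag.run("Which retention policy applies to customer invoices?")
print(answer)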

RAG Retrieval Strategies

Metadata Filtering

# Metadata filtering for retrieval
def filtered_rag(query: str, category: str, date_from: str) -> str:
    """RAG with metadata filtering"""
    # Embed the query
    query_vector = llm_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query
    ).data[0].embedding
    # Search with a metadata filter
    results = index.query(
        vector=query_vector,
        top_k=10,
        filter={
            "category": {"$eq": category},
            "date": {"$gte": date_from}
        },
        include_metadata=True
    )
    # Generate response
    # ... (same as naive RAG)


# Hybrid search: Vector + BM25
def hybrid_rag(query: str, alpha: float = 0.7) -> str:
    """Hybrid RAG: Vector + BM25"""
    # Embed the query for vector search
    query_vector = llm_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query
    ).data[0].embedding
    # Vector search
    vector_results = index.query(
        vector=query_vector,
        top_k=50,
        include_metadata=True
    )
    # BM25 keyword search (if supported by the index)
    # keyword_results = index.search(query, search_method="bm25")
    # Combine scores:
    # combined_score = alpha * vector_score + (1 - alpha) * keyword_score
    # Rerank and select the top K
    # ...
    # Generate response
    # ...
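
The score-combination step above is left as a comment. A minimal sketch of that fusion, assuming the documents are also indexed locally with the rank_bm25 package and that the vector hits carry similarity scores (function and variable names here are illustrative, not part of any library API):

from rank_bm25 import BM25Okapi  # assumed dependency: the rank-bm25 package

def fuse_scores(vector_hits, corpus_texts, corpus_ids, query, alpha=0.7, top_k=10):
    """Combine normalized BM25 and vector scores: alpha * vector + (1 - alpha) * keyword."""
    # BM25 over a locally held copy of the corpus (corpus_ids and corpus_texts are parallel lists)
    bm25 = BM25Okapi([text.split() for text in corpus_texts])
    bm25_scores = bm25.get_scores(query.split())
    max_bm25 = max(bm25_scores) or 1.0  # avoid division by zero when nothing matches
    bm25_by_id = {doc_id: score / max_bm25 for doc_id, score in zip(corpus_ids, bm25_scores)}

    combined = []
    for hit in vector_hits:
        vec_score = hit['score']                   # similarity score from the vector index
        kw_score = bm25_by_id.get(hit['id'], 0.0)  # 0 if the doc has no keyword overlap
        combined.append((alpha * vec_score + (1 - alpha) * kw_score, hit))

    combined.sort(key=lambda pair: pair[0], reverse=True)
    return [hit for _, hit in combined[:top_k]]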

Query Expansion

# Query expansion for better retrieval
def expanded_rag(query: str) -> str:
    """RAG with query expansion"""
    # Generate query expansions
    expansion_prompt = f"""
Generate 3 different ways to ask this question:
{query}
Format: One query per line
"""
    response = llm_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": expansion_prompt}],
        temperature=0.7
    )
    expanded_queries = [query]
    expanded_queries.extend(response.choices[0].message.content.split("\n"))
    # Search with every query variant
    all_results = []
    for q in expanded_queries:
        q_vector = llm_client.embeddings.create(
            model="text-embedding-ada-002",
            input=q
        ).data[0].embedding
        results = index.query(
            vector=q_vector,
            top_k=10,
            include_metadata=True
        )
        all_results.extend(results['matches'])
    # Remove duplicates and rerank
    unique_results = {r['id']: r for r in all_results}.values()
    # Generate a response with the top results
    # ...

RAG Evaluation

Evaluation Metrics

# RAG evaluation framework
from typing import List, Dict
import numpy as np


class RAGEvaluator:
    """Evaluate RAG system performance"""

    def __init__(self, ground_truth: List[str]):
        # ground_truth: IDs of the documents known to be relevant to the query
        self.ground_truth = ground_truth

    def retrieval_precision(self, retrieved: List[Dict], top_k: int = 10) -> float:
        """Precision@K: how many of the retrieved docs are relevant?"""
        relevant_retrieved = 0
        for doc in retrieved[:top_k]:
            if doc['id'] in self.ground_truth:
                relevant_retrieved += 1
        return relevant_retrieved / top_k

    def retrieval_recall(self, retrieved: List[Dict]) -> float:
        """Recall: how many of the relevant docs were retrieved?"""
        relevant_retrieved = 0
        for doc in retrieved:
            if doc['id'] in self.ground_truth:
                relevant_retrieved += 1
        return relevant_retrieved / len(self.ground_truth)

    def retrieval_mrr(self, retrieved: List[Dict]) -> float:
        """Mean Reciprocal Rank: where does the first relevant doc appear?"""
        for i, doc in enumerate(retrieved):
            if doc['id'] in self.ground_truth:
                return 1 / (i + 1)
        return 0

    def retrieval_ndcg(self, retrieved: List[Dict], top_k: int = 10) -> float:
        """Normalized Discounted Cumulative Gain"""
        dcg = 0
        for i, doc in enumerate(retrieved[:top_k]):
            if doc['id'] in self.ground_truth:
                dcg += 1 / np.log2(i + 2)  # i+2 because log2(1) = 0
        # Ideal DCG: all top-K positions filled with relevant docs
        idcg = sum(1 / np.log2(i + 2) for i in range(top_k))
        return dcg / idcg

    def answer_relevance(self, generated_answer: str, reference_answer: str) -> float:
        """Answer relevance: semantic similarity with a reference answer"""
        # Embed both answers (uses the module-level llm_client defined earlier)
        gen_emb = llm_client.embeddings.create(
            model="text-embedding-ada-002",
            input=generated_answer
        ).data[0].embedding
        ref_emb = llm_client.embeddings.create(
            model="text-embedding-ada-002",
            input=reference_answer
        ).data[0].embedding
        # Cosine similarity
        similarity = np.dot(gen_emb, ref_emb) / (
            np.linalg.norm(gen_emb) * np.linalg.norm(ref_emb)
        )
        return similarity

    def faithfulness(self, generated_answer: str, context: str) -> float:
        """Faithfulness: is the answer supported by the context?"""
        prompt = f"""
Determine if the following answer is faithful to the context.
Return "FAITHFUL" or "NOT_FAITHFUL".
Context:
{context}
Answer:
{generated_answer}
Verdict:
"""
        response = llm_client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        verdict = response.choices[0].message.content.upper()
        # Check NOT_FAITHFUL first, since "FAITHFUL" is a substring of it
        return 0.0 if "NOT_FAITHFUL" in verdict else (1.0 if "FAITHFUL" in verdict else 0.0)
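
A quick usage sketch against a hypothetical test case (the document IDs and query_vector are placeholders; retrieved is the match list returned by any of the retrieval functions above):

# Hypothetical ground truth: these document IDs are known to be relevant
evaluator = RAGEvaluator(ground_truth=["doc-3", "doc-7"])

retrieved = index.query(vector=query_vector, top_k=10, include_metadata=True)['matches']

print("Precision@10:", evaluator.retrieval_precision(retrieved, top_k=10))
print("Recall:", evaluator.retrieval_recall(retrieved))
print("MRR:", evaluator.retrieval_mrr(retrieved))
print("NDCG@10:", evaluator.retrieval_ndcg(retrieved, top_k=10))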

RAG Best Practices

DO

# 1. Use hybrid search
# Vector + BM25 for best results
# 2. Implement reranking
# Reciprocal rank fusion, cross-encoder reranking (cross-encoder sketch after this list)
# 3. Use metadata filtering
# Pre-filter for better performance
# 4. Compress context
# LLM-based summarization to reduce tokens
# 5. Add citations
# Source attribution for transparency
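
Reciprocal rank fusion is implemented above; for the cross-encoder option, a minimal sketch using the sentence-transformers CrossEncoder class could look like this (the checkpoint name is a commonly used public MS MARCO model, and the function itself is illustrative):

from sentence_transformers import CrossEncoder

reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def cross_encoder_rerank(query, matches, top_k=10):
    """Score each (query, passage) pair jointly and keep the top_k passages."""
    pairs = [(query, m['metadata']['text']) for m in matches]
    scores = reranker.predict(pairs)
    ranked = sorted(zip(scores, matches), key=lambda pair: pair[0], reverse=True)
    return [match for _, match in ranked[:top_k]]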

DON’T

# 1. Don't use only vector search
# Hybrid search is better
# 2. Don't ignore query rewriting
# Query expansion improves retrieval
# 3. Don't skip evaluation
# Essential for production
# 4. Don't forget context window limits
# Compress long contexts
# 5. Don't ignore conversation history
# Multi-turn conversations need memory
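
The AdvancedRAG class above carries a conversation_history list but never uses it; one common way to use it is to condense the history plus the new turn into a standalone retrieval query. A minimal sketch, assuming the same llm_client as earlier (the prompt wording and helper name are illustrative):

def condense_query(conversation_history, new_question):
    """Rewrite a follow-up question into a standalone query using the chat history."""
    history_text = "\n".join(
        f"{turn['role']}: {turn['content']}" for turn in conversation_history
    )
    prompt = f"""
Given the conversation below, rewrite the final question so it can be
understood on its own, without the conversation.
Conversation:
{history_text}
Final question: {new_question}
Standalone question:
"""
    response = llm_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content.strip()

# Usage: retrieve with the condensed query, then append both turns to the history
# standalone = condense_query(rag.conversation_history, "And what about Europe?")
# answer = rag.run(standalone)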

Key Takeaways

  1. Naive RAG: Simple retrieve and generate pipeline
  2. Advanced RAG: Query rewriting, reranking, context compression
  3. Hybrid search: Vector + BM25 for best results
  4. Metadata filtering: Pre-filter for better performance
  5. Reranking: Reciprocal rank fusion, cross-encoder reranking
  6. Context compression: LLM-based summarization
  7. Evaluation: Precision, recall, MRR, NDCG, faithfulness
  8. Use When: Question answering, knowledge search, document chat
