RAG Architecture
Retrieval-Augmented Generation for Production LLM Systems
Overview
Retrieval-Augmented Generation (RAG) combines large language models (LLMs) with external knowledge retrieval. It lets an LLM draw on up-to-date, domain-specific information without fine-tuning, which makes it well suited to enterprise applications such as question answering, knowledge search, and document chat.
RAG Architecture
End-to-End Pipeline
Key Components:
- Vector Database: Stores document embeddings for semantic search (see the indexing sketch after this list)
- Retrieval Strategy: Semantic search, hybrid search, metadata filtering
- Context Building: Prompt construction with retrieved documents
- LLM Generation: Text generation with context
- Citations: Source attribution for transparency
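The retrieval stage assumes documents have already been chunked, embedded, and stored in the vector database; the examples below start from an existing index. As a reference point, here is a minimal indexing sketch under the same assumptions as the later code (an OpenAI client, a Pinecone index named "documents", and the text-embedding-ada-002 model). The index_document helper and the fixed-size chunking are illustrative choices, not a prescribed approach.

```python
# Minimal indexing sketch. Assumptions: a Pinecone index named "documents"
# sized for text-embedding-ada-002 vectors (1536 dimensions).
import openai
from pinecone import Pinecone

llm_client = openai.OpenAI(api_key="your-api-key")
index = Pinecone(api_key="your-pinecone-api-key").Index("documents")

def index_document(doc_id: str, text: str, source: str, chunk_size: int = 1000) -> None:
    """Split a document into fixed-size chunks, embed each chunk, and upsert it."""
    # Naive fixed-size chunking; production systems usually chunk on sentence
    # or section boundaries, often with overlap.
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

    vectors = []
    for i, chunk in enumerate(chunks):
        embedding = llm_client.embeddings.create(
            model="text-embedding-ada-002",
            input=chunk
        ).data[0].embedding
        vectors.append({
            "id": f"{doc_id}-{i}",
            "values": embedding,
            # Store the chunk text and source so retrieval can return them
            "metadata": {"text": chunk, "source": source}
        })

    index.upsert(vectors=vectors)
```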
RAG Patterns
Naive RAG
Naive RAG Implementation:
```python
import openai
from pinecone import Pinecone

# Initialize clients
llm_client = openai.OpenAI(api_key="your-api-key")
vector_db = Pinecone(api_key="your-pinecone-api-key")
index = vector_db.Index("documents")

def naive_rag(query: str, top_k: int = 5) -> str:
    """Naive RAG: simple retrieve-and-generate"""

    # 1. Embed the query
    query_vector = llm_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query
    ).data[0].embedding

    # 2. Retrieve documents
    results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True
    )

    # 3. Build the context from the retrieved chunks
    context = "\n".join(
        f"Document {i+1}: {match['metadata']['text']}"
        for i, match in enumerate(results['matches'])
    )

    # 4. Generate the response
    prompt = f"""
Context:
{context}

Question: {query}

Answer:"""

    response = llm_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content
```
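A minimal call to the function above might look like this; the question is hypothetical and assumes the index already holds relevant chunks with a `text` metadata field.

```python
# Hypothetical query against the "documents" index
answer = naive_rag("What is the refund policy for enterprise customers?", top_k=5)
print(answer)
```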
Advanced RAG

Advanced RAG Implementation:
```python
from typing import List, Dict
import numpy as np

class AdvancedRAG:
    """Advanced RAG with query rewriting, reranking, and context compression"""

    def __init__(self, vector_index, llm_client):
        self.index = vector_index
        self.llm = llm_client
        self.conversation_history = []

    def query_rewriting(self, query: str) -> List[str]:
        """Rewrite the query for better retrieval"""

        # Generate multiple query variations
        prompt = f"""
Generate 3 different ways to ask this question: {query}

Format: one query per line"""

        response = self.llm.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )

        queries = [query]  # Include the original query
        queries.extend(response.choices[0].message.content.split("\n"))
        return queries

    def hybrid_search(self, queries: List[str], top_k: int = 10) -> List[Dict]:
        """Hybrid search: vector + BM25"""

        all_results = []

        for query in queries:
            # Vector search (only the vector leg is shown here; a keyword/BM25
            # leg would be fused in as well for a true hybrid search)
            query_vector = self.llm.embeddings.create(
                model="text-embedding-ada-002",
                input=query
            ).data[0].embedding

            vector_results = self.index.query(
                vector=query_vector,
                top_k=top_k,
                include_metadata=True
            )

            all_results.extend(vector_results['matches'])

        # Remove duplicates by document ID
        unique_results = {r['id']: r for r in all_results}.values()

        return list(unique_results)

    def reciprocal_rerank(self, results: List[Dict], k: int = 60) -> List[Dict]:
        """Reciprocal rank fusion (RRF) for result fusion"""

        scores = {}

        for i, result in enumerate(results):
            doc_id = result['id']

            if doc_id not in scores:
                scores[doc_id] = {'doc': result, 'score': 0}

            # RRF score: 1 / (k + rank)
            scores[doc_id]['score'] += 1 / (k + i + 1)

        # Sort by fused score
        reranked = sorted(
            scores.values(),
            key=lambda x: x['score'],
            reverse=True
        )

        return [r['doc'] for r in reranked[:10]]

    def context_compression(self, documents: List[Dict], query: str) -> str:
        """Compress the context using the LLM"""

        # Concatenate documents
        docs_text = "\n".join(
            f"Doc {i+1}: {doc['metadata']['text']}"
            for i, doc in enumerate(documents)
        )

        # Compress with the LLM
        prompt = f"""
Compress the following documents into a concise summary relevant to the query: {query}

Documents:
{docs_text}

Summary:"""

        response = self.llm.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=1000
        )

        return response.choices[0].message.content

    def generate_with_citations(self, query: str, context: str, sources: List[Dict]) -> str:
        """Generate a response with citations"""

        # Format citations
        citations = "\n".join(
            f"[{i+1}] {doc['metadata']['source']}"
            for i, doc in enumerate(sources)
        )

        # Generate the response
        prompt = f"""
Answer the following question based on the context provided.
Include citations in your answer using [1], [2], etc.

Context:
{context}

Question: {query}

Sources:
{citations}

Answer:"""

        response = self.llm.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )

        return response.choices[0].message.content

    def run(self, query: str) -> str:
        """Run the advanced RAG pipeline"""

        # 1. Query rewriting
        queries = self.query_rewriting(query)

        # 2. Hybrid search
        results = self.hybrid_search(queries)

        # 3. Reranking
        reranked = self.reciprocal_rerank(results)

        # 4. Context compression
        compressed_context = self.context_compression(reranked, query)

        # 5. Generate with citations
        response = self.generate_with_citations(query, compressed_context, reranked)

        return response
```
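Wiring the class to the clients from the naive example might look like the following; the question is hypothetical.

```python
# Reuse the llm_client and Pinecone index created in the naive RAG example
rag = AdvancedRAG(vector_index=index, llm_client=llm_client)

# Hypothetical query; runs rewriting, retrieval, RRF, compression, and generation
answer = rag.run("How do I rotate API keys for the billing service?")
print(answer)
```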
RAG Retrieval Strategies

Metadata Filtering
```python
# Metadata filtering for retrieval
def filtered_rag(query: str, category: str, date_from: str) -> str:
    """RAG with metadata filtering"""

    # Embed the query
    query_vector = llm_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query
    ).data[0].embedding

    # Search with a metadata filter
    results = index.query(
        vector=query_vector,
        top_k=10,
        filter={
            "category": {"$eq": category},
            "date": {"$gte": date_from}
        },
        include_metadata=True
    )

    # Generate the response
    # ... (same as naive RAG)
```

Hybrid Search
```python
# Hybrid search: vector + BM25
def hybrid_rag(query: str, alpha: float = 0.7) -> str:
    """Hybrid RAG: vector + BM25"""

    # Embed the query for vector search
    query_vector = llm_client.embeddings.create(
        model="text-embedding-ada-002",
        input=query
    ).data[0].embedding

    # Vector search
    vector_results = index.query(
        vector=query_vector,
        top_k=50,
        include_metadata=True
    )

    # BM25 / keyword search (if supported by the vector store)
    # keyword_results = index.search(query, search_method="bm25")

    # Combine scores
    # combined_score = alpha * vector_score + (1 - alpha) * keyword_score

    # Rerank and select top K
    # ...

    # Generate the response
    # ...
```
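The commented-out keyword and fusion steps depend on what the vector store supports. If it only exposes vector search, one option is to run BM25 locally over the retrieved candidates and fuse the scores yourself. Below is a sketch using the rank_bm25 package; the whitespace tokenization, min-max normalization, and alpha weighting are assumptions, not part of the original example.

```python
from typing import Dict, List

import numpy as np
from rank_bm25 import BM25Okapi

def fuse_scores(query: str, vector_matches: List[Dict], alpha: float = 0.7) -> List[Dict]:
    """Fuse vector similarity scores with local BM25 scores over the candidates."""
    if not vector_matches:
        return []

    texts = [m['metadata']['text'] for m in vector_matches]

    # Whitespace tokenization is a simplification; use a proper tokenizer in practice
    bm25 = BM25Okapi([t.lower().split() for t in texts])
    bm25_scores = np.array(bm25.get_scores(query.lower().split()), dtype=float)
    vec_scores = np.array([m['score'] for m in vector_matches], dtype=float)

    # Min-max normalize both score lists so the weighted sum is comparable
    def normalize(x: np.ndarray) -> np.ndarray:
        return (x - x.min()) / (x.max() - x.min() + 1e-9)

    combined = alpha * normalize(vec_scores) + (1 - alpha) * normalize(bm25_scores)

    # Return candidates sorted by fused score, best first
    order = np.argsort(combined)[::-1]
    return [vector_matches[i] for i in order]
```

Note that this only re-scores the vector-retrieved candidates; a full hybrid setup would also run the keyword search over the whole corpus.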
Query Expansion

```python
# Query expansion for better retrieval
def expanded_rag(query: str) -> str:
    """RAG with query expansion"""

    # Generate query expansions
    expansion_prompt = f"""
Generate 3 different ways to ask this question: {query}

Format: one query per line"""

    response = llm_client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": expansion_prompt}],
        temperature=0.7
    )

    expanded_queries = [query]
    expanded_queries.extend(response.choices[0].message.content.split("\n"))

    # Search with every query
    all_results = []
    for q in expanded_queries:
        q_vector = llm_client.embeddings.create(
            model="text-embedding-ada-002",
            input=q
        ).data[0].embedding

        results = index.query(
            vector=q_vector,
            top_k=10,
            include_metadata=True
        )

        all_results.extend(results['matches'])

    # Remove duplicates and rerank
    unique_results = {r['id']: r for r in all_results}.values()

    # Generate the response with the top results
    # ...
```

RAG Evaluation
Evaluation Metrics
```python
# RAG evaluation framework
from typing import List, Dict
import numpy as np

class RAGEvaluator:
    """Evaluate RAG system performance"""

    def __init__(self, ground_truth: List[str]):
        # IDs of the documents known to be relevant for the query under test
        self.ground_truth = set(ground_truth)

    def retrieval_precision(self, retrieved: List[Dict], top_k: int = 10) -> float:
        """Precision@K: how many retrieved docs are relevant?"""
        relevant_retrieved = sum(
            1 for doc in retrieved[:top_k] if doc['id'] in self.ground_truth
        )
        return relevant_retrieved / top_k

    def retrieval_recall(self, retrieved: List[Dict]) -> float:
        """Recall: how many relevant docs were retrieved?"""
        relevant_retrieved = sum(
            1 for doc in retrieved if doc['id'] in self.ground_truth
        )
        return relevant_retrieved / len(self.ground_truth)

    def retrieval_mrr(self, retrieved: List[Dict]) -> float:
        """Mean Reciprocal Rank: where is the first relevant doc?"""
        for i, doc in enumerate(retrieved):
            if doc['id'] in self.ground_truth:
                return 1 / (i + 1)
        return 0.0

    def retrieval_ndcg(self, retrieved: List[Dict], top_k: int = 10) -> float:
        """Normalized Discounted Cumulative Gain"""
        dcg = 0.0
        for i, doc in enumerate(retrieved[:top_k]):
            if doc['id'] in self.ground_truth:
                dcg += 1 / np.log2(i + 2)  # i+2 because log2(1) = 0

        # Ideal DCG: all top K positions hold relevant docs
        idcg = sum(1 / np.log2(i + 2) for i in range(top_k))

        return dcg / idcg

    def answer_relevance(self, generated_answer: str, reference_answer: str) -> float:
        """Answer relevance: semantic similarity with a reference answer"""
        # Embed both answers
        gen_emb = llm_client.embeddings.create(
            model="text-embedding-ada-002",
            input=generated_answer
        ).data[0].embedding
        ref_emb = llm_client.embeddings.create(
            model="text-embedding-ada-002",
            input=reference_answer
        ).data[0].embedding

        # Cosine similarity
        return np.dot(gen_emb, ref_emb) / (
            np.linalg.norm(gen_emb) * np.linalg.norm(ref_emb)
        )

    def faithfulness(self, generated_answer: str, context: str) -> float:
        """Faithfulness: is the answer supported by the context?"""
        prompt = f"""
Determine if the following answer is faithful to the context.
Return "FAITHFUL" or "NOT_FAITHFUL".

Context:
{context}

Answer:
{generated_answer}

Verdict:"""

        response = llm_client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        verdict = response.choices[0].message.content
        # Check the negative label first, since "NOT_FAITHFUL" also contains "FAITHFUL"
        if "NOT_FAITHFUL" in verdict:
            return 0.0
        return 1.0 if "FAITHFUL" in verdict else 0.0
```
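A quick usage example with hypothetical document IDs; the retrieved list mimics the match dicts returned by the retriever.

```python
# Hypothetical ground-truth IDs and retrieval results for one test query
evaluator = RAGEvaluator(ground_truth=["doc-3", "doc-7", "doc-9"])
retrieved = [{"id": "doc-7"}, {"id": "doc-1"}, {"id": "doc-3"}, {"id": "doc-4"}]

print(evaluator.retrieval_precision(retrieved, top_k=4))  # 0.5 (2 of 4 are relevant)
print(evaluator.retrieval_recall(retrieved))              # 0.67 (2 of 3 relevant docs found)
print(evaluator.retrieval_mrr(retrieved))                 # 1.0 (first hit at rank 1)
```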
RAG Best Practices

DO
```python
# 1. Use hybrid search
#    Vector + BM25 for best results

# 2. Implement reranking
#    Reciprocal rank fusion, cross-encoder reranking

# 3. Use metadata filtering
#    Pre-filter for better performance

# 4. Compress context
#    LLM-based summarization to reduce tokens

# 5. Add citations
#    Source attribution for transparency
```

DON’T
```python
# 1. Don't use only vector search
#    Hybrid search is better

# 2. Don't ignore query rewriting
#    Query expansion improves retrieval

# 3. Don't skip evaluation
#    Essential for production

# 4. Don't forget context window limits
#    Compress long contexts

# 5. Don't ignore conversation history
#    Multi-turn conversations need memory
```

Key Takeaways
- Naive RAG: Simple retrieve and generate pipeline
- Advanced RAG: Query rewriting, reranking, context compression
- Hybrid search: Vector + BM25 for best results
- Metadata filtering: Pre-filter for better performance
- Reranking: Reciprocal rank fusion, cross-encoder reranking
- Context compression: LLM-based summarization
- Evaluation: Precision, recall, MRR, NDCG, faithfulness
- Use When: Question answering, knowledge search, document chat