Skip to content

Weaviate Guide

Knowledge Graph Vector Database


Overview

Weaviate is an open-source vector database that combines vector search with knowledge graph capabilities. It provides schema-based data organization, semantic search, and GraphQL API, making it ideal for knowledge management and semantic search applications.


Weaviate Architecture

Graph-Based Architecture

Key Components:

  • GraphQL API: Query and mutate data with GraphQL
  • Vector Index: HNSW (Hierarchical Navigable Small World)
  • Schema: Classes, properties, data types
  • Knowledge Graph: References, relations between objects
  • Hybrid Search: Vector + BM25 (keyword) search

Weaviate Installation

Docker Deployment

docker-compose.yml
version: '3.8'
services:
  weaviate:
    image: semitechnologies/weaviate:latest
    ports:
      - "8080:8080"
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'none'
      ENABLE_MODULES: 'text2vec-openai,text2vec-cohere,text2vec-huggingface'
      CLUSTER_HOSTNAME: 'node1'
    volumes:
      - weaviate_data:/var/lib/weaviate
volumes:
  weaviate_data:

Weaviate Cloud

Terminal window
# Weaviate Cloud Services (WCS)
# Sign up: https://console.weaviate.cloud/
# Create cluster
# - Region: us-central1, eu-central1, asia-southeast1
# - Tier: Free (1M vectors), Standard (10M), Enterprise (unlimited)
# Connect to cluster
import weaviate
client = weaviate.Client(
url="https://your-cluster.weaviate.cloud",
auth_client_secret=weaviate.AuthApiKey(api_key="your-api-key")
)

Weaviate Operations

Schema Definition

import weaviate
client = weaviate.Client("http://localhost:8080")
# Define schema
schema = {
"classes": [
{
"class": "Document",
"description": "A document with text and metadata",
"vectorIndexType": "hnsw",
"vectorizer": "text2vec-openai", # Or "none" for custom vectors
"properties": [
{
"name": "content",
"dataType": ["text"],
"description": "Document content"
},
{
"name": "title",
"dataType": ["string"],
"description": "Document title"
},
{
"name": "category",
"dataType": ["string"],
"description": "Document category"
},
{
"name": "author",
"dataType": ["string"],
"description": "Document author"
},
{
"name": "date",
"dataType": ["date"],
"description": "Document date"
},
{
"name": "hasReference",
"dataType": ["Document"],
"description": "Reference to another document"
}
],
"moduleConfig": {
"text2vec-openai": {
"vectorizeClassName": False,
"model": "ada",
"type": "text",
"vectorizePropertyName": False
}
}
}
]
}
# Create schema
client.schema.create(schema)
# Verify schema
schema = client.schema.get()
print(schema)

Insert Objects

# Insert objects with Weaviate
import openai
import numpy as np
# Generate embedding (if using custom vectors)
def embed_text(text):
    response = openai.Embedding.create(
        model="text-embedding-ada-002",
        input=text
    )
    return response['data'][0]['embedding']
# Insert single object
client.data_object.create(
class_name="Document",
data_object={
"content": "Machine learning is a subset of artificial intelligence.",
"title": "Introduction to ML",
"category": "AI",
"author": "John Doe",
"date": "2025-01-27T00:00:00Z"
},
vector=embed_text("Machine learning is a subset of artificial intelligence.") # Optional if using vectorizer
)
# Batch insert (v3 client: configure the built-in batcher, then use it as a context manager)
client.batch.configure(batch_size=100, dynamic=True)
with client.batch as batch:
    for i in range(100):
        batch.add_data_object(
            data_object={
                "content": f"Document {i}",
                "title": f"Title {i}",
                "category": "AI" if i % 2 == 0 else "ML",
                "author": f"Author {i}",
                "date": "2025-01-27T00:00:00Z"
            },
            class_name="Document",
            vector=embed_text(f"Document {i}")  # Optional
        )
# Verify insertion
count = client.query.aggregate("Document").with_meta_count().do()
print(f"Total objects: {count['data']['Aggregate']['Document'][0]['meta']['count']}")
# Vector similarity search with Weaviate
query_text = "What is machine learning?"
query_vector = embed_text(query_text)
# Near vector search (semantic search)
result = client.query.get(
"Document",
["content", "title", "category", "author", "date"]
).with_near_vector({
"vector": query_vector,
"certainty": 0.7 # Minimum certainty threshold
}).with_limit(10).do()
# Process results
for obj in result["data"]["Get"]["Document"]:
    print(f"Content: {obj['content']}")
    print(f"Title: {obj['title']}")
    print(f"Category: {obj['category']}")
    print(f"Certainty: {obj.get('_additional', {}).get('certainty', 0):.4f}")
    print("---")
# BM25 search (keyword search)
result = client.query.get(
"Document",
["content", "title", "category"]
).with_bm25(
query="machine learning",
properties=["content", "title"] # Search in these properties
).with_limit(10).do()
# Hybrid search (vector + BM25)
result = client.query.get(
"Document",
["content", "title", "category"]
).with_hybrid(
query="machine learning",
vector=query_vector,
alpha=0.7, # 0 = pure BM25, 1 = pure vector
properties=["content", "title"]
).with_limit(10).do()

GraphQL Queries

# GraphQL query for semantic search
{
  Get {
    Document(
      nearVector: {
        vector: [0.1, 0.2, 0.3, ...]  # 1536 dimensions
        certainty: 0.7
      }
      limit: 10
    ) {
      content
      title
      category
      author
      date
      _additional {
        certainty
        id
        vector
      }
    }
  }
}
# GraphQL query for BM25 search
{
  Get {
    Document(
      bm25: {
        query: "machine learning"
        properties: ["content", "title"]
      }
      limit: 10
    ) {
      content
      title
      category
      _additional {
        score
        id
      }
    }
  }
}
# GraphQL query for hybrid search
{
  Get {
    Document(
      hybrid: {
        query: "machine learning"
        vector: [0.1, 0.2, 0.3, ...]
        alpha: 0.7
        properties: ["content", "title"]
      }
      limit: 10
    ) {
      content
      title
      category
      _additional {
        score
        certainty
        id
      }
    }
  }
}

Weaviate Modules

Vectorizer Modules

# Vectorizer modules for automatic embedding
# 1. text2vec-openai (OpenAI embeddings)
client.schema.create({
"classes": [{
"class": "Document",
"vectorizer": "text2vec-openai",
"moduleConfig": {
"text2vec-openai": {
"model": "ada", # or "text-embedding-3-small/large"
"type": "text",
"vectorizeClassName": False,
"vectorizePropertyName": False
}
},
"properties": [
{
"name": "content",
"dataType": ["text"],
"moduleConfig": {
"text2vec-openai": {
"skip": False,
"vectorizePropertyName": False
}
}
}
]
}]
})
# 2. text2vec-cohere (Cohere embeddings)
"vectorizer": "text2vec-cohere",
"moduleConfig": {
"text2vec-cohere": {
"model": "embed-english-v3.0",
"type": "text",
"truncate": "RIGHT"
}
}
# 3. text2vec-huggingface (Open source embeddings)
"vectorizer": "text2vec-huggingface",
"moduleConfig": {
"text2vec-huggingface": {
"model": "sentence-transformers/all-MiniLM-L6-v2",
"type": "text"
}
}
# 4. multi2vec-clip (Image + text embeddings)
"vectorizer": "multi2vec-clip",
"moduleConfig": {
"multi2vec-clip": {
"imageFields": ["image"],
"textFields": ["text"]
}
}

Reranker Modules

# Reranker modules for result re-ranking
# 1. reranker-cohere (Cohere reranker; the reranker module is configured on the class schema)
result = (
    client.query
    .get("Document", ["content", "title"])
    .with_bm25(query="machine learning")
    .with_limit(100)  # Get more results for reranking
    .with_additional(
        'rerank(property: "content", query: "machine learning") { score }'
    )
    .do()
)
# 2. generative-openai (OpenAI generative search)
result = (
    client.query
    .get("Document", ["content"])
    .with_near_vector({"vector": query_vector})
    .with_limit(5)
    .with_generate(
        single_prompt="Generate a concise summary of this: {content}"
    )
    .do()
)

Weaviate Performance

Index Configuration

# HNSW index configuration
schema = {
"classes": [{
"class": "Document",
"vectorIndexType": "hnsw",
"vectorIndexConfig": {
"maxConnections": 16, # M: Number of bidirectional links
"efConstruction": 64, # Size of dynamic candidate list
"ef": -1, # Search ef (-1 = auto, or specify 10-100)
"dynamicEfMin": 100, # Minimum ef for auto-ef
"dynamicEfMax": 500, # Maximum ef for auto-ef
"dynamicEfAccuracy": 0.90, # Target accuracy for auto-ef
"vectorCacheMaxObjects": 100000000, # Cache size
"flatSearchCutoff": 40000, # Switch to flat search below this
"distance": "cosine" # Distance metric: cosine, dot, l2-squared, hamming, manhattan
},
"properties": [
{"name": "content", "dataType": ["text"]},
{"name": "title", "dataType": ["string"]}
]
}]
}
# HNSW parameters:
# maxConnections (M): Number of bidirectional links (16-32)
# Higher: Better recall, more memory, slower indexing
# efConstruction: Size of dynamic candidate list (32-512)
# Higher: Better recall, slower indexing
# ef: Search ef (10-100, or -1 for auto)
# Higher: Better recall, slower search

Query Optimization

# Query optimization strategies
# 1. Use appropriate search type
# - nearVector: Pure vector search
# - bm25: Pure keyword search
# - hybrid: Vector + keyword (recommended)
# 2. Use hybrid search for best results
result = (
client.query
.get("Document", ["content", "title"])
.with_hybrid(
query="machine learning",
vector=query_vector,
alpha=0.7, # Tune alpha (0-1)
properties=["content", "title"]
)
.with_limit(10)
.do()
)
# 3. Use filters to reduce search space
result = (
client.query
.get("Document", ["content", "title"])
.with_where({
"path": ["category"],
"operator": "Equal",
"valueString": "AI"
})
.with_near_vector({"vector": query_vector})
.with_limit(10)
.do()
)
# 4. Use pagination for large result sets
result = (
client.query
.get("Document", ["content", "title"])
.with_near_vector({"vector": query_vector})
.with_limit(10)
.with_offset(20) # Skip first 20 results
.do()
)
# 5. Use projection to reduce data transfer
result = (
client.query
.get("Document", ["content", "title"]) # Only get these properties
.with_near_vector({"vector": query_vector})
.with_limit(10)
.do()
)

Weaviate Cost Optimization

Self-Hosted vs. Cloud

DeploymentCostComplexityUse Case
Self-hosted (Docker)Free (hardware only)MediumLearning, testing
Self-hosted (K8s)Cloud costs onlyHighProduction, control
Weaviate Cloud (Free)$0LowDevelopment (1M vectors)
Weaviate Cloud (Standard)$25/monthLowProduction (10M vectors)

Cost Optimization

# Cost optimization strategies
# 1. Use self-hosted for cost savings
# Free, only pay for hardware
# 2. Use appropriate vectorizer
# - OpenAI: Best quality, $0.10 per 1M tokens
# - Cohere: Good quality, $0.10 per 1M tokens
# - HuggingFace: Free, lower quality
# 3. Delete old data
# data_object.delete removes a single object by UUID; filtered deletes use batch.delete_objects
client.batch.delete_objects(
    class_name="Document",
    where={
        "path": ["date"],
        "operator": "LessThan",
        "valueDate": "2024-01-01T00:00:00Z"
    }
)
# 4. Use compression
# Weaviate automatically compresses vectors
# 5. Use pagination
# Avoid loading all results at once

Weaviate Security

Authentication

# Weaviate authentication
# 1. API key authentication
client = weaviate.Client(
url="https://your-cluster.weaviate.cloud",
auth_client_secret=weaviate.AuthApiKey(api_key="your-api-key")
)
# 2. OIDC authentication (OpenID Connect)
client = weaviate.Client(
url="https://your-cluster.weaviate.cloud",
auth_client_secret=weaviate.AuthClientPassword(
username="your-username",
password="your-password",
scope="offline_access"
)
)
# 3. Environment variables
import os
weaviate_url = os.environ.get("WEAVIATE_URL")
weaviate_api_key = os.environ.get("WEAVIATE_API_KEY")
client = weaviate.Client(
url=weaviate_url,
auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key)
)

Data Encryption

# Data encryption
# 1. Encryption at rest (automatic with Weaviate Cloud)
# Data is encrypted in storage
# 2. Encryption in transit (HTTPS)
client = weaviate.Client(
url="https://your-cluster.weaviate.cloud", # HTTPS
auth_client_secret=weaviate.AuthApiKey(api_key="your-api-key")
)
# 3. Data privacy
# - Remove PII before embedding
# - Anonymize sensitive information
# - Use data masking

Weaviate Monitoring

Metrics

# Monitor Weaviate performance
# 1. Object count
count = client.query.aggregate("Document").with_meta_count().do()
print(f"Total objects: {count['data']['Aggregate']['Document'][0]['meta']['count']}")
# 2. Shards status
shards = client.schema.get_class_shards("Document")
for shard in shards:
    print(f"Shard: {shard['name']}")
    print(f"Status: {shard['status']}")
# 3. Query performance
import time
start = time.time()
result = client.query.get("Document", ["content"]).with_near_vector({
"vector": query_vector
}).with_limit(10).do()
end = time.time()
print(f"Query time: {end - start:.3f}s")
# 4. Memory usage
# Use Weaviate's Prometheus endpoint for metrics
# http://localhost:8080/metrics

Weaviate Best Practices

DO

# 1. Use schema validation
# Define schema before inserting data
# 2. Use hybrid search
# Best of both worlds (vector + BM25)
# 3. Use filters to reduce search space
# Pre-filter results for better performance
# 4. Use batch insertion
# Faster than single inserts
# 5. Monitor performance
# Track query latency and throughput

DON’T

# 1. Don't skip schema definition
# Schema is required
# 2. Don't ignore index parameters
# M and efConstruction matter
# 3. Don't use low-dimensional vectors
# 768+ dimensions recommended
# 4. Don't forget to backup
# Back up Weaviate data regularly
# 5. Don't use inefficient queries
# Use filters, pagination, projection

Weaviate vs. Alternatives

FeatureWeaviatePineconeMilvuspgvector
Knowledge GraphYes (references)NoNoNo
SchemaRequiredOptionalOptionalOptional
VectorizerBuilt-in modulesExternalExternalExternal
Search TypesVector, BM25, HybridVector onlyVector onlyVector only
GraphQLYesNoNoNo
Open SourceYesNoYesYes
Best ForKnowledge graphs, semantic searchRAG, managed serviceOpen-source, on-premSelf-hosted, PostgreSQL

Key Takeaways

  1. Knowledge graph: Vector search + semantic relationships
  2. Schema-based: Classes, properties, data types
  3. Vectorizers: Built-in modules (OpenAI, Cohere, HuggingFace)
  4. Hybrid search: Vector + BM25 for best results
  5. GraphQL API: Query and mutate with GraphQL
  6. Modules: Vectorizer, reranker, QnA modules
  7. Open source: Self-hosted, free option available
  8. Use When: Knowledge management, semantic search, relationships

Back to Module 5