A Guide to Document Chunking and Vector Search
Building smarter search systems that understand your content
Introduction: Why Traditional Search Falls Short
Imagine you're searching through a massive company knowledge base for information about "machine learning best practices." Traditional keyword search might return hundreds of documents, but you end up scrolling through irrelevant results because:
The term "machine learning" appears in random sentences throughout documents
You get the entire 50-page document when you need a specific section
Important documents are buried because they use synonyms like "AI" or "artificial intelligence"
This is where intelligent document chunking and vector search come to the rescue. Instead of treating documents like black boxes, we break them down intelligently and search through them using AI that understands meaning, not just keywords.
But here's the thing: there are multiple ways to approach this problem, each with its strengths and use cases. Let's examine the primary strategies that production systems employ today.
The Three Main Approaches Explained
1. Fixed-Size Chunking: The Simple Approach
What it is: Cut documents into equal-sized pieces, like slicing bread.
How it works:
from langchain.text_splitter import CharacterTextSplitter
# Simple character-based chunking
def simple_chunking(document, chunk_size=500, chunk_overlap=50):
    """Split *document* into fixed-size chunks on newline boundaries.

    Args:
        document: Raw text to split.
        chunk_size: Target size of each chunk, in characters.
        chunk_overlap: Number of characters shared between adjacent chunks
            to preserve context across the cut.

    Returns:
        A list of text chunks.
    """
    splitter = CharacterTextSplitter(
        separator="\n",
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_text(document)
# Example
document = "Machine learning is revolutionizing healthcare..."
chunks = simple_chunking(document, 200)
# Result: ["Machine learning is revolutionizing healthcare by enabling...",
# "...doctors to diagnose diseases faster. Recent studies show..."]
Real-world example: Netflix might use this approach for subtitles or movie descriptions where the content is relatively uniform.
Pros:
✅ Simple to implement
✅ Predictable memory usage
✅ Works well for uniform content (novels, articles)
Cons:
❌ Cuts through sentences mid-thought
❌ Loses document structure
❌ Poor for complex documents
2. Semantic Chunking: The Smart Approach
What it is: Split documents based on meaning and structure, like organizing a library by topics.
How it works:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
import re
def semantic_chunking_with_langchain(document, doc_type="markdown"):
    """Chunk *document* along its semantic structure.

    Markdown input is first split on header levels 1-3 (the header text is
    carried along as chunk metadata); any section longer than 800 characters
    is then re-split recursively.  Non-markdown input falls back to recursive
    splitting on progressively smaller separators.

    Args:
        document: Full document text.
        doc_type: "markdown" enables header-aware splitting; any other value
            takes the plain-text path.

    Returns:
        A list of dicts, each with 'content', 'type', and 'metadata' keys.
    """
    if doc_type != "markdown":
        # Plain text: recursive splitting with custom separators, coarsest first.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            separators=["\n\n\n", "\n\n", "\n", ".", "!", "?", " ", ""]
        )
        return [
            {
                'content': piece,
                'type': detect_section_type(piece),
                'metadata': {'chunk_index': idx}
            }
            for idx, piece in enumerate(splitter.split_text(document))
        ]

    # Markdown: split on headers first so structure is preserved.
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
        ]
    )
    section_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50
    )

    results = []
    for section in header_splitter.split_text(document):
        section_type = detect_section_type(section.page_content, section.metadata)
        if len(section.page_content) > 800:
            # Oversized section: re-split while keeping its header metadata.
            for idx, piece in enumerate(section_splitter.split_text(section.page_content)):
                results.append({
                    'content': piece,
                    'type': section_type,
                    'metadata': {
                        **section.metadata,
                        'sub_chunk_index': idx,
                        'is_split_chunk': True
                    }
                })
        else:
            results.append({
                'content': section.page_content,
                'type': section_type,
                'metadata': section.metadata
            })
    return results
def detect_section_type(text, existing_metadata=None):
    """Classify a chunk as 'title', 'summary', 'introduction', 'conclusion',
    or generic 'content'.

    Header metadata (produced by a markdown header split) takes priority over
    content-based heuristics: if any metadata key containing 'header' has a
    value matching a section keyword, that classification wins.

    Args:
        text: The chunk's text.
        existing_metadata: Optional dict of chunk metadata; only keys whose
            name contains 'header' are inspected.

    Returns:
        One of 'title', 'summary', 'introduction', 'conclusion', 'content'.
    """
    # Prefer explicit header metadata when available.
    if existing_metadata:
        for key, value in existing_metadata.items():
            if 'header' not in key.lower():
                continue
            header_text = value.lower()
            if any(kw in header_text for kw in ('summary', 'abstract', 'overview')):
                return 'summary'
            # BUGFIX: 'summary' was also listed here, but it was unreachable --
            # the summary branch above always matches it first.
            if any(kw in header_text for kw in ('conclusion', 'results')):
                return 'conclusion'
            if any(kw in header_text for kw in ('introduction', 'background')):
                return 'introduction'

    # Fall back to content-based detection.
    text_lower = text.lower()
    if len(text) < 100 and ':' in text:
        # A short line with a colon reads like a title ("Topic: subtitle").
        return 'title'
    if any(kw in text_lower for kw in ('summary', 'abstract', 'overview')):
        return 'summary'
    if any(kw in text_lower for kw in ('conclusion', 'in conclusion', 'to conclude')):
        return 'conclusion'
    if any(kw in text_lower for kw in ('introduction', 'background')):
        return 'introduction'
    return 'content'
Real-world example: A legal firm's document system where lawyers need to quickly find case summaries vs. detailed legal reasoning, vs. final judgments.
Pros:
✅ Preserves document structure
✅ Enables targeted search (search only in summaries)
✅ Better context preservation
✅ Widely used in production
Cons:
❌ More complex to implement
❌ Requires understanding of document structure
❌ May create uneven chunk sizes
3. Multi-Vector Collections: The Advanced Approach
What it is: Create multiple different representations of the same content, like having multiple indexes for the same library book.
How it works:
def multi_vector_approach(document):
    """Build three parallel vector representations of one document.

    The same content is embedded three ways so different query styles can
    each hit the representation that suits them best.

    Returns:
        Dict with the original 'document' and a 'vectors' dict keyed by
        'semantic', 'keyword', and 'summary'.
    """
    # Semantic vector: captures the overall meaning of the text.
    semantic_vec = create_embedding(document, model='semantic')

    # Keyword vector: prepend extracted keywords to boost exact-term matches.
    keyword_vec = create_embedding(extract_keywords(document) + document, model='keyword')

    # Summary vector: matches high-level concepts via a generated summary.
    summary_vec = create_embedding(generate_summary(document), model='large')

    return {
        'document': document,
        'vectors': {
            'semantic': semantic_vec,
            'keyword': keyword_vec,
            'summary': summary_vec,
        },
    }
# When searching, you can choose which representation to use
def search_with_strategy(query, search_type='semantic'):
    """Dispatch *query* to the vector representation matching *search_type*.

    Args:
        query: Search text.
        search_type: One of 'semantic', 'keyword', or 'conceptual'.

    Returns:
        Results from search_vector for the chosen representation.

    Raises:
        ValueError: If *search_type* is not a recognized strategy.
            (Previously an unknown strategy silently returned None.)
    """
    # Map user-facing strategy names to the stored vector types.
    strategy_to_vector = {
        'semantic': 'semantic',
        'keyword': 'keyword',
        'conceptual': 'summary',
    }
    try:
        vector_type = strategy_to_vector[search_type]
    except KeyError:
        raise ValueError(
            f"Unknown search_type {search_type!r}; "
            f"expected one of {sorted(strategy_to_vector)}"
        ) from None
    return search_vector(query, vector_type=vector_type)
Real-world example: A research platform where the same paper needs to be findable by exact technical terms, general concepts, and semantic similarity.
Pros:
✅ Multiple search strategies for the same content
✅ Can combine different AI models
✅ Handles diverse query types well
Cons:
❌ Much more complex and expensive
❌ Requires multiple embedding API calls
❌ Higher storage costs
❌ Less commonly used in production
Real-World Use Cases: Which Approach When?
E-commerce Platform: Product Search
Scenario: Customers search for products using various terms
Best approach: Semantic chunking with product attribute separation
Product chunks:
- Title: "iPhone 15 Pro Max"
- Features: "6.7-inch display, A17 Pro chip, titanium design"
- Reviews: "Great camera quality, excellent battery life"
- Specifications: "256GB storage, 5G connectivity"
Why: Customers might want to search specifically in reviews ("battery life") or specifications ("storage"), making targeted search valuable.
Legal Document Management
Scenario: Lawyers need to find specific information in thousands of legal documents
Best approach: Semantic chunking with legal document structure
Legal document chunks:
- Case summary: "Plaintiff vs. Defendant regarding contract dispute"
- Facts: "On January 15, 2023, the parties entered into agreement..."
- Legal reasoning: "Under contract law precedent established in..."
- Judgment: "The court finds in favor of plaintiff and awards..."
Why: Legal professionals have specific information needs - they might want only case summaries for quick review or only judgments for precedent research.
Customer Support Knowledge Base
Scenario: Support agents need quick answers to customer questions
Best approach: Multi-vector collections for diverse query handling
Same article about "Password Reset" gets multiple representations:
- Semantic vector: Understands "I can't log in" → password reset
- Keyword vector: Finds exact matches for "forgot password"
- Summary vector: Matches high-level concepts like "account access issues"
Why: Customer questions come in many forms - some use exact terminology, others describe problems in natural language.
Academic Research Platform
Scenario: Researchers search through millions of scientific papers
Best approach: Semantic chunking with academic paper structure
Research paper chunks:
- Abstract: High-level research summary
- Introduction: Problem background and motivation
- Methodology: How the research was conducted
- Results: What was discovered
- Conclusion: Implications and future work
Why: Researchers have different needs - some want quick overviews (abstracts), others need implementation details (methodology).
Best Practices and Common Pitfalls
Do's ✅
Start simple: Begin with semantic chunking before considering multi-vector
Test with real queries: Use actual user queries to evaluate effectiveness
Monitor chunk sizes: Aim for 200-800 tokens per chunk for most embedding models
Preserve context: Include some overlap between chunks to maintain context
Use metadata effectively: Store document source, creation date, author, etc.
Don'ts ❌
Don't over-engineer: Multi-vector isn't always better than good semantic chunking
Don't ignore document structure: Fixed chunking often loses important context
Don't forget evaluation: Measure search quality with real user scenarios
Don't chunk too small: Very small chunks lose context
Don't chunk too large: Very large chunks dilute specific information
Common Pitfalls and How to Avoid Them
Pitfall 1: "More Vectors = Better Results"
Problem: Assuming multi-vector always outperforms simpler approaches
Solution: Start with semantic chunking and only add complexity if you have specific use cases that require it
Pitfall 2: Ignoring Document Structure
Problem: Using fixed chunking on structured documents like research papers
Solution: Analyze your document types and chunk according to their natural structure
Pitfall 3: Not Testing with Real Queries
Problem: Optimizing for theoretical scenarios instead of actual user needs
Solution: Collect real user queries and evaluate your chunking strategy against them
Sample Code: A Complete Multi-Vector Implementation with OpenAI Embeddings and Qdrant
import logging
import os
import uuid
from dataclasses import dataclass
from typing import Any, Dict, List, Optional

import openai
from dotenv import load_dotenv
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    FieldCondition,
    Filter,
    MatchAny,
    MatchValue,
    PointStruct,
    VectorParams,
)
load_dotenv()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class DocumentChunk:
    """Represents a chunk of a document with its metadata"""
    chunk_id: str  # Unique ID, e.g. "<doc_id>_title" or "<doc_id>_para_3"
    content: str  # The chunk's text
    chunk_type: str  # 'summary', 'paragraph', 'title', 'conclusion', etc.
    parent_doc_id: str  # ID of the source document this chunk came from
    chunk_index: int  # Ordering position of the chunk within the parent document
    metadata: Dict[str, Any]  # Per-chunk metadata merged with the document's metadata
class OpenAIEmbeddingService:
    """Thin wrapper around the OpenAI embeddings API.

    Reads the API key from the OPENAI_API_KEY environment variable and
    exposes batch and single-text embedding helpers.
    """

    def __init__(self, model: str = "text-embedding-3-small"):
        """Initialize the OpenAI client and record the model's vector size.

        Args:
            model: Embedding model name.
        """
        self.client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
        self.model = model
        # text-embedding-3-large emits 3072-dim vectors; text-embedding-3-small
        # and the legacy text-embedding-ada-002 both emit 1536.  (The old check
        # `1536 if "3-small" in model else 3072` wrongly reported 3072 for
        # ada-002.)
        self.embedding_dimension = 3072 if "3-large" in model else 1536
        logger.info(f"Initialized OpenAI embedding service with model: {model}")

    def create_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Create embeddings for a batch of texts.

        Args:
            texts: Strings to embed (one API call for the whole batch).

        Returns:
            One embedding vector (list of floats) per input text, in order.

        Raises:
            Exception: Re-raises any API error after logging it.
        """
        try:
            logger.info(f"Creating embeddings for {len(texts)} texts")
            response = self.client.embeddings.create(
                model=self.model,
                input=texts,
                encoding_format="float"
            )
            embeddings = [item.embedding for item in response.data]
            logger.info(f"Successfully created {len(embeddings)} embeddings")
            return embeddings
        except Exception as e:
            logger.error(f"Failed to create embeddings: {str(e)}")
            raise

    def create_single_embedding(self, text: str) -> List[float]:
        """Create embedding for a single text"""
        return self.create_embeddings([text])[0]
class QdrantCollection:
    """
    Multi-vector collection implementation using Qdrant.
    Each document is split into multiple chunks, with each chunk getting its own vector.
    """

    def __init__(self,
                 collection_name: str,
                 qdrant_client: QdrantClient,
                 embedding_service: OpenAIEmbeddingService):
        """Bind the collection name, Qdrant client, and embedding service,
        then make sure the collection exists.

        Args:
            collection_name: Name of the Qdrant collection to use/create.
            qdrant_client: Connected Qdrant client.
            embedding_service: Service used to embed chunks and queries.
        """
        self.collection_name = collection_name
        self.qdrant_client = qdrant_client
        self.embedding_service = embedding_service
        # Create the collection if it doesn't exist
        self._ensure_collection_exists()

    def _ensure_collection_exists(self):
        """Create the Qdrant collection for multi-vector storage if missing."""
        try:
            # Check if collection exists
            collections = self.qdrant_client.get_collections()
            existing_names = [col.name for col in collections.collections]
            if self.collection_name not in existing_names:
                logger.info(f"Creating new collection: {self.collection_name}")
                # Vector size must match the embedding model's output dimension.
                self.qdrant_client.create_collection(
                    collection_name=self.collection_name,
                    vectors_config=VectorParams(
                        size=self.embedding_service.embedding_dimension,
                        distance=Distance.COSINE
                    )
                )
                logger.info(f"✅ Collection '{self.collection_name}' created successfully")
            else:
                logger.info(f"✅ Collection '{self.collection_name}' already exists")
        except Exception as e:
            logger.error(f"Failed to create/verify collection: {str(e)}")
            raise

    def _create_document_chunks(self, doc_id: str, content: str, metadata: Dict) -> List[DocumentChunk]:
        """
        Split document into multiple chunks of different types.
        This is where the 'multi-vector' concept comes into play.

        Heuristics used:
          - title: first line, if shorter than 100 chars
          - summary: first paragraph, if longer than 50 chars
          - paragraph: every paragraph longer than 100 chars
          - conclusion: last paragraph, if it contains a conclusion keyword
        """
        chunks = []
        lines = content.strip().split('\n')
        # Paragraphs are blank-line-separated blocks of text.
        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]

        # 1. Title chunk (first non-empty line if it looks like a title)
        if lines and len(lines[0].strip()) < 100:
            title_chunk = DocumentChunk(
                chunk_id=f"{doc_id}_title",
                content=lines[0].strip(),
                chunk_type="title",
                parent_doc_id=doc_id,
                chunk_index=0,
                metadata={**metadata, "is_title": True}
            )
            chunks.append(title_chunk)

        # 2. Summary chunk (first paragraph as summary)
        if paragraphs:
            summary_content = paragraphs[0]
            if len(summary_content) > 50:  # Only if substantial
                summary_chunk = DocumentChunk(
                    chunk_id=f"{doc_id}_summary",
                    content=f"Summary: {summary_content}",
                    chunk_type="summary",
                    parent_doc_id=doc_id,
                    chunk_index=1,
                    metadata={**metadata, "is_summary": True}
                )
                chunks.append(summary_chunk)

        # 3. Individual paragraph chunks
        for i, paragraph in enumerate(paragraphs):
            if len(paragraph) > 100:  # Only substantial paragraphs
                para_chunk = DocumentChunk(
                    chunk_id=f"{doc_id}_para_{i}",
                    content=paragraph,
                    chunk_type="paragraph",
                    parent_doc_id=doc_id,
                    chunk_index=i + 2,  # After title and summary
                    metadata={**metadata, "paragraph_number": i}
                )
                chunks.append(para_chunk)

        # 4. Conclusion chunk (last paragraph if it contains conclusion keywords)
        if len(paragraphs) > 1:
            last_para = paragraphs[-1].lower()
            conclusion_keywords = ['conclusion', 'summary', 'in conclusion', 'to summarize', 'finally']
            if any(keyword in last_para for keyword in conclusion_keywords):
                conclusion_chunk = DocumentChunk(
                    chunk_id=f"{doc_id}_conclusion",
                    content=paragraphs[-1],
                    chunk_type="conclusion",
                    parent_doc_id=doc_id,
                    chunk_index=len(chunks) + 1,
                    metadata={**metadata, "is_conclusion": True}
                )
                chunks.append(conclusion_chunk)

        logger.info(f"Created {len(chunks)} chunks for document '{doc_id}'")
        return chunks

    def add_document(self, doc_id: str, content: str, metadata: Optional[Dict] = None) -> Dict[str, Any]:
        """
        Add a document to the multi-vector collection.
        This demonstrates the core workflow of multi-vector storage.

        Args:
            doc_id: Stable identifier for the document.
            content: Full document text.
            metadata: Optional extra metadata copied onto every chunk.

        Returns:
            Summary dict: success flag, doc_id, chunk counts and types.

        Raises:
            ValueError: If no valid chunks could be created from the content.
        """
        metadata = metadata or {}
        try:
            logger.info(f"Adding document '{doc_id}' to collection")
            # Step 1: Create multiple chunks from the document
            chunks = self._create_document_chunks(doc_id, content, metadata)
            if not chunks:
                raise ValueError("No valid chunks created from document")

            # Step 2: Generate embeddings for all chunks (single batched call)
            chunk_contents = [chunk.content for chunk in chunks]
            embeddings = self.embedding_service.create_embeddings(chunk_contents)

            # Step 3: Create Qdrant points for each chunk
            points = []
            for chunk, embedding in zip(chunks, embeddings):
                # Prepare payload with chunk metadata
                payload = {
                    "chunk_id": chunk.chunk_id,
                    "content": chunk.content,
                    "chunk_type": chunk.chunk_type,
                    "parent_doc_id": chunk.parent_doc_id,
                    "chunk_index": chunk.chunk_index,
                    **chunk.metadata  # Include all custom metadata
                }
                point = PointStruct(
                    id=str(uuid.uuid4()),  # Unique point ID
                    vector=embedding,
                    payload=payload
                )
                points.append(point)

            # Step 4: Upload points to Qdrant
            self.qdrant_client.upsert(
                collection_name=self.collection_name,
                points=points
            )

            result = {
                "success": True,
                "doc_id": doc_id,
                "chunks_created": len(chunks),
                "chunk_types": [chunk.chunk_type for chunk in chunks],
                "points_uploaded": len(points)
            }
            logger.info(f"✅ Successfully added document '{doc_id}' with {len(chunks)} chunks")
            return result
        except Exception as e:
            logger.error(f"Failed to add document '{doc_id}': {str(e)}")
            raise

    def search(self,
               query: str,
               limit: int = 5,
               chunk_types: Optional[List[str]] = None,
               doc_id_filter: Optional[str] = None) -> List[Dict[str, Any]]:
        """
        Search the multi-vector collection with optional filtering.
        This demonstrates the key advantage of multi-vector collections.

        Args:
            query: Natural-language query text.
            limit: Maximum number of results.
            chunk_types: Restrict results to these chunk types.
            doc_id_filter: Restrict results to one parent document.

        Returns:
            List of result dicts sorted by score (highest first).
        """
        try:
            logger.info(f"Searching for: '{query}' with limit={limit}")
            # Step 1: Create query embedding
            query_embedding = self.embedding_service.create_single_embedding(query)

            # Step 2: Build filter conditions
            filter_conditions = []
            if chunk_types:
                # BUGFIX: the old code passed MatchValue(value={"$in": [...]})
                # for multiple types -- Mongo-style syntax that Qdrant does not
                # understand.  Qdrant's multi-value match is MatchAny(any=[...]).
                if len(chunk_types) == 1:
                    type_match = MatchValue(value=chunk_types[0])
                else:
                    type_match = MatchAny(any=list(chunk_types))
                filter_conditions.append(
                    FieldCondition(key="chunk_type", match=type_match)
                )
                logger.info(f"Filtering by chunk types: {chunk_types}")
            if doc_id_filter:
                # Filter by specific document
                filter_conditions.append(
                    FieldCondition(
                        key="parent_doc_id",
                        match=MatchValue(value=doc_id_filter)
                    )
                )
                logger.info(f"Filtering by document ID: {doc_id_filter}")

            # Combine filters (all conditions must hold)
            search_filter = Filter(must=filter_conditions) if filter_conditions else None

            # Step 3: Perform vector search
            search_results = self.qdrant_client.search(
                collection_name=self.collection_name,
                query_vector=query_embedding,
                query_filter=search_filter,
                limit=limit,
                with_payload=True,
                with_vectors=False  # Don't return vectors to save bandwidth
            )

            # Step 4: Format results
            formatted_results = []
            for result in search_results:
                formatted_result = {
                    "score": result.score,
                    "chunk_id": result.payload.get("chunk_id"),
                    "content": result.payload.get("content"),
                    "chunk_type": result.payload.get("chunk_type"),
                    "parent_doc_id": result.payload.get("parent_doc_id"),
                    "chunk_index": result.payload.get("chunk_index"),
                    # Everything not promoted to a top-level key is metadata.
                    "metadata": {k: v for k, v in result.payload.items()
                                 if k not in ["chunk_id", "content", "chunk_type", "parent_doc_id", "chunk_index"]}
                }
                formatted_results.append(formatted_result)
            logger.info(f"Found {len(formatted_results)} results")
            return formatted_results
        except Exception as e:
            logger.error(f"Search failed: {str(e)}")
            raise

    def get_document_chunks(self, doc_id: str) -> List[Dict[str, Any]]:
        """Retrieve all chunks for a specific document, sorted by chunk index."""
        try:
            logger.info(f"Retrieving chunks for document: {doc_id}")
            # Scroll (not vector search) with a document filter.
            filter_condition = Filter(
                must=[
                    FieldCondition(
                        key="parent_doc_id",
                        match=MatchValue(value=doc_id)
                    )
                ]
            )
            results = self.qdrant_client.scroll(
                collection_name=self.collection_name,
                scroll_filter=filter_condition,
                limit=100,  # Adjust based on expected chunks per document
                with_payload=True,
                with_vectors=False
            )

            chunks = []
            for point in results[0]:  # results is a tuple (points, next_page_offset)
                chunk_info = {
                    "chunk_id": point.payload.get("chunk_id"),
                    "content": point.payload.get("content"),
                    "chunk_type": point.payload.get("chunk_type"),
                    "chunk_index": point.payload.get("chunk_index"),
                    "metadata": {k: v for k, v in point.payload.items()
                                 if k not in ["chunk_id", "content", "chunk_type", "parent_doc_id", "chunk_index"]}
                }
                chunks.append(chunk_info)

            # Sort by chunk index so callers see document order.
            chunks.sort(key=lambda x: x.get("chunk_index", 0))
            logger.info(f"Retrieved {len(chunks)} chunks for document '{doc_id}'")
            return chunks
        except Exception as e:
            logger.error(f"Failed to retrieve chunks for document '{doc_id}': {str(e)}")
            raise

    def get_collection_stats(self) -> Dict[str, Any]:
        """Get statistics about the collection.

        NOTE(review): the scroll is capped at 1000 points, so the chunk-type
        and per-document counts are approximate for larger collections.
        """
        try:
            collection_info = self.qdrant_client.get_collection(self.collection_name)
            # Get chunk type distribution
            chunk_types_result = self.qdrant_client.scroll(
                collection_name=self.collection_name,
                limit=1000,  # Adjust based on your collection size
                with_payload=True,
                with_vectors=False
            )

            chunk_type_counts = {}
            document_counts = {}
            for point in chunk_types_result[0]:
                chunk_type = point.payload.get("chunk_type", "unknown")
                doc_id = point.payload.get("parent_doc_id", "unknown")
                chunk_type_counts[chunk_type] = chunk_type_counts.get(chunk_type, 0) + 1
                document_counts[doc_id] = document_counts.get(doc_id, 0) + 1

            stats = {
                "collection_name": self.collection_name,
                "total_points": collection_info.points_count,
                "vector_size": collection_info.config.params.vectors.size,
                "distance_metric": collection_info.config.params.vectors.distance.value,
                "total_documents": len(document_counts),
                "chunk_type_distribution": chunk_type_counts,
                "avg_chunks_per_document": round(collection_info.points_count / len(document_counts),
                                                 2) if document_counts else 0
            }
            return stats
        except Exception as e:
            logger.error(f"Failed to get collection stats: {str(e)}")
            raise
def main():
    """End-to-end demo: index two sample documents, then run general,
    type-filtered, and document-filtered searches and print the results.

    Requires OPENAI_API_KEY in the environment and a Qdrant instance on
    http://localhost:6333.
    """
    print("=== Semantic search ===\n")
    try:
        print("1. Initializing services...")
        embedding_service = OpenAIEmbeddingService()
        # BUGFIX: removed a redundant local `from qdrant_client import
        # QdrantClient` -- it is already imported at module level.
        qdrant_client = QdrantClient(url="http://localhost:6333")
        collection = QdrantCollection(
            collection_name="semantic_search_demo",
            qdrant_client=qdrant_client,
            embedding_service=embedding_service
        )
        print("✅ Services initialized\n")

        print("2. Adding documents to collection...")
        sample_docs = {
            "ai_overview": """Artificial Intelligence: An Overview
Artificial Intelligence (AI) represents one of the most transformative technologies of our time, fundamentally changing how we interact with machines and process information.
AI encompasses machine learning, natural language processing, computer vision, and robotics. These technologies enable computers to perform tasks that typically require human intelligence.
The applications are vast: from autonomous vehicles and medical diagnosis to financial trading and content recommendation systems.
As AI continues to evolve, it presents both tremendous opportunities and significant challenges that society must carefully navigate.""",
            "machine_learning": """Machine Learning Fundamentals
Machine learning is a subset of artificial intelligence that enables systems to automatically learn and improve from experience without being explicitly programmed.
There are three primary types of machine learning: supervised learning uses labeled training data, unsupervised learning finds patterns in unlabeled data, and reinforcement learning learns through interaction with an environment.
Common algorithms include linear regression, decision trees, neural networks, and support vector machines. Each has strengths for different types of problems.
In conclusion, machine learning forms the backbone of modern AI applications and continues to drive innovation across industries."""
        }
        for doc_id, content in sample_docs.items():
            result = collection.add_document(
                doc_id=doc_id,
                content=content,
                metadata={"topic": "artificial_intelligence", "language": "english"}
            )
            print(f"   Added '{doc_id}': {result['chunks_created']} chunks ({', '.join(result['chunk_types'])})")
        print()

        print("3. Demonstrating search capabilities...\n")

        print("🔍 General search for 'machine learning applications':")
        results = collection.search("machine learning applications", limit=3)
        for i, result in enumerate(results, 1):
            print(f"   {i}. [{result['chunk_type']}] Score: {result['score']:.3f}")
            print(f"      From: {result['parent_doc_id']}")
            print(f"      Content: {result['content'][:80]}...")
        print()

        print("🔍 Search only in summaries for 'artificial intelligence':")
        results = collection.search("artificial intelligence", limit=2, chunk_types=['summary'])
        for i, result in enumerate(results, 1):
            print(f"   {i}. [{result['chunk_type']}] Score: {result['score']:.3f}")
            print(f"      Content: {result['content'][:100]}...")
        print()

        print("🔍 Search only in titles for 'machine learning':")
        results = collection.search("machine learning", limit=2, chunk_types=['title'])
        for i, result in enumerate(results, 1):
            print(f"   {i}. [{result['chunk_type']}] Score: {result['score']:.3f}")
            print(f"      Content: {result['content']}")
        print()

        print("🔍 Search within specific document for 'algorithms':")
        results = collection.search("algorithms", limit=3, doc_id_filter="machine_learning")
        for i, result in enumerate(results, 1):
            print(f"   {i}. [{result['chunk_type']}] Score: {result['score']:.3f}")
            print(f"      Content: {result['content'][:80]}...")
        print()

        print("4. Document structure analysis...\n")
        for doc_id in sample_docs.keys():
            print(f"📄 Document: {doc_id}")
            chunks = collection.get_document_chunks(doc_id)
            for chunk in chunks:
                print(f"   {chunk['chunk_index']}. [{chunk['chunk_type']}] {chunk['content'][:60]}...")
            print()

        print("5. Collection statistics...")
        stats = collection.get_collection_stats()
        print(f"   Collection: {stats['collection_name']}")
        print(f"   Total points: {stats['total_points']}")
        print(f"   Total documents: {stats['total_documents']}")
        print(f"   Avg chunks per document: {stats['avg_chunks_per_document']}")
        print(f"   Chunk type distribution: {stats['chunk_type_distribution']}")
        print()

        print("✅ Semantic search demonstration completed successfully!")
    except Exception as e:
        # Top-level boundary: log and report, don't crash with a traceback.
        logger.error(f"Demo failed: {str(e)}")
        print(f"❌ Demo failed: {str(e)}")


if __name__ == "__main__":
    main()
