
# Advanced RAG Techniques

Now that you've built a basic RAG application, let's explore advanced techniques to enhance its performance, relevance, and efficiency.

## Hybrid Search

MongoDB Atlas supports hybrid search, which combines the strengths of vector search with traditional keyword search.

### Implementation

Update your search function to use hybrid search:

```javascript
async function hybridSearch(query, options = {}) {
  try {
    await rag.connect();

    // Get the embedding for the query
    const embedding = await rag.provider.getEmbedding(query);

    // Build the aggregation pipeline
    const collection = await rag._getCollection();
    const pipeline = [
      {
        $search: {
          compound: {
            should: [
              {
                vectorSearch: {
                  queryVector: embedding,
                  path: "embedding",
                  numCandidates: 100,
                  limit: 10
                }
              },
              {
                text: {
                  query: query,
                  path: "content"
                }
              }
            ]
          }
        }
      },
      {
        $project: {
          documentId: 1,
          content: 1,
          metadata: 1,
          score: { $meta: "searchScore" }
        }
      },
      {
        $limit: options.maxResults || 5
      }
    ];

    // Execute the search
    const results = await collection.aggregate(pipeline).toArray();

    return results.map(r => ({
      documentId: r.documentId,
      content: r.content,
      metadata: r.metadata,
      score: r.score
    }));

  } catch (error) {
    console.error('Hybrid search error:', error);
    throw error;
  } finally {
    await rag.close();
  }
}
```
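
You can call it just like the basic search function from the previous section (this example assumes the same `rag` instance you configured earlier):

```javascript
// Example usage (assumes the `rag` instance from the previous section)
const hybridResults = await hybridSearch("How does MongoDB Atlas scale?", { maxResults: 5 });

hybridResults.forEach(r => {
  console.log(`${r.documentId} (score: ${r.score.toFixed(3)}): ${r.content.substring(0, 80)}...`);
});
```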
Hybrid search offers these benefits:

- Improved recall: Finds relevant documents that might be missed by vector search alone
- Better handling of specialized terms: Exact keyword matching for technical terms, product names, etc.
- Reduced sensitivity to embedding quality: Less dependent on the quality of the embedding model
- Multi-language support: Works well across languages and specialized domains

## Advanced Document Chunking

Let's implement a more sophisticated chunking strategy that respects document structure:

```javascript
const natural = require('natural');

class AdvancedChunker {
  constructor(options = {}) {
    this.options = {
      maxChunkSize: options.maxChunkSize || 500,
      minChunkSize: options.minChunkSize || 100,
      overlap: options.overlap || 50,
      ...options
    };

    this.sentenceTokenizer = new natural.SentenceTokenizer();
  }

  async chunkDocument(document) {
    const chunks = [];
    const content = document.content;

    // Step 1: Split content by headings
    const sections = this._splitByHeadings(content);

    // Step 2: Process each section
    for (const section of sections) {
      // If the section is already small enough, keep it as is
      if (section.length <= this.options.maxChunkSize) {
        chunks.push(this._createChunk(document, section));
        continue;
      }

      // Step 3: Split large sections into paragraphs
      const paragraphs = section.split(/\n\s*\n/);
      let currentChunk = '';

      for (const paragraph of paragraphs) {
        // If the paragraph fits in the current chunk, add it
        if (currentChunk.length + paragraph.length <= this.options.maxChunkSize) {
          currentChunk += (currentChunk ? '\n\n' : '') + paragraph;
        } else {
          // If the current chunk is not empty, save it and start fresh
          if (currentChunk) {
            chunks.push(this._createChunk(document, currentChunk));
            currentChunk = '';
          }

          // If the paragraph itself is too large, split it into sentences
          if (paragraph.length > this.options.maxChunkSize) {
            const sentenceChunks = this._chunkBySentences(paragraph);
            chunks.push(...sentenceChunks.map(c => this._createChunk(document, c)));
          } else {
            currentChunk = paragraph;
          }
        }
      }

      // Save the last chunk if not empty
      if (currentChunk) {
        chunks.push(this._createChunk(document, currentChunk));
      }
    }

    // Assign sequential chunk indices
    return chunks.map((chunk, index) => ({
      ...chunk,
      metadata: { ...chunk.metadata, chunkIndex: index }
    }));
  }

  _splitByHeadings(content) {
    // Split before markdown headings (##, ###, etc.), keeping each heading with its section
    return content
      .split(/\n(?=#{1,6}\s)/)
      .filter(section => section.trim().length > 0);
  }

  _chunkBySentences(text) {
    const sentences = this.sentenceTokenizer.tokenize(text);
    const chunks = [];
    let currentChunk = '';

    for (const sentence of sentences) {
      if (currentChunk.length + sentence.length <= this.options.maxChunkSize) {
        currentChunk += (currentChunk ? ' ' : '') + sentence;
      } else {
        if (currentChunk) {
          chunks.push(currentChunk);
        }
        currentChunk = sentence;
      }
    }

    if (currentChunk) {
      chunks.push(currentChunk);
    }

    return chunks;
  }

  _createChunk(document, content) {
    return {
      documentId: document.id,
      content: content,
      metadata: {
        ...document.metadata,
        chunkLength: content.length
      }
    };
  }
}

module.exports = AdvancedChunker;
```
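
Here's a quick way to try the chunker on a single document (the require path and sample document below are illustrative; pass the resulting chunks to whatever ingestion code you already have):

```javascript
// Example usage: chunk a markdown document before ingestion
const AdvancedChunker = require('./AdvancedChunker');

const chunker = new AdvancedChunker({ maxChunkSize: 400, overlap: 50 });

const chunks = await chunker.chunkDocument({
  id: 'doc-001',
  content: '# MongoDB Atlas\n\nMongoDB Atlas is a fully managed cloud database...\n\n## Vector Search\n\nAtlas Vector Search lets you query embeddings...',
  metadata: { source: 'mongodb-atlas.md', type: 'markdown' }
});

console.log(`Produced ${chunks.length} chunks`);
```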

## Metadata Filtering

Metadata filtering allows you to narrow down search results based on document metadata:

```javascript
async function searchWithMetadata(query, metadata = {}, options = {}) {
  try {
    await rag.connect();

    // Build the metadata filter
    const filter = {};
    Object.entries(metadata).forEach(([key, value]) => {
      filter[`metadata.${key}`] = value;
    });

    // Perform the search with the filter
    const results = await rag.search(query, {
      filter: filter,
      maxResults: options.maxResults || 5
    });

    return results;

  } catch (error) {
    console.error('Metadata search error:', error);
    throw error;
  } finally {
    await rag.close();
  }
}

// Example usage
const results = await searchWithMetadata(
  "What is MongoDB Atlas?",
  { type: "markdown", source: "mongodb-atlas.md" }
);
```
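
Because the filter is built as a plain MongoDB query document, you can usually go beyond exact matches with standard query operators such as `$in` or `$gte` — assuming your version of `mongodb-rag` passes the `filter` option through to the underlying query unchanged (verify this against the library documentation):

```javascript
// A sketch of richer filters using standard MongoDB query operators.
// Assumes the `filter` option is forwarded to the underlying query as-is.
// `updatedAt` is a hypothetical metadata field -- use whatever your documents carry.
const recentDocs = await rag.search("What is MongoDB Atlas?", {
  filter: {
    "metadata.type": { $in: ["markdown", "html"] },
    "metadata.updatedAt": { $gte: new Date("2024-01-01") }
  },
  maxResults: 5
});
```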

## Re-ranking Search Results

Improve search relevance by re-ranking results:

```javascript
const { MongoRAG } = require('mongodb-rag');
const { OpenAI } = require('openai');
const config = require('./config');

// Initialize OpenAI client
const openai = new OpenAI({
  apiKey: config.llm.apiKey
});

async function searchWithReranking(query, options = {}) {
  try {
    // Step 1: Get initial results (more than needed)
    const initialResults = await rag.search(query, {
      maxResults: (options.maxResults || 5) * 3
    });

    if (initialResults.length === 0) {
      return [];
    }

    // Step 2: Re-rank results using the LLM
    const rerankerPrompt = `
Query: "${query}"

I'll show you ${initialResults.length} text passages. Rank them based on relevance to the query.
For each passage, assign a score from 0-10 where:
- 10: Perfect match, directly answers the query
- 7-9: Highly relevant, contains most of the answer
- 4-6: Somewhat relevant, contains partial information
- 1-3: Slightly relevant, mentions related concepts
- 0: Not relevant at all

For each passage, respond on its own line in the format: PASSAGE <number>: <score> - <brief explanation>

${initialResults.map((result, i) =>
  `PASSAGE ${i + 1}: "${result.content.substring(0, 300)}..."`
).join('\n\n')}
`;

    const completion = await openai.chat.completions.create({
      model: "gpt-3.5-turbo",
      messages: [
        { role: "system", content: "You are a helpful assistant that ranks search results based on relevance." },
        { role: "user", content: rerankerPrompt }
      ],
      temperature: 0.3
    });

    // Step 3: Parse the LLM response to get scores
    const scores = parseRerankerResponse(completion.choices[0].message.content);

    // Step 4: Combine the original results with the new scores
    const scoredResults = initialResults.map((result, i) => ({
      ...result,
      reranked_score: scores[i] || 0
    }));

    // Step 5: Sort by the new scores and return the top results
    const rerankedResults = scoredResults
      .sort((a, b) => b.reranked_score - a.reranked_score)
      .slice(0, options.maxResults || 5);

    return rerankedResults;

  } catch (error) {
    console.error('Reranking error:', error);
    throw error;
  }
}

// Helper function to parse the reranker response
function parseRerankerResponse(response) {
  const scores = [];
  const regex = /PASSAGE\s*(\d+)[^0-9]*(\d+)/gi;
  let match;

  while ((match = regex.exec(response)) !== null) {
    const passageNum = parseInt(match[1]) - 1;
    const score = parseInt(match[2]);
    scores[passageNum] = score;
  }

  return scores;
}
```
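
Calling it looks the same as a regular search; each result carries its `reranked_score` alongside the original vector score (again assuming the `rag` instance from earlier):

```javascript
// Example usage
const reranked = await searchWithReranking("How do I create a vector search index?", { maxResults: 5 });

reranked.forEach(r => {
  console.log(`${r.reranked_score}/10 - ${r.content.substring(0, 80)}...`);
});
```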

## Query Expansion

Another technique to improve search results is query expansion. This involves generating multiple related queries to capture different ways of expressing the same information need:

```javascript
const { OpenAI } = require('openai');
const config = require('./config');

// Initialize OpenAI client
const openai = new OpenAI({
  apiKey: config.llm.apiKey
});

async function expandQuery(query) {
  // Use the LLM to generate query variations
  const completion = await openai.chat.completions.create({
    model: 'gpt-3.5-turbo',
    messages: [
      {
        role: 'system',
        content: 'You are a search expert. Generate 3-5 variations of the query that express the same information need in different ways. Return ONLY the variations, one per line.'
      },
      {
        role: 'user',
        content: `Original query: "${query}"`
      }
    ],
    temperature: 0.7
  });

  // Parse the response to get query variations
  const variations = completion.choices[0].message.content
    .split('\n')
    .map(v => v.replace(/^[-*\d.)\s]+/, '').trim()) // strip any list markers or numbering
    .filter(v => v && !v.toLowerCase().startsWith('original'));

  return [query, ...variations];
}

async function searchWithQueryExpansion(query, options = {}) {
  try {
    // Step 1: Generate query variations
    const expandedQueries = await expandQuery(query);
    console.log('Expanded queries:', expandedQueries);

    // Step 2: Search with each variation
    const allResults = [];
    for (const q of expandedQueries) {
      const results = await rag.search(q, {
        maxResults: options.maxResults || 3
      });
      allResults.push(...results);
    }

    // Step 3: Deduplicate results based on documentId
    const uniqueResults = [];
    const seenIds = new Set();

    for (const result of allResults) {
      if (!seenIds.has(result.documentId)) {
        seenIds.add(result.documentId);
        uniqueResults.push(result);
      }
    }

    // Step 4: Sort by score and return the top results
    return uniqueResults
      .sort((a, b) => b.score - a.score)
      .slice(0, options.maxResults || 5);

  } catch (error) {
    console.error('Query expansion error:', error);
    throw error;
  }
}
```
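
As with the other search variants, you can call it directly (again assuming the `rag` instance configured earlier):

```javascript
// Example usage
const expanded = await searchWithQueryExpansion("How do I back up an Atlas cluster?", { maxResults: 5 });
expanded.forEach(r => console.log(`${r.score.toFixed(3)} - ${r.documentId}`));
```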

## Vector Quantization

MongoDB Atlas supports vector quantization to improve search performance with minimal impact on quality:

```javascript
const { MongoClient } = require('mongodb');
const config = require('./config');

async function createQuantizedIndex() {
  const client = new MongoClient(config.mongodb.uri);

  try {
    await client.connect();
    const database = client.db(config.mongodb.database);
    const collection = database.collection(config.mongodb.collection);

    // Define a vector search index with scalar quantization enabled
    const scalarQuantizationIndex = {
      name: "vector_quantized_index",
      type: "vectorSearch",
      definition: {
        fields: [
          {
            type: "vector",
            path: "embedding",
            numDimensions: config.embedding.dimensions,
            similarity: "cosine",
            quantization: "scalar"
          }
        ]
      }
    };

    // Create the quantized index
    console.log('Creating quantized vector search index...');
    const result = await collection.createSearchIndex(scalarQuantizationIndex);
    console.log('Quantized index creation initiated:', result);

  } catch (error) {
    console.error('Error creating quantized index:', error);
    throw error;
  } finally {
    await client.close();
  }
}
```

Vector quantization offers these benefits:

- Reduced storage requirements
- Improved search performance
- Minimal impact on recall/precision
- Better scaling for large collections

## Multi-stage RAG Pipeline

Implement a multi-stage RAG pipeline that combines multiple retrieval and filtering techniques:

```javascript
async function advancedRAGPipeline(query, options = {}) {
  try {
    // Step 1: Extract entities and intent from the query
    const queryAnalysis = await analyzeQuery(query);

    // Step 2: Expand the query with variations
    const expandedQueries = await expandQuery(query);

    // Step 3: Initial retrieval from multiple sources
    const initialResults = await retrieveFromMultipleSources(expandedQueries, queryAnalysis);

    // Step 4: Filter by metadata based on query intent
    const filteredResults = filterByMetadata(initialResults, queryAnalysis.metadata);

    // Step 5: Rerank the results
    const rerankedResults = await rerankResults(filteredResults, query, queryAnalysis);

    // Step 6: Dynamic prompt construction based on the retrieved context
    const prompt = constructPrompt(query, rerankedResults, queryAnalysis);

    // Step 7: Generate the response with the LLM
    const response = await generateLLMResponse(prompt);

    return {
      answer: response,
      sources: rerankedResults.map(r => ({
        documentId: r.documentId,
        source: r.metadata?.source,
        score: r.reranked_score || r.score
      }))
    };

  } catch (error) {
    console.error('Advanced RAG pipeline error:', error);
    throw error;
  }
}
```
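
The pipeline above delegates to several application-specific helpers (`analyzeQuery`, `retrieveFromMultipleSources`, `filterByMetadata`, `rerankResults`, `constructPrompt`, and `generateLLMResponse`) that aren't shown here. As a rough illustration of the shape they might take, here is a minimal sketch of `analyzeQuery` and `constructPrompt`; the prompt wording and returned fields are assumptions you would adapt to your own schema:

```javascript
// Minimal sketches of two of the helpers used above. These are illustrative
// assumptions, not part of the mongodb-rag API, and reuse the `openai` client
// initialized earlier.

async function analyzeQuery(query) {
  // Ask the LLM to extract intent and any metadata hints from the query
  const completion = await openai.chat.completions.create({
    model: 'gpt-3.5-turbo',
    messages: [
      {
        role: 'system',
        content: 'Extract the intent and any document metadata hints (e.g. type, source) from the query. Respond with JSON: {"intent": string, "metadata": object}.'
      },
      { role: 'user', content: query }
    ],
    temperature: 0
  });

  try {
    return JSON.parse(completion.choices[0].message.content);
  } catch {
    // Fall back to a neutral analysis if the response is not valid JSON
    return { intent: 'general', metadata: {} };
  }
}

function constructPrompt(query, results, queryAnalysis) {
  // Join the retrieved chunks into a single numbered context block
  const context = results
    .map((r, i) => `[${i + 1}] ${r.content}`)
    .join('\n\n');

  return `Answer the question using only the context below.\n\nContext:\n${context}\n\nQuestion: ${query}`;
}
```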

## Self-Critique and Refinement

Enhance response quality through self-critique and refinement:

```javascript
async function generateWithSelfCritique(query, context) {
  // Step 1: Generate the initial response
  const initialResponse = await openai.chat.completions.create({
    model: config.llm.model,
    messages: [
      {
        role: 'system',
        content: 'You are a helpful assistant. Answer based only on the provided context.'
      },
      {
        role: 'user',
        content: `Context:\n${context}\n\nQuestion: ${query}`
      }
    ]
  });

  // Step 2: Self-critique the response
  const critique = await openai.chat.completions.create({
    model: config.llm.model,
    messages: [
      {
        role: 'system',
        content: `You are a critical evaluator. Assess the following answer for:
1. Factual accuracy based on the context
2. Completeness of information
3. Logical coherence
4. Possible hallucinations or unsupported claims
Provide specific suggestions for improvement.`
      },
      {
        role: 'user',
        content: `Context:\n${context}\n\nQuestion: ${query}\n\nAnswer: ${initialResponse.choices[0].message.content}`
      }
    ]
  });

  // Step 3: Refine the response based on the critique
  const refinedResponse = await openai.chat.completions.create({
    model: config.llm.model,
    messages: [
      {
        role: 'system',
        content: 'You are a helpful assistant. Answer based only on the provided context. Incorporate the critique to provide the most accurate response possible.'
      },
      {
        role: 'user',
        content: `Context:\n${context}\n\nQuestion: ${query}\n\nInitial answer: ${initialResponse.choices[0].message.content}\n\nCritique: ${critique.choices[0].message.content}\n\nProvide an improved answer that addresses the critique.`
      }
    ]
  });

  return refinedResponse.choices[0].message.content;
}
```
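
To wire this into your application, build the context string from your retrieval results and pass it in (this assumes the same `rag` instance and result shape used in the earlier examples):

```javascript
// Example usage: retrieve context, then answer with self-critique
const query = "How does MongoDB Atlas handle scaling?";
const searchResults = await rag.search(query, { maxResults: 5 });
const context = searchResults.map(r => r.content).join('\n\n');

const answer = await generateWithSelfCritique(query, context);
console.log(answer);
```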

## Next Steps

You've now learned several advanced techniques to enhance your RAG application. In the next section, we'll explore how to deploy your application to production and optimize it for scale.