Building Scalable RAG Applications with Vector Databases and Node.js
Introduction
Retrieval-Augmented Generation (RAG) has revolutionized how we build AI applications that can reason over custom data. By combining vector databases with large language models, we can create systems that provide accurate, contextual responses based on our specific knowledge base. In this guide, we'll build a production-ready RAG application using Node.js, Pinecone vector database, and OpenAI's API.
Understanding RAG Architecture
RAG works by first converting documents into vector embeddings, storing them in a vector database, then retrieving relevant context when answering queries. This approach solves the problem of LLMs having outdated or missing knowledge about your specific domain.
The typical RAG workflow involves:
- Document ingestion and chunking
- Converting text chunks to vector embeddings
- Storing vectors in a database with metadata
- Retrieving similar vectors for user queries
- Generating responses using retrieved context
Setting Up the Project
Let's start by creating a Node.js application with the necessary dependencies:
npm init -y
npm install express dotenv openai @pinecone-database/pinecone pdf-parse langchain multer
Create a .env file with your API keys:
OPENAI_API_KEY=your_openai_api_key
PINECONE_API_KEY=your_pinecone_api_key
PINECONE_ENVIRONMENT=your_pinecone_environment
PINECONE_INDEX_NAME=your_index_name
Building the Vector Database Service
First, let's create a service to handle vector operations:
// services/vectorService.js
require('dotenv').config();
const { Pinecone } = require('@pinecone-database/pinecone');
const OpenAI = require('openai');
/**
 * Wraps the Pinecone index and the OpenAI embeddings endpoint used by the
 * RAG pipeline: embeds text, stores embedded documents, and runs similarity
 * queries. Configuration is read from environment variables.
 */
class VectorService {
  constructor() {
    // NOTE(review): `environment` is passed through as in the original; confirm
    // the installed Pinecone SDK version still accepts it.
    this.pinecone = new Pinecone({
      apiKey: process.env.PINECONE_API_KEY,
      environment: process.env.PINECONE_ENVIRONMENT
    });
    this.openai = new OpenAI({
      apiKey: process.env.OPENAI_API_KEY
    });
    this.indexName = process.env.PINECONE_INDEX_NAME;
  }

  /**
   * Requests an embedding for a single piece of text.
   * @param {string} text - Text to embed.
   * @returns {Promise<number[]>} The first embedding in the API response.
   */
  async getEmbedding(text) {
    const response = await this.openai.embeddings.create({
      model: 'text-embedding-ada-002',
      input: text
    });
    return response.data[0].embedding;
  }

  /**
   * Embeds each document and upserts the resulting vectors into the index.
   * @param {Array<{id?: string, content: string, source?: string, title?: string}>} documents
   *   Documents to index; a missing `id` falls back to `doc-<position>`.
   * @param {number} [batchSize=100] - Maximum vectors per upsert request.
   *   The default is a conservative choice; confirm against current Pinecone limits.
   * @returns {Promise<number>} Number of vectors upserted.
   * @throws {RangeError} If `batchSize` is not a positive integer.
   */
  async upsertDocuments(documents, batchSize = 100) {
    if (!Number.isInteger(batchSize) || batchSize <= 0) {
      throw new RangeError('batchSize must be a positive integer');
    }
    // Nothing to embed or store: skip the index call entirely.
    if (documents.length === 0) {
      return 0;
    }

    const index = this.pinecone.Index(this.indexName);
    const vectors = [];
    for (let i = 0; i < documents.length; i++) {
      const doc = documents[i];
      const embedding = await this.getEmbedding(doc.content);
      vectors.push({
        id: doc.id || `doc-${i}`,
        values: embedding,
        metadata: {
          content: doc.content,
          source: doc.source,
          title: doc.title
        }
      });
    }

    // Send bounded batches so a large document does not become one huge request.
    for (let offset = 0; offset < vectors.length; offset += batchSize) {
      await index.upsert(vectors.slice(offset, offset + batchSize));
    }
    return vectors.length;
  }

  /**
   * Embeds the query and returns the closest stored vectors with metadata.
   * @param {string} query - Natural-language query text.
   * @param {number} [topK=5] - Number of matches to request.
   * @returns {Promise<Array>} The `matches` array from the query response,
   *   or an empty array when the response has none.
   */
  async searchSimilar(query, topK = 5) {
    const index = this.pinecone.Index(this.indexName);
    const queryEmbedding = await this.getEmbedding(query);
    const results = await index.query({
      vector: queryEmbedding,
      topK,
      includeMetadata: true,
      includeValues: false
    });
    return results.matches || [];
  }
}
module.exports = VectorService;
Document Processing and Chunking
For effective RAG, we need to break documents into manageable chunks:
// services/documentProcessor.js
const pdf = require('pdf-parse');
const fs = require('fs').promises;
/**
 * Turns raw documents (plain text or PDF files) into overlapping text chunks
 * suitable for embedding.
 */
class DocumentProcessor {
  /**
   * Splits text into fixed-size windows that overlap their predecessor.
   * @param {string} text - Text to split.
   * @param {number} [chunkSize=1000] - Maximum characters per chunk.
   * @param {number} [overlap=200] - Characters shared between consecutive chunks.
   * @returns {Array<{content: string, start: number, end: number}>} Chunks with
   *   trimmed content and the [start, end) offsets of the untrimmed window.
   * @throws {RangeError} If `chunkSize` is not a positive integer, or `overlap`
   *   is not an integer in [0, chunkSize).
   */
  chunkText(text, chunkSize = 1000, overlap = 200) {
    if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
      throw new RangeError('chunkSize must be a positive integer');
    }
    // overlap >= chunkSize would keep `start` from ever advancing.
    if (!Number.isInteger(overlap) || overlap < 0 || overlap >= chunkSize) {
      throw new RangeError('overlap must be an integer between 0 and chunkSize - 1');
    }

    const chunks = [];
    let start = 0;
    while (start < text.length) {
      const end = Math.min(start + chunkSize, text.length);
      const content = text.slice(start, end).trim();
      // Whitespace-only windows carry nothing worth indexing.
      if (content.length > 0) {
        chunks.push({ content, start, end });
      }
      // Once a window reaches the end of the text we are done. Stepping back by
      // `overlap` here would re-emit the final window forever.
      if (end === text.length) {
        break;
      }
      start = end - overlap;
    }
    return chunks;
  }

  /**
   * Reads a PDF from disk, extracts its text, and chunks it with the defaults.
   * @param {string} filePath - Path to the PDF file.
   * @returns {Promise<Array<{content: string, start: number, end: number}>>}
   */
  async processPDF(filePath) {
    const dataBuffer = await fs.readFile(filePath);
    const data = await pdf(dataBuffer);
    return this.chunkText(data.text);
  }

  /**
   * Chunks already-extracted text with the default settings.
   * @param {string} text - Text to split.
   * @returns {Promise<Array<{content: string, start: number, end: number}>>}
   */
  async processText(text) {
    return this.chunkText(text);
  }
}
module.exports = DocumentProcessor;
Creating the RAG Query Engine
Now let's build the core RAG functionality:
// services/ragService.js
const VectorService = require('./vectorService');
const OpenAI = require('openai');
/**
 * Answers questions by retrieving similar chunks from the vector store and
 * passing them to an OpenAI chat model as context.
 */
class RAGService {
  constructor() {
    this.vectorService = new VectorService();
    this.openai = new OpenAI({
      apiKey: process.env.OPENAI_API_KEY
    });
  }

  /**
   * Retrieves context for a question and generates an answer from it.
   * @param {string} question - The user's question.
   * @param {object} [options] - Optional tuning parameters; `null` is treated as `{}`.
   * @param {number} [options.maxContextLength=3000] - Character budget for retrieved content.
   * @param {number} [options.temperature=0.1] - Sampling temperature sent to the model.
   * @param {string} [options.model='gpt-3.5-turbo'] - Chat model name.
   * @param {number} [options.topK=5] - Number of matches to request from the vector store.
   * @returns {Promise<{answer: string, sources: Array<{content: string, score: number, source: string}>}>}
   *   The model's answer plus a short preview of each retrieved match.
   */
  async query(question, options = {}) {
    // A default parameter only covers `undefined`; guard against an explicit null too.
    const {
      maxContextLength = 3000,
      temperature = 0.1,
      model = 'gpt-3.5-turbo',
      topK = 5
    } = options || {};

    // Retrieve relevant documents, ignoring matches that carry no text content.
    const matches = await this.vectorService.searchSimilar(question, topK);
    const similarDocs = (matches || []).filter(
      (doc) => doc && doc.metadata && typeof doc.metadata.content === 'string'
    );

    // Build context from retrieved documents until the character budget is hit.
    let context = '';
    for (const doc of similarDocs) {
      if (context.length + doc.metadata.content.length > maxContextLength) {
        break;
      }
      context += `${doc.metadata.content}\n\n`;
    }

    // Joined explicitly so source indentation never leaks into the prompt.
    const prompt = [
      `Context: ${context}`,
      `Question: ${question}`,
      "Please answer the question based on the provided context. If the context doesn't contain enough information to answer the question, please say so."
    ].join('\n');

    const response = await this.openai.chat.completions.create({
      model,
      messages: [{ role: 'user', content: prompt }],
      temperature,
      max_tokens: 500
    });

    return {
      answer: response.choices[0].message.content,
      sources: similarDocs.map((doc) => {
        const { content } = doc.metadata;
        return {
          // Only mark the preview as truncated when something was actually cut.
          content: content.length > 200 ? `${content.substring(0, 200)}...` : content,
          score: doc.score,
          source: doc.metadata.source
        };
      })
    };
  }
}
module.exports = RAGService;
Building the API Endpoints
Finally, let's create Express endpoints to interact with our RAG system:
// app.js
const express = require('express');
const multer = require('multer');
const DocumentProcessor = require('./services/documentProcessor');
const VectorService = require('./services/vectorService');
const RAGService = require('./services/ragService');
// HTTP layer: the Express app parses JSON request bodies.
const app = express();
app.use(express.json());

// Uploaded files are written to the local uploads/ directory by multer.
const upload = multer({ dest: 'uploads/' });

// One shared instance of each service for all requests.
const documentProcessor = new DocumentProcessor();
const vectorService = new VectorService();
const ragService = new RAGService();
// Upload and index documents.
// Expects a multipart request with the PDF in the "document" field and an
// optional "title" text field. Responds with the number of chunks indexed.
app.post('/api/upload', upload.single('document'), async (req, res) => {
  // multer leaves req.file unset when the request carries no "document" part;
  // report that as a client error rather than a 500 from a TypeError.
  if (!req.file) {
    return res.status(400).json({ error: 'No file uploaded; expected a "document" file field' });
  }

  try {
    const chunks = await documentProcessor.processPDF(req.file.path);
    const documents = chunks.map((chunk, index) => ({
      id: `${req.file.filename}-${index}`,
      content: chunk.content,
      source: req.file.originalname,
      title: req.body.title || req.file.originalname
    }));
    const count = await vectorService.upsertDocuments(documents);
    res.json({
      success: true,
      message: `Indexed ${count} document chunks`
    });
  } catch (error) {
    res.status(500).json({ error: error.message });
  } finally {
    // The file multer stored on disk is only needed during indexing; remove it
    // so uploads/ does not grow without bound.
    try {
      await require('fs').promises.unlink(req.file.path);
    } catch (cleanupError) {
      console.warn(`Could not remove temporary upload ${req.file.path}: ${cleanupError.message}`);
    }
  }
});
// Query the RAG system.
// Expects a JSON body of the form { question: string, options?: object } and
// responds with the generated answer and its sources.
app.post('/api/query', async (req, res) => {
  // req.body may be absent when no JSON payload was parsed.
  const { question, options } = req.body || {};
  if (typeof question !== 'string' || question.trim() === '') {
    return res.status(400).json({ error: 'Request body must include a non-empty "question" string' });
  }

  try {
    // An explicit null would defeat the default parameter in ragService.query.
    const result = await ragService.query(question, options || undefined);
    res.json({
      success: true,
      data: result
    });
  } catch (error) {
    res.status(500).json({ error: error.message });
  }
});
const PORT = process.env.PORT || 3000;
app.listen(PORT, () => {
console.log(`RAG API server running on port ${PORT}`);
});
Optimization Strategies
To improve your RAG application's performance:
- Chunk Size Optimization: Experiment with different chunk sizes based on your content type
- Hybrid Search: Combine vector search with keyword search for better retrieval
- Caching: Implement Redis caching for frequently asked questions
- Metadata Filtering: Use metadata to filter results by document type, date, or category
- Reranking: Implement a reranking step to improve result relevance
Conclusion
Building a production-ready RAG application requires careful consideration of document processing, vector storage, and retrieval strategies. This Node.js implementation provides a solid foundation that you can extend with additional features like conversation memory, document versioning, and advanced filtering. The key to success with RAG is iterative improvement based on user feedback and continuous evaluation of retrieval quality.
Related Posts
Building AI-Powered Content Generators with ChatGPT API and Next.js
Learn to create intelligent content generation tools using OpenAI's ChatGPT API with Next.js, including streaming responses and error handling.
Building AI-Powered Search with RAG and Vector Databases: A Practical Guide
Learn to implement Retrieval-Augmented Generation (RAG) with vector databases for intelligent search in your web applications.
Building Intelligent Web Applications with the ChatGPT API: A Practical Guide
Learn how to integrate ChatGPT API into your web applications with practical examples and best practices for creating intelligent user experiences.