TechLead
Lesson 17 of 18
5 min read
LangChain

Production Deployment & LangSmith

Deploy LangChain apps to production with best practices for caching, error handling, cost control, and monitoring with LangSmith

Production Challenges

Moving LangChain apps from development to production involves handling latency, costs, reliability, and observability. This guide covers the essential patterns for production-ready LLM applications.

🚨 Production Considerations

💰 Cost: GPT-4 calls add up fast at scale
⏱️ Latency: LLM calls are slow (2-30 seconds)
🔄 Reliability: API rate limits and outages
📊 Monitoring: Debugging non-deterministic outputs

Caching Responses

Cache identical queries to reduce API costs and latency dramatically.

import { ChatOpenAI } from "@langchain/openai";

// In-memory cache (development)
const cache = new Map<string, string>();

/**
 * Invoke the model, serving repeated questions from the in-memory cache.
 * The cache key is the trimmed, lower-cased input, so trivially different
 * spellings of the same question share one entry.
 */
async function cachedInvoke(model: ChatOpenAI, input: string) {
  const cacheKey = input.trim().toLowerCase();

  const hit = cache.get(cacheKey);
  if (hit !== undefined) {
    console.log("Cache hit!");
    return hit;
  }

  const content = (await model.invoke(input)).content as string;
  cache.set(cacheKey, content);
  return content;
}

// Redis cache (production)
import { Redis } from "ioredis";
const redis = new Redis(process.env.REDIS_URL!);

/**
 * Redis-backed variant of the cache for production: the key is namespaced
 * under "llm:" and base64-encoded so arbitrary input text is key-safe.
 */
async function cachedInvokeRedis(model: ChatOpenAI, input: string) {
  const cacheKey = `llm:${Buffer.from(input).toString("base64")}`;

  const hit = await redis.get(cacheKey);
  if (hit) {
    return hit;
  }

  const content = (await model.invoke(input)).content as string;
  await redis.set(cacheKey, content, "EX", 3600); // expire after 1 hour
  return content;
}

Rate Limiting and Retry Logic

import { ChatOpenAI } from "@langchain/openai";

// Built-in retry with exponential backoff
const model = new ChatOpenAI({
  maxConcurrency: 5, // cap on simultaneous in-flight requests
  maxRetries: 3,     // automatic retries with exponential backoff
  modelName: "gpt-4",
});

// Custom retry wrapper with timeout.
// Fixes two bugs in the naive version: the abort timer is now cleared on
// EVERY exit path (previously it leaked when invoke threw), and a 429 on
// the final attempt now rethrows instead of silently returning undefined.
async function invokeWithRetry(
  model: ChatOpenAI,
  input: string,
  maxRetries = 3,
  timeoutMs = 30000
) {
  let lastError: unknown;
  for (let attempt = 1; attempt <= maxRetries; attempt++) {
    // Abort the request if it exceeds timeoutMs
    const controller = new AbortController();
    const timer = setTimeout(() => controller.abort(), timeoutMs);
    try {
      return await model.invoke(input, {
        signal: controller.signal,
      });
    } catch (error: any) {
      lastError = error;
      console.error(`Attempt ${attempt} failed:`, error.message);

      // Out of attempts — surface the failure to the caller, even for 429s
      if (attempt === maxRetries) {
        throw error;
      }
      // Rate limits (429) back off more aggressively than other errors
      const delayMs = error.status === 429 ? attempt * 5000 : attempt * 1000;
      await new Promise((r) => setTimeout(r, delayMs));
    } finally {
      clearTimeout(timer); // always release the timer, success or failure
    }
  }
  // Unreachable (the loop either returns or throws); keeps noImplicitReturns happy
  throw lastError;
}

Cost Control

import { ChatOpenAI } from "@langchain/openai";
import { CallbackManager } from "@langchain/core/callbacks/manager";

// Track cumulative token usage and spend across all requests
let totalTokens = 0;
let totalCost = 0;

// GPT-4 pricing per 1K tokens (approximate)
const PROMPT_RATE = 0.03;
const COMPLETION_RATE = 0.06;

const model = new ChatOpenAI({
  modelName: "gpt-4",
  callbacks: CallbackManager.fromHandlers({
    // Fires after every LLM call; llmOutput carries the token counts
    async handleLLMEnd(output) {
      const usage = output.llmOutput?.tokenUsage;
      if (!usage) return;

      const cost =
        (usage.promptTokens * PROMPT_RATE +
          usage.completionTokens * COMPLETION_RATE) /
        1000;
      totalTokens += usage.totalTokens;
      totalCost += cost;
      console.log(`Tokens: ${usage.totalTokens}, Cost: $${cost.toFixed(4)}`);
    },
  }),
});

// Cost-saving strategies:
// 1. Use cheaper models for simple tasks
const cheapModel = new ChatOpenAI({ modelName: "gpt-4o-mini" }); // far cheaper than gpt-4
const powerModel = new ChatOpenAI({ modelName: "gpt-4" });

// 2. Route based on complexity — long or analysis-style prompts get the big model
async function smartRoute(input: string) {
  const needsPower = input.length > 200 || input.includes("analyze");
  const chosen = needsPower ? powerModel : cheapModel;
  return await chosen.invoke(input);
}

Fallback Chains

Use fallback models when the primary model fails or is unavailable.

import { ChatOpenAI } from "@langchain/openai";
import { ChatAnthropic } from "@langchain/anthropic";

const primaryModel = new ChatOpenAI({ modelName: "gpt-4" });
const fallbackModel = new ChatAnthropic({ modelName: "claude-3-sonnet-20240229" });
const lastResort = new ChatOpenAI({ modelName: "gpt-4o-mini" });

// LangChain's built-in fallback: models are tried in order until one succeeds
const modelWithFallback = primaryModel.withFallbacks({
  fallbacks: [fallbackModel, lastResort],
});

// If GPT-4 fails → tries Claude → tries GPT-4o-mini
const result = await modelWithFallback.invoke("Explain Docker networking");

LangSmith Setup

LangSmith is LangChain's platform for debugging, testing, and monitoring LLM applications.

# .env.local
LANGCHAIN_TRACING_V2=true
LANGCHAIN_API_KEY=ls_your_api_key
LANGCHAIN_PROJECT=my-production-app
LANGCHAIN_ENDPOINT=https://api.smith.langchain.com
// Traces are captured automatically — no code changes needed!
// Every chain.invoke() call is logged to LangSmith

// Add custom metadata for filtering
// NOTE(review): `chain` is assumed to be a runnable built earlier in the lesson — confirm.
const result = await chain.invoke(
  { input: "How do Docker volumes work?" },
  {
    // Free-form key/value pairs, searchable in the LangSmith dashboard
    metadata: {
      userId: "user-123",
      feature: "chat",
      environment: "production",
    },
    tags: ["production", "v2.1"], // coarse labels for filtering across runs
    runName: "docker-question", // Custom name for this run
  }
);

// In LangSmith dashboard you can:
// - View full trace of every step
// - See token usage and latency per step
// - Filter by tags, metadata, and time
// - Compare different model versions
// - Create evaluation datasets

Production Architecture

// lib/ai-service.ts β€” Production-ready AI service
import { ChatOpenAI } from "@langchain/openai";
import { ChatPromptTemplate } from "@langchain/core/prompts";
import { StringOutputParser } from "@langchain/core/output_parsers";

class AIService {
  // withFallbacks() returns a Runnable wrapper, not a ChatOpenAI, so the field
  // type is derived from the method instead of hard-coding ChatOpenAI
  // (annotating this as ChatOpenAI is a type error under strict mode).
  private model: ReturnType<ChatOpenAI["withFallbacks"]>;
  private cache: Map<string, { response: string; timestamp: number }>;
  private readonly CACHE_TTL = 3600000; // 1 hour
  private readonly CACHE_MAX_ENTRIES = 1000; // hard cap so the cache cannot grow unbounded

  constructor() {
    this.model = new ChatOpenAI({
      modelName: "gpt-4",
      maxRetries: 3,
      maxConcurrency: 10,
      temperature: 0.7,
    }).withFallbacks({
      fallbacks: [new ChatOpenAI({ modelName: "gpt-4o-mini" })],
    });
    this.cache = new Map();
  }

  /**
   * Answer a user question, serving repeats from an in-memory TTL cache.
   * @param input  the user question; its trimmed, lower-cased form is the cache key
   * @param userId optional id attached to trace metadata for filtering in LangSmith
   */
  async query(input: string, userId?: string): Promise<string> {
    // Check cache; evict stale entries instead of leaving them in the map forever
    const cacheKey = input.toLowerCase().trim();
    const cached = this.cache.get(cacheKey);
    if (cached) {
      if (Date.now() - cached.timestamp < this.CACHE_TTL) {
        return cached.response;
      }
      this.cache.delete(cacheKey); // expired
    }

    // Build chain
    const prompt = ChatPromptTemplate.fromMessages([
      ["system", "You are a helpful coding assistant."],
      ["human", "{input}"],
    ]);
    const chain = prompt.pipe(this.model).pipe(new StringOutputParser());

    // Invoke with metadata so runs are filterable in LangSmith
    const response = await chain.invoke(
      { input },
      {
        metadata: { userId, feature: "chat" },
        tags: ["production"],
      }
    );

    // Cache response; drop the oldest entry once the size cap is reached
    if (this.cache.size >= this.CACHE_MAX_ENTRIES) {
      const oldest = this.cache.keys().next().value;
      if (oldest !== undefined) this.cache.delete(oldest);
    }
    this.cache.set(cacheKey, { response, timestamp: Date.now() });

    return response;
  }
}

export const aiService = new AIService();

🚀 Production Checklist

  • ✓ Response caching (Redis or in-memory)
  • ✓ Rate limiting per user
  • ✓ Fallback models configured
  • ✓ Error handling with user-friendly messages
  • ✓ Cost monitoring and alerts
  • ✓ LangSmith tracing enabled
  • ✓ Input validation and sanitization
  • ✓ Token budget limits per request

💡 Key Takeaways

  • Cache identical queries to cut costs by 50-80%
  • Use fallback chains for reliability (GPT-4 → Claude → GPT-4o-mini)
  • Route simple tasks to cheaper models to reduce costs
  • LangSmith provides essential observability for debugging production issues
  • Always implement rate limiting, timeouts, and cost tracking

Continue Learning