AI API Performance Comparison

Simple load testing to compare response times and quality across OpenAI, Claude, and Gemini APIs

This guide shows how to compare AI API performance across providers, so you can measure response times, check reliability under load, and choose the best AI provider for your needs.

Use Cases

  • Compare AI API response times
  • Test different AI providers under load
  • Validate API reliability and uptime
  • Measure cost-effectiveness of different providers

Simple Implementation

from locust import task, between, HttpUser
import os
import random
import time

class AIComparisonUser(HttpUser):
    # Locust requires a base host even though the tasks below use absolute URLs
    host = "https://api.openai.com"

    # Pace each simulated user to reduce the chance of hitting rate limits
    wait_time = between(1, 3)

    def on_start(self):
        # API keys: read from environment variables, falling back to placeholders
        self.openai_key = os.getenv("OPENAI_API_KEY", "your-openai-api-key")
        self.claude_key = os.getenv("ANTHROPIC_API_KEY", "your-claude-api-key")
        self.gemini_key = os.getenv("GEMINI_API_KEY", "your-gemini-api-key")
        
        # Test prompts for comparison
        self.test_prompts = [
            "Write a short product description for a wireless headphone.",
            "Explain quantum computing in simple terms.",
            "Create a brief email response thanking a customer.",
            "Summarize the benefits of renewable energy.",
            "Write a creative story opening in 2 sentences."
        ]

    @task(2)
    def test_openai_gpt35(self):
        """Test OpenAI GPT-3.5 Turbo"""
        prompt = random.choice(self.test_prompts)
        
        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json"
        }
        
        payload = {
            "model": "gpt-3.5-turbo",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 150,
            "temperature": 0.7
        }
        
        # Locust records response times itself; this timer is just for the console output
        start_time = time.time()

        with self.client.post(
            "https://api.openai.com/v1/chat/completions",
            json=payload,
            headers=headers,
            name="OpenAI GPT-3.5",
            catch_response=True  # required to call response.success()/failure() manually
        ) as response:
            response_time = (time.time() - start_time) * 1000
            
            if response.status_code == 200:
                data = response.json()
                if "choices" in data and data["choices"]:
                    content = data["choices"][0]["message"]["content"]
                    tokens = data.get("usage", {}).get("total_tokens", 0)
                    print(f"OpenAI: {len(content)} chars, {tokens} tokens, {response_time:.0f}ms")
                    response.success()
                else:
                    response.failure("No choices in OpenAI response")
            elif response.status_code == 429:
                response.failure("OpenAI rate limit exceeded")
            else:
                response.failure(f"OpenAI error: {response.status_code}")

    @task(2)
    def test_claude(self):
        """Test Anthropic Claude"""
        prompt = random.choice(self.test_prompts)
        
        headers = {
            "x-api-key": self.claude_key,
            "Content-Type": "application/json",
            "anthropic-version": "2023-06-01"
        }
        
        payload = {
            "model": "claude-3-haiku-20240307",
            "max_tokens": 150,
            "messages": [{"role": "user", "content": prompt}]
        }
        
        start_time = time.time()
        
        with self.client.post(
            "https://api.anthropic.com/v1/messages",
            json=payload,
            headers=headers,
            name="Claude Haiku",
            catch_response=True
        ) as response:
            response_time = (time.time() - start_time) * 1000
            
            if response.status_code == 200:
                data = response.json()
                if "content" in data and data["content"]:
                    content = data["content"][0]["text"]
                    usage = data.get("usage", {})
                    tokens = usage.get("input_tokens", 0) + usage.get("output_tokens", 0)
                    print(f"Claude: {len(content)} chars, {tokens} tokens, {response_time:.0f}ms")
                    response.success()
                else:
                    response.failure("No content in Claude response")
            elif response.status_code == 429:
                response.failure("Claude rate limit exceeded")
            else:
                response.failure(f"Claude error: {response.status_code}")

    @task(2)
    def test_gemini(self):
        """Test Google Gemini"""
        prompt = random.choice(self.test_prompts)
        
        params = {"key": self.gemini_key}
        
        payload = {
            "contents": [{"parts": [{"text": prompt}]}],
            "generationConfig": {
                "maxOutputTokens": 150,
                "temperature": 0.7
            }
        }
        
        start_time = time.time()
        
        with self.client.post(
            # gemini-pro may be retired; swap in a current model name if you get a 404
            "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent",
            json=payload,
            params=params,
            name="Gemini Pro",
            catch_response=True
        ) as response:
            response_time = (time.time() - start_time) * 1000
            
            if response.status_code == 200:
                data = response.json()
                if "candidates" in data and data["candidates"]:
                    content = data["candidates"][0]["content"]["parts"][0]["text"]
                    # Token usage, when reported, appears under usageMetadata
                    tokens = data.get("usageMetadata", {}).get("totalTokenCount", 0)
                    print(f"Gemini: {len(content)} chars, {tokens} tokens, {response_time:.0f}ms")
                    response.success()
                else:
                    response.failure("No candidates in Gemini response")
            elif response.status_code == 429:
                response.failure("Gemini rate limit exceeded")
            else:
                response.failure(f"Gemini error: {response.status_code}")

    @task(1)
    def test_openai_gpt4(self):
        """Test OpenAI GPT-4 (if available)"""
        prompt = random.choice(self.test_prompts)
        
        headers = {
            "Authorization": f"Bearer {self.openai_key}",
            "Content-Type": "application/json"
        }
        
        payload = {
            "model": "gpt-4",
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 150,
            "temperature": 0.7
        }
        
        start_time = time.time()
        
        with self.client.post(
            "https://api.openai.com/v1/chat/completions",
            json=payload,
            headers=headers,
            name="OpenAI GPT-4",
            catch_response=True
        ) as response:
            response_time = (time.time() - start_time) * 1000
            
            if response.status_code == 200:
                data = response.json()
                if "choices" in data and data["choices"]:
                    content = data["choices"][0]["message"]["content"]
                    tokens = data.get("usage", {}).get("total_tokens", 0)
                    print(f"GPT-4: {len(content)} chars, {tokens} tokens, {response_time:.0f}ms")
                    response.success()
                else:
                    response.failure("No choices in GPT-4 response")
            elif response.status_code == 429:
                response.failure("GPT-4 rate limit exceeded")
            elif response.status_code == 404:
                response.failure("GPT-4 not available (need API access)")
            else:
                response.failure(f"GPT-4 error: {response.status_code}")

Setup Instructions

  1. Get API keys from each provider: OpenAI (platform.openai.com), Anthropic (console.anthropic.com), and Google AI Studio (aistudio.google.com)

  2. Export the keys as environment variables (OPENAI_API_KEY, ANTHROPIC_API_KEY, GEMINI_API_KEY), or replace the placeholders in on_start with your actual keys

  3. Start with low user counts (for example, locust -f ai_comparison.py -u 3 -r 1) to avoid hitting rate limits

What This Tests

  • Response Times: Compare how fast each AI responds
  • Rate Limits: Test how each provider handles concurrent requests
  • Reliability: Check which APIs are most stable under load
  • Output Quality: Compare response length and coherence
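
Beyond the numbers Locust reports on its own, a small event listener can aggregate latencies per provider and print a side-by-side summary at the end of the run. This is a minimal sketch using Locust's request and test_stop hooks; the labels it groups by are the name= values used in the tasks above.

from collections import defaultdict
from locust import events

# Successful response times, keyed by the name= label used in each task
response_times = defaultdict(list)

@events.request.add_listener
def record_request(request_type, name, response_time, response_length, exception, **kwargs):
    # Skip failed requests so errors don't skew the latency comparison
    if exception is None:
        response_times[name].append(response_time)

@events.test_stop.add_listener
def print_summary(environment, **kwargs):
    # Print a per-provider latency summary when the test ends
    for name, times in sorted(response_times.items()):
        times.sort()
        avg = sum(times) / len(times)
        p95 = times[max(int(len(times) * 0.95) - 1, 0)]
        print(f"{name}: {len(times)} requests, avg {avg:.0f}ms, p95 {p95:.0f}ms")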

Performance Comparison

Typical results you might see:

  • GPT-3.5: Fast responses (~1-3 seconds), good for high volume
  • Claude: Moderate speed (~2-4 seconds), excellent quality
  • Gemini: Variable speed (~1-5 seconds), good for creative tasks
  • GPT-4: Slower (~3-8 seconds), highest quality but expensive

Rate Limits & Costs

  • OpenAI: Tier-based limits, pay per token
  • Claude: Message-based limits, pay per token
  • Gemini: Requests per minute limits, generous free tier
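
When a provider starts returning 429s mid-run, it is often better to pause that simulated user briefly than to keep failing every request. Here is a minimal sketch of the pattern, shown as a standalone task for clarity; it assumes Retry-After carries a number of seconds, and the 10-second fallback is an arbitrary choice.

import time
from locust import task, between, HttpUser

class BackoffUser(HttpUser):
    host = "https://api.openai.com"
    wait_time = between(1, 3)

    @task
    def call_api(self):
        with self.client.post(
            "/v1/chat/completions",
            json={},  # placeholder payload; reuse the payloads from the tasks above
            catch_response=True,
            name="backoff demo"
        ) as response:
            if response.status_code == 429:
                # Honor Retry-After if present, otherwise back off for 10 seconds
                delay = float(response.headers.get("Retry-After", 10))
                response.failure(f"Rate limited; backing off {delay:.0f}s")
                time.sleep(delay)  # gevent-patched sleep pauses only this user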

Common Issues

  • API Keys: Ensure all keys are valid and have sufficient credits (a validation sketch follows this list)
  • Rate Limits: Each provider has different limits
  • Model Availability: GPT-4 requires special access
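
To catch key problems before the load ramps up, you can validate the environment variables in a test_start hook and abort the run if any are missing. A small sketch, assuming the variable names suggested in the setup steps above:

import os
from locust import events

REQUIRED_KEYS = ["OPENAI_API_KEY", "ANTHROPIC_API_KEY", "GEMINI_API_KEY"]

@events.test_start.add_listener
def check_keys(environment, **kwargs):
    # Abort early if a key is missing, instead of failing every request
    missing = [key for key in REQUIRED_KEYS if not os.getenv(key)]
    if missing:
        print(f"Missing API keys: {', '.join(missing)}")
        environment.runner.quit()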

Ready to run your test?
LoadForge provides cloud-based Locust (locust.io) load testing.