This guide shows how to load test local Ollama AI models with Locust. It is a good fit for checking your self-hosted AI setup and comparing local and cloud AI performance.
Use Cases
- Test local Ollama model performance
- Compare different local AI models
- Validate self-hosted AI reliability
- Measure local AI response times
Simple Implementation
from locust import task, HttpUser
import json
import random


class OllamaUser(HttpUser):
    # Ollama's HTTP API typically runs on localhost:11434
    host = "http://localhost:11434"

    def on_start(self):
        # Models to exercise (install with: ollama pull model-name)
        self.models = [
            "llama2:7b",
            "mistral:7b",
            "codellama:7b",
            "phi:2.7b"
        ]

        # Test prompts
        self.prompts = [
            "Write a short product description for a smartphone.",
            "Explain machine learning in simple terms.",
            "Create a brief email to schedule a meeting.",
            "Write a Python function to calculate fibonacci numbers.",
            "Describe the benefits of renewable energy."
        ]

    @task(3)
    def generate_text(self):
        """Generate text using Ollama"""
        model = random.choice(self.models)
        prompt = random.choice(self.prompts)

        payload = {
            "model": model,
            "prompt": prompt,
            "stream": False,
            "options": {
                "temperature": 0.7,
                "num_predict": 100
            }
        }

        # catch_response=True is required so we can mark requests as failed ourselves
        with self.client.post(
            "/api/generate",
            json=payload,
            catch_response=True,
            name=f"Generate - {model}"
        ) as response:
            if response.status_code == 200:
                try:
                    data = response.json()
                    response_text = data.get("response", "")
                    done = data.get("done", False)

                    if done and response_text:
                        print(f"{model}: Generated {len(response_text)} characters")
                    else:
                        response.failure("Incomplete response from Ollama")
                except json.JSONDecodeError:
                    response.failure("Invalid JSON response from Ollama")
            else:
                response.failure(f"Ollama error: {response.status_code}")

    @task(2)
    def chat_completion(self):
        """Test chat completion with Ollama"""
        model = random.choice(self.models)
        prompt = random.choice(self.prompts)

        payload = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "stream": False,
            "options": {
                "temperature": 0.7,
                "num_predict": 100
            }
        }

        with self.client.post(
            "/api/chat",
            json=payload,
            catch_response=True,
            name=f"Chat - {model}"
        ) as response:
            if response.status_code == 200:
                try:
                    data = response.json()
                    message = data.get("message", {})
                    content = message.get("content", "")
                    done = data.get("done", False)

                    if done and content:
                        print(f"{model} Chat: {len(content)} characters")
                    else:
                        response.failure("Incomplete chat response")
                except json.JSONDecodeError:
                    response.failure("Invalid JSON response from Ollama")
            else:
                response.failure(f"Ollama chat error: {response.status_code}")

    @task(1)
    def list_models(self):
        """List available models"""
        with self.client.get(
            "/api/tags",
            catch_response=True,
            name="List Models"
        ) as response:
            if response.status_code == 200:
                try:
                    data = response.json()
                    models = data.get("models", [])
                    print(f"Available models: {len(models)}")

                    for model in models[:3]:  # Show first 3
                        name = model.get("name", "Unknown")
                        size = model.get("size", 0) / (1024**3)  # Convert bytes to GB
                        print(f"  - {name}: {size:.1f}GB")
                except json.JSONDecodeError:
                    response.failure("Invalid JSON response from Ollama")
            else:
                response.failure(f"Failed to list models: {response.status_code}")

    @task(1)
    def model_info(self):
        """Get information about a specific model"""
        model = random.choice(self.models)
        payload = {"name": model}

        with self.client.post(
            "/api/show",
            json=payload,
            catch_response=True,
            name=f"Model Info - {model}"
        ) as response:
            if response.status_code == 200:
                try:
                    data = response.json()
                    # These fields are available if you want to inspect the model further
                    modelfile = data.get("modelfile", "")
                    parameters = data.get("parameters", "")
                    print(f"{model} info retrieved")
                except json.JSONDecodeError:
                    response.failure("Invalid JSON response from Ollama")
            else:
                response.failure(f"Failed to get model info: {response.status_code}")
Setup Instructions
- Install Ollama: Download it from ollama.ai
- Pull Models: Install the models you want to test:
  ollama pull llama2:7b
  ollama pull mistral:7b
  ollama pull phi:2.7b
- Start Ollama: Run ollama serve (it usually starts automatically)
- Verify Setup: Test with curl http://localhost:11434/api/tags
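Once the curl check passes, you can run the locustfile the usual way. The filename below is just a placeholder for wherever you saved the script, and the user counts are deliberately small for local hardware:
locust -f ollama_load_test.py --headless --users 2 --spawn-rate 1 --run-time 5m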
What This Tests
- Local AI Performance: Measures response times for local models
- Model Comparison: Compare different models on the same hardware
- Resource Usage: Monitor CPU/GPU usage during testing (see the helper sketched after this list)
- Reliability: Test local AI stability under load
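Locust itself does not report host metrics, so for the resource-usage point above you need something alongside the test. A minimal sketch, assuming the third-party psutil package (pip install psutil), logs CPU and RAM in a separate terminal while the test runs; GPU utilization would need a separate tool such as nvidia-smi and is not shown here.

import time

import psutil  # third-party: pip install psutil


def log_resources(interval_seconds=5):
    """Print CPU and RAM usage every few seconds; run in a separate terminal."""
    while True:
        cpu = psutil.cpu_percent(interval=1)   # % CPU over a 1-second sample
        ram = psutil.virtual_memory().percent  # % of RAM in use
        print(f"CPU: {cpu:.0f}%  RAM: {ram:.0f}%")
        time.sleep(interval_seconds)


if __name__ == "__main__":
    log_resources()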
Performance Tips
- GPU Acceleration: Use an NVIDIA GPU for faster inference
- Model Size: Smaller models (7B) are faster than larger ones (13B, 70B)
- Memory: Ensure sufficient RAM for model loading
- Concurrent Users: Start with a low number of users to avoid overwhelming local hardware (see the ramp-up sketch after this list)
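One way to ramp up gradually is Locust's LoadTestShape. The stage values below are arbitrary placeholders to adjust for your machine; when a shape class like this is placed in the same locustfile, it controls the user count instead of --users/--spawn-rate.

from locust import LoadTestShape


class GentleRampShape(LoadTestShape):
    """Ramp up users slowly so local hardware is not hit all at once."""

    # (run until this many seconds, target users, spawn rate per second)
    stages = [
        (60, 1, 1),
        (180, 2, 1),
        (300, 4, 1),
    ]

    def tick(self):
        run_time = self.get_run_time()
        for end_time, users, spawn_rate in self.stages:
            if run_time < end_time:
                return users, spawn_rate
        return None  # stop the test after the last stage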
Common Issues
- Model Not Found: Ensure models are pulled with ollama pull model-name
- Connection Refused: Check that the Ollama service is running (a preflight check is sketched after this list)
- Slow Responses: Local models are typically slower than cloud APIs, especially without GPU acceleration
- Memory Issues: Large models require significant RAM/VRAM
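The first two issues can be caught before a run with a small preflight script. This is only a sketch: it assumes the default localhost:11434 endpoint and the /api/tags response format used earlier, and it uses the third-party requests package (already installed as a Locust dependency).

import sys

import requests

# Adjust to the models your locustfile uses
REQUIRED_MODELS = ["llama2:7b", "mistral:7b", "codellama:7b", "phi:2.7b"]

try:
    resp = requests.get("http://localhost:11434/api/tags", timeout=5)
    resp.raise_for_status()
except requests.RequestException as exc:
    sys.exit(f"Ollama does not appear to be reachable on localhost:11434: {exc}")

installed = {m.get("name") for m in resp.json().get("models", [])}
missing = [m for m in REQUIRED_MODELS if m not in installed]
if missing:
    sys.exit(f"Models not pulled yet (run 'ollama pull model-name'): {missing}")

print(f"Ollama is up with {len(installed)} model(s) installed.")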