Simple Chaos Testing

Basic chaos engineering with random failures, timeouts, and error injection to test system resilience

LoadForge can record your browser, graphically build tests, scan your site with a wizard and more. Sign up now to run your first test.

Sign up now


This guide shows how to introduce controlled chaos into your load tests to validate system resilience. It is well suited to testing how your application handles failures and unexpected conditions.

Use Cases

  • Test system resilience under failures
  • Validate error handling and recovery
  • Check timeout behavior
  • Test circuit breaker patterns

Simple Implementation

from locust import task, HttpUser
import random
import time

class ChaosTestUser(HttpUser):
    """Locust user that mixes normal traffic with deliberately faulty requests.

    Each simulated user picks a random endpoint and either sends a normal
    request or runs one of the named chaos scenarios (short timeout, delayed
    request, random failure, bad query parameters). Results are printed; the
    requests themselves are recorded by Locust under the ``name`` passed to
    the client.
    """

    # Host used to provoke a transport-level failure. It is assumed not to
    # resolve or accept connections -- confirm for your network before relying
    # on the "connection_error" scenario.
    UNREACHABLE_HOST = "http://invalid-host-12345.com"

    def on_start(self):
        """Initialise the chaos configuration for this simulated user."""
        # Chaos configuration
        self.chaos_enabled = True
        self.failure_rate = 0.1  # 10% of requests will have chaos

        # Test endpoints
        self.endpoints = [
            "/api/users",
            "/api/products",
            "/api/orders",
            "/api/health",
        ]

        # Chaos scenarios; every name here must have a handler in
        # _run_scenario, otherwise it is treated as "unknown".
        self.chaos_scenarios = [
            "timeout",
            "slow_response",
            "random_failure",
            "bad_request",
        ]

    def _run_scenario(self, scenario, endpoint):
        """Run the handler registered for *scenario* against *endpoint*.

        Returns True when a handler exists and was called, False when the
        scenario name is unknown (nothing is sent in that case).
        """
        handlers = {
            "timeout": self.test_timeout_resilience,
            "slow_response": self.test_slow_response,
            "random_failure": self.test_random_failure,
            "bad_request": self.test_bad_request,
        }
        handler = handlers.get(scenario)
        if handler is None:
            return False
        handler(endpoint)
        return True

    @task(4)
    def normal_request(self):
        """Normal request that might have chaos injected."""
        endpoint = random.choice(self.endpoints)

        # Decide if we should inject chaos
        if self.chaos_enabled and random.random() < self.failure_rate:
            self.inject_chaos(endpoint)
        else:
            self.make_normal_request(endpoint)

    @task(1)
    def intentional_chaos(self):
        """Always run one randomly chosen chaos scenario."""
        endpoint = random.choice(self.endpoints)
        scenario = random.choice(self.chaos_scenarios)

        print(f"Intentional chaos: {scenario} on {endpoint}")

        # Unknown scenario names are ignored here, as before.
        self._run_scenario(scenario, endpoint)

    def inject_chaos(self, endpoint):
        """Run a randomly chosen chaos scenario against *endpoint*.

        Falls back to a normal request only when the chosen scenario name has
        no handler. "bad_request" is dispatched to its handler like the other
        scenarios instead of silently degrading to a normal request.
        """
        scenario = random.choice(self.chaos_scenarios)

        print(f"Chaos injection: {scenario} on {endpoint}")

        if not self._run_scenario(scenario, endpoint):
            self.make_normal_request(endpoint)

    def make_normal_request(self, endpoint):
        """Send a plain GET to *endpoint* and print the outcome."""
        with self.client.get(endpoint, name=f"Normal - {endpoint}") as response:
            if response.status_code == 200:
                print(f"Normal request to {endpoint}: SUCCESS")
            else:
                print(f"Normal request to {endpoint}: {response.status_code}")

    def test_timeout_resilience(self, endpoint):
        """Send a GET with a very short client timeout and print the outcome.

        Locust's HTTP client normally does not raise on timeouts; it returns a
        placeholder response whose ``status_code`` is 0. That case is handled
        explicitly. The ``except`` is kept only as a safety net for clients
        that do raise.
        """
        try:
            with self.client.get(
                endpoint,
                timeout=0.5,  # Very short timeout
                name=f"Chaos Timeout - {endpoint}",
            ) as response:
                if response.status_code == 0:
                    # No HTTP response was received: the expected outcome.
                    print(f"Timeout test {endpoint}: Timeout occurred (expected)")
                elif response.status_code == 200:
                    print(f"Timeout test {endpoint}: Surprisingly fast response")
                else:
                    print(f"Timeout test {endpoint}: {response.status_code}")
        except Exception as exc:
            print(f"Timeout test {endpoint}: Request raised {type(exc).__name__}")

    def test_slow_response(self, endpoint):
        """Pause 2-5 seconds on the client, then time a GET to *endpoint*.

        The delay happens before the request is sent, so the printed duration
        covers only the request itself, not the pause.
        """
        # Add artificial delay before request
        delay = random.uniform(2.0, 5.0)
        time.sleep(delay)

        start_time = time.time()

        with self.client.get(endpoint, name=f"Chaos Slow - {endpoint}") as response:
            response_time = time.time() - start_time

            if response.status_code == 200:
                print(f"Slow response test {endpoint}: {response_time:.2f}s")
            else:
                print(
                    f"Slow response test {endpoint}: "
                    f"{response.status_code} in {response_time:.2f}s"
                )

    def test_random_failure(self, endpoint):
        """Run one randomly chosen failure simulation against *endpoint*."""
        # Simulate different types of failures
        failure_types = [
            ("connection_error", self.simulate_connection_error),
            ("server_error", self.simulate_server_error),
            ("client_error", self.simulate_client_error),
        ]

        failure_type, failure_func = random.choice(failure_types)
        print(f"Random failure test {endpoint}: {failure_type}")

        failure_func(endpoint)

    def simulate_connection_error(self, endpoint):
        """Request *endpoint* on an unreachable host to provoke a connection error.

        The URL is built explicitly from UNREACHABLE_HOST. Endpoints are
        normally bare paths, so substituting the configured host inside the
        path would leave the request pointed at the real target.
        """
        # Accept either a bare path or a URL that starts with the configured host.
        base = self.host or ""
        path = endpoint[len(base):] if base and endpoint.startswith(base) else endpoint
        invalid_url = f"{self.UNREACHABLE_HOST}{path}"

        try:
            with self.client.get(
                invalid_url,
                timeout=2.0,
                name=f"Chaos Connection Error - {endpoint}",
            ) as response:
                if response.status_code == 0:
                    # Locust reports transport failures as status 0 rather
                    # than raising.
                    print("Connection error test: Failed as expected (no response)")
                else:
                    print("Connection error test: Unexpected success")
        except Exception as exc:
            print(f"Connection error test: Failed as expected ({type(exc).__name__})")

    def simulate_server_error(self, endpoint):
        """POST an oversized JSON payload and print how the server answers."""
        # Try to trigger server errors with bad data
        bad_data = {"invalid": "data" * 1000}  # Large payload

        with self.client.post(
            endpoint,
            json=bad_data,
            name=f"Chaos Server Error - {endpoint}",
        ) as response:
            if response.status_code >= 500:
                print(f"Server error test {endpoint}: Got {response.status_code} (expected)")
            elif response.status_code >= 400:
                print(f"Server error test {endpoint}: Got {response.status_code} (client error)")
            else:
                print(f"Server error test {endpoint}: Unexpected success {response.status_code}")

    def simulate_client_error(self, endpoint):
        """Send one randomly chosen malformed request and print the status code.

        NOTE(review): the "wrong_method" scenario issues a real DELETE against
        the endpoint. Make sure the target environment tolerates that before
        running this test.
        """
        # Send malformed requests; the payload of "wrong_method" is unused.
        malformed_scenarios = [
            ("invalid_json", '{"invalid": json}'),
            ("missing_content_type", {"data": "test"}),
            ("wrong_method", "DELETE"),
        ]

        scenario, data = random.choice(malformed_scenarios)

        if scenario == "invalid_json":
            with self.client.post(
                endpoint,
                data=data,  # Invalid JSON string
                headers={"Content-Type": "application/json"},
                name=f"Chaos Invalid JSON - {endpoint}",
            ) as response:
                print(f"Invalid JSON test {endpoint}: {response.status_code}")

        elif scenario == "missing_content_type":
            with self.client.post(
                endpoint,
                json=data,
                headers={"Content-Type": "text/plain"},  # Wrong content type
                name=f"Chaos Wrong Content-Type - {endpoint}",
            ) as response:
                print(f"Wrong content-type test {endpoint}: {response.status_code}")

        elif scenario == "wrong_method":
            with self.client.delete(
                endpoint,
                name=f"Chaos Wrong Method - {endpoint}",
            ) as response:
                print(f"Wrong method test {endpoint}: {response.status_code}")

    def test_bad_request(self, endpoint):
        """GET *endpoint* with invalid query parameters and print the outcome."""
        # Send requests with bad parameters
        bad_params = {
            "invalid_param": "value",
            "limit": -1,  # Invalid limit
            "page": "not_a_number",  # Invalid page
        }

        with self.client.get(
            endpoint,
            params=bad_params,
            name=f"Chaos Bad Request - {endpoint}",
        ) as response:
            if response.status_code == 400:
                print(f"Bad request test {endpoint}: Properly rejected (400)")
            elif response.status_code == 200:
                print(f"Bad request test {endpoint}: Accepted bad params (potential issue)")
            else:
                print(f"Bad request test {endpoint}: {response.status_code}")

    @task(1)
    def test_system_recovery(self):
        """Inject chaos, wait one second, then check the endpoint still answers 200."""
        # First, cause some chaos
        endpoint = random.choice(self.endpoints)
        self.inject_chaos(endpoint)

        # Wait a bit
        time.sleep(1)

        # Then test if system is still responsive
        with self.client.get(endpoint, name=f"Recovery Test - {endpoint}") as response:
            if response.status_code == 200:
                print(f"Recovery test {endpoint}: System recovered successfully")
            else:
                print(
                    f"Recovery test {endpoint}: "
                    f"System still having issues ({response.status_code})"
                )

Setup Instructions

  1. Configure chaos parameters (failure rate, scenarios)
  2. Define endpoints you want to test with chaos
  3. Adjust timeout values based on your system
  4. Monitor system behavior during chaos testing

What This Tests

  • Timeout Handling: How system responds to request timeouts
  • Error Recovery: System's ability to recover from failures
  • Bad Input Handling: Response to malformed requests
  • Connection Failures: Behavior when connections fail

Chaos Scenarios

  • Timeouts: Very short request timeouts
  • Slow Responses: Requests sent after an artificial client-side delay
  • Connection Errors: Simulated network failures
  • Server Errors: Requests designed to trigger 5xx errors
  • Client Errors: Malformed requests causing 4xx errors

Monitoring During Chaos

Watch for:

  • Error Rates: Acceptable failure rates during chaos
  • Recovery Time: How quickly system recovers
  • Cascading Failures: Failures spreading to other services
  • Resource Usage: CPU/memory during failure scenarios

Safety Tips

  • Start Small: Begin with low failure rates
  • Monitor Closely: Watch system metrics during tests
  • Have Rollback: Be ready to stop chaos if needed
  • Test in Staging: Don't run chaos in production initially

Ready to run your test?
Run your test today with LoadForge.