Simple Chaos Testing

This guide shows how to introduce controlled chaos into your load tests to validate system resilience. It is well suited to testing how your application handles failures and unexpected conditions.

Use Cases

Test system resilience under failures
Validate error handling and recovery
Check timeout behavior
Test circuit breaker patterns

Simple Implementation

from locust import task, HttpUser
import random
import time

class ChaosTestUser(HttpUser):
    """Locust user that mixes normal traffic with deliberately hostile requests.

    Three tasks are registered: ``normal_request`` (weight 4), which injects
    chaos into a configurable fraction of otherwise ordinary requests;
    ``intentional_chaos`` (weight 1), which always runs a chaos scenario; and
    ``test_system_recovery`` (weight 1), which runs a chaos scenario and then
    checks that the same endpoint still answers.

    Results are reported with ``print`` only; no request is explicitly marked
    as failed in Locust's statistics by this class.
    """

    def on_start(self):
        """Set the per-user chaos configuration, endpoints and scenario names."""
        # Chaos configuration
        self.chaos_enabled = True
        self.failure_rate = 0.1  # fraction of normal requests replaced by a chaos scenario

        # Test endpoints
        self.endpoints = [
            "/api/users",
            "/api/products",
            "/api/orders",
            "/api/health",
        ]

        # Chaos scenarios; each name must have a handler in _run_scenario
        self.chaos_scenarios = [
            "timeout",
            "slow_response",
            "random_failure",
            "bad_request",
        ]

    def _run_scenario(self, scenario, endpoint):
        """Run the handler for ``scenario`` against ``endpoint``.

        Single dispatch point shared by ``intentional_chaos`` and
        ``inject_chaos`` so both support exactly the same scenarios.

        Returns:
            True if a handler existed and was called, False for an unknown
            scenario name (nothing is sent in that case).
        """
        handlers = {
            "timeout": self.test_timeout_resilience,
            "slow_response": self.test_slow_response,
            "random_failure": self.test_random_failure,
            "bad_request": self.test_bad_request,
        }
        handler = handlers.get(scenario)
        if handler is None:
            return False
        handler(endpoint)
        return True

    @task(4)
    def normal_request(self):
        """Normal request that might have chaos injected"""
        endpoint = random.choice(self.endpoints)

        # Decide if we should inject chaos
        if self.chaos_enabled and random.random() < self.failure_rate:
            self.inject_chaos(endpoint)
        else:
            self.make_normal_request(endpoint)

    @task(1)
    def intentional_chaos(self):
        """Intentionally trigger chaos scenarios"""
        endpoint = random.choice(self.endpoints)
        scenario = random.choice(self.chaos_scenarios)

        print(f"Intentional chaos: {scenario} on {endpoint}")

        # Unknown scenario names are ignored here, as before.
        self._run_scenario(scenario, endpoint)

    def inject_chaos(self, endpoint):
        """Randomly inject chaos into a request.

        Picks one of ``self.chaos_scenarios`` at random and runs it. A
        scenario name without a handler falls back to a normal request.
        """
        scenario = random.choice(self.chaos_scenarios)

        print(f"Chaos injection: {scenario} on {endpoint}")

        # "bad_request" previously fell through to the normal-request
        # fallback even though it was announced as chaos; it is now
        # dispatched like every other scenario.
        if not self._run_scenario(scenario, endpoint):
            self.make_normal_request(endpoint)

    def make_normal_request(self, endpoint):
        """Make a normal request without chaos"""
        with self.client.get(endpoint, name=f"Normal - {endpoint}") as response:
            if response.status_code == 200:
                print(f"Normal request to {endpoint}: SUCCESS")
            else:
                print(f"Normal request to {endpoint}: {response.status_code}")

    def test_timeout_resilience(self, endpoint):
        """Test how system handles timeouts.

        Sends a GET with a 0.5 s timeout. Locust's HTTP client reports
        transport-level failures (including timeouts) as a response with
        ``status_code == 0`` instead of raising, so that case is checked
        explicitly; the ``except`` remains only as a safety net.
        """
        try:
            with self.client.get(
                endpoint,
                timeout=0.5,  # Very short timeout
                name=f"Chaos Timeout - {endpoint}",
            ) as response:
                if response.status_code == 200:
                    print(f"Timeout test {endpoint}: Surprisingly fast response")
                elif response.status_code == 0:
                    # No HTTP response was received within the timeout.
                    print(f"Timeout test {endpoint}: Timeout occurred (expected)")
                else:
                    print(f"Timeout test {endpoint}: {response.status_code}")
        except Exception:
            # This is expected behavior - system should handle timeouts gracefully
            print(f"Timeout test {endpoint}: Timeout occurred (expected)")

    def test_slow_response(self, endpoint):
        """Test system behavior after an artificial client-side pause.

        The 2-5 s sleep happens *before* the request is sent, so it slows
        this simulated user rather than the server, and it is not included
        in the printed response time.
        """
        # Add artificial delay before request
        delay = random.uniform(2.0, 5.0)
        time.sleep(delay)

        start_time = time.time()

        with self.client.get(endpoint, name=f"Chaos Slow - {endpoint}") as response:
            response_time = time.time() - start_time

            if response.status_code == 200:
                print(f"Slow response test {endpoint}: {response_time:.2f}s")
            else:
                print(f"Slow response test {endpoint}: {response.status_code} in {response_time:.2f}s")

    def test_random_failure(self, endpoint):
        """Run one randomly chosen failure simulation against ``endpoint``."""
        # Simulate different types of failures
        failure_types = [
            ("connection_error", self.simulate_connection_error),
            ("server_error", self.simulate_server_error),
            ("client_error", self.simulate_client_error),
        ]

        failure_type, failure_func = random.choice(failure_types)
        print(f"Random failure test {endpoint}: {failure_type}")

        failure_func(endpoint)

    def simulate_connection_error(self, endpoint):
        """Simulate connection errors by requesting ``endpoint`` on a bogus host.

        ``endpoint`` is a bare path, so the invalid host is prepended to form
        an absolute URL (the previous ``str.replace`` on the path never
        matched, which sent the request to the real host). A response with
        ``status_code == 0`` is how Locust's client reports that the
        connection could not be made.
        """
        invalid_url = f"http://invalid-host-12345.com{endpoint}"

        try:
            with self.client.get(
                invalid_url,
                timeout=2.0,
                name=f"Chaos Connection Error - {endpoint}",
            ) as response:
                if response.status_code == 0:
                    # Locust stores the underlying exception on the response.
                    error = getattr(response, "error", None)
                    print(f"Connection error test: Failed as expected ({type(error).__name__})")
                else:
                    print("Connection error test: Unexpected success")
        except Exception as e:
            print(f"Connection error test: Failed as expected ({type(e).__name__})")

    def simulate_server_error(self, endpoint):
        """POST an oversized JSON payload and report how the server answers."""
        # Try to trigger server errors with bad data
        bad_data = {"invalid": "data" * 1000}  # Large payload

        with self.client.post(
            endpoint,
            json=bad_data,
            name=f"Chaos Server Error - {endpoint}",
        ) as response:
            if response.status_code >= 500:
                print(f"Server error test {endpoint}: Got {response.status_code} (expected)")
            elif response.status_code >= 400:
                print(f"Server error test {endpoint}: Got {response.status_code} (client error)")
            else:
                print(f"Server error test {endpoint}: Unexpected success {response.status_code}")

    def simulate_client_error(self, endpoint):
        """Send one randomly chosen malformed request and print the status code.

        Variants: a syntactically invalid JSON body, a JSON body labelled as
        ``text/plain``, or a DELETE against the endpoint.
        """
        # Send malformed requests; the second element is the payload, unused
        # for "wrong_method".
        malformed_scenarios = [
            ("invalid_json", '{"invalid": json}'),
            ("missing_content_type", {"data": "test"}),
            ("wrong_method", "DELETE"),
        ]

        scenario, data = random.choice(malformed_scenarios)

        if scenario == "invalid_json":
            with self.client.post(
                endpoint,
                data=data,  # Invalid JSON string
                headers={"Content-Type": "application/json"},
                name=f"Chaos Invalid JSON - {endpoint}",
            ) as response:
                print(f"Invalid JSON test {endpoint}: {response.status_code}")

        elif scenario == "missing_content_type":
            with self.client.post(
                endpoint,
                json=data,
                headers={"Content-Type": "text/plain"},  # Wrong content type
                name=f"Chaos Wrong Content-Type - {endpoint}",
            ) as response:
                print(f"Wrong content-type test {endpoint}: {response.status_code}")

        elif scenario == "wrong_method":
            with self.client.delete(
                endpoint,
                name=f"Chaos Wrong Method - {endpoint}",
            ) as response:
                print(f"Wrong method test {endpoint}: {response.status_code}")

    def test_bad_request(self, endpoint):
        """GET ``endpoint`` with invalid query parameters and report the status."""
        # Send requests with bad parameters
        bad_params = {
            "invalid_param": "value",
            "limit": -1,  # Invalid limit
            "page": "not_a_number",  # Invalid page
        }

        with self.client.get(
            endpoint,
            params=bad_params,
            name=f"Chaos Bad Request - {endpoint}",
        ) as response:
            if response.status_code == 400:
                print(f"Bad request test {endpoint}: Properly rejected (400)")
            elif response.status_code == 200:
                print(f"Bad request test {endpoint}: Accepted bad params (potential issue)")
            else:
                print(f"Bad request test {endpoint}: {response.status_code}")

    @task(1)
    def test_system_recovery(self):
        """Test if system recovers after chaos"""
        # First, cause some chaos
        endpoint = random.choice(self.endpoints)
        self.inject_chaos(endpoint)

        # Wait a bit
        time.sleep(1)

        # Then test if system is still responsive
        with self.client.get(endpoint, name=f"Recovery Test - {endpoint}") as response:
            if response.status_code == 200:
                print(f"Recovery test {endpoint}: System recovered successfully")
            else:
                print(f"Recovery test {endpoint}: System still having issues ({response.status_code})")

Setup Instructions

Configure chaos parameters (failure rate, scenarios)
Define endpoints you want to test with chaos
Adjust timeout values based on your system
Monitor system behavior during chaos testing

What This Tests

Timeout Handling: How system responds to request timeouts
Error Recovery: System's ability to recover from failures
Bad Input Handling: Response to malformed requests
Connection Failures: Behavior when connections fail

Chaos Scenarios

Timeouts: Very short request timeouts
Slow Responses: Requests sent after an artificial client-side delay
Connection Errors: Simulated network failures
Server Errors: Requests designed to trigger 5xx errors
Client Errors: Malformed requests causing 4xx errors

Monitoring During Chaos

Watch for:

Error Rates: Acceptable failure rates during chaos
Recovery Time: How quickly system recovers
Cascading Failures: Failures spreading to other services
Resource Usage: CPU/memory during failure scenarios

Safety Tips

Start Small: Begin with low failure rates
Monitor Closely: Watch system metrics during tests
Have Rollback: Be ready to stop chaos if needed
Test in Staging: Don't run chaos in production initially

Product

Help

QA Series: Automated Security Headers Testing

QA Series: Basic Accessibility Monitoring

Use Cases

Simple Implementation

Setup Instructions

What This Tests

Chaos Scenarios

Monitoring During Chaos

Safety Tips

Simple A/B Testing

Simple Contract Testing

Ready to run your test?
Run your test today with LoadForge.

Product

Help

Recent posts

QA Series: Automated Security Headers Testing

QA Series: Basic Accessibility Monitoring

Simple Chaos Testing

Use Cases

Simple Implementation

Setup Instructions

What This Tests

Chaos Scenarios

Monitoring During Chaos

Safety Tips

Simple A/B Testing

Simple Contract Testing

Ready to run your test? Run your test today with LoadForge.

Ready to run your test?
Run your test today with LoadForge.