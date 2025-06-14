This guide shows how to introduce controlled chaos into your load tests to validate system resilience. Perfect for testing how your application handles failures and unexpected conditions.

Use Cases

Test system resilience under failures

Validate error handling and recovery

Check timeout behavior

Test circuit breaker patterns

Simple Implementation

import random
import time

from locust import HttpUser, task


class ChaosTestUser(HttpUser):
    """Simulated user that mixes ordinary traffic with deliberate fault injection.

    About ``failure_rate`` of the regular requests are replaced by a randomly
    chosen chaos scenario. Dedicated tasks also trigger chaos on purpose and
    then probe whether the target still answers.

    Locust's HTTP client runs in "safe mode": timeouts and connection errors
    do not raise, they come back as a dummy response whose ``status_code`` is
    0. Transport failures are therefore detected through the status code
    rather than through ``except`` clauses.
    """

    # Host used to provoke connection errors without touching the system
    # under test. An absolute URL bypasses the user's configured ``host``.
    UNREACHABLE_HOST = "http://invalid-host-12345.com"

    def on_start(self):
        """Set up the chaos configuration for this simulated user."""
        # Chaos configuration
        self.chaos_enabled = True
        self.failure_rate = 0.1  # 10% of normal requests get chaos injected

        # Endpoints exercised by every task (relative to the user's host)
        self.endpoints = [
            "/api/users",
            "/api/products",
            "/api/orders",
            "/api/health",
        ]

        # Names understood by _run_scenario()
        self.chaos_scenarios = [
            "timeout",
            "slow_response",
            "random_failure",
            "bad_request",
        ]

    @task(4)
    def normal_request(self):
        """Issue a normal request, occasionally replaced by injected chaos."""
        endpoint = random.choice(self.endpoints)

        if self.chaos_enabled and random.random() < self.failure_rate:
            self.inject_chaos(endpoint)
        else:
            self.make_normal_request(endpoint)

    @task(1)
    def intentional_chaos(self):
        """Deliberately run one randomly chosen chaos scenario."""
        endpoint = random.choice(self.endpoints)
        scenario = random.choice(self.chaos_scenarios)

        print(f"Intentional chaos: {scenario} on {endpoint}")
        self._run_scenario(scenario, endpoint)

    def inject_chaos(self, endpoint):
        """Run a random chaos scenario against *endpoint*.

        Falls back to a normal request when the chosen scenario name has no
        handler, so a misconfigured scenario list still produces traffic.
        """
        scenario = random.choice(self.chaos_scenarios)
        print(f"Chaos injection: {scenario} on {endpoint}")

        if not self._run_scenario(scenario, endpoint):
            self.make_normal_request(endpoint)

    def _run_scenario(self, scenario, endpoint):
        """Run the handler for *scenario*; return False if the name is unknown.

        Keeping a single dispatch table guarantees that every entry of
        ``chaos_scenarios`` is handled the same way by all callers.
        """
        handlers = {
            "timeout": self.test_timeout_resilience,
            "slow_response": self.test_slow_response,
            "random_failure": self.test_random_failure,
            "bad_request": self.test_bad_request,
        }
        handler = handlers.get(scenario)
        if handler is None:
            return False
        handler(endpoint)
        return True

    @staticmethod
    def _transport_error_name(response):
        """Return the exception class name attached to a status-0 response."""
        error = getattr(response, "error", None)
        return type(error).__name__ if error is not None else "unknown error"

    def make_normal_request(self, endpoint):
        """Make a plain GET request without any chaos."""
        with self.client.get(endpoint, name=f"Normal - {endpoint}") as response:
            if response.status_code == 200:
                print(f"Normal request to {endpoint}: SUCCESS")
            else:
                print(f"Normal request to {endpoint}: {response.status_code}")

    def test_timeout_resilience(self, endpoint):
        """Request *endpoint* with a very short client timeout.

        A timeout is the expected outcome here. It shows up as status code 0
        because the client does not raise on transport failures.
        """
        try:
            with self.client.get(
                endpoint,
                timeout=0.5,  # Very short timeout
                name=f"Chaos Timeout - {endpoint}",
            ) as response:
                if response.status_code == 0:
                    reason = self._transport_error_name(response)
                    print(
                        f"Timeout test {endpoint}: "
                        f"Timeout occurred (expected) ({reason})"
                    )
                elif response.status_code == 200:
                    print(f"Timeout test {endpoint}: Surprisingly fast response")
                else:
                    print(f"Timeout test {endpoint}: {response.status_code}")
        except Exception as exc:
            # Best effort: errors the client still raises (e.g. an invalid
            # URL) must not abort the task, but should be visible.
            print(f"Timeout test {endpoint}: Request raised {type(exc).__name__}")

    def test_slow_response(self, endpoint):
        """Pause for 2-5 seconds, then time a GET request to *endpoint*.

        The delay happens on the client before the request is sent; the
        reported time covers only the request itself.
        """
        delay = random.uniform(2.0, 5.0)
        time.sleep(delay)

        start_time = time.time()
        with self.client.get(endpoint, name=f"Chaos Slow - {endpoint}") as response:
            response_time = time.time() - start_time

            if response.status_code == 200:
                print(f"Slow response test {endpoint}: {response_time:.2f}s")
            else:
                print(
                    f"Slow response test {endpoint}: "
                    f"{response.status_code} in {response_time:.2f}s"
                )

    def test_random_failure(self, endpoint):
        """Run one randomly chosen failure simulation against *endpoint*."""
        failure_types = [
            ("connection_error", self.simulate_connection_error),
            ("server_error", self.simulate_server_error),
            ("client_error", self.simulate_client_error),
        ]

        failure_type, failure_func = random.choice(failure_types)
        print(f"Random failure test {endpoint}: {failure_type}")
        failure_func(endpoint)

    def simulate_connection_error(self, endpoint):
        """Send a request for *endpoint* to a host that should be unreachable."""
        # Endpoints are relative paths, so the unreachable host has to be
        # prepended; substituting it for self.host inside the path would
        # leave the path unchanged and hit the real system instead.
        invalid_url = f"{self.UNREACHABLE_HOST}{endpoint}"

        try:
            with self.client.get(
                invalid_url,
                timeout=2.0,
                name=f"Chaos Connection Error - {endpoint}",
            ) as response:
                if response.status_code == 0:
                    reason = self._transport_error_name(response)
                    print(f"Connection error test: Failed as expected ({reason})")
                else:
                    print("Connection error test: Unexpected success")
        except Exception as exc:
            # Best effort: keep the simulated user running if the client
            # raises instead of returning a status-0 response.
            print(f"Connection error test: Failed as expected ({type(exc).__name__})")

    def simulate_server_error(self, endpoint):
        """POST an oversized JSON payload in an attempt to provoke a 5xx."""
        bad_data = {"invalid": "data" * 1000}  # Large payload

        with self.client.post(
            endpoint,
            json=bad_data,
            name=f"Chaos Server Error - {endpoint}",
        ) as response:
            if response.status_code >= 500:
                print(f"Server error test {endpoint}: Got {response.status_code} (expected)")
            elif response.status_code >= 400:
                print(f"Server error test {endpoint}: Got {response.status_code} (client error)")
            else:
                print(f"Server error test {endpoint}: Unexpected success {response.status_code}")

    def simulate_client_error(self, endpoint):
        """Send one randomly chosen malformed request to *endpoint*."""
        scenario = random.choice(
            ["invalid_json", "missing_content_type", "wrong_method"]
        )

        if scenario == "invalid_json":
            with self.client.post(
                endpoint,
                data='{"invalid": json}',  # Not valid JSON despite the header
                headers={"Content-Type": "application/json"},
                name=f"Chaos Invalid JSON - {endpoint}",
            ) as response:
                print(f"Invalid JSON test {endpoint}: {response.status_code}")

        elif scenario == "missing_content_type":
            # JSON body sent with a header that declares plain text.
            with self.client.post(
                endpoint,
                json={"data": "test"},
                headers={"Content-Type": "text/plain"},
                name=f"Chaos Wrong Content-Type - {endpoint}",
            ) as response:
                print(f"Wrong content-type test {endpoint}: {response.status_code}")

        else:  # "wrong_method"
            with self.client.delete(
                endpoint,
                name=f"Chaos Wrong Method - {endpoint}",
            ) as response:
                print(f"Wrong method test {endpoint}: {response.status_code}")

    def test_bad_request(self, endpoint):
        """GET *endpoint* with invalid query parameters and report the outcome."""
        bad_params = {
            "invalid_param": "value",
            "limit": -1,  # Invalid limit
            "page": "not_a_number",  # Invalid page
        }

        with self.client.get(
            endpoint,
            params=bad_params,
            name=f"Chaos Bad Request - {endpoint}",
        ) as response:
            if response.status_code == 400:
                print(f"Bad request test {endpoint}: Properly rejected (400)")
            elif response.status_code == 200:
                print(f"Bad request test {endpoint}: Accepted bad params (potential issue)")
            else:
                print(f"Bad request test {endpoint}: {response.status_code}")

    @task(1)
    def test_system_recovery(self):
        """Inject chaos, wait briefly, then check the endpoint still answers."""
        endpoint = random.choice(self.endpoints)
        self.inject_chaos(endpoint)

        # Give the target a moment before probing it again
        time.sleep(1)

        with self.client.get(endpoint, name=f"Recovery Test - {endpoint}") as response:
            if response.status_code == 200:
                print(f"Recovery test {endpoint}: System recovered successfully")
            else:
                print(
                    f"Recovery test {endpoint}: "
                    f"System still having issues ({response.status_code})"
                )

Setup Instructions

1. Configure the chaos parameters (failure rate, scenarios).
2. Define the endpoints you want to test with chaos.
3. Adjust the timeout values based on your system.
4. Monitor system behavior during chaos testing.

What This Tests

Timeout Handling: How the system responds to request timeouts

Error Recovery: The system's ability to recover from failures

Bad Input Handling: Response to malformed requests

Connection Failures: Behavior when connections fail

Chaos Scenarios

Timeouts: Very short request timeouts

Slow Responses: Artificially delayed requests

Connection Errors: Simulated network failures

Server Errors: Requests designed to trigger 5xx errors

Client Errors: Malformed requests causing 4xx errors

Monitoring During Chaos

Watch for:

Error Rates: Acceptable failure rates during chaos

Recovery Time: How quickly the system recovers

Cascading Failures: Failures spreading to other services

Resource Usage: CPU/memory during failure scenarios

Safety Tips