This guide shows how to introduce controlled chaos into your load tests to validate system resilience. It is well suited to testing how your application handles failures and unexpected conditions.
Use Cases
- Test system resilience under failures
- Validate error handling and recovery
- Check timeout behavior
- Test circuit breaker patterns
Simple Implementation
from locust import task, HttpUser
import random
import time
class ChaosTestUser(HttpUser):
    """Locust user that mixes ordinary traffic with deliberately hostile requests.

    Three tasks are registered: ``normal_request`` (weight 4),
    ``intentional_chaos`` (weight 1) and ``test_system_recovery`` (weight 1).
    Per-user configuration (``chaos_enabled``, ``failure_rate``, ``endpoints``,
    ``chaos_scenarios``) is assigned in ``on_start``.

    Locust's ``HttpSession`` does not raise on connection errors or timeouts;
    it returns a dummy response whose ``status_code`` is 0.  The scenario
    methods therefore inspect ``status_code`` instead of catching exceptions.
    """

    # Base URL used to provoke connection errors.
    # NOTE(review): presumably this name does not resolve - confirm, or switch
    # to a reserved ".invalid" name so no third party can ever receive traffic.
    UNREACHABLE_HOST = "http://invalid-host-12345.com"

    def on_start(self) -> None:
        """Initialise the chaos settings and target endpoints for this user."""
        # Chaos configuration
        self.chaos_enabled = True
        self.failure_rate = 0.1  # 10% of requests will have chaos

        # Test endpoints
        self.endpoints = [
            "/api/users",
            "/api/products",
            "/api/orders",
            "/api/health",
        ]

        # Chaos scenarios; each name must be known to _run_scenario
        self.chaos_scenarios = [
            "timeout",
            "slow_response",
            "random_failure",
            "bad_request",
        ]

    @task(4)
    def normal_request(self) -> None:
        """Hit a random endpoint, injecting chaos with probability ``failure_rate``."""
        endpoint = random.choice(self.endpoints)

        # Decide if we should inject chaos
        if self.chaos_enabled and random.random() < self.failure_rate:
            self.inject_chaos(endpoint)
        else:
            self.make_normal_request(endpoint)

    @task(1)
    def intentional_chaos(self) -> None:
        """Always run one randomly chosen chaos scenario against a random endpoint."""
        endpoint = random.choice(self.endpoints)
        scenario = random.choice(self.chaos_scenarios)
        print(f"Intentional chaos: {scenario} on {endpoint}")
        # Unknown scenario names are ignored here (no request is made).
        self._run_scenario(scenario, endpoint)

    def inject_chaos(self, endpoint: str) -> None:
        """Run a randomly chosen chaos scenario against ``endpoint``.

        Falls back to a normal request when the chosen scenario name is not
        recognised, so a request is always issued.
        """
        scenario = random.choice(self.chaos_scenarios)
        print(f"Chaos injection: {scenario} on {endpoint}")
        if not self._run_scenario(scenario, endpoint):
            self.make_normal_request(endpoint)

    def _run_scenario(self, scenario: str, endpoint: str) -> bool:
        """Dispatch ``scenario`` to its handler.

        Returns:
            True if a handler for ``scenario`` exists and was called,
            False if the name is unknown.
        """
        # Single source of truth so every entry point supports every scenario.
        handlers = {
            "timeout": self.test_timeout_resilience,
            "slow_response": self.test_slow_response,
            "random_failure": self.test_random_failure,
            "bad_request": self.test_bad_request,
        }
        handler = handlers.get(scenario)
        if handler is None:
            return False
        handler(endpoint)
        return True

    @staticmethod
    def _transport_failure(response) -> str:
        """Describe why no HTTP response was received.

        Returns the class name of the exception attached to ``response`` as
        ``error`` when there is one, otherwise the literal ``"no response"``.
        """
        error = getattr(response, "error", None)
        return type(error).__name__ if error is not None else "no response"

    def make_normal_request(self, endpoint: str) -> None:
        """Make a normal request without chaos and print the outcome."""
        with self.client.get(endpoint, name=f"Normal - {endpoint}") as response:
            if response.status_code == 200:
                print(f"Normal request to {endpoint}: SUCCESS")
            else:
                print(f"Normal request to {endpoint}: {response.status_code}")

    def test_timeout_resilience(self, endpoint: str) -> None:
        """GET ``endpoint`` with a 0.5 s timeout and report what came back."""
        with self.client.get(
            endpoint,
            timeout=0.5,  # Very short timeout
            name=f"Chaos Timeout - {endpoint}",
        ) as response:
            if response.status_code == 0:
                # Status 0 means the request never produced an HTTP response
                # (timeout or another transport failure); this is the outcome
                # the scenario is trying to provoke.
                print(
                    f"Timeout test {endpoint}: Timeout occurred (expected)"
                    f" [{self._transport_failure(response)}]"
                )
            elif response.status_code == 200:
                print(f"Timeout test {endpoint}: Surprisingly fast response")
            else:
                print(f"Timeout test {endpoint}: {response.status_code}")

    def test_slow_response(self, endpoint: str) -> None:
        """Pause 2-5 s, then GET ``endpoint`` and print the request duration.

        The pause happens on the client before the request is sent; it is not
        included in the printed duration.
        """
        # Add artificial delay before request
        delay = random.uniform(2.0, 5.0)
        time.sleep(delay)

        # perf_counter is monotonic, so the duration cannot go negative if the
        # wall clock is adjusted mid-request.
        started = time.perf_counter()
        with self.client.get(endpoint, name=f"Chaos Slow - {endpoint}") as response:
            response_time = time.perf_counter() - started
            if response.status_code == 200:
                print(f"Slow response test {endpoint}: {response_time:.2f}s")
            else:
                print(
                    f"Slow response test {endpoint}: "
                    f"{response.status_code} in {response_time:.2f}s"
                )

    def test_random_failure(self, endpoint: str) -> None:
        """Run one randomly chosen failure simulation against ``endpoint``."""
        # Simulate different types of failures
        failure_types = [
            ("connection_error", self.simulate_connection_error),
            ("server_error", self.simulate_server_error),
            ("client_error", self.simulate_client_error),
        ]
        failure_type, failure_func = random.choice(failure_types)
        print(f"Random failure test {endpoint}: {failure_type}")
        failure_func(endpoint)

    def simulate_connection_error(self, endpoint: str) -> None:
        """Request ``endpoint`` on ``UNREACHABLE_HOST`` to provoke a connection error."""
        # ``endpoint`` is a bare path, so the absolute URL has to be built
        # explicitly; an absolute URL bypasses the user's configured host.
        invalid_url = f"{self.UNREACHABLE_HOST}{endpoint}"
        with self.client.get(
            invalid_url,
            timeout=2.0,
            name=f"Chaos Connection Error - {endpoint}",
        ) as response:
            if response.status_code == 0:
                print(
                    "Connection error test: Failed as expected "
                    f"({self._transport_failure(response)})"
                )
            else:
                print("Connection error test: Unexpected success")

    def simulate_server_error(self, endpoint: str) -> None:
        """POST an oversized JSON body to ``endpoint`` and classify the status code."""
        # Try to trigger server errors with bad data
        bad_data = {"invalid": "data" * 1000}  # Large payload
        with self.client.post(
            endpoint,
            json=bad_data,
            name=f"Chaos Server Error - {endpoint}",
        ) as response:
            status = response.status_code
            if status == 0:
                # No HTTP response at all; must not be reported as a success.
                print(
                    f"Server error test {endpoint}: No response "
                    f"({self._transport_failure(response)})"
                )
            elif status >= 500:
                print(f"Server error test {endpoint}: Got {status} (expected)")
            elif status >= 400:
                print(f"Server error test {endpoint}: Got {status} (client error)")
            else:
                print(f"Server error test {endpoint}: Unexpected success {status}")

    def simulate_client_error(self, endpoint: str) -> None:
        """Send one randomly chosen malformed request to ``endpoint``."""
        scenario = random.choice(
            ["invalid_json", "missing_content_type", "wrong_method"]
        )

        if scenario == "invalid_json":
            with self.client.post(
                endpoint,
                data='{"invalid": json}',  # Invalid JSON string
                headers={"Content-Type": "application/json"},
                name=f"Chaos Invalid JSON - {endpoint}",
            ) as response:
                print(f"Invalid JSON test {endpoint}: {response.status_code}")
        elif scenario == "missing_content_type":
            # Despite the scenario name, this sends a JSON body labelled with
            # the wrong Content-Type rather than omitting the header.
            with self.client.post(
                endpoint,
                json={"data": "test"},
                headers={"Content-Type": "text/plain"},  # Wrong content type
                name=f"Chaos Wrong Content-Type - {endpoint}",
            ) as response:
                print(f"Wrong content-type test {endpoint}: {response.status_code}")
        else:  # wrong_method
            with self.client.delete(
                endpoint,
                name=f"Chaos Wrong Method - {endpoint}",
            ) as response:
                print(f"Wrong method test {endpoint}: {response.status_code}")

    def test_bad_request(self, endpoint: str) -> None:
        """GET ``endpoint`` with invalid query parameters and report the status."""
        # Send requests with bad parameters
        bad_params = {
            "invalid_param": "value",
            "limit": -1,  # Invalid limit
            "page": "not_a_number",  # Invalid page
        }
        with self.client.get(
            endpoint,
            params=bad_params,
            name=f"Chaos Bad Request - {endpoint}",
        ) as response:
            if response.status_code == 400:
                print(f"Bad request test {endpoint}: Properly rejected (400)")
            elif response.status_code == 200:
                print(f"Bad request test {endpoint}: Accepted bad params (potential issue)")
            else:
                print(f"Bad request test {endpoint}: {response.status_code}")

    @task(1)
    def test_system_recovery(self) -> None:
        """Inject chaos, wait one second, then check the same endpoint responds."""
        # First, cause some chaos
        endpoint = random.choice(self.endpoints)
        self.inject_chaos(endpoint)

        # Wait a bit
        time.sleep(1)

        # Then test if system is still responsive
        with self.client.get(endpoint, name=f"Recovery Test - {endpoint}") as response:
            if response.status_code == 200:
                print(f"Recovery test {endpoint}: System recovered successfully")
            else:
                print(
                    f"Recovery test {endpoint}: "
                    f"System still having issues ({response.status_code})"
                )
Setup Instructions
- Configure chaos parameters (failure rate, scenarios)
- Define endpoints you want to test with chaos
- Adjust timeout values based on your system
- Monitor system behavior during chaos testing
What This Tests
- Timeout Handling: How system responds to request timeouts
- Error Recovery: System's ability to recover from failures
- Bad Input Handling: Response to malformed requests
- Connection Failures: Behavior when connections fail
Chaos Scenarios
- Timeouts: Very short request timeouts
- Slow Responses: Requests sent after an artificial client-side delay
- Connection Errors: Simulated network failures
- Server Errors: Requests designed to trigger 5xx errors
- Client Errors: Malformed requests causing 4xx errors
Monitoring During Chaos
Watch for:
- Error Rates: Acceptable failure rates during chaos
- Recovery Time: How quickly system recovers
- Cascading Failures: Failures spreading to other services
- Resource Usage: CPU/memory during failure scenarios
Safety Tips
- Start Small: Begin with low failure rates
- Monitor Closely: Watch system metrics during tests
- Have Rollback: Be ready to stop chaos if needed
- Test in Staging: Don't run chaos in production initially