Sanity Check Guide: Why Tests Pass But Apps Fail
The Problem: Green Tests, Broken Reality
Scenario: An LLM writes you 50 tests. All pass ✅. You deploy. Users complain the app is broken 🔥.
Root Cause: Tests validate that code works as written, not as intended.
The "Sanity Check" Framework
1. The Reality Gap Audit 🕵️
Before trusting any test suite, compare what it checks against what should actually happen:
```python
# ❌ LLM Test: Tests what the code does
def test_calculate_shipping():
    result = calculate_shipping(weight=10, distance=100)
    assert result == 50  # Code returns 50, test passes

# ✅ Reality Check: Tests what SHOULD happen
def test_calculate_shipping_makes_business_sense():
    # $50 shipping for a 10 lb package across 100 miles?
    result = calculate_shipping(weight=10, distance=100)

    # Sanity bounds based on REAL business logic
    assert 5 <= result <= 25  # Reasonable shipping cost

    # Test against known competitor pricing
    competitor_cost = 12.50
    assert abs(result - competitor_cost) <= 5  # Within a reasonable range
```
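You can push the same idea further with property-based checks: instead of one hardcoded input, assert invariants that should hold for any plausible input. A sketch using the hypothesis library, assuming calculate_shipping accepts the weight/distance keywords shown above and that cost grows with weight (a business assumption worth confirming before you rely on it):

```python
from hypothesis import given, strategies as st

@given(
    weight=st.floats(min_value=0.1, max_value=100),
    distance=st.floats(min_value=1, max_value=3000),
)
def test_shipping_sanity_invariants(weight, distance):
    cost = calculate_shipping(weight=weight, distance=distance)

    # Never free, never absurd
    assert 0 < cost < 500

    # A heavier package over the same distance shouldn't cost less
    assert calculate_shipping(weight=weight + 1, distance=distance) >= cost
```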
Common "Green but Wrong" Test Patterns
🚨 Pattern 1: "Circular Logic" Tests
```python
# ❌ BAD: Tests the implementation, not the requirement
class UserService:
    def get_user_display_name(self, user_id):
        user = self.db.get_user(user_id)
        return f"{user.first_name}_{user.last_name}"  # Bug: underscore!

# LLM writes this test (and it passes):
def test_get_user_display_name():
    service = UserService()
    result = service.get_user_display_name(123)
    assert result == "John_Doe"  # ✅ Passes, but users see "John_Doe"!

# ✅ GOOD: Tests the actual requirement
def test_get_user_display_name_for_humans():
    service = UserService()
    result = service.get_user_display_name(123)

    # Test what users actually expect to see
    assert " " in result      # Should have a space, not an underscore
    assert "_" not in result  # Should not contain underscores
    assert result == "John Doe"  # What users actually want
```
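For completeness, the fix the requirement-level test forces is a one-character change back in UserService:

```python
class UserService:
    def get_user_display_name(self, user_id):
        user = self.db.get_user(user_id)
        # Join with a space: the format users actually see
        return f"{user.first_name} {user.last_name}"
```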
🚨 Pattern 2: "Happy Path Only" Tests
```python
# ❌ BAD: Only tests perfect scenarios
def test_process_payment():
    payment = create_valid_payment()
    result = process_payment(payment)
    assert result.success

# ✅ GOOD: Tests what actually happens in production
def test_payment_processing_reality():
    valid_payment = create_valid_payment()

    # Test network failures (happens 2-5% of the time)
    with mock_network_timeout():
        result = process_payment(valid_payment)
        assert result.has_retry_mechanism()

    # Test invalid cards (happens 10-15% of the time)
    invalid_card = Payment(card_number="1234567890123456")
    result = process_payment(invalid_card)
    assert result.error_message_is_user_friendly()

    # Test duplicate payments (common user behavior)
    process_payment(valid_payment)
    duplicate_result = process_payment(valid_payment)
    assert duplicate_result.prevents_double_charge()
```
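mock_network_timeout() above isn't a standard helper; a minimal sketch, assuming payments go out through requests.post:

```python
import contextlib
from unittest import mock

import requests

@contextlib.contextmanager
def mock_network_timeout():
    """Make every outbound requests.post raise, simulating a flaky network."""
    with mock.patch("requests.post", side_effect=requests.Timeout("simulated timeout")):
        yield
```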
🚨 Pattern 3: "Mock Everything" Tests
```python
# ❌ BAD: Mocks hide integration problems
@mock.patch('database.get_user')
@mock.patch('email_service.send_welcome')
@mock.patch('payment_processor.charge_card')
def test_user_registration(mock_payment, mock_email, mock_db):
    mock_db.return_value = fake_user
    mock_email.return_value = True
    mock_payment.return_value = success_response

    result = register_user(user_data)
    assert result.success  # ✅ Always passes!

# ✅ GOOD: Test with real dependencies (at least sometimes)
def test_user_registration_integration():
    # Use a test database and a real email service (in test mode)
    with test_database(), test_email_service():
        result = register_user(valid_user_data)

        # Verify ACTUAL side effects happened
        assert user_exists_in_database(result.user_id)
        assert welcome_email_was_sent(result.user_id)
        assert user_can_actually_login(result.user_id)
```
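The test_database() and test_email_service() context managers are assumed helpers, not standard APIs. One minimal sketch, using an in-memory SQLite database and a hypothetical email client with a sandbox switch:

```python
import contextlib
import sqlite3

@contextlib.contextmanager
def test_database(schema_file="schema.sql"):
    """Give the test a real (throwaway) database instead of a mock."""
    conn = sqlite3.connect(":memory:")
    with open(schema_file) as f:  # assumes your DDL lives in schema.sql
        conn.executescript(f.read())
    try:
        yield conn
    finally:
        conn.close()

@contextlib.contextmanager
def test_email_service():
    """Route mail through the provider's sandbox so sends are real but harmless."""
    email_client.test_mode = True  # hypothetical client with a sandbox flag
    try:
        yield
    finally:
        email_client.test_mode = False
```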
The "Sanity Check" Checklist
⚡ Quick Reality Checks (Run These First)
```python
import time

import pytest

# 1. Boundary Value Reality Check
def test_realistic_boundaries():
    # If your app calculates prices, test realistic scenarios
    expensive_item = calculate_price(item="laptop")
    assert 100 <= expensive_item <= 10000  # Reasonable laptop price

    cheap_item = calculate_price(item="pen")
    assert 0.10 <= cheap_item <= 50  # Reasonable pen price

# 2. Time-Based Reality Check
def test_reasonable_performance():
    start_time = time.time()
    result = search_users("john")
    duration = time.time() - start_time

    assert duration < 2.0  # Users won't wait longer than 2 seconds
    assert len(result) <= 100  # Don't return 10,000 results

# 3. User Experience Reality Check
def test_error_messages_make_sense():
    with pytest.raises(ValidationError) as exc_info:
        create_user(email="not-an-email")

    error_msg = str(exc_info.value).lower()

    # Users should understand the error
    assert "email" in error_msg
    assert "invalid" in error_msg or "format" in error_msg

    # Should NOT contain technical jargon
    assert "regex" not in error_msg
    assert "validation.py line 47" not in error_msg
```
🔍 Deep Sanity Audit Questions
For each passing test, ask:
- "Would a user actually do this?" ```python # ❌ Tests unrealistic user behavior def test_user_enters_perfect_data(): user = User(name="John", email="perfect@email.com", age=30)
# ✅ Tests realistic user behavior
def test_user_enters_messy_data():
user = User(name=" john smith ", email="JOHN@GMAIL.COM", age="30")
cleaned = clean_user_data(user)
assert cleaned.name == "John Smith"
assert cleaned.email == "john@gmail.com"
```
-
"What happens when external services fail?"
python # Test your app when the payment processor is down def test_payment_service_unavailable(): with mock_service_unavailable('payment_processor'): result = checkout(cart) # Should gracefully handle, not crash assert result.user_friendly_error_message assert cart.items_are_still_saved() -
"Does this work with real data volumes?" ```python # ❌ Tests with 3 records def test_with_tiny_dataset(): users = [user1, user2, user3] result = process_users(users)
# ✅ Tests with realistic data size def test_with_realistic_dataset(): users = generate_test_users(count=10000) # More realistic start_time = time.time() result = process_users(users)
assert time.time() - start_time < 30 # Should complete in reasonable time
assert not memory_usage_exceeded_limit()
```
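generate_test_users() above is an assumed factory, not a library function; a minimal sketch that deliberately produces the messy records real signups generate:

```python
import random

def generate_test_users(count):
    """Build users with the messy names, emails, and ages real signups produce."""
    names = ["  john smith  ", "ANNA LEE", "bob", "Séan O'Brien"]
    domains = ["gmail.com", "yahoo.com", "company.co.uk"]
    return [
        User(
            name=random.choice(names),
            email=f"USER{i}@{random.choice(domains)}",
            age=str(random.randint(13, 99)),  # Ages often arrive as strings
        )
        for i in range(count)
    ]
```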
The "Manual Spot Check" Protocol
🛠️ After Tests Pass, Do This:
```python
# 1. Actually Run Your App Manually
"""
- Open your browser
- Go through the signup flow
- Try to break things on purpose
- Use weird inputs: "Robert'); DROP TABLE users;--"
- Try it on mobile
"""
```
```python
# 2. Check Logs for Warnings
def test_no_scary_logs():
    with capture_logs() as log_capture:
        result = run_typical_workflow()

    logs = log_capture.records

    # A typical workflow should not generate warnings or errors
    assert not any(log.levelname == "ERROR" for log in logs)
    assert not any(log.levelname == "WARNING" for log in logs)
    assert not any(log.levelname == "CRITICAL" for log in logs)
```
```python
# 3. Database State Sanity Check
def test_database_state_makes_sense():
    # Run a typical workflow
    user = create_user("test@example.com")
    order = create_order(user, items=[laptop, mouse])

    # Check database state manually
    db_user = User.objects.get(email="test@example.com")
    assert db_user.orders.count() == 1
    assert db_user.orders.first().total > 0

    # Check for data corruption
    assert all_foreign_keys_valid()
    assert no_orphaned_records()
```
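Helpers like no_orphaned_records() are assumed; with a Django-style ORM, a minimal version can hunt for child rows whose parent is gone (OrderItem/Order are stand-ins for your own models):

```python
def no_orphaned_records():
    """True when every OrderItem still points at an Order that actually exists."""
    orphans = OrderItem.objects.exclude(order_id__in=Order.objects.values("id"))
    return not orphans.exists()
```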
Red Flags: When to Distrust Passing Tests
🚩 Suspicious Test Patterns:
```python
# 🚩 Red Flag 1: Tests that never fail
def test_that_always_passes():
    result = some_function()
    assert True  # This test is useless!

# 🚩 Red Flag 2: Tests with hardcoded expectations
def test_hardcoded_expectations():
    users = get_all_users()
    assert len(users) == 5  # What happens when you add or remove a user?

# 🚩 Red Flag 3: Tests that skip edge cases
def test_only_happy_path():
    result = divide(10, 2)
    assert result == 5
    # Missing: divide(10, 0), divide(0, 10), divide(-5, 2), etc.
```
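One cheap antidote to Red Flag 3 is enumerating the edges explicitly. A sketch with pytest.mark.parametrize, assuming divide raises ZeroDivisionError on a zero divisor:

```python
import pytest

@pytest.mark.parametrize("a, b, expected", [
    (10, 2, 5),     # Happy path
    (0, 10, 0),     # Zero numerator
    (-5, 2, -2.5),  # Negative input
    (7, 2, 3.5),    # Non-integer result
])
def test_divide_edges(a, b, expected):
    assert divide(a, b) == expected

def test_divide_by_zero_is_handled():
    with pytest.raises(ZeroDivisionError):
        divide(10, 0)
```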
🚩 Environmental Red Flags:
```bash
# If your tests need perfect conditions to pass:
export PYTHONPATH=/perfect/path
export DATABASE_URL=sqlite:///perfect_test.db
export API_KEY=fake_key_that_never_expires

# Real apps need to handle imperfect environments
```
Practical "Sanity Suite" for Your Apps
🎯 For Django Apps:
```python
# tests/sanity_checks.py
from django.test import TestCase

class SanityTestSuite(TestCase):
    def test_admin_actually_works(self):
        """Can an admin user actually use the Django admin?"""
        self.client.login(username='admin', password='admin')
        response = self.client.get('/admin/')
        assert response.status_code == 200
        assert "Django administration" in response.content.decode()

    def test_static_files_load(self):
        """Do CSS/JS files actually exist and load?"""
        response = self.client.get('/static/css/main.css')
        assert response.status_code == 200
        assert len(response.content) > 0

    def test_database_migrations_work(self):
        """Do all migrations actually apply cleanly?"""
        from django.core.management import call_command
        call_command('migrate', '--check')  # Should not raise
```
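Note the admin test assumes an 'admin' superuser exists; a sketch of a setUp that makes the suite self-sufficient:

```python
from django.contrib.auth import get_user_model

def setUp(self):  # add inside SanityTestSuite
    # The login in test_admin_actually_works assumes this account exists
    get_user_model().objects.create_superuser(
        username='admin', email='admin@example.com', password='admin'
    )
```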
🎯 For FastAPI Apps:
```python
def test_api_docs_accessible():
    """Users should be able to see the API documentation."""
    response = client.get("/docs")
    assert response.status_code == 200
    assert "swagger" in response.text.lower()

def test_health_check_meaningful():
    """The health check should actually verify app health."""
    response = client.get("/health")
    health_data = response.json()

    # Should check real dependencies
    assert health_data["database"] == "connected"
    assert health_data["external_api"] == "reachable"
    assert "uptime" in health_data
```
🎯 For Data Pipelines:
```python
def test_pipeline_with_real_sample_data():
    """Use an actual production data sample (anonymized)."""
    sample_data = load_production_sample("last_week_sample.csv")
    result = run_pipeline(sample_data)

    # Verify realistic expectations
    assert len(result) > 0
    assert all(record.is_valid() for record in result)
    assert no_data_was_corrupted(sample_data, result)
```
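no_data_was_corrupted() is likewise assumed; a minimal sketch that checks cheap invariants, on the assumption that records carry a stable .id (which invariants matter is a per-pipeline decision):

```python
def no_data_was_corrupted(inputs, outputs):
    """Cheap invariants: nothing invented, every output traces back to an input."""
    if len(outputs) > len(inputs):
        return False  # Pipeline invented records out of thin air
    input_ids = {record.id for record in inputs}
    return all(record.id in input_ids for record in outputs)
```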
Key Takeaway
Green tests don't guarantee working software.
The sanity check mindset is:
1. Test what users actually experience
2. Use realistic data and scenarios
3. Manually spot-check critical paths
4. Question tests that seem "too easy"
Think of sanity checks as "Would I trust this with my own money?" tests.
If you wouldn't use your own app based on what the tests cover, your tests need work.