From 2d1c584ecac962e288e4a3b9e7a5bcbdc005c9b4 Mon Sep 17 00:00:00 2001
From: Min RK <benjaminrk@gmail.com>
Date: Tue, 28 Mar 2023 15:06:04 +0200
Subject: [PATCH] Fix wait/fail conditions in hub culling tests

- actually check for running server, matching comments, not just user existence
- catch asyncio.TimeoutError, not the builtin TimeoutError (distinct classes before Python 3.11)
- fail if condition is not met, rather than passing in both cases
- update some comments for accuracy (max age and timeout aren't the same)
---
 integration-tests/test_hub.py | 86 ++++++++++++++++++++++-------------
 1 file changed, 54 insertions(+), 32 deletions(-)

diff --git a/integration-tests/test_hub.py b/integration-tests/test_hub.py
index a056f62..fa5415d 100644
--- a/integration-tests/test_hub.py
+++ b/integration-tests/test_hub.py
@@ -346,12 +346,12 @@ async def test_idle_server_culled():
             )
         ).wait()
     )
-    # Check every 10s for idle servers to cull
+    # Check every 5s for idle servers to cull
     assert (
         0
         == await (
             await asyncio.create_subprocess_exec(
-                *TLJH_CONFIG_PATH, "set", "services.cull.every", "10"
+                *TLJH_CONFIG_PATH, "set", "services.cull.every", "5"
             )
         ).wait()
     )
@@ -364,12 +364,12 @@ async def test_idle_server_culled():
             )
         ).wait()
     )
-    # Cull servers and users after 60s of activity
+    # Cull servers and users after 30s, regardless of activity
     assert (
         0
         == await (
             await asyncio.create_subprocess_exec(
-                *TLJH_CONFIG_PATH, "set", "services.cull.max_age", "60"
+                *TLJH_CONFIG_PATH, "set", "services.cull.max_age", "30"
             )
         ).wait()
     )
@@ -388,25 +388,50 @@ async def test_idle_server_culled():
         assert pwd.getpwnam(f"jupyter-{username}") is not None
 
         # Check that we can get to the user's server
-        r = await u.session.get(
-            u.hub_url / "hub/api/users" / username,
-            headers={"Referer": str(u.hub_url / "hub/")},
-        )
+        user_url = u.notebook_url / "api/status"
+        r = await u.session.get(user_url, allow_redirects=False)
         assert r.status == 200
 
-        async def _check_culling_done():
-            # Check that after 60s, the user and server have been culled and are not reacheable anymore
+        # Check that we can talk to JupyterHub itself
+        # use this as a proxy for whether the user still exists
+        async def hub_api_request():
             r = await u.session.get(
-                u.hub_url / "hub/api/users" / username,
+                u.hub_url / "hub/api/user",
                 headers={"Referer": str(u.hub_url / "hub/")},
+                allow_redirects=False,
             )
-            print(r.status)
+            return r
+
+        r = await hub_api_request()
+        assert r.status == 200
+
+        # Wait for culling
+        # step 1: wait for the server to be stopped
+        timeout = 100
+
+        async def server_stopped():
+            """Has the server been stopped?"""
+            r = await u.session.get(user_url, allow_redirects=False)
+            print(f"{r.status} {r.url}")
+            return r.status != 200
+
+        await exponential_backoff(
+            server_stopped,
+            "Server still running!",
+            timeout=timeout,
+        )
+
+        # step 2. wait for user to be deleted
+        async def user_removed():
+            # The user counts as culled once the hub API request is rejected with 403
+            r = await hub_api_request()
+            print(f"{r.status} {r.url}")
             return r.status == 403
 
         await exponential_backoff(
-            _check_culling_done,
-            "Server culling failed!",
-            timeout=100,
+            user_removed,
+            "User still exists!",
+            timeout=timeout,
         )
 
 
@@ -429,12 +454,12 @@ async def test_active_server_not_culled():
             )
         ).wait()
     )
-    # Check every 10s for idle servers to cull
+    # Check every 5s for idle servers to cull
     assert (
         0
         == await (
             await asyncio.create_subprocess_exec(
-                *TLJH_CONFIG_PATH, "set", "services.cull.every", "10"
+                *TLJH_CONFIG_PATH, "set", "services.cull.every", "5"
             )
         ).wait()
     )
@@ -447,7 +472,7 @@ async def test_active_server_not_culled():
             )
         ).wait()
     )
-    # Cull servers and users after 60s of activity
+    # Cull servers and users after 30s, regardless of activity
     assert (
         0
         == await (
@@ -471,27 +496,24 @@ async def test_active_server_not_culled():
         assert pwd.getpwnam(f"jupyter-{username}") is not None
 
         # Check that we can get to the user's server
-        r = await u.session.get(
-            u.hub_url / "hub/api/users" / username,
-            headers={"Referer": str(u.hub_url / "hub/")},
-        )
+        user_url = u.notebook_url / "api/status"
+        r = await u.session.get(user_url, allow_redirects=False)
         assert r.status == 200
 
-        async def _check_culling_done():
+        async def server_has_stopped():
             # Check that after 30s, we can still reach the user's server
-            r = await u.session.get(
-                u.hub_url / "hub/api/users" / username,
-                headers={"Referer": str(u.hub_url / "hub/")},
-            )
-            print(r.status)
+            r = await u.session.get(user_url, allow_redirects=False)
+            print(f"{r.status} {r.url}")
             return r.status != 200
 
         try:
             await exponential_backoff(
-                _check_culling_done,
-                "User's server is still reacheable!",
+                server_has_stopped,
+                "User's server is still reachable (good!)",
                 timeout=30,
             )
-        except TimeoutError:
-            # During the 30s timeout the user's server wasn't culled, which is what we intended.
+        except asyncio.TimeoutError:
+            # timeout error means the test passed - the server didn't go away while we were waiting
             pass
+        else:
+            pytest.fail(f"Server at {user_url} got culled prematurely!")