Merge pull request #366 from GeorgianaElena/addIdleCuller

Add idle culler
2025-12-18 21:54:05 +08:00 · 2019-06-14 09:50:06 -07:00
parent 4daa9650a4 346701502d
commit ba86dcb405
6 changed files with 642 additions and 1 deletions
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -124,6 +124,7 @@ Topic guides provide in-depth explanations of specific topics.
   topic/tljh-config
   topic/authenticator-configuration
   topic/escape-hatch
+   topic/idle-culler


 Troubleshooting
--- a/docs/topic/idle-culler.rst
+++ b/docs/topic/idle-culler.rst
@@ -0,0 +1,114 @@
+.. _topic/idle-culler:
+
+=============================
+Culling idle notebook servers
+=============================
+
+The idle culler automatically shuts down user notebook servers when they have
+not been used for a certain time period, in order to reduce the total resource
+usage on your JupyterHub.
+
+
+JupyterHub pings the user's notebook server at certain time intervals. If no response
+is received from the server during these checks and the timeout expires, the server is
+considered to be *inactive (idle)* and will be culled.
+
+
+Default settings
+================
+
+By default, JupyterHub will ping the user notebook servers every 60s to check their
+status. Every server found to be idle for more than 10 minutes will be culled.
+
+.. code-block:: python
+
+	services.cull.every = 60
+	services.cull.timeout = 600
+
+Because the servers don't have a maximum age set, an active server will not be shut down
+regardless of how long it has been up and running.
+
+.. code-block:: python
+
+	services.cull.max_age = 0
+
+If, after the culling process, there are users with no active notebook servers, by default
+the users will not be culled alongside their notebooks and will continue to exist.
+
+.. code-block:: python
+
+	services.cull.users = False
+
+
+Configuring the idle culler
+===========================
+
+The available configuration options are:
+
+Idle timeout
+------------
+The idle timeout is the maximum time (in seconds) a server can be inactive before it
+will be culled. The timeout can be configured using:
+
+.. code-block:: bash
+
+	sudo tljh-config set services.cull.timeout <max-idle-sec-before-server-is-culled>
+	sudo tljh-config reload
+
+Idle check interval
+-------------------
+The idle check interval represents how frequently (in seconds) the Hub will
+check if there are any idle servers to cull. It can be configured using:
+
+.. code-block:: bash
+
+ 	sudo tljh-config set services.cull.every <number-of-sec-this-check-is-done>
+ 	sudo tljh-config reload
+
+Maximum age
+-----------
+The maximum age sets the longest time (in seconds) a server may keep running.
+Servers that exceed the maximum age will be culled even if they are active.
+A maximum age of 0 deactivates this option.
+The maximum age can be configured using:
+
+.. code-block:: bash
+
+ 	sudo tljh-config set services.cull.max_age <server-max-age>
+ 	sudo tljh-config reload
+
+User culling
+------------
+In addition to servers, it is also possible to cull the users. This is usually
+suited for temporary-user cases such as *tmpnb*.
+User culling can be activated using the following command:
+
+.. code-block:: bash
+
+ 	sudo tljh-config set services.cull.users True
+ 	sudo tljh-config reload
+
+Concurrency
+-----------
+Deleting a lot of users at the same time can slow down the Hub.
+The number of concurrent requests made to the Hub can be configured using:
+
+.. code-block:: bash
+
+ 	sudo tljh-config set services.cull.concurrency <number-of-concurrent-hub-requests>
+ 	sudo tljh-config reload
+
+Because TLJH is used for a small number of users, cases that require modifying
+the concurrency limit should be rare.
+
+
+Disabling the idle culler
+=========================
+
+The idle culling service is enabled by default. To disable it, use the following
+command:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.enabled False
+   sudo tljh-config reload
--- a/integration-tests/test_hub.py
+++ b/integration-tests/test_hub.py
@@ -1,6 +1,7 @@
 import requests
 from hubtraf.user import User
 from hubtraf.auth.dummy import login_dummy
+from jupyterhub.utils import exponential_backoff
 import secrets
 import pytest
 from functools import partial
@@ -138,3 +139,98 @@ async def test_long_username():
            '--no-pager'
        ])
        raise
+
+
+@pytest.mark.asyncio
+async def test_idle_server_culled():
+    """
+    Log in, start a server, then leave it alone until the culler removes it.
+
+    The culler is configured to check every 10s, to cull users as well as
+    servers, and to use a maximum age of 60s. The test then polls the Hub's
+    user API endpoint (for at most 100s) until the request is answered
+    with a 403.
+    """
+    # This *must* be localhost, not an IP
+    # aiohttp throws away cookies if we are connecting to an IP!
+    hub_url = 'http://localhost'
+    username = secrets.token_hex(8)
+
+    settings = (
+        ('auth.type', 'dummyauthenticator.DummyAuthenticator'),
+        # Check every 10s for idle servers to cull
+        ('services.cull.every', "10"),
+        # Apart from servers, also cull users
+        ('services.cull.users', "True"),
+        # Cull servers and users once they are 60s old, even if they are active
+        ('services.cull.max_age', "60"),
+    )
+    for key, value in settings:
+        set_proc = await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', key, value)
+        assert 0 == await set_proc.wait()
+    reload_proc = await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'reload')
+    assert 0 == await reload_proc.wait()
+
+    async with User(username, hub_url, partial(login_dummy, password='')) as u:
+        await u.login()
+        # Start user's server
+        await u.ensure_server()
+        # Assert that the user exists
+        assert pwd.getpwnam(f'jupyter-{username}') is not None
+
+        # Before any culling happens, the Hub API answers for this user
+        r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+            headers={'Referer': str(u.hub_url / 'hub/')})
+        assert r.status == 200
+
+        async def _check_culling_done():
+            # Once the user and server have been culled, the same request is rejected with a 403
+            resp = await u.session.get(u.hub_url / 'hub/api/users' / username,
+                headers={'Referer': str(u.hub_url / 'hub/')})
+            print(resp.status)
+            return resp.status == 403
+
+        await exponential_backoff(
+            _check_culling_done,
+            "Server culling failed!",
+            timeout=100,
+        )
+
+@pytest.mark.asyncio
+async def test_active_server_not_culled():
+    """
+    User logs in, starts a server & keeps it for 30s
+    (the user's server should not be culled during this period).
+
+    The culler checks every 10s and uses a maximum age of 60s, so a server
+    that is polled for only 30s must still be there at the end. The test
+    fails if the Hub stops answering 200 for the user within that window.
+    """
+    # This *must* be localhost, not an IP
+    # aiohttp throws away cookies if we are connecting to an IP!
+    hub_url = 'http://localhost'
+    username = secrets.token_hex(8)
+
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'auth.type', 'dummyauthenticator.DummyAuthenticator')).wait()
+    # Check every 10s for idle servers to cull
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.every', "10")).wait()
+    # Apart from servers, also cull users
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.users', "True")).wait()
+    # Cull servers and users once they are 60s old, even if they are active
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.max_age', "60")).wait()
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'reload')).wait()
+
+    async with User(username, hub_url, partial(login_dummy, password='')) as u:
+        await u.login()
+        # Start user's server
+        await u.ensure_server()
+        # Assert that the user exists
+        assert pwd.getpwnam(f'jupyter-{username}') is not None
+
+        # Check that the Hub API answers for the user
+        r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+            headers={'Referer': str(u.hub_url / 'hub/')})
+        assert r.status == 200
+
+        async def _check_culling_done():
+            # Becomes true as soon as the Hub no longer answers 200 for the user
+            r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+                headers={'Referer': str(u.hub_url / 'hub/')})
+            print(r.status)
+            return r.status != 200
+
+        try:
+            await exponential_backoff(
+                _check_culling_done,
+                "User's server is still reacheable!",
+                timeout=30,
+            )
+        except TimeoutError:
+            # During the 30s timeout the user's server wasn't culled, which is what we intended.
+            pass
+        else:
+            # exponential_backoff only returns normally when the check passed,
+            # i.e. the user stopped being reachable before the 30s were over.
+            pytest.fail("User's server was culled before reaching its maximum age")
--- a/tests/test_configurer.py
+++ b/tests/test_configurer.py
@@ -3,6 +3,7 @@ Test configurer
 """

 import os
+import sys

 from tljh import configurer

@@ -187,6 +188,49 @@ def test_set_traefik_api():
    assert c.TraefikTomlProxy.traefik_api_password == '1234'


+def test_cull_service_default():
+    """
+    With no overrides, the cull service is registered with its default options
+    """
+    c = apply_mock_config({})
+
+    expected_service = {
+        'name': 'cull-idle',
+        'admin': True,
+        'command': [
+            sys.executable,
+            '/srv/src/tljh/cull_idle_servers.py',
+            '--timeout=600',
+            '--cull-every=60',
+            '--concurrency=5',
+            '--max-age=0',
+        ],
+    }
+    assert c.JupyterHub.services == [expected_service]
+
+
+def test_set_cull_service():
+    """
+    Overridden cull options end up on the cull service command line
+    """
+    overrides = {
+        'services': {
+            'cull': {
+                'every': 10,
+                'users': True,
+                'max_age': 60
+            }
+        }
+    }
+    c = apply_mock_config(overrides)
+
+    expected_service = {
+        'name': 'cull-idle',
+        'admin': True,
+        'command': [
+            sys.executable,
+            '/srv/src/tljh/cull_idle_servers.py',
+            '--timeout=600',
+            '--cull-every=10',
+            '--concurrency=5',
+            '--max-age=60',
+            '--cull-users',
+        ],
+    }
+    assert c.JupyterHub.services == [expected_service]
+
+
 def test_load_secrets(tljh_dir):
    """
    Test loading secret files
--- a/tljh/configurer.py
+++ b/tljh/configurer.py
@@ -9,6 +9,7 @@ FIXME: A strong feeling that JSON Schema should be involved somehow.
 """

 import os
+import sys

 from .config import CONFIG_FILE, STATE_DIR
 from .yaml import yaml
@@ -55,6 +56,16 @@ default = {
    'user_environment': {
        'default_app': 'classic',
    },
+    'services': {
+        'cull': {
+            'enabled': True,
+            'timeout': 600,
+            'every': 60,
+            'concurrency': 5,
+            'users': False,
+            'max_age': 0
+        }
+    }
 }

 def load_config(config_file=CONFIG_FILE):
@@ -86,6 +97,7 @@ def apply_config(config_overrides, c):
    update_user_environment(c, tljh_config)
    update_user_account_config(c, tljh_config)
    update_traefik_api(c, tljh_config)
+    update_services(c, tljh_config)


 def set_if_not_none(parent, key, value):
@@ -191,6 +203,38 @@ def update_traefik_api(c, config):
    c.TraefikTomlProxy.traefik_api_password = config['traefik_api']['password']


+def set_cull_idle_service(config):
+    """
+    Build the service definition for the idle culler.
+
+    Reads the options stored under config['services']['cull'] ('timeout',
+    'every', 'concurrency', 'max_age' and 'users') and turns them into the
+    command line used to run cull_idle_servers.py with the current Python
+    interpreter.
+
+    Returns a dict with 'name', 'admin' and 'command' keys, suitable for
+    inclusion in c.JupyterHub.services.
+    """
+    cull_config = config['services']['cull']
+
+    cull_cmd = [
+        sys.executable, '/srv/src/tljh/cull_idle_servers.py',
+        '--timeout=%d' % cull_config['timeout'],
+        '--cull-every=%d' % cull_config['every'],
+        '--concurrency=%d' % cull_config['concurrency'],
+        '--max-age=%d' % cull_config['max_age'],
+    ]
+    # --cull-users is a flag: only pass it when user culling is switched on
+    if cull_config['users']:
+        cull_cmd.append('--cull-users')
+
+    return {
+        'name': 'cull-idle',
+        'admin': True,
+        'command': cull_cmd,
+    }
+
+
+def update_services(c, config):
+    """
+    Set c.JupyterHub.services from the TLJH config.
+
+    The list is reset on every call; the idle culler service is added to it
+    only when services.cull.enabled is set.
+    """
+    c.JupyterHub.services = []
+    cull_enabled = config['services']['cull']['enabled']
+    if not cull_enabled:
+        return
+    c.JupyterHub.services.append(set_cull_idle_service(config))
+
+
 def _merge_dictionaries(a, b, path=None, update=True):
    """
    Merge two dictionaries recursively.
--- a/tljh/cull_idle_servers.py
+++ b/tljh/cull_idle_servers.py
@@ -0,0 +1,342 @@
+#!/usr/bin/env python3
+"""script to monitor and cull idle single-user servers
+
+Imported from https://github.com/jupyterhub/jupyterhub/blob/6b1046697/examples/cull-idle/cull_idle_servers.py
+
+Caveats:
+
+last_activity is not updated with high frequency,
+so cull timeout should be greater than the sum of:
+
+- single-user websocket ping interval (default: 30s)
+- JupyterHub.last_activity_interval (default: 5 minutes)
+
+You can run this as a service managed by JupyterHub with this in your config::
+
+
+    c.JupyterHub.services = [
+        {
+            'name': 'cull-idle',
+            'admin': True,
+            'command': 'python cull_idle_servers.py --timeout=3600'.split(),
+        }
+    ]
+
+Or run it manually by generating an API token and storing it in `JUPYTERHUB_API_TOKEN`:
+
+    export JUPYTERHUB_API_TOKEN=`jupyterhub token`
+    python cull_idle_servers.py [--timeout=900] [--url=http://127.0.0.1:8081/hub/api]
+"""
+
+from datetime import datetime, timezone
+from functools import partial
+import json
+import os
+
+try:
+    from urllib.parse import quote
+except ImportError:
+    from urllib import quote
+
+import dateutil.parser
+
+from tornado.gen import coroutine, multi
+from tornado.locks import Semaphore
+from tornado.log import app_log
+from tornado.httpclient import AsyncHTTPClient, HTTPRequest
+from tornado.ioloop import IOLoop, PeriodicCallback
+from tornado.options import define, options, parse_command_line
+
+
+def parse_date(date_string):
+    """Turn a timestamp string into a timezone-aware datetime
+
+    Timestamps that carry no timezone information are taken to be UTC,
+    so the returned datetime always has a tzinfo.
+    """
+    parsed = dateutil.parser.parse(date_string)
+    if parsed.tzinfo:
+        return parsed
+    # naive timestamp: assume it is UTC
+    return parsed.replace(tzinfo=timezone.utc)
+
+
+def format_td(td):
+    """
+    Nicely format a timedelta object
+
+    as HH:MM:SS
+
+    None is rendered as "unknown" and strings are returned unchanged.
+    Fractions of a second are dropped. A negative timedelta is rendered
+    as its magnitude with a leading minus sign (e.g. "-00:00:05").
+    """
+    if td is None:
+        return "unknown"
+    if isinstance(td, str):
+        return td
+    total = int(td.total_seconds())
+    # Format the magnitude and add the sign back afterwards: floor division
+    # on a negative number of seconds would give results like "-1:59:59".
+    sign = "-" if total < 0 else ""
+    minutes, seconds = divmod(abs(total), 60)
+    hours, minutes = divmod(minutes, 60)
+    return f"{sign}{hours:02}:{minutes:02}:{seconds:02}"
+
+
+@coroutine
+def cull_idle(url, api_token, inactive_limit, cull_users=False, max_age=0, concurrency=10):
+    """Shutdown idle single-user servers
+
+    If cull_users, inactive *users* will be deleted as well.
+
+    Parameters:
+
+    - url: base URL of the Hub API; '/users' and friends are appended to it
+    - api_token: token sent in the Authorization header of every request
+    - inactive_limit: number of seconds without activity after which a
+      server (or user) is culled
+    - cull_users: also delete users once all of their servers are stopped
+    - max_age: if non-zero, servers (and users) at least this many seconds
+      old are culled even when they are active
+    - concurrency: maximum number of requests in flight at any time;
+      a falsy value disables the limit
+
+    Errors while processing a single user are logged and do not stop the
+    processing of the other users.
+    """
+    auth_header = {
+        'Authorization': 'token %s' % api_token,
+    }
+    req = HTTPRequest(
+        url=url + '/users',
+        headers=auth_header,
+    )
+    now = datetime.now(timezone.utc)
+    client = AsyncHTTPClient()
+
+    if concurrency:
+        semaphore = Semaphore(concurrency)
+        @coroutine
+        def fetch(req):
+            """client.fetch wrapped in a semaphore to limit concurrency"""
+            yield semaphore.acquire()
+            try:
+                return (yield client.fetch(req))
+            finally:
+                yield semaphore.release()
+    else:
+        fetch = client.fetch
+
+    resp = yield fetch(req)
+    users = json.loads(resp.body.decode('utf8', 'replace'))
+
+    @coroutine
+    def handle_server(user, server_name, server):
+        """Handle (maybe) culling a single server
+
+        Returns True if server is now stopped (user removable),
+        False otherwise.
+        """
+        log_name = user['name']
+        if server_name:
+            log_name = '%s/%s' % (user['name'], server_name)
+        if server.get('pending'):
+            app_log.warning(
+                "Not culling server %s with pending %s",
+                log_name, server['pending'])
+            return False
+
+        if server.get('started'):
+            age = now - parse_date(server['started'])
+        else:
+            # started may be undefined on jupyterhub < 0.9
+            age = None
+
+        # check last activity
+        # last_activity can be None in 0.9
+        if server['last_activity']:
+            inactive = now - parse_date(server['last_activity'])
+        else:
+            # no activity yet, use start date
+            # last_activity may be None with jupyterhub 0.9,
+            # which introduces the 'started' field which is never None
+            # for running servers
+            inactive = age
+
+        should_cull = (inactive is not None and
+                       inactive.total_seconds() >= inactive_limit)
+        if should_cull:
+            app_log.info(
+                "Culling server %s (inactive for %s)",
+                log_name, format_td(inactive))
+
+        if max_age and not should_cull:
+            # only check started if max_age is specified
+            # so that we can still be compatible with jupyterhub 0.8
+            # which doesn't define the 'started' field
+            if age is not None and age.total_seconds() >= max_age:
+                app_log.info(
+                    "Culling server %s (age: %s, inactive for %s)",
+                    log_name, format_td(age), format_td(inactive))
+                should_cull = True
+
+        if not should_cull:
+            app_log.debug(
+                "Not culling server %s (age: %s, inactive for %s)",
+                log_name, format_td(age), format_td(inactive))
+            return False
+
+        if server_name:
+            # a named server has its own endpoint; '/users/<name>/server'
+            # would stop the user's default server instead of this one
+            delete_url = url + '/users/%s/servers/%s' % (
+                quote(user['name']), quote(server_name))
+        else:
+            delete_url = url + '/users/%s/server' % quote(user['name'])
+
+        req = HTTPRequest(
+            url=delete_url,
+            method='DELETE',
+            headers=auth_header,
+        )
+        resp = yield fetch(req)
+        if resp.code == 202:
+            app_log.warning(
+                "Server %s is slow to stop",
+                log_name,
+            )
+            # return False to prevent culling user with pending shutdowns
+            return False
+        return True
+
+    @coroutine
+    def handle_user(user):
+        """Handle one user.
+
+        Create a list of their servers, and async exec them.  Wait for
+        that to be done, and if all servers are stopped, possibly cull
+        the user.
+
+        Returns True if the user was deleted, False if the user was kept,
+        and None when user culling is disabled.
+        """
+        # shutdown servers first.
+        # Hub doesn't allow deleting users with running servers.
+        # named servers contain the 'servers' dict
+        if 'servers' in user:
+            servers = user['servers']
+        # Otherwise, server data is intermingled in with the user
+        # model
+        else:
+            servers = {}
+            if user['server']:
+                servers[''] = {
+                    'started': user.get('started'),
+                    'last_activity': user['last_activity'],
+                    'pending': user['pending'],
+                    'url': user['server'],
+                }
+        server_futures = [
+            handle_server(user, server_name, server)
+            for server_name, server in servers.items()
+        ]
+        results = yield multi(server_futures)
+        if not cull_users:
+            return
+        # some servers are still running, cannot cull users
+        still_alive = len(results) - sum(results)
+        if still_alive:
+            app_log.debug(
+                "Not culling user %s with %i servers still alive",
+                user['name'], still_alive)
+            return False
+
+        if user.get('created'):
+            age = now - parse_date(user['created'])
+        else:
+            # created may be undefined on jupyterhub < 0.9
+            age = None
+
+        # check last activity
+        # last_activity can be None in 0.9
+        if user['last_activity']:
+            inactive = now - parse_date(user['last_activity'])
+        else:
+            # no activity yet, use start date
+            # last_activity may be None with jupyterhub 0.9,
+            # which introduces the 'created' field which is never None
+            inactive = age
+
+        should_cull = (inactive is not None and
+                       inactive.total_seconds() >= inactive_limit)
+        if should_cull:
+            app_log.info(
+                "Culling user %s (inactive for %s)",
+                user['name'], format_td(inactive))
+
+        if max_age and not should_cull:
+            # only check created if max_age is specified
+            # so that we can still be compatible with jupyterhub 0.8
+            # which doesn't define the 'created' field
+            if age is not None and age.total_seconds() >= max_age:
+                app_log.info(
+                    "Culling user %s (age: %s, inactive for %s)",
+                    user['name'], format_td(age), format_td(inactive))
+                should_cull = True
+
+        if not should_cull:
+            app_log.debug(
+                "Not culling user %s (created: %s, last active: %s)",
+                user['name'], format_td(age), format_td(inactive))
+            return False
+
+        req = HTTPRequest(
+            # quote the name, as is done for the server endpoints
+            url=url + '/users/%s' % quote(user['name']),
+            method='DELETE',
+            headers=auth_header,
+        )
+        yield fetch(req)
+        return True
+
+    # start processing every user right away, then collect the results one by one
+    futures = [(user['name'], handle_user(user)) for user in users]
+
+    for (name, f) in futures:
+        try:
+            result = yield f
+        except Exception:
+            app_log.exception("Error processing %s", name)
+        else:
+            if result:
+                app_log.debug("Finished culling %s", name)
+
+# Script entry point: define the command line options, then run cull_idle
+# once immediately and afterwards every `cull_every` seconds until interrupted.
+if __name__ == '__main__':
+    define(
+        'url',
+        default=os.environ.get('JUPYTERHUB_API_URL'),
+        help="The JupyterHub API URL",
+    )
+    define('timeout', type=int, default=600, help="The idle timeout (in seconds)")
+    define('cull_every', type=int, default=0,
+           help="The interval (in seconds) for checking for idle servers to cull")
+    define('max_age', type=int, default=0,
+           help="The maximum age (in seconds) of servers that should be culled even if they are active")
+    define('cull_users', type=bool, default=False,
+           help="""Cull users in addition to servers.
+                This is for use in temporary-user cases such as tmpnb.""",
+           )
+    define('concurrency', type=int, default=10,
+           help="""Limit the number of concurrent requests made to the Hub.
+
+                Deleting a lot of users at the same time can slow down the Hub,
+                so limit the number of API requests we have outstanding at any given time.
+                """
+           )
+
+    parse_command_line()
+    # cull_every defaults to 0, which means: check twice per timeout period
+    if not options.cull_every:
+        options.cull_every = options.timeout // 2
+    # raises KeyError if JUPYTERHUB_API_TOKEN is not set in the environment
+    api_token = os.environ['JUPYTERHUB_API_TOKEN']
+
+    # prefer the curl-based HTTP client; keep the default client if pycurl is missing
+    try:
+        AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient")
+    except ImportError as e:
+        app_log.warning(
+            "Could not load pycurl: %s\n"
+            "pycurl is recommended if you have a large number of users.",
+            e)
+
+    loop = IOLoop.current()
+    # bind all options once so the same callable serves both schedules below
+    cull = partial(
+        cull_idle,
+        url=options.url,
+        api_token=api_token,
+        inactive_limit=options.timeout,
+        cull_users=options.cull_users,
+        max_age=options.max_age,
+        concurrency=options.concurrency,
+    )
+    # schedule first cull immediately
+    # because PeriodicCallback doesn't start until the end of the first interval
+    loop.add_callback(cull)
+    # schedule periodic cull
+    # (cull_every is in seconds, hence the conversion factor of 1e3)
+    pc = PeriodicCallback(cull, 1e3 * options.cull_every)
+    pc.start()
+    try:
+        loop.start()
+    except KeyboardInterrupt:
+        # Ctrl-C stops the loop without printing a traceback
+        pass