diff --git a/docs/index.rst b/docs/index.rst
index 2bb1afc..a5058c5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -124,6 +124,7 @@ Topic guides provide in-depth explanations of specific topics.
    topic/tljh-config
    topic/authenticator-configuration
    topic/escape-hatch
+   topic/idle-culler
 
 Troubleshooting
diff --git a/docs/topic/idle-culler.rst b/docs/topic/idle-culler.rst
new file mode 100644
index 0000000..530c171
--- /dev/null
+++ b/docs/topic/idle-culler.rst
@@ -0,0 +1,114 @@
+.. _topic/idle-culler:
+
+=============================
+Culling idle notebook servers
+=============================
+
+The idle culler automatically shuts down user notebook servers that have not
+been used for a certain period of time, in order to reduce the total resource
+usage on your JupyterHub.
+
+
+JupyterHub pings the user's notebook server at regular intervals. If no
+activity is reported by the server during these checks and the idle timeout
+expires, the server is considered *inactive (idle)* and will be culled.
+
+
+Default settings
+================
+
+By default, JupyterHub checks the user notebook servers every 60 seconds.
+Any server found to have been idle for more than 10 minutes will be culled.
+
+.. code-block:: python
+
+   services.cull.every = 60
+   services.cull.timeout = 600
+
+Because no maximum age is set for servers, an active server will not be shut
+down, regardless of how long it has been up and running.
+
+.. code-block:: python
+
+   services.cull.max_age = 0
+
+By default, users are not culled along with their notebook servers: a user
+left with no active server after culling continues to exist.
+
+.. code-block:: python
+
+   services.cull.users = False
+
+
+Configuring the idle culler
+===========================
+
+The available configuration options are:
+
+Idle timeout
+------------
+The idle timeout is the maximum time (in seconds) a server may remain inactive
+before it is culled. It can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.timeout <idle-time-in-seconds>
+   sudo tljh-config reload
+
+Idle check interval
+-------------------
+The idle check interval controls how frequently (in seconds) the Hub checks
+for idle servers to cull. It can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.every <interval-in-seconds>
+   sudo tljh-config reload
+
+Maximum age
+-----------
+The maximum age is the longest time (in seconds) a server is allowed to run.
+Servers that exceed the maximum age are culled even if they are active.
+A maximum age of 0 disables this check.
+The maximum age can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.max_age <max-age-in-seconds>
+   sudo tljh-config reload
+
+User culling
+------------
+In addition to servers, it is also possible to cull users. This is mostly
+useful for temporary-user deployments such as *tmpnb*.
+User culling can be activated using the following command:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.users True
+   sudo tljh-config reload
+
+Concurrency
+-----------
+Deleting many users at the same time can slow down the Hub, so the culler
+limits the number of concurrent requests it makes to the Hub. This limit can
+be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.concurrency <max-concurrent-requests>
+   sudo tljh-config reload
+
+Because TLJH is meant for a small number of users, you should rarely need to
+change the concurrency limit.
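+
+Putting the options above together, a worked example (the values here are
+illustrative, not recommendations) configures the culler to check for idle
+servers every 30 seconds and to shut down servers that have been idle for 30
+minutes:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.every 30
+   sudo tljh-config set services.cull.timeout 1800
+   sudo tljh-config reload
+
+``tljh-config`` persists these settings in TLJH's ``config.yaml``, so they
+are re-applied every time the Hub restarts.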
+
+
+Disabling the idle culler
+=========================
+
+The idle culling service is enabled by default. To disable it, use the
+following command:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.enabled False
+   sudo tljh-config reload
diff --git a/integration-tests/test_hub.py b/integration-tests/test_hub.py
index c598667..550e1ab 100644
--- a/integration-tests/test_hub.py
+++ b/integration-tests/test_hub.py
@@ -1,6 +1,7 @@
 import requests
 from hubtraf.user import User
 from hubtraf.auth.dummy import login_dummy
+from jupyterhub.utils import exponential_backoff
 import secrets
 import pytest
 from functools import partial
@@ -137,4 +138,99 @@ async def test_long_username():
             '-u', 'jupyterhub',
             '--no-pager'
         ])
-        raise
\ No newline at end of file
+        raise
+
+
+@pytest.mark.asyncio
+async def test_idle_server_culled():
+    """
+    User logs in, starts a server & stays idle for 1 min.
+    (the user's server should be culled during this period)
+    """
+    # This *must* be localhost, not an IP
+    # aiohttp throws away cookies if we are connecting to an IP!
+    hub_url = 'http://localhost'
+    username = secrets.token_hex(8)
+
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'auth.type', 'dummyauthenticator.DummyAuthenticator')).wait()
+    # Check every 10s for idle servers to cull
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.every', "10")).wait()
+    # Apart from servers, also cull users
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.users', "True")).wait()
+    # Cull servers and users after 60s, even if they are active (max_age)
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.max_age', "60")).wait()
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'reload')).wait()
+
+    async with User(username, hub_url, partial(login_dummy, password='')) as u:
+        await u.login()
+        # Start user's server
+        await u.ensure_server()
+        # Assert that the user exists
+        assert pwd.getpwnam(f'jupyter-{username}') is not None
+
+        # Check that we can get to the user's server
+        r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+                headers={'Referer': str(u.hub_url / 'hub/')})
+        assert r.status == 200
+
+        async def _check_culling_done():
+            # Return True once the user and server have been culled
+            # and are no longer reachable
+            r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+                    headers={'Referer': str(u.hub_url / 'hub/')})
+            print(r.status)
+            return r.status == 403
+
+        await exponential_backoff(
+            _check_culling_done,
+            "Server culling failed!",
+            timeout=100,
+        )
+
+
+@pytest.mark.asyncio
+async def test_active_server_not_culled():
+    """
+    User logs in, starts a server & stays idle for 30s
+    (the user's server should not be culled during this period).
+    """
+    # This *must* be localhost, not an IP
+    # aiohttp throws away cookies if we are connecting to an IP!
+    hub_url = 'http://localhost'
+    username = secrets.token_hex(8)
+
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'auth.type', 'dummyauthenticator.DummyAuthenticator')).wait()
+    # Check every 10s for idle servers to cull
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.every', "10")).wait()
+    # Apart from servers, also cull users
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.users', "True")).wait()
+    # Cull servers and users after 60s, even if they are active (max_age)
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.max_age', "60")).wait()
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'reload')).wait()
+
+    async with User(username, hub_url, partial(login_dummy, password='')) as u:
+        await u.login()
+        # Start user's server
+        await u.ensure_server()
+        # Assert that the user exists
+        assert pwd.getpwnam(f'jupyter-{username}') is not None
+
+        # Check that we can get to the user's server
+        r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+                headers={'Referer': str(u.hub_url / 'hub/')})
+        assert r.status == 200
+
+        async def _check_culling_done():
+            # Return True if the user's server is no longer reachable,
+            # i.e. it has been culled
+            r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+                    headers={'Referer': str(u.hub_url / 'hub/')})
+            print(r.status)
+            return r.status != 200
+
+        try:
+            await exponential_backoff(
+                _check_culling_done,
+                "User's server is still reachable!",
+                timeout=30,
+            )
+        except TimeoutError:
+            # The user's server was not culled within the 30s window,
+            # which is exactly what we expect here.
+            pass
diff --git a/tests/test_configurer.py b/tests/test_configurer.py
index 641e407..a2ef1e2 100644
--- a/tests/test_configurer.py
+++ b/tests/test_configurer.py
@@ -3,6 +3,7 @@ Test configurer
 """
 
 import os
+import sys
 
 from tljh import configurer
 
@@ -187,6 +188,49 @@ def test_set_traefik_api():
     assert c.TraefikTomlProxy.traefik_api_password == '1234'
 
 
+def test_cull_service_default():
+    """
+    Test default cull service settings with no overrides
+    """
+    c = apply_mock_config({})
+
+    cull_cmd = [
+        sys.executable, '/srv/src/tljh/cull_idle_servers.py',
+        '--timeout=600', '--cull-every=60', '--concurrency=5',
+        '--max-age=0'
+    ]
+    assert c.JupyterHub.services == [{
+        'name': 'cull-idle',
+        'admin': True,
+        'command': cull_cmd,
+    }]
+
+
+def test_set_cull_service():
+    """
+    Test setting cull service options
+    """
+    c = apply_mock_config({
+        'services': {
+            'cull': {
+                'every': 10,
+                'users': True,
+                'max_age': 60
+            }
+        }
+    })
+    cull_cmd = [
+        sys.executable, '/srv/src/tljh/cull_idle_servers.py',
+        '--timeout=600', '--cull-every=10', '--concurrency=5',
+        '--max-age=60', '--cull-users'
+    ]
+    assert c.JupyterHub.services == [{
+        'name': 'cull-idle',
+        'admin': True,
+        'command': cull_cmd,
+    }]
+
+
 def test_load_secrets(tljh_dir):
     """
     Test loading secret files
diff --git a/tljh/configurer.py b/tljh/configurer.py
index e909ac7..2e549db 100644
--- a/tljh/configurer.py
+++ b/tljh/configurer.py
@@ -9,6 +9,7 @@ FIXME: A strong feeling that JSON Schema should be involved somehow.
""" import os +import sys from .config import CONFIG_FILE, STATE_DIR from .yaml import yaml @@ -55,6 +56,16 @@ default = { 'user_environment': { 'default_app': 'classic', }, + 'services': { + 'cull': { + 'enabled': True, + 'timeout': 600, + 'every': 60, + 'concurrency': 5, + 'users': False, + 'max_age': 0 + } + } } def load_config(config_file=CONFIG_FILE): @@ -86,6 +97,7 @@ def apply_config(config_overrides, c): update_user_environment(c, tljh_config) update_user_account_config(c, tljh_config) update_traefik_api(c, tljh_config) + update_services(c, tljh_config) def set_if_not_none(parent, key, value): @@ -191,6 +203,38 @@ def update_traefik_api(c, config): c.TraefikTomlProxy.traefik_api_password = config['traefik_api']['password'] +def set_cull_idle_service(config): + """ + Set Idle Culler service + """ + cull_cmd = [ + sys.executable, '/srv/src/tljh/cull_idle_servers.py' + ] + cull_config = config['services']['cull'] + print() + + cull_cmd += ['--timeout=%d' % cull_config['timeout']] + cull_cmd += ['--cull-every=%d' % cull_config['every']] + cull_cmd += ['--concurrency=%d' % cull_config['concurrency']] + cull_cmd += ['--max-age=%d' % cull_config['max_age']] + if cull_config['users']: + cull_cmd += ['--cull-users'] + + cull_service = { + 'name': 'cull-idle', + 'admin': True, + 'command': cull_cmd, + } + + return cull_service + + +def update_services(c, config): + c.JupyterHub.services = [] + if config['services']['cull']['enabled']: + c.JupyterHub.services.append(set_cull_idle_service(config)) + + def _merge_dictionaries(a, b, path=None, update=True): """ Merge two dictionaries recursively. diff --git a/tljh/cull_idle_servers.py b/tljh/cull_idle_servers.py new file mode 100644 index 0000000..32524dd --- /dev/null +++ b/tljh/cull_idle_servers.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +"""script to monitor and cull idle single-user servers + +Imported from https://github.com/jupyterhub/jupyterhub/blob/6b1046697/examples/cull-idle/cull_idle_servers.py + +Caveats: + +last_activity is not updated with high frequency, +so cull timeout should be greater than the sum of: + +- single-user websocket ping interval (default: 30s) +- JupyterHub.last_activity_interval (default: 5 minutes) + +You can run this as a service managed by JupyterHub with this in your config:: + + + c.JupyterHub.services = [ + { + 'name': 'cull-idle', + 'admin': True, + 'command': 'python cull_idle_servers.py --timeout=3600'.split(), + } + ] + +Or run it manually by generating an API token and storing it in `JUPYTERHUB_API_TOKEN`: + + export JUPYTERHUB_API_TOKEN=`jupyterhub token` + python cull_idle_servers.py [--timeout=900] [--url=http://127.0.0.1:8081/hub/api] +""" + +from datetime import datetime, timezone +from functools import partial +import json +import os + +try: + from urllib.parse import quote +except ImportError: + from urllib import quote + +import dateutil.parser + +from tornado.gen import coroutine, multi +from tornado.locks import Semaphore +from tornado.log import app_log +from tornado.httpclient import AsyncHTTPClient, HTTPRequest +from tornado.ioloop import IOLoop, PeriodicCallback +from tornado.options import define, options, parse_command_line + + +def parse_date(date_string): + """Parse a timestamp + + If it doesn't have a timezone, assume utc + + Returned datetime object will always be timezone-aware + """ + dt = dateutil.parser.parse(date_string) + if not dt.tzinfo: + # assume naïve timestamps are UTC + dt = dt.replace(tzinfo=timezone.utc) + return dt + + +def format_td(td): + """ + Nicely format a 
timedelta object + + as HH:MM:SS + """ + if td is None: + return "unknown" + if isinstance(td, str): + return td + seconds = int(td.total_seconds()) + h = seconds // 3600 + seconds = seconds % 3600 + m = seconds // 60 + seconds = seconds % 60 + return f"{h:02}:{m:02}:{seconds:02}" + + +@coroutine +def cull_idle(url, api_token, inactive_limit, cull_users=False, max_age=0, concurrency=10): + """Shutdown idle single-user servers + + If cull_users, inactive *users* will be deleted as well. + """ + auth_header = { + 'Authorization': 'token %s' % api_token, + } + req = HTTPRequest( + url=url + '/users', + headers=auth_header, + ) + now = datetime.now(timezone.utc) + client = AsyncHTTPClient() + + if concurrency: + semaphore = Semaphore(concurrency) + @coroutine + def fetch(req): + """client.fetch wrapped in a semaphore to limit concurrency""" + yield semaphore.acquire() + try: + return (yield client.fetch(req)) + finally: + yield semaphore.release() + else: + fetch = client.fetch + + resp = yield fetch(req) + users = json.loads(resp.body.decode('utf8', 'replace')) + futures = [] + + @coroutine + def handle_server(user, server_name, server): + """Handle (maybe) culling a single server + + Returns True if server is now stopped (user removable), + False otherwise. + """ + log_name = user['name'] + if server_name: + log_name = '%s/%s' % (user['name'], server_name) + if server.get('pending'): + app_log.warning( + "Not culling server %s with pending %s", + log_name, server['pending']) + return False + + if server.get('started'): + age = now - parse_date(server['started']) + else: + # started may be undefined on jupyterhub < 0.9 + age = None + + # check last activity + # last_activity can be None in 0.9 + if server['last_activity']: + inactive = now - parse_date(server['last_activity']) + else: + # no activity yet, use start date + # last_activity may be None with jupyterhub 0.9, + # which introduces the 'started' field which is never None + # for running servers + inactive = age + + should_cull = (inactive is not None and + inactive.total_seconds() >= inactive_limit) + if should_cull: + app_log.info( + "Culling server %s (inactive for %s)", + log_name, format_td(inactive)) + + if max_age and not should_cull: + # only check started if max_age is specified + # so that we can still be compatible with jupyterhub 0.8 + # which doesn't define the 'started' field + if age is not None and age.total_seconds() >= max_age: + app_log.info( + "Culling server %s (age: %s, inactive for %s)", + log_name, format_td(age), format_td(inactive)) + should_cull = True + + if not should_cull: + app_log.debug( + "Not culling server %s (age: %s, inactive for %s)", + log_name, format_td(age), format_td(inactive)) + return False + + req = HTTPRequest( + url=url + '/users/%s/server' % quote(user['name']), + method='DELETE', + headers=auth_header, + ) + resp = yield fetch(req) + if resp.code == 202: + app_log.warning( + "Server %s is slow to stop", + log_name, + ) + # return False to prevent culling user with pending shutdowns + return False + return True + + @coroutine + def handle_user(user): + """Handle one user. + + Create a list of their servers, and async exec them. Wait for + that to be done, and if all servers are stopped, possibly cull + the user. + """ + # shutdown servers first. + # Hub doesn't allow deleting users with running servers. 
+ # named servers contain the 'servers' dict + if 'servers' in user: + servers = user['servers'] + # Otherwise, server data is intermingled in with the user + # model + else: + servers = {} + if user['server']: + servers[''] = { + 'started': user.get('started'), + 'last_activity': user['last_activity'], + 'pending': user['pending'], + 'url': user['server'], + } + server_futures = [ + handle_server(user, server_name, server) + for server_name, server in servers.items() + ] + results = yield multi(server_futures) + if not cull_users: + return + # some servers are still running, cannot cull users + still_alive = len(results) - sum(results) + if still_alive: + app_log.debug( + "Not culling user %s with %i servers still alive", + user['name'], still_alive) + return False + + should_cull = False + if user.get('created'): + age = now - parse_date(user['created']) + else: + # created may be undefined on jupyterhub < 0.9 + age = None + + # check last activity + # last_activity can be None in 0.9 + if user['last_activity']: + inactive = now - parse_date(user['last_activity']) + else: + # no activity yet, use start date + # last_activity may be None with jupyterhub 0.9, + # which introduces the 'created' field which is never None + inactive = age + + should_cull = (inactive is not None and + inactive.total_seconds() >= inactive_limit) + if should_cull: + app_log.info( + "Culling user %s (inactive for %s)", + user['name'], inactive) + + if max_age and not should_cull: + # only check created if max_age is specified + # so that we can still be compatible with jupyterhub 0.8 + # which doesn't define the 'started' field + if age is not None and age.total_seconds() >= max_age: + app_log.info( + "Culling user %s (age: %s, inactive for %s)", + user['name'], format_td(age), format_td(inactive)) + should_cull = True + + if not should_cull: + app_log.debug( + "Not culling user %s (created: %s, last active: %s)", + user['name'], format_td(age), format_td(inactive)) + return False + + req = HTTPRequest( + url=url + '/users/%s' % user['name'], + method='DELETE', + headers=auth_header, + ) + yield fetch(req) + return True + + for user in users: + futures.append((user['name'], handle_user(user))) + + for (name, f) in futures: + try: + result = yield f + except Exception: + app_log.exception("Error processing %s", name) + else: + if result: + app_log.debug("Finished culling %s", name) + + +if __name__ == '__main__': + define( + 'url', + default=os.environ.get('JUPYTERHUB_API_URL'), + help="The JupyterHub API URL", + ) + define('timeout', type=int, default=600, help="The idle timeout (in seconds)") + define('cull_every', type=int, default=0, + help="The interval (in seconds) for checking for idle servers to cull") + define('max_age', type=int, default=0, + help="The maximum age (in seconds) of servers that should be culled even if they are active") + define('cull_users', type=bool, default=False, + help="""Cull users in addition to servers. + This is for use in temporary-user cases such as tmpnb.""", + ) + define('concurrency', type=int, default=10, + help="""Limit the number of concurrent requests made to the Hub. + + Deleting a lot of users at the same time can slow down the Hub, + so limit the number of API requests we have outstanding at any given time. 
+ """ + ) + + parse_command_line() + if not options.cull_every: + options.cull_every = options.timeout // 2 + api_token = os.environ['JUPYTERHUB_API_TOKEN'] + + try: + AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient") + except ImportError as e: + app_log.warning( + "Could not load pycurl: %s\n" + "pycurl is recommended if you have a large number of users.", + e) + + loop = IOLoop.current() + cull = partial( + cull_idle, + url=options.url, + api_token=api_token, + inactive_limit=options.timeout, + cull_users=options.cull_users, + max_age=options.max_age, + concurrency=options.concurrency, + ) + # schedule first cull immediately + # because PeriodicCallback doesn't start until the end of the first interval + loop.add_callback(cull) + # schedule periodic cull + pc = PeriodicCallback(cull, 1e3 * options.cull_every) + pc.start() + try: + loop.start() + except KeyboardInterrupt: + pass