From efe9853b8542c3e769b8ac3703b1f0dd1030d9b2 Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Thu, 6 Jun 2019 16:52:04 +0300 Subject: [PATCH 1/8] Added Idle Culler --- tljh/configurer.py | 47 ++++++ tljh/cull_idle_servers.py | 342 ++++++++++++++++++++++++++++++++++++++ tljh/jupyterhub_config.py | 2 + 3 files changed, 391 insertions(+) create mode 100644 tljh/cull_idle_servers.py diff --git a/tljh/configurer.py b/tljh/configurer.py index e909ac7..8783b43 100644 --- a/tljh/configurer.py +++ b/tljh/configurer.py @@ -9,6 +9,7 @@ FIXME: A strong feeling that JSON Schema should be involved somehow. """ import os +import sys from .config import CONFIG_FILE, STATE_DIR from .yaml import yaml @@ -55,6 +56,16 @@ default = { 'user_environment': { 'default_app': 'classic', }, + 'services': { + 'cull': { + 'enabled': True, + 'timeout': 600, + 'every': 60, + 'concurrency': 5, + 'users': False, + 'max_age': 0 + } + } } def load_config(config_file=CONFIG_FILE): @@ -86,6 +97,7 @@ def apply_config(config_overrides, c): update_user_environment(c, tljh_config) update_user_account_config(c, tljh_config) update_traefik_api(c, tljh_config) + update_services(c, tljh_config) def set_if_not_none(parent, key, value): @@ -191,6 +203,41 @@ def update_traefik_api(c, config): c.TraefikTomlProxy.traefik_api_password = config['traefik_api']['password'] +def set_cull_idle_service(c, config): + """ + Set Idle Culler service + """ + cull_cmd = [ + sys.executable, '/srv/src/tljh/cull_idle_servers.py' + ] + if config['services']['cull']['timeout']: + cull_cmd.append('--timeout=%s' % config['services']['cull']['timeout']) + + if config['services']['cull']['every']: + cull_cmd.append('--cull-every=%s' % config['services']['cull']['every']) + + if config['services']['cull']['concurrency']: + cull_cmd.append('--concurrency=%s' % config['services']['cull']['concurrency']) + + if config['services']['cull']['users']: + cull_cmd.append('--cull-users') + + if config['services']['cull']['max_age']: + cull_cmd.append('--max-age=%s' % config['services']['cull']['max_age']) + + cull_service = { + 'name': 'cull-idle', + 'admin': True, + 'command': cull_cmd, + } + + return cull_service + + +def update_services(c, config): + c.JupyterHub.services.append(set_cull_idle_service(c, config)) + + def _merge_dictionaries(a, b, path=None, update=True): """ Merge two dictionaries recursively. 
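With the defaults above, `update_services` registers a single hub-managed service. A sketch of the resulting entry, derived from the command-building logic in this diff (note that `users: False` and `max_age: 0` are falsy, so `--cull-users` and `--max-age` are omitted by the code above):

    import sys

    cull_service = {
        'name': 'cull-idle',
        'admin': True,
        'command': [
            sys.executable, '/srv/src/tljh/cull_idle_servers.py',
            '--timeout=600', '--cull-every=60', '--concurrency=5',
        ],
    }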
diff --git a/tljh/cull_idle_servers.py b/tljh/cull_idle_servers.py new file mode 100644 index 0000000..4f10b6e --- /dev/null +++ b/tljh/cull_idle_servers.py @@ -0,0 +1,342 @@ +#!/usr/bin/env python3 +"""script to monitor and cull idle single-user servers + +Imported from https://github.com/jupyterhub/jupyterhub/blob/6b1046697/examples/cull-idle/cull_idle_servers.py + +Caveats: + +last_activity is not updated with high frequency, +so cull timeout should be greater than the sum of: + +- single-user websocket ping interval (default: 30s) +- JupyterHub.last_activity_interval (default: 5 minutes) + +You can run this as a service managed by JupyterHub with this in your config:: + + + c.JupyterHub.services = [ + { + 'name': 'cull-idle', + 'admin': True, + 'command': 'python cull_idle_servers.py --timeout=3600'.split(), + } + ] + +Or run it manually by generating an API token and storing it in `JUPYTERHUB_API_TOKEN`: + + export JUPYTERHUB_API_TOKEN=`jupyterhub token` + python cull_idle_servers.py [--timeout=900] [--url=http://127.0.0.1:8081/hub/api] +""" + +from datetime import datetime, timezone +from functools import partial +import json +import os + +try: + from urllib.parse import quote +except ImportError: + from urllib import quote + +import dateutil.parser + +from tornado.gen import coroutine, multi +from tornado.locks import Semaphore +from tornado.log import app_log +from tornado.httpclient import AsyncHTTPClient, HTTPRequest +from tornado.ioloop import IOLoop, PeriodicCallback +from tornado.options import define, options, parse_command_line + + +def parse_date(date_string): + """Parse a timestamp + + If it doesn't have a timezone, assume utc + + Returned datetime object will always be timezone-aware + """ + dt = dateutil.parser.parse(date_string) + if not dt.tzinfo: + # assume naïve timestamps are UTC + dt = dt.replace(tzinfo=timezone.utc) + return dt + + +def format_td(td): + """ + Nicely format a timedelta object + + as HH:MM:SS + """ + if td is None: + return "unknown" + if isinstance(td, str): + return td + seconds = int(td.total_seconds()) + h = seconds // 3600 + seconds = seconds % 3600 + m = seconds // 60 + seconds = seconds % 60 + return f"{h:02}:{m:02}:{seconds:02}" + + +@coroutine +def cull_idle(url, api_token, inactive_limit, cull_users=False, max_age=0, concurrency=10): + """Shutdown idle single-user servers + + If cull_users, inactive *users* will be deleted as well. + """ + auth_header = { + 'Authorization': 'token %s' % api_token, + } + req = HTTPRequest( + url=url + '/users', + headers=auth_header, + ) + now = datetime.now(timezone.utc) + client = AsyncHTTPClient() + + if concurrency: + semaphore = Semaphore(concurrency) + @coroutine + def fetch(req): + """client.fetch wrapped in a semaphore to limit concurrency""" + yield semaphore.acquire() + try: + return (yield client.fetch(req)) + finally: + yield semaphore.release() + else: + fetch = client.fetch + + resp = yield fetch(req) + users = json.loads(resp.body.decode('utf8', 'replace')) + futures = [] + + @coroutine + def handle_server(user, server_name, server): + """Handle (maybe) culling a single server + + Returns True if server is now stopped (user removable), + False otherwise. 
+ """ + log_name = user['name'] + if server_name: + log_name = '%s/%s' % (user['name'], server_name) + if server.get('pending'): + app_log.warning( + "Not culling server %s with pending %s", + log_name, server['pending']) + return False + + if server.get('started'): + age = now - parse_date(server['started']) + else: + # started may be undefined on jupyterhub < 0.9 + age = None + + # check last activity + # last_activity can be None in 0.9 + if server['last_activity']: + inactive = now - parse_date(server['last_activity']) + else: + # no activity yet, use start date + # last_activity may be None with jupyterhub 0.9, + # which introduces the 'started' field which is never None + # for running servers + inactive = age + + should_cull = (inactive is not None and + inactive.total_seconds() >= inactive_limit) + if should_cull: + app_log.info( + "Culling server %s (inactive for %s)", + log_name, format_td(inactive)) + + if max_age and not should_cull: + # only check started if max_age is specified + # so that we can still be compatible with jupyterhub 0.8 + # which doesn't define the 'started' field + if age is not None and age.total_seconds() >= max_age: + app_log.info( + "Culling server %s (age: %s, inactive for %s)", + log_name, format_td(age), format_td(inactive)) + should_cull = True + + if not should_cull: + app_log.debug( + "Not culling server %s (age: %s, inactive for %s)", + log_name, format_td(age), format_td(inactive)) + return False + + req = HTTPRequest( + url=url + '/users/%s/server' % quote(user['name']), + method='DELETE', + headers=auth_header, + ) + resp = yield fetch(req) + if resp.code == 202: + app_log.warning( + "Server %s is slow to stop", + log_name, + ) + # return False to prevent culling user with pending shutdowns + return False + return True + + @coroutine + def handle_user(user): + """Handle one user. + + Create a list of their servers, and async exec them. Wait for + that to be done, and if all servers are stopped, possibly cull + the user. + """ + # shutdown servers first. + # Hub doesn't allow deleting users with running servers. 
+ # named servers contain the 'servers' dict + if 'servers' in user: + servers = user['servers'] + # Otherwise, server data is intermingled in with the user + # model + else: + servers = {} + if user['server']: + servers[''] = { + 'started': user.get('started'), + 'last_activity': user['last_activity'], + 'pending': user['pending'], + 'url': user['server'], + } + server_futures = [ + handle_server(user, server_name, server) + for server_name, server in servers.items() + ] + results = yield multi(server_futures) + if not cull_users: + return + # some servers are still running, cannot cull users + still_alive = len(results) - sum(results) + if still_alive: + app_log.debug( + "Not culling user %s with %i servers still alive", + user['name'], still_alive) + return False + + should_cull = False + if user.get('created'): + age = now - parse_date(user['created']) + else: + # created may be undefined on jupyterhub < 0.9 + age = None + + # check last activity + # last_activity can be None in 0.9 + if user['last_activity']: + inactive = now - parse_date(user['last_activity']) + else: + # no activity yet, use start date + # last_activity may be None with jupyterhub 0.9, + # which introduces the 'created' field which is never None + inactive = age + + should_cull = (inactive is not None and + inactive.total_seconds() >= inactive_limit) + if should_cull: + app_log.info( + "Culling user %s (inactive for %s)", + user['name'], inactive) + + if max_age and not should_cull: + # only check created if max_age is specified + # so that we can still be compatible with jupyterhub 0.8 + # which doesn't define the 'started' field + if age is not None and age.total_seconds() >= max_age: + app_log.info( + "Culling user %s (age: %s, inactive for %s)", + user['name'], format_td(age), format_td(inactive)) + should_cull = True + + if not should_cull: + app_log.debug( + "Not culling user %s (created: %s, last active: %s)", + user['name'], format_td(age), format_td(inactive)) + return False + + req = HTTPRequest( + url=url + '/users/%s' % user['name'], + method='DELETE', + headers=auth_header, + ) + yield fetch(req) + return True + + for user in users: + futures.append((user['name'], handle_user(user))) + + for (name, f) in futures: + try: + result = yield f + except Exception: + app_log.exception("Error processing %s", name) + else: + if result: + app_log.debug("Finished culling %s", name) + + +if __name__ == '__main__': + define( + 'url', + default=os.environ.get('JUPYTERHUB_API_URL'), + help="The JupyterHub API URL", + ) + define('timeout', default=600, help="The idle timeout (in seconds)") + define('cull_every', default=0, + help="The interval (in seconds) for checking for idle servers to cull") + define('max_age', default=0, + help="The maximum age (in seconds) of servers that should be culled even if they are active") + define('cull_users', default=False, + help="""Cull users in addition to servers. + This is for use in temporary-user cases such as tmpnb.""", + ) + define('concurrency', default=10, + help="""Limit the number of concurrent requests made to the Hub. + + Deleting a lot of users at the same time can slow down the Hub, + so limit the number of API requests we have outstanding at any given time. 
+ """ + ) + + parse_command_line() + if not options.cull_every: + options.cull_every = options.timeout // 2 + api_token = os.environ['JUPYTERHUB_API_TOKEN'] + + try: + AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient") + except ImportError as e: + app_log.warning( + "Could not load pycurl: %s\n" + "pycurl is recommended if you have a large number of users.", + e) + + loop = IOLoop.current() + cull = partial( + cull_idle, + url=options.url, + api_token=api_token, + inactive_limit=options.timeout, + cull_users=options.cull_users, + max_age=options.max_age, + concurrency=options.concurrency, + ) + # schedule first cull immediately + # because PeriodicCallback doesn't start until the end of the first interval + loop.add_callback(cull) + # schedule periodic cull + pc = PeriodicCallback(cull, 1e3 * options.cull_every) + pc.start() + try: + loop.start() + except KeyboardInterrupt: + pass diff --git a/tljh/jupyterhub_config.py b/tljh/jupyterhub_config.py index 7f11bfa..9525bc7 100644 --- a/tljh/jupyterhub_config.py +++ b/tljh/jupyterhub_config.py @@ -44,6 +44,8 @@ c.JupyterHub.cleanup_servers = False # Use a high port so users can try this on machines with a JupyterHub already present c.JupyterHub.hub_port = 15001 +c.JupyterHub.services = [] + c.TraefikTomlProxy.should_start = False dynamic_conf_file_path = os.path.join(INSTALL_PREFIX, 'state', 'rules.toml') From 0b18b49cbcbce02b422b9ac49e809482fa27cebd Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Thu, 6 Jun 2019 17:42:10 +0300 Subject: [PATCH 2/8] Fix services init --- tljh/configurer.py | 4 +++- tljh/jupyterhub_config.py | 2 -- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tljh/configurer.py b/tljh/configurer.py index 8783b43..245e286 100644 --- a/tljh/configurer.py +++ b/tljh/configurer.py @@ -235,7 +235,9 @@ def set_cull_idle_service(c, config): def update_services(c, config): - c.JupyterHub.services.append(set_cull_idle_service(c, config)) + c.JupyterHub.services = [] + if config['services']['cull']['enabled']: + c.JupyterHub.services.append(set_cull_idle_service(c, config)) def _merge_dictionaries(a, b, path=None, update=True): diff --git a/tljh/jupyterhub_config.py b/tljh/jupyterhub_config.py index 9525bc7..7f11bfa 100644 --- a/tljh/jupyterhub_config.py +++ b/tljh/jupyterhub_config.py @@ -44,8 +44,6 @@ c.JupyterHub.cleanup_servers = False # Use a high port so users can try this on machines with a JupyterHub already present c.JupyterHub.hub_port = 15001 -c.JupyterHub.services = [] - c.TraefikTomlProxy.should_start = False dynamic_conf_file_path = os.path.join(INSTALL_PREFIX, 'state', 'rules.toml') From c178afe4cdff9a27ba33f5f5e26902f8c4b39662 Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Fri, 7 Jun 2019 12:03:17 +0300 Subject: [PATCH 3/8] Idle culler integration test --- integration-tests/test_hub.py | 40 ++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/integration-tests/test_hub.py b/integration-tests/test_hub.py index c598667..e2493ab 100644 --- a/integration-tests/test_hub.py +++ b/integration-tests/test_hub.py @@ -10,6 +10,7 @@ import grp import sys import subprocess from tljh.normalize import generate_system_username +import time # Use sudo to invoke it, since this is how users invoke it. 
@@ -137,4 +138,41 @@ async def test_long_username():
                 '-u', 'jupyterhub', '--no-pager'
             ])
 
-        raise
\ No newline at end of file
+        raise
+
+
+@pytest.mark.asyncio
+async def test_idle_culler():
+    """
+    User logs in, starts a server & stays idle for 1 min
+    """
+    # This *must* be localhost, not an IP
+    # aiohttp throws away cookies if we are connecting to an IP!
+    hub_url = 'http://localhost'
+    username = secrets.token_hex(8)
+
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'auth.type', 'dummyauthenticator.DummyAuthenticator')).wait()
+    # Check every 10s for idle servers to cull
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.every', "10")).wait()
+    # Apart from servers, also cull users
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.users', "True")).wait()
+    # Cull servers and users after 60s, regardless of activity
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.max_age', "60")).wait()
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'reload')).wait()
+
+    async with User(username, hub_url, partial(login_dummy, password='')) as u:
+        await u.login()
+        # Start user's server
+        await u.ensure_server()
+        # Assert that the user exists
+        assert pwd.getpwnam(f'jupyter-{username}') is not None
+
+        # Check that we can get to the user's server
+        r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+                                headers={'Referer': str(u.hub_url / 'hub/')})
+        assert r.status == 200
+        time.sleep(60)
+        # Check that after 60s, the user and server have been culled and are not reachable anymore
+        r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+                                headers={'Referer': str(u.hub_url / 'hub/')})
+        assert r.status == 403

From 5ec046716a4ea8c3d7a1295c3872c06fc305fe98 Mon Sep 17 00:00:00 2001
From: GeorgianaElena
Date: Wed, 12 Jun 2019 17:03:39 +0300
Subject: [PATCH 4/8] More idle culler tests

---
 integration-tests/test_hub.py | 70 ++++++++++++++++++++++++++++++++---
 tests/test_configurer.py      | 47 +++++++++++++++++++++++
 2 files changed, 111 insertions(+), 6 deletions(-)

diff --git a/integration-tests/test_hub.py b/integration-tests/test_hub.py
index e2493ab..550e1ab 100644
--- a/integration-tests/test_hub.py
+++ b/integration-tests/test_hub.py
@@ -1,6 +1,7 @@
 import requests
 from hubtraf.user import User
 from hubtraf.auth.dummy import login_dummy
+from jupyterhub.utils import exponential_backoff
 import secrets
 import pytest
 from functools import partial
@@ -10,7 +11,6 @@ import grp
 import sys
 import subprocess
 from tljh.normalize import generate_system_username
-import time
 
 # Use sudo to invoke it, since this is how users invoke it.
@@ -142,9 +142,10 @@
 
 
 @pytest.mark.asyncio
-async def test_idle_culler():
+async def test_idle_server_culled():
     """
-    User logs in, starts a server & stays idle for 1 min
+    User logs in, starts a server & stays idle for 1 min.
+    (the user's server should be culled during this period)
     """
     # This *must* be localhost, not an IP
    # aiohttp throws away cookies if we are connecting to an IP!
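The hunk below swaps the fixed `time.sleep(60)` for polling with `exponential_backoff` from `jupyterhub.utils`, which retries a coroutine with increasing delays instead of blocking the event loop. A standalone sketch of the pattern (the condition and timings here are illustrative, not taken from the test):

    import asyncio
    from jupyterhub.utils import exponential_backoff

    async def main():
        loop = asyncio.get_event_loop()
        deadline = loop.time() + 3

        async def condition():
            # stand-in for "has the user's server been culled yet?"
            return loop.time() > deadline

        # Retries `condition` with exponentially increasing waits and raises
        # TimeoutError if it never returns True within `timeout` seconds.
        await exponential_backoff(condition, "condition never became true", timeout=10)

    asyncio.run(main())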
@@ -171,8 +172,65 @@
         r = await u.session.get(u.hub_url / 'hub/api/users' / username,
                                 headers={'Referer': str(u.hub_url / 'hub/')})
         assert r.status == 200
-        time.sleep(60)
-        # Check that after 60s, the user and server have been culled and are not reachable anymore
+
+        async def _check_culling_done():
+            # Check that after 60s, the user and server have been culled and are not reachable anymore
+            r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+                                    headers={'Referer': str(u.hub_url / 'hub/')})
+            print(r.status)
+            return r.status == 403
+
+        await exponential_backoff(
+            _check_culling_done,
+            "Server culling failed!",
+            timeout=100,
+        )
+
+@pytest.mark.asyncio
+async def test_active_server_not_culled():
+    """
+    User logs in, starts a server & waits 30s, less than the 60s max age
+    (the user's server should not be culled during this period).
+    """
+    # This *must* be localhost, not an IP
+    # aiohttp throws away cookies if we are connecting to an IP!
+    hub_url = 'http://localhost'
+    username = secrets.token_hex(8)
+
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'auth.type', 'dummyauthenticator.DummyAuthenticator')).wait()
+    # Check every 10s for idle servers to cull
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.every', "10")).wait()
+    # Apart from servers, also cull users
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.users', "True")).wait()
+    # Cull servers and users after 60s, regardless of activity
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'set', 'services.cull.max_age', "60")).wait()
+    assert 0 == await (await asyncio.create_subprocess_exec(*TLJH_CONFIG_PATH, 'reload')).wait()
+
+    async with User(username, hub_url, partial(login_dummy, password='')) as u:
+        await u.login()
+        # Start user's server
+        await u.ensure_server()
+        # Assert that the user exists
+        assert pwd.getpwnam(f'jupyter-{username}') is not None
+
+        # Check that we can get to the user's server
         r = await u.session.get(u.hub_url / 'hub/api/users' / username,
                                 headers={'Referer': str(u.hub_url / 'hub/')})
-        assert r.status == 403
+        assert r.status == 200
+
+        async def _check_culling_done():
+            # Returns True only once the user's server is no longer reachable
+            r = await u.session.get(u.hub_url / 'hub/api/users' / username,
+                                    headers={'Referer': str(u.hub_url / 'hub/')})
+            print(r.status)
+            return r.status != 200
+
+        try:
+            await exponential_backoff(
+                _check_culling_done,
+                "User's server is still reachable!",
+                timeout=30,
+            )
+        except TimeoutError:
+            # During the 30s timeout the user's server wasn't culled, which is what we intended.
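+            # exponential_backoff raising TimeoutError means _check_culling_done
+            # never returned True within the 30s window, i.e. the server stayed
+            # reachable, so reaching this branch is the test's success path.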
+            pass

diff --git a/tests/test_configurer.py b/tests/test_configurer.py
index 641e407..cfd8d3c 100644
--- a/tests/test_configurer.py
+++ b/tests/test_configurer.py
@@ -3,6 +3,7 @@ Test configurer
 """
 
 import os
+import sys
 
 from tljh import configurer
 
@@ -187,6 +188,52 @@ def test_set_traefik_api():
     assert c.TraefikTomlProxy.traefik_api_password == '1234'
 
 
+def test_cull_service_default():
+    """
+    Test default cull service settings with no overrides
+    """
+    c = apply_mock_config({})
+
+    cull_cmd = [
+        sys.executable, '/srv/src/tljh/cull_idle_servers.py',
+        '--timeout', '600', '--cull-every', '60', '--concurrency', '5',
+        '--max-age', '0'
+    ]
+    assert c.JupyterHub.services == [{
+        'name': 'cull-idle',
+        'admin': True,
+        'command': cull_cmd,
+    }]
+    assert c.TraefikTomlProxy.traefik_api_username == 'api_admin'
+
+
+def test_set_cull_service():
+    """
+    Test setting cull service options
+    """
+    c = apply_mock_config({
+        'services': {
+            'cull': {
+                'every': 10,
+                'users': True,
+                'max_age': 60
+            }
+        }
+    })
+    cull_cmd = [
+        sys.executable, '/srv/src/tljh/cull_idle_servers.py',
+        '--timeout', '600', '--cull-every', '10', '--concurrency', '5',
+        '--max-age', '60', '--cull-users'
+    ]
+    assert c.JupyterHub.services == [{
+        'name': 'cull-idle',
+        'admin': True,
+        'command': cull_cmd,
+    }]
+    assert c.TraefikTomlProxy.traefik_api_username == 'api_admin'
+
+
+
 def test_load_secrets(tljh_dir):
     """
     Test loading secret files

From 20374db7c6ce2934ce700873b9d1225929e29456 Mon Sep 17 00:00:00 2001
From: GeorgianaElena
Date: Wed, 12 Jun 2019 17:05:01 +0300
Subject: [PATCH 5/8] Enforce the type of idle culler options

---
 tljh/configurer.py        | 25 ++++++++++---------------
 tljh/cull_idle_servers.py | 10 +++++-----
 2 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/tljh/configurer.py b/tljh/configurer.py
index 245e286..2e549db 100644
--- a/tljh/configurer.py
+++ b/tljh/configurer.py
@@ -203,27 +203,22 @@ def update_traefik_api(c, config):
     c.TraefikTomlProxy.traefik_api_password = config['traefik_api']['password']
 
 
-def set_cull_idle_service(c, config):
+def set_cull_idle_service(config):
     """
     Set Idle Culler service
     """
     cull_cmd = [
         sys.executable, '/srv/src/tljh/cull_idle_servers.py'
     ]
-    if config['services']['cull']['timeout']:
-        cull_cmd.append('--timeout=%s' % config['services']['cull']['timeout'])
+    cull_config = config['services']['cull']
 
-    if config['services']['cull']['every']:
-        cull_cmd.append('--cull-every=%s' % config['services']['cull']['every'])
-
-    if config['services']['cull']['concurrency']:
-        cull_cmd.append('--concurrency=%s' % config['services']['cull']['concurrency'])
-
-    if config['services']['cull']['users']:
-        cull_cmd.append('--cull-users')
-
-    if config['services']['cull']['max_age']:
-        cull_cmd.append('--max-age=%s' % config['services']['cull']['max_age'])
+    cull_cmd += ['--timeout=%d' % cull_config['timeout']]
+    cull_cmd += ['--cull-every=%d' % cull_config['every']]
+    cull_cmd += ['--concurrency=%d' % cull_config['concurrency']]
+    cull_cmd += ['--max-age=%d' % cull_config['max_age']]
+    if cull_config['users']:
+        cull_cmd += ['--cull-users']
 
     cull_service = {
         'name': 'cull-idle',
@@ -237,7 +232,7 @@
 def update_services(c, config):
     c.JupyterHub.services = []
     if config['services']['cull']['enabled']:
-        c.JupyterHub.services.append(set_cull_idle_service(c, config))
+        c.JupyterHub.services.append(set_cull_idle_service(config))
 
 
 def _merge_dictionaries(a, b, path=None, update=True):
diff --git a/tljh/cull_idle_servers.py
b/tljh/cull_idle_servers.py index 4f10b6e..32524dd 100644 --- a/tljh/cull_idle_servers.py +++ b/tljh/cull_idle_servers.py @@ -290,16 +290,16 @@ if __name__ == '__main__': default=os.environ.get('JUPYTERHUB_API_URL'), help="The JupyterHub API URL", ) - define('timeout', default=600, help="The idle timeout (in seconds)") - define('cull_every', default=0, + define('timeout', type=int, default=600, help="The idle timeout (in seconds)") + define('cull_every', type=int, default=0, help="The interval (in seconds) for checking for idle servers to cull") - define('max_age', default=0, + define('max_age', type=int, default=0, help="The maximum age (in seconds) of servers that should be culled even if they are active") - define('cull_users', default=False, + define('cull_users', type=bool, default=False, help="""Cull users in addition to servers. This is for use in temporary-user cases such as tmpnb.""", ) - define('concurrency', default=10, + define('concurrency', type=int, default=10, help="""Limit the number of concurrent requests made to the Hub. Deleting a lot of users at the same time can slow down the Hub, From a6dee394aabfbe2c99795f8e73049fdad49335fc Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Wed, 12 Jun 2019 17:32:33 +0300 Subject: [PATCH 6/8] Fixed test --- tests/test_configurer.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test_configurer.py b/tests/test_configurer.py index cfd8d3c..a2ef1e2 100644 --- a/tests/test_configurer.py +++ b/tests/test_configurer.py @@ -196,15 +196,14 @@ def test_cull_service_default(): cull_cmd = [ sys.executable, '/srv/src/tljh/cull_idle_servers.py', - '--timeout', '600', '--cull-every', '60', '--concurrency', '5', - '--max-age', '0' + '--timeout=600', '--cull-every=60', '--concurrency=5', + '--max-age=0' ] assert c.JupyterHub.services == [{ 'name': 'cull-idle', 'admin': True, 'command': cull_cmd, }] - assert c.TraefikTomlProxy.traefik_api_username == 'api_admin' def test_set_cull_service(): @@ -222,16 +221,14 @@ def test_set_cull_service(): }) cull_cmd = [ sys.executable, '/srv/src/tljh/cull_idle_servers.py', - '--timeout', '600', '--cull-every', '10', '--concurrency', '5', - '--max-age', '60', '--cull-users' + '--timeout=600', '--cull-every=10', '--concurrency=5', + '--max-age=60', '--cull-users' ] assert c.JupyterHub.services == [{ 'name': 'cull-idle', 'admin': True, 'command': cull_cmd, }] - assert c.TraefikTomlProxy.traefik_api_username == 'api_admin' - def test_load_secrets(tljh_dir): From 31d92199e14d6b53e5213cffb24f878754fdedeb Mon Sep 17 00:00:00 2001 From: GeorgianaElena Date: Thu, 13 Jun 2019 14:54:51 +0300 Subject: [PATCH 7/8] Added idle culler docs --- docs/index.rst | 1 + docs/topic/idle-culler.rst | 86 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 docs/topic/idle-culler.rst diff --git a/docs/index.rst b/docs/index.rst index 0171f2d..d87c203 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -123,6 +123,7 @@ Topic guides provide in-depth explanations of specific topics. topic/tljh-config topic/authenticator-configuration topic/escape-hatch + topic/idle-culler Troubleshooting diff --git a/docs/topic/idle-culler.rst b/docs/topic/idle-culler.rst new file mode 100644 index 0000000..cb3d760 --- /dev/null +++ b/docs/topic/idle-culler.rst @@ -0,0 +1,86 @@ +.. 
_topic/idle-culler:
+
+=============================
+Culling idle notebook servers
+=============================
+
+The idle culler is a hub-managed service that automatically shuts down idle
+single-user notebook servers in order to free up resources. After culling, any
+in-memory data will be lost.
+
+
+Disabling the idle culler
+=========================
+
+The idle culling service is enabled by default. To disable it, use the following
+command:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.enabled False
+
+
+Configuring the idle culler
+===========================
+
+By **default**, JupyterHub will:
+ * Run the culling process every minute.
+ * Cull any user servers that have been inactive for more than 10 minutes.
+
+The configuration options available are:
+
+Idle timeout
+------------
+
+The idle timeout (in seconds) can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.timeout <seconds>
+
+*By default services.cull.timeout = 600*
+
+Idle check interval
+-------------------
+
+The interval (in seconds) for checking for idle servers to cull can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.every <seconds>
+
+*By default services.cull.every = 60*
+
+Maximum age
+-----------
+
+The maximum age (in seconds) of servers that should be culled even if they are active
+can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.max_age <seconds>
+
+*By default services.cull.max_age = 0*
+
+User culling
+------------
+
+In addition to servers, the users will also be culled if the following command is used:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.users True
+
+*By default services.cull.users = False*
+
+Concurrency
+-----------
+
+The number of concurrent requests made to the Hub can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.concurrency <concurrency-limit>
+
+*By default services.cull.concurrency = 5*

From 346701502d510f6ee56552f50102232496bcbc28 Mon Sep 17 00:00:00 2001
From: GeorgianaElena
Date: Fri, 14 Jun 2019 14:18:39 +0300
Subject: [PATCH 8/8] Addressed docs feedback

---
 docs/topic/idle-culler.rst | 166 ++++++++++++++++++++++---------------
 1 file changed, 97 insertions(+), 69 deletions(-)

diff --git a/docs/topic/idle-culler.rst b/docs/topic/idle-culler.rst
index cb3d760..530c171 100644
--- a/docs/topic/idle-culler.rst
+++ b/docs/topic/idle-culler.rst
@@ -4,9 +4,102 @@
 Culling idle notebook servers
 =============================
 
-The idle culler is a hub-managed service that automatically shuts down idle
-single-user notebook servers in order to free up resources. After culling, any
-in-memory data will be lost.
+The idle culler automatically shuts down user notebook servers when they have
+not been used for a certain time period, in order to reduce the total resource
+usage on your JupyterHub.
+
+
+JupyterHub pings the user's notebook server at certain time intervals. If no response
+is received from the server during these checks and the timeout expires, the server is
+considered to be *inactive (idle)* and will be culled.
+
+
+Default settings
+================
+
+By default, JupyterHub will ping the user notebook servers every 60s to check their
+status. Any server found to be idle for more than 10 minutes will be culled.
+
+.. code-block:: python
+
+    services.cull.every = 60
+    services.cull.timeout = 600
+
+Because the servers don't have a maximum age set, an active server will not be shut down
+regardless of how long it has been up and running.
+
+.. code-block:: python
+
+    services.cull.max_age = 0
+
+If, after culling, a user no longer has any active notebook servers, the user itself
+is still not culled alongside their notebooks by default and will continue to exist.
+
+.. code-block:: python
+
+    services.cull.users = False
+
+
+Configuring the idle culler
+===========================
+
+The available configuration options are:
+
+Idle timeout
+------------
+The idle timeout is the maximum time (in seconds) a server can be inactive before it
+will be culled. The timeout can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.timeout <seconds>
+   sudo tljh-config reload
+
+Idle check interval
+-------------------
+The idle check interval represents how frequently (in seconds) the Hub will
+check for idle servers to cull. It can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.every <seconds>
+   sudo tljh-config reload
+
+Maximum age
+-----------
+The maximum age sets the longest time (in seconds) a server is allowed to run.
+Servers that exceed the maximum age will be culled even if they are active.
+A maximum age of 0 deactivates this option.
+The maximum age can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.max_age <seconds>
+   sudo tljh-config reload
+
+User culling
+------------
+In addition to servers, it is also possible to cull the users. This is mostly
+suited for temporary-user cases such as *tmpnb*.
+User culling can be activated using the following command:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.users True
+   sudo tljh-config reload
+
+Concurrency
+-----------
+Deleting a lot of users at the same time can slow down the Hub.
+The number of concurrent requests made to the Hub can be configured using:
+
+.. code-block:: bash
+
+   sudo tljh-config set services.cull.concurrency <concurrency-limit>
+   sudo tljh-config reload
+
+Because TLJH is typically used with a small number of users, cases that require
+modifying the concurrency limit should be rare.
 
 
 Disabling the idle culler
@@ -18,69 +111,4 @@ command:
 
 .. code-block:: bash
 
    sudo tljh-config set services.cull.enabled False
-
-
-Configuring the idle culler
-===========================
-
-By **default**, JupyterHub will:
- * Run the culling process every minute.
- * Cull any user servers that have been inactive for more than 10 minutes.
-
-The configuration options available are:
-
-Idle timeout
-------------
-
-The idle timeout (in seconds) can be configured using:
-
-.. code-block:: bash
-
-   sudo tljh-config set services.cull.timeout <seconds>
-
-*By default services.cull.timeout = 600*
-
-Idle check interval
--------------------
-
-The interval (in seconds) for checking for idle servers to cull can be configured using:
-
-.. code-block:: bash
-
-   sudo tljh-config set services.cull.every <seconds>
-
-*By default services.cull.every = 60*
-
-Maximum age
------------
-
-The maximum age (in seconds) of servers that should be culled even if they are active
-can be configured using:
-
-.. code-block:: bash
-
-   sudo tljh-config set services.cull.max_age <seconds>
-
-*By default services.cull.max_age = 0*
-
-User culling
-------------
-
-In addition to servers, the users will also be culled if the following command is used:
-
-.. code-block:: bash
-
-   sudo tljh-config set services.cull.users True
-
-*By default services.cull.users = False*
-
-Concurrency
------------
-
-The number of concurrent requests made to the Hub can be configured using:
-
-.. code-block:: bash
-
-   sudo tljh-config set services.cull.concurrency <concurrency-limit>
-
-*By default services.cull.concurrency = 5*
+   sudo tljh-config reload
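As a companion to these docs, the Hub API can be queried directly to see what the culler sees, using the same `/users` endpoint and token authentication that cull_idle_servers.py uses. A minimal sketch (assumptions, not part of the patches above: an admin API token exported as JUPYTERHUB_API_TOKEN, and the hub API reachable on the hub_port 15001 set in jupyterhub_config.py):

    import json
    import os
    from urllib.request import Request, urlopen

    # Same auth scheme as cull_idle_servers.py: a token in the Authorization header
    req = Request(
        'http://127.0.0.1:15001/hub/api/users',
        headers={'Authorization': 'token %s' % os.environ['JUPYTERHUB_API_TOKEN']},
    )
    for user in json.load(urlopen(req)):
        print(user['name'], user['last_activity'])

Each returned user model also carries the 'server'/'servers' and 'pending' fields that handle_user and handle_server branch on in the culler script.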