From 474005cbd1d71608e830bbdbc50262e5c6bf0bd3 Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Mon, 27 Apr 2020 16:39:03 +0530
Subject: [PATCH 1/4] Use idle culler from jupyterhub-idle-culler package

The idle culler lives as a script in at least 3 different places:

- In the JupyterHub repo, as an 'example':
  https://github.com/jupyterhub/jupyterhub/tree/d126baa443ad7d893be2ff4a70afe9ef5b8a4a1a/examples/cull-idle
- In the TLJH repo, as a core part of the service:
  https://github.com/jupyterhub/the-littlest-jupyterhub/blob/01ba34857dd4e316d839034ae2b3cc400b929964/tljh/cull_idle_servers.py
  This is an import from a specific version of the JupyterHub repo, and
  has had a couple of changes made to it since.
- In the z2jh repo, as a core part of the service:
  https://github.com/jupyterhub/zero-to-jupyterhub-k8s/blob/c3f3be25f8ae6c72d02f385f41983b70ee1d416e/jupyterhub/files/hub/cull_idle_servers.py
  This is also an import from a specific version of the JupyterHub repo,
  but has had a lot more work done on it. Most of those changes had been
  synced back to the JupyterHub repo, but some had not. See
  https://github.com/jupyterhub/zero-to-jupyterhub-k8s/commits/9c15a42b1227f3b54826f273f1689e4dc8c8e12e/images/hub/cull_idle_servers.py
  and
  https://github.com/jupyterhub/zero-to-jupyterhub-k8s/commits/master/jupyterhub/files/hub/cull_idle_servers.py

The idle culler is a core, integral part of every JupyterHub deployment
these days. It would be great if it were maintained separately on its
own, without being split across multiple repos.

The latest changes had been made to the version in the JupyterHub repo,
so I copied it (while preserving commit history, because credit is
important) to a new repository:
https://github.com/yuvipanda/jupyterhub-idle-culler

I looked through the z2jh and tljh copies, and cherry-picked the
following changes manually:

- https://github.com/jupyterhub/zero-to-jupyterhub-k8s/commit/ae80fb516337b653306aa4a74f4118a38c3cb3f6
- https://github.com/jupyterhub/zero-to-jupyterhub-k8s/commit/836f19a4c7445f3dc32120c892873275ff870d1a
- https://github.com/jupyterhub/zero-to-jupyterhub-k8s/commit/a0787c64f19550ec96f2e06a3d7079e029fc6465
- https://github.com/jupyterhub/zero-to-jupyterhub-k8s/commit/b230ef8156924e25368e7abf2174a9d1edf99c55
- https://github.com/jupyterhub/the-littlest-jupyterhub/commit/20374db7c6ce2934ce700873b9d1225929e29456#diff-f00cd100e9f673285208aaa6fc0c3212

There were a few changes from
https://github.com/jupyterhub/zero-to-jupyterhub-k8s/commits/9c15a42b1227f3b54826f273f1689e4dc8c8e12e/images/hub/cull_idle_servers.py
that I could not apply, mostly because those features had already been
re-implemented.

Right now, the package is a direct port of the code we had. Once this
settles in, I am hopeful we can iterate faster and make cool new
changes.
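
For reference, a minimal sketch of registering the new package as a
hub-managed service, mirroring the config documented in the old script's
docstring. The 'cull-idle' service name and the 3600s timeout are
illustrative values, not anything prescribed by this change:

    # In jupyterhub_config.py; `c` is the config object JupyterHub provides.
    import sys

    c.JupyterHub.services = [
        {
            'name': 'cull-idle',
            'admin': True,
            # Invoke the installed module instead of a vendored script
            'command': [
                sys.executable, '-m', 'jupyterhub_idle_culler',
                '--timeout=3600',
            ],
        }
    ]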
---
 tljh/configurer.py        |   2 +-
 tljh/cull_idle_servers.py | 342 --------------------------------------
 tljh/installer.py         |   1 +
 3 files changed, 2 insertions(+), 343 deletions(-)
 delete mode 100644 tljh/cull_idle_servers.py

diff --git a/tljh/configurer.py b/tljh/configurer.py
index 3a9ec05..73f602b 100644
--- a/tljh/configurer.py
+++ b/tljh/configurer.py
@@ -218,7 +218,7 @@ def set_cull_idle_service(config):
     Set Idle Culler service
     """
     cull_cmd = [
-        sys.executable, '-m', 'tljh.cull_idle_servers'
+        sys.executable, '-m', 'jupyterhub_idle_culler'
     ]
     cull_config = config['services']['cull']
     print()
diff --git a/tljh/cull_idle_servers.py b/tljh/cull_idle_servers.py
deleted file mode 100644
index 32524dd..0000000
--- a/tljh/cull_idle_servers.py
+++ /dev/null
@@ -1,342 +0,0 @@
-#!/usr/bin/env python3
-"""script to monitor and cull idle single-user servers
-
-Imported from https://github.com/jupyterhub/jupyterhub/blob/6b1046697/examples/cull-idle/cull_idle_servers.py
-
-Caveats:
-
-last_activity is not updated with high frequency,
-so cull timeout should be greater than the sum of:
-
-- single-user websocket ping interval (default: 30s)
-- JupyterHub.last_activity_interval (default: 5 minutes)
-
-You can run this as a service managed by JupyterHub with this in your config::
-
-
-    c.JupyterHub.services = [
-        {
-            'name': 'cull-idle',
-            'admin': True,
-            'command': 'python cull_idle_servers.py --timeout=3600'.split(),
-        }
-    ]
-
-Or run it manually by generating an API token and storing it in `JUPYTERHUB_API_TOKEN`:
-
-    export JUPYTERHUB_API_TOKEN=`jupyterhub token`
-    python cull_idle_servers.py [--timeout=900] [--url=http://127.0.0.1:8081/hub/api]
-"""
-
-from datetime import datetime, timezone
-from functools import partial
-import json
-import os
-
-try:
-    from urllib.parse import quote
-except ImportError:
-    from urllib import quote
-
-import dateutil.parser
-
-from tornado.gen import coroutine, multi
-from tornado.locks import Semaphore
-from tornado.log import app_log
-from tornado.httpclient import AsyncHTTPClient, HTTPRequest
-from tornado.ioloop import IOLoop, PeriodicCallback
-from tornado.options import define, options, parse_command_line
-
-
-def parse_date(date_string):
-    """Parse a timestamp
-
-    If it doesn't have a timezone, assume utc
-
-    Returned datetime object will always be timezone-aware
-    """
-    dt = dateutil.parser.parse(date_string)
-    if not dt.tzinfo:
-        # assume naïve timestamps are UTC
-        dt = dt.replace(tzinfo=timezone.utc)
-    return dt
-
-
-def format_td(td):
-    """
-    Nicely format a timedelta object
-
-    as HH:MM:SS
-    """
-    if td is None:
-        return "unknown"
-    if isinstance(td, str):
-        return td
-    seconds = int(td.total_seconds())
-    h = seconds // 3600
-    seconds = seconds % 3600
-    m = seconds // 60
-    seconds = seconds % 60
-    return f"{h:02}:{m:02}:{seconds:02}"
-
-
-@coroutine
-def cull_idle(url, api_token, inactive_limit, cull_users=False, max_age=0, concurrency=10):
-    """Shutdown idle single-user servers
-
-    If cull_users, inactive *users* will be deleted as well.
- """ - auth_header = { - 'Authorization': 'token %s' % api_token, - } - req = HTTPRequest( - url=url + '/users', - headers=auth_header, - ) - now = datetime.now(timezone.utc) - client = AsyncHTTPClient() - - if concurrency: - semaphore = Semaphore(concurrency) - @coroutine - def fetch(req): - """client.fetch wrapped in a semaphore to limit concurrency""" - yield semaphore.acquire() - try: - return (yield client.fetch(req)) - finally: - yield semaphore.release() - else: - fetch = client.fetch - - resp = yield fetch(req) - users = json.loads(resp.body.decode('utf8', 'replace')) - futures = [] - - @coroutine - def handle_server(user, server_name, server): - """Handle (maybe) culling a single server - - Returns True if server is now stopped (user removable), - False otherwise. - """ - log_name = user['name'] - if server_name: - log_name = '%s/%s' % (user['name'], server_name) - if server.get('pending'): - app_log.warning( - "Not culling server %s with pending %s", - log_name, server['pending']) - return False - - if server.get('started'): - age = now - parse_date(server['started']) - else: - # started may be undefined on jupyterhub < 0.9 - age = None - - # check last activity - # last_activity can be None in 0.9 - if server['last_activity']: - inactive = now - parse_date(server['last_activity']) - else: - # no activity yet, use start date - # last_activity may be None with jupyterhub 0.9, - # which introduces the 'started' field which is never None - # for running servers - inactive = age - - should_cull = (inactive is not None and - inactive.total_seconds() >= inactive_limit) - if should_cull: - app_log.info( - "Culling server %s (inactive for %s)", - log_name, format_td(inactive)) - - if max_age and not should_cull: - # only check started if max_age is specified - # so that we can still be compatible with jupyterhub 0.8 - # which doesn't define the 'started' field - if age is not None and age.total_seconds() >= max_age: - app_log.info( - "Culling server %s (age: %s, inactive for %s)", - log_name, format_td(age), format_td(inactive)) - should_cull = True - - if not should_cull: - app_log.debug( - "Not culling server %s (age: %s, inactive for %s)", - log_name, format_td(age), format_td(inactive)) - return False - - req = HTTPRequest( - url=url + '/users/%s/server' % quote(user['name']), - method='DELETE', - headers=auth_header, - ) - resp = yield fetch(req) - if resp.code == 202: - app_log.warning( - "Server %s is slow to stop", - log_name, - ) - # return False to prevent culling user with pending shutdowns - return False - return True - - @coroutine - def handle_user(user): - """Handle one user. - - Create a list of their servers, and async exec them. Wait for - that to be done, and if all servers are stopped, possibly cull - the user. - """ - # shutdown servers first. - # Hub doesn't allow deleting users with running servers. 
-        # named servers contain the 'servers' dict
-        if 'servers' in user:
-            servers = user['servers']
-        # Otherwise, server data is intermingled in with the user
-        # model
-        else:
-            servers = {}
-            if user['server']:
-                servers[''] = {
-                    'started': user.get('started'),
-                    'last_activity': user['last_activity'],
-                    'pending': user['pending'],
-                    'url': user['server'],
-                }
-        server_futures = [
-            handle_server(user, server_name, server)
-            for server_name, server in servers.items()
-        ]
-        results = yield multi(server_futures)
-        if not cull_users:
-            return
-        # some servers are still running, cannot cull users
-        still_alive = len(results) - sum(results)
-        if still_alive:
-            app_log.debug(
-                "Not culling user %s with %i servers still alive",
-                user['name'], still_alive)
-            return False
-
-        should_cull = False
-        if user.get('created'):
-            age = now - parse_date(user['created'])
-        else:
-            # created may be undefined on jupyterhub < 0.9
-            age = None
-
-        # check last activity
-        # last_activity can be None in 0.9
-        if user['last_activity']:
-            inactive = now - parse_date(user['last_activity'])
-        else:
-            # no activity yet, use start date
-            # last_activity may be None with jupyterhub 0.9,
-            # which introduces the 'created' field which is never None
-            inactive = age
-
-        should_cull = (inactive is not None and
-                       inactive.total_seconds() >= inactive_limit)
-        if should_cull:
-            app_log.info(
-                "Culling user %s (inactive for %s)",
-                user['name'], inactive)
-
-        if max_age and not should_cull:
-            # only check created if max_age is specified
-            # so that we can still be compatible with jupyterhub 0.8
-            # which doesn't define the 'started' field
-            if age is not None and age.total_seconds() >= max_age:
-                app_log.info(
-                    "Culling user %s (age: %s, inactive for %s)",
-                    user['name'], format_td(age), format_td(inactive))
-                should_cull = True
-
-        if not should_cull:
-            app_log.debug(
-                "Not culling user %s (created: %s, last active: %s)",
-                user['name'], format_td(age), format_td(inactive))
-            return False
-
-        req = HTTPRequest(
-            url=url + '/users/%s' % user['name'],
-            method='DELETE',
-            headers=auth_header,
-        )
-        yield fetch(req)
-        return True
-
-    for user in users:
-        futures.append((user['name'], handle_user(user)))
-
-    for (name, f) in futures:
-        try:
-            result = yield f
-        except Exception:
-            app_log.exception("Error processing %s", name)
-        else:
-            if result:
-                app_log.debug("Finished culling %s", name)
-
-
-if __name__ == '__main__':
-    define(
-        'url',
-        default=os.environ.get('JUPYTERHUB_API_URL'),
-        help="The JupyterHub API URL",
-    )
-    define('timeout', type=int, default=600, help="The idle timeout (in seconds)")
-    define('cull_every', type=int, default=0,
-        help="The interval (in seconds) for checking for idle servers to cull")
-    define('max_age', type=int, default=0,
-        help="The maximum age (in seconds) of servers that should be culled even if they are active")
-    define('cull_users', type=bool, default=False,
-        help="""Cull users in addition to servers.
-        This is for use in temporary-user cases such as tmpnb.""",
-    )
-    define('concurrency', type=int, default=10,
-        help="""Limit the number of concurrent requests made to the Hub.
-
-        Deleting a lot of users at the same time can slow down the Hub,
-        so limit the number of API requests we have outstanding at any given time.
- """ - ) - - parse_command_line() - if not options.cull_every: - options.cull_every = options.timeout // 2 - api_token = os.environ['JUPYTERHUB_API_TOKEN'] - - try: - AsyncHTTPClient.configure("tornado.curl_httpclient.CurlAsyncHTTPClient") - except ImportError as e: - app_log.warning( - "Could not load pycurl: %s\n" - "pycurl is recommended if you have a large number of users.", - e) - - loop = IOLoop.current() - cull = partial( - cull_idle, - url=options.url, - api_token=api_token, - inactive_limit=options.timeout, - cull_users=options.cull_users, - max_age=options.max_age, - concurrency=options.concurrency, - ) - # schedule first cull immediately - # because PeriodicCallback doesn't start until the end of the first interval - loop.add_callback(cull) - # schedule periodic cull - pc = PeriodicCallback(cull, 1e3 * options.cull_every) - pc.start() - try: - loop.start() - except KeyboardInterrupt: - pass diff --git a/tljh/installer.py b/tljh/installer.py index c13a6e9..fd6be5d 100644 --- a/tljh/installer.py +++ b/tljh/installer.py @@ -225,6 +225,7 @@ def ensure_jupyterhub_package(prefix): 'jupyterhub-ldapauthenticator==1.3.0', 'jupyterhub-tmpauthenticator==0.6', 'oauthenticator==0.10.0', + 'git+https://github.com/yuvipanda/jupyterhub-idle-culler@4e710c0f45d57a7435d9ae79055a4ce499052a1c' ]) traefik.ensure_traefik_binary(prefix) From 41474e97e3db785240bfceac3a029bdfff994802 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 27 Apr 2020 17:07:54 +0530 Subject: [PATCH 2/4] Update tests to check for newer location of idle culler --- tests/test_configurer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_configurer.py b/tests/test_configurer.py index 0481585..fa3090e 100644 --- a/tests/test_configurer.py +++ b/tests/test_configurer.py @@ -213,7 +213,7 @@ def test_cull_service_default(): c = apply_mock_config({}) cull_cmd = [ - sys.executable, '-m', 'tljh.cull_idle_servers', + sys.executable, '-m', 'jupyterhub_idle_culler', '--timeout=600', '--cull-every=60', '--concurrency=5', '--max-age=0' ] @@ -238,7 +238,7 @@ def test_set_cull_service(): } }) cull_cmd = [ - sys.executable, '-m', 'tljh.cull_idle_servers', + sys.executable, '-m', 'jupyterhub_idle_culler', '--timeout=600', '--cull-every=10', '--concurrency=5', '--max-age=60', '--cull-users' ] @@ -261,7 +261,7 @@ def test_load_secrets(tljh_dir): c = apply_mock_config(tljh_config) assert c.TraefikTomlProxy.traefik_api_password == "traefik-password" - + def test_auth_native(): """ Test setting Native Authenticator From 1c7e89ab1ea7dd6b62f4fb48d531196a862b2fe8 Mon Sep 17 00:00:00 2001 From: YuviPanda Date: Mon, 27 Apr 2020 18:27:59 +0530 Subject: [PATCH 3/4] Install jupyterhub-idle-culler from PyPI A v1.0 release has been made! 
---
 tljh/installer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tljh/installer.py b/tljh/installer.py
index fd6be5d..0a32173 100644
--- a/tljh/installer.py
+++ b/tljh/installer.py
@@ -225,7 +225,7 @@ def ensure_jupyterhub_package(prefix):
         'jupyterhub-ldapauthenticator==1.3.0',
         'jupyterhub-tmpauthenticator==0.6',
         'oauthenticator==0.10.0',
-        'git+https://github.com/yuvipanda/jupyterhub-idle-culler@4e710c0f45d57a7435d9ae79055a4ce499052a1c'
+        'jupyterhub-idle-culler==1.0'
     ])
     traefik.ensure_traefik_binary(prefix)

From ed9430e6b9cf2829f88f2d0023f7951a1908515c Mon Sep 17 00:00:00 2001
From: YuviPanda
Date: Mon, 4 May 2020 12:42:32 +0530
Subject: [PATCH 4/4] [test] Provide 1.0G rather than 1G for integration tests

I couldn't figure out why JupyterLab fails to build with 1G on the
initial commit, but succeeds if you make a bunch of other commits
afterwards. Previously, I binary searched down from 2G. Here, I just add
a new, no-op commit to see if that helps.

---
 .circleci/integration-test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/integration-test.py b/.circleci/integration-test.py
index 7afbe88..0c47235 100755
--- a/.circleci/integration-test.py
+++ b/.circleci/integration-test.py
@@ -32,7 +32,7 @@ def run_systemd_image(image_name, container_name, bootstrap_pip_spec):
         # This is the minimum VM size we support. JupyterLab extensions seem
         # to need at least this much RAM to build. Boo?
         # If we change this, need to change all other references to this number.
-        '--memory', '1G',
+        '--memory', '1.0G',
     ]

     if bootstrap_pip_spec: