commands: use a single ThreadPool for spack versions (#16749)

This fixes a fork bomb in `spack versions`. Recursive generation of pools to scrape URLs in `_spider` was creating large numbers of processes. Instead of recursively creating process pools, we now use a single `ThreadPool` with a concurrency limit.

More on the issue: having ~10 users running `spack versions` at the same time on front-end nodes caused a kernel lockup due to the high number of sockets opened (sys-admins reported ~210k sockets distributed over 3 nodes). The users were internal, so they had `ulimit -n` set to ~70k.

The forking behavior could be observed by just running:

    $ spack versions boost

and checking the number of processes spawned. The number of processes per se was not the issue, but each one of them opens a socket, which can stress `iptables`. In the original issue the kernel watchdog was reporting:

    Message from syslogd@login03 at May 19 12:01:30 ...
    kernel:Watchdog CPU:110 Hard LOCKUP

    Message from syslogd@login03 at May 19 12:01:31 ...
    kernel:watchdog: BUG: soft lockup - CPU#110 stuck for 23s! [python3:2756]

    Message from syslogd@login03 at May 19 12:01:31 ...
    kernel:watchdog: BUG: soft lockup - CPU#94 stuck for 22s! [iptables:5603]
This commit is contained in:
parent 92e24950e5
commit 5b272e3ff3
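For orientation, here is a minimal sketch, not Spack's actual code, of the pattern the change adopts: one bounded `ThreadPool` drains a breadth-first queue of URLs level by level, so the number of simultaneous requests (and open sockets) never exceeds the pool size. The helper names (`fetch_page`, `crawl`) are illustrative only.

```python
# Minimal sketch of "one bounded ThreadPool instead of recursive process pools".
# Names are illustrative; this is not Spack's implementation.
import multiprocessing.pool


def fetch_page(url):
    """Stand-in fetch: return the links found on `url` (stubbed out here)."""
    return []  # a real implementation would download and parse the page


def crawl(root_urls, depth=0, concurrency=32):
    """Breadth-first crawl that reuses a single bounded ThreadPool per level."""
    pool = multiprocessing.pool.ThreadPool(processes=concurrency)
    try:
        visited = set(root_urls)
        current = list(root_urls)
        for _ in range(depth + 1):
            # Every URL of the current level shares the same bounded pool,
            # so at most `concurrency` requests are in flight at any time.
            results = pool.map(fetch_page, current)
            current = [link for links in results for link in links
                       if link not in visited]
            visited.update(current)
    finally:
        pool.terminate()
        pool.join()
    return visited


print(crawl(['https://example.com'], depth=1, concurrency=8))
```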
@@ -21,6 +21,10 @@
 def setup_parser(subparser):
     subparser.add_argument('-s', '--safe-only', action='store_true',
                            help='only list safe versions of the package')
+    subparser.add_argument(
+        '-c', '--concurrency', default=32, type=int,
+        help='number of concurrent requests'
+    )
     arguments.add_common_arguments(subparser, ['package'])


@@ -45,7 +49,7 @@ def versions(parser, args):
     if sys.stdout.isatty():
         tty.msg('Remote versions (not yet checksummed):')

-    fetched_versions = pkg.fetch_remote_versions()
+    fetched_versions = pkg.fetch_remote_versions(args.concurrency)
     remote_versions = set(fetched_versions).difference(safe_versions)

     if not remote_versions:
@@ -2020,7 +2020,7 @@ def all_urls(self):
                 urls.append(args['url'])
         return urls

-    def fetch_remote_versions(self):
+    def fetch_remote_versions(self, concurrency=128):
         """Find remote versions of this package.

         Uses ``list_url`` and any other URLs listed in the package file.
@@ -2033,7 +2033,8 @@ def fetch_remote_versions(self):

         try:
             return spack.util.web.find_versions_of_archive(
-                self.all_urls, self.list_url, self.list_depth)
+                self.all_urls, self.list_url, self.list_depth, concurrency
+            )
         except spack.util.web.NoNetworkConnectionError as e:
             tty.die("Package.fetch_versions couldn't connect to:", e.url,
                     e.message)
@@ -2,125 +2,101 @@
 # Spack Project Developers. See the top-level COPYRIGHT file for details.
 #
 # SPDX-License-Identifier: (Apache-2.0 OR MIT)

 """Tests for web.py."""
 import os

+import ordereddict_backport
 import pytest

-from ordereddict_backport import OrderedDict
-
 import spack.paths
-import spack.util.web as web_util
+import spack.util.web
 from spack.version import ver


-web_data_path = os.path.join(spack.paths.test_path, 'data', 'web')
-
-root = 'file://' + web_data_path + '/index.html'
-root_tarball = 'file://' + web_data_path + '/foo-0.0.0.tar.gz'
-
-page_1 = 'file://' + os.path.join(web_data_path, '1.html')
-page_2 = 'file://' + os.path.join(web_data_path, '2.html')
-page_3 = 'file://' + os.path.join(web_data_path, '3.html')
-page_4 = 'file://' + os.path.join(web_data_path, '4.html')
+def _create_url(relative_url):
+    web_data_path = os.path.join(spack.paths.test_path, 'data', 'web')
+    return 'file://' + os.path.join(web_data_path, relative_url)


-def test_spider_0():
-    pages, links = web_util.spider(root, depth=0)
-
-    assert root in pages
-    assert page_1 not in pages
-    assert page_2 not in pages
-    assert page_3 not in pages
-    assert page_4 not in pages
-
-    assert "This is the root page." in pages[root]
-
-    assert root not in links
-    assert page_1 in links
-    assert page_2 not in links
-    assert page_3 not in links
-    assert page_4 not in links
+root = _create_url('index.html')
+root_tarball = _create_url('foo-0.0.0.tar.gz')
+page_1 = _create_url('1.html')
+page_2 = _create_url('2.html')
+page_3 = _create_url('3.html')
+page_4 = _create_url('4.html')


-def test_spider_1():
-    pages, links = web_util.spider(root, depth=1)
+@pytest.mark.parametrize(
+    'depth,expected_found,expected_not_found,expected_text', [
+        (0,
+         {'pages': [root], 'links': [page_1]},
+         {'pages': [page_1, page_2, page_3, page_4],
+          'links': [root, page_2, page_3, page_4]},
+         {root: "This is the root page."}),
+        (1,
+         {'pages': [root, page_1], 'links': [page_1, page_2]},
+         {'pages': [page_2, page_3, page_4],
+          'links': [root, page_3, page_4]},
+         {root: "This is the root page.",
+          page_1: "This is page 1."}),
+        (2,
+         {'pages': [root, page_1, page_2],
+          'links': [page_1, page_2, page_3, page_4]},
+         {'pages': [page_3, page_4], 'links': [root]},
+         {root: "This is the root page.",
+          page_1: "This is page 1.",
+          page_2: "This is page 2."}),
+        (3,
+         {'pages': [root, page_1, page_2, page_3, page_4],
+          'links': [root, page_1, page_2, page_3, page_4]},
+         {'pages': [], 'links': []},
+         {root: "This is the root page.",
+          page_1: "This is page 1.",
+          page_2: "This is page 2.",
+          page_3: "This is page 3.",
+          page_4: "This is page 4."}),
+    ])
+def test_spider(depth, expected_found, expected_not_found, expected_text):
+    pages, links = spack.util.web.spider(root, depth=depth)

-    assert root in pages
-    assert page_1 in pages
-    assert page_2 not in pages
-    assert page_3 not in pages
-    assert page_4 not in pages
+    for page in expected_found['pages']:
+        assert page in pages

-    assert "This is the root page." in pages[root]
-    assert "This is page 1." in pages[page_1]
+    for page in expected_not_found['pages']:
+        assert page not in pages

-    assert root not in links
-    assert page_1 in links
-    assert page_2 in links
-    assert page_3 not in links
-    assert page_4 not in links
+    for link in expected_found['links']:
+        assert link in links
+
+    for link in expected_not_found['links']:
+        assert link not in links
+
+    for page, text in expected_text.items():
+        assert text in pages[page]


-def test_spider_2():
-    pages, links = web_util.spider(root, depth=2)
-
-    assert root in pages
-    assert page_1 in pages
-    assert page_2 in pages
-    assert page_3 not in pages
-    assert page_4 not in pages
-
-    assert "This is the root page." in pages[root]
-    assert "This is page 1." in pages[page_1]
-    assert "This is page 2." in pages[page_2]
-
-    assert root not in links
-    assert page_1 in links
-    assert page_1 in links
-    assert page_2 in links
-    assert page_3 in links
-    assert page_4 in links
-
-
-def test_spider_3():
-    pages, links = web_util.spider(root, depth=3)
-
-    assert root in pages
-    assert page_1 in pages
-    assert page_2 in pages
-    assert page_3 in pages
-    assert page_4 in pages
-
-    assert "This is the root page." in pages[root]
-    assert "This is page 1." in pages[page_1]
-    assert "This is page 2." in pages[page_2]
-    assert "This is page 3." in pages[page_3]
-    assert "This is page 4." in pages[page_4]
-
-    assert root in links  # circular link on page 3
-    assert page_1 in links
-    assert page_1 in links
-    assert page_2 in links
-    assert page_3 in links
-    assert page_4 in links
+def test_spider_no_response(monkeypatch):
+    # Mock the absence of a response
+    monkeypatch.setattr(
+        spack.util.web, 'read_from_url', lambda x, y: (None, None, None)
+    )
+    pages, links = spack.util.web.spider(root, depth=0)
+    assert not pages and not links


 def test_find_versions_of_archive_0():
-    versions = web_util.find_versions_of_archive(
+    versions = spack.util.web.find_versions_of_archive(
         root_tarball, root, list_depth=0)
     assert ver('0.0.0') in versions


 def test_find_versions_of_archive_1():
-    versions = web_util.find_versions_of_archive(
+    versions = spack.util.web.find_versions_of_archive(
         root_tarball, root, list_depth=1)
     assert ver('0.0.0') in versions
     assert ver('1.0.0') in versions


 def test_find_versions_of_archive_2():
-    versions = web_util.find_versions_of_archive(
+    versions = spack.util.web.find_versions_of_archive(
         root_tarball, root, list_depth=2)
     assert ver('0.0.0') in versions
     assert ver('1.0.0') in versions
@@ -128,14 +104,14 @@ def test_find_versions_of_archive_2():


 def test_find_exotic_versions_of_archive_2():
-    versions = web_util.find_versions_of_archive(
+    versions = spack.util.web.find_versions_of_archive(
         root_tarball, root, list_depth=2)
     # up for grabs to make this better.
     assert ver('2.0.0b2') in versions


 def test_find_versions_of_archive_3():
-    versions = web_util.find_versions_of_archive(
+    versions = spack.util.web.find_versions_of_archive(
         root_tarball, root, list_depth=3)
     assert ver('0.0.0') in versions
     assert ver('1.0.0') in versions
@@ -145,7 +121,7 @@ def test_find_versions_of_archive_3():


 def test_find_exotic_versions_of_archive_3():
-    versions = web_util.find_versions_of_archive(
+    versions = spack.util.web.find_versions_of_archive(
         root_tarball, root, list_depth=3)
     assert ver('2.0.0b2') in versions
     assert ver('3.0a1') in versions
@@ -159,35 +135,35 @@ def test_get_header():

     # looking up headers should just work like a plain dict
     # lookup when there is an entry with the right key
-    assert(web_util.get_header(headers, 'Content-type') == 'text/plain')
+    assert(spack.util.web.get_header(headers, 'Content-type') == 'text/plain')

     # looking up headers should still work if there is a fuzzy match
-    assert(web_util.get_header(headers, 'contentType') == 'text/plain')
+    assert(spack.util.web.get_header(headers, 'contentType') == 'text/plain')

     # ...unless there is an exact match for the "fuzzy" spelling.
     headers['contentType'] = 'text/html'
-    assert(web_util.get_header(headers, 'contentType') == 'text/html')
+    assert(spack.util.web.get_header(headers, 'contentType') == 'text/html')

     # If lookup has to fallback to fuzzy matching and there are more than one
     # fuzzy match, the result depends on the internal ordering of the given
     # mapping
-    headers = OrderedDict()
+    headers = ordereddict_backport.OrderedDict()
     headers['Content-type'] = 'text/plain'
     headers['contentType'] = 'text/html'

-    assert(web_util.get_header(headers, 'CONTENT_TYPE') == 'text/plain')
+    assert(spack.util.web.get_header(headers, 'CONTENT_TYPE') == 'text/plain')
     del headers['Content-type']
-    assert(web_util.get_header(headers, 'CONTENT_TYPE') == 'text/html')
+    assert(spack.util.web.get_header(headers, 'CONTENT_TYPE') == 'text/html')

     # Same as above, but different ordering
-    headers = OrderedDict()
+    headers = ordereddict_backport.OrderedDict()
     headers['contentType'] = 'text/html'
     headers['Content-type'] = 'text/plain'

-    assert(web_util.get_header(headers, 'CONTENT_TYPE') == 'text/html')
+    assert(spack.util.web.get_header(headers, 'CONTENT_TYPE') == 'text/html')
     del headers['contentType']
-    assert(web_util.get_header(headers, 'CONTENT_TYPE') == 'text/plain')
+    assert(spack.util.web.get_header(headers, 'CONTENT_TYPE') == 'text/plain')

     # If there isn't even a fuzzy match, raise KeyError
     with pytest.raises(KeyError):
-        web_util.get_header(headers, 'ContentLength')
+        spack.util.web.get_header(headers, 'ContentLength')
@@ -7,17 +7,18 @@

 import codecs
 import errno
-import re
+import multiprocessing.pool
 import os
 import os.path
+import re
 import shutil
 import ssl
 import sys
 import traceback

-from six.moves.urllib.request import urlopen, Request
+import six
 from six.moves.urllib.error import URLError
-import multiprocessing.pool
+from six.moves.urllib.request import urlopen, Request

 try:
     # Python 2 had these in the HTMLParser package.
@@ -63,34 +64,6 @@ def handle_starttag(self, tag, attrs):
                     self.links.append(val)


-class NonDaemonProcess(multiprocessing.Process):
-    """Process that allows sub-processes, so pools can have sub-pools."""
-    @property
-    def daemon(self):
-        return False
-
-    @daemon.setter
-    def daemon(self, value):
-        pass
-
-
-if sys.version_info[0] < 3:
-    class NonDaemonPool(multiprocessing.pool.Pool):
-        """Pool that uses non-daemon processes"""
-        Process = NonDaemonProcess
-else:
-
-    class NonDaemonContext(type(multiprocessing.get_context())):  # novm
-        Process = NonDaemonProcess
-
-    class NonDaemonPool(multiprocessing.pool.Pool):
-        """Pool that uses non-daemon processes"""
-
-        def __init__(self, *args, **kwargs):
-            kwargs['context'] = NonDaemonContext()
-            super(NonDaemonPool, self).__init__(*args, **kwargs)
-
-
 def uses_ssl(parsed_url):
     if parsed_url.scheme == 'https':
         return True
@@ -336,109 +309,152 @@ def list_url(url):
             for key in _iter_s3_prefix(s3, url)))


-def _spider(url, visited, root, depth, max_depth, raise_on_error):
-    """Fetches URL and any pages it links to up to max_depth.
+def spider(root_urls, depth=0, concurrency=32):
+    """Get web pages from root URLs.

-    depth should initially be zero, and max_depth is the max depth of
-    links to follow from the root.
+    If depth is specified (e.g., depth=2), then this will also follow
+    up to <depth> levels of links from each root.

-    Prints out a warning only if the root can't be fetched; it ignores
-    errors with pages that the root links to.
+    Args:
+        root_urls (str or list of str): root urls used as a starting point
+            for spidering
+        depth (int): level of recursion into links
+        concurrency (int): number of simultaneous requests that can be sent

-    Returns a tuple of:
-    - pages: dict of pages visited (URL) mapped to their full text.
-    - links: set of links encountered while visiting the pages.
+    Returns:
+        A dict of pages visited (URL) mapped to their full text and the
+        set of visited links.
     """
-    pages = {}     # dict from page URL -> text content.
-    links = set()  # set of all links seen on visited pages.
+    # Cache of visited links, meant to be captured by the closure below
+    _visited = set()

-    try:
-        response_url, _, response = read_from_url(url, 'text/html')
-        if not response_url or not response:
-            return pages, links
+    def _spider(url, collect_nested):
+        """Fetches URL and any pages it links to.

-        page = codecs.getreader('utf-8')(response).read()
-        pages[response_url] = page
+        Prints out a warning only if the root can't be fetched; it ignores
+        errors with pages that the root links to.

-        # Parse out the links in the page
-        link_parser = LinkParser()
+        Args:
+            url (str): url being fetched and searched for links
+            collect_nested (bool): whether we want to collect arguments
+                for nested spidering on the links found in this url

+        Returns:
+            A tuple of:
+            - pages: dict of pages visited (URL) mapped to their full text.
+            - links: set of links encountered while visiting the pages.
+            - spider_args: argument for subsequent call to spider
+        """
+        pages = {}     # dict from page URL -> text content.
+        links = set()  # set of all links seen on visited pages.
+        subcalls = []
-        link_parser.feed(page)

-        while link_parser.links:
-            raw_link = link_parser.links.pop()
-            abs_link = url_util.join(
-                response_url,
-                raw_link.strip(),
-                resolve_href=True)
-            links.add(abs_link)
+        try:
+            response_url, _, response = read_from_url(url, 'text/html')
+            if not response_url or not response:
+                return pages, links, subcalls

-            # Skip stuff that looks like an archive
-            if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
-                continue
+            page = codecs.getreader('utf-8')(response).read()
+            pages[response_url] = page

-            # Skip things outside the root directory
-            if not abs_link.startswith(root):
-                continue
+            # Parse out the links in the page
+            link_parser = LinkParser()
+            link_parser.feed(page)

-            # Skip already-visited links
-            if abs_link in visited:
-                continue
+            while link_parser.links:
+                raw_link = link_parser.links.pop()
+                abs_link = url_util.join(
+                    response_url,
+                    raw_link.strip(),
+                    resolve_href=True)
+                links.add(abs_link)

-            # If we're not at max depth, follow links.
-            if depth < max_depth:
-                subcalls.append((abs_link, visited, root,
-                                 depth + 1, max_depth, raise_on_error))
-                visited.add(abs_link)
+                # Skip stuff that looks like an archive
+                if any(raw_link.endswith(s) for s in ALLOWED_ARCHIVE_TYPES):
+                    continue

-        if subcalls:
-            pool = NonDaemonPool(processes=len(subcalls))
-            try:
-                results = pool.map(_spider_wrapper, subcalls)
+                # Skip already-visited links
+                if abs_link in _visited:
+                    continue

-                for sub_pages, sub_links in results:
-                    pages.update(sub_pages)
-                    links.update(sub_links)
+                # If we're not at max depth, follow links.
+                if collect_nested:
+                    subcalls.append((abs_link,))
+                    _visited.add(abs_link)

-            finally:
-                pool.terminate()
-                pool.join()
+        except URLError as e:
+            tty.debug(str(e))

-    except URLError as e:
-        tty.debug(e)
+            if hasattr(e, 'reason') and isinstance(e.reason, ssl.SSLError):
+                tty.warn("Spack was unable to fetch url list due to a "
+                         "certificate verification problem. You can try "
+                         "running spack -k, which will not check SSL "
+                         "certificates. Use this at your own risk.")

-        if hasattr(e, 'reason') and isinstance(e.reason, ssl.SSLError):
-            tty.warn("Spack was unable to fetch url list due to a certificate "
-                     "verification problem. You can try running spack -k, "
-                     "which will not check SSL certificates. Use this at your "
-                     "own risk.")
+        except HTMLParseError as e:
+            # This error indicates that Python's HTML parser sucks.
+            msg = "Got an error parsing HTML."

-        if raise_on_error:
-            raise NoNetworkConnectionError(str(e), url)
+            # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
+            if sys.version_info[:3] < (2, 7, 3):
+                msg += " Use Python 2.7.3 or newer for better HTML parsing."

-    except HTMLParseError as e:
-        # This error indicates that Python's HTML parser sucks.
-        msg = "Got an error parsing HTML."
+            tty.warn(msg, url, "HTMLParseError: " + str(e))

-        # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
-        if sys.version_info[:3] < (2, 7, 3):
-            msg += " Use Python 2.7.3 or newer for better HTML parsing."
+        except Exception as e:
+            # Other types of errors are completely ignored,
+            # except in debug mode
+            tty.debug("Error in _spider: %s:%s" % (type(e), str(e)),
+                      traceback.format_exc())

-        tty.warn(msg, url, "HTMLParseError: " + str(e))
+        finally:
+            tty.debug("SPIDER: [url={0}]".format(url))

-    except Exception as e:
-        # Other types of errors are completely ignored, except in debug mode.
-        tty.debug("Error in _spider: %s:%s" % (type(e), e),
-                  traceback.format_exc())
+        return pages, links, subcalls

+    # TODO: Needed until we drop support for Python 2.X
+    def star(func):
+        def _wrapper(args):
+            return func(*args)
+        return _wrapper

+    if isinstance(root_urls, six.string_types):
+        root_urls = [root_urls]

+    # Clear the local cache of visited pages before starting the search
+    _visited.clear()

+    current_depth = 0
+    pages, links, spider_args = {}, set(), []

+    collect = current_depth < depth
+    for root in root_urls:
+        root = url_util.parse(root)
+        spider_args.append((root, collect))

+    tp = multiprocessing.pool.ThreadPool(processes=concurrency)
+    try:
+        while current_depth <= depth:
+            tty.debug("SPIDER: [depth={0}, max_depth={1}, urls={2}]".format(
+                current_depth, depth, len(spider_args))
+            )
+            results = tp.map(star(_spider), spider_args)
+            spider_args = []
+            collect = current_depth < depth
+            for sub_pages, sub_links, sub_spider_args in results:
+                sub_spider_args = [x + (collect,) for x in sub_spider_args]
+                pages.update(sub_pages)
+                links.update(sub_links)
+                spider_args.extend(sub_spider_args)
+
+            current_depth += 1
+    finally:
+        tp.terminate()
+        tp.join()
+
+    return pages, links


-def _spider_wrapper(args):
-    """Wrapper for using spider with multiprocessing."""
-    return _spider(*args)
-
-
 def _urlopen(req, *args, **kwargs):
     """Wrapper for compatibility with old versions of Python."""
     url = req
@@ -460,37 +476,22 @@ def _urlopen(req, *args, **kwargs):
     return opener(req, *args, **kwargs)


-def spider(root, depth=0):
-    """Gets web pages from a root URL.
-
-    If depth is specified (e.g., depth=2), then this will also follow
-    up to <depth> levels of links from the root.
-
-    This will spawn processes to fetch the children, for much improved
-    performance over a sequential fetch.
-
-    """
-    root = url_util.parse(root)
-    pages, links = _spider(root, set(), root, 0, depth, False)
-    return pages, links
-
-
-def find_versions_of_archive(archive_urls, list_url=None, list_depth=0):
+def find_versions_of_archive(
+        archive_urls, list_url=None, list_depth=0, concurrency=32
+):
     """Scrape web pages for new versions of a tarball.

-    Arguments:
+    Args:
         archive_urls (str or list or tuple): URL or sequence of URLs for
             different versions of a package. Typically these are just the
             tarballs from the package file itself. By default, this searches
             the parent directories of archives.

-    Keyword Arguments:
         list_url (str or None): URL for a listing of archives.
             Spack will scrape these pages for download links that look
             like the archive URL.

-        list_depth (int): Max depth to follow links on list_url pages.
+        list_depth (int): max depth to follow links on list_url pages.
             Defaults to 0.
+        concurrency (int): maximum number of concurrent requests
     """
     if not isinstance(archive_urls, (list, tuple)):
         archive_urls = [archive_urls]
@@ -511,12 +512,7 @@ def find_versions_of_archive(archive_urls, list_url=None, list_depth=0):
     list_urls |= additional_list_urls

     # Grab some web pages to scrape.
-    pages = {}
-    links = set()
-    for lurl in list_urls:
-        pg, lnk = spider(lurl, depth=list_depth)
-        pages.update(pg)
-        links.update(lnk)
+    pages, links = spider(list_urls, depth=list_depth, concurrency=concurrency)

     # Scrape them for archive URLs
     regexes = []
@@ -1510,7 +1510,7 @@ _spack_verify() {
 _spack_versions() {
     if $list_options
     then
-        SPACK_COMPREPLY="-h --help -s --safe-only"
+        SPACK_COMPREPLY="-h --help -s --safe-only -c --concurrency"
     else
         _all_packages
     fi
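The new concurrency limit is also reachable directly through the `spack.util.web` API changed above. A brief, illustrative usage sketch (the `file://` root below is a hypothetical placeholder; run it from an environment where Spack's modules are importable, e.g. via `spack python`):

```python
# Illustrative only: exercising the new spider() signature introduced above.
import spack.util.web

pages, links = spack.util.web.spider(
    'file:///tmp/mirror/index.html',  # hypothetical root URL
    depth=1,        # follow one level of links from the root
    concurrency=8,  # at most 8 requests in flight at a time
)
print('%d pages fetched, %d links seen' % (len(pages), len(links)))
```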