Add better tests for web.py; fix some bugs found with spidering.
- _spider in web.py was actually failing to spider deeper than a certain point.
- Fixed multiprocessing pools to not use daemons and to allow recursive spawning.
- Added detailed tests for spidering and for finding archive versions.
- Left some xfail URL-finding exercises for the reader.
- Fixed noqa annotations for some @when decorators.
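The calling conventions of spack.util.web change in this commit: spider() now takes an explicit depth keyword (default 0, meaning fetch only the root page), and find_versions_of_archive() takes the archive URLs as a single argument (one URL or a list) plus list_url/list_depth keywords. A minimal usage sketch mirroring the new tests below (the file:// paths are just the test fixtures added in this commit):

    import os

    import spack
    from spack.util.web import spider, find_versions_of_archive

    # Point at the local test fixtures added under lib/spack/spack/test/data/web.
    web_data_path = os.path.join(spack.test_path, 'data', 'web')
    root = 'file://' + web_data_path + '/index.html'
    root_tarball = 'file://' + web_data_path + '/foo-0.0.0.tar.gz'

    # depth=0 fetches only the root page; depth=N follows up to N levels of links.
    pages, links = spider(root, depth=2)

    # archive_urls may be a single URL or a list of URLs; list_depth defaults to 0.
    versions = find_versions_of_archive(root_tarball, list_url=root, list_depth=2)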
lib/spack/spack/package.py
@@ -570,7 +570,7 @@ def __init__(self, spec):
            self.list_url = None

        if not hasattr(self, 'list_depth'):
-            self.list_depth = 1
+            self.list_depth = 0

        # Set default licensing information
        if not hasattr(self, 'license_required'):
@@ -966,6 +966,10 @@ def do_stage(self, mirror_only=False):
        self.stage.expand_archive()
        self.stage.chdir_to_source()

+    def patch(self):
+        """Default patch implementation is a no-op."""
+        pass
+
    def do_patch(self):
        """Calls do_stage(), then applies patches to the expanded tarball if they
        haven't been applied already."""
@@ -1686,9 +1690,7 @@ def fetch_remote_versions(self):

        try:
            return spack.util.web.find_versions_of_archive(
-                *self.all_urls,
-                list_url=self.list_url,
-                list_depth=self.list_depth)
+                self.all_urls, self.list_url, self.list_depth)
        except spack.error.NoNetworkConnectionError as e:
            tty.die("Package.fetch_versions couldn't connect to:", e.url,
                    e.message)
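For context, list_url and list_depth are the per-package attributes that fetch_remote_versions() above hands to find_versions_of_archive(); with this change list_depth defaults to 0, so a package must opt in to deeper spidering. A hypothetical package sketch (the Libfoo name, URLs, and checksum are made up; `from spack import *` is the package-author prelude of this Spack era):

    from spack import *


    class Libfoo(Package):
        """Hypothetical package, only to illustrate the listing attributes."""
        homepage = "http://example.com/libfoo"
        url      = "http://example.com/libfoo/libfoo-1.0.0.tar.gz"

        # Page that links to all release tarballs; follow one level of links
        # from it (the new default of list_depth = 0 stays on that page only).
        list_url   = "http://example.com/libfoo/downloads"
        list_depth = 1

        version('1.0.0', '0123456789abcdef0123456789abcdef')  # placeholder md5

        def install(self, spec, prefix):
            configure('--prefix=%s' % prefix)
            make()
            make('install')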
lib/spack/spack/test/data/web/1.html (new file)
@@ -0,0 +1,10 @@
<html>
<head>
This is page 1.
</head>
<body>
<a href="2.html">list_depth=2 follows this.</a>

<a href="foo-1.0.0.tar.gz">foo-1.0.0.tar.gz</a>
</body>
</html>
lib/spack/spack/test/data/web/2.html (new file)
@@ -0,0 +1,12 @@
<html>
<head>
This is page 2.
</head>
<body>
<a href="3.html">list_depth=3 follows this.</a>
<a href="4.html">list_depth=3 follows this too.</a>

<a href="foo-2.0.0.tar.gz">foo-2.0.0.tar.gz</a>
<a href="foo-2.0.0b2.tar.gz">foo-2.0.0b2.tar.gz</a>
</body>
</html>
lib/spack/spack/test/data/web/3.html (new file)
@@ -0,0 +1,11 @@
<html>
<head>
This is page 3.
</head>
<body>
<a href="index.html">This link is already visited.</a>

<a href="foo-3.0.tar.gz">foo-3.0.tar.gz</a>
<a href="foo-3.0a1.tar.gz">foo-3.0a1.tar.gz</a>
</body>
</html>
lib/spack/spack/test/data/web/4.html (new file)
@@ -0,0 +1,11 @@
<html>
<head>
This is page 4.
</head>
<body>
This page is terminal and has no links to other pages.

<a href="foo-4.5.tar.gz">foo-4.5.tar.gz.</a>
<a href="foo-4.5-rc5.tar.gz">foo-4.5-rc5.tar.gz.</a>
</body>
</html>
lib/spack/spack/test/data/web/index.html (new file)
@@ -0,0 +1,10 @@
<html>
<head>
This is the root page.
</head>
<body>
<a href="1.html">list_depth=1 follows this.</a>

<a href="foo-0.0.0.tar.gz">foo-0.0.0.tar.gz</a>
</body>
</html>
lib/spack/spack/test/web.py (new file)
@@ -0,0 +1,165 @@
##############################################################################
# Copyright (c) 2013-2016, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory.
#
# This file is part of Spack.
# Created by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
# LLNL-CODE-647188
#
# For details, see https://github.com/llnl/spack
# Please also see the LICENSE file for our notice and the LGPL.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License (as
# published by the Free Software Foundation) version 2.1, February 1999.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
# conditions of the GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##############################################################################
"""Tests for web.py."""
import pytest
import os

import spack
from spack.util.web import spider, find_versions_of_archive
from spack.version import *


web_data_path = os.path.join(spack.test_path, 'data', 'web')

root = 'file://' + web_data_path + '/index.html'
root_tarball = 'file://' + web_data_path + '/foo-0.0.0.tar.gz'

page_1 = 'file://' + os.path.join(web_data_path, '1.html')
page_2 = 'file://' + os.path.join(web_data_path, '2.html')
page_3 = 'file://' + os.path.join(web_data_path, '3.html')
page_4 = 'file://' + os.path.join(web_data_path, '4.html')


def test_spider_0():
    pages, links = spider(root, depth=0)

    assert root in pages
    assert page_1 not in pages
    assert page_2 not in pages
    assert page_3 not in pages
    assert page_4 not in pages

    assert "This is the root page." in pages[root]

    assert root not in links
    assert page_1 in links
    assert page_2 not in links
    assert page_3 not in links
    assert page_4 not in links


def test_spider_1():
    pages, links = spider(root, depth=1)

    assert root in pages
    assert page_1 in pages
    assert page_2 not in pages
    assert page_3 not in pages
    assert page_4 not in pages

    assert "This is the root page." in pages[root]
    assert "This is page 1." in pages[page_1]

    assert root not in links
    assert page_1 in links
    assert page_2 in links
    assert page_3 not in links
    assert page_4 not in links


def test_spider_2():
    pages, links = spider(root, depth=2)

    assert root in pages
    assert page_1 in pages
    assert page_2 in pages
    assert page_3 not in pages
    assert page_4 not in pages

    assert "This is the root page." in pages[root]
    assert "This is page 1." in pages[page_1]
    assert "This is page 2." in pages[page_2]

    assert root not in links
    assert page_1 in links
    assert page_1 in links
    assert page_2 in links
    assert page_3 in links
    assert page_4 in links


def test_spider_3():
    pages, links = spider(root, depth=3)

    assert root in pages
    assert page_1 in pages
    assert page_2 in pages
    assert page_3 in pages
    assert page_4 in pages

    assert "This is the root page." in pages[root]
    assert "This is page 1." in pages[page_1]
    assert "This is page 2." in pages[page_2]
    assert "This is page 3." in pages[page_3]
    assert "This is page 4." in pages[page_4]

    assert root in links  # circular link on page 3
    assert page_1 in links
    assert page_1 in links
    assert page_2 in links
    assert page_3 in links
    assert page_4 in links


def test_find_versions_of_archive_0():
    versions = find_versions_of_archive(root_tarball, root, list_depth=0)
    assert ver('0.0.0') in versions


def test_find_versions_of_archive_1():
    versions = find_versions_of_archive(root_tarball, root, list_depth=1)
    assert ver('0.0.0') in versions
    assert ver('1.0.0') in versions


def test_find_versions_of_archive_2():
    versions = find_versions_of_archive(root_tarball, root, list_depth=2)
    assert ver('0.0.0') in versions
    assert ver('1.0.0') in versions
    assert ver('2.0.0') in versions


@pytest.mark.xfail
def test_find_exotic_versions_of_archive_2():
    versions = find_versions_of_archive(root_tarball, root, list_depth=2)
    # up for grabs to make this better.
    assert ver('2.0.0b2') in versions


def test_find_versions_of_archive_3():
    versions = find_versions_of_archive(root_tarball, root, list_depth=3)
    assert ver('0.0.0') in versions
    assert ver('1.0.0') in versions
    assert ver('2.0.0') in versions
    assert ver('3.0') in versions
    assert ver('4.5') in versions


@pytest.mark.xfail
def test_find_exotic_versions_of_archive_3():
    versions = find_versions_of_archive(root_tarball, root, list_depth=3)
    assert ver('2.0.0b2') in versions
    assert ver('3.0a1') in versions
    assert ver('4.5-rc5') in versions
lib/spack/spack/util/web.py
@@ -25,11 +25,12 @@
import re
import os
import sys
+import traceback

from six.moves.urllib.request import urlopen, Request
from six.moves.urllib.error import URLError
from six.moves.urllib.parse import urljoin
-from multiprocessing import Pool
+import multiprocessing.pool

try:
    # Python 2 had these in the HTMLParser package.
@@ -67,25 +68,42 @@ def handle_starttag(self, tag, attrs):
                    self.links.append(val)


-def _spider(args):
-    """_spider(url, depth, max_depth)
+class NonDaemonProcess(multiprocessing.Process):
+    """Process that allows sub-processes, so pools can have sub-pools."""
+    def _get_daemon(self):
+        return False

-    Fetches URL and any pages it links to up to max_depth.  depth should
-    initially be 1, and max_depth includes the root.  This function will
-    print out a warning only if the root can't be fetched; it ignores
+    def _set_daemon(self, value):
+        pass
+
+    daemon = property(_get_daemon, _set_daemon)
+
+
+class NonDaemonPool(multiprocessing.pool.Pool):
+    """Pool that uses non-daemon processes"""
+    Process = NonDaemonProcess
+
+
+def _spider(url, visited, root, depth, max_depth, raise_on_error):
+    """Fetches URL and any pages it links to up to max_depth.
+
+    depth should initially be zero, and max_depth is the max depth of
+    links to follow from the root.
+
+    Prints out a warning only if the root can't be fetched; it ignores
    errors with pages that the root links to.

-    This will return a list of the pages fetched, in no particular order.
-
-    Takes args as a tuple b/c it's intended to be used by a multiprocessing
-    pool.  Firing off all the child links at once makes the fetch MUCH
-    faster for pages with lots of children.
+    Returns a tuple of:
+      - pages: dict of pages visited (URL) mapped to their full text.
+      - links: set of links encountered while visiting the pages.
    """
-    url, visited, root, opener, depth, max_depth, raise_on_error = args
-
    pages = {}     # dict from page URL -> text content.
    links = set()  # set of all links seen on visited pages.

    # root may end with index.html -- chop that off.
    if root.endswith('/index.html'):
        root = re.sub('/index.html$', '', root)

    try:
        # Make a HEAD request first to check the content type.  This lets
        # us ignore tarballs and gigantic files.
@@ -139,17 +157,19 @@ def _spider(args):
            # If we're not at max depth, follow links.
            if depth < max_depth:
-                subcalls.append((abs_link, visited, root, None,
+                subcalls.append((abs_link, visited, root,
                                 depth + 1, max_depth, raise_on_error))
                visited.add(abs_link)

        if subcalls:
+            pool = NonDaemonPool(processes=len(subcalls))
            try:
-                pool = Pool(processes=len(subcalls))
-                results = pool.map(_spider, subcalls)
+                results = pool.map(_spider_wrapper, subcalls)

                for sub_pages, sub_links in results:
                    pages.update(sub_pages)
                    links.update(sub_links)

            finally:
                pool.terminate()
                pool.join()
@@ -171,46 +191,53 @@ def _spider(args):

    except Exception as e:
        # Other types of errors are completely ignored, except in debug mode.
-        tty.debug("Error in _spider: %s" % e)
+        tty.debug("Error in _spider: %s:%s" % (type(e), e),
+                  traceback.format_exc())

    return pages, links


-def spider(root_url, **kwargs):
+def _spider_wrapper(args):
+    """Wrapper for using spider with multiprocessing."""
+    return _spider(*args)
+
+
+def spider(root_url, depth=0):
    """Gets web pages from a root URL.
-    If depth is specified (e.g., depth=2), then this will also fetches pages
-    linked from the root and its children up to depth.
+
+    If depth is specified (e.g., depth=2), then this will also follow
+    up to <depth> levels of links from the root.

    This will spawn processes to fetch the children, for much improved
    performance over a sequential fetch.

    """
-    max_depth = kwargs.setdefault('depth', 1)
-    pages, links = _spider((root_url, set(), root_url, None,
-                            1, max_depth, False))
+    pages, links = _spider(root_url, set(), root_url, 0, depth, False)
    return pages, links


-def find_versions_of_archive(*archive_urls, **kwargs):
+def find_versions_of_archive(archive_urls, list_url=None, list_depth=0):
    """Scrape web pages for new versions of a tarball.

    Arguments:
      archive_urls:
-          URLs for different versions of a package. Typically these
-          are just the tarballs from the package file itself. By
-          default, this searches the parent directories of archives.
+          URL or sequence of URLs for different versions of a
+          package. Typically these are just the tarballs from the package
+          file itself. By default, this searches the parent directories
+          of archives.

    Keyword Arguments:
      list_url:
          URL for a listing of archives. Spack will scrape these
          pages for download links that look like the archive URL.

      list_depth:
-          Max depth to follow links on list_url pages.
+          Max depth to follow links on list_url pages. Default 0.

    """
-    list_url = kwargs.get('list_url', None)
-    list_depth = kwargs.get('list_depth', 1)
+    if not isinstance(archive_urls, (list, tuple)):
+        archive_urls = [archive_urls]

    # Generate a list of list_urls based on archive urls and any
    # explicitly listed list_url in the package