Merge pull request #276 from LLNL/bugfix/235-smarter-spider
Fix #235: Smarter web spidering -- use parsed links instead of recons…
commit 28d61f0d7f
@@ -34,8 +34,8 @@
 import spack
 import spack.cmd
 import spack.cmd.checksum
-import spack.package
 import spack.url
+import spack.util.web
 from spack.util.naming import *
 import spack.util.crypto as crypto
@@ -166,7 +166,7 @@ def create(parser, args):
     tty.msg("This looks like a URL for %s version %s." % (name, version))
     tty.msg("Creating template for package %s" % name)

-    versions = spack.package.find_versions_of_archive(url)
+    versions = spack.util.web.find_versions_of_archive(url)
     rkeys = sorted(versions.keys(), reverse=True)
     versions = OrderedDict(zip(rkeys, (versions[v] for v in rkeys)))
lib/spack/spack/cmd/url-parse.py (new file, 75 lines)
@@ -0,0 +1,75 @@
+##############################################################################
+# Copyright (c) 2013, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Written by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the LICENSE file for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License (as published by
+# the Free Software Foundation) version 2.1 dated February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+import sys
+
+import llnl.util.tty as tty
+
+import spack
+import spack.url
+from spack.util.web import find_versions_of_archive
+
+description = "Show parsing of a URL, optionally spider web for other versions."
+
+def setup_parser(subparser):
+    subparser.add_argument('url', help="url of a package archive")
+    subparser.add_argument(
+        '-s', '--spider', action='store_true', help="Spider the source page for versions.")
+
+
+def print_name_and_version(url):
+    name, ns, nl, ntup, ver, vs, vl, vtup = spack.url.substitution_offsets(url)
+    underlines = [" "] * max(ns+nl, vs+vl)
+    for i in range(ns, ns+nl):
+        underlines[i] = '-'
+    for i in range(vs, vs+vl):
+        underlines[i] = '~'
+
+    print "    %s" % url
+    print "    %s" % ''.join(underlines)
+
+
+def url_parse(parser, args):
+    url = args.url
+
+    ver, vs, vl = spack.url.parse_version_offset(url)
+    name, ns, nl = spack.url.parse_name_offset(url, ver)
+
+    tty.msg("Parsing URL:")
+    try:
+        print_name_and_version(url)
+    except spack.url.UrlParseError as e:
+        tty.error(str(e))
+
+    print
+    tty.msg("Substituting version 9.9.9b:")
+    newurl = spack.url.substitute_version(url, '9.9.9b')
+    print_name_and_version(newurl)
+
+    if args.spider:
+        print
+        tty.msg("Spidering for versions:")
+        versions = find_versions_of_archive(url)
+        for v in sorted(versions):
+            print "%-20s%s" % (v, versions[v])
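For reference, the underlining trick in print_name_and_version above can be reproduced standalone. A minimal sketch, with made-up offsets in place of what spack.url.substitution_offsets() would return (Python 2 style, matching the file):

    # Sketch only: the URL and offsets are hypothetical, not computed by spack.url.
    url = "http://example.com/foo-1.2.3.tar.gz"
    ns, nl = 19, 3    # assumed start/length of the package name "foo"
    vs, vl = 23, 5    # assumed start/length of the version "1.2.3"

    underlines = [" "] * max(ns + nl, vs + vl)
    for i in range(ns, ns + nl):
        underlines[i] = '-'          # dashes under the name
    for i in range(vs, vs + vl):
        underlines[i] = '~'          # tildes under the version

    print "    %s" % url
    print "    %s" % ''.join(underlines)

Running it prints the URL with dashes under the detected name and tildes under the detected version.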
@@ -1164,7 +1164,7 @@ def fetch_remote_versions(self):
             raise VersionFetchError(self.__class__)

         try:
-            return find_versions_of_archive(
+            return spack.util.web.find_versions_of_archive(
                 *self.all_urls, list_url=self.list_url, list_depth=self.list_depth)
         except spack.error.NoNetworkConnectionError, e:
             tty.die("Package.fetch_versions couldn't connect to:",
@@ -1188,50 +1188,6 @@ def rpath_args(self):
         return " ".join("-Wl,-rpath=%s" % p for p in self.rpath)


-def find_versions_of_archive(*archive_urls, **kwargs):
-    list_url = kwargs.get('list_url', None)
-    list_depth = kwargs.get('list_depth', 1)
-
-    # Generate a list of list_urls based on archive urls and any
-    # explicitly listed list_url in the package
-    list_urls = set()
-    if list_url:
-        list_urls.add(list_url)
-    for aurl in archive_urls:
-        list_urls.add(spack.url.find_list_url(aurl))
-
-    # Grab some web pages to scrape.
-    page_map = {}
-    for lurl in list_urls:
-        pages = spack.util.web.get_pages(lurl, depth=list_depth)
-        page_map.update(pages)
-
-    # Scrape them for archive URLs
-    regexes = []
-    for aurl in archive_urls:
-        # This creates a regex from the URL with a capture group for
-        # the version part of the URL.  The capture group is converted
-        # to a generic wildcard, so we can use this to extract things
-        # on a page that look like archive URLs.
-        url_regex = spack.url.wildcard_version(aurl)
-
-        # We'll be a bit more liberal and just look for the archive
-        # part, not the full path.
-        regexes.append(os.path.basename(url_regex))
-
-    # Build a version list from all the matches we find
-    versions = {}
-    for page_url, content in page_map.iteritems():
-        # extract versions from matches.
-        for regex in regexes:
-            for m in re.finditer(regex, content):
-                url = urljoin(page_url, m.group(0))
-                ver = spack.url.parse_version(url)
-                versions[ver] = url
-
-    return versions
-
-
 def validate_package_url(url_string):
     """Determine whether spack can handle a particular URL or not."""
     url = urlparse(url_string)
@@ -23,6 +23,7 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import os
 import sys
 import subprocess
 import urllib2, cookielib
@@ -70,7 +71,9 @@ def _spider(args):
     """
     url, visited, root, opener, depth, max_depth, raise_on_error = args

-    pages = {}
+    pages = {}      # dict from page URL -> text content.
+    links = set()   # set of all links seen on visited pages.

     try:
         # Make a HEAD request first to check the content type.  This lets
         # us ignore tarballs and gigantic files.
@@ -99,22 +102,21 @@ def _spider(args):
         page = response.read()
         pages[response_url] = page

-        # If we're not at max depth, parse out the links in the page
-        if depth < max_depth:
+        # Parse out the links in the page
         link_parser = LinkParser()
         subcalls = []
         link_parser.feed(page)

         while link_parser.links:
             raw_link = link_parser.links.pop()
+            abs_link = urlparse.urljoin(response_url, raw_link)
+
+            links.add(abs_link)

             # Skip stuff that looks like an archive
             if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
                 continue

-            # Evaluate the link relative to the page it came from.
-            abs_link = urlparse.urljoin(response_url, raw_link)
-
             # Skip things outside the root directory
             if not abs_link.startswith(root):
                 continue
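This hunk is the heart of the fix: every parsed link is resolved against the page it came from and recorded in links, rather than candidate URLs being reconstructed later. A rough standalone sketch of that idea (Python 2, like the surrounding code; the page, URLs, and the HrefCollector class are made-up stand-ins for spack's LinkParser):

    from HTMLParser import HTMLParser
    import urlparse

    class HrefCollector(HTMLParser):
        """Collect the href attribute of every anchor tag."""
        def __init__(self):
            HTMLParser.__init__(self)
            self.links = []
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                self.links.extend(v for k, v in attrs if k == 'href')

    page_url = "http://example.com/downloads/"             # hypothetical
    page = '<a href="foo-1.2.3.tar.gz">foo 1.2.3</a>'       # hypothetical

    parser = HrefCollector()
    parser.feed(page)

    # Resolve each raw link relative to the page it came from.
    links = set(urlparse.urljoin(page_url, raw) for raw in parser.links)
    # -> set(['http://example.com/downloads/foo-1.2.3.tar.gz'])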
@@ -123,15 +125,19 @@ def _spider(args):
             if abs_link in visited:
                 continue

-            subcalls.append((abs_link, visited, root, None, depth+1, max_depth, raise_on_error))
+            # If we're not at max depth, follow links.
+            if depth < max_depth:
+                subcalls.append((abs_link, visited, root, None,
+                                 depth+1, max_depth, raise_on_error))
                 visited.add(abs_link)

         if subcalls:
             try:
                 pool = Pool(processes=len(subcalls))
-                dicts = pool.map(_spider, subcalls)
-                for d in dicts:
-                    pages.update(d)
+                results = pool.map(_spider, subcalls)
+                for sub_pages, sub_links in results:
+                    pages.update(sub_pages)
+                    links.update(sub_links)
             finally:
                 pool.terminate()
                 pool.join()
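Each recursive _spider call now returns a (pages, links) pair, and the parent merges both before returning. A toy sketch of that fan-out-and-merge pattern, with a hypothetical fake_spider standing in for _spider (Python 2 style):

    from multiprocessing import Pool

    def fake_spider(url):
        """Stand-in for _spider: pretend each URL yields one page and one link."""
        return ({url: "<html>...</html>"}, set([url + "/child"]))

    if __name__ == '__main__':
        subcalls = ["http://example.com/a", "http://example.com/b"]   # hypothetical
        pages, links = {}, set()

        pool = Pool(processes=len(subcalls))
        try:
            # Each worker returns a (pages, links) pair; merge them all.
            for sub_pages, sub_links in pool.map(fake_spider, subcalls):
                pages.update(sub_pages)
                links.update(sub_links)
        finally:
            pool.terminate()
            pool.join()

        print sorted(pages.keys())
        print sorted(links)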
@@ -155,10 +161,10 @@ def _spider(args):
         # Other types of errors are completely ignored, except in debug mode.
         tty.debug("Error in _spider: %s" % e)

-    return pages
+    return pages, links


-def get_pages(root_url, **kwargs):
+def spider(root_url, **kwargs):
     """Gets web pages from a root URL.
        If depth is specified (e.g., depth=2), then this will also fetches pages
        linked from the root and its children up to depth.
@@ -167,5 +173,69 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages = _spider((root_url, set(), root_url, None, 1, max_depth, False))
-    return pages
+    pages, links = _spider((root_url, set(), root_url, None, 1, max_depth, False))
+    return pages, links
+
+
+def find_versions_of_archive(*archive_urls, **kwargs):
+    """Scrape web pages for new versions of a tarball.
+
+    Arguments:
+      archive_urls:
+          URLs for different versions of a package. Typically these
+          are just the tarballs from the package file itself.  By
+          default, this searches the parent directories of archives.
+
+    Keyword Arguments:
+      list_url:
+
+          URL for a listing of archives.  Spack wills scrape these
+          pages for download links that look like the archive URL.
+
+      list_depth:
+          Max depth to follow links on list_url pages.
+
+    """
+    list_url = kwargs.get('list_url', None)
+    list_depth = kwargs.get('list_depth', 1)
+
+    # Generate a list of list_urls based on archive urls and any
+    # explicitly listed list_url in the package
+    list_urls = set()
+    if list_url:
+        list_urls.add(list_url)
+    for aurl in archive_urls:
+        list_urls.add(spack.url.find_list_url(aurl))
+
+    # Grab some web pages to scrape.
+    pages = {}
+    links = set()
+    for lurl in list_urls:
+        p, l = spider(lurl, depth=list_depth)
+        pages.update(p)
+        links.update(l)
+
+    # Scrape them for archive URLs
+    regexes = []
+    for aurl in archive_urls:
+        # This creates a regex from the URL with a capture group for
+        # the version part of the URL.  The capture group is converted
+        # to a generic wildcard, so we can use this to extract things
+        # on a page that look like archive URLs.
+        url_regex = spack.url.wildcard_version(aurl)
+
+        # We'll be a bit more liberal and just look for the archive
+        # part, not the full path.
+        regexes.append(os.path.basename(url_regex))
+
+    # Build a dict version -> URL from any links that match the wildcards.
+    versions = {}
+    for url in links:
+        if any(re.search(r, url) for r in regexes):
+            try:
+                ver = spack.url.parse_version(url)
+                versions[ver] = url
+            except spack.url.UndetectableVersionError as e:
+                continue
+
+    return versions
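With the move into spack.util.web, callers ask for versions like this -- a minimal usage sketch with hypothetical URLs, assuming spack is importable on the Python path:

    import spack.util.web as web

    # Scrape the listing page (and anything it links to, two levels deep)
    # for links that look like other versions of this tarball.
    versions = web.find_versions_of_archive(
        'http://example.com/downloads/foo-1.2.3.tar.gz',    # hypothetical archive URL
        list_url='http://example.com/downloads/',            # optional listing page
        list_depth=2)

    for v in sorted(versions):
        print "%-10s %s" % (v, versions[v])                  # version -> archive URL

spider() can also be called directly: it returns a (pages, links) pair, where pages maps each fetched URL to its raw text and links is the set of absolute links collected along the way.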