Added web spider capability for listing versions.

@@ -30,6 +30,8 @@ parser.add_argument('-v', '--verbose', action='store_true', dest='verbose',
                    help="print additional output during builds")
parser.add_argument('-d', '--debug', action='store_true', dest='debug',
                    help="write out debug logs during compile")
parser.add_argument('-m', '--mock', action='store_true', dest='mock',
                    help="Use mock packages instead of real ones.")

# each command module implements a parser() function, to which we pass its
# subparser for setup.
@@ -46,6 +48,10 @@ args = parser.parse_args()
# Set up environment based on args.
spack.verbose = args.verbose
spack.debug = args.debug
if args.mock:
    from spack.util.filesystem import new_path
    mock_path = new_path(spack.module_path, 'test', 'mock_packages')
    spack.packages_path = mock_path

# Try to load the particular command asked for and run it
command = spack.cmd.get_command(args.command)

lib/spack/spack/cmd/checksum.py (new file, 63 lines)
@@ -0,0 +1,63 @@
import os
import re
import argparse
from pprint import pprint
from subprocess import CalledProcessError

import spack.tty as tty
import spack.packages as packages
from spack.stage import Stage
from spack.colify import colify
from spack.util.crypto import md5
from spack.version import *

group='foo'
description = "Checksum available versions of a package, print out checksums for addition to a package file."


def setup_parser(subparser):
    subparser.add_argument('package', metavar='PACKAGE', help='Package to list versions for')
    subparser.add_argument('versions', nargs=argparse.REMAINDER, help='Versions to generate checksums for')
    subparser.add_argument('-n', '--number', dest='number', type=int,
                           default=10, help='Number of versions to list')


def checksum(parser, args):
    # get the package we're going to generate checksums for
    pkg = packages.get(args.package)

    # If the user asked for specific versions, use those.
    # Otherwise get the latest n, where n is from the -n/--number param
    versions = [ver(v) for v in args.versions]

    if not all(type(v) == Version for v in versions):
        tty.die("Cannot generate checksums for version lists or " +
                "version ranges.  Use unambiguous versions.")

    if not versions:
        versions = pkg.fetch_available_versions()[:args.number]
        if not versions:
            tty.die("Could not fetch any available versions for %s."
                    % pkg.name)

    versions.sort()
    versions.reverse()
    urls = [pkg.url_for_version(v) for v in versions]

    tty.msg("Found %s versions to checksum." % len(urls))
    tty.msg("Downloading...")

    hashes = []
    for url, version in zip(urls, versions):
        stage = Stage("checksum-%s-%s" % (pkg.name, version), url)
        try:
            stage.fetch()
            hashes.append(md5(stage.archive_file))
        finally:
            stage.destroy()

    dict_string = ["{"]
    for i, (v, h) in enumerate(zip(versions, hashes)):
        comma = "" if i == len(hashes) - 1 else ","
        dict_string.append("    '%s' : '%s'%s" % (str(v), str(h), comma))
    dict_string.append("}")
    tty.msg("Checksummed new versions of %s:" % pkg.name, *dict_string)
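
The dict_string output above is formatted so it can be pasted straight into a package file as its versions attribute. A hypothetical run against dyninst would print something like the following, using the checksums that appear later in this commit:

versions = {
    '8.1.2' : 'bf03b33375afa66fe0efa46ce3f4b17a',
    '8.1.1' : '1f8743e3a5662b25ce64a7edf647e77d'
}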

@@ -2,8 +2,10 @@
import spack.cmd

import spack.tty as tty
import spack.url as url
import spack


description = "parse specs and print them out to the command line."

def setup_parser(subparser):
@@ -13,7 +15,11 @@ def spec(parser, args):
    specs = spack.cmd.parse_specs(args.specs)
    for spec in specs:
        spec.normalize()
        print spec.tree()
        print spec.tree(color=True)

        spec.concretize()
        print spec.tree()
        print spec.tree(color=True)

        pkg = spec.package
        wc = url.wildcard_version(pkg.url)
        print wc

@@ -2,12 +2,8 @@
import re
from subprocess import CalledProcessError

import spack
import spack.packages as packages
import spack.url as url
import spack.tty as tty
from spack.colify import colify
from spack.version import ver

description = "List available versions of a package"

@@ -17,4 +13,4 @@ def setup_parser(subparser):

def versions(parser, args):
    pkg = packages.get(args.package)
    colify(reversed(pkg.available_versions))
    colify(reversed(pkg.fetch_available_versions()))

@@ -29,6 +29,8 @@
from multi_function import platform
from stage import Stage
from spack.util.lang import memoized, list_modules
from spack.util.crypto import md5
from spack.util.web import get_pages


class Package(object):
@@ -251,6 +253,9 @@ class SomePackage(Package):
    """By default a package has no dependencies."""
    dependencies = {}

    """List of specs of virtual packages provided by this package."""
    provided_virtual_packages = {}

    #
    # These are default values for instance variables.
    #
@@ -310,6 +315,9 @@ def __init__(self, spec):
        if not hasattr(self, 'list_url'):
            self.list_url = os.path.dirname(self.url)

        if not hasattr(self, 'list_depth'):
            self.list_depth = 1


    def add_commands_to_module(self):
        """Populate the module scope of install() with some useful functions.
@@ -464,6 +472,11 @@ def url_version(self, version):
        return str(version)


    def url_for_version(self, version):
        """Gives a URL that you can download a new version of this package from."""
        return url.substitute_version(self.url, self.url_version(version))


    def remove_prefix(self):
        """Removes the prefix for a package along with any empty parent directories."""
        if self.dirty:
@@ -640,37 +653,42 @@ def do_clean_dist(self):
        tty.msg("Successfully cleaned %s" % self.name)


    def fetch_available_versions(self):
        # If not, then try to fetch using list_url
        if not self._available_versions:
            self._available_versions = VersionList()
            url_regex = os.path.basename(url.wildcard_version(self.url))
            wildcard = self.version.wildcard()

            page_map = get_pages(self.list_url, depth=self.list_depth)
            for site, page in page_map.iteritems():
                strings = re.findall(url_regex, page)

                for s in strings:
                    match = re.search(wildcard, s)
                    if match:
                        v = match.group(0)
                        self._available_versions.add(Version(v))

            if not self._available_versions:
                tty.warn("Found no versions for %s" % self.name,
                         "Check the list_url and list_depth attribute on the "
                         + self.name + " package.",
                         "Use them to tell Spack where to look for versions.")

        return self._available_versions


    @property
    def available_versions(self):
        # If the package overrode available_versions, then use that.
        if self.versions is not None:
            return self.versions

        # If not, then try to fetch using list_url
        if not self._available_versions:
            self._available_versions = ver([self.version])
            try:
                # Run curl but grab the mime type from the http headers
                listing = spack.curl('-s', '-L', self.list_url, return_output=True)
                url_regex = os.path.basename(url.wildcard_version(self.url))
                strings = re.findall(url_regex, listing)
                wildcard = self.version.wildcard()
                for s in strings:
                    match = re.search(wildcard, s)
                    if match:
                        self._available_versions.add(Version(match.group(0)))

                if not self._available_versions:
                    tty.warn("Found no versions for %s" % self.name,
                             "Package.available_versions may require adding the list_url attribute",
                             "to the package to tell Spack where to look for versions.")

            except subprocess.CalledProcessError:
                tty.warn("Could not connect to %s" % self.list_url,
                         "Package.available_versions requires an internet connection.",
                         "Version list may be incomplete.")

        return self._available_versions
        else:
            vlist = self.fetch_available_versions()
            if not vlist:
                vlist = ver([self.version])
            return vlist


class MakeExecutable(Executable):

@@ -19,6 +19,7 @@
invalid_package_re = r'[_-][_-]+'

instances = {}
providers = {}


def get(pkg_name):
@@ -29,6 +30,24 @@ def get(pkg_name):
    return instances[pkg_name]


def get_providers(vpkg_name):
    if not providers:
        compute_providers()

    if vpkg_name not in providers:
        raise UnknownPackageError("No such virtual package: %s" % vpkg_name)

    return providers[vpkg_name]


def compute_providers():
    for pkg in all_packages():
        for vpkg in pkg.provided_virtual_packages:
            if vpkg not in providers:
                providers[vpkg] = []
            providers[vpkg].append(pkg)


def valid_package_name(pkg_name):
    return (re.match(valid_package_re, pkg_name) and
            not re.search(invalid_package_re, pkg_name))
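
A sketch of how the provider map is consumed, assuming some package declares provided_virtual_packages with an 'mpi' key (the virtual name here is illustrative):

import spack.packages as packages

# compute_providers() runs lazily on the first lookup; after that,
# providers maps each virtual package name to the packages providing it.
for pkg in packages.get_providers('mpi'):
    print pkg.name        # e.g. mpich, if mpich provides 'mpi'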
@@ -75,6 +94,11 @@ def class_name_for_package_name(pkg_name):
    return class_name


def exists(pkg_name):
    """Whether a package file exists for this name."""
    return os.path.exists(filename_for_package_name(pkg_name))


def get_class_for_package_name(pkg_name):
    file_name = filename_for_package_name(pkg_name)

@@ -149,7 +173,6 @@ def quote(string):
    out.write('}\n')



class InvalidPackageNameError(spack.error.SpackError):
    """Raised when we encounter a bad package name."""
    def __init__(self, name):

@@ -4,6 +4,7 @@ class Dyninst(Package):
    homepage = "https://paradyn.org"
    url      = "http://www.dyninst.org/sites/default/files/downloads/dyninst/8.1.2/DyninstAPI-8.1.2.tgz"
    md5      = "bf03b33375afa66fe0efa46ce3f4b17a"
    list_url = "http://www.dyninst.org/downloads/dyninst-8.x"

    depends_on("libelf")
    depends_on("libdwarf")

@@ -45,16 +45,28 @@ class Mpileaks(Package):
        spack install mpileaks ^mpich
"""
import sys
import inspect
import spack.spec


def _caller_locals():
    """This will return the locals of the *parent* of the caller.
       This allows a function to insert variables into its caller's
       scope.
    """
    stack = inspect.stack()
    try:
        return stack[2][0].f_locals
    finally:
        del stack


def depends_on(*specs):
    """Adds a dependencies local variable in the locals of
       the calling class, based on args.
    """
    # Get the enclosing package's scope and add deps to it.
    locals = sys._getframe(1).f_locals
    dependencies = locals.setdefault("dependencies", {})
    dependencies = _caller_locals().setdefault("dependencies", {})
    for string in specs:
        for spec in spack.spec.parse(string):
            dependencies[spec.name] = spec
@@ -66,7 +78,6 @@ def provides(*args):
       can use the providing package to satisfy the dependency.
    """
    # Get the enclosing package's scope and add deps to it.
    locals = sys._getframe(1).f_locals
    provides = locals.setdefault("provides", [])
    provides = _caller_locals().setdefault("provides", [])
    for name in args:
        provides.append(name)
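
A minimal standalone sketch of the _caller_locals() trick: when depends_on is called from inside a class body, frame 2 on the stack is the class body itself, so names inserted there become class attributes. Example and its dependency name are hypothetical:

import inspect

def _caller_locals():
    stack = inspect.stack()
    try:
        # stack[0] is this function, stack[1] is depends_on,
        # stack[2] is the class body that called depends_on.
        return stack[2][0].f_locals
    finally:
        del stack          # avoid a reference cycle with the frame

def depends_on(*names):
    deps = _caller_locals().setdefault('dependencies', {})
    for name in names:
        deps[name] = name

class Example(object):
    depends_on('libelf')   # injects 'dependencies' into the class body

print Example.dependencies   # {'libelf': 'libelf'}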
@@ -321,9 +321,15 @@ def package(self):
        return packages.get(self.name)


    @property
    def virtual(self):
        return packages.exists(self.name)


    @property
    def concrete(self):
        return bool(self.versions.concrete
        return bool(not self.virtual
                    and self.versions.concrete
                    # TODO: support variants
                    and self.architecture
                    and self.compiler and self.compiler.concrete

@@ -5,7 +5,9 @@ class Callpath(Package):
    url      = "http://github.com/tgamblin/callpath-0.2.tar.gz"
    md5      = "foobarbaz"

    versions = [0.8, 0.9, 1.0]
    versions = { 0.8 : 'bf03b33375afa66fe0efa46ce3f4b17a',
                 0.9 : 'bf03b33375afa66fe0efa46ce3f4b17a',
                 1.0 : 'bf03b33375afa66fe0efa46ce3f4b17a' }

    depends_on("dyninst")
    depends_on("mpich")

@@ -5,7 +5,11 @@ class Dyninst(Package):
    url      = "http://www.dyninst.org/sites/default/files/downloads/dyninst/8.1.2/DyninstAPI-8.1.2.tgz"
    md5      = "bf03b33375afa66fe0efa46ce3f4b17a"

    versions = '7.0, 7.0.1, 8.0, 8.1.1, 8.1.2'
    list_url = "http://www.dyninst.org/downloads/dyninst-8.x"

    versions = {
        '8.1.2' : 'bf03b33375afa66fe0efa46ce3f4b17a',
        '8.1.1' : '1f8743e3a5662b25ce64a7edf647e77d' }

    depends_on("libelf")
    depends_on("libdwarf")

@@ -11,6 +11,8 @@ class Libdwarf(Package):

    md5      = "64b42692e947d5180e162e46c689dfbf"

    versions = [20070703, 20111030, 20130207]

    depends_on("libelf")

@@ -5,7 +5,10 @@ class Libelf(Package):
    url      = "http://www.mr511.de/software/libelf-0.8.13.tar.gz"
    md5      = "4136d7b4c04df68b686570afa26988ac"

    versions = '0.8.10, 0.8.12, 0.8.13'
    versions = {
        '0.8.13' : '4136d7b4c04df68b686570afa26988ac',
        '0.8.12' : 'e21f8273d9f5f6d43a59878dc274fec7',
        '0.8.10' : '9db4d36c283d9790d8fa7df1f4d7b4d9' }

    def install(self, prefix):
        configure("--prefix=%s" % prefix,

@@ -3,6 +3,9 @@
class Mpich(Package):
    homepage = "http://www.mpich.org"
    url      = "http://www.mpich.org/static/downloads/3.0.4/mpich-3.0.4.tar.gz"

    list_url   = "http://www.mpich.org/static/downloads/"
    list_depth = 2
    md5      = "9c5d5d4fe1e17dd12153f40bc5b6dbc0"

    versions = '1.0.3, 1.3.2p1, 1.4.1p1, 3.0.4, 3.1b1'
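
The list_depth of 2 matters here because mpich tarballs live in per-version subdirectories one level below list_url, so the spider must follow the index page's links. A hypothetical package with the same layout would set the attributes like this:

class Example(Package):   # hypothetical package file fragment
    url        = "http://example.com/downloads/1.0/example-1.0.tar.gz"
    # Spider the download index plus the version directories it links to.
    list_url   = "http://example.com/downloads/"
    list_depth = 2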
@@ -5,7 +5,10 @@ class Mpileaks(Package):
    url      = "http://www.llnl.gov/mpileaks-1.0.tar.gz"
    md5      = "foobarbaz"

    versions = [1.0, 2.1, 2.2, 2.3]
    versions = { 1.0 : None,
                 2.1 : None,
                 2.2 : None,
                 2.3 : None }

    depends_on("mpich")
    depends_on("callpath")

@@ -176,6 +176,8 @@ def wildcard_version(path):
       that will match this path with any version in its place.
    """
    ver, start, end = parse_version_string_with_indices(path)
    v = Version(ver)

    return re.escape(path[:start]) + v.wildcard() + re.escape(path[end:])
    v = Version(ver)
    parts = list(re.escape(p) for p in path.split(str(v)))

    return v.wildcard().join(parts)
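
The rewrite changes how repeated occurrences of the version are handled: splitting the path on the version string generalizes every occurrence, while the old slice-based form only generalized the single span found by parse_version_string_with_indices. A rough illustration, with a stand-in pattern since the real regex comes from Version.wildcard():

import re

path = "dyninst/8.1.2/DyninstAPI-8.1.2.tgz"   # version appears twice
wildcard = r'\d+(\.\d+)*'                     # stand-in for v.wildcard()
pattern = wildcard.join(re.escape(p) for p in path.split("8.1.2"))
# Both version spans now match any version, not just the first one found.
print re.match(pattern, "dyninst/8.1.3/DyninstAPI-8.1.3.tgz") is not None  # True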

lib/spack/spack/util/crypto.py (new file, 13 lines)
@@ -0,0 +1,13 @@
import hashlib
from contextlib import closing

def md5(filename, block_size=2**20):
    """Computes the md5 hash of a file."""
    md5 = hashlib.md5()
    with closing(open(filename)) as file:
        while True:
            data = file.read(block_size)
            if not data:
                break
            md5.update(data)
        return md5.hexdigest()
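
A quick usage sketch (the path is hypothetical):

from spack.util.crypto import md5
print md5('/tmp/libelf-0.8.13.tar.gz')   # 32-character hex digest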

@@ -56,16 +56,3 @@ def stem(path):
        if re.search(suffix, path):
            return re.sub(suffix, "", path)
    return path


def md5(filename, block_size=2**20):
    """Computes the md5 hash of a file."""
    import hashlib
    md5 = hashlib.md5()
    with closing(open(filename)) as file:
        while True:
            data = file.read(block_size)
            if not data:
                break
            md5.update(data)
        return md5.hexdigest()

lib/spack/spack/util/web.py (new file, 113 lines)
@@ -0,0 +1,113 @@
import re
import subprocess
import urllib2
import urlparse
from multiprocessing import Pool
from HTMLParser import HTMLParser

import spack
import spack.tty as tty
from spack.util.compression import ALLOWED_ARCHIVE_TYPES

# Timeout in seconds for web requests
TIMEOUT = 10


class LinkParser(HTMLParser):
    """This parser just takes an HTML page and strips out the hrefs on the
       links.  Good enough for a really simple spider. """
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for attr, val in attrs:
                if attr == 'href':
                    self.links.append(val)


def _spider(args):
    """_spider(url, depth, max_depth)

       Fetches URL and any pages it links to up to max_depth.  depth should
       initially be 1, and max_depth includes the root.  This function will
       print out a warning only if the root can't be fetched; it ignores
       errors with pages that the root links to.

       This will return a dict of the pages fetched, keyed by URL, in no
       particular order.

       Takes args as a tuple b/c it's intended to be used by a multiprocessing
       pool.  Firing off all the child links at once makes the fetch MUCH
       faster for pages with lots of children.
    """
    url, depth, max_depth = args

    pages = {}
    try:
        # Make a HEAD request first to check the content type.  This lets
        # us ignore tarballs and gigantic files.
        # It would be nice to do this with the HTTP Accept header to avoid
        # one round-trip.  However, most servers seem to ignore the header
        # if you ask for a tarball with Accept: text/html.
        req = urllib2.Request(url)
        req.get_method = lambda: "HEAD"
        resp = urllib2.urlopen(req, timeout=TIMEOUT)

        if not resp.headers["Content-type"].startswith('text/html'):
            print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
            return pages

        # Do the real GET request when we know it's just HTML.
        req.get_method = lambda: "GET"
        response = urllib2.urlopen(req, timeout=TIMEOUT)
        response_url = response.geturl()

        # Read the page and stick it in the map we'll return
        page = response.read()
        pages[response_url] = page

        # If we're not at max depth, parse out the links in the page
        if depth < max_depth:
            link_parser = LinkParser()

            subcalls = []
            link_parser.feed(page)
            while link_parser.links:
                raw_link = link_parser.links.pop()

                # Skip stuff that looks like an archive
                if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
                    continue

                # Evaluate the link relative to the page it came from.
                abs_link = urlparse.urljoin(response_url, raw_link)
                subcalls.append((abs_link, depth+1, max_depth))

            if subcalls:
                pool = Pool(processes=len(subcalls))
                dicts = pool.map(_spider, subcalls)
                for d in dicts:
                    pages.update(d)

    except urllib2.HTTPError, e:
        # Only report it if it's the root page.  We ignore errors when spidering.
        if depth == 1:
            tty.warn("Could not connect to %s" % url, e.reason,
                     "Package.available_versions requires an internet connection.",
                     "Version list may be incomplete.")

    return pages


def get_pages(root_url, **kwargs):
    """Gets web pages from a root URL.

       If depth is specified (e.g., depth=2), then this will also fetch pages
       linked from the root and its children up to depth.

       This will spawn processes to fetch the children, for much improved
       performance over a sequential fetch.
    """
    max_depth = kwargs.setdefault('depth', 1)
    pages = _spider((root_url, 1, max_depth))
    return pages
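
A minimal sketch of how a caller drives the spider, mirroring what Package.fetch_available_versions does above; the URL and regex here are illustrative:

import re
from spack.util.web import get_pages

# Fetch the index page plus everything it links to, one level down.
pages = get_pages("http://www.mpich.org/static/downloads/", depth=2)
for page_url, contents in pages.iteritems():
    for match in re.finditer(r'mpich-(\d+(\.\d+)*(p\d+)?)\.tar\.gz', contents):
        print match.group(1)            # a candidate version string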