Factor out URL fetching into URLFetchStrategy

- Added a FetchStrategy class to Spack
- Isolated the pieces that need to be separate from Stage to support git/svn/http fetching
- Added URLFetchStrategy for curl-based fetching (a usage sketch follows the commit metadata below).
This commit is contained in:
Todd Gamblin 2014-08-25 01:11:12 -07:00
parent 74a603dcd3
commit 52d140c337
5 changed files with 308 additions and 115 deletions
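The sketch below shows how the pieces introduced here are meant to fit together: a FetchStrategy knows *how* to fetch, a Stage knows *where* the result goes, and Stage.fetch()/check()/expand_archive()/restage() delegate to the strategy. This is illustrative driver code, not part of the commit; the URL and digest are made up, and setting the digest on the fetcher by hand reflects the fact that digest plumbing is still minimal at this point.

    from spack.stage import Stage

    # Stage builds a URLFetchStrategy for a plain URL via strategy_for_url()
    # and calls set_stage() on it so the fetcher knows where to download.
    stage = Stage('http://example.com/foo-1.0.tar.gz', name='foo-1.0')

    # Hypothetical md5; normally this would come from the package's versions dict.
    stage.fetcher.digest = '0123456789abcdef0123456789abcdef'

    stage.fetch()                      # -> fetcher.fetch(), i.e. curl
    stage.check(stage.fetcher.digest)  # -> fetcher.check() against its digest
    stage.expand_archive()             # -> fetcher.expand()
    stage.restage()                    # -> fetcher.reset()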


@@ -0,0 +1,222 @@
##############################################################################
# Copyright (c) 2013, Lawrence Livermore National Security, LLC.
# Produced at the Lawrence Livermore National Laboratory.
#
# This file is part of Spack.
# Written by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
# LLNL-CODE-647188
#
# For details, see https://scalability-llnl.github.io/spack
# Please also see the LICENSE file for our notice and the LGPL.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License (as published by
# the Free Software Foundation) version 2.1 dated February 1999.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
# conditions of the GNU General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
##############################################################################
"""
Fetch strategies are used to download source code into a staging area
in order to build it. They need to define the following methods:
* fetch()
This should attempt to download/check out source from somewhere.
* check()
Apply a checksum to the downloaded source code, e.g. for an archive.
May not do anything if the fetch method was safe to begin with.
* expand()
Expand (e.g., an archive) downloaded file to source.
* reset()
Restore original state of downloaded code. Used by clean commands.
This may just remove the expanded source and re-expand an archive,
or it may run something like git reset --hard.
"""
import os
import re
import shutil

import llnl.util.tty as tty

import spack
import spack.error
import spack.util.crypto as crypto
from spack.util.compression import decompressor_for


class FetchStrategy(object):
    def __init__(self):
        # The stage is initialized late, so that fetch strategies can be
        # constructed at package construction time.  This is where things
        # will be fetched.
        self.stage = None

    def set_stage(self, stage):
        """This is called by Stage before any of the fetching
           methods are called on the stage."""
        self.stage = stage

    # Subclasses need to implement these methods
    def fetch(self): pass    # Return True on success, False on fail
    def check(self): pass
    def expand(self): pass
    def reset(self): pass
    def __str__(self): pass
class URLFetchStrategy(FetchStrategy):
    def __init__(self, url, digest=None):
        super(URLFetchStrategy, self).__init__()
        self.url = url
        self.digest = digest

    def fetch(self):
        assert(self.stage)
        self.stage.chdir()

        if self.archive_file:
            tty.msg("Already downloaded %s." % self.archive_file)
            return

        tty.msg("Trying to fetch from %s" % self.url)

        # Run curl, but grab the mime type from the HTTP headers
        headers = spack.curl('-#',        # status bar
                             '-O',        # save file to disk
                             '-D', '-',   # print out HTTP headers
                             '-L', self.url,
                             return_output=True, fail_on_error=False)

        if spack.curl.returncode != 0:
            # clean up archive on failure.
            if self.archive_file:
                os.remove(self.archive_file)

            if spack.curl.returncode == 60:
                # This is a certificate error.  Suggest spack -k
                raise FailedDownloadError(
                    self.url,
                    "Curl was unable to fetch due to invalid certificate. "
                    "This is either an attack, or your cluster's SSL configuration "
                    "is bad.  If you believe your SSL configuration is bad, you "
                    "can try running spack -k, which will not check SSL certificates. "
                    "Use this at your own risk.")

        # Check if we somehow got an HTML file rather than the archive we
        # asked for.  We only look at the last content type, to handle
        # redirects properly.
        content_types = re.findall(r'Content-Type:[^\r\n]+', headers)
        if content_types and 'text/html' in content_types[-1]:
            tty.warn("The contents of " + self.archive_file + " look like HTML.",
                     "The checksum will likely be bad.  If it is, you can use",
                     "'spack clean --dist' to remove the bad archive, then fix",
                     "your internet gateway issue and install again.")

        if not self.archive_file:
            raise FailedDownloadError(self.url)
    @property
    def archive_file(self):
        """Path to the source archive within this stage directory."""
        assert(self.stage)
        path = os.path.join(self.stage.path, os.path.basename(self.url))
        return path if os.path.exists(path) else None

    def expand(self):
        assert(self.stage)
        tty.msg("Staging archive: %s" % self.archive_file)

        self.stage.chdir()
        if not self.archive_file:
            raise NoArchiveFileError("URLFetchStrategy couldn't find archive file",
                                     "Failed on expand() for URL %s" % self.url)

        decompress = decompressor_for(self.archive_file)
        decompress(self.archive_file)
    def check(self):
        """Check the downloaded archive against a checksum digest.
           No-op if this stage checks code out of a repository."""
        assert(self.stage)
        if not self.digest:
            raise NoDigestError("Attempt to check URLFetchStrategy with no digest.")

        checker = crypto.Checker(self.digest)
        if not checker.check(self.archive_file):
            raise ChecksumError(
                "%s checksum failed for %s." % (checker.hash_name, self.archive_file),
                "Expected %s but got %s." % (self.digest, checker.sum))
    def reset(self):
        """Removes the source path if it exists, then re-expands the archive."""
        assert(self.stage)
        if not self.archive_file:
            raise NoArchiveFileError("Tried to reset URLFetchStrategy before fetching",
                                     "Failed on reset() for URL %s" % self.url)
        if self.stage.source_path:
            shutil.rmtree(self.stage.source_path, ignore_errors=True)
        self.expand()

    def __str__(self):
        return self.url


class GitFetchStrategy(FetchStrategy):
    pass


class SvnFetchStrategy(FetchStrategy):
    pass
def strategy_for_url(url):
    """Given a URL, find an appropriate fetch strategy for it.
       Currently just gives you a URLFetchStrategy that uses curl.

       TODO: make this return appropriate fetch strategies for other
             types of URLs.
    """
    return URLFetchStrategy(url)


class FetchStrategyError(spack.error.SpackError):
    def __init__(self, msg, long_msg=None):
        super(FetchStrategyError, self).__init__(msg, long_msg)


class FailedDownloadError(FetchStrategyError):
    """Raised when a download fails."""
    def __init__(self, url, msg=""):
        super(FailedDownloadError, self).__init__(
            "Failed to fetch file from URL: %s" % url, msg)
        self.url = url


class ChecksumError(FetchStrategyError):
    """Raised when archive fails to checksum."""
    def __init__(self, msg, long_msg=None):
        super(ChecksumError, self).__init__(msg, long_msg)


class NoArchiveFileError(FetchStrategyError):
    def __init__(self, msg, long_msg=None):
        super(NoArchiveFileError, self).__init__(msg, long_msg)


class NoDigestError(FetchStrategyError):
    def __init__(self, msg, long_msg=None):
        super(NoDigestError, self).__init__(msg, long_msg)
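
GitFetchStrategy and SvnFetchStrategy are only stubs in this commit. As a rough illustration of where the interface is headed, here is a hypothetical sketch of a git-based strategy — not part of the commit. The class name, the `ref` parameter, and the clone layout are invented, and it assumes spack.util.executable's `which()` helper behaves as it does elsewhere in Spack.

    import os

    from spack.util.executable import which
    from spack.fetch_strategy import FetchStrategy


    class SketchGitFetchStrategy(FetchStrategy):
        """Hypothetical git strategy: clones instead of curling a tarball."""

        def __init__(self, url, ref='master'):
            super(SketchGitFetchStrategy, self).__init__()
            self.url = url
            self.ref = ref                    # branch, tag, or commit to build

        def fetch(self):
            assert(self.stage)
            self.stage.chdir()
            git = which('git', required=True)
            git('clone', self.url, 'source')  # clone under the stage directory
            os.chdir('source')
            git('checkout', self.ref)

        def check(self):
            pass  # a pinned ref plays the role of the checksum

        def expand(self):
            pass  # a clone is already "expanded" source

        def reset(self):
            # Discard local edits and untracked files, like git reset --hard.
            assert(self.stage)
            os.chdir(self.stage.source_path)
            git = which('git', required=True)
            git('checkout', '.')
            git('clean', '-fdx')

        def __str__(self):
            return "[git] " + self.url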


@@ -337,7 +337,7 @@ def __init__(self, spec):
         # Sanity check some required variables that could be
         # overridden by package authors.
-        def sanity_check_dict(attr_name):
+        def ensure_has_dict(attr_name):
             if not hasattr(self, attr_name):
                 raise PackageError("Package %s must define %s" % attr_name)

@@ -345,10 +345,10 @@ def sanity_check_dict(attr_name):
             if not isinstance(attr, dict):
                 raise PackageError("Package %s has non-dict %s attribute!"
                                    % (self.name, attr_name))
-        sanity_check_dict('versions')
-        sanity_check_dict('dependencies')
-        sanity_check_dict('conflicted')
-        sanity_check_dict('patches')
+        ensure_has_dict('versions')
+        ensure_has_dict('dependencies')
+        ensure_has_dict('conflicted')
+        ensure_has_dict('patches')

         # Check versions in the versions dict.
         for v in self.versions:

@@ -362,9 +362,8 @@ def sanity_check_dict(attr_name):
         # Version-ize the keys in versions dict
         try:
             self.versions = dict((Version(v), h) for v,h in self.versions.items())
-        except ValueError:
-            raise ValueError("Keys of versions dict in package %s must be versions!"
-                             % self.name)
+        except ValueError, e:
+            raise ValueError("In package %s: %s" % (self.name, e.message))

         # stage used to build this package.
         self._stage = None

@@ -600,9 +599,8 @@ def do_stage(self):
         self.do_fetch()

-        archive_dir = self.stage.expanded_archive_path
+        archive_dir = self.stage.source_path
         if not archive_dir:
-            tty.msg("Staging archive: %s" % self.stage.archive_file)
             self.stage.expand_archive()
             tty.msg("Created stage directory in %s." % self.stage.path)
         else:

@@ -620,7 +618,7 @@ def do_patch(self):
         # Construct paths to special files in the archive dir used to
         # keep track of whether patches were successfully applied.
-        archive_dir = self.stage.expanded_archive_path
+        archive_dir = self.stage.source_path
         good_file = join_path(archive_dir, '.spack_patched')
         bad_file  = join_path(archive_dir, '.spack_patch_failed')


@@ -95,7 +95,7 @@ def __init__(self, checksum, url):
 def version(ver, checksum, **kwargs):
-    """Adds a version and associated metadata to the package."""
+    """Adds a version and metadata describing how to fetch it."""
     pkg = caller_locals()
     versions = pkg.setdefault('versions', {})


@@ -32,18 +32,20 @@
 import spack
 import spack.config
+from spack.fetch_strategy import strategy_for_url, URLFetchStrategy
 import spack.error
 import spack.util.crypto as crypto
 from spack.util.compression import decompressor_for

 STAGE_PREFIX = 'spack-stage-'


 class Stage(object):
-    """A Stage object manaages a directory where an archive is downloaded,
-       expanded, and built before being installed.  It also handles downloading
-       the archive.  A stage's lifecycle looks like this:
+    """A Stage object manages a directory where some source code is
+       downloaded and built before being installed.  It handles
+       fetching the source code, either as an archive to be expanded
+       or by checking it out of a repository.  A stage's lifecycle
+       looks like this:

     Stage()
       Constructor creates the stage directory.
@@ -71,18 +73,24 @@ class Stage(object):
     def __init__(self, url, **kwargs):
         """Create a stage object.
            Parameters:
-             url     URL of the archive to be downloaded into this stage.
+             url_or_fetch_strategy
+                 URL of the archive to be downloaded into this stage, OR
+                 a valid FetchStrategy.

-             name    If a name is provided, then this stage is a named stage
-                     and will persist between runs (or if you construct another
-                     stage object later).  If name is not provided, then this
-                     stage will be given a unique name automatically.
+             name
+                 If a name is provided, then this stage is a named stage
+                 and will persist between runs (or if you construct another
+                 stage object later).  If name is not provided, then this
+                 stage will be given a unique name automatically.
         """
+        if isinstance(url, basestring):
+            self.fetcher = strategy_for_url(url)
+        else:
+            self.fetcher = url
+        self.fetcher.set_stage(self)
+
         self.name = kwargs.get('name')
         self.mirror_path = kwargs.get('mirror_path')
         self.tmp_root = find_tmp_root()

         self.url = url

         self.path = None
         self._setup()
@@ -198,17 +206,17 @@ def archive_file(self):
     @property
-    def expanded_archive_path(self):
-        """Returns the path to the expanded archive directory if it's expanded;
-           None if the archive hasn't been expanded.
-        """
-        if not self.archive_file:
-            return None
-
-        for file in os.listdir(self.path):
-            archive_path = join_path(self.path, file)
-            if os.path.isdir(archive_path):
-                return archive_path
+    def source_path(self):
+        """Returns the path to the expanded/checked out source code
+           within this fetch strategy's path.
+
+           This assumes nothing else is going to be put in the
+           FetchStrategy's path.  It searches for the first
+           subdirectory of the path it can find, then returns that.
+        """
+        for p in [os.path.join(self.path, f) for f in os.listdir(self.path)]:
+            if os.path.isdir(p):
+                return p
         return None
@@ -220,71 +228,35 @@ def chdir(self):
             tty.die("Setup failed: no such directory: " + self.path)

-    def fetch_from_url(self, url):
-        # Run curl but grab the mime type from the http headers
-        headers = spack.curl('-#',        # status bar
-                             '-O',        # save file to disk
-                             '-D', '-',   # print out HTML headers
-                             '-L', url,
-                             return_output=True, fail_on_error=False)
-
-        if spack.curl.returncode != 0:
-            # clean up archive on failure.
-            if self.archive_file:
-                os.remove(self.archive_file)
-
-            if spack.curl.returncode == 60:
-                # This is a certificate error.  Suggest spack -k
-                raise FailedDownloadError(
-                    url,
-                    "Curl was unable to fetch due to invalid certificate. "
-                    "This is either an attack, or your cluster's SSL configuration "
-                    "is bad.  If you believe your SSL configuration is bad, you "
-                    "can try running spack -k, which will not check SSL certificates."
-                    "Use this at your own risk.")
-
-        # Check if we somehow got an HTML file rather than the archive we
-        # asked for.  We only look at the last content type, to handle
-        # redirects properly.
-        content_types = re.findall(r'Content-Type:[^\r\n]+', headers)
-        if content_types and 'text/html' in content_types[-1]:
-            tty.warn("The contents of " + self.archive_file + " look like HTML.",
-                     "The checksum will likely be bad.  If it is, you can use",
-                     "'spack clean --dist' to remove the bad archive, then fix",
-                     "your internet gateway issue and install again.")
-
     def fetch(self):
-        """Downloads the file at URL to the stage.  Returns true if it was downloaded,
-           false if it already existed."""
+        """Downloads an archive or checks out code from a repository."""
         self.chdir()

-        if self.archive_file:
-            tty.msg("Already downloaded %s." % self.archive_file)
-
-        else:
-            urls = [self.url]
-            if self.mirror_path:
-                urls = ["%s/%s" % (m, self.mirror_path) for m in _get_mirrors()] + urls
-
-            for url in urls:
-                tty.msg("Trying to fetch from %s" % url)
-                self.fetch_from_url(url)
-                if self.archive_file:
-                    break
-
-        if not self.archive_file:
-            raise FailedDownloadError(url)
-
-        return self.archive_file
+        fetchers = [self.fetcher]
+
+        # TODO: move mirror logic out of here and clean it up!
+        if self.mirror_path:
+            urls = ["%s/%s" % (m, self.mirror_path) for m in _get_mirrors()]
+
+            digest = None
+            if isinstance(self.fetcher, URLFetchStrategy):
+                digest = self.fetcher.digest
+            fetchers = [URLFetchStrategy(url, digest) for url in urls] + fetchers
+
+            for f in fetchers:
+                f.set_stage(self)
+
+        for fetcher in fetchers:
+            try:
+                fetcher.fetch()
+                break
+            except spack.error.SpackError, e:
+                tty.msg("Download from %s failed." % fetcher)
+                continue

     def check(self, digest):
-        """Check the downloaded archive against a checksum digest"""
-        checker = crypto.Checker(digest)
-        if not checker.check(self.archive_file):
-            raise ChecksumError(
-                "%s checksum failed for %s." % (checker.hash_name, self.archive_file),
-                "Expected %s but got %s." % (digest, checker.sum))
+        """Check the downloaded archive against a checksum digest.
+           No-op if this stage checks code out of a repository."""
+        self.fetcher.check()

     def expand_archive(self):
@@ -292,19 +264,14 @@ def expand_archive(self):
            archive.  Fail if the stage is not set up or if the archive is not yet
            downloaded.
         """
-        self.chdir()
-        if not self.archive_file:
-            tty.die("Attempt to expand archive before fetching.")
-
-        decompress = decompressor_for(self.archive_file)
-        decompress(self.archive_file)
+        self.fetcher.expand()

     def chdir_to_archive(self):
         """Changes directory to the expanded archive directory.
            Dies with an error if there was no expanded archive.
         """
-        path = self.expanded_archive_path
+        path = self.source_path
         if not path:
             tty.die("Attempt to chdir before expanding archive.")
         else:
@@ -317,12 +284,7 @@ def restage(self):
         """Removes the expanded archive path if it exists, then re-expands
            the archive.
         """
-        if not self.archive_file:
-            tty.die("Attempt to restage when not staged.")
-
-        if self.expanded_archive_path:
-            shutil.rmtree(self.expanded_archive_path, True)
-        self.expand_archive()
+        self.fetcher.reset()

     def destroy(self):
@@ -393,15 +355,26 @@ def find_tmp_root():
     return None


-class FailedDownloadError(spack.error.SpackError):
-    """Raised wen a download fails."""
-    def __init__(self, url, msg=""):
-        super(FailedDownloadError, self).__init__(
-            "Failed to fetch file from URL: %s" % url, msg)
-        self.url = url
+class StageError(spack.error.SpackError):
+    def __init__(self, message, long_message=None):
+        super(StageError, self).__init__(message, long_message)


-class ChecksumError(spack.error.SpackError):
+class ChecksumError(StageError):
     """Raised when archive fails to checksum."""
-    def __init__(self, message, long_msg):
+    def __init__(self, message, long_msg=None):
         super(ChecksumError, self).__init__(message, long_msg)


+class RestageError(StageError):
+    def __init__(self, message, long_msg=None):
+        super(RestageError, self).__init__(message, long_msg)
+
+
+class ChdirError(StageError):
+    def __init__(self, message, long_msg=None):
+        super(ChdirError, self).__init__(message, long_msg)
+
+
+# Keep this in namespace for convenience
+FailedDownloadError = spack.fetch_strategy.FailedDownloadError
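
To make the new mirror handling in Stage.fetch() concrete: when mirrors are configured and the stage has a mirror_path, fetch() builds a list of fetchers that tries each mirror before the original URL. The mirror hostnames and the package path below are invented for illustration; _get_mirrors() reads the configured mirrors via spack.config as in the code above.

    # Suppose spack.config lists two mirrors and the stage has a mirror_path.
    stage = Stage('http://example.com/foo-1.0.tar.gz',
                  name='foo-1.0', mirror_path='foo/foo-1.0.tar.gz')

    # stage.fetch() then tries, in order:
    #   URLFetchStrategy('http://mirror1.example.com/foo/foo-1.0.tar.gz', digest)
    #   URLFetchStrategy('http://mirror2.example.com/foo/foo-1.0.tar.gz', digest)
    #   stage.fetcher   # the strategy for the original URL
    # stopping at the first fetcher whose fetch() succeeds.
    stage.fetch()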


@@ -146,7 +146,7 @@ def check_fetch(self, stage, stage_name):
         stage_path = self.get_stage_path(stage, stage_name)
         self.assertTrue(archive_name in os.listdir(stage_path))
         self.assertEqual(join_path(stage_path, archive_name),
-                         stage.archive_file)
+                         stage.fetcher.archive_file)

     def check_expand_archive(self, stage, stage_name):

@@ -156,7 +156,7 @@ def check_expand_archive(self, stage, stage_name):
         self.assertEqual(
             join_path(stage_path, archive_dir),
-            stage.expanded_archive_path)
+            stage.source_path)

         readme = join_path(stage_path, archive_dir, readme_name)
         self.assertTrue(os.path.isfile(readme))

@@ -292,7 +292,7 @@ def test_restage(self):
         with closing(open('foobar', 'w')) as file:
             file.write("this file is to be destroyed.")

-        self.assertTrue('foobar' in os.listdir(stage.expanded_archive_path))
+        self.assertTrue('foobar' in os.listdir(stage.source_path))

         # Make sure the file is not there after restage.
         stage.restage()

@@ -301,7 +301,7 @@ def test_restage(self):
         stage.chdir_to_archive()
         self.check_chdir_to_archive(stage, stage_name)
-        self.assertFalse('foobar' in os.listdir(stage.expanded_archive_path))
+        self.assertFalse('foobar' in os.listdir(stage.source_path))

         stage.destroy()
         self.check_destroy(stage, stage_name)