Improved website scraping.
@@ -85,24 +85,24 @@ def checksum(parser, args):
     pkg = spack.db.get(args.package)
 
     # If the user asked for specific versions, use those.
-    versions = [ver(v) for v in args.versions]
-
-    if not all(type(v) == Version for v in versions):
-        tty.die("Cannot generate checksums for version lists or " +
-                "version ranges.  Use unambiguous versions.")
-
-    if not versions:
-        versions = pkg.fetch_available_versions()
+    if args.versions:
+        versions = {}
+        for v in args.versions:
+            v = ver(v)
+            if not isinstance(v, Version):
+                tty.die("Cannot generate checksums for version lists or " +
+                        "version ranges.  Use unambiguous versions.")
+            versions[v] = pkg.url_for_version(v)
+    else:
+        versions = pkg.fetch_remote_versions()
         if not versions:
-            tty.die("Could not fetch any available versions for %s." % pkg.name)
+            tty.die("Could not fetch any versions for %s." % pkg.name)
 
-    versions = list(reversed(sorted(versions)))
-    urls = [pkg.url_for_version(v) for v in versions]
+    sorted_versions = list(reversed(sorted(versions)))
 
-
-    tty.msg("Found %s versions of %s." % (len(urls), pkg.name),
+    tty.msg("Found %s versions of %s." % (len(versions), pkg.name),
             *spack.cmd.elide_list(
-            ["%-10s%s" % (v,u) for v, u in zip(versions, urls)]))
+            ["%-10s%s" % (v, versions[v]) for v in sorted_versions]))
     print
     archives_to_fetch = tty.get_number(
         "How many would you like to checksum?", default=5, abort='q')
@@ -112,10 +112,12 @@ def checksum(parser, args):
         return
 
     version_hashes = get_checksums(
-        versions[:archives_to_fetch], urls[:archives_to_fetch], keep_stage=args.keep_stage)
+        sorted_versions[:archives_to_fetch],
+        [versions[v] for v in sorted_versions[:archives_to_fetch]],
+        keep_stage=args.keep_stage)
 
     if not version_hashes:
-        tty.die("Could not fetch any available versions for %s." % pkg.name)
+        tty.die("Could not fetch any versions for %s." % pkg.name)
 
     version_lines = ["    version('%s', '%s')" % (v, h) for v, h in version_hashes]
     tty.msg("Checksummed new versions of %s:" % pkg.name, *version_lines)
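
The reworked checksum command above carries a single {version: url} mapping end to end: user-supplied versions go through pkg.url_for_version(), scraped ones come back from pkg.fetch_remote_versions(), and both the display order and the fetch list are derived from the same dict. A minimal standalone sketch of that selection step, using plain version strings as stand-ins for Spack's Version objects (which sort numerically rather than lexically):

    # Hypothetical data; in Spack this dict comes from url_for_version()
    # or fetch_remote_versions().
    versions = {
        '1.0': 'http://example.com/pkg-1.0.tar.gz',
        '1.2': 'http://example.com/pkg-1.2.tar.gz',
        '1.1': 'http://example.com/pkg-1.1.tar.gz',
    }

    # Newest first, mirroring sorted_versions in the diff.
    sorted_versions = list(reversed(sorted(versions)))

    archives_to_fetch = 2
    to_fetch = [(v, versions[v]) for v in sorted_versions[:archives_to_fetch]]
    print(to_fetch)   # [('1.2', '...-1.2.tar.gz'), ('1.1', '...-1.1.tar.gz')]
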
@@ -159,13 +159,12 @@ def create(parser, args):
     else:
         mkdirp(os.path.dirname(pkg_path))
 
-    versions = list(reversed(spack.package.find_versions_of_archive(url)))
+    versions = spack.package.find_versions_of_archive(url)
 
     archives_to_fetch = 1
     if not versions:
         # If the fetch failed for some reason, revert to what the user provided
-        versions = [version]
-        urls = [url]
+        versions = { version : url }
     else:
         urls = [spack.url.substitute_version(url, v) for v in versions]
         if len(urls) > 1:
@@ -181,6 +180,8 @@ def create(parser, args):
                 tty.msg("Aborted.")
                 return
 
+    sorted_versions = list(reversed(versions))
+
     guesser = ConfigureGuesser()
     ver_hash_tuples = spack.cmd.checksum.get_checksums(
         versions[:archives_to_fetch], urls[:archives_to_fetch],
@@ -24,6 +24,7 @@
 ##############################################################################
 import os
 from llnl.util.tty.colify import colify
+import llnl.util.tty as tty
 import spack
 
 description ="List available versions of a package"
@@ -34,4 +35,21 @@ def setup_parser(subparser):
 
 def versions(parser, args):
     pkg = spack.db.get(args.package)
-    colify(reversed(pkg.fetch_available_versions()))
+
+    safe_versions = pkg.versions
+    fetched_versions = pkg.fetch_remote_versions()
+    remote_versions = set(fetched_versions).difference(safe_versions)
+
+    tty.msg("Safe versions (already checksummed):")
+    colify(sorted(safe_versions, reverse=True), indent=2)
+
+    tty.msg("Remote versions (not yet checksummed):")
+    if not remote_versions:
+        if not fetched_versions:
+            print "  Found no versions for %s" % pkg.name
+            tty.debug("Check the list_url and list_depth attribute on the "
+                      "package to help Spack find versions.")
+        else:
+            print "  Found no unchecksummed versions for %s" % pkg.name
+    else:
+        colify(sorted(remote_versions, reverse=True), indent=2)
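
The new versions command output above is just set arithmetic over two sources: versions already checksummed in the package file (pkg.versions) and versions scraped from the web (pkg.fetch_remote_versions()). A small sketch with made-up data:

    safe_versions = {'1.0', '1.1'}                    # already in the package file
    fetched_versions = {'1.0', '1.1', '1.2', '2.0'}   # scraped from list_url pages

    remote_versions = set(fetched_versions).difference(safe_versions)
    print(sorted(remote_versions, reverse=True))      # ['2.0', '1.2']
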
@@ -68,7 +68,7 @@ def concretize_version(self, spec):
         # If there are known available versions, return the most recent
         # version that satisfies the spec
         pkg = spec.package
-        valid_versions = [v for v in pkg.available_versions
+        valid_versions = [v for v in pkg.versions
                           if any(v.satisfies(sv) for sv in spec.versions)]
 
         if valid_versions:
@@ -39,7 +39,7 @@
 import subprocess
 import platform as py_platform
 import multiprocessing
-from urlparse import urlparse
+from urlparse import urlparse, urljoin
 
 import llnl.util.tty as tty
 from llnl.util.filesystem import *
@@ -333,9 +333,6 @@ def __init__(self, spec):
         if '.' in self.name:
             self.name = self.name[self.name.rindex('.') + 1:]
 
-        # This is set by scraping a web page.
-        self._available_versions = None
-
         # Sanity check some required variables that could be
         # overridden by package authors.
         def ensure_has_dict(attr_name):
@@ -370,14 +367,15 @@ def ensure_has_dict(attr_name):
 
         # Init fetch strategy and url to None
         self._fetcher = None
-        self.url = None
+        self.url = getattr(self.__class__, 'url', None)
 
         # Fix up self.url if this package fetches with a URLFetchStrategy.
         # This makes self.url behave sanely.
         if self.spec.versions.concrete:
-            # TODO: this is a really roundabout way of determining the type of fetch to do.
-            # TODO: figure out a more sane fetch strategy/package init order
-            # TODO: (right now it's conflated with stage, package, and the tests make assumptions)
+            # TODO: this is a really roundabout way of determining the type
+            # TODO: of fetch to do. figure out a more sane fetch strategy/package
+            # TODO: init order (right now it's conflated with stage, package, and
+            # TODO: the tests make assumptions)
             f = fs.for_package_version(self, self.version)
             if isinstance(f, fs.URLFetchStrategy):
                 self.url = self.url_for_version(self.spec.version)
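
Replacing self.url = None with getattr(self.__class__, 'url', None) means a class-level url attribute set by a package author survives into the instance, while packages without one still get None. A toy illustration (class names here are made up):

    class Package(object):
        def __init__(self):
            # Fall back to a class-level 'url' attribute if the subclass defines one.
            self.url = getattr(self.__class__, 'url', None)

    class WithUrl(Package):
        url = 'http://example.com/foo-1.0.tar.gz'

    class WithoutUrl(Package):
        pass

    print(WithUrl().url)      # http://example.com/foo-1.0.tar.gz
    print(WithoutUrl().url)   # None
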
@@ -852,71 +850,70 @@ def do_clean_dist(self):
             self.stage.destroy()
 
 
-    def fetch_available_versions(self):
-        if not hasattr(self, 'url'):
+    @property
+    def all_urls(self):
+        urls = []
+        if self.url:
+            urls.append(self.url)
+
+        for args in self.versions.values():
+            if 'url' in args:
+                urls.append(args['url'])
+        return urls
+
+
+    def fetch_remote_versions(self):
+        """Try to find remote versions of this package using the
+           list_url and any other URLs described in the package file."""
+        if not self.all_urls:
             raise VersionFetchError(self.__class__)
 
-        # If not, then try to fetch using list_url
-        if not self._available_versions:
-            try:
-                self._available_versions = find_versions_of_archive(
-                    self.url,
-                    list_url=self.list_url,
-                    list_depth=self.list_depth)
-
-                if not self._available_versions:
-                    tty.warn("Found no versions for %s" % self.name,
-                             "Check the list_url and list_depth attribute on the "
-                             + self.name + " package.",
-                             "Use them to tell Spack where to look for versions.")
-
-            except spack.error.NoNetworkConnectionError, e:
-                tty.die("Package.fetch_available_versions couldn't connect to:",
-                        e.url, e.message)
-
-        return self._available_versions
+        try:
+            return find_versions_of_archive(
+                *self.all_urls, list_url=self.list_url, list_depth=self.list_depth)
+        except spack.error.NoNetworkConnectionError, e:
+            tty.die("Package.fetch_versions couldn't connect to:",
+                    e.url, e.message)
 
 
-    @property
-    def available_versions(self):
-        # If the package overrode available_versions, then use that.
-        if self.versions is not None:
-            return VersionList(self.versions.keys())
-        else:
-            vlist = self.fetch_available_versions()
-            if not vlist:
-                vlist = ver([self.version])
-            return vlist
-
-
-def find_versions_of_archive(archive_url, **kwargs):
+def find_versions_of_archive(*archive_urls, **kwargs):
     list_url   = kwargs.get('list_url', None)
     list_depth = kwargs.get('list_depth', 1)
 
-    if not list_url:
-        list_url = url.find_list_url(archive_url)
-
-    # This creates a regex from the URL with a capture group for the
-    # version part of the URL.  The capture group is converted to a
-    # generic wildcard, so we can use this to extract things on a page
-    # that look like archive URLs.
-    url_regex = url.wildcard_version(archive_url)
-
-    # We'll be a bit more liberal and just look for the archive part,
-    # not the full path.
-    archive_regex = os.path.basename(url_regex)
+    # Generate a list of list_urls based on archive urls and any
+    # explicitly listed list_url in the package
+    list_urls = set()
+    if list_url:
+        list_urls.add(list_url)
+    for aurl in archive_urls:
+        list_urls.add(url.find_list_url(aurl))
 
     # Grab some web pages to scrape.
-    page_map = get_pages(list_url, depth=list_depth)
+    page_map = {}
+    for lurl in list_urls:
+        page_map.update(get_pages(lurl, depth=list_depth))
 
     # Scrape them for archive URLs
-    versions = VersionList()
-    for site, page in page_map.iteritems():
+    regexes = []
+    for aurl in archive_urls:
+        # This creates a regex from the URL with a capture group for
+        # the version part of the URL.  The capture group is converted
+        # to a generic wildcard, so we can use this to extract things
+        # on a page that look like archive URLs.
+        url_regex = url.wildcard_version(aurl)
+
+        # We'll be a bit more liberal and just look for the archive
+        # part, not the full path.
+        regexes.append(os.path.basename(url_regex))
+
+    # Build a version list from all the matches we find
+    versions = {}
+    for page_url, content in page_map.iteritems():
         # extract versions from matches.
-        matches = re.finditer(archive_regex, page)
-        version_strings = set(m.group(1) for m in matches)
-        for v in version_strings:
-            versions.add(Version(v))
+        for regex in regexes:
+            versions.update(
+                (Version(m.group(1)), urljoin(page_url, m.group(0)))
+                for m in re.finditer(regex, content))
 
     return versions
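
find_versions_of_archive() now returns a {version: url} dict rather than a bare VersionList: each regex match on a scraped page contributes both the captured version and the matched archive name, resolved against the page it was found on. A stripped-down sketch of that inner step, with a hypothetical page and a hand-written regex standing in for the ones built by url.wildcard_version():

    import re
    try:
        from urlparse import urljoin        # Python 2, as in the diff
    except ImportError:
        from urllib.parse import urljoin    # Python 3

    page_url = 'http://example.com/downloads/'
    content = '<a href="foo-1.2.tar.gz">foo-1.2.tar.gz</a> <a href="foo-1.3.tar.gz">latest</a>'
    regex = r'foo-(\d[\d.]*)\.tar\.gz'

    versions = {}
    versions.update(
        (m.group(1), urljoin(page_url, m.group(0)))
        for m in re.finditer(regex, content))
    print(versions)
    # {'1.2': 'http://example.com/downloads/foo-1.2.tar.gz',
    #  '1.3': 'http://example.com/downloads/foo-1.3.tar.gz'}
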
@@ -979,8 +976,8 @@ class VersionFetchError(PackageError):
     """Raised when a version URL cannot automatically be determined."""
     def __init__(self, cls):
         super(VersionFetchError, self).__init__(
-            "Cannot fetch version for package %s " % cls.__name__ +
-            "because it does not define a default url.")
+            "Cannot fetch versions for package %s " % cls.__name__ +
+            "because it does not define any URLs to fetch.")
 
 
 class NoURLError(PackageError):
@@ -245,6 +245,10 @@ def wildcard_version(path):
     # Construct a case-insensitive regular expression for the package name.
     name_re = '(%s)' % insensitize(name)
 
+    # protect extensions like bz2 from wildcarding.
+    ext = comp.extension(path)
+    path = comp.strip_extension(path)
+
     # Split the string apart by things that match the name so that if the
     # name contains numbers or things that look like versions, we don't
     # catch them with the version wildcard.
@@ -261,4 +265,4 @@ def wildcard_version(path):
         name_parts[i] = vgroup.join(re.escape(vp) for vp in vparts)
 
     # Put it all back together with original name matches intact.
-    return ''.join(name_parts)
+    return ''.join(name_parts) + '.' + ext
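
The extension is now stripped before wildcarding and re-appended literally, so digits in suffixes like bz2 can never be captured as part of a version. A toy illustration of the intent (comp.extension() and comp.strip_extension() are the real helpers used in the diff; the replace() below stands in for the name/version splitting that wildcard_version() actually performs):

    import re

    path = 'foo-1.2.tar.bz2'
    ext = 'tar.bz2'                     # roughly what comp.extension() returns
    stem = path[:-(len(ext) + 1)]       # 'foo-1.2', like comp.strip_extension()

    # Wildcard the version in the stem, keep the extension literal.
    regex = stem.replace('1.2', r'(\d[\d.]*)') + r'\.' + re.escape(ext)
    print(re.match(regex, 'foo-1.3.tar.bz2').group(1))   # prints 1.3
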
@@ -25,7 +25,7 @@
 import re
 import sys
 import subprocess
-import urllib2
+import urllib2, cookielib
 import urlparse
 from multiprocessing import Pool
 from HTMLParser import HTMLParser, HTMLParseError
@@ -68,7 +68,7 @@ def _spider(args):
        pool.  Firing off all the child links at once makes the fetch MUCH
        faster for pages with lots of children.
     """
-    url, depth, max_depth, raise_on_error = args
+    url, visited, root, opener, depth, max_depth, raise_on_error = args
 
     pages = {}
     try:
@@ -82,12 +82,12 @@ def _spider(args):
         resp = urllib2.urlopen(req, timeout=TIMEOUT)
 
         if not "Content-type" in resp.headers:
-            tty.warn("ignoring page " + url)
+            tty.debug("ignoring page " + url)
             return pages
 
         if not resp.headers["Content-type"].startswith('text/html'):
-            tty.warn("ignoring page " + url + " with content type " +
-                     resp.headers["Content-type"])
+            tty.debug("ignoring page " + url + " with content type " +
+                      resp.headers["Content-type"])
             return pages
 
         # Do the real GET request when we know it's just HTML.
@@ -114,15 +114,30 @@ def _spider(args):
 
                 # Evaluate the link relative to the page it came from.
                 abs_link = urlparse.urljoin(response_url, raw_link)
-                subcalls.append((abs_link, depth+1, max_depth, raise_on_error))
+
+                # Skip things outside the root directory
+                if not abs_link.startswith(root):
+                    continue
+
+                # Skip already-visited links
+                if abs_link in visited:
+                    continue
+
+                subcalls.append((abs_link, visited, root, None, depth+1, max_depth, raise_on_error))
+                visited.add(abs_link)
 
             if subcalls:
-                pool = Pool(processes=len(subcalls))
-                dicts = pool.map(_spider, subcalls)
-                for d in dicts:
-                    pages.update(d)
+                try:
+                    pool = Pool(processes=len(subcalls))
+                    dicts = pool.map(_spider, subcalls)
+                    for d in dicts:
+                        pages.update(d)
+                finally:
+                    pool.terminate()
+                    pool.join()
 
     except urllib2.URLError, e:
         tty.debug(e)
         if raise_on_error:
             raise spack.error.NoNetworkConnectionError(str(e), url)
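
The spider now threads a visited set and the root URL through every subcall, so crawls stay inside the original site and never refetch a page. A minimal sketch of just the two pruning rules, with made-up URLs:

    root = 'http://example.com/downloads/'
    visited = set([root])
    links = ['http://example.com/downloads/a.html',
             'http://example.com/downloads/a.html',   # duplicate
             'http://other.example.org/b.html']       # outside the root

    subcalls = []
    for abs_link in links:
        if not abs_link.startswith(root):   # skip things outside the root
            continue
        if abs_link in visited:             # skip already-visited links
            continue
        subcalls.append(abs_link)
        visited.add(abs_link)

    print(subcalls)   # ['http://example.com/downloads/a.html']
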
@@ -137,7 +152,8 @@ def _spider(args):
         tty.warn(msg, url, "HTMLParseError: " + str(e))
 
     except Exception, e:
-        pass    # Other types of errors are completely ignored.
+        # Other types of errors are completely ignored, except in debug mode.
+        tty.debug("Error in _spider: %s" % e)
 
     return pages
@@ -151,5 +167,5 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages =  _spider((root_url, 1, max_depth, False))
+    pages =  _spider((root_url, set(), root_url, None, 1, max_depth, False))
     return pages