Merge pull request #276 from LLNL/bugfix/235-smarter-spider
Fix #235: Smarter web spidering -- use parsed links instead of recons…
commit 28d61f0d7f
@@ -34,8 +34,8 @@
 import spack
 import spack.cmd
 import spack.cmd.checksum
-import spack.package
 import spack.url
+import spack.util.web
 from spack.util.naming import *
 import spack.util.crypto as crypto
@@ -166,7 +166,7 @@ def create(parser, args):
     tty.msg("This looks like a URL for %s version %s." % (name, version))
     tty.msg("Creating template for package %s" % name)

-    versions = spack.package.find_versions_of_archive(url)
+    versions = spack.util.web.find_versions_of_archive(url)
     rkeys = sorted(versions.keys(), reverse=True)
     versions = OrderedDict(zip(rkeys, (versions[v] for v in rkeys)))
lib/spack/spack/cmd/url-parse.py (new file, 75 lines)
@@ -0,0 +1,75 @@
+##############################################################################
+# Copyright (c) 2013, Lawrence Livermore National Security, LLC.
+# Produced at the Lawrence Livermore National Laboratory.
+#
+# This file is part of Spack.
+# Written by Todd Gamblin, tgamblin@llnl.gov, All rights reserved.
+# LLNL-CODE-647188
+#
+# For details, see https://github.com/llnl/spack
+# Please also see the LICENSE file for our notice and the LGPL.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License (as published by
+# the Free Software Foundation) version 2.1 dated February 1999.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the IMPLIED WARRANTY OF
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the terms and
+# conditions of the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+##############################################################################
+import sys
+
+import llnl.util.tty as tty
+
+import spack
+import spack.url
+from spack.util.web import find_versions_of_archive
+
+description = "Show parsing of a URL, optionally spider web for other versions."
+
+def setup_parser(subparser):
+    subparser.add_argument('url', help="url of a package archive")
+    subparser.add_argument(
+        '-s', '--spider', action='store_true', help="Spider the source page for versions.")
+
+
+def print_name_and_version(url):
+    name, ns, nl, ntup, ver, vs, vl, vtup = spack.url.substitution_offsets(url)
+    underlines = [" "] * max(ns+nl, vs+vl)
+    for i in range(ns, ns+nl):
+        underlines[i] = '-'
+    for i in range(vs, vs+vl):
+        underlines[i] = '~'
+
+    print "    %s" % url
+    print "    %s" % ''.join(underlines)
+
+
+def url_parse(parser, args):
+    url = args.url
+
+    ver, vs, vl = spack.url.parse_version_offset(url)
+    name, ns, nl = spack.url.parse_name_offset(url, ver)
+
+    tty.msg("Parsing URL:")
+    try:
+        print_name_and_version(url)
+    except spack.url.UrlParseError as e:
+        tty.error(str(e))
+
+    print
+    tty.msg("Substituting version 9.9.9b:")
+    newurl = spack.url.substitute_version(url, '9.9.9b')
+    print_name_and_version(newurl)
+
+    if args.spider:
+        print
+        tty.msg("Spidering for versions:")
+        versions = find_versions_of_archive(url)
+        for v in sorted(versions):
+            print "%-20s%s" % (v, versions[v])
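For reference, the underlining trick in print_name_and_version above can be reproduced standalone. A minimal sketch, with made-up offsets in place of what spack.url.substitution_offsets() would return (Python 2 style, matching the file):

    # Sketch only: the URL and offsets are hypothetical, not computed by spack.url.
    url = "http://example.com/foo-1.2.3.tar.gz"
    ns, nl = 19, 3    # assumed start/length of the package name "foo"
    vs, vl = 23, 5    # assumed start/length of the version "1.2.3"

    underlines = [" "] * max(ns + nl, vs + vl)
    for i in range(ns, ns + nl):
        underlines[i] = '-'          # dashes under the name
    for i in range(vs, vs + vl):
        underlines[i] = '~'          # tildes under the version

    print "    %s" % url
    print "    %s" % ''.join(underlines)

Running it prints the URL with dashes under the detected name and tildes under the detected version.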
@@ -1164,7 +1164,7 @@ def fetch_remote_versions(self):
             raise VersionFetchError(self.__class__)

         try:
-            return find_versions_of_archive(
+            return spack.util.web.find_versions_of_archive(
                 *self.all_urls, list_url=self.list_url, list_depth=self.list_depth)
         except spack.error.NoNetworkConnectionError, e:
             tty.die("Package.fetch_versions couldn't connect to:",
@@ -1188,50 +1188,6 @@ def rpath_args(self):
         return " ".join("-Wl,-rpath=%s" % p for p in self.rpath)


-def find_versions_of_archive(*archive_urls, **kwargs):
-    list_url = kwargs.get('list_url', None)
-    list_depth = kwargs.get('list_depth', 1)
-
-    # Generate a list of list_urls based on archive urls and any
-    # explicitly listed list_url in the package
-    list_urls = set()
-    if list_url:
-        list_urls.add(list_url)
-    for aurl in archive_urls:
-        list_urls.add(spack.url.find_list_url(aurl))
-
-    # Grab some web pages to scrape.
-    page_map = {}
-    for lurl in list_urls:
-        pages = spack.util.web.get_pages(lurl, depth=list_depth)
-        page_map.update(pages)
-
-    # Scrape them for archive URLs
-    regexes = []
-    for aurl in archive_urls:
-        # This creates a regex from the URL with a capture group for
-        # the version part of the URL.  The capture group is converted
-        # to a generic wildcard, so we can use this to extract things
-        # on a page that look like archive URLs.
-        url_regex = spack.url.wildcard_version(aurl)
-
-        # We'll be a bit more liberal and just look for the archive
-        # part, not the full path.
-        regexes.append(os.path.basename(url_regex))
-
-    # Build a version list from all the matches we find
-    versions = {}
-    for page_url, content in page_map.iteritems():
-        # extract versions from matches.
-        for regex in regexes:
-            for m in re.finditer(regex, content):
-                url = urljoin(page_url, m.group(0))
-                ver = spack.url.parse_version(url)
-                versions[ver] = url
-
-    return versions
-
-
 def validate_package_url(url_string):
     """Determine whether spack can handle a particular URL or not."""
     url = urlparse(url_string)
@@ -23,6 +23,7 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import os
 import sys
 import subprocess
 import urllib2, cookielib
@@ -70,7 +71,9 @@ def _spider(args):
     """
     url, visited, root, opener, depth, max_depth, raise_on_error = args

-    pages = {}
+    pages = {}      # dict from page URL -> text content.
+    links = set()   # set of all links seen on visited pages.

     try:
         # Make a HEAD request first to check the content type.  This lets
         # us ignore tarballs and gigantic files.
@@ -99,22 +102,21 @@ def _spider(args):
         page = response.read()
         pages[response_url] = page

-        # If we're not at max depth, parse out the links in the page
-        if depth < max_depth:
+        # Parse out the links in the page
         link_parser = LinkParser()
         subcalls = []
         link_parser.feed(page)

         while link_parser.links:
             raw_link = link_parser.links.pop()
+            abs_link = urlparse.urljoin(response_url, raw_link)
+
+            links.add(abs_link)

             # Skip stuff that looks like an archive
             if any(raw_link.endswith(suf) for suf in ALLOWED_ARCHIVE_TYPES):
                 continue

-            # Evaluate the link relative to the page it came from.
-            abs_link = urlparse.urljoin(response_url, raw_link)
-
             # Skip things outside the root directory
             if not abs_link.startswith(root):
                 continue
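This hunk is the heart of the fix: every parsed link is resolved against the page it came from and recorded in links, rather than candidate URLs being reconstructed later. A rough standalone sketch of that idea (Python 2, like the surrounding code; the page, URLs, and the HrefCollector class are made-up stand-ins for spack's LinkParser):

    from HTMLParser import HTMLParser
    import urlparse

    class HrefCollector(HTMLParser):
        """Collect the href attribute of every anchor tag."""
        def __init__(self):
            HTMLParser.__init__(self)
            self.links = []
        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                self.links.extend(v for k, v in attrs if k == 'href')

    page_url = "http://example.com/downloads/"             # hypothetical
    page = '<a href="foo-1.2.3.tar.gz">foo 1.2.3</a>'       # hypothetical

    parser = HrefCollector()
    parser.feed(page)

    # Resolve each raw link relative to the page it came from.
    links = set(urlparse.urljoin(page_url, raw) for raw in parser.links)
    # -> set(['http://example.com/downloads/foo-1.2.3.tar.gz'])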
@@ -123,15 +125,19 @@ def _spider(args):
             if abs_link in visited:
                 continue

-            subcalls.append((abs_link, visited, root, None, depth+1, max_depth, raise_on_error))
+            # If we're not at max depth, follow links.
+            if depth < max_depth:
+                subcalls.append((abs_link, visited, root, None,
+                                 depth+1, max_depth, raise_on_error))
                 visited.add(abs_link)

         if subcalls:
             try:
                 pool = Pool(processes=len(subcalls))
-                dicts = pool.map(_spider, subcalls)
-                for d in dicts:
-                    pages.update(d)
+                results = pool.map(_spider, subcalls)
+                for sub_pages, sub_links in results:
+                    pages.update(sub_pages)
+                    links.update(sub_links)
             finally:
                 pool.terminate()
                 pool.join()
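Each recursive _spider call now returns a (pages, links) pair, and the parent merges both before returning. A toy sketch of that fan-out-and-merge pattern, with a hypothetical fake_spider standing in for _spider (Python 2 style):

    from multiprocessing import Pool

    def fake_spider(url):
        """Stand-in for _spider: pretend each URL yields one page and one link."""
        return ({url: "<html>...</html>"}, set([url + "/child"]))

    if __name__ == '__main__':
        subcalls = ["http://example.com/a", "http://example.com/b"]   # hypothetical
        pages, links = {}, set()

        pool = Pool(processes=len(subcalls))
        try:
            # Each worker returns a (pages, links) pair; merge them all.
            for sub_pages, sub_links in pool.map(fake_spider, subcalls):
                pages.update(sub_pages)
                links.update(sub_links)
        finally:
            pool.terminate()
            pool.join()

        print sorted(pages.keys())
        print sorted(links)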
@@ -155,10 +161,10 @@ def _spider(args):
         # Other types of errors are completely ignored, except in debug mode.
         tty.debug("Error in _spider: %s" % e)

-    return pages
+    return pages, links


-def get_pages(root_url, **kwargs):
+def spider(root_url, **kwargs):
     """Gets web pages from a root URL.
        If depth is specified (e.g., depth=2), then this will also fetches pages
        linked from the root and its children up to depth.
@@ -167,5 +173,69 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages = _spider((root_url, set(), root_url, None, 1, max_depth, False))
-    return pages
+    pages, links = _spider((root_url, set(), root_url, None, 1, max_depth, False))
+    return pages, links
+
+
+def find_versions_of_archive(*archive_urls, **kwargs):
+    """Scrape web pages for new versions of a tarball.
+
+    Arguments:
+      archive_urls:
+          URLs for different versions of a package. Typically these
+          are just the tarballs from the package file itself.  By
+          default, this searches the parent directories of archives.
+
+    Keyword Arguments:
+      list_url:
+
+          URL for a listing of archives.  Spack wills scrape these
+          pages for download links that look like the archive URL.
+
+      list_depth:
+          Max depth to follow links on list_url pages.
+
+    """
+    list_url = kwargs.get('list_url', None)
+    list_depth = kwargs.get('list_depth', 1)
+
+    # Generate a list of list_urls based on archive urls and any
+    # explicitly listed list_url in the package
+    list_urls = set()
+    if list_url:
+        list_urls.add(list_url)
+    for aurl in archive_urls:
+        list_urls.add(spack.url.find_list_url(aurl))
+
+    # Grab some web pages to scrape.
+    pages = {}
+    links = set()
+    for lurl in list_urls:
+        p, l = spider(lurl, depth=list_depth)
+        pages.update(p)
+        links.update(l)
+
+    # Scrape them for archive URLs
+    regexes = []
+    for aurl in archive_urls:
+        # This creates a regex from the URL with a capture group for
+        # the version part of the URL.  The capture group is converted
+        # to a generic wildcard, so we can use this to extract things
+        # on a page that look like archive URLs.
+        url_regex = spack.url.wildcard_version(aurl)
+
+        # We'll be a bit more liberal and just look for the archive
+        # part, not the full path.
+        regexes.append(os.path.basename(url_regex))
+
+    # Build a dict version -> URL from any links that match the wildcards.
+    versions = {}
+    for url in links:
+        if any(re.search(r, url) for r in regexes):
+            try:
+                ver = spack.url.parse_version(url)
+                versions[ver] = url
+            except spack.url.UndetectableVersionError as e:
+                continue
+
+    return versions
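With the move into spack.util.web, callers ask for versions like this -- a minimal usage sketch with hypothetical URLs, assuming spack is importable on the Python path:

    import spack.util.web as web

    # Scrape the listing page (and anything it links to, two levels deep)
    # for links that look like other versions of this tarball.
    versions = web.find_versions_of_archive(
        'http://example.com/downloads/foo-1.2.3.tar.gz',    # hypothetical archive URL
        list_url='http://example.com/downloads/',            # optional listing page
        list_depth=2)

    for v in sorted(versions):
        print "%-10s %s" % (v, versions[v])                  # version -> archive URL

spider() can also be called directly: it returns a (pages, links) pair, where pages maps each fetched URL to its raw text and links is the set of absolute links collected along the way.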