Better version wildcard handling, better spidering
- Allow version wildcards to match any of [_.-] instead of the exact separators the version was constructed with. This handles the fact that Boost versions are written both as 1.55.0 and 1_55_0.
- Update spidering to handle parse errors, and warn that Python < 2.7.3 has less robust HTML parsing.
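The separator wildcard is easiest to see in a standalone sketch. The regex below is a hypothetical simplification of what a version wildcard expands to, not Spack's actual output:

    import re

    # Hypothetical simplification of a version wildcard: each separator
    # position matches any of '_', '.', or '-' rather than the literal
    # separator the version was originally parsed with.
    wildcard = r'\d+[_.-]\d+[_.-]\d+'

    for name in ('boost_1_55_0.tar.bz2', 'boost-1.55.0.tar.gz'):
        print re.search(wildcard, name).group()   # 1_55_0, then 1.55.0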
.gitignore
@@ -1,6 +1,6 @@
 /var/spack/stage
 *.pyc
-/opt/
+/var/
 *~
 .DS_Store
 .idea

@@ -41,5 +41,7 @@ def __init__(self, message):
 class NoNetworkConnectionError(SpackError):
     """Raised when an operation needs an internet connection."""
     def __init__(self, message, url):
-        super(NoNetworkConnectionError, self).__init__(message)
+        super(NoNetworkConnectionError, self).__init__(
+            "No network connection: " + str(message),
+            "URL was: " + str(url))
         self.url = url

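The two-argument super() call relies on the base class accepting a longer detail message alongside the summary. A minimal sketch of that pattern, with an assumed signature of SpackError(message, long_message=None) (not necessarily Spack's actual implementation):

    class SpackError(Exception):
        """Assumed base class: a short summary plus an optional long message."""
        def __init__(self, message, long_message=None):
            super(SpackError, self).__init__(message)
            self.message = message
            self.long_message = long_message

    class NoNetworkConnectionError(SpackError):
        """Raised when an operation needs an internet connection."""
        def __init__(self, message, url):
            super(NoNetworkConnectionError, self).__init__(
                "No network connection: " + str(message),
                "URL was: " + str(url))
            self.url = url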
@@ -206,7 +206,7 @@ def wildcard_version(path):
     ver, start, end = parse_version_string_with_indices(path)

     v = Version(ver)
-    parts = list(re.escape(p) for p in path.split(str(v)))
+    parts = [re.escape(p) for p in re.split(v.wildcard(), path)]

     # Make a group for the wildcard, so it will be captured by the regex.
     version_group = '(%s)' % v.wildcard()

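Why re.split against the wildcard instead of str.split against the literal version: the version embedded in a URL may use different separators than the canonical version string. A self-contained illustration, with a hand-written stand-in for v.wildcard():

    import re

    # Stand-ins: str(v) would be '1.55.0'; the wildcard matches any
    # separator spelling of the same three segments.
    version_str = '1.55.0'
    wildcard = r'\d+[_.-]\d+[_.-]\d+'

    path = 'http://example.com/boost_1_55_0.tar.bz2'

    print path.split(version_str)   # no split: literal '1.55.0' never occurs
    print re.split(wildcard, path)  # ['http://example.com/boost_', '.tar.bz2']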
@@ -23,11 +23,12 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import sys
 import subprocess
 import urllib2
 import urlparse
 from multiprocessing import Pool
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError

 import llnl.util.tty as tty

@@ -67,7 +68,7 @@ def _spider(args):
        pool.  Firing off all the child links at once makes the fetch MUCH
        faster for pages with lots of children.
     """
-    url, depth, max_depth = args
+    url, depth, max_depth, raise_on_error = args

     pages = {}
     try:
@@ -81,11 +82,12 @@ def _spider(args):
         resp = urllib2.urlopen(req, timeout=TIMEOUT)

         if not "Content-type" in resp.headers:
-            print "ignoring page " + url
+            tty.warn("ignoring page " + url)
             return pages

         if not resp.headers["Content-type"].startswith('text/html'):
-            print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
+            tty.warn("ignoring page " + url + " with content type " +
+                     resp.headers["Content-type"])
             return pages

         # Do the real GET request when we know it's just HTML.
@@ -100,9 +102,9 @@ def _spider(args):
         # If we're not at max depth, parse out the links in the page
         if depth < max_depth:
             link_parser = LinkParser()
-
             subcalls = []
             link_parser.feed(page)
+
             while link_parser.links:
                 raw_link = link_parser.links.pop()

@@ -112,7 +114,7 @@ def _spider(args):

                 # Evaluate the link relative to the page it came from.
                 abs_link = urlparse.urljoin(response_url, raw_link)
-                subcalls.append((abs_link, depth+1, max_depth))
+                subcalls.append((abs_link, depth+1, max_depth, raise_on_error))

             if subcalls:
                 pool = Pool(processes=len(subcalls))
@@ -121,13 +123,21 @@ def _spider(args):
                     pages.update(d)

     except urllib2.URLError, e:
-        # Only report it if it's the root page.  We ignore errors when spidering.
-        if depth == 1:
-            raise spack.error.NoNetworkConnectionError(e.reason, url)
+        if raise_on_error:
+            raise spack.error.NoNetworkConnectionError(str(e), url)
+
+    except HTMLParseError, e:
+        # This error indicates that Python's HTML parser sucks.
+        msg = "Got an error parsing HTML."
+
+        # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
+        if sys.version_info[:3] < (2,7,3):
+            msg += " Use Python 2.7.3 or newer for better HTML parsing."
+
+        tty.warn(msg, url, "HTMLParseError: " + str(e))

     except Exception, e:
-        # Other types of errors are completely ignored.
-        pass
+        pass    # Other types of errors are completely ignored.

     return pages

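For context, HTMLParseError is what Python 2's parser raises on markup it cannot handle, and older interpreters reject more inputs than 2.7.3+. A toy sketch of the pattern being guarded against; LinkCollector here is a hypothetical stand-in for Spack's LinkParser:

    import sys
    from HTMLParser import HTMLParser, HTMLParseError

    class LinkCollector(HTMLParser):
        """Toy parser that collects href values from anchor tags."""
        def __init__(self):
            HTMLParser.__init__(self)
            self.links = []

        def handle_starttag(self, tag, attrs):
            if tag == 'a':
                self.links.extend(v for name, v in attrs if name == 'href')

    parser = LinkCollector()
    try:
        parser.feed('<a href="boost_1_55_0.tar.bz2">download</a>')
    except HTMLParseError, e:
        # Pre-2.7.3 parsers choke on markup that newer ones accept.
        msg = "Got an error parsing HTML."
        if sys.version_info[:3] < (2, 7, 3):
            msg += " Use Python 2.7.3 or newer for better HTML parsing."
        print msg

    print parser.links   # ['boost_1_55_0.tar.bz2']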
@@ -141,5 +151,5 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages =  _spider((root_url, 1, max_depth))
+    pages =  _spider((root_url, 1, max_depth, False))
     return pages

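A sketch of how the entry point might be called, assuming the module imports as spack.util.web and that the result maps each fetched URL to its HTML text (both assumptions here):

    from spack.util.web import get_pages

    # Crawl the root page plus one level of links.  Since raise_on_error
    # is passed as False above, fetch and parse failures only warn.
    pages = get_pages('http://www.example.com/downloads', depth=2)
    for url, html in pages.items():
        print url, len(html)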
@@ -152,21 +152,24 @@ def a_or_n(seg):
                 return r'[a-zA-Z]+'

         version = self.version
-        separators = ('',) + self.separators
+
+        # Use a wildcard for separators, in case a version is written
+        # two different ways (e.g., boost writes 1_55_0 and 1.55.0)
+        sep_re = '[_.-]'
+        separators = ('',) + (sep_re,) * len(self.separators)

         version += (version[-1],) * 2
-        separators += (separators[-1],) * 2
+        separators += (sep_re,) * 2

-        sep_res = [re.escape(sep) for sep in separators]
-        seg_res = [a_or_n(seg) for seg in version]
+        segments = [a_or_n(seg) for seg in version]

-        wc = seg_res[0]
-        for i in xrange(1, len(sep_res)):
-            wc += '(?:' + sep_res[i] + seg_res[i]
+        wc = segments[0]
+        for i in xrange(1, len(separators)):
+            wc += '(?:' + separators[i] + segments[i]

         # Add possible alpha or beta indicator at the end of each segment
         # We treat these specially b/c they're so common.
-        wc += '[ab]?)?' * (len(seg_res) - 1)
+        wc += '[ab]?)?' * (len(segments) - 1)
         return wc

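Pulling the pieces together, a simplified standalone version of the construction. It skips the real code's trick of appending two extra copies of the last segment so that longer version strings still match:

    import re

    def wildcard(segments):
        # Numeric segments match runs of digits, alphabetic ones runs of
        # letters; every separator position matches '_', '.', or '-'.
        sep_re = '[_.-]'
        seg_res = [r'\d+' if s.isdigit() else '[a-zA-Z]+' for s in segments]

        # Trailing segments are optional, each with an optional a/b
        # suffix, so truncated versions like '1.55' still match.
        wc = seg_res[0]
        for seg in seg_res[1:]:
            wc += '(?:' + sep_re + seg
        wc += '[ab]?)?' * (len(seg_res) - 1)
        return wc

    regex = re.compile(wildcard(['1', '55', '0']))
    print regex.search('boost_1_55_0').group()   # 1_55_0
    print regex.search('boost-1.55.0').group()   # 1.55.0
    print regex.search('boost-1.55').group()     # 1.55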
Todd Gamblin