Better version wildcard handling, better spidering
- Allow version wildcards to match [_-.] instead of the exact separators the version was constructed with.
- Handles the fact that boost versions are written both as 1.55.0 and 1_55_0.
- Update spidering to handle parse errors, and warn that Python < 2.7.3 has less robust HTML parsing.
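To illustrate the first point, a minimal sketch of the separator-wildcard idea (not Spack's actual implementation): every separator in a version string is replaced by the character class [_.-], so a pattern built from one spelling matches the others.

import re

def version_wildcard(version_string):
    # Split the version on any separator, then rejoin the segments
    # with the separator character class instead of the literals.
    segments = re.split(r'[_.-]', version_string)
    return '[_.-]'.join(re.escape(s) for s in segments)

wc = re.compile(version_wildcard('1.55.0'))
assert wc.match('1.55.0')   # the spelling the pattern was built from
assert wc.match('1_55_0')   # boost's underscore spelling
assert wc.match('1-55-0')   # dashed spelling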
.gitignore
@@ -1,6 +1,6 @@
+/var/spack/stage
 *.pyc
 /opt/
-/var/
 *~
 .DS_Store
 .idea

@@ -41,5 +41,7 @@ def __init__(self, message):
 class NoNetworkConnectionError(SpackError):
     """Raised when an operation needs an internet connection."""
     def __init__(self, message, url):
-        super(NoNetworkConnectionError, self).__init__(message)
+        super(NoNetworkConnectionError, self).__init__(
+            "No network connection: " + str(message),
+            "URL was: " + str(url))
         self.url = url

@@ -206,7 +206,7 @@ def wildcard_version(path):
     ver, start, end = parse_version_string_with_indices(path)
 
     v = Version(ver)
-    parts = list(re.escape(p) for p in path.split(str(v)))
+    parts = [re.escape(p) for p in re.split(v.wildcard(), path)]
 
     # Make a group for the wildcard, so it will be captured by the regex.
     version_group = '(%s)' % v.wildcard()

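To see why the split changes from path.split(str(v)) to re.split(v.wildcard(), path), here is a rough illustration with a hand-written stand-in for v.wildcard() (the real pattern is built by the wildcard() method in the last hunk below): a literal split only removes the exact spelling the version was parsed with, while the regex split removes every spelling.

import re

wildcard = r'1[_.-]55[_.-]0'   # hypothetical stand-in for v.wildcard()
path = 'boost_1_55_0/boost-1.55.0.tar.bz2'

# A literal split misses the underscore spelling...
assert path.split('1.55.0') == ['boost_1_55_0/boost-', '.tar.bz2']

# ...while splitting on the wildcard regex catches both.
assert re.split(wildcard, path) == ['boost_', '/boost-', '.tar.bz2']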
@@ -23,11 +23,12 @@
 # Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 ##############################################################################
 import re
+import sys
 import subprocess
 import urllib2
 import urlparse
 from multiprocessing import Pool
-from HTMLParser import HTMLParser
+from HTMLParser import HTMLParser, HTMLParseError
 
 import llnl.util.tty as tty
 
@@ -67,7 +68,7 @@ def _spider(args):
        pool.  Firing off all the child links at once makes the fetch MUCH
        faster for pages with lots of children.
     """
-    url, depth, max_depth = args
+    url, depth, max_depth, raise_on_error = args
 
     pages = {}
     try:

@@ -81,11 +82,12 @@ def _spider(args):
         resp = urllib2.urlopen(req, timeout=TIMEOUT)
 
         if not "Content-type" in resp.headers:
-            print "ignoring page " + url
+            tty.warn("ignoring page " + url)
             return pages
 
         if not resp.headers["Content-type"].startswith('text/html'):
-            print "ignoring page " + url + " with content type " + resp.headers["Content-type"]
+            tty.warn("ignoring page " + url + " with content type " +
+                     resp.headers["Content-type"])
             return pages
 
         # Do the real GET request when we know it's just HTML.

@@ -100,9 +102,9 @@ def _spider(args):
         # If we're not at max depth, parse out the links in the page
         if depth < max_depth:
             link_parser = LinkParser()
-
+
             subcalls = []
             link_parser.feed(page)
-
+
             while link_parser.links:
                 raw_link = link_parser.links.pop()

@@ -112,7 +114,7 @@ def _spider(args):
 
                 # Evaluate the link relative to the page it came from.
                 abs_link = urlparse.urljoin(response_url, raw_link)
-                subcalls.append((abs_link, depth+1, max_depth))
+                subcalls.append((abs_link, depth+1, max_depth, raise_on_error))
 
             if subcalls:
                 pool = Pool(processes=len(subcalls))

@@ -121,13 +123,21 @@ def _spider(args):
                     pages.update(d)
 
     except urllib2.URLError, e:
-        # Only report it if it's the root page.  We ignore errors when spidering.
-        if depth == 1:
-            raise spack.error.NoNetworkConnectionError(e.reason, url)
+        if raise_on_error:
+            raise spack.error.NoNetworkConnectionError(str(e), url)
+
+    except HTMLParseError, e:
+        # This error indicates that Python's HTML parser sucks.
+        msg = "Got an error parsing HTML."
+
+        # Pre-2.7.3 Pythons in particular have rather prickly HTML parsing.
+        if sys.version_info[:3] < (2,7,3):
+            msg += " Use Python 2.7.3 or newer for better HTML parsing."
+
+        tty.warn(msg, url, "HTMLParseError: " + str(e))
 
     except Exception, e:
-        # Other types of errors are completely ignored.
-        pass
+        pass    # Other types of errors are completely ignored.
 
     return pages

@@ -141,5 +151,5 @@ def get_pages(root_url, **kwargs):
        performance over a sequential fetch.
     """
     max_depth = kwargs.setdefault('depth', 1)
-    pages =  _spider((root_url, 1, max_depth))
+    pages =  _spider((root_url, 1, max_depth, False))
     return pages

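A hedged usage sketch for this entry point (the URL is made up, and the result is assumed to map each fetched URL to its page text, as the dict handling in _spider suggests). Since get_pages() now passes raise_on_error=False, network errors during the crawl are swallowed and the spider simply returns whatever pages it could fetch:

# Hypothetical caller: crawl two levels of links from the root page.
pages = get_pages('http://www.example.com/downloads', depth=2)
for url in pages:
    print url    # each key is assumed to be a fetched URL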
@@ -152,21 +152,24 @@ def a_or_n(seg):
                 return r'[a-zA-Z]+'
 
         version = self.version
-        separators = ('',) + self.separators
+
+        # Use a wildcard for separators, in case a version is written
+        # two different ways (e.g., boost writes 1_55_0 and 1.55.0)
+        sep_re = '[_.-]'
+        separators = ('',) + (sep_re,) * len(self.separators)
 
         version += (version[-1],) * 2
-        separators += (separators[-1],) * 2
+        separators += (sep_re,) * 2
 
-        sep_res = [re.escape(sep) for sep in separators]
-        seg_res = [a_or_n(seg) for seg in version]
+        segments = [a_or_n(seg) for seg in version]
 
-        wc = seg_res[0]
-        for i in xrange(1, len(sep_res)):
-            wc += '(?:' + sep_res[i] + seg_res[i]
+        wc = segments[0]
+        for i in xrange(1, len(separators)):
+            wc += '(?:' + separators[i] + segments[i]
 
         # Add possible alpha or beta indicator at the end of each segemnt
         # We treat these specially b/c they're so common.
-        wc += '[ab]?)?' * (len(seg_res) - 1)
+        wc += '[ab]?)?' * (len(segments) - 1)
         return wc

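A rough reconstruction of the regex the new wildcard() builds for a three-segment numeric version, assuming a_or_n() returns r'\d+' for numeric segments (only its alpha branch is visible above) and leaving off the two duplicated trailing segments the method appends: separators become [_.-], each tail segment is optional, and any segment may carry an 'a' or 'b' (alpha/beta) suffix.

import re

# Hand-expanded form of the wildcard for a version like 1.55.0.
wc = re.compile(r'\d+(?:[_.-]\d+(?:[_.-]\d+[ab]?)?[ab]?)?$')

assert wc.match('1.55.0')    # canonical spelling
assert wc.match('1_55_0')    # boost's underscore spelling
assert wc.match('1.55')      # trailing segments are optional
assert wc.match('1.55b')     # beta suffix on a segment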
Author: Todd Gamblin